├── HBLR
    ├── HBLR.py
    ├── HBLRWrapper.py
    └── HBLR_Distribution.py
├── LSSVM
    ├── LSSVM.py
    └── LSSVMWrapper.py
├── LogisticRegression
    ├── LR.py
    └── LRWrapper.py
├── MTMKL
    ├── MTMKL.py
    └── MTMKLWrapper.py
├── NeuralNetworks
    ├── tensorFlowNetwork.py
    ├── tensorFlowNetworkMultiTask.py
    ├── tensorFlowWrapper.py
    └── tensorFlowWrapperSTL.py
├── README.md
├── __pycache__
    └── helperFuncs.cpython-35.pyc
├── example_data.csv
├── generic_wrapper.py
├── helperFuncs.py
├── jobs_to_run.txt
├── make_datasets.py
├── mtl_nn_clusters.png
└── run_jobs.py

/HBLR/HBLR.py:
--------------------------------------------------------------------------------
1 | ''' Hierarchical Bayesian Logistic Regression (HBLR)
2 | 
3 | This model draws logistic regression weights for each task from a shared
4 | Dirichlet Process (DP) prior. The DP prior induces a clustering of tasks based
5 | on the learned decision boundaries, such that the number of distinct decision
6 | boundaries is equivalent to the number of clusters.
7 | 
8 | A set of matrices stores the degree of membership of each task in each cluster,
9 | and the weights for each cluster.
10 | 
11 | For more information about this method, see:
12 | Xue, Y., Liao, X., Carin, L., & Krishnapuram, B. (2007). Multi-task learning
13 | for classification with dirichlet process priors. Journal of Machine Learning
14 | Research, 8(Jan), 35-63.
15 | '''
16 | 
17 | import matplotlib
18 | matplotlib.use('Agg')
19 | import numpy as np
20 | import pandas as pd
21 | import math
22 | import scipy  #scipy.special.psi is the derivative of the log of the gamma function
23 | import scipy.linalg as la
24 | import scipy.special
25 | import copy
26 | import sys
27 | import matplotlib.pyplot as plt
28 | from sklearn.metrics import roc_auc_score
29 | import HBLR_Distribution
30 | 
31 | 
32 | ACC_LOGGED_EVERY_N_STEPS = 10
33 | 
34 | def plotConvergence(metric, title, save_path=None):
35 |     plt.figure()
36 |     plt.plot(metric, 'o-')
37 |     plt.xlabel('Iteration')
38 |     plt.ylabel(title)
39 |     if save_path is not None:
40 |         plt.savefig(save_path)
41 |         plt.close()
42 |     else:
43 |         plt.show()
44 | 
45 | '''Given a dataset, trains the model'''
46 | class HBLR:
47 | 
48 |     ''' DATA FORMAT: a list of dicts. Each list item is a task, indexed by its number. Each task is a dict,
49 |     containing keys 'X' and 'Y', which are the data matrix and label vector, respectively.
50 |     Note that the X matrix should not contain columns like user_id, timestamp.
51 |     Each X is of size (num points for that task) x (number of features).
52 |     Each Y is of size (num points for that task) x 1 (a column vector).
53 | 
54 |     Conventions:
55 |     - compute functions are for computing internal parameters used in update functions
56 |     - update functions are for computing parameters of the model used for prediction
57 |     '''
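   |     # Usage sketch (hypothetical data; sizes and values are made up, just to
   |     # illustrate the format described above):
   |     #     task_dict = [
   |     #         {'X': np.random.randn(50, 4), 'Y': np.random.randint(0, 2, (50, 1))},
   |     #         {'X': np.random.randn(30, 4), 'Y': np.random.randint(0, 2, (30, 1))},
   |     #     ]
   |     #     model = HBLR(task_dict)
   |     #     model.trainUntilConverged()
   |     #     probs = model.predictProbability(0, task_dict[0]['X'])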
58 |     def __init__(self, task_dict, mu=None, sigma=None, tau10=5e-2, tau20=5e-2, K=None, max_iterations=150,
59 |                  xi_tolerance=1e-2, debug=False, verbose=True):
60 |         self.n_tasks = len(task_dict)
61 |         self.K = self.n_tasks if K is None else K
62 | 
63 |         self.debug = debug
64 |         self.verbose = verbose
65 |         self.task_dict = task_dict
66 |         self.num_feats = np.shape(task_dict[0]['X'])[1]
67 |         # TODO: Should we be checking if every task has the same number of features?
68 | 
69 |         #hyperparameters
70 |         self.mu = mu if mu is not None else np.zeros((1,self.num_feats))
71 |         self.sigma = sigma if sigma is not None else np.eye(self.num_feats) * 10.0
72 |         self.tau10 = tau10
73 |         self.tau20 = tau20
74 | 
75 |         #model parameters
76 |         self.phi = None     # np array shape is n_tasks x K
77 |         self.xi = None      # a list of lists. First index is task number, second is data point within task
78 |         self.theta = None   # a matrix of size K x num_feats
79 |         self.gamma = None   # a list of size K of covariance matrices of size num_feats x num_feats
80 | 
81 |         #internal parameters
82 |         self.small_phi1 = None    # a vector of size K-1 used in computing phi
83 |         self.small_phi2 = None    # a vector of size K-1 used in computing phi
84 |         self.s = None             # a matrix of size n_tasks x K used in computing phi
85 |         self.tau1 = None          # used to compute small phi2
86 |         self.tau2 = None          # used to compute small phi2
87 |         self.task_vectors = None  # used to compute theta, matrix of size n_tasks x num_feats,
88 |                                   # only computed once at the beginning
89 | 
90 |         #store metrics for convergence of parameters
91 |         self.xi_convergence_list = []     #take max of abs(prev - new) over all tasks
92 |         self.phi_convergence_list = []    #take norm of prev matrix - new matrix
93 |         self.s_convergence_list = []      #take norm of prev matrix - new matrix
94 |         self.gamma_convergence_list = []  #take max of abs(prev - new) over all clusters
95 |         self.theta_convergence_list = []  #take norm of prev matrix - new matrix
96 | 
97 |         #
98 |         self.max_iterations = max_iterations
99 |         self.xi_tolerance = xi_tolerance
100 | 
101 |     def setHyperParameters(self, mu, sigma, tau10, tau20):
102 |         self.mu = mu
103 |         self.sigma = sigma
104 |         self.tau10 = tau10
105 |         self.tau20 = tau20
106 | 
107 |     def initializeAllParameters(self):
108 |         self.phi = (1.0 / self.K) * np.ones((self.n_tasks,self.K))
109 |         self.theta = np.tile(self.mu, (self.K, 1))
110 |         self.gamma = [self.sigma for i in range(self.K)]
111 |         self.xi = [[0] * len(self.task_dict[i]['Y']) for i in range(self.n_tasks)]
112 |         self.computeXi()
113 |         self.tau1 = self.tau10
114 |         self.tau2 = self.tau20
115 |         self.computeSmallPhis()
116 |         self.computeTaus()
117 |         self.s = np.zeros((self.n_tasks,self.K))
118 |         self.computeTaskVectors()
119 | 
120 |         self.xi_convergence_list = []
121 |         self.phi_convergence_list = []
122 |         self.s_convergence_list = []
123 |         self.gamma_convergence_list = []
124 |         self.theta_convergence_list = []
125 | 
126 |         if self.debug:
127 |             print "initial phi", self.phi
128 |             print "initial small phi1", self.small_phi1
129 |             print "initial small phi2", self.small_phi2
130 |             print "initial tau1", self.tau1, "tau2", self.tau2
131 | 
132 |     def trainUntilConverged(self):
133 |         self.initializeAllParameters()
134 | 
135 |         i = 0
136 |         while i < self.max_iterations and (i < 2 or self.xi_convergence_list[-1] > self.xi_tolerance):
137 |             if self.debug:
138 |                 print "----------------"
139 |                 print "iteration", i
140 | 
141 |                 plt.imshow(self.phi)
142 |                 plt.show()
143 | 
144 |             prev_xi = copy.deepcopy(self.xi)
145 |             prev_phi = copy.deepcopy(self.phi)
146 |             prev_s = copy.deepcopy(self.s)
147 |             prev_gamma = copy.deepcopy(self.gamma)
148 |             prev_theta = copy.deepcopy(self.theta)
149 | 
150 |             self.updateAllParameters()
151 |             if self.K > 2:
152 |                 restart = self.pruneK()
153 |                 if restart:
154 |                     if self.verbose: print "Restarting now with K=", self.K
155 |                     self.initializeAllParameters()
156 |                     self.updateAllParameters()
157 |                     i = 0
158 |                     continue
159 | 
160 |             if i % ACC_LOGGED_EVERY_N_STEPS == 0:
161 |                 acc = []
162 |                 auc = []
163 |                 for j in range(len(self.task_dict)):
164 |                     preds0 = self.predictBinary(self.task_dict[j]['X'], j)
165 |                     task_Y = self.task_dict[j]['Y']
166 |                     if 0 in task_Y and 1 in task_Y:
167 |                         auc.append(roc_auc_score(task_Y, preds0))
168 |                         acc.append(getBinaryAccuracy(preds0, task_Y))
169 |                     #else:
170 |                     #    print "doesn't have both tasks", j, task_Y
171 |                 if self.verbose:
172 |                     print "Training. Iteration", i
173 |                     if i > 0:
174 |                         print "\tXi convergence", self.xi_convergence_list[-1]
175 |                     print "\tavg training accuracy", np.mean(acc)
176 |                     print "\tavg ROC AUC", np.mean(auc), "\n"
177 | 
178 |             #compute convergence metrics
179 |             if i > 0:
180 |                 self.xi_convergence_list.append(computeMatrixConvergence(flattenListLists(prev_xi), flattenListLists(self.xi)))
181 |                 self.phi_convergence_list.append(computeMatrixConvergence(prev_phi, self.phi))
182 |                 self.s_convergence_list.append(computeMatrixConvergence(prev_s, self.s))
183 |                 self.gamma_convergence_list.append(computeListOfListsConvergence(prev_gamma, self.gamma))
184 |                 self.theta_convergence_list.append(computeMatrixConvergence(prev_theta, self.theta))
185 |                 if self.debug: print "Training. Iteration", i, "- Xi convergence:", self.xi_convergence_list[-1]
186 | 
187 |             i += 1
188 | 
189 |         sys.stdout.flush()
190 | 
191 | 
192 | 
193 |     def updateAllParameters(self):
194 |         self.computeSMatrix()
195 |         self.updatePhi()
196 | 
197 |         self.computeSmallPhis()
198 |         self.computeTaus()
199 |         self.updateGamma()
200 |         self.updateTheta()
201 |         self.computeXi()
202 | 
203 |     def computeTaskVectors(self):
204 |         self.task_vectors = np.zeros((self.n_tasks, self.num_feats))
205 |         for m in range(self.n_tasks):
206 |             task_X = self.task_dict[m]['X']
207 |             task_Y = self.task_dict[m]['Y']
208 |             # Note that transposes are different because we are using different notation than in the paper - specifically we use row vectors where they are using column vectors
209 |             self.task_vectors[m,:] = np.dot((task_Y-0.5).T, task_X)
210 | 
211 |     def pruneK(self):
212 |         num_tasks_in_cluster = self.n_tasks - np.sum(1*(self.phi<1e-16), axis=0)
213 |         for k in range(len(num_tasks_in_cluster))[::-1]:
214 |             if num_tasks_in_cluster[k] == 0:
215 |                 self.K = self.K - 1
216 |                 return True
217 |         return False
218 | 
219 |     def computeSMatrix(self):
220 |         for m in range(self.n_tasks):
221 |             task_X = self.task_dict[m]['X']
222 |             task_Y = self.task_dict[m]['Y']
223 |             task_xi = np.array(self.xi[m])
224 | 
225 |             for k in range(self.K):
226 |                 # Note that transposes are different because we are using different notation than in the paper - specifically we use row vectors where they are using column vectors
227 | 
228 |                 # This does all data points (n) at once
229 |                 inner = np.dot(np.atleast_2d(self.theta[k,:]).T, np.atleast_2d(self.theta[k,:])) + self.gamma[k]
230 |                 diag_entries = np.einsum('ij,ij->i', np.dot(task_X, inner), task_X)
231 |                 s_sum = -rhoFunction(task_xi)*diag_entries
232 | 
233 |                 s_sum += ((task_Y.T - 0.5) * np.dot(np.atleast_2d(self.theta[k,:]), task_X.T))[0,:]
234 |                 s_sum += np.log(sigmoid(task_xi))
235 |                 s_sum += (-0.5)*task_xi
236 |                 s_sum += rhoFunction(task_xi)*(task_xi**2)
237 | 
238 |                 s_sum = np.sum(s_sum)
239 | 
240 |                 if k < self.K-1:
241 |                     s_sum = s_sum + scipy.special.psi(self.small_phi1[k]) \
242 |                             - scipy.special.psi(self.small_phi1[k] + self.small_phi2[k])
243 |                 if k > 0:
244 |                     for i in range(k):
245 |                         s_sum = s_sum + scipy.special.psi(self.small_phi2[i]) \
246 |                                 - scipy.special.psi(self.small_phi1[i] + self.small_phi2[i])
247 | 
248 | 
249 |                 self.s[m,k] = s_sum
250 |         if self.debug: print "s:", self.s
251 | 
252 | 
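   |     # Numeric illustration of the log-sum-exp trick used in updatePhi() below
   |     # (hypothetical values): a row s = [1000, 1001] overflows np.exp directly,
   |     # but subtracting the row max a = 1001 gives np.exp([-1, 0]), and the
   |     # normalized result is the same softmax, [0.269, 0.731].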
253 |     def updatePhi(self):
254 |         a = np.array([np.max(self.s, axis=1)]).T  #as used in logsumexp trick https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
255 |         self.phi = np.exp(self.s - (a + np.log(np.atleast_2d(np.sum(np.exp(self.s - a),axis=1)).T)))
256 |         if self.debug:
257 |             print "phi:", self.phi
258 | 
259 |     def computeSmallPhis(self):
260 |         self.small_phi1 = (1 + np.sum(self.phi,axis=0))[0:-1]
261 |         self.small_phi2 = self.tau1 / self.tau2 + np.array([np.sum(self.phi[:,i:]) for i in range(1,self.K)])
262 |         if self.debug:
263 |             print "small phi1", self.small_phi1
264 |             print "small phi2", self.small_phi2
265 | 
266 |     def computeTaus(self):
267 |         self.tau1 = self.tau10 + self.K - 1
268 |         tau2_sum = 0
269 |         for k in range(self.K-1):
270 |             tau2_sum = tau2_sum + (scipy.special.psi(self.small_phi2[k]) \
271 |                        - scipy.special.psi(self.small_phi1[k] + self.small_phi2[k]))
272 |         self.tau2 = self.tau20 - tau2_sum
273 |         if self.debug: print "tau1", self.tau1, "tau2", self.tau2
274 | 
275 |     def updateGamma(self):
276 |         task_matrices = np.zeros((self.n_tasks, self.num_feats, self.num_feats))
277 |         for m in range(self.n_tasks):
278 |             rho_vector = rhoFunction(np.array(self.xi[m]))
279 |             rho_vector = rho_vector.reshape((1,-1))  # Make rho vector 2D
280 |             task_X = self.task_dict[m]['X']
281 |             # Note that the transposing doesn't exactly match the paper because our data format is slightly different
282 |             rho_matrix = abs(rho_vector) * task_X.T
283 |             task_matrices[m,:,:] = np.dot(rho_matrix, task_X)
284 | 
285 |         for k in range(self.K):
286 |             inner_sum = np.zeros((self.num_feats,self.num_feats))
287 |             for m in range(self.n_tasks):
288 |                 inner_sum = inner_sum + self.phi[m,k] * task_matrices[m,:,:]
289 |             self.gamma[k] = la.inv(la.inv(self.sigma) + 2*inner_sum)
290 |             if self.debug:
291 |                 print "gamma computation {0}".format(k), la.det(la.inv(self.sigma) + 2*inner_sum)
292 | 
293 |     def updateTheta(self):
294 |         for k in range(self.K):
295 |             inner_sum = np.zeros((1,self.num_feats))
296 |             for m in range(self.n_tasks):
297 |                 inner_sum = inner_sum + self.phi[m,k] * np.atleast_2d(self.task_vectors[m,:])
298 |             self.theta[k,:] = (np.dot(self.gamma[k],(np.dot(la.inv(self.sigma),self.mu.T) + inner_sum.T))).T
299 | 
300 |     def computeXi(self):
301 |         for m in range(self.n_tasks):
302 |             task_X = self.task_dict[m]['X']
303 |             for n in range(len(task_X)):
304 |                 inner_sum = 0
305 |                 for k in range(self.K):
306 |                     # Note that transposes are different because we are using different notation than in the paper - specifically we use row vectors where they are using column vectors
307 |                     inner_sum += self.phi[m,k]*np.dot((np.dot(np.atleast_2d(task_X[n,:]),
308 |                                  (np.dot(np.atleast_2d(self.theta[k,:]).T, np.atleast_2d(self.theta[k,:])) + self.gamma[k]))),
309 |                                  np.atleast_2d(task_X[n,:]).T)
310 |                 assert inner_sum >= 0  # This number can't be negative since we are taking the square root
311 | 
312 |                 self.xi[m][n] = np.sqrt(inner_sum[0,0])
313 |                 if self.xi[m][n] == 0:
314 |                     print m, n
315 | 
316 |     def predictBinary(self, X, task):
317 |         preds = self.predictProbability(task, X)
318 |         return [1.0 if p >= 0.5 else 0.0 for p in preds.flatten()]
319 | 
320 |     def predictProbability(self, task, X):
321 |         prob = 0
322 |         for k in range(self.K):
323 |             numerator = np.dot(np.atleast_2d(self.theta[k,:]), X.T)
324 |             diag_entries = np.einsum('ij,ij->i', np.dot(X, self.gamma[k]), X)
325 |             denom = np.sqrt(1.0 + np.pi/8 * diag_entries)
326 |             prob = prob + self.phi[task,k] * sigmoid(numerator / denom)
327 |         return prob
328 | 
329 | 
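   |     # The per-cluster term in predictProbability() above uses the standard
   |     # Gaussian approximation to the logistic-Gaussian integral (MacKay, 1992):
   |     # for w ~ N(theta_k, gamma_k),
   |     #     E[sigmoid(w . x)] ~= sigmoid(theta_k . x / sqrt(1 + (pi/8) * x' gamma_k x)),
   |     # and the task's prediction is the phi[task, :]-weighted mixture over clusters.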
330 |     # Code for Predicting for a new task
331 |     def metropolisHastingsAlgorithm(self, new_task_X, new_task_y, N_sam=1000):
332 |         gauss_weight = (self.tau1/self.tau2)/(self.n_tasks+(self.tau1/self.tau2))
333 |         point_dist_weight = 1.0/(self.n_tasks+(self.tau1/self.tau2))
334 |         point_centers_matrix = self.theta
335 |         point_weights = [sum([phi_m[k] for phi_m in self.phi]) for k in range(len(self.phi[0]))]
336 |         mu_mult = self.mu[0]          # Mu is assumed to be the same for each weight
337 |         sigma_mult = self.sigma[0,0]  # Sigma is assumed to be a scalar times the identity matrix
338 | 
339 |         dist = HBLR_Distribution.MainDistribution(gauss_weight, point_dist_weight, point_centers_matrix, point_weights, mu_mult, sigma_mult)
340 | 
341 |         w_dot_array = [np.atleast_2d(dist.rvs(size=1))]
342 |         for i in range(N_sam-1):
343 |             w_hat = np.atleast_2d(dist.rvs(size=1))
344 |             accept_prob = min(1, self.dataProb(new_task_X,new_task_y,w_hat)/self.dataProb(new_task_X,new_task_y,w_dot_array[-1]))
345 |             if np.random.uniform() < accept_prob:
    |                 w_dot_array.append(w_hat)
    |             else:
    |                 w_dot_array.append(w_dot_array[-1])
    | # ... (lines 346-364, containing the rest of this method, dataProb(), and the
    | # start of predictNewTask(), are missing from this copy of the file)
365 |         predictions = [1.0 if p >= 0.5 else 0.0 for p in predictions]
366 |         return predictions
367 | 
368 | 
369 | # Helper function
370 | def flattenListLists(listLists):
371 |     return np.array([item for sublist in listLists for item in sublist])
372 | 
373 | # mathematical helper functions
374 | def sigmoid(x):
375 |     return 1.0 / (1.0 + np.exp(-x))
376 | 
377 | def rhoFunction(x):
378 |     assert len(np.where(x==0)[0]) == 0  #there should not be any zeros passed to this function
379 | 
380 |     return (0.5 - sigmoid(x)) / (2.0*x)
381 | 
382 | def computeMatrixConvergence(prev, new):
383 |     return la.norm(new-prev)
384 | 
385 | def computeListOfListsConvergence(prev, new):
386 |     assert len(prev) == len(new)
387 | 
388 |     max_diff = 0
389 |     for i in range(len(prev)):
390 |         diff = la.norm(np.array(new[i])-np.array(prev[i]))
391 |         if diff > max_diff:
392 |             max_diff = diff
393 |     return max_diff
394 | 
395 | def getBinaryAccuracy(pred, true_labels):
396 |     assert len(pred) == len(true_labels)
397 | 
398 |     correct_labels = [1 for i in range(len(pred)) if pred[i]==true_labels[i]]
399 | 
400 |     return len(correct_labels)/float(len(pred))
401 | 
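   | # Usage sketch for adapting to a brand-new task (hypothetical; predictNewTask()
   | # is incomplete in this copy, so this only mirrors the sampling scheme above):
   | #     w_samples = model.metropolisHastingsAlgorithm(new_X, new_y, N_sam=1000)
   | #     probs = np.mean([sigmoid(np.dot(np.atleast_2d(w), test_X.T)) for w in w_samples], axis=0)
   | #     labels = [1.0 if p >= 0.5 else 0.0 for p in probs.flatten()]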
--------------------------------------------------------------------------------
/HBLR/HBLRWrapper.py:
--------------------------------------------------------------------------------
1 | """Performs hyperparameter sweep for Hierarchical Bayesian Logistic Regression
2 | (HBLR)"""
3 | 
4 | import matplotlib
5 | matplotlib.use('Agg')
6 | import numpy as np
7 | import pandas as pd
8 | import pickle
9 | import sys
10 | import os
11 | import copy
12 | from time import time
13 | from sklearn.metrics import roc_auc_score
14 | import matplotlib.pyplot as plt
15 | 
16 | CODE_PATH = os.path.dirname(os.getcwd())
17 | sys.path.append(CODE_PATH)
18 | 
19 | DEFAULT_RESULTS_PATH = '/Your/path/here/'
20 | DEFAULT_DATASETS_PATH = '/Your/path/here/'
21 | DEFAULT_FIGURES_PATH = '/Your/path/here/'
22 | 
23 | import HBLR as hblr
24 | import helperFuncs as helper
25 | 
26 | DEFAULT_NUM_CROSS_FOLDS = 5
27 | DEFAULT_MAX_ITERS = 75
28 | SAVE_RESULTS_EVERY_X_TESTS = 1
29 | DEFAULT_VALIDATION_TYPE = 'cross'
30 | 
31 | 
32 | ''' Notes:
33 |     -Parameters to tune: tau10, tau20, mu, sigma
34 |     -ratio between tau10 and tau20 controls the number of clusters. A greater ratio = more clusters
35 |     -successful run was done with tau10 = tau20 = 0.05
36 |     -small sigma might be good. e.g. 0.1*I
37 |     -mu is usually 0. not testing for now
38 |     -set number of clusters:
39 |         -for wellbeing measures as tasks go with default (K=num_tasks)
40 |         -for users as tasks no more than 25
41 | '''
42 | 
43 | def reloadFiles():
44 |     reload(hblr)
45 |     reload(helper)
46 | 
47 | class HBLRWrapper:
48 | 
49 |     def __init__(self, file_prefix, users_as_tasks=False, num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, cont=False,
50 |                  results_path=DEFAULT_RESULTS_PATH, figures_path=DEFAULT_FIGURES_PATH, datasets_path=DEFAULT_DATASETS_PATH,
51 |                  test_run=False, max_iters=DEFAULT_MAX_ITERS, val_type=DEFAULT_VALIDATION_TYPE, optimize_labels=None,
52 |                  test_csv_filename=None):
53 |         self.results_path = results_path
54 |         self.figures_path = figures_path
55 |         self.datasets_path = datasets_path
56 |         self.save_prefix = self.getSavePrefix(file_prefix, replace=cont)
57 |         self.cont = cont
58 |         self.max_iters = max_iters
59 |         self.val_type = val_type
60 |         self.users_as_tasks = users_as_tasks
61 |         self.file_prefix = file_prefix
62 |         if test_csv_filename is not None:
63 |             self.test_csv_filename = self.datasets_path + test_csv_filename
64 |         else:
65 |             self.test_csv_filename = None
66 |         self.test_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Test")
67 |         self.train_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Train")
68 |         if self.val_type != 'cross':
69 |             self.val_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Val")
70 |             self.initializeHBLRModel(self.train_tasks)
71 |         else:
72 |             self.classifier = None
73 | 
74 |         if users_as_tasks:
75 |             self.K = 25
76 |         else:
77 |             self.K = len(self.test_tasks)
78 |         self.n_feats = helper.calculateNumFeatsInTaskList(self.test_tasks)
79 |         self.n_tasks = len(self.test_tasks)
80 | 
81 |         if optimize_labels is None:
82 |             self.optimize_labels = ['tomorrow_Group_Happiness_Evening_Label', 'tomorrow_Group_Health_Evening_Label', 'tomorrow_Group_Calmness_Evening_Label']
83 |         else:
84 |             self.optimize_labels = optimize_labels
85 | 
86 |         #parameters that can be tuned
87 |         self.tau10s = [10, 1, 0.05, 0.01]
88 |         self.tau20s = [1.0, 0.05, 0.01]
89 |         self.sigma_multipliers = [.01, 0.1, 1]
90 |         self.mu_multipliers = [0.0]
91 | 
92 |         if test_run:
93 |             print "This is only a testing run. Using cheap settings to make it faster"
94 |             self.K = 2
95 |             self.max_iters = 5
96 |             self.n_tasks = 2
97 |             self.tau10s = [1]
98 |             self.tau20s = [.1]
99 |             self.sigma_multipliers = [.01]
100 |             self.mu_multipliers = [0]
101 | 
102 |         self.calcNumSettingsDesired()
103 | 
104 |         #storing the results
105 |         self.time_sum = 0
106 |         if cont:
107 |             self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.save_prefix + '.csv')
108 |             print '\nPrevious validation results df loaded. It has', len(self.val_results_df), "rows"
109 |             self.started_from = len(self.val_results_df)
110 |         else:
111 |             self.val_results_df = pd.DataFrame()
112 |             self.started_from = 0
113 | 
114 |         self.num_cross_folds = num_cross_folds
115 |         if self.val_type == 'cross':
116 |             helper.generateCrossValPickleFiles(self.datasets_path, self.file_prefix, self.num_cross_folds)
117 | 
118 | 
119 |     def initializeHBLRModel(self, train_tasks):
120 |         self.classifier = hblr.HBLR(train_tasks, K=self.K, debug=False, max_iterations=self.max_iters, verbose=False)
121 | 
122 |     def getSavePrefix(self, file_prefix, replace=False):
123 |         dash_loc = file_prefix.find('-')
124 |         prefix = "hblr" + file_prefix[dash_loc:-1]
125 |         if not replace:
126 |             while os.path.exists(self.results_path + prefix + '.csv'):
127 |                 prefix = prefix + '2'
128 |         return prefix
129 | 
130 |     def calcNumSettingsDesired(self):
131 |         self.num_settings = len(self.tau10s) * len(self.tau20s) * len(self.mu_multipliers) \
132 |                             * len(self.sigma_multipliers)
133 | 
134 |     # use something like the following to test only one set of parameters:
135 |     # wrapper.setParams(tau10s=[.05], tau20s=[.05], sigma_multipliers=[.1,.01])
136 |     def setParams(self, tau10s=None, tau20s=None, sigma_multipliers=None, mu_multipliers=None):
137 |         '''does not override existing parameter settings if the parameter is not set'''
138 |         self.tau10s = tau10s if tau10s is not None else self.tau10s
139 |         self.tau20s = tau20s if tau20s is not None else self.tau20s
140 |         self.sigma_multipliers = sigma_multipliers if sigma_multipliers is not None else self.sigma_multipliers
141 |         self.mu_multipliers = mu_multipliers if mu_multipliers is not None else self.mu_multipliers
142 | 
143 |     def settingAlreadyDone(self, tau10, tau20, sigma_mult, mu_mult):
144 |         if len(self.val_results_df[(self.val_results_df['tau10'] == tau10) & \
145 |                                    (self.val_results_df['tau20'] == tau20) & \
146 |                                    (self.val_results_df['sigma_multiplier'] == sigma_mult) & \
147 |                                    (self.val_results_df['mu_multiplier'] == mu_mult)]) > 0:
148 |             print "setting already tested"
149 |             return True
150 |         else:
151 |             return False
152 | 
153 |     def setClassifierToSetting(self, tau10, tau20, sigma_mult, mu_mult):
154 |         sigma = sigma_mult * np.eye(self.n_feats)
155 |         mu = mu_mult * np.ones((1,self.n_feats))
156 | 
157 |         self.classifier.setHyperParameters(mu, sigma, tau10, tau20)
158 | 
159 |     def getAccuracyAucOnAllTasks(self, task_list):
160 |         all_task_Y = []
161 |         all_preds = []
162 |         for i in range(len(task_list)):
163 |             preds, task_Y = self.getPredsTrueOnOneTask(task_list, i)
164 |             if preds is None:
165 |                 # Skipping task because it does not have valid data
166 |                 continue
167 |             if len(task_Y) > 0:
168 |                 all_task_Y.extend(task_Y)
169 |                 all_preds.extend(preds)
170 |         if not helper.containsEachLabelType(all_preds):
171 |             print "for some bizarre reason, the preds for all tasks are the same class"
172 |             print "preds", all_preds
173 |             print "true_y", all_task_Y
174 |             auc = np.nan
175 |         else:
176 |             auc = roc_auc_score(all_task_Y, all_preds)
177 |         acc = hblr.getBinaryAccuracy(all_preds, all_task_Y)
178 |         return acc, auc
179 | 
180 |     def getPredsTrueOnOneTask(self, task_list, task):
181 |         if not helper.isValidTask(task_list, task):
182 |             return None, None
183 |         task_Y = list(task_list[task]["Y"])
184 |         return self.classifier.predictBinary(task_list[task]['X'], task), task_Y
185 | 
186 |     def getAccuracyAucOnOneTask(self, task_list, task):
187 |         preds, task_Y = self.getPredsTrueOnOneTask(task_list, task)
188 |         if preds is None:
189 |             # Returning nan for task because it does not have valid data
190 |             return np.nan, np.nan
191 |         acc = hblr.getBinaryAccuracy(preds, task_Y)
192 |         if len(task_Y) <= 1 or not helper.containsEachLabelType(preds):
193 |             auc = np.nan
194 |         else:
195 |             auc = roc_auc_score(task_Y, preds)
196 |         return acc, auc
197 | 
198 |     def getValidationResults(self, results_dict):
199 |         self.classifier.trainUntilConverged()
200 |         results_dict['num_clusters'] = self.classifier.K
201 | 
202 |         if self.users_as_tasks:
203 |             val_acc, val_auc = self.getAccuracyAucOnAllTasks(self.val_tasks)
204 |             results_dict['val_acc'] = val_acc
205 |             results_dict['val_auc'] = val_auc
206 |         else:
207 |             accs = []
208 |             aucs = []
209 |             for t in range(self.n_tasks):
210 |                 acc, auc = self.getAccuracyAucOnOneTask(self.val_tasks, t)
211 |                 task_name = self.val_tasks[t]['Name']
212 |                 results_dict['TaskAcc-' + helper.getFriendlyLabelName(task_name)] = acc
213 |                 results_dict['TaskAuc-' + helper.getFriendlyLabelName(task_name)] = auc
214 |                 if task_name in self.optimize_labels:
215 |                     accs.append(acc)
216 |                     aucs.append(auc)
217 |             results_dict['val_acc'] = np.nanmean(accs)
218 |             results_dict['val_auc'] = np.nanmean(aucs)
219 |         return results_dict
220 | 
221 |     def getCrossValidationResults(self, results_dict, tau10, tau20, sigma_mult, mu_mult, save_plots=False, print_per_fold=False):
222 |         if save_plots:
223 |             same_task_matrix = np.zeros((self.n_tasks,self.n_tasks))
224 | 
225 |         clusters = [0] * self.num_cross_folds
226 | 
227 |         all_acc = []
228 |         all_auc = []
229 |         all_f1 = []
230 |         all_precision = []
231 |         all_recall = []
232 |         if not self.users_as_tasks:
233 |             per_task_accs = [[] for i in range(self.n_tasks)]
234 |             per_task_aucs = [[] for i in range(self.n_tasks)]
235 |             per_task_f1 = [[] for i in range(self.n_tasks)]
236 |             per_task_precision = [[] for i in range(self.n_tasks)]
237 |             per_task_recall = [[] for i in range(self.n_tasks)]
238 | 
239 |         for f in range(self.num_cross_folds):
240 |             train_tasks, val_tasks = helper.loadCrossValData(self.datasets_path, self.file_prefix, f, reshape=True)
241 | 
242 |             self.initializeHBLRModel(train_tasks)
243 |             self.setClassifierToSetting(tau10, tau20, sigma_mult, mu_mult)
244 |             self.classifier.trainUntilConverged()
245 | 
246 |             clusters[f] = self.classifier.K
247 | 
248 |             if save_plots: same_task_matrix = self.updateSameTaskMatrix(same_task_matrix)
249 | 
250 |             # Get results!
251 |             fold_preds = []
252 |             fold_true_y = []
253 |             for t in range(self.n_tasks):
254 |                 preds = self.classifier.predictBinary(val_tasks[t]['X'], t)
255 |                 true_y = list(val_tasks[t]['Y'].flatten())
256 | 
257 |                 if len(preds) == 0 or len(true_y) == 0:
258 |                     continue
259 | 
260 |                 if not self.users_as_tasks:
261 |                     # save the per-task results
262 |                     t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y)
263 |                     per_task_accs[t].append(t_acc)
264 |                     per_task_aucs[t].append(t_auc)
265 |                     per_task_f1[t].append(t_f1)
266 |                     per_task_precision[t].append(t_precision)
267 |                     per_task_recall[t].append(t_recall)
268 |                     if print_per_fold: print "Fold", f, "Task", val_tasks[t]['Name'], "acc", t_acc, "auc", t_auc, "f1", t_f1, "precision", t_precision, "recall", t_recall
269 | 
270 |                 fold_preds.extend(preds)
271 |                 fold_true_y.extend(true_y)
272 | 
273 | 
274 |             acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(fold_preds, fold_true_y)
275 |             all_acc.append(acc)
276 |             all_auc.append(auc)
277 |             all_f1.append(f1)
278 |             all_precision.append(precision)
279 |             all_recall.append(recall)
280 |             if print_per_fold: print "Fold", f, "acc", acc, "auc", auc, "f1", f1, "precision", precision, "recall", recall
281 | 
282 |         print "accs for all folds", all_acc
283 |         print "aucs for all folds", all_auc
284 |         print "clusters for all folds", clusters
285 | 
286 |         if save_plots:
287 |             self.plotAccuracyAucAndClusters(all_acc, all_auc, clusters)
288 |             self.saveHintonPlot(same_task_matrix, self.num_cross_folds)
289 |             pd.DataFrame(same_task_matrix).to_csv(self.results_path + self.save_prefix + "-same_task_matrix.csv")
290 | 
291 |         # Add results to the dictionary
292 |         results_dict['val_acc'] = np.nanmean(all_acc)
293 |         results_dict['val_auc'] = np.nanmean(all_auc)
294 |         results_dict['val_f1'] = np.nanmean(all_f1)
295 |         results_dict['val_precision'] = np.nanmean(all_precision)
296 |         results_dict['val_recall'] = np.nanmean(all_recall)
297 |         results_dict['num_clusters'] = np.nanmean(clusters)
298 | 
299 |         # Add per-task results to the dictionary
300 |         if not self.users_as_tasks:
301 |             for t in range(self.n_tasks):
302 |                 task_name = val_tasks[t]['Name']
303 |                 results_dict['TaskAcc-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_accs[t])
304 |                 results_dict['TaskAuc-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_aucs[t])
305 |                 results_dict['TaskF1-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_f1[t])
306 |                 results_dict['TaskPrecision-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_precision[t])
307 |                 results_dict['TaskRecall-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_recall[t])
308 | 
309 |         return results_dict
310 | 
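   |     # Note on aggregation: the 'val_*' entries above are fold-level scores
   |     # (predictions pooled across all tasks within a fold) averaged over folds,
   |     # while the per-task entries are macro-averages of each task's own per-fold scores.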
"seconds to obtain this result" 332 | 333 | self.time_sum = self.time_sum + this_time 334 | 335 | self.printTimeEstimate() 336 | sys.stdout.flush() 337 | 338 | #output the file every few iterations for safekeeping 339 | if len(self.val_results_df) % SAVE_RESULTS_EVERY_X_TESTS == 0: 340 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 341 | 342 | def printTimeEstimate(self): 343 | num_done = len(self.val_results_df)-self.started_from 344 | num_remaining = self.num_settings - num_done - self.started_from 345 | avg_time = self.time_sum / num_done 346 | total_secs_remaining = int(avg_time * num_remaining) 347 | hours = total_secs_remaining / 60 / 60 348 | mins = (total_secs_remaining % 3600) / 60 349 | secs = (total_secs_remaining % 3600) % 60 350 | 351 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 352 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 353 | 354 | def sweepAllParameters(self): 355 | print "\nSweeping all parameters!" 356 | 357 | self.calcNumSettingsDesired() 358 | print "\nYou have chosen to test a total of", self.num_settings, "settings" 359 | sys.stdout.flush() 360 | 361 | #sweep all possible combinations of parameters 362 | for tau10 in self.tau10s: 363 | for tau20 in self.tau20s: 364 | for sigma_mult in self.sigma_multipliers: 365 | for mu_mult in self.mu_multipliers: 366 | self.testOneSetting(tau10, tau20, sigma_mult, mu_mult) 367 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 368 | 369 | def findBestSetting(self, save_final_results=False): 370 | accuracies = self.val_results_df['val_acc'].tolist() 371 | max_acc = max(accuracies) 372 | max_idx = accuracies.index(max_acc) 373 | 374 | print "BEST SETTING!" 375 | print "The highest validation accuracy of", max_acc, "was found with the following settings:" 376 | print self.val_results_df.iloc[max_idx] 377 | 378 | if self.test_csv_filename is not None or save_final_results: 379 | self.getFinalResultsAndSave(self.val_results_df.iloc[max_idx]) 380 | else: 381 | print "Not running Final results" 382 | return self.val_results_df.iloc[max_idx] 383 | 384 | def run(self): 385 | self.sweepAllParameters() 386 | return self.findBestSetting() 387 | 388 | def getFinalResultsAndSave(self, setting_dict): 389 | if self.val_type == 'cross': 390 | print "\nPlotting cross-validation results for best settings..." 391 | self.getCrossValidationResults(dict(), setting_dict['tau10'], setting_dict['tau20'], 392 | setting_dict['sigma_multiplier'], setting_dict['mu_multiplier'], 393 | save_plots=True) 394 | 395 | 396 | print "\nRetraining on training data with the best settings..." 397 | self.initializeHBLRModel(self.train_tasks) 398 | self.classifier.verbose = True 399 | self.setClassifierToSetting(setting_dict['tau10'], setting_dict['tau20'], setting_dict['sigma_multiplier'], setting_dict['mu_multiplier']) 400 | self.classifier.trainUntilConverged() 401 | 402 | print "\nPlotting and saving cool stuff about the final model..." 403 | self.saveImagePlot(self.classifier.phi, 'Phi') 404 | pd.DataFrame(self.classifier.phi).to_csv(self.results_path + self.save_prefix + "-phi.csv") 405 | self.saveConvergencePlots() 406 | 407 | print "\nEvaluating results on held-out test set!! ..." 
368 | 
369 |     def findBestSetting(self, save_final_results=False):
370 |         accuracies = self.val_results_df['val_acc'].tolist()
371 |         max_acc = max(accuracies)
372 |         max_idx = accuracies.index(max_acc)
373 | 
374 |         print "BEST SETTING!"
375 |         print "The highest validation accuracy of", max_acc, "was found with the following settings:"
376 |         print self.val_results_df.iloc[max_idx]
377 | 
378 |         if self.test_csv_filename is not None or save_final_results:
379 |             self.getFinalResultsAndSave(self.val_results_df.iloc[max_idx])
380 |         else:
381 |             print "Not running Final results"
382 |         return self.val_results_df.iloc[max_idx]
383 | 
384 |     def run(self):
385 |         self.sweepAllParameters()
386 |         return self.findBestSetting()
387 | 
388 |     def getFinalResultsAndSave(self, setting_dict):
389 |         if self.val_type == 'cross':
390 |             print "\nPlotting cross-validation results for best settings..."
391 |             self.getCrossValidationResults(dict(), setting_dict['tau10'], setting_dict['tau20'],
392 |                                            setting_dict['sigma_multiplier'], setting_dict['mu_multiplier'],
393 |                                            save_plots=True)
394 | 
395 | 
396 |         print "\nRetraining on training data with the best settings..."
397 |         self.initializeHBLRModel(self.train_tasks)
398 |         self.classifier.verbose = True
399 |         self.setClassifierToSetting(setting_dict['tau10'], setting_dict['tau20'], setting_dict['sigma_multiplier'], setting_dict['mu_multiplier'])
400 |         self.classifier.trainUntilConverged()
401 | 
402 |         print "\nPlotting and saving cool stuff about the final model..."
403 |         self.saveImagePlot(self.classifier.phi, 'Phi')
404 |         pd.DataFrame(self.classifier.phi).to_csv(self.results_path + self.save_prefix + "-phi.csv")
405 |         self.saveConvergencePlots()
406 | 
407 |         print "\nEvaluating results on held-out test set!! ..."
408 |         all_preds = []
409 |         all_true_y = []
410 |         all_X_data = []
411 |         per_task_accs = [np.nan] * self.n_tasks
412 |         per_task_aucs = [np.nan] * self.n_tasks
413 |         per_task_f1 = [np.nan] * self.n_tasks
414 |         per_task_precision = [np.nan] * self.n_tasks
415 |         per_task_recall = [np.nan] * self.n_tasks
416 |         for t in range(self.n_tasks):
417 |             preds = self.classifier.predictBinary(self.test_tasks[t]['X'], t)
418 |             true_y = list(self.test_tasks[t]['Y'].flatten())
419 | 
420 |             if len(preds) == 0 or len(true_y) == 0:
421 |                 continue
422 | 
423 |             all_preds.extend(preds)
424 |             all_true_y.extend(true_y)
425 |             all_X_data.extend(self.test_tasks[t]['X'])
426 | 
427 |             # save the per-task results
428 |             t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y)
429 |             per_task_accs[t] = t_acc
430 |             per_task_aucs[t] = t_auc
431 |             per_task_f1[t] = t_f1
432 |             per_task_precision[t] = t_precision
433 |             per_task_recall[t] = t_recall
434 | 
435 |         print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS"
436 |         acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(all_preds, all_true_y)
437 |         print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall
438 | 
439 |         print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
440 |         avg_acc = np.nanmean(per_task_accs)
441 |         avg_auc = np.nanmean(per_task_aucs)
442 |         avg_f1 = np.nanmean(per_task_f1)
443 |         avg_precision = np.nanmean(per_task_precision)
444 |         avg_recall = np.nanmean(per_task_recall)
445 |         print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall
446 | 
447 |         print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK"
448 |         if not self.users_as_tasks:
449 |             for t in range(self.n_tasks):
450 |                 task_name = self.test_tasks[t]['Name']
451 |                 task_name = helper.getFriendlyLabelName(task_name)
452 |                 print "\t\t", task_name, "- Acc:", per_task_accs[t], "AUC:", per_task_aucs[t], 'F1:', per_task_f1[t], 'Precision:', per_task_precision[t], 'Recall:', per_task_recall[t]
453 | 
454 |         if self.test_csv_filename is not None:
455 |             print "\tSAVING HELD OUT PREDICTIONS"
456 |             if self.users_as_tasks:
457 |                 task_column = 'user_id'
458 |                 label_name = helper.getFriendlyLabelName(self.file_prefix)
459 |                 wanted_label = helper.getOfficialLabelName(label_name)
460 |                 predictions_df = helper.get_test_predictions_for_df_with_task_column(
461 |                     self.classifier.predictBinary, self.test_csv_filename, task_column, self.test_tasks,
462 |                     wanted_label=wanted_label, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
463 |                     label_name=label_name, tasks_are_ints=False)
464 |             else:
465 |                 predictions_df = helper.get_test_predictions_for_df_with_no_task_column(self.classifier.predictBinary,
466 |                     self.test_csv_filename, self.test_tasks, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
467 |             predictions_df.to_csv(self.results_path + "Preds-" + self.save_prefix + '.csv')
468 |         else:
469 |             print "Uh oh, the test csv filename was not set, can't save test preds"
470 | 
471 |         print "\t SAVING CLASSIFIER"
472 |         with open(self.results_path + "PickledModel-" + self.save_prefix + '.p', "w") as f:
473 |             pickle.dump(self.classifier, f)
474 | 
475 |     def saveHintonPlot(self, matrix, num_tests, max_weight=None, ax=None):
476 |         """Draw Hinton diagram for visualizing a weight matrix."""
477 |         fig, ax = plt.subplots(1, 1)
478 | 
479 |         if not max_weight:
480 |             max_weight = 2**np.ceil(np.log(np.abs(matrix).max())/np.log(2))
481 | 
482 |         ax.patch.set_facecolor('gray')
483 |         ax.set_aspect('equal', 'box')
484 |         ax.xaxis.set_major_locator(plt.NullLocator())
485 |         ax.yaxis.set_major_locator(plt.NullLocator())
486 | 
487 |         for (x, y), w in np.ndenumerate(matrix):
488 |             color = 'white' if w > 0 else 'black'
489 |             size = np.sqrt(np.abs(0.5*w/num_tests))  # Need to scale so that it is between 0 and 0.5
490 |             rect = plt.Rectangle([x - size / 2, y - size / 2], size, size,
491 |                                  facecolor=color, edgecolor=color)
492 |             ax.add_patch(rect)
493 | 
494 |         ax.autoscale_view()
495 |         ax.invert_yaxis()
496 |         plt.savefig(self.figures_path + self.save_prefix + '-Hinton.eps')
497 |         plt.close()
498 | 
499 |     def plotAccuracyAucAndClusters(self, accs, aucs, clusters):
500 |         fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10,10))
501 |         ax1.hist(accs)
502 |         ax1.set_title("Accuracy")
503 |         ax2.hist(aucs)
504 |         ax2.set_title("AUC")
505 |         ax3.hist(clusters)
506 |         ax3.set_title("Number of Clusters (K)")
507 |         plt.savefig(self.figures_path + self.save_prefix + '-AccAucClusters.eps')
508 |         plt.close()
509 | 
510 |     def saveConvergencePlots(self):
511 |         hblr.plotConvergence(self.classifier.xi_convergence_list, 'Xi convergence', save_path=self.figures_path + self.save_prefix + '-ConvergenceXi.eps')
512 |         hblr.plotConvergence(self.classifier.theta_convergence_list, 'Theta convergence', save_path=self.figures_path + self.save_prefix + '-ConvergenceTheta.eps')
513 |         hblr.plotConvergence(self.classifier.phi_convergence_list, 'Phi convergence', save_path=self.figures_path + self.save_prefix + '-ConvergencePhi.eps')
514 | 
515 |     def saveImagePlot(self, matrix, name):
516 |         plt.figure()
517 |         plt.imshow(matrix)
518 |         plt.savefig(self.figures_path + self.save_prefix + "-" + name + ".eps")
519 |         plt.close()
520 | 
521 |     def updateSameTaskMatrix(self, same_task_matrix):
522 |         most_likely_cluster = np.argmax(self.classifier.phi, axis=1)
523 |         for row_task in range(self.n_tasks):
524 |             for col_task in range(self.n_tasks):
525 |                 if most_likely_cluster[row_task] == most_likely_cluster[col_task]:
526 |                     same_task_matrix[row_task, col_task] += 1
527 |         return same_task_matrix
528 | 
529 | 
Will save and print per-task results" 551 | 552 | if len(sys.argv) >= 4 and sys.argv[3] == 'True': 553 | cont = True 554 | print "Okay, will continue from a previously saved validation results file for this problem" 555 | else: 556 | cont = False 557 | print "" 558 | 559 | if len(sys.argv) >= 5: 560 | csv_test_file = sys.argv[4] 561 | print "Okay, will get final test results on file", csv_test_file 562 | print "" 563 | else: 564 | csv_test_file = None 565 | 566 | wrapper = HBLRWrapper(filename, users_as_tasks=users_as_tasks, cont=cont, test_csv_filename=csv_test_file) 567 | 568 | print "\nThe following parameter settings will be tested:" 569 | print "\ttau10: \t", wrapper.tau10s 570 | print "\ttau20: \t", wrapper.tau20s 571 | print "\tsigma multipliers: \t", wrapper.sigma_multipliers 572 | print "\tmu multipliers: \t", wrapper.mu_multipliers 573 | 574 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv' 575 | print "\nThe validation and testing figures will be saved in:", wrapper.figures_path + wrapper.save_prefix 576 | 577 | wrapper.run() 578 | -------------------------------------------------------------------------------- /HBLR/HBLR_Distribution.py: -------------------------------------------------------------------------------- 1 | from scipy import stats 2 | import numpy as np 3 | 4 | 5 | # 6 | 7 | class SubDistribution(stats.rv_continuous): 8 | def __init__(self, gauss_weight, point_dist_weight, point_centers, point_weights, mu=0, sigma=1): 9 | super(SubDistribution, self).__init__() 10 | self.gauss_weight = gauss_weight 11 | self.point_dist_weight = point_dist_weight 12 | self.point_centers = point_centers 13 | self.point_weights = point_weights 14 | 15 | self.normal_dist = stats.norm(loc=mu, scale=sigma) # scale is standard deviation 16 | 17 | def _pdf(self, x): 18 | gauss_pdf = self.normal_dist.pdf(x) 19 | point_pdf = np.array( 20 | [self.point_weights[self.point_centers.index(x_i)] if x_i in self.point_centers else 0 for x_i in x]) 21 | 22 | return self.gauss_weight * gauss_pdf + self.point_dist_weight * point_pdf 23 | 24 | def _cdf(self, x): 25 | gauss_cdf = self.normal_dist.cdf(x) 26 | point_cdf = np.array( 27 | [sum(w for p_c, w in zip(self.point_centers, self.point_weights) if p_c < x_i) for x_i in x]) 28 | 29 | return self.gauss_weight * gauss_cdf + self.point_dist_weight * point_cdf 30 | 31 | 32 | class MainDistribution(): 33 | def __init__(self, gauss_weight, point_dist_weight, point_centers_matrix, point_weights, mu=0, sigma=1): 34 | self.gauss_weight = gauss_weight # (tau1/tau2)/(M+(tau1/tau2)) 35 | self.point_dist_weight = point_dist_weight # (1/(M+(tau1/tau2)) 36 | self.point_centers_matrix = point_centers_matrix # theta_ks; size K by num_feats 37 | self.point_weights = point_weights # sum_m=1^M phi_m,k; size K 38 | self.mu = mu 39 | self.sigma = sigma 40 | self.dists = [] # list of distributions, assumed to be independent of one another; size of num_feats 41 | self.set_up_dists() 42 | 43 | def set_up_dists(self): 44 | # print [self.point_centers_matrix[j][0] for j in range(len(self.point_centers_matrix))] 45 | # print [self.point_centers_matrix[j][1] for j in range(len(self.point_centers_matrix))] 46 | self.dists = [SubDistribution(self.gauss_weight, self.point_dist_weight, 47 | [self.point_centers_matrix[j][i] for j in range(len(self.point_centers_matrix))], 48 | self.point_weights, self.mu, self.sigma) for i in 49 | range(len(self.point_centers_matrix[0]))] 50 | 51 | def rvs(self, size): 52 | 
51 |     def rvs(self, size):
52 |         random_sample = []
53 |         for i in range(size):
54 |             random_sample.append([d.rvs() for d in self.dists])
55 | 
56 |         return random_sample
57 | 
58 |     def marginal_pdfs(self, x):
59 |         return [d.pdf(x) for d in self.dists]
60 | 
61 |     def marginal_cdfs(self, x):
62 |         return [d.cdf(x) for d in self.dists]
63 | 
64 | 
65 | def test_single_distribution():
66 |     import matplotlib.pyplot as plt
67 |     from matplotlib.colors import LogNorm
68 |     from mpl_toolkits.mplot3d import axes3d
69 |     from matplotlib import cm
70 | 
71 |     distribution = SubDistribution(.5, .5, [0, 2.39994], [.2, .8], mu=.5)
72 | 
73 |     x_vals = np.linspace(-3, 3, 100001)
74 |     plt.plot(x_vals, distribution.pdf(x_vals))
75 |     plt.show()
76 | 
77 |     plt.plot(x_vals, distribution.cdf(x_vals))
78 |     plt.show()
79 | 
80 |     samples = distribution.rvs(size=1000)
81 | 
82 |     print "Percent of samples at 0", sum(abs(samples) < 1e-13) / float(len(samples))
83 |     print "Percent of samples at 2", sum(abs(samples - 2.39994) < 1e-13) / float(len(samples))
84 | 
85 |     plt.hist(samples, bins=np.arange(-4, 4, .25), normed=True)
86 |     plt.plot(x_vals, distribution.pdf(x_vals))
87 |     plt.show()
88 | 
89 | 
90 | def test_full_distribution():
91 |     import matplotlib.pyplot as plt
92 |     from matplotlib.colors import LogNorm
93 |     from mpl_toolkits.mplot3d import axes3d
94 |     from matplotlib import cm
   |     import numpy.matlib  # needed for the np.matlib.repmat calls below
95 | 
96 |     distribution = MainDistribution(.5, .5, [[0, .5], [0, -2.0], [-3.0, 3.0]], [.2, .6, .2])
97 | 
98 |     x_vals = np.linspace(-5, 5, 1001)
99 |     pdfs = distribution.marginal_pdfs(x_vals)
100 |     cdfs = distribution.marginal_cdfs(x_vals)
101 |     plt.figure()
102 |     plt.subplot(2, 1, 1)
103 |     plt.plot(x_vals, pdfs[0])
104 |     plt.subplot(2, 1, 2)
105 |     plt.plot(x_vals, pdfs[1])
106 |     plt.show()
107 | 
108 |     plt.figure()
109 |     plt.subplot(2, 1, 1)
110 |     plt.plot(x_vals, cdfs[0])
111 |     plt.subplot(2, 1, 2)
112 |     plt.plot(x_vals, cdfs[1])
113 |     plt.show()
114 | 
115 |     samples = distribution.rvs(size=3000)
116 |     plt.figure()
117 |     plt.subplot(2, 1, 1)
118 |     plt.hist([s[0] for s in samples])
119 |     plt.subplot(2, 1, 2)
120 |     plt.hist([s[1] for s in samples])
121 |     plt.show()
122 | 
123 |     # normal distribution center at x=0 and y=5
124 | 
125 |     plt.hist2d([s[0] for s in samples], [s[1] for s in samples], bins=40, norm=LogNorm())
126 |     plt.colorbar()
127 |     plt.show()
128 | 
129 |     joint_pdf = np.atleast_2d(pdfs[0]) * np.atleast_2d(pdfs[1]).T  # outer product of the two marginal pdfs (dimensions are independent)
130 | 
131 |     fig = plt.figure()
132 |     ax = fig.add_subplot(111, projection='3d')
133 |     # Plot a basic wireframe.
134 |     # ax.plot_wireframe(np.matlib.repmat(x_vals,1001,1), np.matlib.repmat(np.atleast_2d(x_vals).T,1,1001), joint_pdf, rstride=10, cstride=10)
135 |     surf = ax.plot_surface(np.matlib.repmat(x_vals, 1001, 1), np.matlib.repmat(np.atleast_2d(x_vals).T, 1, 1001),
136 |                            joint_pdf, cmap=cm.coolwarm, linewidth=0, antialiased=False)
137 | 
138 |     plt.show()
139 | 
140 | 
--------------------------------------------------------------------------------
/LSSVM/LSSVM.py:
--------------------------------------------------------------------------------
1 | """Implements a Least Squares Support Vector Machine (LS-SVM)."""
2 | from sklearn.metrics.pairwise import rbf_kernel
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | 
6 | 
7 | class LSSVM:
8 |     def __init__(self, C, kernel_func, debug=False):
9 |         ''' Least-squares svm: svm with squared loss and binary classification yi in {-1,1}
10 |         C: regularization constant weighting the squared error term
11 |         kernel_func: function that takes 2 arguments (X1, X2) and returns kernel matrix of size (len(X1),len(X2))
12 |         '''
13 | 
14 |         self.C = C
15 |         self.kernel_func = kernel_func
16 | 
17 |         self.data = None
18 |         self.y = None
19 |         self.b = None
20 |         self.alphas = None
21 | 
22 |         self.debug = debug
23 | 
24 |     def fit(self, X, y):
25 |         ''' Solves the linear system
26 |             A x = b
27 |             |0    Y.T            | * | b     | = |0|
28 |             |Y    Omega+(1/C)*I  |   | alpha |   |1|
29 | 
30 |         Note Omega[i,j] = y[i]*y[j]*K(x[i],x[j])
31 |         '''
32 |         self.data = X
33 | 
34 | 
35 |         # Make sure y is the right dimension
36 |         y = np.atleast_2d(y)
37 |         if np.shape(y)[0] == 1:
38 |             y = y.T
39 | 
40 |         self.y = y
41 | 
42 |         N = len(X)
43 | 
44 |         K = self.kernel_func(self.data, self.data)
45 | 
46 |         Omega = np.dot(self.y, self.y.T)*K
47 |         bottom_right = Omega + (1.0/self.C)*np.eye(N)
48 | 
49 |         assert np.shape(bottom_right) == (N,N), "The bottom right matrix is the wrong size"
50 | 
51 |         if self.debug:
52 |             print "K", K
53 |             print "K nans", np.sum(np.isnan(K))
54 | 
55 | 
56 | 
57 |         first_row = np.hstack([np.zeros((1,1)), self.y.T])
58 |         bottom_mat = np.hstack([self.y, bottom_right])
59 |         A = np.vstack([first_row, bottom_mat])
60 | 
61 |         b_vec = np.vstack([0, np.ones((N,1))])
62 | 
63 |         try:
64 |             params, residuals, rank, s = np.linalg.lstsq(A, b_vec)
65 |         except:
66 |             print "\n------WARNING!!! These parameters didn't converge!------\n"
67 |             return False
68 | 
69 |         self.b = params[0]
70 |         self.alphas = params[1:]
71 | 
72 |         return True
73 | 
74 | 
75 |     def predict(self, test_data):
76 |         assert (self.b is not None) and (self.alphas is not None), "Model not trained yet"
77 | 
78 |         K = self.kernel_func(self.data, test_data)
79 | 
80 |         alphaY = self.alphas*self.y
81 | 
82 |         y_hat = np.sign(np.dot(alphaY.T, K)+self.b)
83 | 
84 |         return y_hat[0]
85 | 
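   | # Usage sketch (hypothetical data; labels must be in {-1, 1}):
   | #     clf = LSSVM(C=1.0, kernel_func=lambda X1, X2: np.dot(X1, X2.T))
   | #     if clf.fit(train_X, train_y):
   | #         y_hat = clf.predict(test_X)  # array of -1/+1 predictions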
--------------------------------------------------------------------------------
/LSSVM/LSSVMWrapper.py:
--------------------------------------------------------------------------------
1 | """Performs hyperparameter sweep for the Least Squares Support Vector Machine
2 | (LS-SVM)."""
3 | import matplotlib
4 | matplotlib.use('Agg')
5 | import numpy as np
6 | import pandas as pd
7 | import sys
8 | import os
9 | import copy
10 | from time import time
11 | from sklearn.metrics.pairwise import rbf_kernel
12 | 
13 | CODE_PATH = os.path.dirname(os.getcwd())
14 | sys.path.append(CODE_PATH)
15 | 
16 | DEFAULT_RESULTS_PATH = '/Your/path/here/'
17 | DEFAULT_DATASETS_PATH = '/Your/path/here/'
18 | DEFAULT_FIGURES_PATH = '/Your/path/here/'
   | PATH_TO_DROPBOX = '/Your/path/here/'  # assumed default for the generic wrapper's dropbox_path argument
19 | 
20 | from generic_wrapper import STLWrapper
21 | import helperFuncs as helper
22 | import LSSVM as lssvm
23 | 
24 | C_VALS = [0.1, 1.0, 10.0, 100.0]   #values for the C parameter of SVM to test
25 | BETA_VALS = [.0001, .01, .1, 1]    #values for the Beta parameter of rbf kernel to test
26 | KERNELS = ['linear', 'rbf']        #could also try 'poly' and 'sigmoid'
27 | DEFAULT_VALIDATION_TYPE = 'cross'  #'cross' for cross-validation, 'val' for single validation
28 | VERBOSE = True                     #set to true to see more output
29 | NUM_BOOTSTRAPS = 5
30 | DEFAULT_NUM_CROSS_FOLDS = 5
31 | SAVE_RESULTS_EVERY_X_TESTS = 1
32 | 
33 | def reload_dependencies():
34 |     reload(helper)
35 |     reload(lssvm)
36 | 
37 | class LSSVMWrapper(STLWrapper):
38 |     def __init__(self, file_prefix, users_as_tasks=False, cont=False, c_vals=C_VALS, beta_vals=BETA_VALS,
39 |                  kernels=KERNELS, num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, dropbox_path=PATH_TO_DROPBOX,
40 |                  datasets_path='Data/', test_csv_filename=None):
41 |         self.c_vals = c_vals
42 |         self.beta_vals = beta_vals
43 |         self.kernels = kernels
44 | 
45 |         STLWrapper.__init__(self, file_prefix, users_as_tasks=users_as_tasks, cont=cont,
46 |             classifier_name='LSSVM', num_cross_folds=num_cross_folds, dropbox_path=dropbox_path,
47 |             datasets_path=datasets_path, cant_train_with_one_class=True,
48 |             save_results_every_nth=SAVE_RESULTS_EVERY_X_TESTS, test_csv_filename=test_csv_filename)
49 | 
50 |         self.trim_extra_linear_params()
51 | 
52 |         self.models = [None] * self.n_tasks
53 | 
54 |     def define_params(self):
55 |         self.params = {}
56 |         self.params['C'] = self.c_vals
57 |         self.params['beta'] = self.beta_vals
58 |         self.params['kernel'] = self.kernels
59 | 
60 |     def train_and_predict_task(self, t, train_X, train_y, eval_X, param_dict):
61 |         kernel_func = self.get_kernel_func(param_dict['kernel'], param_dict['beta'])
62 |         self.models[t] = lssvm.LSSVM(C=param_dict['C'], kernel_func=kernel_func)
63 |         converged = self.models[t].fit(train_X, train_y)
64 | 
65 |         if converged:
66 |             preds = self.models[t].predict(eval_X)
67 |         else:
68 |             # predict majority class
69 |             preds = np.sign(np.mean(train_y))*np.ones(len(eval_X))
70 | 
71 |         return preds
72 | 
73 |     def predict_task(self, X, t):
74 |         if self.models[t] is None:
75 |             print "ERROR! No model has been trained!"
76 | 
77 |         preds = self.models[t].predict(X)
78 |         return (preds + 1.0) / 2  # map {-1, 1} predictions to {0, 1}
79 | 
80 |     # use something like the following to test only one set of parameters:
81 |     # wrapper.setParams(c_vals=[10], beta_vals=[.01], kernels=['rbf'])
82 |     def set_params(self, c_vals=None, beta_vals=None, kernels=None):
83 |         '''does not override existing parameter settings if the parameter is not set'''
84 |         self.c_vals = c_vals if c_vals is not None else self.c_vals
85 |         self.beta_vals = beta_vals if beta_vals is not None else self.beta_vals
86 |         self.kernels = kernels if kernels is not None else self.kernels
87 |         self.define_params()
88 | 
89 |     def get_kernel_func(self, kernel_name, beta):
90 |         if kernel_name == 'rbf':
91 |             def rbf(x1, x2):
92 |                 return rbf_kernel(x1, x2, gamma=beta)  # from sklearn
93 |             return rbf
94 |         else:
95 |             def dot_product(x1, x2):
96 |                 return np.dot(x1, x2.T)
97 |             return dot_product
98 | 
99 |     def trim_extra_linear_params(self):
100 |         single_beta = None
101 |         i = 0
102 |         while i < len(self.list_of_param_settings):
103 |             setting = self.list_of_param_settings[i]
104 |             if setting['kernel'] == 'linear':
105 |                 if single_beta is None:
106 |                     single_beta = setting['beta']
107 |                 elif setting['beta'] != single_beta:
108 |                     self.list_of_param_settings.remove(setting)
109 |                     continue
110 |             i += 1
111 | 
112 | if __name__ == "__main__":
113 |     print "LSSVM MODEL SELECTION"
114 |     print "\tThis code will sweep a set of parameters to find the ideal settings for LS SVM for a single dataset"
115 | 
116 |     if len(sys.argv) < 3:
117 |         print "Error: usage is python LSSVMWrapper.py <file prefix> <users as tasks> <continue> <test csv filename>"
118 |         print "\t<file prefix>: e.g. datasetTaskList-Discard-Future-Group_ - program will look in the following directory for this file", DEFAULT_DATASETS_PATH
119 |         print "\t<users as tasks>: type 'users' for users as tasks, or 'wellbeing' for wellbeing measures as tasks"
120 |         print "\t<continue>: optional. If 'True', the wrapper will pick up from where it left off by loading a previous validation results file"
121 |         print "\t<test csv filename>: optional. If you want to get the final test results, provide the name of a csv file to test on"
122 |         sys.exit()
123 |     file_prefix = sys.argv[1]  #get data file from command line argument
124 |     print "\nLoading dataset", DEFAULT_DATASETS_PATH + file_prefix
125 |     print ""
126 | 
127 |     if sys.argv[2] == 'users':
128 |         users_as_tasks = True
129 |         print "Okay, treating users as tasks. Will not print per-task results"
130 |     else:
131 |         users_as_tasks = False
132 |         print "Okay, treating wellbeing measures as tasks. Will save and print per-task results"
Will save and print per-task results" 133 | 134 | if len(sys.argv) >= 4 and sys.argv[3] == 'True': 135 | cont = True 136 | print "Okay, will continue from a previously saved validation results file for this problem" 137 | else: 138 | cont = False 139 | print "" 140 | 141 | if len(sys.argv) >= 5: 142 | csv_test_file = sys.argv[4] 143 | print "Okay, will get final test results on file", csv_test_file 144 | print "" 145 | else: 146 | csv_test_file = None 147 | 148 | wrapper = LSSVMWrapper(file_prefix, users_as_tasks=users_as_tasks, cont=cont, 149 | test_csv_filename=csv_test_file) 150 | 151 | print "\nThe following parameter settings will be tested:" 152 | print "\tC_VALS: \t", wrapper.c_vals 153 | print "\tBETAS: \t", wrapper.beta_vals 154 | print "\tKERNELS: \t", wrapper.kernels 155 | 156 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv' 157 | print "\nThe validation and testing figures will be saved in:", wrapper.figures_path + wrapper.save_prefix 158 | 159 | wrapper.run() 160 | 161 | -------------------------------------------------------------------------------- /LogisticRegression/LR.py: -------------------------------------------------------------------------------- 1 | """Simple Logistic Regression (LR) classifier.""" 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import roc_curve, auc 7 | import sys 8 | import os 9 | import pickle 10 | 11 | CODE_PATH = os.path.dirname(os.getcwd()) 12 | sys.path.append(CODE_PATH) 13 | import helperFuncs as helper 14 | 15 | def reloadHelper(): 16 | reload(helper) 17 | 18 | # http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html 19 | class LR: 20 | def __init__(self, penalty='l2', C=0.01, tol=0.001, solver= 'liblinear'): 21 | #data features 22 | self.n_features = None 23 | self.train_X = [] 24 | self.train_Y = [] 25 | self.val_X = [] 26 | self.val_Y = [] 27 | self.test_X = [] 28 | self.test_Y = [] 29 | 30 | #classifier features 31 | self.penalty = penalty 32 | self.C = C 33 | self.tolerance = tol 34 | self.solver = solver 35 | 36 | def setTrainData(self, X, Y): 37 | self.train_X = X 38 | self.train_Y = Y 39 | 40 | self.n_features = self.train_X.shape[1] 41 | 42 | def setTestData(self, X, Y): 43 | self.test_X = X 44 | self.test_Y = Y 45 | 46 | def setPenalty(self, penalty): 47 | self.penalty = penalty 48 | 49 | def setC(self, C): 50 | self.C = C 51 | 52 | def setSolver(self, solver): 53 | self.solver = solver 54 | 55 | def setValData(self, X, Y): 56 | self.val_X = X 57 | self.val_Y = Y 58 | 59 | def train(self): 60 | self.classifier = LogisticRegression(penalty=self.penalty, C=self.C, tol=self.tolerance, solver=self.solver) 61 | self.classifier.fit(self.train_X, self.train_Y) 62 | 63 | def predict(self, X): 64 | return self.classifier.predict(X) 65 | 66 | def getScore(self, X, Y): 67 | #returns accuracy 68 | return self.classifier.score(X, Y) 69 | 70 | def getFPRandTPR(self,X,Y): 71 | probas_ = self.classifier.fit(self.train_X, self.train_Y).predict_proba(X) 72 | fpr, tpr, thresholds = roc_curve(Y, probas_[:, 1]) 73 | return fpr, tpr 74 | 75 | def getAUC(self,X,Y): 76 | fpr, tpr = self.getFPRandTPR(X,Y) 77 | return auc(fpr,tpr) 78 | 79 | def saveClassifierToFile(self, filepath): 80 | s = pickle.dumps(self.classifier) 81 | f = open(filepath, 'w') 82 | f.write(s) 83 | 84 | def loadClassifierFromFile(self, filepath): 85 | f2 = 
open(filepath, 'r') 86 | s2 = f2.read() 87 | self.classifier = pickle.loads(s2) 88 | 89 | -------------------------------------------------------------------------------- /LogisticRegression/LRWrapper.py: -------------------------------------------------------------------------------- 1 | """Performs hyperparameter sweep for the logistic regression (LR) model.""" 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import numpy as np 5 | import pandas as pd 6 | import sys 7 | import os 8 | import copy 9 | from time import time 10 | from sklearn.metrics.pairwise import rbf_kernel 11 | 12 | CODE_PATH = os.path.dirname(os.getcwd()) 13 | sys.path.append(CODE_PATH) 14 | 15 | DEFAULT_RESULTS_PATH = '/Your/path/here/' 16 | DEFAULT_DATASETS_PATH = '/Your/path/here/' 17 | DEFAULT_FIGURES_PATH = '/Your/path/here/' PATH_TO_DROPBOX = '/Your/path/here/' #placeholder root path passed to STLWrapper below 18 | 19 | from generic_wrapper import STLWrapper 20 | import helperFuncs as helper 21 | import LR as lr 22 | 23 | #Parameter values 24 | C_VALS = [ 0.001, 0.01, 0.1, 1.0, 10.0, 100.0] 25 | PENALTIES = ['l1', 'l2'] 26 | SOLVER = 'liblinear' #newton-cg, lbfgs, liblinear, sag 27 | DEFAULT_VALIDATION_TYPE = 'cross' #'cross' for cross-validation, 'val' for single validation 28 | DEFAULT_NUM_CROSS_FOLDS = 5 29 | NUM_BOOTSTRAPS = 5 30 | VERBOSE = True #set to true to see more output 31 | SAVE_RESULTS_EVERY_X_TESTS = 1 32 | 33 | def reload_dependencies(): 34 | reload(helper) 35 | reload(lr) 36 | 37 | class LRWrapper(STLWrapper): 38 | def __init__(self, file_prefix, users_as_tasks=False, cont=False, c_vals=C_VALS, 39 | penalties=PENALTIES, solver=SOLVER, num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, 40 | dropbox_path=PATH_TO_DROPBOX, datasets_path='Data/', 41 | test_csv_filename=None): 42 | self.c_vals = c_vals 43 | self.penalties = penalties 44 | self.solver = solver 45 | 46 | STLWrapper.__init__(self, file_prefix, users_as_tasks=users_as_tasks, cont=cont, 47 | classifier_name='LR', num_cross_folds=num_cross_folds, dropbox_path=dropbox_path, 48 | datasets_path=datasets_path, cant_train_with_one_class=False, 49 | save_results_every_nth=SAVE_RESULTS_EVERY_X_TESTS, test_csv_filename=test_csv_filename) 50 | 51 | self.models = [None] * self.n_tasks 52 | 53 | def define_params(self): 54 | self.params = {} 55 | self.params['C'] = self.c_vals 56 | self.params['penalty'] = self.penalties 57 | 58 | def train_and_predict_task(self, t, train_X, train_y, eval_X, param_dict): 59 | self.models[t] = lr.LR(penalty=param_dict['penalty'], C=param_dict['C'], solver=self.solver) 60 | self.models[t].setTrainData(train_X, train_y) 61 | self.models[t].train() 62 | preds = self.models[t].predict(eval_X) 63 | 64 | return preds 65 | 66 | def predict_task(self, X, t): 67 | if self.models[t] is None: 68 | print "ERROR! No model has been trained!" 
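# Note: predict() returns labels in {-1, +1} (assuming training labels were
# coded as -1/+1, as in the other models in this repo); the (preds + 1.0) / 2
# transform below maps them to {0, 1}, e.g.
#   np.array([-1., 1., 1.])  ->  array([ 0.,  1.,  1.])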
69 | 70 | preds = self.models[t].predict(X) 71 | return (preds + 1.0) / 2 72 | 73 | # use something like the following to test only one set of parameters: 74 | # wrapper.set_params(c_vals=[10], penalties=['l2']) 75 | def set_params(self, c_vals=None, penalties=None, solver=None): 76 | '''does not override existing parameter settings if the parameter is not set''' 77 | self.c_vals = c_vals if c_vals is not None else self.c_vals 78 | self.penalties = penalties if penalties is not None else self.penalties 79 | self.solver = solver if solver is not None else self.solver 80 | self.define_params() 81 | 82 | 83 | if __name__ == "__main__": 84 | print "LOGISTIC REGRESSION (LR) MODEL SELECTION" 85 | print "\tThis code will sweep a set of parameters to find the ideal settings for LR for a single dataset" 86 | 87 | if len(sys.argv) < 3: 88 | print "Error: usage is python LRWrapper.py <file_prefix> <task_type> <continue> <csv_test_file>" 89 | print "\t<file_prefix>: e.g. datasetTaskList-Discard-Future-Group_ - program will look in the following directory for this file", DEFAULT_DATASETS_PATH 90 | print "\t<task_type>: type 'users' for users as tasks, or 'wellbeing' for wellbeing measures as tasks" 91 | print "\t<continue>: optional. If 'True', the wrapper will pick up from where it left off by loading a previous validation results file" 92 | print "\t<csv_test_file>: optional. If you want to get the final test results, provide the name of a csv file to test on" 93 | sys.exit() 94 | file_prefix = sys.argv[1] #get data file from command line argument 95 | print "\nLoading dataset", DEFAULT_DATASETS_PATH + file_prefix 96 | print "" 97 | 98 | if sys.argv[2] == 'users': 99 | users_as_tasks = True 100 | print "Okay, treating users as tasks. Will not print per-task results" 101 | else: 102 | users_as_tasks = False 103 | print "Okay, treating wellbeing measures as tasks. Will save and print per-task results" 104 | 105 | if len(sys.argv) >= 4 and sys.argv[3] == 'True': 106 | cont = True 107 | print "Okay, will continue from a previously saved validation results file for this problem" 108 | else: 109 | cont = False 110 | print "" 111 | 112 | if len(sys.argv) >= 5: 113 | csv_test_file = sys.argv[4] 114 | print "Okay, will get final test results on file", csv_test_file 115 | print "" 116 | else: 117 | csv_test_file = None 118 | 119 | wrapper = LRWrapper(file_prefix, users_as_tasks=users_as_tasks, cont=cont, 120 | test_csv_filename=csv_test_file) 121 | 122 | print "\nThe following parameter settings will be tested:" 123 | print "\tC_VALS: \t", wrapper.c_vals 124 | print "\tPENALTIES: \t", wrapper.penalties 125 | 126 | print "\nOptimization will be performed with the following solver:" 127 | print "\tSolver: \t", wrapper.solver 128 | 129 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv' 130 | print "\nThe validation and testing figures will be saved in:", wrapper.figures_path + wrapper.save_prefix 131 | 132 | wrapper.run() 133 | 134 | -------------------------------------------------------------------------------- /MTMKL/MTMKL.py: -------------------------------------------------------------------------------- 1 | """Implements Multi-task Multi-kernel Learning (MTMKL) 2 | 3 | This multi-task learning (MTL) classifier learns a set of kernels for different 4 | groups of features (or feature modalities). Each task learns to combine these 5 | kernels with a different set of weights. The weights are regularized globally 6 | to share information among the tasks. 
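Concretely, each task t scores points with a convex combination of
per-modality kernels (see constructKernelFunction and createConstraintList
below): k_t(x, x') = sum_m eta[t, m] * k_m(x_m, x'_m) / max|K_m|, with
eta[t, m] >= 0 and sum_m eta[t, m] = 1, where x_m is the block of features
belonging to modality m and K_m is the Gram matrix of kernel k_m (each
kernel is normalized by its largest absolute entry).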
7 | 8 | This model was originally proposed in: 9 | Kandemir, M., Vetek, A., Goenen, M., Klami, A., & Kaski, S. (2014). 10 | Multi-task and multi-view learning of user state. Neurocomputing, 139, 97-106. 11 | """ 12 | import numpy as np 13 | import scipy.optimize as opt 14 | import scipy.linalg as la 15 | from scipy import interp 16 | import math 17 | 18 | from sklearn.metrics import roc_curve, auc 19 | from sklearn.metrics import roc_auc_score 20 | from sklearn.metrics.pairwise import rbf_kernel, euclidean_distances, cosine_similarity from sklearn.svm import SVC #used by getAUC and getAUCOneTask below 21 | import numpy.linalg as LA 22 | 23 | import pandas as pd 24 | import sys 25 | import os 26 | import random 27 | import pickle 28 | import copy 29 | import operator 30 | import datetime 31 | 32 | from scipy.optimize import minimize 33 | 34 | CODE_PATH = os.path.dirname(os.getcwd()) 35 | sys.path.append(CODE_PATH) 36 | 37 | import helperFuncs as helper 38 | from LSSVM import LSSVM 39 | 40 | def reloadFiles(): 41 | reload(helper) 42 | print "Cannot reload LSSVM because of the way it was imported" 43 | 44 | 45 | reloadFiles() 46 | 47 | DEBUG = False 48 | VERBOSE = False 49 | 50 | class MTMKL: 51 | def __init__(self, task_dict_list, C=100.0, V=0.1, kernel_name='rbf', kernel_param=.01, regularizer=None, max_iter=50, 52 | max_iter_internal=-1, tol=0.001, eta_filename=None, debug=DEBUG, verbose=VERBOSE, drop20PercentTrainingData=False): 53 | '''INPUTS: 54 | task_dict_list: a particular format, defined here: https://docs.google.com/document/d/1BlMaluZnPTa0oznWrfy5sku44ydunv_kalGfG_yz49c/edit?usp=sharing''' 55 | #possible kernels: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable 56 | 57 | #data features 58 | self.train_tasks = task_dict_list 59 | self.val_tasks = None 60 | self.test_tasks = None 61 | 62 | self.modality_names, self.modality_start_indices = self.getModalityNamesIndices(task_dict_list) 63 | self.modality_start_indices.append(np.shape(task_dict_list[0]['X'])[1]) #append the number of columns 64 | 65 | self.n_tasks = len(self.train_tasks) #number of tasks 66 | self.n_views = len(self.modality_names) #number of views (one view can be one sensor or feature set, e.g. physiology features) 67 | self.eta = np.array([[1.0/self.n_views] * self.n_views] * self.n_tasks) #a matrix of size number of tasks x number of sensors 68 | self.last_eta = self.eta 69 | 70 | self.eta_filename = eta_filename 71 | if eta_filename is not None: 72 | eta_file = open(self.eta_filename,'w') 73 | #eta_file.write("//Eta matrix") 74 | eta_file.close() 75 | self.save_etas = True 76 | else: 77 | self.save_etas = False 78 | 79 | #MTMKL parameters 80 | self.V = V #V is a weight placed on the regularization. Small corresponds to unrelated tasks. 81 | #Large is enforcing similar kernel weights across tasks 82 | #Kandemir et al. 
recommends testing a range from 10^-4 to 10^4 83 | #V=0 is an independent, multi-kernel learner for each task 84 | self.C = C #C parameter for SVM classifiers 85 | self.regularizer = regularizer 86 | self.max_iter = max_iter #max iterations that MTMKL algorithm will run for 87 | self.regularizer_func= None 88 | self.regularizing_grad = None 89 | self.kernel_name = kernel_name 90 | self.setKernel(kernel_name, kernel_param) 91 | self.setRegularizer(regularizer) 92 | 93 | #internal SVM parameters 94 | self.max_iter_internal = max_iter_internal #max iterations for each scikit learn SVM within MTMKL 95 | self.tolerance = tol #convergence criteria for each scikit learn SVM within MTMKL 96 | 97 | self.classifiers = [0] * self.n_tasks 98 | 99 | self.debug=debug 100 | self.verbose=verbose 101 | self.drop20 = drop20PercentTrainingData 102 | 103 | if self.debug: print "MTMKL class has been initialized with", self.n_tasks, "tasks and", self.n_views, "sensors" 104 | 105 | @staticmethod 106 | def getModalityNamesIndices(task_dict_list): 107 | modality_dict = task_dict_list[0]['ModalityDict'] 108 | sorted_tuples = sorted(modality_dict.items(), key=operator.itemgetter(1)) 109 | names = [n for (n,i) in sorted_tuples] 110 | indices = [i for (n,i) in sorted_tuples] 111 | return names,indices 112 | 113 | def setTrainData(self, task_dict_list): 114 | self.train_tasks = task_dict_list 115 | 116 | def setTestData(self, task_dict_list): 117 | self.test_tasks = task_dict_list 118 | 119 | def setValData(self, task_dict_list): 120 | self.val_tasks = task_dict_list 121 | 122 | def setC(self, c): 123 | self.C = c 124 | 125 | def setV(self, V): 126 | self.V = V 127 | 128 | def setKernel(self, kernel_name, kernel_param): 129 | self.kernel_name = kernel_name 130 | if kernel_name == 'rbf': 131 | def rbf(x1,x2): 132 | return rbf_kernel(x1,x2, gamma=kernel_param) # from sklearn 133 | 134 | self.internal_kernel_func = rbf 135 | else: 136 | def dot_product(x1,x2): 137 | return cosine_similarity(x1,x2) # from sklearn - a normalized version of dot product #np.dot(x1,x2.T) 138 | self.internal_kernel_func = dot_product 139 | 140 | def setRegularizer(self,regularizer): 141 | self.regularizer = regularizer 142 | if regularizer == 'L1': 143 | self.regularizer_func = self.eta_L1 144 | self.regularizing_grad = self.eta_grad_L1 145 | else: 146 | self.regularizer_func = self.eta_L2 147 | self.regularizing_grad = self.eta_grad_L2 148 | 149 | def setAllSettings(self, c, v, kernel, beta, regularizer): 150 | self.setC(c) 151 | self.setV(v) 152 | self.setKernel(kernel,beta) 153 | self.setRegularizer(regularizer) 154 | 155 | #kernel will know which column indices belong to which sensor 156 | def constructKernelFunction(self, task): 157 | task_eta = self.eta[task,:] 158 | 159 | def overallKernel(X1,X2): #change to static 160 | K = np.zeros((len(X1),len(X2))) 161 | 162 | for m in range(self.n_views): 163 | sub_x1 = X1[:,self.modality_start_indices[m]:self.modality_start_indices[m+1]] 164 | sub_x2 = X2[:,self.modality_start_indices[m]:self.modality_start_indices[m+1]] 165 | 166 | internal_K = self.internal_kernel_func(sub_x1,sub_x2) 167 | 168 | K = K + task_eta[m] * internal_K/np.max(abs(internal_K)) 169 | 170 | return K 171 | 172 | return overallKernel 173 | 174 | def eta_L1(self): 175 | return -self.V*np.sum(np.dot(self.eta,self.eta.T)) 176 | 177 | def eta_L2(self): 178 | # Note that V should be positive 179 | return self.V*np.sum(euclidean_distances(self.eta,squared=True)) 180 | 181 | def eta_grad_L1(self, eta_mat,v,task_index): 182 | return 
-v*np.sum(eta_mat,axis=0) 183 | 184 | def eta_grad_L2(self, eta_mat,v,task_index): 185 | # Note that V should be positive 186 | return 2*v*np.sum(eta_mat[task_index,:]-eta_mat,axis=0) 187 | 188 | def computeObjectiveFunction(self,eta_from_fmin): 189 | eta_from_fmin = eta_from_fmin.reshape(self.n_tasks,-1) 190 | #if self.debug: print "eta:", eta_from_fmin 191 | if self.debug: print "sum eta per task:", np.sum(eta_from_fmin,axis=1) 192 | if self.save_etas: 193 | self.saveEtas() 194 | self.eta = eta_from_fmin 195 | 196 | #steps 1 and 2 of Kandemir algorithm 197 | for t in range(self.n_tasks): 198 | if self.debug: 199 | print "Training task", t 200 | print "etas have size", self.eta.shape 201 | sys.stdout.flush() 202 | 203 | X_t, Y_t = self.extractTaskData(self.train_tasks,t,drop20=self.drop20) 204 | 205 | overallKernel = self.constructKernelFunction(t) 206 | 207 | self.classifiers[t] = LSSVM.LSSVM(self.C,kernel_func=overallKernel) 208 | #SVC(C=self.C, kernel=overallKernel, probability=True, max_iter=self.max_iter_internal, tol=self.tolerance) 209 | converged = self.classifiers[t].fit(X_t, Y_t) 210 | assert converged 211 | 212 | 213 | 214 | # Compute the objective function 215 | obj_value = 0 216 | for t in range(self.n_tasks): 217 | X_t, Y_t = self.extractTaskData(self.train_tasks,t,drop20=self.drop20) 218 | 219 | alpha = self.classifiers[t].alphas 220 | 221 | overallKernel = self.constructKernelFunction(t) 222 | K = overallKernel(X_t,X_t) 223 | 224 | obj_value += sum(alpha)-(0.5*1.0/self.C)*sum(alpha**2) -(1.0/2.0)*(np.dot((alpha*Y_t).T,np.dot(K,alpha*Y_t))) 225 | 226 | # add regularizer 227 | obj_value += self.regularizer_func() 228 | 229 | if self.debug: 230 | print "obj function value:", obj_value 231 | print "Eta difference:",self.computeEtaDifference() 232 | print "Training ACC", self.predictAndGetAccuracy(self.train_tasks) 233 | print 234 | 235 | return obj_value 236 | 237 | 238 | # eta_mat has rows for tasks, columns for sensors 239 | def computeMatrixGradient(self,eta_from_fmin): 240 | update = np.zeros((self.n_tasks,self.n_views)) 241 | 242 | for t in range(self.n_tasks): 243 | X_t, Y_t = self.extractTaskData(self.train_tasks,t,drop20=self.drop20) 244 | 245 | alpha = self.classifiers[t].alphas 246 | alphaY = alpha*Y_t 247 | 248 | for m in range(self.n_views): #Used to be numSensors-1 249 | sub_x1 = X_t[:, self.modality_start_indices[m]:self.modality_start_indices[m+1]] 250 | sub_x2 = X_t[:, self.modality_start_indices[m]:self.modality_start_indices[m+1]] 251 | 252 | # Normalize the kernel, could also use k(i, j) = k (i, j) / sqrt(k(i,i) * k(j,j)) 253 | #note, the same procedure for finding the min of sub_x1 and sub_x2 that is used in 254 | #the overall kernel is not required here, since sub_x1 and sub_x2 are guaranteed 255 | #to be the same 256 | internal_K = self.internal_kernel_func(sub_x1,sub_x2) 257 | 258 | update[t,m] = -(1.0/2.0)*(np.dot(alphaY.T,np.dot(internal_K,alphaY))) 259 | 260 | grad_reg = self.regularizing_grad(eta_from_fmin.reshape(self.n_tasks,-1),self.V,t) 261 | 262 | update[t,:] = grad_reg + update[t,:] 263 | 264 | return update.flatten() 265 | 266 | def saveEtas(self): 267 | if self.eta_filename is not None: 268 | eta_file = open(self.eta_filename,'a') 269 | np.savetxt(eta_file,self.eta.flatten()) 270 | eta_file.close() 271 | 272 | def computeEtaDifference(self): 273 | max_diff = 0 274 | for t in range(self.n_tasks): 275 | last_eta_list = self.last_eta[t,:] 276 | eta_list = self.eta[t,:] 277 | 278 | norm = la.norm(last_eta_list - eta_list) 279 | 280 | if norm > 
max_diff: 281 | max_diff = norm 282 | return max_diff 283 | 284 | def createConstraintList(self): 285 | constraints = [] 286 | 287 | # Equality constraints 288 | for t in range(self.n_tasks): 289 | start = t*self.n_views 290 | end = (t+1)*self.n_views 291 | def fun_eq(x,start=start, end=end): 292 | res = np.array([np.sum(x[start:end])-1.0]) 293 | return res 294 | def jac_func(x,start=start,end=end): 295 | jac= np.zeros(self.n_tasks*self.n_views) 296 | jac[start:end] = 1.0 297 | return jac 298 | cons = {'type':'eq', 299 | 'fun':fun_eq, 300 | 'jac':jac_func} 301 | constraints.append(cons) 302 | 303 | # Inequality constraints 304 | for i in range(self.n_tasks*self.n_views): 305 | def jac_func(x,i=i): 306 | jac= np.zeros(self.n_tasks*self.n_views) 307 | jac[i] = 1.0 308 | return jac 309 | cons = {'type':'ineq', 310 | 'fun':lambda x,i=i: np.array([x[i]]), 311 | 'jac':jac_func} 312 | constraints.append(cons) 313 | 314 | return constraints 315 | 316 | def train(self): 317 | init_etas = self.eta.flatten() 318 | cons = self.createConstraintList() 319 | try: 320 | res = minimize(self.computeObjectiveFunction, init_etas, jac=self.computeMatrixGradient,constraints=cons, method='SLSQP', options={'disp': self.verbose,'maxiter':self.max_iter}) 321 | except: 322 | return False 323 | self.eta = res.x.reshape(self.n_tasks,-1) 324 | 325 | if self.verbose: 326 | print "Results of this run!" 327 | print "\t ETA", self.eta 328 | print "\t Training ACC", self.predictAndGetAccuracy(self.train_tasks) 329 | 330 | return True 331 | 332 | 333 | @staticmethod 334 | def extractTaskData(task_dict_list,t,drop20=False): 335 | X_t = task_dict_list[t]['X'] 336 | Y_t = (task_dict_list[t]['Y']).reshape(-1,1) 337 | 338 | if drop20: 339 | keep_indices = task_dict_list[t]['KeepIndices'] 340 | X_t = X_t[keep_indices] 341 | Y_t = Y_t[keep_indices] 342 | 343 | return X_t, Y_t 344 | 345 | def predict(self, task_dict_list): 346 | ''' input: task_dict_list in the usual format. Will not use the 'Y' key 347 | output: predictions for the y values for each task. 
So a list of lists, where each inner list 348 | is the y_hat values for a particular task''' 349 | Y_hat = [0] * len(task_dict_list) 350 | for t in range(len(task_dict_list)): 351 | Y_hat[t] = self.predictOneTask(task_dict_list,t) 352 | return Y_hat 353 | 354 | def predictOneTask(self, task_dict_list, t): 355 | X_t, y_t = self.extractTaskData(task_dict_list,t) 356 | if len(X_t) == 0: 357 | return None 358 | else: 359 | return self.internal_predict(X_t, int(t)) 360 | 361 | def internal_predict(self, X_t, t): 362 | return self.classifiers[t].predict(X_t).reshape(-1,1) 363 | 364 | def predict_01(self, X, t): 365 | preds = self.classifiers[t].predict(X).reshape(-1,1) 366 | return (preds + 1.0) / 2 367 | 368 | def getNumErrors(self, Y, Y_hat): 369 | #returns the number of misclassified points 370 | errors = np.where(Y * Y_hat < 0)[0] 371 | return len(errors) 372 | 373 | def getAccuracy(self, Y, Y_hat): 374 | score = self.getNumErrors(Y,Y_hat) 375 | return 1.0 - (float(score) / float(len(Y_hat))) 376 | 377 | def predictAndGetNumErrors(self,task_dict_list): 378 | Y_hat = self.predict(task_dict_list) 379 | return sum([self.getNumErrors(task_dict_list[t]['Y'], Y_hat[t]) for t in range(len(task_dict_list))]) 380 | 381 | def predictAndGetAccuracy(self,task_dict_list): 382 | Y_hat = self.predict(task_dict_list) 383 | accs = [] 384 | for t in range(len(task_dict_list)): 385 | accs.append(self.getAccuracy(task_dict_list[t]['Y'],Y_hat[t])) 386 | return np.mean(accs) 387 | 388 | def predictAndGetAccuracyOneTask(self,task_dict_list,t): 389 | Y_hat = self.predictOneTask(task_dict_list,t) 390 | return self.getAccuracy(task_dict_list[t]['Y'],Y_hat) 391 | 392 | def getAccuracyAucAllTasks(self, tasks): 393 | all_task_Y = [] 394 | all_preds = [] 395 | for t in range(len(tasks)): 396 | X_t, y_t = self.extractTaskData(tasks,t) 397 | if len(X_t) == 0: 398 | continue 399 | preds = self.internal_predict(X_t, int(t)) 400 | all_task_Y.extend(y_t) 401 | all_preds.extend(preds) 402 | auc = roc_auc_score(all_task_Y, all_preds) 403 | acc = helper.getBinaryAccuracy(all_preds,all_task_Y) 404 | return acc,auc 405 | 406 | def getAccuracyAucOnOneTask(self, task_list, task, debug=False): 407 | X_t, y_t = self.extractTaskData(task_list,task) 408 | if len(X_t) == 0: 409 | return np.nan, np.nan 410 | 411 | preds = self.internal_predict(X_t, int(task)) 412 | 413 | if debug: 414 | print "y_t:", y_t 415 | print "preds:", preds 416 | 417 | acc = helper.getBinaryAccuracy(preds,y_t) 418 | if len(y_t) > 1 and helper.containsEachSVMLabelType(y_t) and helper.containsEachSVMLabelType(preds): 419 | auc = roc_auc_score(y_t, preds) 420 | else: 421 | auc = np.nan 422 | 423 | return acc, auc 424 | 425 | def getAUC(self,test_tasks): 426 | mean_tpr = 0.0 427 | mean_fpr = np.linspace(0, 1, 100) 428 | for t in range(self.n_tasks): 429 | X_t, Y_t = self.extractTaskData(self.train_tasks,t) 430 | X_test_t, Y_test_t = self.extractTaskData(test_tasks, t) 431 | 432 | overallKernel = self.constructKernelFunction(t) 433 | 434 | self.classifiers[t] = SVC(C=self.C, kernel=overallKernel, probability=True, max_iter=self.max_iter_internal, tol=self.tolerance) 435 | probas_ = self.classifiers[t].fit(X_t, Y_t).predict_proba(X_test_t) 436 | fpr, tpr, thresholds = roc_curve(Y_test_t, probas_[:, 1]) 437 | 438 | mean_tpr += interp(mean_fpr, fpr, tpr) 439 | mean_tpr[0] = 0.0 440 | 441 | mean_tpr /= self.n_tasks 442 | mean_tpr[-1] = 1.0 443 | mean_auc = auc(mean_fpr, mean_tpr) 444 | 445 | return mean_auc, mean_fpr, mean_tpr 446 | 447 | def getAUCOneTask(self,test_tasks,t): 448 | 449 | 450 | X_t, Y_t = 
self.extractTaskData(self.train_tasks,t) 451 | X_test_t, Y_test_t = self.extractTaskData(test_tasks, t) 452 | 453 | overallKernel = self.constructKernelFunction(t) 454 | 455 | self.classifiers[t] = SVC(C=self.C, kernel=overallKernel, probability=True, max_iter=self.max_iter_internal, tol=self.tolerance) 456 | probas_ = self.classifiers[t].fit(X_t, Y_t).predict_proba(X_test_t) 457 | fpr, tpr, thresholds = roc_curve(Y_test_t, probas_[:, 1]) 458 | 459 | return auc(fpr, tpr), fpr, tpr 460 | 461 | def saveClassifierToFile(self, filepath): 462 | s = pickle.dumps(self.classifiers) #self.classifiers is the per-task list of trained models 463 | f = open(filepath, 'w') 464 | f.write(s) 465 | 466 | def loadClassifierFromFile(self, filepath): 467 | f2 = open(filepath, 'r') 468 | s2 = f2.read() 469 | self.classifiers = pickle.loads(s2) 470 | 471 | 472 | 473 | 474 | -------------------------------------------------------------------------------- /MTMKL/MTMKLWrapper.py: -------------------------------------------------------------------------------- 1 | """Performs hyperparameter sweep for Multi-task Multi-kernel Learning (MTMKL).""" 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import scipy.optimize as opt 7 | #from cvxopt import matrix, solvers 8 | import scipy.linalg as la 9 | import math 10 | from sklearn.svm import SVC 11 | from sklearn.metrics import roc_curve, auc 12 | from sklearn.metrics.pairwise import rbf_kernel 13 | from scipy import interp 14 | import pandas as pd 15 | import sys 16 | import os 17 | import random 18 | import pickle 19 | import copy 20 | import operator 21 | import datetime 22 | from time import time 23 | 24 | CODE_PATH = os.path.dirname(os.getcwd()) 25 | sys.path.append(CODE_PATH) 26 | 27 | DEFAULT_RESULTS_PATH = '/Your/path/here/' 28 | DEFAULT_DATASETS_PATH = '/Your/path/here/' 29 | DEFAULT_FIGURES_PATH = '/Your/path/here/' 30 | DEFAULT_ETAS_PATH = DEFAULT_RESULTS_PATH + 'etas/' 31 | 32 | import helperFuncs as helper 33 | import MTMKL as mtmkl 34 | 35 | USE_TENSORFLOW = False 36 | 37 | C_VALS = [1.0, 10.0, 100.0] #values for the C parameter of SVM to test 38 | B_VALS = [0.0001, 0.001, 0.01] 39 | V_VALS = [100.0, 10.0, 1.0, .1, .01] #a small V works well for MKL 40 | REGULARIZERS = ['L1','L2'] 41 | KERNELS = ['rbf','linear'] 42 | 43 | VALIDATION_TYPE = 'cross' 44 | DEFAULT_NUM_CROSS_FOLDS = 5 45 | SAVE_RESULTS_EVERY_X_TESTS = 1 46 | 47 | 48 | def reloadFiles(): 49 | reload(helper) 50 | reload(mtmkl) 51 | mtmkl.reloadFiles() 52 | 53 | 54 | class MTMKLWrapper: 55 | def __init__(self, file_prefix, users_as_tasks, user_clusters=True, eta_filename=None, regularizers=REGULARIZERS, tolerance = .0001, 56 | max_iter = 100, val_type=VALIDATION_TYPE, c_vals=C_VALS, beta_vals=B_VALS, 57 | v_vals = V_VALS, kernels=KERNELS, print_iters=False, optimize_labels=None, cont=False, test_run=False, 58 | results_path=DEFAULT_RESULTS_PATH, figures_path=DEFAULT_FIGURES_PATH, datasets_path=DEFAULT_DATASETS_PATH, 59 | etas_path=DEFAULT_ETAS_PATH, num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, drop20=False, 60 | test_csv_filename=None): 61 | self.results_path = results_path 62 | self.figures_path = figures_path 63 | self.datasets_path = datasets_path 64 | self.etas_path = etas_path 65 | self.file_prefix = file_prefix 66 | self.cont=cont 67 | self.val_type = val_type 68 | self.users_as_tasks = users_as_tasks 69 | self.cluster_users = user_clusters 70 | self.drop20=drop20 71 | if test_csv_filename is not None: 72 | self.test_csv_filename = self.datasets_path + 
test_csv_filename 73 | else: 74 | self.test_csv_filename = None 75 | self.save_prefix = self.getSavePrefix(file_prefix, replace=cont) 76 | 77 | self.test_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Test", fix_y=True) 78 | self.train_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Train", fix_y=True) 79 | if self.val_type != 'cross': 80 | self.val_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Val", fix_y=True) 81 | 82 | # print dataset sizes 83 | print "Num train points:", sum([len(t['Y']) for t in self.train_tasks]) 84 | if self.val_type != 'cross': 85 | print "Num val points:", sum([len(t['Y']) for t in self.val_tasks]) 86 | print "Num test points:", sum([len(t['Y']) for t in self.test_tasks]) 87 | 88 | if self.val_type != 'cross': 89 | self.initializeMTMKLModel(self.train_tasks) 90 | else: 91 | self.classifier = None 92 | 93 | self.n_feats = helper.calculateNumFeatsInTaskList(self.test_tasks) 94 | self.n_tasks = len(self.test_tasks) 95 | 96 | if optimize_labels is None: 97 | self.optimize_labels = ['tomorrow_Group_Happiness_Evening_Label', 'tomorrow_Group_Health_Evening_Label', 'tomorrow_Group_Calmness_Evening_Label'] 98 | else: 99 | self.optimize_labels = optimize_labels 100 | 101 | self.c_vals = c_vals 102 | self.v_vals = v_vals 103 | self.kernels = kernels 104 | self.beta_vals=beta_vals 105 | self.regularizers = regularizers 106 | 107 | self.tolerance = tolerance 108 | self.max_iter = max_iter 109 | self.print_iters = print_iters 110 | 111 | if test_run: 112 | print "This is only a testing run. Using cheap settings to make it faster" 113 | self.c_vals = [100] 114 | self.beta_vals = [.01] 115 | self.kernels = ['linear'] 116 | self.v_vals = [1.0] 117 | self.regularizers = ['L1'] 118 | self.max_iter = 1 119 | 120 | self.calcNumSettingsDesired() 121 | 122 | #storing the results 123 | self.time_sum = 0 124 | if cont: 125 | self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.save_prefix + '.csv') 126 | print '\nPrevious validation results df loaded. 
It has', len(self.val_results_df), "rows" 127 | self.started_from = len(self.val_results_df) 128 | else: 129 | self.val_results_df = pd.DataFrame() 130 | self.started_from = 0 131 | 132 | self.num_cross_folds = num_cross_folds 133 | if self.val_type == 'cross': 134 | helper.generateCrossValPickleFiles(self.datasets_path, self.file_prefix, self.num_cross_folds) 135 | #helper.addKeepIndicesToCrossValPickleFiles(self.datasets_path, self.file_prefix, self.num_cross_folds, .80) 136 | 137 | def getSavePrefix(self, file_prefix, replace=False): 138 | name_modifier = "" 139 | if '/' in file_prefix: 140 | if "NoLocation" in file_prefix: 141 | name_modifier = "-noloc" 142 | slash_loc = file_prefix.find('/') 143 | path_modifier = file_prefix[0:slash_loc+1] 144 | file_prefix = file_prefix[slash_loc+1:] 145 | self.file_prefix = file_prefix 146 | self.datasets_path += path_modifier 147 | 148 | dash_loc = file_prefix.find('-') 149 | 150 | if self.users_as_tasks: 151 | task_str = '_users' 152 | else: 153 | task_str = '_wellbeing' 154 | 155 | prefix = "MTMKL" + task_str + file_prefix[dash_loc:-1] + name_modifier 156 | 157 | if not replace: 158 | while os.path.exists(self.results_path + prefix + '.csv'): 159 | prefix = prefix + '2' 160 | return prefix 161 | 162 | def calcNumSettingsDesired(self): 163 | self.num_settings = len(self.c_vals) * len(self.beta_vals) * len(self.kernels) \ 164 | * len(self.v_vals) * len(self.regularizers) 165 | 166 | # use something like the following to test only one set of parameters: 167 | # wrapper.setParams(tau10s=[.05], tau20s=[.05], sigma_multipliers=[.1,.01]) 168 | def setParams(self, c_vals=None, beta_vals=None, kernels=None, v_vals=None, regularizers=None): 169 | '''does not override existing parameter settings if the parameter is not set''' 170 | self.c_vals = c_vals if c_vals is not None else self.c_vals 171 | self.beta_vals = beta_vals if beta_vals is not None else self.beta_vals 172 | self.kernels = kernels if kernels is not None else self.kernels 173 | self.v_vals = v_vals if v_vals is not None else self.v_vals 174 | self.regularizers = regularizers if regularizers is not None else self.regularizers 175 | 176 | def settingAlreadyDone(self, C, beta, kernel, v, regularizer): 177 | if kernel == 'linear': 178 | if len(self.val_results_df[(self.val_results_df['C']== C) & \ 179 | (self.val_results_df['kernel']== kernel) & \ 180 | (self.val_results_df['v']== v) & \ 181 | (self.val_results_df['regularizer']== regularizer)]) > 0: 182 | print "setting already tested" 183 | return True 184 | else: 185 | return False 186 | else: 187 | if len(self.val_results_df[(self.val_results_df['C']== C) & \ 188 | (self.val_results_df['beta']== beta) & \ 189 | (self.val_results_df['kernel']== kernel) & \ 190 | (self.val_results_df['v']== v) & \ 191 | (self.val_results_df['regularizer']== regularizer)]) > 0: 192 | print "setting already tested" 193 | return True 194 | else: 195 | return False 196 | 197 | def initializeMTMKLModel(self, train_tasks, verbose=False): 198 | if USE_TENSORFLOW: 199 | self.classifier = mtmkl_tf.MTMKL(train_tasks,verbose=verbose,tol=self.tolerance, debug=False, max_iter=self.max_iter) 200 | else: 201 | self.classifier = mtmkl.MTMKL(train_tasks,verbose=verbose,tol=self.tolerance, debug=False, max_iter=self.max_iter, drop20PercentTrainingData=self.drop20) 202 | 203 | def setClassifierToSetting(self, C, beta, kernel, v, regularizer): 204 | self.classifier.setAllSettings(C, v, kernel, beta, regularizer) 205 | 206 | #must have called setValData for now 207 | def 
initializeAndTrainMTMKL(self, train_tasks, C, beta, kernel, v, regularizer, verbose=False): 208 | self.initializeMTMKLModel(train_tasks,verbose=verbose) 209 | self.setClassifierToSetting(C, beta, kernel, v, regularizer) 210 | converged = self.classifier.train() 211 | return converged 212 | 213 | def getValidationResults(self, results_dict, C, beta, kernel, v, regularizer): 214 | converged = self.initializeAndTrainMTMKL(self.train_tasks, C, beta, kernel, v, regularizer) 215 | 216 | if self.users_as_tasks: 217 | if not converged: 218 | val_acc = np.nan 219 | val_auc = np.nan 220 | else: 221 | val_acc, val_auc = self.classifier.getAccuracyAucAllTasks(self.val_tasks) 222 | results_dict['val_acc'] = val_acc 223 | results_dict['val_auc'] = val_auc 224 | else: 225 | accs = [] 226 | aucs = [] 227 | for t in range(self.n_tasks): 228 | if not converged: 229 | acc = np.nan 230 | auc = np.nan 231 | else: 232 | acc, auc = self.classifier.getAccuracyAucOnOneTask(self.val_tasks, t) 233 | task_name = self.val_tasks[t]['Name'] 234 | results_dict['TaskAcc-' + helper.getFriendlyLabelName(task_name)] = acc 235 | results_dict['TaskAuc-' + helper.getFriendlyLabelName(task_name)] = auc 236 | if self.cluster_users or task_name in self.optimize_labels: 237 | accs.append(acc) 238 | aucs.append(auc) 239 | results_dict['val_acc'] = np.mean(accs) 240 | results_dict['val_auc'] = np.mean(aucs) 241 | return results_dict 242 | 243 | def getCrossValidationResults(self, results_dict, C, beta, kernel, v, regularizer, save_plots=False,print_per_fold=True): 244 | all_acc = [] 245 | all_auc = [] 246 | all_f1 = [] 247 | all_precision = [] 248 | all_recall = [] 249 | if not self.users_as_tasks: 250 | per_task_accs = [[] for i in range(self.n_tasks)] 251 | per_task_aucs = [[] for i in range(self.n_tasks)] 252 | per_task_f1 = [[] for i in range(self.n_tasks)] 253 | per_task_precision = [[] for i in range(self.n_tasks)] 254 | per_task_recall = [[] for i in range(self.n_tasks)] 255 | 256 | for f in range(self.num_cross_folds): 257 | train_tasks, val_tasks = helper.loadCrossValData(self.datasets_path, self.file_prefix, f, reshape=False, fix_y=True) 258 | converged = self.initializeAndTrainMTMKL(train_tasks, C, beta, kernel, v, regularizer) 259 | if not converged: 260 | all_acc.append(np.nan) 261 | all_auc.append(np.nan) 262 | all_f1.append(np.nan) 263 | all_precision.append(np.nan) 264 | all_recall.append(np.nan) 265 | continue 266 | 267 | # Get results! 
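# Within each fold, the predictions from every task are pooled into
# fold_preds/fold_true_y before computing the overall fold metrics, while
# the per_task_* lists accumulate each task's own metrics across folds.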
268 | fold_preds = [] 269 | fold_true_y = [] 270 | for t in range(self.n_tasks): 271 | preds = self.classifier.predictOneTask(val_tasks,t) 272 | true_y = list(val_tasks[t]['Y'].flatten()) 273 | 274 | if not self.users_as_tasks: 275 | # save the per-task results 276 | t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y) 277 | per_task_accs[t].append(t_acc) 278 | per_task_aucs[t].append(t_auc) 279 | per_task_f1[t].append(t_f1) 280 | per_task_precision[t].append(t_precision) 281 | per_task_recall[t].append(t_recall) 282 | if print_per_fold: print "Fold", f, "Task", val_tasks[t]['Name'], "acc", t_acc, "auc", t_auc, "f1", t_f1, "precision",t_precision,"recall",t_recall 283 | 284 | fold_preds.extend(preds) 285 | fold_true_y.extend(true_y) 286 | 287 | 288 | acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(fold_preds, fold_true_y) 289 | all_acc.append(acc) 290 | all_auc.append(auc) 291 | all_f1.append(f1) 292 | all_precision.append(precision) 293 | all_recall.append(recall) 294 | if print_per_fold: print "Fold", f, "acc", acc, "auc", auc, "f1", f1, "precision",precision,"recall",recall 295 | 296 | print "accs for all folds", all_acc 297 | print "aucs for all folds", all_auc 298 | 299 | # Add results to the dictionary 300 | results_dict['val_acc'] = np.nanmean(all_acc) 301 | results_dict['val_auc'] = np.nanmean(all_auc) 302 | results_dict['val_f1'] = np.nanmean(all_f1) 303 | results_dict['val_precision'] = np.nanmean(all_precision) 304 | results_dict['val_recall'] = np.nanmean(all_recall) 305 | 306 | # Add per-task results to the dictionary 307 | if not self.users_as_tasks: 308 | for t in range(self.n_tasks): 309 | task_name = val_tasks[t]['Name'] 310 | results_dict['TaskAcc-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_accs[t]) 311 | results_dict['TaskAuc-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_aucs[t]) 312 | results_dict['TaskF1-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_f1[t]) 313 | results_dict['TaskPrecision-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_precision[t]) 314 | results_dict['TaskRecall-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_recall[t]) 315 | 316 | return results_dict 317 | 318 | def testOneSetting(self, C, beta, kernel, v, regularizer): 319 | if self.cont: 320 | if self.settingAlreadyDone(C, beta, kernel, v, regularizer): 321 | return 322 | 323 | t0 = time() 324 | 325 | results_dict = {'C':C, 'beta': beta, 'kernel':kernel, 'v':v, 'regularizer':regularizer} 326 | print results_dict 327 | 328 | if self.val_type == 'cross': 329 | results_dict = self.getCrossValidationResults(results_dict, C, beta, kernel, v, regularizer) 330 | else: 331 | results_dict = self.getValidationResults(results_dict, C, beta, kernel, v, regularizer) 332 | 333 | self.val_results_df = self.val_results_df.append(results_dict,ignore_index=True) 334 | 335 | print "\n", self.val_results_df.tail(n=1) 336 | t1 = time() 337 | this_time = t1 - t0 338 | print "It took", this_time, "seconds to obtain this result" 339 | 340 | self.time_sum = self.time_sum + this_time 341 | 342 | self.printTimeEstimate() 343 | sys.stdout.flush() 344 | 345 | #output the file every few iterations for safekeeping 346 | if len(self.val_results_df) % SAVE_RESULTS_EVERY_X_TESTS == 0: 347 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 348 | 349 | def printTimeEstimate(self): 350 | num_done = len(self.val_results_df)-self.started_from 351 | 
num_remaining = self.num_settings - num_done - self.started_from 352 | avg_time = self.time_sum / num_done 353 | total_secs_remaining = int(avg_time * num_remaining) 354 | hours = total_secs_remaining / 60 / 60 355 | mins = (total_secs_remaining % 3600) / 60 356 | secs = (total_secs_remaining % 3600) % 60 357 | 358 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 359 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 360 | 361 | def sweepAllParameters(self): 362 | print "\nSweeping all parameters!" 363 | 364 | self.calcNumSettingsDesired() 365 | print "\nYou have chosen to test a total of", self.num_settings, "settings" 366 | sys.stdout.flush() 367 | 368 | #sweep all possible combinations of parameters 369 | for C in self.c_vals: 370 | for v in self.v_vals: 371 | for regularizer in self.regularizers: 372 | for kernel in self.kernels: 373 | if kernel == 'linear': 374 | self.testOneSetting(C, np.nan, kernel, v, regularizer) 375 | else: 376 | for beta in self.beta_vals: 377 | self.testOneSetting(C, beta, kernel, v, regularizer) 378 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 379 | 380 | def run(self): 381 | self.sweepAllParameters() 382 | return self.findBestSetting(criteria='AUC') 383 | 384 | 385 | def findBestSetting(self, criteria="accuracy", minimize=False, save_final_results=True): 386 | if criteria=="accuracy": 387 | search_col = 'val_acc' 388 | elif criteria=="AUC": 389 | search_col = 'val_auc' 390 | 391 | results = self.val_results_df[search_col].tolist() 392 | if minimize: 393 | best_result = min(results) 394 | opt_word = "minimized" 395 | else: 396 | best_result = max(results) 397 | opt_word = "maximized" 398 | best_idx = results.index(best_result) 399 | 400 | print "BEST SETTING!" 401 | print "Settings which", opt_word, "the", criteria, "were:" 402 | print self.val_results_df.iloc[best_idx] 403 | 404 | if save_final_results: 405 | self.getFinalResultsAndSave(self.val_results_df.iloc[best_idx]) 406 | else: 407 | return self.val_results_df.iloc[best_idx] 408 | 409 | def getFinalResultsAndSave(self, results_dict): 410 | print "\nRetraining on full training data with the best settings..." 411 | self.drop20=False 412 | self.initializeAndTrainMTMKL(self.train_tasks, results_dict['C'], results_dict['beta'], 413 | results_dict['kernel'], results_dict['v'], results_dict['regularizer'], 414 | verbose=True) 415 | 416 | print "\nEvaluating results on held-out test set!! ..." 417 | all_preds = [] 418 | all_true_y = [] 419 | per_task_accs = [np.nan] * self.n_tasks 420 | per_task_aucs = [np.nan] * self.n_tasks 421 | per_task_f1 = [np.nan] * self.n_tasks 422 | per_task_precision = [np.nan] * self.n_tasks 423 | per_task_recall = [np.nan] * self.n_tasks 424 | for t in range(self.n_tasks): 425 | preds = self.classifier.predictOneTask(self.test_tasks,t) 426 | true_y = list(self.test_tasks[t]['Y'].flatten()) 427 | 428 | if len(preds)==0 or len(true_y) == 0: 429 | print "no y for task", t, "... skipping" 430 | continue 431 | 432 | all_preds.extend(preds) 433 | all_true_y.extend(true_y) 434 | 435 | # save the per-task results 436 | t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y) 437 | per_task_accs[t] = t_acc 438 | per_task_aucs[t] = t_auc 439 | per_task_f1[t] = t_f1 440 | per_task_precision[t] = t_precision 441 | per_task_recall[t] = t_recall 442 | 443 | print "\nPlotting cool stuff about the final model..." 
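# self.classifier.eta is an (n_tasks x n_views) matrix of learned kernel
# weights: row t is task t's convex combination over the feature modalities,
# so tasks with similar rows rely on similar views of the data.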
444 | self.saveImagePlot(self.classifier.eta, 'Etas') 445 | pd.DataFrame(self.classifier.eta).to_csv(self.etas_path + self.save_prefix + "-etas.csv") 446 | 447 | print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS" 448 | acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(all_preds, all_true_y) 449 | print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall 450 | 451 | print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS" 452 | avg_acc = np.nanmean(per_task_accs) 453 | avg_auc = np.nanmean(per_task_aucs) 454 | avg_f1 = np.nanmean(per_task_f1) 455 | avg_precision = np.nanmean(per_task_precision) 456 | avg_recall = np.nanmean(per_task_recall) 457 | print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall 458 | 459 | print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK" 460 | if not self.users_as_tasks: 461 | for t in range(self.n_tasks): 462 | task_name = self.test_tasks[t]['Name'] 463 | task_name=helper.getFriendlyLabelName(task_name) 464 | print "\t\t", task_name, "- Acc:", per_task_accs[t], "AUC:", per_task_aucs[t], 'F1:', per_task_f1[t], 'Precision:', per_task_precision[t], 'Recall:', per_task_recall[t] 465 | 466 | if self.test_csv_filename is not None: 467 | print "\tSAVING HELD OUT PREDICTIONS" 468 | if 'Big5GenderKMeansCluster' in self.file_prefix: 469 | task_column = 'Big5GenderKMeansCluster' 470 | tasks_are_ints = True 471 | label_name = helper.getFriendlyLabelName(self.file_prefix) 472 | wanted_label = helper.getOfficialLabelName(label_name) 473 | predictions_df = helper.get_test_predictions_for_df_with_task_column( 474 | self.classifier.predict_01, self.test_csv_filename, task_column, self.test_tasks, 475 | wanted_label=wanted_label, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1], 476 | label_name=label_name, tasks_are_ints=tasks_are_ints) 477 | elif not self.users_as_tasks: 478 | predictions_df = helper.get_test_predictions_for_df_with_no_task_column(self.classifier.predict_01, 479 | self.test_csv_filename, self.test_tasks, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1]) 480 | else: 481 | print "Error! Cannot determine what type of model you are training and therefore cannot save predictions." 482 | return 483 | predictions_df.to_csv(self.results_path + "Preds-" + self.save_prefix + '.csv') 484 | else: 485 | print "Uh oh, the test csv filename was not set, can't save test preds" 486 | 487 | def saveImagePlot(self, matrix, name): 488 | plt.figure() 489 | plt.imshow(matrix) 490 | plt.savefig(self.figures_path + self.save_prefix + "-" + name + ".eps") 491 | plt.close() 492 | 493 | 494 | 495 | if __name__ == "__main__": 496 | print "MTMKL MODEL SELECTION" 497 | print "\tThis code will sweep a set of parameters to find the ideal settings for MTMKL for a single dataset" 498 | 499 | if len(sys.argv) < 3: 500 | print "Error: usage is python MTMKLWrapper.py <file_prefix> <task_type> <continue> <csv_test_file>" 501 | print "\t<file_prefix>: e.g. datasetTaskList-Discard-Future-Group_ - program will look in the following directory for this file", DEFAULT_DATASETS_PATH 502 | print "\t<task_type>: type 'users' for users as tasks, 'wellbeing' for wellbeing measures as tasks, or 'clusters' for user clusters as tasks" 503 | print "\t<continue>: optional. If 'True', the wrapper will pick up from where it left off by loading a previous validation results file" 504 | print "\t<csv_test_file>: optional. If you want to get the final test results, provide the name of a csv file to test on"
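# Example invocation (hypothetical file names, shown only for illustration):
#   python MTMKLWrapper.py datasetTaskList-Discard-Future-Group_ wellbeing True held_out_test.csv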
505 | sys.exit() 506 | filename = sys.argv[1] #get data file from command line argument 507 | print "\nLoading dataset", DEFAULT_DATASETS_PATH + filename 508 | print "" 509 | 510 | if sys.argv[2] == 'users': 511 | users_as_tasks = True 512 | cluster_users = False 513 | print "Okay, treating users as tasks. Will not print per-task results" 514 | elif sys.argv[2] == 'wellbeing': 515 | users_as_tasks = False 516 | cluster_users = False 517 | print "Okay, treating wellbeing measures as tasks. Will save and print per-task results" 518 | elif sys.argv[2] == 'clusters': 519 | users_as_tasks = False 520 | cluster_users = True 521 | print "Okay, treating user clusters as tasks. Will save and print per-task results and optimize for accuracy over all clusters." else: print "Error: <task_type> must be 'users', 'wellbeing', or 'clusters'" sys.exit() 522 | 523 | if len(sys.argv) >= 4 and sys.argv[3] == 'True': 524 | cont = True 525 | print "Okay, will continue from a previously saved validation results file for this problem" 526 | else: 527 | cont = False 528 | print "" 529 | 530 | if len(sys.argv) >= 5: 531 | csv_test_file = sys.argv[4] 532 | print "Okay, will get final test results on file", csv_test_file 533 | print "" 534 | else: 535 | csv_test_file = None 536 | 537 | if USE_TENSORFLOW: 538 | print "\nWill use the TENSORFLOW version of the code\n" 539 | 540 | wrapper = MTMKLWrapper(filename, users_as_tasks=users_as_tasks, user_clusters=cluster_users, cont=cont, 541 | test_csv_filename=csv_test_file) 542 | 543 | print "\nThe following parameter settings will be tested:" 544 | print "\tCs: \t", wrapper.c_vals 545 | print "\tbetas: \t", wrapper.beta_vals 546 | print "\tkernels: \t", wrapper.kernels 547 | print "\tvs: \t", wrapper.v_vals 548 | print "\tregularizers: \t", wrapper.regularizers 549 | 550 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv' 551 | print "\nThe validation and testing figures will be saved in:", wrapper.figures_path + wrapper.save_prefix 552 | 553 | wrapper.run() 554 | -------------------------------------------------------------------------------- /NeuralNetworks/tensorFlowWrapper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import tensorflow as tf 4 | import sys 5 | import os 6 | import pickle 7 | from time import time 8 | 9 | CODE_PATH = os.path.dirname(os.getcwd()) 10 | sys.path.append(CODE_PATH) 11 | 12 | DEFAULT_RESULTS_PATH = '/Your/path/here/' 13 | DEFAULT_DATASETS_PATH = '/Your/path/here/' 14 | DEFAULT_FIGURES_PATH = '/Your/path/here/' 15 | 16 | DEFAULT_VAL_TYPE = 'cross' 17 | OUTPUT_EVERY_NTH = 3 18 | 19 | 20 | import tensorFlowNetwork as tfnet 21 | import tensorFlowNetworkMultiTask as mtltf 22 | import helperFuncs as helper 23 | 24 | def reloadFiles(): 25 | reload(tfnet) 26 | reload(mtltf) 27 | reload(helper) 28 | tfnet.reloadHelper() 29 | mtltf.reloadFiles() 30 | 31 | class TensorFlowWrapper: 32 | def __init__(self, dataset_name, target_label=None, trial_name=None, multilabel=False, multitask=False, 33 | print_per_task=False, test_steps=9001, results_path=DEFAULT_RESULTS_PATH, 34 | datasets_path=DEFAULT_DATASETS_PATH, figures_path=DEFAULT_FIGURES_PATH, val_output_file=None, 35 | val_type=DEFAULT_VAL_TYPE, cont=False, architectures=None, test_csv_filename=None): 36 | assert not(multilabel and multitask) 37 | 38 | self.multilabel = multilabel 39 | self.multitask = multitask 40 | 
self.results_path = results_path 41 | self.figures_path = figures_path 42 | self.datasets_path = datasets_path 43 | self.dataset_name = dataset_name 44 | self.test_steps = test_steps 45 | self.val_type = val_type 46 | self.cont = cont 47 | self.print_per_task = print_per_task 48 | if test_csv_filename is not None: 49 | self.test_csv_filename = self.datasets_path + test_csv_filename 50 | else: 51 | self.test_csv_filename = None 52 | if cont: 53 | replace = True 54 | else: 55 | replace = False 56 | if trial_name is None and target_label is not None: 57 | trial_name = helper.getFriendlyLabelName(target_label) 58 | self.trial_name = trial_name 59 | self.val_output_prefix = self.getValOutputName(val_output_file, dataset_name, trial_name, replace=replace) 60 | 61 | #dataset stuff 62 | if multitask: 63 | train_tasks = pickle.load(open(self.datasets_path + dataset_name + "Train.p","rb")) 64 | val_tasks = pickle.load(open(self.datasets_path + dataset_name + "Val.p","rb")) 65 | test_tasks = pickle.load(open(self.datasets_path + dataset_name + "Test.p","rb")) 66 | 67 | self.net = mtltf.TensorFlowNetworkMTL(train_tasks, val_tasks, test_tasks, verbose=False, 68 | val_type=self.val_type, print_per_task=print_per_task) 69 | self.wanted_labels = self.net.optimize_labels 70 | else: 71 | self.data_df = pd.DataFrame.from_csv(self.datasets_path + self.dataset_name) 72 | self.wanted_feats = [x for x in self.data_df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and '_Label' not in x] 73 | if self.multilabel: 74 | self.wanted_labels = [x for x in self.data_df.columns.values if '_Label' in x and 'tomorrow_' in x and 'Evening' in x and 'Alertness' not in x and 'Energy' not in x] 75 | self.optimize_labels = [x for x in self.wanted_labels if 'tomorrow_' in x and 'Evening_' in x] 76 | else: 77 | self.wanted_labels = [target_label] 78 | 79 | #actual network 80 | self.net = tfnet.TensorFlowNetwork(self.data_df, self.wanted_feats, self.wanted_labels, optimize_labels=self.wanted_labels, 81 | multilabel=self.multilabel, verbose=False, val_type=self.val_type) 82 | 83 | #parameters that can be tuned: 84 | self.l2_regularizers = [1e-2, 1e-4] 85 | self.dropout = [True, False] 86 | self.decay = [True] 87 | self.decay_steps = [1000] 88 | self.decay_rates = [0.95] 89 | self.optimizers = [tf.train.AdamOptimizer] #[tf.train.AdagradOptimizer, tf.train.GradientDescentOptimizer 90 | self.train_steps =[5001] 91 | if multitask: 92 | self.batch_sizes = [20] 93 | self.learning_rates = [.01, .001, .0001] 94 | self.architectures = [[500,50],[300,20,10]] if architectures is None else architectures 95 | else: 96 | self.batch_sizes = [50,75] 97 | self.learning_rates = [.01, .001, .0001] 98 | self.architectures = [[1024,256],[500,50],[1024]] if architectures is None else architectures 99 | 100 | #storing the results 101 | self.time_sum = 0 102 | if cont: 103 | self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.val_output_prefix + '.csv') 104 | print '\nPrevious validation results df loaded. 
It has', len(self.val_results_df), "rows" 105 | self.started_from = len(self.val_results_df) 106 | else: 107 | self.val_results_df = pd.DataFrame() 108 | self.started_from = 0 109 | 110 | def getValOutputName(self, val_output_file, dataset_file, trial_name, replace=False): 111 | if self.multitask: 112 | multilabel_str = 'MTL_' 113 | elif self.multilabel: 114 | multilabel_str = 'multilabel_' 115 | else: 116 | multilabel_str = '' 117 | 118 | name_modifier = "" 119 | if '/' in dataset_file: 120 | if "NoLocation" in dataset_file: 121 | name_modifier = "-noloc" 122 | slash_loc = dataset_file.find('/') 123 | dataset_file = dataset_file[slash_loc+1:] 124 | 125 | if replace or val_output_file is None: 126 | val_output_file = 'nn_' + multilabel_str + dataset_file[0:-4] + name_modifier + "_" 127 | if trial_name is not None: 128 | val_output_file = val_output_file + trial_name 129 | if not replace: 130 | while os.path.exists(self.results_path + val_output_file + '.csv') \ 131 | or os.path.exists(self.figures_path + val_output_file + '.eps'): 132 | val_output_file = val_output_file + '2' 133 | return val_output_file 134 | 135 | def setNetworkArchitecturesToTest(self, architectures): 136 | self.architectures = architectures 137 | 138 | def constructNetwork(self, hidden_layers): 139 | if self.multitask: 140 | hidden_layers_shared = hidden_layers[:-1] 141 | hidden_task_nodes = hidden_layers[-1] 142 | connections_shared = ['full'] * (len(hidden_layers)) 143 | self.net.setUpNetworkStructure(hidden_layers_shared,hidden_task_nodes,connections_shared,['full','full']) 144 | else: 145 | connections = ['full'] * (len(hidden_layers)+1) 146 | self.net.setUpNetworkStructure(hidden_layers,connections) 147 | 148 | # use something like the following to test only one set of parameters: 149 | # wrapper.setParams(l2_regularizers=[1e-4], learning_rates=[.01], dropout=[True], decay=[True], batch_sizes=[50], optimizers=[tf.train.GradientDescentOptimizer]) 150 | def setParams(self, l2_regularizers=None, learning_rates=None, dropout=None, 151 | decay=None, decay_steps=None, decay_rates=None, batch_sizes=None, 152 | optimizers=None, train_steps=None): 153 | '''does not override existing parameter settings if the parameter is not set''' 154 | self.l2_regularizers = l2_regularizers if l2_regularizers is not None else self.l2_regularizers 155 | self.learning_rates = learning_rates if learning_rates is not None else self.learning_rates 156 | self.dropout= dropout if dropout is not None else self.dropout 157 | self.decay= decay if decay is not None else self.decay 158 | self.decay_steps= decay_steps if decay_steps is not None else self.decay_steps 159 | self.decay_rates= decay_rates if decay_rates is not None else self.decay_rates 160 | self.batch_sizes = batch_sizes if batch_sizes is not None else self.batch_sizes 161 | self.optimizers = optimizers if optimizers is not None else self.optimizers 162 | 163 | def settingAlreadyDone(self, hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps): 164 | if len(self.val_results_df[(self.val_results_df['hidden_layers']== str(hidden_layers)) & \ 165 | (self.val_results_df['l2_beta']== l2_beta) & \ 166 | (self.val_results_df['learning_rate']== lrate) & \ 167 | (self.val_results_df['dropout']== dropout) & \ 168 | (self.val_results_df['decay']== decay) & \ 169 | (self.val_results_df['decay_steps']== dsteps) & \ 170 | (self.val_results_df['decay_rate']== drate) & \ 171 | (self.val_results_df['batch_size']== bsize) & \ 172 | (self.val_results_df['optimizer']== 
str(opt))]) > 0: 173 | print "setting already tested" 174 | return True 175 | else: 176 | return False 177 | 178 | def testOneSetting(self, hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps, num_settings): 179 | print "Testing setting with layers", hidden_layers, "beta", l2_beta, "lrate", lrate, "dropout", dropout, "decay", decay, "dsteps", dsteps, "drate", drate, "bsize", bsize, "opt", opt, "tsteps", tsteps 180 | if self.cont: 181 | if self.settingAlreadyDone(hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps): 182 | return 183 | 184 | t0 = time() 185 | self.net.setParams(l2_beta=l2_beta, initial_learning_rate=lrate, decay=decay, 186 | decay_steps=dsteps, decay_rate=drate, batch_size=bsize, 187 | optimizer=opt, n_steps=tsteps, dropout=dropout) 188 | self.constructNetwork(hidden_layers) 189 | if self.val_type == 'cross': 190 | acc, auc, f1, precision, recall = self.net.trainAndCrossValidate() 191 | else: 192 | acc, auc, f1, precision, recall = self.net.trainAndValidate() 193 | 194 | results_dict = {'hidden_layers':hidden_layers, 'l2_beta': l2_beta, 'learning_rate': lrate, 195 | 'dropout': dropout, 'decay': decay, 'decay_steps': dsteps, 196 | 'decay_rate': drate, 'batch_size': bsize, 197 | 'optimizer': opt, 'val_acc': acc, 'val_auc':auc, 198 | 'val_f1':f1, 'val_precision':precision, 'val_recall':recall} 199 | if self.multitask: 200 | results_dict['train_nan_percent'] = self.net.train_nan_percent[-1] 201 | results_dict['val_nan_percent'] = self.net.val_nan_percent[-1] 202 | 203 | if self.multilabel or self.print_per_task: 204 | for label in self.wanted_labels: 205 | friendly_label = helper.getFriendlyLabelName(label) 206 | results_dict[friendly_label + '_acc'] = self.net.training_val_results_per_task['acc'][label][-1] 207 | results_dict[friendly_label + '_auc'] = self.net.training_val_results_per_task['auc'][label][-1] 208 | results_dict[friendly_label + '_f1'] = self.net.training_val_results_per_task['f1'][label][-1] 209 | results_dict[friendly_label + '_precision'] = self.net.training_val_results_per_task['precision'][label][-1] 210 | results_dict[friendly_label + '_recall'] = self.net.training_val_results_per_task['recall'][label][-1] 211 | self.val_results_df = self.val_results_df.append(results_dict,ignore_index=True) 212 | 213 | print self.val_results_df.tail(n=1) 214 | t1 = time() 215 | this_time = t1 - t0 216 | print "It took", this_time, "seconds to obtain this result" 217 | 218 | self.time_sum = self.time_sum + this_time 219 | 220 | self.printTimeEstimate(len(self.val_results_df)-self.started_from, num_settings) 221 | sys.stdout.flush() 222 | 223 | #output the file every few iterations for safekeeping 224 | if len(self.val_results_df) % OUTPUT_EVERY_NTH == 0: 225 | self.val_results_df.to_csv(self.results_path + self.val_output_prefix + '.csv') 226 | 227 | def printTimeEstimate(self, num_done, num_desired): 228 | num_remaining = num_desired - num_done 229 | avg_time = self.time_sum / num_done 230 | total_secs_remaining = int(avg_time * num_remaining) 231 | hours = total_secs_remaining / 60 / 60 232 | mins = (total_secs_remaining % 3600) / 60 233 | secs = (total_secs_remaining % 3600) % 60 234 | 235 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 236 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 237 | 238 | def calcNumSettingsPerStructure(self): 239 | num_settings = len(self.l2_regularizers) * len(self.learning_rates) * len(self.dropout) * len(self.decay) \ 
240 | * len(self.batch_sizes) * len(self.optimizers) * len(self.train_steps) 241 | if True in self.decay and (len(self.decay_steps) > 1 or len(self.decay_rates) > 1): 242 | num_settings = num_settings * ((len(self.decay_steps) * len(self.decay_rates)) / 2.0) 243 | return num_settings 244 | 245 | def sweepParameters(self, hidden_layers, num_settings): 246 | print "\nSweeping all parameters for structure:", hidden_layers 247 | 248 | #sweep all possible combinations of parameters 249 | for l2_beta in self.l2_regularizers: 250 | for lrate in self.learning_rates: 251 | for dropout in self.dropout: 252 | for bsize in self.batch_sizes: 253 | for opt in self.optimizers: 254 | for tsteps in self.train_steps: 255 | for decay in self.decay: 256 | if decay: 257 | for dsteps in self.decay_steps: 258 | for drate in self.decay_rates: 259 | self.testOneSetting(hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps, num_settings) 260 | else: 261 | #decay steps and decay rate don't matter if decay is set to false 262 | self.testOneSetting(hidden_layers, l2_beta, lrate, dropout, decay, 10000, 0.95, bsize, opt, tsteps, num_settings) 263 | self.val_results_df.to_csv(self.results_path + self.val_output_prefix + '.csv') 264 | 265 | def sweepStructuresAndParameters(self): 266 | num_settings = self.calcNumSettingsPerStructure() 267 | num_settings_total = num_settings * len(self.architectures) 268 | 269 | print "\nYou have chosen to test", num_settings, "settings for each of", len(self.architectures), "architectures" 270 | print "This is a total of", num_settings_total, "tests." 271 | for hidden_layers in self.architectures: 272 | self.sweepParameters(hidden_layers,num_settings_total) 273 | 274 | def findBestSetting(self, retrain_and_plot=True, optimize_for='val_auc'): 275 | accuracies = self.val_results_df[optimize_for].tolist() 276 | max_acc = max(accuracies) 277 | max_idx = accuracies.index(max_acc) 278 | best_setting = self.val_results_df.iloc[max_idx] 279 | 280 | print "BEST SETTING!" 281 | print "The highest", optimize_for, "of", max_acc, "was found with the following settings:" 282 | print best_setting 283 | 284 | best_setting = helper.fixSettingDictLoadedFromResultsDf(best_setting) 285 | 286 | if retrain_and_plot: 287 | self.retrainAndPlot(best_setting) 288 | else: 289 | return best_setting 290 | 291 | def retrainAndPlot(self, setting_dict): 292 | print "\nRETRAINING WITH THE BEST SETTINGS:" 293 | 294 | self.net.verbose = True 295 | self.net.setParams(l2_beta=setting_dict['l2_beta'], initial_learning_rate=setting_dict['learning_rate'], decay=setting_dict['decay'], 296 | decay_steps=setting_dict['decay_steps'], decay_rate=setting_dict['decay_rate'], batch_size=setting_dict['batch_size'], 297 | optimizer=setting_dict['optimizer'], dropout=setting_dict['dropout']) 298 | self.constructNetwork(setting_dict['hidden_layers']) 299 | 300 | self.net.setUpGraph() 301 | self.net.runGraph(self.test_steps, print_test=True) 302 | 303 | if self.multilabel: 304 | for label in self.optimize_labels: 305 | friendly_label = helper.getFriendlyLabelName(label) 306 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.eps', label=label) 307 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.png', label=label) 308 | print "Final validation results for", friendly_label,"... 
Acc:", \ 309 | self.net.training_val_results_per_task['acc'][label][-1], "Auc:", self.net.training_val_results_per_task['auc'][label][-1] 310 | elif self.print_per_task: 311 | for label in self.wanted_labels: 312 | friendly_label = helper.getFriendlyLabelName(label) 313 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.eps', label=label) 314 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.png', label=label) 315 | print "Final validation results for", friendly_label,"... Acc:", \ 316 | self.net.training_val_results_per_task['acc'][label][-1], "Auc:", self.net.training_val_results_per_task['auc'][label][-1] 317 | else: 318 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '.eps') 319 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '.png') 320 | print "Final AUC:", self.net.training_val_results['auc'][-1] 321 | 322 | if self.test_csv_filename is not None: 323 | if self.multitask: 324 | task_column = None 325 | if 'Cluster' in self.dataset_name: 326 | print "Guessing the task column is Big5GenderKMeansCluster - if this is incorrect expect errors" 327 | task_column = 'Big5GenderKMeansCluster' 328 | tasks_are_ints = True 329 | 330 | if 'User' in self.dataset_name: 331 | print "Guessing the task column is user_id - if this is incorrect expect errors" 332 | task_column = 'user_id' 333 | tasks_are_ints = False 334 | 335 | if task_column is not None: 336 | label_name = helper.getFriendlyLabelName(self.dataset_name) 337 | wanted_label = helper.getOfficialLabelName(label_name) 338 | test_preds_df = helper.get_test_predictions_for_df_with_task_column( 339 | self.net.predict, self.test_csv_filename, task_column, self.net.test_tasks, 340 | wanted_label=wanted_label, num_feats_expected=np.shape(self.net.test_tasks[0]['X'])[1], 341 | label_name=label_name, tasks_are_ints=tasks_are_ints) 342 | else: 343 | test_preds_df = helper.get_test_predictions_for_df_with_no_task_column(self.net.predict, self.test_csv_filename, 344 | self.net.test_tasks, 345 | num_feats_expected=np.shape(self.net.test_tasks[0]['X'])[1]) 346 | else: 347 | test_preds_df = self.net.get_preds_for_df() 348 | print "Got a test preds df! Saving it to:", self.results_path + "Preds-" + self.val_output_prefix + '.csv' 349 | test_preds_df.to_csv(self.results_path + 'Preds-' + self.val_output_prefix + '.csv') 350 | else: 351 | print "Uh oh, the test csv filename was not set, can't save test preds" 352 | 353 | print "Saving a copy of the final model!" 354 | self.net.save_model(self.val_output_prefix, self.results_path) 355 | 356 | 357 | def run(self): 358 | self.sweepStructuresAndParameters() 359 | self.findBestSetting() 360 | 361 | if __name__ == "__main__": 362 | print "TENSOR FLOW MODEL SELECTION" 363 | print "\tThis code will sweep a set of network architectures and parameters to find the ideal settings for a single dataset" 364 | 365 | if len(sys.argv) < 4: 366 | print "Error: usage is python tensorFlowWrapper.py " 367 | print "\t: e.g. dataset-Simple-Group.csv or datasetTaskList-Discard40-Future-Personal_ ... Program will look in the following directory for this file", DEFAULT_DATASETS_PATH 368 | print "\t:" 369 | print "\t\tFor single task learning, enter the name of the label you would like classify on. E.g. 
Group_Happiness_Evening_Label" 370 | print "\t\tFor multi task learning, in which the same net learns several tasks (like several wellbeing measures) enter: multilabel" 371 | print "\t\tFor multi task learning, in which each task gets its own piece of the network, but the first layers are shared (like users as tasks) enter: multitask" 372 | print "\t For wellbeing-ask-tasks use 'wellbeing', for users-as-tasks use 'users'" 373 | print "\t: optional. If 'True', the neural net will pick up from where it left off by loading a previous validation results file" 374 | print "\t: optional. If you want to get the final test results, provide the name of a csv file to test on" 375 | sys.exit() 376 | filename= sys.argv[1] #get data file from command line argument 377 | print "\nLoading dataset", DEFAULT_DATASETS_PATH + filename 378 | print "" 379 | 380 | multilabel = False 381 | multitask = False 382 | target_label = None 383 | if sys.argv[2] == 'multilabel': 384 | multilabel = True 385 | print "Performing multi-task classification, in which the same net is shared by all tasks" 386 | print "Optimizing for accuracy on tomorrow evening" 387 | elif sys.argv[2] == 'multitask': 388 | multitask = True 389 | print "Performing multi-task classification, in which each task gets it's own private final hidden layer" 390 | else: 391 | target_label = sys.argv[2] 392 | print "Performing single-task classification, classifying on", target_label 393 | 394 | if sys.argv[3] == 'wellbeing': 395 | print_per_task = True 396 | else: 397 | print_per_task = False 398 | 399 | if len(sys.argv) >= 5 and sys.argv[4] == 'True': 400 | cont = True 401 | print "Okay, will continue from a previously saved validation results file for this problem" 402 | else: 403 | cont = False 404 | print "" 405 | 406 | if len(sys.argv) >= 6: 407 | csv_test_file = sys.argv[5] 408 | print "Okay, will get final test results on file", csv_test_file 409 | print "" 410 | else: 411 | csv_test_file = None 412 | 413 | wrapper = TensorFlowWrapper(filename, target_label=target_label, multilabel=multilabel, multitask=multitask, 414 | print_per_task=print_per_task, cont=cont, test_csv_filename=csv_test_file) 415 | 416 | print "\nThe following parameter settings will be tested:" 417 | print "\tl2_regularizers: \t", wrapper.l2_regularizers 418 | print "\tlearning_rates: \t", wrapper.learning_rates 419 | print "\tdropout: \t", wrapper.dropout 420 | print "\tdecay: \t", wrapper.decay 421 | print "\tdecay_steps: \t", wrapper.decay_steps 422 | print "\tdecay_rates: \t", wrapper.decay_rates 423 | print "\tbatch_sizes: \t", wrapper.batch_sizes 424 | print "\toptimizers: \t", wrapper.optimizers 425 | print "\ttrain_steps: \t", wrapper.train_steps 426 | 427 | print "\nThe following network structures will be tested:" 428 | print "\t", wrapper.architectures 429 | 430 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.val_output_prefix + '.csv' 431 | print "\nThe validation accuracy figures will be saved in:", wrapper.figures_path + wrapper.val_output_prefix + '.eps' 432 | 433 | wrapper.run() -------------------------------------------------------------------------------- /NeuralNetworks/tensorFlowWrapperSTL.py: -------------------------------------------------------------------------------- 1 | """Performs a hyperparameter sweep for the Single Task Learning (STL) neural 2 | network.""" 3 | import pandas as pd 4 | import numpy as np 5 | import tensorflow as tf 6 | import sys 7 | import os 8 | import pickle 9 | import copy 10 | 
from time import time 11 | 12 | CODE_PATH = os.path.dirname(os.getcwd()) 13 | sys.path.append(CODE_PATH) 14 | 15 | DEFAULT_RESULTS_PATH = '/Your/path/here/' 16 | DEFAULT_DATASETS_PATH = '/Your/path/here/' 17 | DEFAULT_FIGURES_PATH = '/Your/path/here/' 18 | 19 | import tensorFlowNetwork as tfnet 20 | import helperFuncs as helper 21 | 22 | DEFAULT_VAL_TYPE = 'cross' 23 | DEFAULT_NUM_CROSS_FOLDS = 5 24 | SAVE_RESULTS_EVERY_X_TESTS = 1 25 | 26 | def reloadFiles(): 27 | reload(helper) 28 | reload(tfnet) 29 | tfnet.reloadHelper() 30 | 31 | class TensorFlowSTLWrapper: 32 | 33 | def __init__(self, dataset_name, target_label, users_as_tasks=True, test_steps=9001, val_output_file=None, 34 | val_type=DEFAULT_VAL_TYPE, cont=False, results_path=DEFAULT_RESULTS_PATH, 35 | datasets_path=DEFAULT_DATASETS_PATH, figures_path=DEFAULT_FIGURES_PATH, architectures=None, 36 | num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, test_run=False, redo_test=False): 37 | self.datasets_path = datasets_path 38 | self.cont = cont 39 | self.val_type = val_type 40 | self.num_cross_folds = num_cross_folds 41 | self.test_steps = test_steps 42 | self.redo_test = redo_test 43 | self.users_as_tasks = users_as_tasks 44 | self.target_label = target_label 45 | if self.users_as_tasks: 46 | self.results_path = results_path + 'STL-OneModelPerUser/' 47 | self.figures_path = figures_path + 'STL-OneModelPerUser/' 48 | else: 49 | self.results_path = results_path + 'STL-Wellbeing/' 50 | self.figures_path = figures_path + 'STL-Wellbeing/' 51 | self.save_prefix = self.getSavePrefix(dataset_name, target_label, replace=cont) 52 | 53 | self.dataset_name = dataset_name 54 | self.data_df = pd.DataFrame.from_csv(self.datasets_path + self.dataset_name) 55 | self.wanted_feats = [x for x in self.data_df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and '_Label' not in x] 56 | if self.users_as_tasks: 57 | self.wanted_labels = [target_label] 58 | self.n_tasks = len(self.data_df['user_id'].unique()) 59 | else: 60 | self.wanted_labels = [x for x in self.data_df.columns.values if '_Label' in x and 'tomorrow_' in x and 'Evening' in x and 'Alertness' not in x and 'Energy' not in x] 61 | self.n_tasks = len(self.wanted_labels) 62 | 63 | #parameters that can be tuned: 64 | self.l2_regularizers = [1e-2, 1e-4] 65 | self.dropout = [True, False] 66 | self.decay = [True] 67 | self.decay_steps = [10000] 68 | self.decay_rates = [0.95] 69 | self.optimizers = [tf.train.AdamOptimizer] #[tf.train.AdagradOptimizer, tf.train.GradientDescentOptimizer 70 | self.train_steps =[4001] 71 | self.batch_sizes = [5,10,20] 72 | self.learning_rates = [.01, .001] 73 | self.architectures = [[100],[50,5],[100,10]] if architectures is None else architectures 74 | 75 | self.test_run = test_run 76 | if test_run: 77 | print "This is only a testing run. Using cheap settings to make it faster" 78 | self.l2_regularizers = [1e-2] 79 | self.dropout = [True] 80 | self.decay = [True] 81 | self.decay_steps = [10000] 82 | self.decay_rates = [0.95] 83 | self.optimizers = [tf.train.AdamOptimizer] #[tf.train.AdagradOptimizer, tf.train.GradientDescentOptimizer 84 | self.train_steps =[1001] 85 | self.batch_sizes = [10] 86 | self.learning_rates = [.001] 87 | self.architectures = [[100],[50,5]] if architectures is None else architectures 88 | 89 | self.calcNumSettingsDesired() 90 | 91 | #storing the results 92 | self.time_sum = 0 93 | if cont: 94 | self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.save_prefix + '.csv') 95 | print '\nPrevious validation results df loaded. 
It has', len(self.val_results_df), "rows"
96 | self.started_from = len(self.val_results_df)
97 | else:
98 | self.val_results_df = pd.DataFrame()
99 | self.started_from = 0
100 | 
101 | # store for computing the accuracy/auc the unfair way
102 | self.cumulative_test_preds = []
103 | self.cumulative_test_true = []
104 | 
105 | def getSavePrefix(self, file_name, target_label, replace=False):
106 | if '/' in file_name:
107 | slash_loc = file_name.find('/')
108 | file_name = file_name[slash_loc:]
109 | dash_loc = file_name.find('-')
110 | if self.users_as_tasks:
111 | task_name = "tfSTLUsers"
112 | label_name = '-' + helper.getFriendlyLabelName(target_label)
113 | else:
114 | task_name = "tfSTLWellbeing"
115 | label_name = ""
116 | prefix = task_name + file_name[dash_loc:-4] + label_name
117 | if not replace:
118 | while os.path.exists(self.results_path + prefix + '.csv'):
119 | prefix = prefix + '2'
120 | return prefix
121 | 
122 | def calcNumSettingsDesired(self):
123 | self.num_settings = len(self.l2_regularizers) * len(self.learning_rates) * len(self.dropout) * len(self.decay) \
124 | * len(self.batch_sizes) * len(self.optimizers) * len(self.train_steps) * len(self.architectures)
125 | if True in self.decay and (len(self.decay_steps) > 1 or len(self.decay_rates) > 1):
126 | self.num_settings = self.num_settings * ((len(self.decay_steps) * len(self.decay_rates)) / 2.0)
127 | 
128 | # use something like the following to test only one set of parameters:
129 | # wrapper.setParams(l2_regularizers=[1e-4], learning_rates=[.01], dropout=[True], decay=[True], batch_sizes=[50], optimizers=[tf.train.GradientDescentOptimizer])
130 | def setParams(self, l2_regularizers=None, learning_rates=None, dropout=None,
131 | decay=None, decay_steps=None, decay_rates=None, batch_sizes=None,
132 | optimizers=None, train_steps=None):
133 | '''does not override existing parameter settings if the parameter is not set'''
134 | self.l2_regularizers = l2_regularizers if l2_regularizers is not None else self.l2_regularizers
135 | self.learning_rates = learning_rates if learning_rates is not None else self.learning_rates
136 | self.dropout = dropout if dropout is not None else self.dropout
137 | self.decay = decay if decay is not None else self.decay
138 | self.decay_steps = decay_steps if decay_steps is not None else self.decay_steps
139 | self.decay_rates = decay_rates if decay_rates is not None else self.decay_rates
140 | self.batch_sizes = batch_sizes if batch_sizes is not None else self.batch_sizes
141 | self.optimizers = optimizers if optimizers is not None else self.optimizers
142 | self.train_steps = train_steps if train_steps is not None else self.train_steps
143 | def settingAlreadyDone(self, task):
144 | if len(self.val_results_df[(self.val_results_df['task_name'] == task)]) > 0:
145 | print "setting already tested"
146 | return True
147 | else:
148 | return False
149 | 
150 | def getResultsDictFromRow(self,row_df):
151 | best_results_dict = dict()
152 | for col in row_df.columns.values:
153 | best_results_dict[col] = row_df[col].tolist()[0]
154 | 
155 | for arch in self.architectures:
156 | if str(arch) == best_results_dict['hidden_layers']:
157 | best_results_dict['hidden_layers'] = arch
158 | 
159 | for opt_func in self.optimizers:
160 | if str(opt_func) == best_results_dict['optimizer']:
161 | best_results_dict['optimizer'] = opt_func
162 | 
163 | return best_results_dict
164 | 
165 | def constructNetwork(self, hidden_layers):
166 | connections = ['full'] * (len(hidden_layers)+1)
167 | self.net.setUpNetworkStructure(hidden_layers, connections)
168 | 
169 | def sweepParametersForOneTask(self, task_name, 
target_label):
170 | if self.users_as_tasks:
171 | task_df = self.data_df[self.data_df['user_id'] == task_name]
172 | else:
173 | task_df = self.data_df
174 | self.net = tfnet.TensorFlowNetwork(task_df, copy.deepcopy(self.wanted_feats), self.wanted_labels, verbose=False, val_type=self.val_type)
175 | 
176 | if len(self.net.train_X) == 0 or len(self.net.train_y) == 0:
177 | print "No training data for this task!"
178 | return dict()
179 | if len(self.net.test_X) == 0:
180 | print "No testing data for this task! Skipping"
181 | return dict()
182 | if np.shape(self.net.train_X)[1] == 0:
183 | print "All columns were null, this task has no features left!"
184 | return dict()
185 | if len(self.net.train_X) != len(self.net.train_y):
186 | print "Unequal length of X and Y dataframe!"
187 | return dict()
188 | 
189 | df = pd.DataFrame()
190 | 
191 | #sweep all possible combinations of parameters
192 | print "...sweeping all parameters for this task..."
193 | for hidden_layers in self.architectures:
194 | for l2_beta in self.l2_regularizers:
195 | for lrate in self.learning_rates:
196 | for dropout in self.dropout:
197 | for bsize in self.batch_sizes:
198 | for opt in self.optimizers:
199 | for tsteps in self.train_steps:
200 | for decay in self.decay:
201 | if decay:
202 | for dsteps in self.decay_steps:
203 | for drate in self.decay_rates:
204 | results_dict = self.testOneSettingForOneTask(hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps)
205 | df = df.append(results_dict, ignore_index=True)
206 | else:
207 | #decay steps and decay rate don't matter if decay is set to false
208 | results_dict = self.testOneSettingForOneTask(hidden_layers, l2_beta, lrate, dropout, decay, 10000, 0.95, bsize, opt, tsteps)
209 | df = df.append(results_dict, ignore_index=True)
210 | 
211 | accuracies = df['val_acc'].tolist()
212 | max_acc = max(accuracies)
213 | max_idx = accuracies.index(max_acc)
214 | 
215 | best_results_dict = df.iloc[max_idx]
216 | 
217 | #retrain with the best settings
218 | 
219 | test_acc, test_auc, test_preds = self.getFinalResultsForTask(best_results_dict)
220 | self.cumulative_test_preds.extend(test_preds)
221 | self.cumulative_test_true.extend(self.net.test_y) # the true test labels, not the test_X feature matrix; assumes the network exposes test_y alongside test_X
222 | 
223 | best_results_dict['test_acc'] = test_acc
224 | best_results_dict['test_auc'] = test_auc
225 | return best_results_dict
226 | 
227 | def find_best_setting(self, task):
228 | df = self.val_results_df[self.val_results_df['task_name'] == task]
229 | accuracies = df['val_acc'].tolist()
230 | max_acc = max(accuracies)
231 | max_idx = accuracies.index(max_acc)
232 | 
233 | best_results_dict = df.iloc[max_idx]
234 | return helper.fixSettingDictLoadedFromResultsDf(best_results_dict)
235 | 
236 | def getFinalResultsForTask(self, setting_dict):
237 | if self.users_as_tasks:
238 | task_df = self.data_df[self.data_df['user_id'] == setting_dict['task_name']]
239 | target_label = [self.target_label]
240 | else:
241 | task_df = self.data_df
242 | target_label = [helper.getOfficialLabelName(setting_dict['task_name'])]
243 | self.net = tfnet.TensorFlowNetwork(task_df, copy.deepcopy(self.wanted_feats), target_label, verbose=False, val_type=self.val_type)
244 | self.net.setParams(l2_beta=setting_dict['l2_beta'], initial_learning_rate=setting_dict['learning_rate'], decay=setting_dict['decay'],
245 | decay_steps=setting_dict['decay_steps'], decay_rate=setting_dict['decay_rate'], batch_size=setting_dict['batch_size'],
246 | optimizer=setting_dict['optimizer'], dropout=setting_dict['dropout'])
247 | 
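# Rebuild the graph for the chosen architecture, retrain on this task's full training data, and evaluate once on the held-out test split (runGraph returns the test predictions).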
self.constructNetwork(setting_dict['hidden_layers']) 248 | 249 | self.net.setUpGraph() 250 | preds = self.net.runGraph(self.test_steps, print_test=True, return_test_preds=True) 251 | 252 | preds_df = self.net.get_preds_for_df() 253 | label_name = setting_dict['task_name'] 254 | preds_df.to_csv(self.results_path + "Preds-" + self.save_prefix + label_name + '.csv') 255 | print "Preds df saved to", self.results_path + "Preds-" + self.save_prefix + label_name + '.csv' 256 | 257 | return self.net.final_test_results['acc'], self.net.final_test_results['auc'], preds 258 | 259 | def testOneSettingForOneTask(self, hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps): 260 | self.net.setParams(l2_beta=l2_beta, initial_learning_rate=lrate, decay=decay, 261 | decay_steps=dsteps, decay_rate=drate, batch_size=bsize, 262 | optimizer=opt, n_steps=tsteps, dropout=dropout) 263 | self.constructNetwork(hidden_layers) 264 | if self.val_type == 'cross': 265 | val_acc, val_auc, val_f1, val_prec, val_recall = self.net.trainAndCrossValidate() 266 | else: 267 | val_acc, val_auc, val_f1, val_prec, val_recall = self.net.trainAndValidate() 268 | 269 | results_dict = {'hidden_layers':hidden_layers, 'l2_beta': l2_beta, 'learning_rate': lrate, 270 | 'dropout': dropout, 'decay': decay, 'decay_steps': dsteps, 271 | 'decay_rate': drate, 'batch_size': bsize, 272 | 'optimizer': opt, 'val_acc': val_acc, 'val_auc':val_auc} 273 | 274 | return results_dict 275 | 276 | 277 | def runOneTask(self, task, target_label): 278 | print "\nRunning task", task 279 | if self.cont: 280 | if self.settingAlreadyDone(task): 281 | if self.redo_test: 282 | self.redoTestResult(task) 283 | best_setting = self.find_best_setting(task) 284 | print "The setting that produced the best validation results for task", task, "was:" 285 | print best_setting 286 | self.getFinalResultsForTask(best_setting) 287 | return 288 | 289 | t0 = time() 290 | 291 | results_dict = self.sweepParametersForOneTask(task, target_label) 292 | results_dict['task_name'] = task 293 | self.val_results_df = self.val_results_df.append(results_dict,ignore_index=True) 294 | 295 | print "\n", self.val_results_df.tail(n=1) 296 | t1 = time() 297 | this_time = t1 - t0 298 | print "It took", this_time, "seconds to obtain this result" 299 | 300 | self.time_sum = self.time_sum + this_time 301 | 302 | self.printTimeEstimate() 303 | sys.stdout.flush() 304 | 305 | #output the file every few iterations for safekeeping 306 | if len(self.val_results_df) % SAVE_RESULTS_EVERY_X_TESTS == 0: 307 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 308 | 309 | def printTimeEstimate(self): 310 | num_done = len(self.val_results_df)-self.started_from 311 | num_remaining = self.n_tasks - num_done - self.started_from 312 | avg_time = self.time_sum / num_done 313 | total_secs_remaining = int(avg_time * num_remaining) 314 | hours = total_secs_remaining / 60 / 60 315 | mins = (total_secs_remaining % 3600) / 60 316 | secs = (total_secs_remaining % 3600) % 60 317 | 318 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 319 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 320 | 321 | def run(self): 322 | print "\nYou have chosen to test a total of", self.num_settings, "settings for each task" 323 | print "There are", self.n_tasks, "tasks, meaning you are training a total of..." 324 | print "\t", self.num_settings * self.n_tasks, "neural networks!!" 
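# For reference: with the default (non-test-run) grid defined in __init__ above (2 L2 betas x 2 learning rates x 2 dropout options x 3 batch sizes x 3 architectures, and a single choice each for decay, optimizer, and train steps), self.num_settings works out to 72 per task.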
325 | sys.stdout.flush()
326 | 
327 | if self.users_as_tasks:
328 | tasks = self.data_df['user_id'].unique()
329 | else:
330 | tasks = [helper.getFriendlyLabelName(x) for x in self.wanted_labels]
331 | 
332 | 
333 | for i in range(len(tasks)):
334 | if self.users_as_tasks:
335 | self.runOneTask(tasks[i], self.target_label)
336 | else:
337 | self.runOneTask(tasks[i], self.wanted_labels[i])
338 | if self.test_run and i > 2:
339 | break
340 | 
341 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv')
342 | 
343 | if self.users_as_tasks:
344 | print "\n\nFINAL RESULTS - Averaging individual models:"
345 | print "\tValidation set: Accuracy =", np.nanmean(self.val_results_df['val_acc']), "AUC = ", np.nanmean(self.val_results_df['val_auc'])
346 | print "\tTest set: Accuracy =", np.nanmean(self.val_results_df['test_acc']), "AUC = ", np.nanmean(self.val_results_df['test_auc'])
347 | print ""
348 | print "FINAL RESULTS - Aggregating predictions of individual models"
349 | agg_auc = helper.computeAuc(self.cumulative_test_preds, self.cumulative_test_true)
350 | agg_acc = helper.getBinaryAccuracy(self.cumulative_test_preds, self.cumulative_test_true)
351 | print "\tTest set: Accuracy =", agg_acc, "AUC = ", agg_auc
352 | 
353 | 
354 | if __name__ == "__main__":
355 | print "TENSOR FLOW STL MODEL SELECTION"
356 | print "\tFor each task individually, this code will sweep a set of network architectures and parameters to find the ideal settings"
357 | print "\tIt will record the settings, validation and test results for each user"
358 | 
359 | if len(sys.argv) < 3:
360 | print "Error: usage is python tensorFlowWrapperSTL.py <data file> <task type> <target label> <continue> <redo>"
361 | print "\t<data file>: e.g. dataset-Simple-Group.csv - program will look in the following directory for this file", DEFAULT_DATASETS_PATH
362 | print "\t<task type>: type 'users' for users as tasks, or 'wellbeing' for wellbeing measures as tasks"
363 | print "\t<target label>: Only required for users-as-tasks. Enter the name of the label you would like to classify on. E.g. tomorrow_Group_Happiness_Evening_Label."
364 | print "\t<continue>: optional. If 'True', the neural net will pick up from where it left off by loading a previous validation results file"
365 | print "\t<redo>: optional. If 'redo' the neural net will go through the saved validation results file and compute test predictions for each user for each setting. It will collect all the preds and only compute AUC at the end"
366 | sys.exit()
367 | filename = sys.argv[1] #get data file from command line argument
368 | task_type = sys.argv[2]
369 | if len(sys.argv) >= 4:
370 | target_label = sys.argv[3]
371 | print "Classifying on target label:", target_label
372 | else:
373 | target_label = None
374 | 
375 | print "\nLoading dataset", DEFAULT_DATASETS_PATH + filename
376 | if task_type == 'wellbeing':
377 | users_as_tasks = False
378 | print "Performing wellbeing-as-tasks classification\n"
379 | else:
380 | users_as_tasks = True
381 | print "Performing users-as-tasks classification\n"
382 | 
383 | if len(sys.argv) >= 5 and sys.argv[4] == 'True':
384 | cont = True
385 | print "Okay, will continue from a previously saved validation results file for this problem"
386 | else:
387 | cont = False
388 | print ""
389 | 
390 | redo = False
391 | if len(sys.argv) >= 6 and sys.argv[5] == 'redo':
392 | redo = True
393 | print "Okay, will redo all the test results to get a better AUC"
394 | 
395 | 
396 | wrapper = TensorFlowSTLWrapper(filename, target_label=target_label, users_as_tasks=users_as_tasks, cont=cont,
397 | results_path=DEFAULT_RESULTS_PATH, datasets_path=DEFAULT_DATASETS_PATH, figures_path=DEFAULT_FIGURES_PATH)
398 | 
399 | if not redo:
400 | print "\nThe following parameter settings will be tested for each task:"
401 | print "\tl2_regularizers: \t", wrapper.l2_regularizers
402 | print "\tlearning_rates: \t", wrapper.learning_rates
403 | print "\tdropout: \t", wrapper.dropout
404 | print "\tdecay: \t", wrapper.decay
405 | print "\tdecay_steps: \t", wrapper.decay_steps
406 | print "\tdecay_rates: \t", wrapper.decay_rates
407 | print "\tbatch_sizes: \t", wrapper.batch_sizes
408 | print "\toptimizers: \t", wrapper.optimizers
409 | print "\ttrain_steps: \t", wrapper.train_steps
410 | 
411 | print "\nThe following network structures will be tested:"
412 | print "\t", wrapper.architectures
413 | 
414 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv'
415 | print "\nThe validation accuracy figures will be saved in:", wrapper.figures_path + wrapper.save_prefix + '.eps'
416 | 
417 | wrapper.run()
418 | else:
419 | wrapper.redoAllTestsResults()
420 | 
421 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Personalized Multitask Learning
2 | This repo contains code for 3 multitask machine learning methods: deep neural networks, Multitask Multi-kernel Learning (MTMKL), and a hierarchical Bayesian model (HBLR). These methods can be used to personalize the prediction of outcomes like stress or happiness to each individual, by treating the prediction of each individual's outcome (or each cluster of related individuals' outcomes) as its own task.
3 | 
4 | The code is related to two research papers which explain this approach in further detail:
5 | 
6 | Taylor, S.\*, Jaques, N.\*, Nosakhare, E., Sano, A., Picard, R., "Personalized Multitask Learning for Predicting Tomorrow’s Mood, Stress, and Health", IEEE Transactions on Affective Computing, December 2017. (\*equal contribution) PDF
7 | 
8 | Jaques, N.\*, Taylor, S.\*, Nosakhare, E., Sano, A., Picard, R., "Multi-task Learning for Predicting Health, Stress, and Happiness", NIPS Workshop on Machine Learning for Healthcare, December 2016, Barcelona, Spain. (\*equal contribution) PDF *BEST PAPER AWARD*
9 | 
10 | If you find this code useful, please cite our work!
11 | 
12 | If you have any questions about this code or the associated papers, please email us at jaquesn@mit.edu or sataylor@mit.edu.
13 | 
14 | # Models in this code:
15 | 
16 | ## Multitask Neural Network (MTL-NN)
17 | 
18 | ![image](mtl_nn_clusters.png)
19 | 
20 | The intuition behind the multitask neural network design is that the shared layers will learn to extract information
21 | that is useful for summarizing relevant characteristics of any person’s day into an efficient, generalizable embedding.
22 | The final, task-specific layers are then expected to learn how to map this embedding to a prediction customized for each person or cluster of people.
23 | 
24 | For example, if the shared layers learn to condense all of the relevant smartphone app data about phone calls and
25 | texting into an aggregate measure of social support, the task-specific layers can then learn a unique weighting of this
26 | measure for each cluster of participants. Perhaps a cluster containing participants with high extroversion scores will
27 | be more strongly affected by a lack of social support than another cluster.
28 | 
29 | ## Multitask Multi-kernel Learning (MTMKL)
30 | 
31 | MTMKL (originally developed by Kandemir
32 | et al.) is a modified version of Multi-Kernel Learning (MKL) in which tasks
33 | share information through kernel weights on the modalities. MTMKL uses a least-squares support vector machine (LSSVM)
34 | for each task-specific model. Unlike the canonical SVM, the LSSVM uses a quadratic error on the “slack” variables
35 | instead of an L1 error. As a result, the LSSVM can be learned by solving a series of linear equations, in contrast to
36 | using quadratic programming to learn a canonical SVM model.
37 | 
38 | 
39 | ## Hierarchical Bayesian Logistic Regression (HBLR)
40 | 
41 | In hierarchical Bayesian MTL approaches, the model for each task draws its parameters from a common prior distribution.
42 | As the model is trained, the common prior is updated, allowing information to be shared across tasks. The model we
43 | adopt, which was originally proposed by Xue et al., draws logistic regression (LR) weights for each task
44 | from a shared Dirichlet Process (DP) prior; we call this model Hierarchical Bayesian Logistic Regression (HBLR).
45 | 
46 | In contrast with our prior approaches (MTL-NN and MTMKL), the HBLR model allows us to directly define each task as
47 | predicting a label (e.g. tomorrow's stress level) of a single user, since the model is able to implicitly learn its
48 | own (soft) clustering. This model clusters tasks that are most similar in terms of the relationship between their
49 | input features and their resulting outcome (i.e. their decision boundaries), while simultaneously learning the prediction
50 | function.
51 | 
52 | ## Single Task Learning models
53 | Code to train a logistic regression model, an LSSVM, and a single-task neural network is included for comparison purposes.
54 | 
55 | # Structure
56 | 
57 | ## Code structure
58 | Wrappers are used to perform a grid search over hyperparameters. The file `run_jobs.py` can be used to launch the training of several models in sequence, and send emails after they complete. To see an example of how to run the training code for the models, see `jobs_to_run.txt`.
59 | 
60 | ## Input data format
61 | ### .csv files
62 | Input csvs are assumed to have columns for 'user_id' and 'timestamp', plus columns for the outcome labels whose names contain the string '_Label'. 
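For example, here is a minimal sketch (not part of the repo) of the column filtering the wrapper classes apply to such a csv, using the included `example_data.csv`:

```python
import pandas as pd

df = pd.read_csv('example_data.csv', index_col=0)

# Label columns are identified by the '_Label' naming convention.
label_cols = [c for c in df.columns if '_Label' in c]

# Everything except bookkeeping columns and labels is treated as a feature.
feat_cols = [c for c in df.columns
             if c not in ('user_id', 'timestamp', 'dataset') and '_Label' not in c]

print(label_cols)                      # ['tomorrow_Happiness_Evening_Label', ...]
print(len(feat_cols), 'feature columns')
```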
63 | 
64 | ### 'Task dict list'
65 | For the multi-task algorithms, we use a special data structure saved to a pickle file to represent the data from multiple tasks. The code for generating files in this format given a .csv file is available in make_datasets.py. To run it, use:
66 | 
67 | ```python make_datasets.py --datafile='./example_data.csv' --task_type='users'```
68 | 
69 | #### File Format details
70 | - Data for both labels-as-tasks and users-as-tasks are stored in pickled files as a list of dicts (each list item represents a task)
71 | - Labels-as-tasks:
72 | - The .csv file will be partitioned such that predicting each related outcome is a separate task (e.g. predicting stress is one task and predicting happiness is another)
73 | - Normalization is done based on the training data for the entire group
74 | - Users-as-tasks:
75 | - The .csv file will be partitioned such that predicting the outcome of each user is one task.
76 | - Need to specify which label to target (i.e., the label that you will be predicting)
77 | - Normalization is done per-person
78 | 
79 | - Each task is a dict containing 4 keys:
80 | - ‘Name’: gives the name of the task, e.g. "Group_Happiness_Evening_Label" or a user ID
81 | - ‘X’: the data matrix. Rows are samples, columns are features. Does not contain extraneous columns like ‘user_id’ and ‘timestamp’, and has already been normalized and had empty cells filled
82 | - ‘Y’: the classification labels for this task, in the same order as the rows of X
83 | - ‘ModalityDict’: used for the MTMKL model. Maps modalities like “phys” or “location” to their start index in the feature list (a loading sketch follows the example data below)
84 | 
-------------------------------------------------------------------------------- /__pycache__/helperFuncs.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mitmedialab/PersonalizedMultitaskLearning/2de7d9485f5ac09264bfa624f16c5b05a5a44ada/__pycache__/helperFuncs.cpython-35.pyc -------------------------------------------------------------------------------- /example_data.csv: --------------------------------------------------------------------------------
1 | ,user_id,timestamp,dataset,classifier_friendly_ppt_id,classifier_friendly_day_of_week,location_time_spent_on_campus,location_log_likelihood_of_day,weather_max_precip_intensity,call_0H-24H_total_num_missed,phys_3H-10H:percentHighPeakNoArtifact,phys_3H-10H:percentMedPeakNoArtifact,phys_3H-10H:sumTempWeightedAUC,screen_0H-3H_total_duration,sms_0H-3H_total_num_incoming,sms_17H-24H_unique_num_incoming,sms_17H-24H_unique_num_outgoing,sms_0H-24H_unique_num_outgoing,tomorrow_Happiness_Evening_Label,tomorrow_Health_Evening_Label,tomorrow_Calmness_Evening_Label
0,1,8/22/17 0:00,Val,0,4,0,1434.850264,0,1,0,0,-0.369250141,49,0,4,3,4,1,1,
1,1,8/23/17 0:00,Val,0,5,0,1384.511324,0,0,0,0,-47.11177225,21,0,2,1,1,1,1,0
2,1,8/24/17 0:00,Train,0,6,0,1432.698762,0,0,0,0,-8.383537082,10,1,3,3,5,1,0,1
3,1,8/25/17 0:00,Val,0,0,900,1282.323883,0,0,0,0,-32.14207652,0,0,4,4,6,0,0,0
4,1,8/26/17 0:00,Test,0,1,0,617.3508313,0,0,0,0,-123.1579134,66,0,3,3,3,1,1,0
5,2,8/22/17 0:00,Train,1,4,0,-24.69563937,0,0,0,0,-1.351502791,0,0,5,5,6,1,,0
6,2,8/23/17 0:00,Train,1,5,900,1433.587585,0,1,0.238095238,0.238095238,-65.58436476,0,0,4,3,3,1,,1
7,2,8/24/17 0:00,Train,1,6,0,662.5491124,0,0,0,0,-16.83429783,0,0,5,5,9,1,1,
8,2,8/25/17 0:00,Test,1,0,0,966.1353508,0,0,0,0,-135.3018584,0,0,4,4,9,1,1,1
9,2,8/26/17 0:00,Test,1,1,0,757.6295022,0.03,0,0.476190476,0.476190476,-217.5607483,20,0,1,1,6,1,1,
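Putting this together, here is a minimal sketch of loading and inspecting one of the pickled task lists described above. The file name below is hypothetical; see `make_datasets.py` for the actual naming scheme it produces:

```python
import pickle

# Hypothetical file name -- make_datasets.py determines the real one.
with open('datasetTaskList-Example_Train.p', 'rb') as f:
    tasks = pickle.load(f)

# Each list entry is one task with the four documented keys.
for task in tasks:
    print(task['Name'], task['X'].shape, len(task['Y']),
          sorted(task['ModalityDict'].keys()))
```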
-------------------------------------------------------------------------------- /generic_wrapper.py: -------------------------------------------------------------------------------- 1 | """These abstract wrapper classes are designed to enable hyperparameter sweeps 2 | for a variety of different models that inherit them. 3 | 4 | Note: STL stands for Single-Task-Learning, i.e. normal machine learning 5 | algorithms like SVM, logistic regression, etc.""" 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import os 10 | import sys 11 | import copy 12 | from time import time 13 | 14 | CODE_PATH = os.path.dirname(os.getcwd()) 15 | sys.path.append(CODE_PATH) 16 | 17 | DEFAULT_MAIN_DIRECTORY = '/Your/path/here/' 18 | 19 | DEFAULT_VALIDATION_TYPE = 'cross' #'val' 20 | DEFAULT_NUM_CROSS_FOLDS = 5 21 | 22 | import helperFuncs as helper 23 | 24 | def reload_dependencies(): 25 | reload(helper) 26 | 27 | # This optimizes parameters individually for each task 28 | 29 | class STLWrapper: 30 | """ WARNING: This code only deals with input files in the form of pickled task lists, 31 | and only implements cross validation.""" 32 | def __init__(self, file_prefix, users_as_tasks=False, cont=False, classifier_name='LSSVM', 33 | num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, main_directory=DEFAULT_MAIN_DIRECTORY, 34 | datasets_path='Data/Datasets/Discard20/', cant_train_with_one_class=True, 35 | check_test=False, save_results_every_nth=3, test_csv_filename=None): 36 | """ Initializes the parent model with fields useful for all child wrapper classes 37 | 38 | Args: 39 | file_prefix: The first portion of the name of a set of pickled task lists, e.g. 40 | 'datasetTaskList-Discard-Future-Group_' 41 | users_as_tasks: A boolean. If true, will assume there are many tasks and each task 42 | is one person. Will not print results per task. 43 | cont: A boolean. If true, will try to load a saved results .csv and continue 44 | training on the next unfinished result. 45 | classifier_name: String name of the classifier trained. Used to know where to save 46 | results. 47 | num_cross_folds: An integer number of folds to use in cross validation. 48 | main_directory: The path to the main dropbox directory which contains the results and 49 | data directories. 50 | datasets_path: The path from the main dropbox to the datasets directory. 51 | cant_train_with_one_class: A boolean. If true, if the model encounters a task with 52 | only one type of label in the training data, it will just predict the most 53 | frequent class. 54 | check_test: A boolean. If true, will evaluate final results on held-out test set 55 | after running. 56 | save_results_every_nth: An integer representing the number of settings to test before 57 | writing the results df to a csv file. 
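test_csv_filename: Optional string name of a csv file within datasets_path; if provided, final test-set predictions will be computed for this file and saved to the results directory.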
58 | """ 59 | # memorize arguments and construct paths 60 | self.main_directory = main_directory 61 | self.classifier_name = classifier_name 62 | self.results_path = main_directory + 'Results/' + classifier_name + '/' 63 | self.figures_path = main_directory + 'Figures/' + classifier_name + '/' 64 | self.datasets_path = main_directory + datasets_path 65 | self.cont = cont 66 | self.users_as_tasks = users_as_tasks 67 | self.cant_train_with_one_class = cant_train_with_one_class 68 | self.check_test = check_test 69 | self.save_results_every_nth = save_results_every_nth 70 | self.file_prefix = file_prefix 71 | self.save_prefix = self.get_save_prefix(file_prefix, replace=cont) 72 | if test_csv_filename is not None: 73 | self.test_csv_filename = self.datasets_path + test_csv_filename 74 | else: 75 | self.test_csv_filename = None 76 | 77 | self.params = {} 78 | self.define_params() 79 | 80 | self.load_data() 81 | 82 | self.calc_num_param_settings() 83 | self.construct_list_of_params_to_test() 84 | 85 | #storing the results 86 | self.time_sum = 0 87 | if cont: 88 | self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.save_prefix + '.csv') 89 | print '\nPrevious validation results df loaded. It has', len(self.val_results_df), "rows" 90 | self.started_from = len(self.val_results_df) 91 | else: 92 | self.val_results_df = pd.DataFrame() 93 | self.started_from = 0 94 | 95 | self.num_cross_folds = num_cross_folds 96 | helper.generateCrossValPickleFiles(self.datasets_path, self.file_prefix, self.num_cross_folds) 97 | 98 | # These functions need to be overwritten by the child class 99 | def define_params(self): 100 | """ This function should set self.params to a dict where they keys represent names of parameters 101 | to test (e.g. for SVM, 'C') as they should be saved to the val_results_df, and the values of 102 | self.params should be a list of values for the parameter that need to be tested. An example 103 | dict: 104 | self.params['C'] = [1,10,100] 105 | self.params['beta'] = [.001, .01, .1] 106 | """ 107 | print "Error! define_params should be overwritten in child class" 108 | raise NotImplementedError 109 | 110 | def train_and_predict_task(self, t, train_X, train_y, eval_X, param_dict): 111 | print "Error! train_model_for_task should be overwritten in child class" 112 | raise NotImplementedError 113 | 114 | def predict_task(self, X, t): 115 | print "Error! predict_task should be overwritten in child class" 116 | raise NotImplementedError 117 | 118 | def calc_num_param_settings(self): 119 | self.num_settings = self.n_tasks 120 | for key in self.params: 121 | self.num_settings = self.num_settings * len(self.params[key]) 122 | 123 | def construct_list_of_params_to_test(self): 124 | """Will make a class level variable that is a list of parameter dicts. 125 | Each entry in the list is a dict of parameter settings, 126 | eg. {'C'=1.0, 'beta'=.01, ...}. All tasks can use this list to train 127 | against all settings.""" 128 | self.list_of_param_settings = [] 129 | self.recurse_and_append_params(copy.deepcopy(self.params), {}) 130 | 131 | def recurse_and_append_params(self, param_settings_left, this_param_dict, debug=False): 132 | """param_settings_left is a dictionary of lists. The keys are parameters 133 | (like 'C'), the values are the list of settings for those parameters that 134 | need to be tested (like [1.0, 10.0, 100.0]). this_param_dict is a dictionary 135 | containing a single setting for each parameter. 
If a parameter is not in 136 | this_param_dict's keys, a setting for it has not been chosen yet. 137 | 138 | Performs breadth-first-search""" 139 | if debug: print "Working on a parameter dict containing", this_param_dict 140 | for key in self.params.keys(): 141 | if key in this_param_dict: 142 | continue 143 | else: 144 | this_setting = param_settings_left[key].pop() 145 | if debug: print "Popped", key, "=", this_setting, "off the params left" 146 | if len(param_settings_left[key]) > 0: 147 | if debug: print "Recursing on remaining parameters", param_settings_left 148 | self.recurse_and_append_params(copy.deepcopy(param_settings_left), 149 | copy.deepcopy(this_param_dict)) 150 | if debug: print "Placing the popped setting", key, "=", this_setting, "into the parameter dict" 151 | this_param_dict[key] = this_setting 152 | 153 | self.list_of_param_settings.append(this_param_dict) 154 | if debug: print "Appending parameter dict to list:", this_param_dict, "\n" 155 | 156 | def load_data(self): 157 | self.test_tasks = helper.loadPickledTaskList(self.datasets_path, self.file_prefix, "Test",fix_y=True) 158 | self.train_tasks = helper.loadPickledTaskList(self.datasets_path, self.file_prefix, "Train",fix_y=True) 159 | self.n_tasks = len(self.train_tasks) 160 | 161 | def get_save_prefix(self, file_prefix, replace=False): 162 | name_modifier = "" 163 | if '/' in file_prefix: 164 | if "NoLocation" in file_prefix: 165 | name_modifier = "-noloc" 166 | slash_loc = file_prefix.find('/') 167 | path_modifier = file_prefix[0:slash_loc+1] 168 | file_prefix = file_prefix[slash_loc+1:] 169 | self.file_prefix = file_prefix 170 | self.datasets_path += path_modifier 171 | 172 | dash_loc = file_prefix.find('-') 173 | 174 | if self.users_as_tasks: 175 | task_str = '_users' 176 | else: 177 | task_str = '_wellbeing' 178 | 179 | prefix = self.classifier_name + task_str + file_prefix[dash_loc:-1] + name_modifier 180 | 181 | if not replace: 182 | while os.path.exists(self.results_path + prefix + '.csv'): 183 | prefix = prefix + '2' 184 | return prefix 185 | 186 | def setting_already_done(self, param_dict): 187 | mini_df = self.val_results_df 188 | for key in param_dict.keys(): 189 | mini_df = mini_df[mini_df[key] == param_dict[key]] 190 | if len(mini_df) == 0: 191 | return False 192 | print "Setting already tested" 193 | return True 194 | 195 | def convert_param_dict_for_use(self, param_dict): 196 | """When loading rows from a saved results df in csv format, some 197 | of the settings may end up being converted to a string representation 198 | and need to be converted back to actual numbers and objects. 
199 | 200 | May need to be overwritten in child class.""" 201 | param_dict['task_num'] = int(param_dict['task_num']) 202 | return param_dict 203 | 204 | def get_preds_true_for_task(self,train_tasks, test_tasks, param_dict): 205 | t = param_dict['task_num'] 206 | X = train_tasks[t]['X'] 207 | y = train_tasks[t]['Y'] 208 | 209 | test_X = test_tasks[t]['X'] 210 | true_y = list(test_tasks[t]['Y'].flatten()) 211 | 212 | if len(y)==0 or len(X)==0 or len(test_X) == 0 or len(true_y)==0: 213 | return None, None 214 | 215 | if self.cant_train_with_one_class and len(np.unique(y))==1: 216 | preds = list(np.unique(y)[0]*np.ones(len(true_y))) 217 | else: 218 | preds = self.train_and_predict_task(t, X, y, test_X, param_dict) 219 | 220 | return preds, true_y 221 | 222 | def sweep_all_parameters(self): 223 | print "\nYou have chosen to test a total of", self.num_settings / self.n_tasks, "settings" 224 | print "for each of", self.n_tasks, "tasks, leading to a total of..." 225 | print self.num_settings, "models to train!!" 226 | sys.stdout.flush() 227 | 228 | #sweep all possible combinations of parameters 229 | for t in range(self.n_tasks): 230 | print "\nSweeping all parameters for task t:", self.train_tasks[t]['Name'] 231 | for param_dict in self.list_of_param_settings: 232 | these_params = copy.deepcopy(param_dict) 233 | these_params['task_num'] = t 234 | these_params['task_name'] = self.train_tasks[t]['Name'] 235 | self.test_one_setting(these_params) 236 | 237 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 238 | 239 | def test_one_setting(self, param_dict): 240 | if self.cont and self.setting_already_done(param_dict): 241 | return 242 | t0 = time() 243 | 244 | results_dict = self.get_cross_validation_results(param_dict) 245 | self.val_results_df = self.val_results_df.append(results_dict,ignore_index=True) 246 | 247 | t1 = time() 248 | this_time = t1 - t0 249 | self.time_sum = self.time_sum + this_time 250 | 251 | print "\n", self.val_results_df.tail(n=1) 252 | print "It took", this_time, "seconds to obtain this result" 253 | self.print_time_estimate() 254 | 255 | sys.stdout.flush() 256 | 257 | #output the file every few iterations for safekeeping 258 | if len(self.val_results_df) % self.save_results_every_nth == 0: 259 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 260 | 261 | def get_cross_validation_results(self, param_dict, print_per_fold=False): 262 | all_acc = [] 263 | all_auc = [] 264 | all_f1 = [] 265 | all_precision = [] 266 | all_recall = [] 267 | 268 | for f in range(self.num_cross_folds): 269 | train_tasks, val_tasks = helper.loadCrossValData(self.datasets_path, self.file_prefix, f, fix_y=True) 270 | 271 | preds, true_y = self.get_preds_true_for_task(train_tasks, val_tasks, param_dict) 272 | if preds is None or true_y is None: 273 | continue 274 | 275 | acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(preds, true_y) 276 | all_acc.append(acc) 277 | all_auc.append(auc) 278 | all_f1.append(f1) 279 | all_precision.append(precision) 280 | all_recall.append(recall) 281 | if print_per_fold: print "Fold", f, "acc", acc, "auc", auc, "f1", f1, "precision",precision,"recall",recall 282 | 283 | if print_per_fold: 284 | print "accs for all folds", all_acc 285 | print "aucs for all folds", all_auc 286 | 287 | # Add results to the dictionary 288 | param_dict['val_acc'] = np.nanmean(all_acc) 289 | param_dict['val_auc'] = np.nanmean(all_auc) 290 | param_dict['val_f1'] = np.nanmean(all_f1) 291 | param_dict['val_precision'] = 
np.nanmean(all_precision) 292 | param_dict['val_recall'] = np.nanmean(all_recall) 293 | 294 | return param_dict 295 | 296 | def print_time_estimate(self): 297 | num_done = len(self.val_results_df)-self.started_from 298 | num_remaining = self.num_settings - num_done - self.started_from 299 | avg_time = self.time_sum / num_done 300 | total_secs_remaining = int(avg_time * num_remaining) 301 | hours = total_secs_remaining / 60 / 60 302 | mins = (total_secs_remaining % 3600) / 60 303 | secs = (total_secs_remaining % 3600) % 60 304 | 305 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 306 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 307 | 308 | def get_baseline(self, Y): 309 | Y = Y.tolist() 310 | percent_true = float(Y.count(1.0)) / float(len(Y)) 311 | if percent_true < 0.5: 312 | return 1.0 - percent_true 313 | else: 314 | return percent_true 315 | 316 | def find_best_setting_for_task(self, task_num, optimize_for='val_acc'): 317 | task_df = self.val_results_df[self.val_results_df['task_num']==task_num] 318 | accuracies = task_df[optimize_for].tolist() 319 | max_acc = max(accuracies) 320 | max_idx = accuracies.index(max_acc) 321 | return task_df.iloc[max_idx] 322 | 323 | def get_final_results(self, optimize_for='val_acc'): 324 | if self.users_as_tasks and not self.check_test: 325 | print "check_test is set to false, Will not evaluate performance on held-out test set." 326 | return 327 | print "\nAbout to evaluate results on held-out test set!!" 328 | print "Will use the settings that produced the best", optimize_for 329 | 330 | all_preds = [] 331 | all_true_y = [] 332 | per_task_accs = [] 333 | per_task_aucs = [] 334 | per_task_f1 = [] 335 | per_task_precision = [] 336 | per_task_recall = [] 337 | 338 | for t in range(self.n_tasks): 339 | task_settings = self.find_best_setting_for_task(t, optimize_for=optimize_for) 340 | assert(task_settings['task_num'] == t) 341 | if not self.users_as_tasks: 342 | print "\nBEST SETTING FOR TASK", t, "-", task_settings['task_name'] 343 | print "The highest", optimize_for, "of", task_settings[optimize_for], "was found with the following settings:" 344 | print task_settings 345 | 346 | task_settings = self.convert_param_dict_for_use(task_settings) 347 | preds, true_y = self.get_preds_true_for_task(self.train_tasks, self.test_tasks, task_settings) 348 | if preds is None or true_y is None: 349 | continue 350 | 351 | all_preds.extend(preds) 352 | all_true_y.extend(true_y) 353 | 354 | # save the per-task results 355 | t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y) 356 | per_task_accs.append(t_acc) 357 | per_task_aucs.append(t_auc) 358 | per_task_f1.append(t_f1) 359 | per_task_precision.append(t_precision) 360 | per_task_recall.append(t_recall) 361 | 362 | if not self.users_as_tasks: 363 | print "\nFINAL TEST RESULTS FOR", helper.getFriendlyLabelName(self.train_tasks[t]['Name']) 364 | print 'Acc:', t_acc, 'AUC:', t_auc, 'F1:', t_f1, 'Precision:', t_precision, 'Recall:', t_recall 365 | 366 | print "\nHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS" 367 | avg_acc = np.nanmean(per_task_accs) 368 | avg_auc = np.nanmean(per_task_aucs) 369 | avg_f1 = np.nanmean(per_task_f1) 370 | avg_precision = np.nanmean(per_task_precision) 371 | avg_recall = np.nanmean(per_task_recall) 372 | print 'Acc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall 373 | 374 | if self.test_csv_filename is not None: 375 | print "\tSAVING 
HELD OUT PREDICTIONS"
376 | if self.users_as_tasks:
377 | task_column = 'user_id'
378 | label_name = helper.getFriendlyLabelName(self.file_prefix)
379 | wanted_label = helper.getOfficialLabelName(label_name)
380 | predictions_df = helper.get_test_predictions_for_df_with_task_column(
381 | self.predict_task, self.test_csv_filename, task_column, self.test_tasks,
382 | wanted_label=wanted_label, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
383 | label_name=label_name, tasks_are_ints=False)
384 | else:
385 | predictions_df = helper.get_test_predictions_for_df_with_no_task_column(self.predict_task,
386 | self.test_csv_filename, self.test_tasks, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
387 | predictions_df.to_csv(self.results_path + "Preds-" + self.save_prefix + '.csv')
388 | else:
389 | print "Uh oh, the test csv filename was not set, can't save test preds"
390 | 
391 | def run(self):
392 | self.sweep_all_parameters()
393 | self.get_final_results()
394 | 
395 | 
-------------------------------------------------------------------------------- /helperFuncs.py: --------------------------------------------------------------------------------
1 | """Collection of utility functions to support the rest of the code."""
2 | import numpy as np
3 | import pandas as pd
4 | import copy
5 | import os
6 | import pickle
7 | from scipy import stats
8 | from sklearn.metrics import auc, roc_auc_score, f1_score, precision_score, recall_score
9 | import ast
10 | import tensorflow as tf
11 | import matplotlib.pyplot as plt # needed by plotROC below
12 | NAN_FILL_VALUE = 0
13 | 
14 | def computeAuc(preds, true_y):
15 | try:
16 | return roc_auc_score(true_y, preds)
17 | except:
18 | return np.nan
19 | 
20 | def computeF1(preds, true_y):
21 | try:
22 | if (1 not in true_y) or (1 not in preds):
23 | # F-score is ill-defined when there are no true samples
24 | # F-score is ill-defined when there are no predicted samples.
25 | return np.nan
26 | return f1_score(true_y, preds)
27 | except:
28 | return np.nan
29 | 
30 | #The precision is the ratio tp / (tp + fp) where tp is the number of
31 | #true positives and fp the number of false positives.
32 | def computePrecision(preds, true_y):
33 | try:
34 | if (1 not in preds):
35 | #Precision is ill-defined when there are no predicted samples.
36 | return np.nan
37 | return precision_score(true_y, preds)
38 | except:
39 | return np.nan
40 | 
41 | #The recall is the ratio tp / (tp + fn) where tp is the number of true
42 | #positives and fn the number of false negatives. The recall is intuitively
43 | #the ability of the classifier to find all the positive samples.
44 | def computeRecall(preds, true_y):
45 | try:
46 | if 1 not in true_y:
47 | # Recall is ill-defined and being set to 0.0 due to no true samples
48 | return np.nan
49 | return recall_score(true_y, preds)
50 | except:
51 | return np.nan
52 | 
53 | def computeDistanceFromBaseline(preds, true_y):
54 | if len(np.shape(preds)) > 1:
55 | print("ERROR! 
Baseline distance function not defined for multi-dimensional predictions") 56 | return np.nan 57 | baseline = getBaseline(true_y) 58 | acc = getBinaryAccuracy(preds,true_y) 59 | return acc - baseline 60 | 61 | def computeAllMetricsForPreds(preds, true_y): 62 | acc = getBinaryAccuracy(preds,true_y) 63 | auc = computeAuc(preds, true_y) 64 | f1 = computeF1(preds, true_y) 65 | precision = computePrecision(preds, true_y) 66 | recall = computeRecall(preds, true_y) 67 | return acc, auc, f1, precision, recall 68 | 69 | def checkTaskList(train_tasks): 70 | for t in range(len(train_tasks)): 71 | isValidTask(train_tasks,t) 72 | print("...done!") 73 | 74 | def isValidTask(train_tasks, t, print_msgs=True): 75 | if train_tasks[t]['Y'] is None or train_tasks[t]['X'] is None: 76 | if print_msgs: print("Uh oh,", train_tasks[t]['Name'], "is None!!") 77 | return False 78 | elif len(train_tasks[t]['X']) == 0: 79 | if print_msgs: print("Uh oh,", train_tasks[t]['Name'], "has no data!") 80 | return False 81 | elif len(train_tasks[t]['X']) != len(train_tasks[t]['Y']): 82 | if print_msgs: print("Uh oh,", train_tasks[t]['Name'], 83 | "has messed up data! Lengths of X and Y don't match") 84 | return False 85 | return True 86 | 87 | def getBootstrapSample(test_df): 88 | bootstrap_ix = np.random.choice(test_df.index,len(test_df)) 89 | 90 | test_df = test_df.loc[bootstrap_ix] 91 | test_df = test_df.reset_index() 92 | test_df = test_df.drop('index',1) 93 | return test_df 94 | 95 | def plotROC(auc_list,fpr_list,tpr_list): 96 | mean_tpr = 0.0 97 | mean_fpr = np.linspace(0,1,100) 98 | 99 | plt.figure(figsize=(5,5)) 100 | 101 | for i in range(len(fpr_list)): 102 | mean_tpr += np.interp(mean_fpr, fpr_list[i], tpr_list[i]) 103 | mean_tpr[0] = 0.0 104 | plt.plot(fpr_list[i], tpr_list[i], lw=1, label='ROC fold %d (area = %0.2f)' % (i, auc_list[i])) 105 | 106 | plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck') 107 | 108 | mean_tpr /= len(fpr_list) 109 | mean_tpr[-1] = 1.0 110 | mean_auc = auc(mean_fpr, mean_tpr) 111 | plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2) 112 | 113 | plt.xlim([-0.05, 1.05]) 114 | plt.ylim([-0.05, 1.05]) 115 | plt.xlabel('False Positive Rate') 116 | plt.ylabel('True Positive Rate') 117 | plt.title('') 118 | plt.legend(loc="lower right") 119 | plt.show() 120 | 121 | return mean_auc, mean_fpr, mean_tpr 122 | 123 | def getBinaryAccuracy(pred,true_labels): 124 | assert len(pred)==len(true_labels) 125 | 126 | correct_labels = [1 for i in range(len(pred)) if pred[i]==true_labels[i]] 127 | try: 128 | return len(correct_labels)/float(len(pred)) 129 | except: 130 | return np.nan 131 | 132 | def getBaseline(Y): 133 | if type(Y) != list: 134 | Y = Y.tolist() 135 | percentTrue = float(Y.count(1.0)) / float(len(Y)) 136 | if percentTrue < 0.5: 137 | return 1.0 - percentTrue 138 | else: 139 | return percentTrue 140 | 141 | def getTaskListFileCoreName(file_prefix): 142 | dash_loc = file_prefix.find('-') 143 | return file_prefix[dash_loc:-1] 144 | 145 | def loadPickledTaskList(datasets_path, file_prefix, dataset, reshape=False, fix_y=False): 146 | task_list = pickle.load(open(datasets_path + file_prefix + dataset + ".p","rb")) 147 | 148 | task_list = fixTaskListFile(task_list) 149 | 150 | if reshape: 151 | for i in range(len(task_list)): 152 | if task_list[i]["Y"] is not None: 153 | task_list[i]["Y"] = task_list[i]["Y"].reshape(-1,1) 154 | 155 | if fix_y: 156 | for t in range(len(task_list)): 157 | task_list[t]["Y"] = 2*task_list[t]["Y"]-1 158 | 159 | return 
task_list
160 | 
161 | 
162 | def fixTaskListFile(task_list,debug=False):
163 |     num_feats = calculateNumFeatsInTaskList(task_list)
164 |     for i in range(len(task_list)):
165 |         if task_list[i]["Y"] is None:
166 |             if debug: print("Y for task", task_list[i]['Name'],
167 |                             "is None, fixing")
168 |             task_list[i]['Y'] = np.zeros((0))
169 |         if task_list[i]['X'] is None:
170 |             if debug: print("X for task", task_list[i]['Name'],
171 |                             "is None, fixing")
172 |             task_list[i]['X'] = np.zeros((0,num_feats))
173 |     return task_list
174 | 
175 | 
176 | def loadCrossValData(datasets_path, file_prefix, fold, reshape=True, fix_y=False):
177 |     save_prefix = getTaskListFileCoreName(file_prefix)
178 | 
179 |     train_tasks = loadPickledTaskList(datasets_path, "CVFold" + str(fold) + save_prefix, "Train", reshape=reshape, fix_y=fix_y)
180 |     val_tasks = loadPickledTaskList(datasets_path, "CVFold" + str(fold) + save_prefix, "Val", reshape=reshape, fix_y=fix_y)
181 | 
182 |     return train_tasks, val_tasks
183 | 
184 | def generateCrossValPickleFiles(datasets_path, file_prefix, num_cross_folds):
185 |     save_prefix = getTaskListFileCoreName(file_prefix)
186 | 
187 |     if os.path.exists(datasets_path + "CVFold0" + save_prefix + "Train.p"):
188 |         print("\nCross validation folds have already been created")
189 |         return
190 | 
191 |     train_tasks = pickle.load(open(datasets_path + file_prefix + "Train.p","rb"))
192 |     val_tasks = pickle.load(open(datasets_path + file_prefix + "Val.p","rb"))
193 | 
194 |     print("\nGenerating cross validation sets")
195 |     new_train_tasks = [0] * (num_cross_folds+1) # the extra final slot holds the full training set
196 |     new_val_tasks = [0] * num_cross_folds
197 |     for f in range(num_cross_folds):
198 |         new_train_tasks[f] = copy.deepcopy(train_tasks)
199 |         new_val_tasks[f] = copy.deepcopy(val_tasks)
200 |     new_train_tasks[num_cross_folds] = copy.deepcopy(train_tasks)
201 | 
202 |     n_tasks = len(train_tasks)
203 |     for t in range(n_tasks):
204 |         crossVal_X, crossVal_y = generateCrossValSet(train_tasks[t]['X'], train_tasks[t]['Y'], val_tasks[t]['X'], val_tasks[t]['Y'], num_cross_folds, verbose=False)
205 | 
206 |         for f in range(num_cross_folds):
207 |             train_X, train_Y, val_X, val_Y = getTrainAndValDataForCrossValFold(crossVal_X, crossVal_y, f)
208 |             new_train_tasks[f][t]['X'] = train_X
209 |             new_train_tasks[f][t]['Y'] = train_Y
210 |             new_val_tasks[f][t]['X'] = val_X
211 |             new_val_tasks[f][t]['Y'] = val_Y
212 | 
213 |         new_train_tasks[num_cross_folds][t]['X'],new_train_tasks[num_cross_folds][t]['Y'] = getFullTrain(crossVal_X, crossVal_y)
214 | 
215 |     for f in range(num_cross_folds):
216 |         pickle.dump(new_train_tasks[f], open(datasets_path + "CVFold" + str(f) + save_prefix + "Train.p","wb"))
217 |         pickle.dump(new_val_tasks[f], open(datasets_path + "CVFold" + str(f) + save_prefix + "Val.p","wb"))
218 |     pickle.dump(new_train_tasks[num_cross_folds], open(datasets_path + "CVFullTrain" + save_prefix + ".p","wb"))
219 | 
220 | 
221 | def addKeepIndicesToCrossValPickleFiles(datasets_path, file_prefix, num_cross_folds, keep_percent):
222 |     save_prefix = getTaskListFileCoreName(file_prefix)
223 | 
224 |     for f in range(num_cross_folds):
225 |         task_dict_list = pickle.load(open(datasets_path + "CVFold" + str(f) + save_prefix + "Train.p","rb"))
226 |         for t in range(len(task_dict_list)):
227 |             if 'KeepIndices' not in task_dict_list[t] or task_dict_list[t]['KeepIndices'] is None:
228 |                 n = len(task_dict_list[t]['X'])
229 |                 keep_indices = np.random.choice(n, int(n*keep_percent), replace=False) # size argument must be an int
230 |                 task_dict_list[t]['KeepIndices'] = keep_indices
231 |         pickle.dump(task_dict_list, open(datasets_path + 
"CVFold" + str(f) + save_prefix + "Train.p","wb")) 232 | 233 | def getTrainAndValDataForCrossValFold(crossVal_X, crossVal_y, fold, only_train=False): 234 | num_folds = len(crossVal_X) 235 | if fold >= num_folds: 236 | if only_train: 237 | return None, None 238 | else: 239 | return None, None, None, None 240 | 241 | train_folds_X = [crossVal_X[x] for x in range(num_folds) if x != fold] 242 | train_folds_Y = [crossVal_y[x] for x in range(num_folds) if x != fold] 243 | 244 | train_X = train_folds_X[0] 245 | train_Y = train_folds_Y[0] 246 | for i in range(1,len(train_folds_X)): 247 | train_X = np.concatenate((train_X,train_folds_X[i])) 248 | train_Y = np.concatenate((train_Y,train_folds_Y[i])) 249 | 250 | val_X = crossVal_X[fold] 251 | val_Y = crossVal_y[fold] 252 | return train_X, train_Y, val_X, val_Y 253 | 254 | def containsEachLabelType(labels): 255 | ''' Checks if a set of labels contains all labels types (-1, 0, 1)''' 256 | return 1 in labels and 0 in labels 257 | 258 | def containsEachSVMLabelType(labels): 259 | return -1 in labels and 1 in labels 260 | 261 | def getFullTrain(crossVal_X, crossVal_y): 262 | full_X = crossVal_X[0] 263 | full_Y = crossVal_y[0] 264 | for i in range(1,len(crossVal_X)): 265 | full_X = np.concatenate((full_X,crossVal_X[i])) 266 | full_Y = np.concatenate((full_Y,crossVal_y[i])) 267 | return full_X, full_Y 268 | 269 | def getFriendlyLabelName(col): 270 | if col is None: 271 | return "" 272 | if type(col) != str: 273 | return str(col) 274 | 275 | name = "" 276 | if 'Happiness' in col: 277 | name ='Happiness' 278 | elif 'Calmness' in col: 279 | name = 'Calmness' 280 | elif 'Health' in col: 281 | name = 'Health' 282 | if 'Morning' in col: 283 | name = 'Morning-' + name 284 | if 'tomorrow' in col: 285 | name = 'tomorrow-' + name 286 | elif 'yesterday' in col: 287 | name = 'yesterday-' + name 288 | 289 | return name 290 | 291 | def getOfficialLabelName(string): 292 | type_mod = 'Group' 293 | if 'Personal' in string: 294 | type_mod = 'Personal' 295 | 296 | if 'Happiness' in string: 297 | return 'tomorrow_'+type_mod+'_Happiness_Evening_Label' 298 | elif 'Calmness' in string: 299 | return 'tomorrow_'+type_mod+'_Calmness_Evening_Label' 300 | elif 'Health' in string: 301 | return 'tomorrow_'+type_mod+'_Health_Evening_Label' 302 | else: 303 | print("Error! 
Could not determine official label name") 304 | return None 305 | 306 | def getMinutesFromMidnight(df, feature): 307 | time_deltas = pd.to_datetime(df[feature]) - pd.to_datetime(df['timestamp']) 308 | mins = [time / pd.Timedelta('1 minute') for time in time_deltas] 309 | return [time if not pd.isnull(time) else np.nan for time in mins] 310 | 311 | def mergeDataframes(all_df, mod_df, mod_name, merge_type='inner',merge_keys=['user_id','timestamp']): 312 | print("Merging", mod_name) 313 | old_len = len(all_df) 314 | print("\tMerged df started with", old_len, "samples") 315 | print("\t", mod_name, "has", len(mod_df), "samples") 316 | all_df = pd.merge(all_df, mod_df, how=merge_type, on=merge_keys) 317 | print("\tMerged df now has", len(all_df), "samples") 318 | print(mod_name, "is missing at least", old_len - len(all_df), "samples") 319 | 320 | return all_df 321 | 322 | def renameAllColsWithPrefix(df,prefix,remove_len=0): 323 | for feat in df.columns.values: 324 | if feat != 'user_id' and feat != 'timestamp': 325 | df = df.rename(columns={feat:prefix+feat[remove_len:]}) 326 | return df 327 | 328 | def normalizeColumns(df, wanted_feats): 329 | train_df = df[df['dataset']=='Train'] 330 | for feat in wanted_feats: 331 | train_mean = np.mean(train_df[feat].dropna().tolist()) 332 | train_std = np.std(train_df[feat].dropna().tolist()) 333 | zscore = lambda x: (x - train_mean) / train_std 334 | df[feat] = df[feat].apply(zscore) 335 | return df 336 | 337 | def findNullColumns(df, features): 338 | df_len = len(df) 339 | bad_feats = [] 340 | for feat in features: 341 | null_len = len(df[df[feat].isnull()]) 342 | if df_len == null_len: 343 | bad_feats.append(feat) 344 | return bad_feats 345 | 346 | def removeNullCols(df, features): 347 | '''Must check if a column is completely null in any of the datasets. Then it will remove it''' 348 | train_df = df[df['dataset']=='Train'] 349 | test_df = df[df['dataset']=='Test'] 350 | val_df = df[df['dataset']=='Val'] 351 | 352 | null_cols = findNullColumns(train_df,features) 353 | null_cols_test= findNullColumns(test_df,features) 354 | null_cols_val = findNullColumns(val_df,features) 355 | 356 | if len(null_cols) > 0 or len(null_cols_test) > 0 or len(null_cols_val) > 0: 357 | for feat in null_cols_test: 358 | if feat not in null_cols: 359 | null_cols.append(feat) 360 | for feat in null_cols_val: 361 | if feat not in null_cols: 362 | null_cols.append(feat) 363 | print("Found", len(null_cols), 364 | "columns that were completely null. 
Removing", null_cols) 365 | 366 | df = dropCols(df,null_cols) 367 | for col in null_cols: 368 | features.remove(col) 369 | return df, features 370 | 371 | def generateWekaFile(X,Y,features,path,name): 372 | f = open(path + name + '.arff', 'w') 373 | f.write("@relation '" + name + "'\n\n") 374 | 375 | for feat in features: 376 | f.write("@attribute " + feat + " numeric\n") 377 | f.write("@attribute cluster {True,False}\n\n") 378 | 379 | f.write("@data\n\n") 380 | for i in range(X.shape[0]): 381 | for j in range(X.shape[1]): 382 | if np.isnan(X[i,j]): 383 | f.write("?,") 384 | else: 385 | f.write(str(X[i,j]) + ",") 386 | if Y[i] == 1.0 or Y[i] == True: 387 | f.write("True\n") 388 | else: 389 | f.write("False\n") 390 | 391 | f.close() 392 | 393 | def getMatrixData(data_df, wanted_feats, wanted_labels, dataset=None,single_output=False): 394 | if dataset is not None: 395 | set_df = data_df[data_df['dataset']==dataset] 396 | else: 397 | set_df = data_df 398 | 399 | X = set_df[wanted_feats].astype(float).as_matrix() 400 | 401 | if single_output: 402 | y = set_df[wanted_labels[0]].tolist() 403 | else: 404 | y = set_df[wanted_labels].as_matrix() 405 | 406 | return X,y 407 | 408 | def normalizeAndFillDataDf(df, wanted_feats, wanted_labels, suppress_output=False, remove_cols=True): 409 | data_df = normalizeColumns(copy.deepcopy(df), wanted_feats) 410 | if remove_cols: 411 | data_df, wanted_feats = removeNullCols(data_df, wanted_feats) 412 | 413 | if not suppress_output: print("Original data length was", len(data_df)) 414 | data_df = data_df.dropna(subset=wanted_labels, how='any') 415 | if not suppress_output: print( 416 | "After dropping rows with nan in any label column, length is", 417 | len(data_df)) 418 | 419 | data_df = data_df.fillna(NAN_FILL_VALUE) #if dataset is already filled, won't do anything 420 | 421 | return data_df 422 | 423 | def getSvmPartitionDf(data_df, wanted_feats, wanted_labels, dataset='Train'): 424 | set_df = data_df[data_df['dataset']==dataset] 425 | 426 | keep_cols = copy.deepcopy(wanted_feats) 427 | keep_cols.extend(wanted_labels) 428 | set_df = set_df[keep_cols] 429 | 430 | return set_df 431 | 432 | def getTensorFlowMatrixData(data_df, wanted_feats, wanted_labels, dataset='Train',single_output=False): 433 | set_df = data_df[data_df['dataset']==dataset] 434 | 435 | X = set_df[wanted_feats].astype(float).as_matrix() 436 | 437 | if single_output: 438 | y = set_df[wanted_labels[0]].tolist() 439 | else: 440 | y = set_df[wanted_labels].as_matrix() 441 | 442 | X = convertMatrixToTensorFlowFriendlyFormat(X) 443 | y = convertMatrixToTensorFlowFriendlyFormat(y) 444 | 445 | return X,y 446 | 447 | def convertMatrixToTensorFlowFriendlyFormat(X): 448 | X = np.asarray(X) 449 | X = X.astype(np.float32) 450 | return X 451 | 452 | def dropCols(df,cols): 453 | for col in cols: 454 | df = df.drop(col, 1) 455 | return df 456 | 457 | def convertTimestampViaString(row): 458 | return str(row['timestamp']) 459 | 460 | def getMinutesFromMidnight(df, feature): 461 | time_deltas = pd.to_datetime(df[feature]) - pd.to_datetime(df['timestamp']) 462 | mins = [time / pd.Timedelta('1 minute') for time in time_deltas] 463 | return [time if not pd.isnull(time) else np.nan for time in mins] 464 | 465 | def renameAllColsWithPrefix(df,prefix,remove_len=0): 466 | for feat in df.columns.values: 467 | if feat != 'user_id' and feat != 'timestamp': 468 | df = df.rename(columns={feat:prefix+feat[remove_len:]}) 469 | return df 470 | 471 | def combineFilesIntoDf(file_path, filenames, reset_index=False, drop_cols=None): 
472 | df = None 473 | for filename in filenames: 474 | fdf = pd.DataFrame.from_csv(file_path + filename) 475 | 476 | if reset_index: 477 | fdf = fdf.reset_index() 478 | 479 | if df is None: 480 | df = fdf.copy(deep=True) 481 | else: 482 | df = pd.concat([df,fdf]) 483 | 484 | if drop_cols is not None: 485 | for feat in drop_cols: 486 | df = df.drop(feat, 1) 487 | 488 | return df 489 | 490 | def partitionRandomSubset(X, Y, size, replace=False, return_remainder=True): 491 | subset_indices = np.random.choice(len(X), size, replace=replace) 492 | 493 | sub_X = X[subset_indices] 494 | sub_Y = Y[subset_indices] 495 | 496 | if return_remainder: 497 | remainder_indices = [x for x in range(0,len(X)) if x not in subset_indices] 498 | remainder_X = X[remainder_indices] 499 | remainder_Y = Y[remainder_indices] 500 | return sub_X, sub_Y, remainder_X, remainder_Y 501 | else: 502 | return sub_X, sub_Y 503 | 504 | def generateCrossValSet(train_X, train_y, val_X, val_y, num_cross_folds, verbose=True): 505 | if verbose: 506 | print("...generating cross validation folds...") 507 | 508 | fullTrain_X = np.concatenate((train_X,val_X)) 509 | fullTrain_y = np.concatenate((train_y,val_y)) 510 | if len(fullTrain_X) <= 1: 511 | print("LENGTH IS", len(fullTrain_X)) 512 | crossVal_X = [] 513 | crossVal_y = [] 514 | 515 | size = int(len(fullTrain_X) / num_cross_folds) 516 | if size < 1: 517 | size = 1 518 | remainder_X = fullTrain_X 519 | remainder_y = fullTrain_y 520 | for i in range(num_cross_folds-1): 521 | sub_X, sub_y, remainder_X, remainder_y = partitionRandomSubset(remainder_X, remainder_y, size) 522 | crossVal_X.append(sub_X) 523 | crossVal_y.append(sub_y) 524 | if len(remainder_X) == 0: 525 | # Insufficient data to make all folds, returning remaining. 526 | return crossVal_X, crossVal_y 527 | crossVal_X.append(remainder_X) 528 | crossVal_y.append(remainder_y) 529 | 530 | return crossVal_X, crossVal_y 531 | 532 | def discardNans(df,col1,col2): 533 | small_df = df[[col1,col2]] 534 | small_df = small_df.dropna() 535 | x = small_df[col1].tolist() 536 | y = small_df[col2].tolist() 537 | n = len(x) 538 | return x,y,n 539 | 540 | def calcCorrelation(df,col1,col2): 541 | x,y,n = discardNans(df,col1,col2) 542 | return stats.pearsonr(x, y) 543 | 544 | def calculateNumFeatsInTaskList(task_dict_list): 545 | i=0 546 | X = task_dict_list[i]['X'] 547 | while len(X) == 0 and i < len(task_dict_list): 548 | i=i+1 549 | X = task_dict_list[i]['X'] 550 | return np.shape(X)[1] 551 | 552 | def addPredsToPredsDf(df, preds, true, task_name): 553 | assert len(preds) == len(true) 554 | 555 | for i in range(len(preds)): 556 | df = df.append({'task_name':task_name, 'prediction':preds[i], 557 | 'true':true[i]}, ignore_index=True) 558 | 559 | return df 560 | 561 | def fixSettingDictLoadedFromResultsDf(setting_dict): 562 | if 'hidden_layers' in setting_dict.keys(): 563 | if type(setting_dict['hidden_layers']) == str: 564 | setting_dict['hidden_layers'] = ast.literal_eval(setting_dict['hidden_layers']) 565 | 566 | if 'optimizer' in setting_dict.keys(): 567 | if 'GradientDescent' in setting_dict['optimizer']: 568 | setting_dict['optimizer'] = tf.train.GradientDescentOptimizer 569 | elif 'Adagrad' in setting_dict['optimizer']: 570 | setting_dict['optimizer'] = tf.train.AdagradOptimizer 571 | else: 572 | setting_dict['optimizer'] = tf.train.AdamOptimizer 573 | 574 | for setting in ['batch_size','decay_steps']: 575 | if setting in setting_dict.keys(): 576 | setting_dict[setting] = int(setting_dict[setting]) 577 | 578 | return setting_dict 579 | 580 
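# A hedged sketch of what fixSettingDictLoadedFromResultsDf (above) repairs;
# the values are invented for illustration. Settings read back from a results
# csv arrive as strings and floats, and come out as usable Python objects:
#   row = {'hidden_layers': '[50, 10]', 'optimizer': 'AdamOptimizer',
#          'batch_size': 20.0, 'decay_steps': 1000.0}
#   row = fixSettingDictLoadedFromResultsDf(row)
#   # now row['hidden_layers'] == [50, 10], row['batch_size'] == 20, and
#   # row['optimizer'] is tf.train.AdamOptimizer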
| def get_secs_mins_hours_from_secs(total_secs):
581 |     hours = total_secs / 60 / 60
582 |     mins = (total_secs % 3600) / 60
583 |     secs = (total_secs % 3600) % 60
584 | 
585 |     if hours < 1: hours = 0
586 |     if mins < 1: mins = 0
587 | 
588 |     return hours, mins, secs
589 | 
590 | def tf_weight_variable(shape, name):
591 |     """Initializes a tensorflow weight variable with random values
592 |     centered around 0.
593 |     """
594 |     initial = tf.truncated_normal(shape, stddev=1.0 / math.sqrt(float(shape[0])), dtype=tf.float64)
595 |     return tf.Variable(initial, name=name)
596 | 
597 | def tf_bias_variable(shape, name):
598 |     """Initializes a tensorflow bias variable to a small constant value."""
599 |     initial = tf.constant(0.1, shape=shape, dtype=tf.float64)
600 |     return tf.Variable(initial, name=name)
601 | 
602 | def get_test_predictions_for_df_with_task_column(model_predict_func, csv_path, task_column, tasks,
603 |                                                  wanted_label=None, num_feats_expected=None, label_name="",
604 |                                                  tasks_are_ints=True):
605 |     data_df = pd.DataFrame.from_csv(csv_path)
606 | 
607 |     wanted_feats = [x for x in data_df.columns.values if x != 'user_id' and x != 'timestamp' and 'ppt_id' not in x and x!= 'dataset' and '_Label' not in x and 'Cluster' not in x]
608 |     if num_feats_expected is not None and len(wanted_feats) != num_feats_expected:
609 |         print("Error! Found", len(wanted_feats),
610 |               "features but was expecting to find", num_feats_expected)
611 |         return
612 | 
613 |     if wanted_label is not None:
614 |         wanted_labels = [wanted_label]
615 |     else:
616 |         wanted_labels = [x for x in data_df.columns.values if '_Label' in x and 'tomorrow_' in x and 'Evening' in x and 'Alertness' not in x and 'Energy' not in x]
617 | 
618 |     data_df = normalizeAndFillDataDf(data_df, wanted_feats, wanted_labels)
619 | 
620 |     if label_name == "" and wanted_label is not None:
621 |         label_name = getFriendlyLabelName(wanted_label)
622 | 
623 |     for i,task_dict in enumerate(tasks):
624 |         task = task_dict['Name']
625 |         if tasks_are_ints:
626 |             task = int(task)
627 |         task_df = data_df[data_df[task_column]==task]
628 |         X = task_df[wanted_feats].as_matrix()
629 |         preds = model_predict_func(X, i)
630 |         data_df.loc[task_df.index.values,'test_pred_'+label_name] = preds
631 | 
632 |     print("Predictions have been computed and are stored in dataframe.")
633 | 
634 |     if wanted_label is not None and wanted_label in data_df.columns.values:
635 |         test_df = data_df[data_df['dataset']=='Test']
636 |         all_preds = test_df['test_pred_'+label_name].tolist()
637 |         all_true = test_df[wanted_label].tolist()
638 |         print("FINAL METRICS ON TEST SET:",
639 |               computeAllMetricsForPreds(all_preds, all_true))
640 |     else:
641 |         print("Cannot print test results unless wanted_label is set correctly")
642 | 
643 |     return data_df
644 | 
645 | def get_test_predictions_for_df_with_no_task_column(model_predict_func, csv_path, tasks,
646 |                                                     num_feats_expected=None):
647 |     data_df = pd.DataFrame.from_csv(csv_path)
648 | 
649 |     wanted_feats = [x for x in data_df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and '_Label' not in x and 'Cluster' not in x]
650 |     if num_feats_expected is not None and len(wanted_feats) != num_feats_expected:
651 |         print("Error! 
Found", len(wanted_feats), 652 | "features but was expecting to find", num_feats_expected) 653 | return 654 | 655 | for i,task_dict in enumerate(tasks): 656 | wanted_label = task_dict['Name'] 657 | label_name = getFriendlyLabelName(wanted_label) 658 | label_df = normalizeAndFillDataDf(copy.deepcopy(data_df), wanted_feats, [wanted_label]) 659 | 660 | X = label_df[wanted_feats].as_matrix() 661 | preds = model_predict_func(X, i) 662 | data_df.loc[label_df.index.values,'test_pred_'+label_name] = preds 663 | 664 | test_df = data_df[data_df['dataset']=='Test'] 665 | test_df = test_df.dropna(subset=[wanted_label], how='any') 666 | all_preds = test_df['test_pred_'+label_name].tolist() 667 | all_true = test_df[wanted_label].tolist() 668 | print("FINAL METRICS ON TEST SET for label", label_name, ":", 669 | computeAllMetricsForPreds(all_preds, all_true)) 670 | 671 | print("Predictions have been computed and are stored in dataframe.") 672 | 673 | return data_df 674 | -------------------------------------------------------------------------------- /jobs_to_run.txt: -------------------------------------------------------------------------------- 1 | NN job of some type - happiness 2 | python NeuralNetworks/tensorFlowWrapper.py Path/task_list_file-Happiness_ multitask wellbeing 3 | ../outputs/some_result_for_happiness.txt 4 | 5 | NN job of some type - calmness 6 | python NeuralNetworks/tensorFlowWrapper.py Path/task_list_file-Calmness_ multitask wellbeing 7 | ../outputs/some_result_for_calmness.txt -------------------------------------------------------------------------------- /make_datasets.py: -------------------------------------------------------------------------------- 1 | """ This file contains functions for converting a .csv dataset into the 2 | 'task dict list' format used by the rest of the code. The .csv file must 3 | have a particular format, with columns like 'user_id', and outcome columns 4 | containing '_Label'. For an example, see the file 'example_data.csv'. 5 | 6 | How to partition tasks: 7 | 'users-as-tasks': The .csv file will be partioned such that predicting 8 | the outcome of each user is one task. 9 | 'labels-as-tasks': The .csv file will be partitioned such that 10 | predicting related outcomes is each task (e.g. 
predicting stress
11 |         is one task and predicting happiness is another)
12 | """
13 | 
14 | import numpy as np
15 | import pandas as pd
16 | import sklearn as sk
17 | import sys
18 | import os
19 | import pickle
20 | import random
21 | import time
22 | import copy
23 | import argparse
24 | import helperFuncs as helper
25 | from sklearn.cross_validation import StratifiedShuffleSplit
26 | 
27 | CODE_PATH = os.path.dirname(os.getcwd())
28 | sys.path.append(CODE_PATH)
29 | 
30 | parser = argparse.ArgumentParser()
31 | parser.add_argument('--datafile', type=str, default='/Your/path/here/')
32 | parser.add_argument('--task_type', type=str, default='users',
33 |                     help="How to partition related tasks; can be 'users' so "
34 |                          "that predicting the outcome for each user is its own "
35 |                          "task, or 'labels', so that predicting related "
36 |                          "outcomes (like stress, happiness, etc) are their "
37 |                          "own tasks.")
38 | parser.add_argument('--target_label', type=str,
39 |                     default='tomorrow_Happiness_Evening_Label',
40 |                     help="Outcome label to predict for each user in "
41 |                          "users-as-tasks")
42 | parser.add_argument('--group_users_on', type=str,
43 |                     default='user_id',
44 |                     help="Name of column that indicates user or cluster ID "
45 |                          "for partitioning users into tasks.")
46 | 
47 | def getDatasetCoreNameAndPath(datafile):
48 |     core_name = os.path.basename(datafile)
49 |     core_name = os.path.splitext(core_name)[0]
50 |     path = os.path.splitext(datafile)[0].replace(core_name, '')
51 |     return core_name, path
52 | 
53 | def getLabelTaskListFromDataset(datafile, subdivide_phys=True):
54 |     """Partitions a .csv file into a task-dict-list pickle file by separating
55 |     related labels into the different tasks."""
56 |     df = pd.DataFrame.from_csv(datafile)
57 |     wanted_labels = [x for x in df.columns.values if '_Label' in x and 'tomorrow_' in x and 'Evening' in x and 'Alertness' not in x and 'Energy' not in x]
58 |     wanted_feats = [x for x in df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and x!='Cluster' and '_Label' not in x]
59 | 
60 |     core_name, data_path = getDatasetCoreNameAndPath(datafile)
61 | 
62 |     modality_dict = getModalityDict(wanted_feats, subdivide_phys=subdivide_phys)
63 | 
64 |     for dataset in ['Train','Val','Test']:
65 |         task_dict_list = []
66 |         for target_label in wanted_labels:
67 |             mini_df = helper.normalizeAndFillDataDf(df, wanted_feats, [target_label], suppress_output=True)
68 |             mini_df = mini_df.reindex(np.random.permutation(mini_df.index)) # reindex returns a copy; without the assignment the shuffle is lost
69 | 
70 |             X,y = helper.getTensorFlowMatrixData(mini_df, wanted_feats, [target_label], dataset=dataset, single_output=True)
71 |             task_dict = dict()
72 |             task_dict['X'] = X
73 |             task_dict['Y'] = y
74 |             task_dict['Name'] = target_label
75 |             task_dict['ModalityDict'] = modality_dict
76 |             task_dict_list.append(task_dict)
77 |         pickle.dump(task_dict_list, open(data_path + "datasetTaskList-" + core_name + "_" + dataset + ".p","wb"))
78 | 
79 | def getModalityDict(wanted_feats, subdivide_phys=False):
80 |     modalities = list(set([getFeatPrefix(x, subdivide_phys=subdivide_phys) for x in wanted_feats]))
81 |     mod_dict = dict()
82 |     for modality in modalities:
83 |         mod_dict[modality] = getStartIndex(wanted_feats, modality)
84 |     return mod_dict
85 | 
86 | def getStartIndex(wanted_feats, modality):
87 |     for i,s in enumerate(wanted_feats):
88 |         if modality[0:4] == 'phys' and 'H' in modality and modality != 'physTemp':
89 |             if modality + ':' in s:
90 |                 return i
91 |         else:
92 |             if modality + '_' in s:
93 |                 return i
94 | 
95 | def getFeatPrefix(feat_name, 
subdivide_phys=False): 96 | idx = feat_name.find('_') 97 | prefix = feat_name[0:idx] 98 | if not subdivide_phys or prefix != 'phys': 99 | return prefix 100 | else: 101 | idx = feat_name.find(':') 102 | return feat_name[0:idx] 103 | 104 | def getUserTaskListFromDataset(datafile, target_label, suppress_output=False, 105 | group_on='user_id', subdivide_phys=False): 106 | """Partitions a .csv file into a task-dict-list pickle file by separating 107 | different individuals (users) into the different tasks.""" 108 | df = pd.DataFrame.from_csv(datafile) 109 | wanted_feats = [x for x in df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and x!='classifier_friendly_ppt_id' and 'Cluster' not in x and '_Label' not in x] 110 | 111 | df = helper.normalizeAndFillDataDf(df, wanted_feats, [target_label], suppress_output=True) 112 | df = df.reindex(np.random.permutation(df.index)) 113 | 114 | dataset_name, datapath = getDatasetCoreNameAndPath(datafile) 115 | label_name = helper.getFriendlyLabelName(target_label) 116 | 117 | modality_dict = getModalityDict(wanted_feats, subdivide_phys=subdivide_phys) 118 | 119 | train_task_dict_list = [] 120 | val_task_dict_list = [] 121 | test_task_dict_list = [] 122 | for user in df[group_on].unique(): 123 | if not suppress_output: 124 | print("Processing task", user) 125 | mini_df = df[df[group_on] == user] 126 | 127 | train_task_dict_list.append(constructTaskDict(user, mini_df, wanted_feats, target_label, modality_dict, 'Train')) 128 | val_task_dict_list.append(constructTaskDict(user, mini_df, wanted_feats, target_label, modality_dict, 'Val')) 129 | test_task_dict_list.append(constructTaskDict(user, mini_df, wanted_feats, target_label, modality_dict, 'Test')) 130 | 131 | if group_on == 'user_id': 132 | dataset_prefix = "datasetUserTaskList-" 133 | elif group_on == 'Cluster': 134 | dataset_prefix = 'datasetClusterTasks-' 135 | else: 136 | dataset_prefix = group_on 137 | pickle.dump(train_task_dict_list, open(datapath + dataset_prefix + dataset_name + "-" + label_name + "_Train.p","wb")) 138 | pickle.dump(val_task_dict_list, open(datapath + dataset_prefix + dataset_name + "-" + label_name + "_Val.p","wb")) 139 | pickle.dump(test_task_dict_list, open(datapath + dataset_prefix + dataset_name + "-" + label_name + "_Test.p","wb")) 140 | 141 | return dataset_prefix + dataset_name + "-" + label_name 142 | 143 | def constructTaskDict(task_name, mini_df, wanted_feats, target_label, modality_dict, dataset): 144 | X,y = helper.getTensorFlowMatrixData(mini_df, wanted_feats, [target_label], dataset=dataset, single_output=True) 145 | task_dict = dict() 146 | task_dict['X'] = X 147 | task_dict['Y'] = y 148 | task_dict['Name'] = task_name 149 | task_dict['ModalityDict'] = modality_dict 150 | return task_dict 151 | 152 | if __name__ == '__main__': 153 | kwargs = vars(parser.parse_args()) 154 | 155 | if kwargs['task_type'] == 'labels': 156 | print("Creating a label task-dict-list dataset where tasks are " 157 | "predicting related outcome labels.") 158 | getLabelTaskListFromDataset(kwargs['datafile']) 159 | else: 160 | print("Creating a user task-dict-list dataset where tasks are " 161 | "predicting the outcome of each different person (user).") 162 | getUserTaskListFromDataset(kwargs['datafile'], 163 | target_label=kwargs['target_label'], 164 | group_on=kwargs['group_users_on']) -------------------------------------------------------------------------------- /mtl_nn_clusters.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mitmedialab/PersonalizedMultitaskLearning/2de7d9485f5ac09264bfa624f16c5b05a5a44ada/mtl_nn_clusters.png
--------------------------------------------------------------------------------
/run_jobs.py:
--------------------------------------------------------------------------------
1 | """ This file allows multiple jobs to be run on a server. After each job, an
2 | email is sent to notify desired people of its completion.
3 | 
4 | Must specify a text job file that contains the names and commands for each
5 | job. Each job has 4 lines, containing:
6 |     1) the name,
7 |     2) the command,
8 |     3) the location of a file where the job output should be saved,
9 |     4) a blank line.
10 | 
11 | An example job file format is as follows:
12 | 
13 | Job1
14 | python job.py path1 arg1
15 | path/output1.txt
16 | 
17 | Job2
18 | python job.py path2 arg2
19 | path/output2.txt
20 | 
21 | Usage: python run_jobs.py jobs.txt
22 | """
23 | 
24 | import os
25 | import sys
26 | import smtplib
27 | import string
28 | from time import time
29 | import helperFuncs as helper
30 | 
31 | DEFAULT_EMAIL_LIST = ['myemail@gmail.com', 'youremail@gmail.com']
32 | SENDING_ADDRESS = 'myemail@gmail.com'
33 | MINIMUM_JOB_SECONDS = 600 # 10 minutes
34 | PRINT_LAST_X_LINES = 300
35 | ERROR = 1
36 | SUCCESS = 0
37 | WARNING = 2
38 | 
39 | 
40 | def reload_files():
41 |     reload(helper)
42 | 
43 | class Job:
44 |     def __init__(self, name, command, output_file):
45 |         self.name = name
46 |         self.command = command
47 |         self.output_file = output_file.rstrip('\n')
48 | 
49 | 
50 | def send_email(subject, text, to_addr_list=DEFAULT_EMAIL_LIST):
51 |     body = string.join(('From: %s' % SENDING_ADDRESS,
52 |                         'To: %s' % to_addr_list,
53 |                         'Subject: %s' % subject,
54 |                         '',
55 |                         text), '\r\n')
56 | 
57 |     try:
58 |         server = smtplib.SMTP('smtp.gmail.com:587') # NOTE: This is the Gmail SMTP port for STARTTLS.
59 |         server.ehlo() # this line was not required in a previous working version
60 |         server.starttls()
61 |         server.login(SENDING_ADDRESS, 'gmail_password')
62 |         server.sendmail(SENDING_ADDRESS, to_addr_list, body)
63 |         server.quit()
64 |         print "Email sent successfully!"
65 |     except:
66 |         print "Email failed to send!"
67 | 
68 | def load_job_file(filename):
69 |     f = open(filename, 'r')
70 |     lines = f.readlines()
71 | 
72 |     jobs = []
73 | 
74 |     i = 0
75 |     while i < len(lines):
76 |         jobname = lines[i]
77 |         command = lines[i+1]
78 |         output_file = lines[i+2]
79 |         job = Job(jobname, command, output_file)
80 |         jobs.append(job)
81 |         i = i+4
82 | 
83 |     return jobs
84 | 
85 | def run_job(job_obj):
86 |     """ Runs a system command for a job, returns whether it
87 |     succeeded and output text to be emailed.
88 | 
89 |     Inputs:
90 |         job_obj: an instance of the Job class
91 | 
92 |     Returns
93 |         A code indicating whether the job was successful, and
94 |         a string containing text about the job and job output to
95 |         be mailed to the user
96 |     """
97 | 
98 |     print "\nRunning job", job_obj.name
99 | 
100 |     if os.path.exists(job_obj.output_file):
101 |         message = "The desired output file " + job_obj.output_file + " already exists." 
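        # This guard is deliberate: rerunning a job would otherwise silently
        # overwrite the saved output of a finished run. Rename or delete the
        # old output file if the job really should run again.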
102 | print "Error!", message 103 | return ERROR, message 104 | 105 | t0 = time() 106 | 107 | # execute the command 108 | stream = os.popen(job_obj.command) 109 | output = stream.read() 110 | 111 | # save output to desired file 112 | of = open(job_obj.output_file, 'w') 113 | of.write(output) 114 | of.close() 115 | 116 | t1 = time() 117 | total_secs = t1 - t0 118 | 119 | hours, mins, secs = helper.get_secs_mins_hours_from_secs(total_secs) 120 | time_str = "Job ended. Total time taken: " + str(int(hours)) + "h " + str(int(mins)) + "m " + str(int(secs)) + "s" 121 | print time_str 122 | 123 | if not os.path.exists(job_obj.output_file): 124 | message = "Job failed to create the desired output file." 125 | print "Error!", message 126 | code = ERROR 127 | elif total_secs < MINIMUM_JOB_SECONDS: 128 | message = "The total time taken for the job was suspiciously short." 129 | print "Warning!", message 130 | code = WARNING 131 | else: 132 | message = "" 133 | print "Job finished successfully!" 134 | code = SUCCESS 135 | 136 | lines = output.split('\n') 137 | tail = "\n".join(lines[-PRINT_LAST_X_LINES:]) 138 | 139 | message += "\n\n" + time_str + "\n\n" 140 | message += "The last " + str(PRINT_LAST_X_LINES) + " lines of job output were:\n\n" 141 | message += tail 142 | 143 | return code, message 144 | 145 | def email_about_job(job_obj, status, output): 146 | if status == ERROR: 147 | title = "Error! Problem with job " + job_obj.name 148 | elif status == SUCCESS: 149 | title = "Success! Job " + job_obj.name + " is finished" 150 | else: 151 | title = "Warning! Job " + job_obj.name + " finished too quickly" 152 | 153 | send_email(title, output) 154 | 155 | def run_jobs(jobfile): 156 | jobs = load_job_file(filename) 157 | 158 | for job in jobs: 159 | status, output = run_job(job) 160 | email_about_job(job, status, output) 161 | 162 | send_email("ALL JOBS FINISHED!!", "Congratulations, all of the jobs in the file " + jobfile + " have finished running.") 163 | 164 | if __name__ == "__main__": 165 | if len(sys.argv) < 1: 166 | print "Error! Usage is python run_jobs.py jobs.txt" 167 | print "See this file's documentation for required format for jobs.txt" 168 | 169 | filename= sys.argv[1] 170 | jobfile=sys.argv[1] 171 | print "Running all jobs in file", jobfile, ". . ." 172 | 173 | run_jobs(jobfile) 174 | --------------------------------------------------------------------------------