├── HBLR
    ├── HBLR.py
    ├── HBLRWrapper.py
    └── HBLR_Distribution.py
├── LSSVM
    ├── LSSVM.py
    └── LSSVMWrapper.py
├── LogisticRegression
    ├── LR.py
    └── LRWrapper.py
├── MTMKL
    ├── MTMKL.py
    └── MTMKLWrapper.py
├── NeuralNetworks
    ├── tensorFlowNetwork.py
    ├── tensorFlowNetworkMultiTask.py
    ├── tensorFlowWrapper.py
    └── tensorFlowWrapperSTL.py
├── README.md
├── __pycache__
    └── helperFuncs.cpython-35.pyc
├── example_data.csv
├── generic_wrapper.py
├── helperFuncs.py
├── jobs_to_run.txt
├── make_datasets.py
├── mtl_nn_clusters.png
└── run_jobs.py

/HBLR/HBLR.py:
--------------------------------------------------------------------------------
1 | ''' Hierarchical Bayesian Logistic Regression (HBLR)
2 | 
3 | This model draws logistic regression weights for each task from a shared
4 | Dirichlet Process (DP) prior. The DP prior induces a clustering of tasks based
5 | on the learned decision boundaries, such that the number of distinct decision
6 | boundaries is equivalent to the number of clusters.
7 | 
8 | A set of matrices stores the degree of membership of each task in each cluster,
9 | and the weights for each cluster.
10 | 
11 | For more information about this method, see:
12 | Xue, Y., Liao, X., Carin, L., & Krishnapuram, B. (2007). Multi-task learning
13 | for classification with dirichlet process priors. Journal of Machine Learning
14 | Research, 8(Jan), 35-63.
15 | '''
16 | 
17 | import matplotlib
18 | matplotlib.use('Agg')
19 | import numpy as np
20 | import pandas as pd
21 | import math
22 | import scipy  #scipy.special.psi is the derivative of the log of the gamma function
23 | import scipy.linalg as la
24 | import scipy.special
25 | import copy
26 | import sys
27 | import matplotlib.pyplot as plt
28 | from sklearn.metrics import roc_auc_score
29 | import HBLR_Distribution
30 | 
31 | 
32 | ACC_LOGGED_EVERY_N_STEPS = 10
33 | 
34 | def plotConvergence(metric, title, save_path=None):
35 |     plt.figure()
36 |     plt.plot(metric, 'o-')
37 |     plt.xlabel('Iteration')
38 |     plt.ylabel(title)
39 |     if save_path is not None:
40 |         plt.savefig(save_path)
41 |         plt.close()
42 |     else:
43 |         plt.show()
44 | 
45 | '''Given a dataset, trains the model'''
46 | class HBLR:
47 | 
48 |     ''' DATA FORMAT: a list of dicts. Each list item is a task, indexed by its number. Each task is a dict,
49 |     containing keys 'X' and 'Y', which are the data matrix and label vector, respectively.
50 |     Note that the X matrix should not contain columns like user_id, timestamp.
51 |     Each X is of size (num points for that task) x (number of features).
52 |     Each Y is of size (num points for that task) x 1 (a column vector).
53 | 
54 |     Conventions:
55 |     - compute functions are for computing internal parameters used in update functions
56 |     - update functions are for computing parameters of the model used for prediction
57 |     '''
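   |     # Usage sketch (hypothetical data; sizes and values are made up, just to
   |     # illustrate the format described above):
   |     #     task_dict = [
   |     #         {'X': np.random.randn(50, 4), 'Y': np.random.randint(0, 2, (50, 1))},
   |     #         {'X': np.random.randn(30, 4), 'Y': np.random.randint(0, 2, (30, 1))},
   |     #     ]
   |     #     model = HBLR(task_dict)
   |     #     model.trainUntilConverged()
   |     #     probs = model.predictProbability(0, task_dict[0]['X'])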
58 |     def __init__(self, task_dict, mu=None, sigma=None, tau10=5e-2, tau20=5e-2, K=None, max_iterations=150,
59 |                  xi_tolerance=1e-2, debug=False, verbose=True):
60 |         self.n_tasks = len(task_dict)
61 |         self.K = self.n_tasks if K is None else K
62 | 
63 |         self.debug = debug
64 |         self.verbose = verbose
65 |         self.task_dict = task_dict
66 |         self.num_feats = np.shape(task_dict[0]['X'])[1]
67 |         # TODO: Should we be checking if every task has the same number of features?
68 | 
69 |         #hyperparameters
70 |         self.mu = mu if mu is not None else np.zeros((1,self.num_feats))
71 |         self.sigma = sigma if sigma is not None else np.eye(self.num_feats) * 10.0
72 |         self.tau10 = tau10
73 |         self.tau20 = tau20
74 | 
75 |         #model parameters
76 |         self.phi = None     # np array shape is n_tasks x K
77 |         self.xi = None      # a list of lists. First index is task number, second is data point within task
78 |         self.theta = None   # a matrix of size K x num_feats
79 |         self.gamma = None   # a list of size K of covariance matrices of size num_feats x num_feats
80 | 
81 |         #internal parameters
82 |         self.small_phi1 = None    # a vector of size K-1 used in computing phi
83 |         self.small_phi2 = None    # a vector of size K-1 used in computing phi
84 |         self.s = None             # a matrix of size n_tasks x K used in computing phi
85 |         self.tau1 = None          # used to compute small phi2
86 |         self.tau2 = None          # used to compute small phi2
87 |         self.task_vectors = None  # used to compute theta, matrix of size n_tasks x num_feats,
88 |                                   # only computed once at the beginning
89 | 
90 |         #store metrics for convergence of parameters
91 |         self.xi_convergence_list = []     #take max of abs(prev - new) over all tasks
92 |         self.phi_convergence_list = []    #take norm of prev matrix - new matrix
93 |         self.s_convergence_list = []      #take norm of prev matrix - new matrix
94 |         self.gamma_convergence_list = []  #take max of abs(prev - new) over all clusters
95 |         self.theta_convergence_list = []  #take norm of prev matrix - new matrix
96 | 
97 |         #
98 |         self.max_iterations = max_iterations
99 |         self.xi_tolerance = xi_tolerance
100 | 
101 |     def setHyperParameters(self, mu, sigma, tau10, tau20):
102 |         self.mu = mu
103 |         self.sigma = sigma
104 |         self.tau10 = tau10
105 |         self.tau20 = tau20
106 | 
107 |     def initializeAllParameters(self):
108 |         self.phi = (1.0 / self.K) * np.ones((self.n_tasks,self.K))
109 |         self.theta = np.tile(self.mu, (self.K, 1))
110 |         self.gamma = [self.sigma for i in range(self.K)]
111 |         self.xi = [[0] * len(self.task_dict[i]['Y']) for i in range(self.n_tasks)]
112 |         self.computeXi()
113 |         self.tau1 = self.tau10
114 |         self.tau2 = self.tau20
115 |         self.computeSmallPhis()
116 |         self.computeTaus()
117 |         self.s = np.zeros((self.n_tasks,self.K))
118 |         self.computeTaskVectors()
119 | 
120 |         self.xi_convergence_list = []
121 |         self.phi_convergence_list = []
122 |         self.s_convergence_list = []
123 |         self.gamma_convergence_list = []
124 |         self.theta_convergence_list = []
125 | 
126 |         if self.debug:
127 |             print "initial phi", self.phi
128 |             print "initial small phi1", self.small_phi1
129 |             print "initial small phi2", self.small_phi2
130 |             print "initial tau1", self.tau1, "tau2", self.tau2
131 | 
132 |     def trainUntilConverged(self):
133 |         self.initializeAllParameters()
134 | 
135 |         i = 0
136 |         while i < self.max_iterations and (i < 2 or self.xi_convergence_list[-1] > self.xi_tolerance):
137 |             if self.debug:
138 |                 print "----------------"
139 |                 print "iteration", i
140 | 
141 |                 plt.imshow(self.phi)
142 |                 plt.show()
143 | 
144 |             prev_xi = copy.deepcopy(self.xi)
145 |             prev_phi = copy.deepcopy(self.phi)
146 |             prev_s = copy.deepcopy(self.s)
147 |             prev_gamma = copy.deepcopy(self.gamma)
148 |             prev_theta = copy.deepcopy(self.theta)
149 | 
150 |             self.updateAllParameters()
151 |             if self.K > 2:
152 |                 restart = self.pruneK()
153 |                 if restart:
154 |                     if self.verbose: print "Restarting now with K=", self.K
155 |                     self.initializeAllParameters()
156 |                     self.updateAllParameters()
157 |                     i = 0
158 |                     continue
159 | 
160 |             if i % ACC_LOGGED_EVERY_N_STEPS == 0:
161 |                 acc = []
162 |                 auc = []
163 |                 for j in range(len(self.task_dict)):
164 |                     preds0 = self.predictBinary(self.task_dict[j]['X'], j)
165 |                     task_Y = self.task_dict[j]['Y']
166 |                     if 0 in task_Y and 1 in task_Y:
167 |                         auc.append(roc_auc_score(task_Y, preds0))
168 |                         acc.append(getBinaryAccuracy(preds0, task_Y))
169 |                     #else:
170 |                     #    print "doesn't have both tasks", j, task_Y
171 |                 if self.verbose:
172 |                     print "Training. Iteration", i
173 |                     if i > 0:
174 |                         print "\tXi convergence", self.xi_convergence_list[-1]
175 |                     print "\tavg training accuracy", np.mean(acc)
176 |                     print "\tavg ROC AUC", np.mean(auc), "\n"
177 | 
178 |             #compute convergence metrics
179 |             if i > 0:
180 |                 self.xi_convergence_list.append(computeMatrixConvergence(flattenListLists(prev_xi), flattenListLists(self.xi)))
181 |                 self.phi_convergence_list.append(computeMatrixConvergence(prev_phi, self.phi))
182 |                 self.s_convergence_list.append(computeMatrixConvergence(prev_s, self.s))
183 |                 self.gamma_convergence_list.append(computeListOfListsConvergence(prev_gamma, self.gamma))
184 |                 self.theta_convergence_list.append(computeMatrixConvergence(prev_theta, self.theta))
185 |                 if self.debug: print "Training. Iteration", i, "- Xi convergence:", self.xi_convergence_list[-1]
186 | 
187 |             i += 1
188 | 
189 |         sys.stdout.flush()
190 | 
191 | 
192 | 
193 |     def updateAllParameters(self):
194 |         self.computeSMatrix()
195 |         self.updatePhi()
196 | 
197 |         self.computeSmallPhis()
198 |         self.computeTaus()
199 |         self.updateGamma()
200 |         self.updateTheta()
201 |         self.computeXi()
202 | 
203 |     def computeTaskVectors(self):
204 |         self.task_vectors = np.zeros((self.n_tasks, self.num_feats))
205 |         for m in range(self.n_tasks):
206 |             task_X = self.task_dict[m]['X']
207 |             task_Y = self.task_dict[m]['Y']
208 |             # Note that transposes are different because we are using different notation than in the paper - specifically we use row vectors where they are using column vectors
209 |             self.task_vectors[m,:] = np.dot((task_Y-0.5).T, task_X)
210 | 
211 |     def pruneK(self):
212 |         num_tasks_in_cluster = self.n_tasks - np.sum(1*(self.phi<1e-16), axis=0)
213 |         for k in range(len(num_tasks_in_cluster))[::-1]:
214 |             if num_tasks_in_cluster[k] == 0:
215 |                 self.K = self.K - 1
216 |                 return True
217 |         return False
218 | 
219 |     def computeSMatrix(self):
220 |         for m in range(self.n_tasks):
221 |             task_X = self.task_dict[m]['X']
222 |             task_Y = self.task_dict[m]['Y']
223 |             task_xi = np.array(self.xi[m])
224 | 
225 |             for k in range(self.K):
226 |                 # Note that transposes are different because we are using different notation than in the paper - specifically we use row vectors where they are using column vectors
227 | 
228 |                 # This does all data points (n) at once
229 |                 inner = np.dot(np.atleast_2d(self.theta[k,:]).T, np.atleast_2d(self.theta[k,:])) + self.gamma[k]
230 |                 diag_entries = np.einsum('ij,ij->i', np.dot(task_X, inner), task_X)
231 |                 s_sum = -rhoFunction(task_xi)*diag_entries
232 | 
233 |                 s_sum += ((task_Y.T - 0.5) * np.dot(np.atleast_2d(self.theta[k,:]), task_X.T))[0,:]
234 |                 s_sum += np.log(sigmoid(task_xi))
235 |                 s_sum += (-0.5)*task_xi
236 |                 s_sum += rhoFunction(task_xi)*(task_xi**2)
237 | 
238 |                 s_sum = np.sum(s_sum)
239 | 
240 |                 if k < self.K-1:
241 |                     s_sum = s_sum + scipy.special.psi(self.small_phi1[k]) \
242 |                             - scipy.special.psi(self.small_phi1[k] + self.small_phi2[k])
243 |                 if k > 0:
244 |                     for i in range(k):
245 |                         s_sum = s_sum + scipy.special.psi(self.small_phi2[i]) \
246 |                                 - scipy.special.psi(self.small_phi1[i] + self.small_phi2[i])
247 | 
248 | 
249 |                 self.s[m,k] = s_sum
250 |         if self.debug: print "s:", self.s
251 | 
252 | 
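   |     # Numeric illustration of the log-sum-exp trick used in updatePhi() below
   |     # (hypothetical values): a row s = [1000, 1001] overflows np.exp directly,
   |     # but subtracting the row max a = 1001 gives np.exp([-1, 0]), and the
   |     # normalized result is the same softmax, [0.269, 0.731].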
253 |     def updatePhi(self):
254 |         a = np.array([np.max(self.s, axis=1)]).T  #as used in logsumexp trick https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
255 |         self.phi = np.exp(self.s - (a + np.log(np.atleast_2d(np.sum(np.exp(self.s - a),axis=1)).T)))
256 |         if self.debug:
257 |             print "phi:", self.phi
258 | 
259 |     def computeSmallPhis(self):
260 |         self.small_phi1 = (1 + np.sum(self.phi,axis=0))[0:-1]
261 |         self.small_phi2 = self.tau1 / self.tau2 + np.array([np.sum(self.phi[:,i:]) for i in range(1,self.K)])
262 |         if self.debug:
263 |             print "small phi1", self.small_phi1
264 |             print "small phi2", self.small_phi2
265 | 
266 |     def computeTaus(self):
267 |         self.tau1 = self.tau10 + self.K - 1
268 |         tau2_sum = 0
269 |         for k in range(self.K-1):
270 |             tau2_sum = tau2_sum + (scipy.special.psi(self.small_phi2[k]) \
271 |                        - scipy.special.psi(self.small_phi1[k] + self.small_phi2[k]))
272 |         self.tau2 = self.tau20 - tau2_sum
273 |         if self.debug: print "tau1", self.tau1, "tau2", self.tau2
274 | 
275 |     def updateGamma(self):
276 |         task_matrices = np.zeros((self.n_tasks, self.num_feats, self.num_feats))
277 |         for m in range(self.n_tasks):
278 |             rho_vector = rhoFunction(np.array(self.xi[m]))
279 |             rho_vector = rho_vector.reshape((1,-1))  # Make rho vector 2D
280 |             task_X = self.task_dict[m]['X']
281 |             # Note that the transposing doesn't exactly match the paper because our data format is slightly different
282 |             rho_matrix = abs(rho_vector) * task_X.T
283 |             task_matrices[m,:,:] = np.dot(rho_matrix, task_X)
284 | 
285 |         for k in range(self.K):
286 |             inner_sum = np.zeros((self.num_feats,self.num_feats))
287 |             for m in range(self.n_tasks):
288 |                 inner_sum = inner_sum + self.phi[m,k] * task_matrices[m,:,:]
289 |             self.gamma[k] = la.inv(la.inv(self.sigma) + 2*inner_sum)
290 |             if self.debug:
291 |                 print "gamma computation {0}".format(k), la.det(la.inv(self.sigma) + 2*inner_sum)
292 | 
293 |     def updateTheta(self):
294 |         for k in range(self.K):
295 |             inner_sum = np.zeros((1,self.num_feats))
296 |             for m in range(self.n_tasks):
297 |                 inner_sum = inner_sum + self.phi[m,k] * np.atleast_2d(self.task_vectors[m,:])
298 |             self.theta[k,:] = (np.dot(self.gamma[k],(np.dot(la.inv(self.sigma),self.mu.T) + inner_sum.T))).T
299 | 
300 |     def computeXi(self):
301 |         for m in range(self.n_tasks):
302 |             task_X = self.task_dict[m]['X']
303 |             for n in range(len(task_X)):
304 |                 inner_sum = 0
305 |                 for k in range(self.K):
306 |                     # Note that transposes are different because we are using different notation than in the paper - specifically we use row vectors where they are using column vectors
307 |                     inner_sum += self.phi[m,k]*np.dot((np.dot(np.atleast_2d(task_X[n,:]),
308 |                                  (np.dot(np.atleast_2d(self.theta[k,:]).T, np.atleast_2d(self.theta[k,:])) + self.gamma[k]))),
309 |                                  np.atleast_2d(task_X[n,:]).T)
310 |                 assert inner_sum >= 0  # This number can't be negative since we are taking the square root
311 | 
312 |                 self.xi[m][n] = np.sqrt(inner_sum[0,0])
313 |                 if self.xi[m][n] == 0:
314 |                     print m, n
315 | 
316 |     def predictBinary(self, X, task):
317 |         preds = self.predictProbability(task, X)
318 |         return [1.0 if p >= 0.5 else 0.0 for p in preds.flatten()]
319 | 
320 |     def predictProbability(self, task, X):
321 |         prob = 0
322 |         for k in range(self.K):
323 |             numerator = np.dot(np.atleast_2d(self.theta[k,:]), X.T)
324 |             diag_entries = np.einsum('ij,ij->i', np.dot(X, self.gamma[k]), X)
325 |             denom = np.sqrt(1.0 + np.pi/8 * diag_entries)
326 |             prob = prob + self.phi[task,k] * sigmoid(numerator / denom)
327 |         return prob
328 | 
329 | 
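   |     # The per-cluster term in predictProbability() above uses the standard
   |     # Gaussian approximation to the logistic-Gaussian integral (MacKay, 1992):
   |     # for w ~ N(theta_k, gamma_k),
   |     #     E[sigmoid(w . x)] ~= sigmoid(theta_k . x / sqrt(1 + (pi/8) * x' gamma_k x)),
   |     # and the task's prediction is the phi[task, :]-weighted mixture over clusters.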
330 |     # Code for Predicting for a new task
331 |     def metropolisHastingsAlgorithm(self, new_task_X, new_task_y, N_sam=1000):
332 |         gauss_weight = (self.tau1/self.tau2)/(self.n_tasks+(self.tau1/self.tau2))
333 |         point_dist_weight = 1.0/(self.n_tasks+(self.tau1/self.tau2))
334 |         point_centers_matrix = self.theta
335 |         point_weights = [sum([phi_m[k] for phi_m in self.phi]) for k in range(len(self.phi[0]))]
336 |         mu_mult = self.mu[0]          # Mu is assumed to be the same for each weight
337 |         sigma_mult = self.sigma[0,0]  # Sigma is assumed to be a scalar times the identity matrix
338 | 
339 |         dist = HBLR_Distribution.MainDistribution(gauss_weight, point_dist_weight, point_centers_matrix, point_weights, mu_mult, sigma_mult)
340 | 
341 |         w_dot_array = [np.atleast_2d(dist.rvs(size=1))]
342 |         for i in range(N_sam-1):
343 |             w_hat = np.atleast_2d(dist.rvs(size=1))
344 |             accept_prob = min(1, self.dataProb(new_task_X,new_task_y,w_hat)/self.dataProb(new_task_X,new_task_y,w_dot_array[-1]))
345 |             if np.random.uniform() < accept_prob:
    |                 w_dot_array.append(w_hat)
    |             else:
    |                 w_dot_array.append(w_dot_array[-1])
    | # ... (lines 346-364, containing the rest of this method, dataProb(), and the
    | # start of predictNewTask(), are missing from this copy of the file)
365 |         predictions = [1.0 if p >= 0.5 else 0.0 for p in predictions]
366 |         return predictions
367 | 
368 | 
369 | # Helper function
370 | def flattenListLists(listLists):
371 |     return np.array([item for sublist in listLists for item in sublist])
372 | 
373 | # mathematical helper functions
374 | def sigmoid(x):
375 |     return 1.0 / (1.0 + np.exp(-x))
376 | 
377 | def rhoFunction(x):
378 |     assert len(np.where(x==0)[0]) == 0  #there should not be any zeros passed to this function
379 | 
380 |     return (0.5 - sigmoid(x)) / (2.0*x)
381 | 
382 | def computeMatrixConvergence(prev, new):
383 |     return la.norm(new-prev)
384 | 
385 | def computeListOfListsConvergence(prev, new):
386 |     assert len(prev) == len(new)
387 | 
388 |     max_diff = 0
389 |     for i in range(len(prev)):
390 |         diff = la.norm(np.array(new[i])-np.array(prev[i]))
391 |         if diff > max_diff:
392 |             max_diff = diff
393 |     return max_diff
394 | 
395 | def getBinaryAccuracy(pred, true_labels):
396 |     assert len(pred) == len(true_labels)
397 | 
398 |     correct_labels = [1 for i in range(len(pred)) if pred[i]==true_labels[i]]
399 | 
400 |     return len(correct_labels)/float(len(pred))
401 | 
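   | # Usage sketch for adapting to a brand-new task (hypothetical; predictNewTask()
   | # is incomplete in this copy, so this only mirrors the sampling scheme above):
   | #     w_samples = model.metropolisHastingsAlgorithm(new_X, new_y, N_sam=1000)
   | #     probs = np.mean([sigmoid(np.dot(np.atleast_2d(w), test_X.T)) for w in w_samples], axis=0)
   | #     labels = [1.0 if p >= 0.5 else 0.0 for p in probs.flatten()]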
--------------------------------------------------------------------------------
/HBLR/HBLRWrapper.py:
--------------------------------------------------------------------------------
1 | """Performs hyperparameter sweep for Hierarchical Bayesian Logistic Regression
2 | (HBLR)"""
3 | 
4 | import matplotlib
5 | matplotlib.use('Agg')
6 | import numpy as np
7 | import pandas as pd
8 | import pickle
9 | import sys
10 | import os
11 | import copy
12 | from time import time
13 | from sklearn.metrics import roc_auc_score
14 | import matplotlib.pyplot as plt
15 | 
16 | CODE_PATH = os.path.dirname(os.getcwd())
17 | sys.path.append(CODE_PATH)
18 | 
19 | DEFAULT_RESULTS_PATH = '/Your/path/here/'
20 | DEFAULT_DATASETS_PATH = '/Your/path/here/'
21 | DEFAULT_FIGURES_PATH = '/Your/path/here/'
22 | 
23 | import HBLR as hblr
24 | import helperFuncs as helper
25 | 
26 | DEFAULT_NUM_CROSS_FOLDS = 5
27 | DEFAULT_MAX_ITERS = 75
28 | SAVE_RESULTS_EVERY_X_TESTS = 1
29 | DEFAULT_VALIDATION_TYPE = 'cross'
30 | 
31 | 
32 | ''' Notes:
33 |     -Parameters to tune: tau10, tau20, mu, sigma
34 |     -ratio between tau10 and tau20 controls the number of clusters. A greater ratio = more clusters
35 |     -successful run was done with tau10 = tau20 = 0.05
36 |     -small sigma might be good. e.g. 0.1*I
37 |     -mu is usually 0. not testing for now
38 |     -set number of clusters:
39 |         -for wellbeing measures as tasks go with default (K=num_tasks)
40 |         -for users as tasks no more than 25
41 | '''
42 | 
43 | def reloadFiles():
44 |     reload(hblr)
45 |     reload(helper)
46 | 
47 | class HBLRWrapper:
48 | 
49 |     def __init__(self, file_prefix, users_as_tasks=False, num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, cont=False,
50 |                  results_path=DEFAULT_RESULTS_PATH, figures_path=DEFAULT_FIGURES_PATH, datasets_path=DEFAULT_DATASETS_PATH,
51 |                  test_run=False, max_iters=DEFAULT_MAX_ITERS, val_type=DEFAULT_VALIDATION_TYPE, optimize_labels=None,
52 |                  test_csv_filename=None):
53 |         self.results_path = results_path
54 |         self.figures_path = figures_path
55 |         self.datasets_path = datasets_path
56 |         self.save_prefix = self.getSavePrefix(file_prefix, replace=cont)
57 |         self.cont = cont
58 |         self.max_iters = max_iters
59 |         self.val_type = val_type
60 |         self.users_as_tasks = users_as_tasks
61 |         self.file_prefix = file_prefix
62 |         if test_csv_filename is not None:
63 |             self.test_csv_filename = self.datasets_path + test_csv_filename
64 |         else:
65 |             self.test_csv_filename = None
66 |         self.test_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Test")
67 |         self.train_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Train")
68 |         if self.val_type != 'cross':
69 |             self.val_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Val")
70 |             self.initializeHBLRModel(self.train_tasks)
71 |         else:
72 |             self.classifier = None
73 | 
74 |         if users_as_tasks:
75 |             self.K = 25
76 |         else:
77 |             self.K = len(self.test_tasks)
78 |         self.n_feats = helper.calculateNumFeatsInTaskList(self.test_tasks)
79 |         self.n_tasks = len(self.test_tasks)
80 | 
81 |         if optimize_labels is None:
82 |             self.optimize_labels = ['tomorrow_Group_Happiness_Evening_Label', 'tomorrow_Group_Health_Evening_Label', 'tomorrow_Group_Calmness_Evening_Label']
83 |         else:
84 |             self.optimize_labels = optimize_labels
85 | 
86 |         #parameters that can be tuned
87 |         self.tau10s = [10, 1, 0.05, 0.01]
88 |         self.tau20s = [1.0, 0.05, 0.01]
89 |         self.sigma_multipliers = [.01, 0.1, 1]
90 |         self.mu_multipliers = [0.0]
91 | 
92 |         if test_run:
93 |             print "This is only a testing run. Using cheap settings to make it faster"
94 |             self.K = 2
95 |             self.max_iters = 5
96 |             self.n_tasks = 2
97 |             self.tau10s = [1]
98 |             self.tau20s = [.1]
99 |             self.sigma_multipliers = [.01]
100 |             self.mu_multipliers = [0]
101 | 
102 |         self.calcNumSettingsDesired()
103 | 
104 |         #storing the results
105 |         self.time_sum = 0
106 |         if cont:
107 |             self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.save_prefix + '.csv')
108 |             print '\nPrevious validation results df loaded. It has', len(self.val_results_df), "rows"
109 |             self.started_from = len(self.val_results_df)
110 |         else:
111 |             self.val_results_df = pd.DataFrame()
112 |             self.started_from = 0
113 | 
114 |         self.num_cross_folds = num_cross_folds
115 |         if self.val_type == 'cross':
116 |             helper.generateCrossValPickleFiles(self.datasets_path, self.file_prefix, self.num_cross_folds)
117 | 
118 | 
119 |     def initializeHBLRModel(self, train_tasks):
120 |         self.classifier = hblr.HBLR(train_tasks, K=self.K, debug=False, max_iterations=self.max_iters, verbose=False)
121 | 
122 |     def getSavePrefix(self, file_prefix, replace=False):
123 |         dash_loc = file_prefix.find('-')
124 |         prefix = "hblr" + file_prefix[dash_loc:-1]
125 |         if not replace:
126 |             while os.path.exists(self.results_path + prefix + '.csv'):
127 |                 prefix = prefix + '2'
128 |         return prefix
129 | 
130 |     def calcNumSettingsDesired(self):
131 |         self.num_settings = len(self.tau10s) * len(self.tau20s) * len(self.mu_multipliers) \
132 |                             * len(self.sigma_multipliers)
133 | 
134 |     # use something like the following to test only one set of parameters:
135 |     # wrapper.setParams(tau10s=[.05], tau20s=[.05], sigma_multipliers=[.1,.01])
136 |     def setParams(self, tau10s=None, tau20s=None, sigma_multipliers=None, mu_multipliers=None):
137 |         '''does not override existing parameter settings if the parameter is not set'''
138 |         self.tau10s = tau10s if tau10s is not None else self.tau10s
139 |         self.tau20s = tau20s if tau20s is not None else self.tau20s
140 |         self.sigma_multipliers = sigma_multipliers if sigma_multipliers is not None else self.sigma_multipliers
141 |         self.mu_multipliers = mu_multipliers if mu_multipliers is not None else self.mu_multipliers
142 | 
143 |     def settingAlreadyDone(self, tau10, tau20, sigma_mult, mu_mult):
144 |         if len(self.val_results_df[(self.val_results_df['tau10'] == tau10) & \
145 |                                    (self.val_results_df['tau20'] == tau20) & \
146 |                                    (self.val_results_df['sigma_multiplier'] == sigma_mult) & \
147 |                                    (self.val_results_df['mu_multiplier'] == mu_mult)]) > 0:
148 |             print "setting already tested"
149 |             return True
150 |         else:
151 |             return False
152 | 
153 |     def setClassifierToSetting(self, tau10, tau20, sigma_mult, mu_mult):
154 |         sigma = sigma_mult * np.eye(self.n_feats)
155 |         mu = mu_mult * np.ones((1,self.n_feats))
156 | 
157 |         self.classifier.setHyperParameters(mu, sigma, tau10, tau20)
158 | 
159 |     def getAccuracyAucOnAllTasks(self, task_list):
160 |         all_task_Y = []
161 |         all_preds = []
162 |         for i in range(len(task_list)):
163 |             preds, task_Y = self.getPredsTrueOnOneTask(task_list, i)
164 |             if preds is None:
165 |                 # Skipping task because it does not have valid data
166 |                 continue
167 |             if len(task_Y) > 0:
168 |                 all_task_Y.extend(task_Y)
169 |                 all_preds.extend(preds)
170 |         if not helper.containsEachLabelType(all_preds):
171 |             print "for some bizarre reason, the preds for all tasks are the same class"
172 |             print "preds", all_preds
173 |             print "true_y", all_task_Y
174 |             auc = np.nan
175 |         else:
176 |             auc = roc_auc_score(all_task_Y, all_preds)
177 |         acc = hblr.getBinaryAccuracy(all_preds, all_task_Y)
178 |         return acc, auc
179 | 
180 |     def getPredsTrueOnOneTask(self, task_list, task):
181 |         if not helper.isValidTask(task_list, task):
182 |             return None, None
183 |         task_Y = list(task_list[task]["Y"])
184 |         return self.classifier.predictBinary(task_list[task]['X'], task), task_Y
185 | 
186 |     def getAccuracyAucOnOneTask(self, task_list, task):
187 |         preds, task_Y = self.getPredsTrueOnOneTask(task_list, task)
188 |         if preds is None:
189 |             # Returning nan for task because it does not have valid data
190 |             return np.nan, np.nan
191 |         acc = hblr.getBinaryAccuracy(preds, task_Y)
192 |         if len(task_Y) <= 1 or not helper.containsEachLabelType(preds):
193 |             auc = np.nan
194 |         else:
195 |             auc = roc_auc_score(task_Y, preds)
196 |         return acc, auc
197 | 
198 |     def getValidationResults(self, results_dict):
199 |         self.classifier.trainUntilConverged()
200 |         results_dict['num_clusters'] = self.classifier.K
201 | 
202 |         if self.users_as_tasks:
203 |             val_acc, val_auc = self.getAccuracyAucOnAllTasks(self.val_tasks)
204 |             results_dict['val_acc'] = val_acc
205 |             results_dict['val_auc'] = val_auc
206 |         else:
207 |             accs = []
208 |             aucs = []
209 |             for t in range(self.n_tasks):
210 |                 acc, auc = self.getAccuracyAucOnOneTask(self.val_tasks, t)
211 |                 task_name = self.val_tasks[t]['Name']
212 |                 results_dict['TaskAcc-' + helper.getFriendlyLabelName(task_name)] = acc
213 |                 results_dict['TaskAuc-' + helper.getFriendlyLabelName(task_name)] = auc
214 |                 if task_name in self.optimize_labels:
215 |                     accs.append(acc)
216 |                     aucs.append(auc)
217 |             results_dict['val_acc'] = np.nanmean(accs)
218 |             results_dict['val_auc'] = np.nanmean(aucs)
219 |         return results_dict
220 | 
221 |     def getCrossValidationResults(self, results_dict, tau10, tau20, sigma_mult, mu_mult, save_plots=False, print_per_fold=False):
222 |         if save_plots:
223 |             same_task_matrix = np.zeros((self.n_tasks,self.n_tasks))
224 | 
225 |         clusters = [0] * self.num_cross_folds
226 | 
227 |         all_acc = []
228 |         all_auc = []
229 |         all_f1 = []
230 |         all_precision = []
231 |         all_recall = []
232 |         if not self.users_as_tasks:
233 |             per_task_accs = [[] for i in range(self.n_tasks)]
234 |             per_task_aucs = [[] for i in range(self.n_tasks)]
235 |             per_task_f1 = [[] for i in range(self.n_tasks)]
236 |             per_task_precision = [[] for i in range(self.n_tasks)]
237 |             per_task_recall = [[] for i in range(self.n_tasks)]
238 | 
239 |         for f in range(self.num_cross_folds):
240 |             train_tasks, val_tasks = helper.loadCrossValData(self.datasets_path, self.file_prefix, f, reshape=True)
241 | 
242 |             self.initializeHBLRModel(train_tasks)
243 |             self.setClassifierToSetting(tau10, tau20, sigma_mult, mu_mult)
244 |             self.classifier.trainUntilConverged()
245 | 
246 |             clusters[f] = self.classifier.K
247 | 
248 |             if save_plots: same_task_matrix = self.updateSameTaskMatrix(same_task_matrix)
249 | 
250 |             # Get results!
251 |             fold_preds = []
252 |             fold_true_y = []
253 |             for t in range(self.n_tasks):
254 |                 preds = self.classifier.predictBinary(val_tasks[t]['X'], t)
255 |                 true_y = list(val_tasks[t]['Y'].flatten())
256 | 
257 |                 if len(preds) == 0 or len(true_y) == 0:
258 |                     continue
259 | 
260 |                 if not self.users_as_tasks:
261 |                     # save the per-task results
262 |                     t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y)
263 |                     per_task_accs[t].append(t_acc)
264 |                     per_task_aucs[t].append(t_auc)
265 |                     per_task_f1[t].append(t_f1)
266 |                     per_task_precision[t].append(t_precision)
267 |                     per_task_recall[t].append(t_recall)
268 |                     if print_per_fold: print "Fold", f, "Task", val_tasks[t]['Name'], "acc", t_acc, "auc", t_auc, "f1", t_f1, "precision", t_precision, "recall", t_recall
269 | 
270 |                 fold_preds.extend(preds)
271 |                 fold_true_y.extend(true_y)
272 | 
273 | 
274 |             acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(fold_preds, fold_true_y)
275 |             all_acc.append(acc)
276 |             all_auc.append(auc)
277 |             all_f1.append(f1)
278 |             all_precision.append(precision)
279 |             all_recall.append(recall)
280 |             if print_per_fold: print "Fold", f, "acc", acc, "auc", auc, "f1", f1, "precision", precision, "recall", recall
281 | 
282 |         print "accs for all folds", all_acc
283 |         print "aucs for all folds", all_auc
284 |         print "clusters for all folds", clusters
285 | 
286 |         if save_plots:
287 |             self.plotAccuracyAucAndClusters(all_acc, all_auc, clusters)
288 |             self.saveHintonPlot(same_task_matrix, self.num_cross_folds)
289 |             pd.DataFrame(same_task_matrix).to_csv(self.results_path + self.save_prefix + "-same_task_matrix.csv")
290 | 
291 |         # Add results to the dictionary
292 |         results_dict['val_acc'] = np.nanmean(all_acc)
293 |         results_dict['val_auc'] = np.nanmean(all_auc)
294 |         results_dict['val_f1'] = np.nanmean(all_f1)
295 |         results_dict['val_precision'] = np.nanmean(all_precision)
296 |         results_dict['val_recall'] = np.nanmean(all_recall)
297 |         results_dict['num_clusters'] = np.nanmean(clusters)
298 | 
299 |         # Add per-task results to the dictionary
300 |         if not self.users_as_tasks:
301 |             for t in range(self.n_tasks):
302 |                 task_name = val_tasks[t]['Name']
303 |                 results_dict['TaskAcc-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_accs[t])
304 |                 results_dict['TaskAuc-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_aucs[t])
305 |                 results_dict['TaskF1-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_f1[t])
306 |                 results_dict['TaskPrecision-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_precision[t])
307 |                 results_dict['TaskRecall-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_recall[t])
308 | 
309 |         return results_dict
310 | 
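   |     # Note on aggregation: the 'val_*' entries above are fold-level scores
   |     # (predictions pooled across all tasks within a fold) averaged over folds,
   |     # while the per-task entries are macro-averages of each task's own per-fold scores.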
"seconds to obtain this result" 332 | 333 | self.time_sum = self.time_sum + this_time 334 | 335 | self.printTimeEstimate() 336 | sys.stdout.flush() 337 | 338 | #output the file every few iterations for safekeeping 339 | if len(self.val_results_df) % SAVE_RESULTS_EVERY_X_TESTS == 0: 340 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 341 | 342 | def printTimeEstimate(self): 343 | num_done = len(self.val_results_df)-self.started_from 344 | num_remaining = self.num_settings - num_done - self.started_from 345 | avg_time = self.time_sum / num_done 346 | total_secs_remaining = int(avg_time * num_remaining) 347 | hours = total_secs_remaining / 60 / 60 348 | mins = (total_secs_remaining % 3600) / 60 349 | secs = (total_secs_remaining % 3600) % 60 350 | 351 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 352 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 353 | 354 | def sweepAllParameters(self): 355 | print "\nSweeping all parameters!" 356 | 357 | self.calcNumSettingsDesired() 358 | print "\nYou have chosen to test a total of", self.num_settings, "settings" 359 | sys.stdout.flush() 360 | 361 | #sweep all possible combinations of parameters 362 | for tau10 in self.tau10s: 363 | for tau20 in self.tau20s: 364 | for sigma_mult in self.sigma_multipliers: 365 | for mu_mult in self.mu_multipliers: 366 | self.testOneSetting(tau10, tau20, sigma_mult, mu_mult) 367 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 368 | 369 | def findBestSetting(self, save_final_results=False): 370 | accuracies = self.val_results_df['val_acc'].tolist() 371 | max_acc = max(accuracies) 372 | max_idx = accuracies.index(max_acc) 373 | 374 | print "BEST SETTING!" 375 | print "The highest validation accuracy of", max_acc, "was found with the following settings:" 376 | print self.val_results_df.iloc[max_idx] 377 | 378 | if self.test_csv_filename is not None or save_final_results: 379 | self.getFinalResultsAndSave(self.val_results_df.iloc[max_idx]) 380 | else: 381 | print "Not running Final results" 382 | return self.val_results_df.iloc[max_idx] 383 | 384 | def run(self): 385 | self.sweepAllParameters() 386 | return self.findBestSetting() 387 | 388 | def getFinalResultsAndSave(self, setting_dict): 389 | if self.val_type == 'cross': 390 | print "\nPlotting cross-validation results for best settings..." 391 | self.getCrossValidationResults(dict(), setting_dict['tau10'], setting_dict['tau20'], 392 | setting_dict['sigma_multiplier'], setting_dict['mu_multiplier'], 393 | save_plots=True) 394 | 395 | 396 | print "\nRetraining on training data with the best settings..." 397 | self.initializeHBLRModel(self.train_tasks) 398 | self.classifier.verbose = True 399 | self.setClassifierToSetting(setting_dict['tau10'], setting_dict['tau20'], setting_dict['sigma_multiplier'], setting_dict['mu_multiplier']) 400 | self.classifier.trainUntilConverged() 401 | 402 | print "\nPlotting and saving cool stuff about the final model..." 403 | self.saveImagePlot(self.classifier.phi, 'Phi') 404 | pd.DataFrame(self.classifier.phi).to_csv(self.results_path + self.save_prefix + "-phi.csv") 405 | self.saveConvergencePlots() 406 | 407 | print "\nEvaluating results on held-out test set!! ..." 
368 | 
369 |     def findBestSetting(self, save_final_results=False):
370 |         accuracies = self.val_results_df['val_acc'].tolist()
371 |         max_acc = max(accuracies)
372 |         max_idx = accuracies.index(max_acc)
373 | 
374 |         print "BEST SETTING!"
375 |         print "The highest validation accuracy of", max_acc, "was found with the following settings:"
376 |         print self.val_results_df.iloc[max_idx]
377 | 
378 |         if self.test_csv_filename is not None or save_final_results:
379 |             self.getFinalResultsAndSave(self.val_results_df.iloc[max_idx])
380 |         else:
381 |             print "Not running Final results"
382 |         return self.val_results_df.iloc[max_idx]
383 | 
384 |     def run(self):
385 |         self.sweepAllParameters()
386 |         return self.findBestSetting()
387 | 
388 |     def getFinalResultsAndSave(self, setting_dict):
389 |         if self.val_type == 'cross':
390 |             print "\nPlotting cross-validation results for best settings..."
391 |             self.getCrossValidationResults(dict(), setting_dict['tau10'], setting_dict['tau20'],
392 |                                            setting_dict['sigma_multiplier'], setting_dict['mu_multiplier'],
393 |                                            save_plots=True)
394 | 
395 | 
396 |         print "\nRetraining on training data with the best settings..."
397 |         self.initializeHBLRModel(self.train_tasks)
398 |         self.classifier.verbose = True
399 |         self.setClassifierToSetting(setting_dict['tau10'], setting_dict['tau20'], setting_dict['sigma_multiplier'], setting_dict['mu_multiplier'])
400 |         self.classifier.trainUntilConverged()
401 | 
402 |         print "\nPlotting and saving cool stuff about the final model..."
403 |         self.saveImagePlot(self.classifier.phi, 'Phi')
404 |         pd.DataFrame(self.classifier.phi).to_csv(self.results_path + self.save_prefix + "-phi.csv")
405 |         self.saveConvergencePlots()
406 | 
407 |         print "\nEvaluating results on held-out test set!! ..."
408 |         all_preds = []
409 |         all_true_y = []
410 |         all_X_data = []
411 |         per_task_accs = [np.nan] * self.n_tasks
412 |         per_task_aucs = [np.nan] * self.n_tasks
413 |         per_task_f1 = [np.nan] * self.n_tasks
414 |         per_task_precision = [np.nan] * self.n_tasks
415 |         per_task_recall = [np.nan] * self.n_tasks
416 |         for t in range(self.n_tasks):
417 |             preds = self.classifier.predictBinary(self.test_tasks[t]['X'], t)
418 |             true_y = list(self.test_tasks[t]['Y'].flatten())
419 | 
420 |             if len(preds) == 0 or len(true_y) == 0:
421 |                 continue
422 | 
423 |             all_preds.extend(preds)
424 |             all_true_y.extend(true_y)
425 |             all_X_data.extend(self.test_tasks[t]['X'])
426 | 
427 |             # save the per-task results
428 |             t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y)
429 |             per_task_accs[t] = t_acc
430 |             per_task_aucs[t] = t_auc
431 |             per_task_f1[t] = t_f1
432 |             per_task_precision[t] = t_precision
433 |             per_task_recall[t] = t_recall
434 | 
435 |         print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS"
436 |         acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(all_preds, all_true_y)
437 |         print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall
438 | 
439 |         print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
440 |         avg_acc = np.nanmean(per_task_accs)
441 |         avg_auc = np.nanmean(per_task_aucs)
442 |         avg_f1 = np.nanmean(per_task_f1)
443 |         avg_precision = np.nanmean(per_task_precision)
444 |         avg_recall = np.nanmean(per_task_recall)
445 |         print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall
446 | 
447 |         print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK"
448 |         if not self.users_as_tasks:
449 |             for t in range(self.n_tasks):
450 |                 task_name = self.test_tasks[t]['Name']
451 |                 task_name = helper.getFriendlyLabelName(task_name)
452 |                 print "\t\t", task_name, "- Acc:", per_task_accs[t], "AUC:", per_task_aucs[t], 'F1:', per_task_f1[t], 'Precision:', per_task_precision[t], 'Recall:', per_task_recall[t]
453 | 
454 |         if self.test_csv_filename is not None:
455 |             print "\tSAVING HELD OUT PREDICTIONS"
456 |             if self.users_as_tasks:
457 |                 task_column = 'user_id'
458 |                 label_name = helper.getFriendlyLabelName(self.file_prefix)
459 |                 wanted_label = helper.getOfficialLabelName(label_name)
460 |                 predictions_df = helper.get_test_predictions_for_df_with_task_column(
461 |                     self.classifier.predictBinary, self.test_csv_filename, task_column, self.test_tasks,
462 |                     wanted_label=wanted_label, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
463 |                     label_name=label_name, tasks_are_ints=False)
464 |             else:
465 |                 predictions_df = helper.get_test_predictions_for_df_with_no_task_column(self.classifier.predictBinary,
466 |                     self.test_csv_filename, self.test_tasks, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
467 |             predictions_df.to_csv(self.results_path + "Preds-" + self.save_prefix + '.csv')
468 |         else:
469 |             print "Uh oh, the test csv filename was not set, can't save test preds"
470 | 
471 |         print "\t SAVING CLASSIFIER"
472 |         with open(self.results_path + "PickledModel-" + self.save_prefix + '.p', "w") as f:
473 |             pickle.dump(self.classifier, f)
474 | 
475 |     def saveHintonPlot(self, matrix, num_tests, max_weight=None, ax=None):
476 |         """Draw Hinton diagram for visualizing a weight matrix."""
477 |         fig, ax = plt.subplots(1, 1)
478 | 
479 |         if not max_weight:
480 |             max_weight = 2**np.ceil(np.log(np.abs(matrix).max())/np.log(2))
481 | 
482 |         ax.patch.set_facecolor('gray')
483 |         ax.set_aspect('equal', 'box')
484 |         ax.xaxis.set_major_locator(plt.NullLocator())
485 |         ax.yaxis.set_major_locator(plt.NullLocator())
486 | 
487 |         for (x, y), w in np.ndenumerate(matrix):
488 |             color = 'white' if w > 0 else 'black'
489 |             size = np.sqrt(np.abs(0.5*w/num_tests))  # Need to scale so that it is between 0 and 0.5
490 |             rect = plt.Rectangle([x - size / 2, y - size / 2], size, size,
491 |                                  facecolor=color, edgecolor=color)
492 |             ax.add_patch(rect)
493 | 
494 |         ax.autoscale_view()
495 |         ax.invert_yaxis()
496 |         plt.savefig(self.figures_path + self.save_prefix + '-Hinton.eps')
497 |         plt.close()
498 | 
499 |     def plotAccuracyAucAndClusters(self, accs, aucs, clusters):
500 |         fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10,10))
501 |         ax1.hist(accs)
502 |         ax1.set_title("Accuracy")
503 |         ax2.hist(aucs)
504 |         ax2.set_title("AUC")
505 |         ax3.hist(clusters)
506 |         ax3.set_title("Number of Clusters (K)")
507 |         plt.savefig(self.figures_path + self.save_prefix + '-AccAucClusters.eps')
508 |         plt.close()
509 | 
510 |     def saveConvergencePlots(self):
511 |         hblr.plotConvergence(self.classifier.xi_convergence_list, 'Xi convergence', save_path=self.figures_path + self.save_prefix + '-ConvergenceXi.eps')
512 |         hblr.plotConvergence(self.classifier.theta_convergence_list, 'Theta convergence', save_path=self.figures_path + self.save_prefix + '-ConvergenceTheta.eps')
513 |         hblr.plotConvergence(self.classifier.phi_convergence_list, 'Phi convergence', save_path=self.figures_path + self.save_prefix + '-ConvergencePhi.eps')
514 | 
515 |     def saveImagePlot(self, matrix, name):
516 |         plt.figure()
517 |         plt.imshow(matrix)
518 |         plt.savefig(self.figures_path + self.save_prefix + "-" + name + ".eps")
519 |         plt.close()
520 | 
521 |     def updateSameTaskMatrix(self, same_task_matrix):
522 |         most_likely_cluster = np.argmax(self.classifier.phi, axis=1)
523 |         for row_task in range(self.n_tasks):
524 |             for col_task in range(self.n_tasks):
525 |                 if most_likely_cluster[row_task] == most_likely_cluster[col_task]:
526 |                     same_task_matrix[row_task, col_task] += 1
527 |         return same_task_matrix
528 | 
529 | 
Will save and print per-task results" 551 | 552 | if len(sys.argv) >= 4 and sys.argv[3] == 'True': 553 | cont = True 554 | print "Okay, will continue from a previously saved validation results file for this problem" 555 | else: 556 | cont = False 557 | print "" 558 | 559 | if len(sys.argv) >= 5: 560 | csv_test_file = sys.argv[4] 561 | print "Okay, will get final test results on file", csv_test_file 562 | print "" 563 | else: 564 | csv_test_file = None 565 | 566 | wrapper = HBLRWrapper(filename, users_as_tasks=users_as_tasks, cont=cont, test_csv_filename=csv_test_file) 567 | 568 | print "\nThe following parameter settings will be tested:" 569 | print "\ttau10: \t", wrapper.tau10s 570 | print "\ttau20: \t", wrapper.tau20s 571 | print "\tsigma multipliers: \t", wrapper.sigma_multipliers 572 | print "\tmu multipliers: \t", wrapper.mu_multipliers 573 | 574 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv' 575 | print "\nThe validation and testing figures will be saved in:", wrapper.figures_path + wrapper.save_prefix 576 | 577 | wrapper.run() 578 | -------------------------------------------------------------------------------- /HBLR/HBLR_Distribution.py: -------------------------------------------------------------------------------- 1 | from scipy import stats 2 | import numpy as np 3 | 4 | 5 | # 6 | 7 | class SubDistribution(stats.rv_continuous): 8 | def __init__(self, gauss_weight, point_dist_weight, point_centers, point_weights, mu=0, sigma=1): 9 | super(SubDistribution, self).__init__() 10 | self.gauss_weight = gauss_weight 11 | self.point_dist_weight = point_dist_weight 12 | self.point_centers = point_centers 13 | self.point_weights = point_weights 14 | 15 | self.normal_dist = stats.norm(loc=mu, scale=sigma) # scale is standard deviation 16 | 17 | def _pdf(self, x): 18 | gauss_pdf = self.normal_dist.pdf(x) 19 | point_pdf = np.array( 20 | [self.point_weights[self.point_centers.index(x_i)] if x_i in self.point_centers else 0 for x_i in x]) 21 | 22 | return self.gauss_weight * gauss_pdf + self.point_dist_weight * point_pdf 23 | 24 | def _cdf(self, x): 25 | gauss_cdf = self.normal_dist.cdf(x) 26 | point_cdf = np.array( 27 | [sum(w for p_c, w in zip(self.point_centers, self.point_weights) if p_c < x_i) for x_i in x]) 28 | 29 | return self.gauss_weight * gauss_cdf + self.point_dist_weight * point_cdf 30 | 31 | 32 | class MainDistribution(): 33 | def __init__(self, gauss_weight, point_dist_weight, point_centers_matrix, point_weights, mu=0, sigma=1): 34 | self.gauss_weight = gauss_weight # (tau1/tau2)/(M+(tau1/tau2)) 35 | self.point_dist_weight = point_dist_weight # (1/(M+(tau1/tau2)) 36 | self.point_centers_matrix = point_centers_matrix # theta_ks; size K by num_feats 37 | self.point_weights = point_weights # sum_m=1^M phi_m,k; size K 38 | self.mu = mu 39 | self.sigma = sigma 40 | self.dists = [] # list of distributions, assumed to be independent of one another; size of num_feats 41 | self.set_up_dists() 42 | 43 | def set_up_dists(self): 44 | # print [self.point_centers_matrix[j][0] for j in range(len(self.point_centers_matrix))] 45 | # print [self.point_centers_matrix[j][1] for j in range(len(self.point_centers_matrix))] 46 | self.dists = [SubDistribution(self.gauss_weight, self.point_dist_weight, 47 | [self.point_centers_matrix[j][i] for j in range(len(self.point_centers_matrix))], 48 | self.point_weights, self.mu, self.sigma) for i in 49 | range(len(self.point_centers_matrix[0]))] 50 | 51 | def rvs(self, size): 52 | 
51 |     def rvs(self, size):
52 |         random_sample = []
53 |         for i in range(size):
54 |             random_sample.append([d.rvs() for d in self.dists])
55 | 
56 |         return random_sample
57 | 
58 |     def marginal_pdfs(self, x):
59 |         return [d.pdf(x) for d in self.dists]
60 | 
61 |     def marginal_cdfs(self, x):
62 |         return [d.cdf(x) for d in self.dists]
63 | 
64 | 
65 | def test_single_distribution():
66 |     import matplotlib.pyplot as plt
67 |     from matplotlib.colors import LogNorm
68 |     from mpl_toolkits.mplot3d import axes3d
69 |     from matplotlib import cm
70 | 
71 |     distribution = SubDistribution(.5, .5, [0, 2.39994], [.2, .8], mu=.5)
72 | 
73 |     x_vals = np.linspace(-3, 3, 100001)
74 |     plt.plot(x_vals, distribution.pdf(x_vals))
75 |     plt.show()
76 | 
77 |     plt.plot(x_vals, distribution.cdf(x_vals))
78 |     plt.show()
79 | 
80 |     samples = distribution.rvs(size=1000)
81 | 
82 |     print "Percent of samples at 0", sum(abs(samples) < 1e-13) / float(len(samples))
83 |     print "Percent of samples at 2", sum(abs(samples - 2.39994) < 1e-13) / float(len(samples))
84 | 
85 |     plt.hist(samples, bins=np.arange(-4, 4, .25), normed=True)
86 |     plt.plot(x_vals, distribution.pdf(x_vals))
87 |     plt.show()
88 | 
89 | 
90 | def test_full_distribution():
91 |     import matplotlib.pyplot as plt
92 |     from matplotlib.colors import LogNorm
93 |     from mpl_toolkits.mplot3d import axes3d
94 |     from matplotlib import cm
   |     import numpy.matlib  # needed for the np.matlib.repmat calls below
95 | 
96 |     distribution = MainDistribution(.5, .5, [[0, .5], [0, -2.0], [-3.0, 3.0]], [.2, .6, .2])
97 | 
98 |     x_vals = np.linspace(-5, 5, 1001)
99 |     pdfs = distribution.marginal_pdfs(x_vals)
100 |     cdfs = distribution.marginal_cdfs(x_vals)
101 |     plt.figure()
102 |     plt.subplot(2, 1, 1)
103 |     plt.plot(x_vals, pdfs[0])
104 |     plt.subplot(2, 1, 2)
105 |     plt.plot(x_vals, pdfs[1])
106 |     plt.show()
107 | 
108 |     plt.figure()
109 |     plt.subplot(2, 1, 1)
110 |     plt.plot(x_vals, cdfs[0])
111 |     plt.subplot(2, 1, 2)
112 |     plt.plot(x_vals, cdfs[1])
113 |     plt.show()
114 | 
115 |     samples = distribution.rvs(size=3000)
116 |     plt.figure()
117 |     plt.subplot(2, 1, 1)
118 |     plt.hist([s[0] for s in samples])
119 |     plt.subplot(2, 1, 2)
120 |     plt.hist([s[1] for s in samples])
121 |     plt.show()
122 | 
123 |     # normal distribution center at x=0 and y=5
124 | 
125 |     plt.hist2d([s[0] for s in samples], [s[1] for s in samples], bins=40, norm=LogNorm())
126 |     plt.colorbar()
127 |     plt.show()
128 | 
129 |     joint_pdf = np.atleast_2d(pdfs[0]) * np.atleast_2d(pdfs[1]).T  # outer product of the two marginal pdfs (dimensions are independent)
130 | 
131 |     fig = plt.figure()
132 |     ax = fig.add_subplot(111, projection='3d')
133 |     # Plot a basic wireframe.
134 |     # ax.plot_wireframe(np.matlib.repmat(x_vals,1001,1), np.matlib.repmat(np.atleast_2d(x_vals).T,1,1001), joint_pdf, rstride=10, cstride=10)
135 |     surf = ax.plot_surface(np.matlib.repmat(x_vals, 1001, 1), np.matlib.repmat(np.atleast_2d(x_vals).T, 1, 1001),
136 |                            joint_pdf, cmap=cm.coolwarm, linewidth=0, antialiased=False)
137 | 
138 |     plt.show()
139 | 
140 | 
--------------------------------------------------------------------------------
/LSSVM/LSSVM.py:
--------------------------------------------------------------------------------
1 | """Implements a Least Squares Support Vector Machine (LS-SVM)."""
2 | from sklearn.metrics.pairwise import rbf_kernel
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | 
6 | 
7 | class LSSVM:
8 |     def __init__(self, C, kernel_func, debug=False):
9 |         ''' Least-squares svm: svm with squared loss and binary classification yi in {-1,1}
10 |         C: regularization constant weighting the squared error term
11 |         kernel_func: function that takes 2 arguments (X1, X2) and returns kernel matrix of size (len(X1),len(X2))
12 |         '''
13 | 
14 |         self.C = C
15 |         self.kernel_func = kernel_func
16 | 
17 |         self.data = None
18 |         self.y = None
19 |         self.b = None
20 |         self.alphas = None
21 | 
22 |         self.debug = debug
23 | 
24 |     def fit(self, X, y):
25 |         ''' Solves the linear system
26 |             A x = b
27 |             |0    Y.T            | * | b     | = |0|
28 |             |Y    Omega+(1/C)*I  |   | alpha |   |1|
29 | 
30 |         Note Omega[i,j] = y[i]*y[j]*K(x[i],x[j])
31 |         '''
32 |         self.data = X
33 | 
34 | 
35 |         # Make sure y is the right dimension
36 |         y = np.atleast_2d(y)
37 |         if np.shape(y)[0] == 1:
38 |             y = y.T
39 | 
40 |         self.y = y
41 | 
42 |         N = len(X)
43 | 
44 |         K = self.kernel_func(self.data, self.data)
45 | 
46 |         Omega = np.dot(self.y, self.y.T)*K
47 |         bottom_right = Omega + (1.0/self.C)*np.eye(N)
48 | 
49 |         assert np.shape(bottom_right) == (N,N), "The bottom right matrix is the wrong size"
50 | 
51 |         if self.debug:
52 |             print "K", K
53 |             print "K nans", np.sum(np.isnan(K))
54 | 
55 | 
56 | 
57 |         first_row = np.hstack([np.zeros((1,1)), self.y.T])
58 |         bottom_mat = np.hstack([self.y, bottom_right])
59 |         A = np.vstack([first_row, bottom_mat])
60 | 
61 |         b_vec = np.vstack([0, np.ones((N,1))])
62 | 
63 |         try:
64 |             params, residuals, rank, s = np.linalg.lstsq(A, b_vec)
65 |         except:
66 |             print "\n------WARNING!!! These parameters didn't converge!------\n"
67 |             return False
68 | 
69 |         self.b = params[0]
70 |         self.alphas = params[1:]
71 | 
72 |         return True
73 | 
74 | 
75 |     def predict(self, test_data):
76 |         assert (self.b is not None) and (self.alphas is not None), "Model not trained yet"
77 | 
78 |         K = self.kernel_func(self.data, test_data)
79 | 
80 |         alphaY = self.alphas*self.y
81 | 
82 |         y_hat = np.sign(np.dot(alphaY.T, K)+self.b)
83 | 
84 |         return y_hat[0]
85 | 
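   | # Usage sketch (hypothetical data; labels must be in {-1, 1}):
   | #     clf = LSSVM(C=1.0, kernel_func=lambda X1, X2: np.dot(X1, X2.T))
   | #     if clf.fit(train_X, train_y):
   | #         y_hat = clf.predict(test_X)  # array of -1/+1 predictions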
--------------------------------------------------------------------------------
/LSSVM/LSSVMWrapper.py:
--------------------------------------------------------------------------------
1 | """Performs hyperparameter sweep for the Least Squares Support Vector Machine
2 | (LS-SVM)."""
3 | import matplotlib
4 | matplotlib.use('Agg')
5 | import numpy as np
6 | import pandas as pd
7 | import sys
8 | import os
9 | import copy
10 | from time import time
11 | from sklearn.metrics.pairwise import rbf_kernel
12 | 
13 | CODE_PATH = os.path.dirname(os.getcwd())
14 | sys.path.append(CODE_PATH)
15 | 
16 | DEFAULT_RESULTS_PATH = '/Your/path/here/'
17 | DEFAULT_DATASETS_PATH = '/Your/path/here/'
18 | DEFAULT_FIGURES_PATH = '/Your/path/here/'
   | PATH_TO_DROPBOX = '/Your/path/here/'  # assumed default for the generic wrapper's dropbox_path argument
19 | 
20 | from generic_wrapper import STLWrapper
21 | import helperFuncs as helper
22 | import LSSVM as lssvm
23 | 
24 | C_VALS = [0.1, 1.0, 10.0, 100.0]   #values for the C parameter of SVM to test
25 | BETA_VALS = [.0001, .01, .1, 1]    #values for the Beta parameter of rbf kernel to test
26 | KERNELS = ['linear', 'rbf']        #could also try 'poly' and 'sigmoid'
27 | DEFAULT_VALIDATION_TYPE = 'cross'  #'cross' for cross-validation, 'val' for single validation
28 | VERBOSE = True                     #set to true to see more output
29 | NUM_BOOTSTRAPS = 5
30 | DEFAULT_NUM_CROSS_FOLDS = 5
31 | SAVE_RESULTS_EVERY_X_TESTS = 1
32 | 
33 | def reload_dependencies():
34 |     reload(helper)
35 |     reload(lssvm)
36 | 
37 | class LSSVMWrapper(STLWrapper):
38 |     def __init__(self, file_prefix, users_as_tasks=False, cont=False, c_vals=C_VALS, beta_vals=BETA_VALS,
39 |                  kernels=KERNELS, num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, dropbox_path=PATH_TO_DROPBOX,
40 |                  datasets_path='Data/', test_csv_filename=None):
41 |         self.c_vals = c_vals
42 |         self.beta_vals = beta_vals
43 |         self.kernels = kernels
44 | 
45 |         STLWrapper.__init__(self, file_prefix, users_as_tasks=users_as_tasks, cont=cont,
46 |             classifier_name='LSSVM', num_cross_folds=num_cross_folds, dropbox_path=dropbox_path,
47 |             datasets_path=datasets_path, cant_train_with_one_class=True,
48 |             save_results_every_nth=SAVE_RESULTS_EVERY_X_TESTS, test_csv_filename=test_csv_filename)
49 | 
50 |         self.trim_extra_linear_params()
51 | 
52 |         self.models = [None] * self.n_tasks
53 | 
54 |     def define_params(self):
55 |         self.params = {}
56 |         self.params['C'] = self.c_vals
57 |         self.params['beta'] = self.beta_vals
58 |         self.params['kernel'] = self.kernels
59 | 
60 |     def train_and_predict_task(self, t, train_X, train_y, eval_X, param_dict):
61 |         kernel_func = self.get_kernel_func(param_dict['kernel'], param_dict['beta'])
62 |         self.models[t] = lssvm.LSSVM(C=param_dict['C'], kernel_func=kernel_func)
63 |         converged = self.models[t].fit(train_X, train_y)
64 | 
65 |         if converged:
66 |             preds = self.models[t].predict(eval_X)
67 |         else:
68 |             # predict majority class
69 |             preds = np.sign(np.mean(train_y))*np.ones(len(eval_X))
70 | 
71 |         return preds
72 | 
73 |     def predict_task(self, X, t):
74 |         if self.models[t] is None:
75 |             print "ERROR! No model has been trained!"
76 | 
77 |         preds = self.models[t].predict(X)
78 |         return (preds + 1.0) / 2  # map {-1, 1} predictions to {0, 1}
79 | 
80 |     # use something like the following to test only one set of parameters:
81 |     # wrapper.setParams(c_vals=[10], beta_vals=[.01], kernels=['rbf'])
82 |     def set_params(self, c_vals=None, beta_vals=None, kernels=None):
83 |         '''does not override existing parameter settings if the parameter is not set'''
84 |         self.c_vals = c_vals if c_vals is not None else self.c_vals
85 |         self.beta_vals = beta_vals if beta_vals is not None else self.beta_vals
86 |         self.kernels = kernels if kernels is not None else self.kernels
87 |         self.define_params()
88 | 
89 |     def get_kernel_func(self, kernel_name, beta):
90 |         if kernel_name == 'rbf':
91 |             def rbf(x1, x2):
92 |                 return rbf_kernel(x1, x2, gamma=beta)  # from sklearn
93 |             return rbf
94 |         else:
95 |             def dot_product(x1, x2):
96 |                 return np.dot(x1, x2.T)
97 |             return dot_product
98 | 
99 |     def trim_extra_linear_params(self):
100 |         single_beta = None
101 |         i = 0
102 |         while i < len(self.list_of_param_settings):
103 |             setting = self.list_of_param_settings[i]
104 |             if setting['kernel'] == 'linear':
105 |                 if single_beta is None:
106 |                     single_beta = setting['beta']
107 |                 elif setting['beta'] != single_beta:
108 |                     self.list_of_param_settings.remove(setting)
109 |                     continue
110 |             i += 1
111 | 
112 | if __name__ == "__main__":
113 |     print "LSSVM MODEL SELECTION"
114 |     print "\tThis code will sweep a set of parameters to find the ideal settings for LS SVM for a single dataset"
115 | 
116 |     if len(sys.argv) < 3:
117 |         print "Error: usage is python LSSVMWrapper.py <file prefix> <users as tasks> <continue> <test csv filename>"
118 |         print "\t<file prefix>: e.g. datasetTaskList-Discard-Future-Group_ - program will look in the following directory for this file", DEFAULT_DATASETS_PATH
119 |         print "\t<users as tasks>: type 'users' for users as tasks, or 'wellbeing' for wellbeing measures as tasks"
120 |         print "\t<continue>: optional. If 'True', the wrapper will pick up from where it left off by loading a previous validation results file"
121 |         print "\t<test csv filename>: optional. If you want to get the final test results, provide the name of a csv file to test on"
122 |         sys.exit()
123 |     file_prefix = sys.argv[1]  #get data file from command line argument
124 |     print "\nLoading dataset", DEFAULT_DATASETS_PATH + file_prefix
125 |     print ""
126 | 
127 |     if sys.argv[2] == 'users':
128 |         users_as_tasks = True
129 |         print "Okay, treating users as tasks. Will not print per-task results"
130 |     else:
131 |         users_as_tasks = False
132 |         print "Okay, treating wellbeing measures as tasks. Will save and print per-task results"
Will save and print per-task results" 133 | 134 | if len(sys.argv) >= 4 and sys.argv[3] == 'True': 135 | cont = True 136 | print "Okay, will continue from a previously saved validation results file for this problem" 137 | else: 138 | cont = False 139 | print "" 140 | 141 | if len(sys.argv) >= 5: 142 | csv_test_file = sys.argv[4] 143 | print "Okay, will get final test results on file", csv_test_file 144 | print "" 145 | else: 146 | csv_test_file = None 147 | 148 | wrapper = LSSVMWrapper(file_prefix, users_as_tasks=users_as_tasks, cont=cont, 149 | test_csv_filename=csv_test_file) 150 | 151 | print "\nThe following parameter settings will be tested:" 152 | print "\tC_VALS: \t", wrapper.c_vals 153 | print "\tBETAS: \t", wrapper.beta_vals 154 | print "\tKERNELS: \t", wrapper.kernels 155 | 156 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv' 157 | print "\nThe validation and testing figures will be saved in:", wrapper.figures_path + wrapper.save_prefix 158 | 159 | wrapper.run() 160 | 161 | -------------------------------------------------------------------------------- /LogisticRegression/LR.py: -------------------------------------------------------------------------------- 1 | """Simple Logistic Regression (LR) classifier.""" 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import roc_curve, auc 7 | import sys 8 | import os 9 | import pickle 10 | 11 | CODE_PATH = os.path.dirname(os.getcwd()) 12 | sys.path.append(CODE_PATH) 13 | import helperFuncs as helper 14 | 15 | def reloadHelper(): 16 | reload(helper) 17 | 18 | # http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html 19 | class LR: 20 | def __init__(self, penalty='l2', C=0.01, tol=0.001, solver= 'liblinear'): 21 | #data features 22 | self.n_features = None 23 | self.train_X = [] 24 | self.train_Y = [] 25 | self.val_X = [] 26 | self.val_Y = [] 27 | self.test_X = [] 28 | self.test_Y = [] 29 | 30 | #classifier features 31 | self.penalty = penalty 32 | self.C = C 33 | self.tolerance = tol 34 | self.solver = solver 35 | 36 | def setTrainData(self, X, Y): 37 | self.train_X = X 38 | self.train_Y = Y 39 | 40 | self.n_features = self.train_X.shape[1] 41 | 42 | def setTestData(self, X, Y): 43 | self.test_X = X 44 | self.test_Y = Y 45 | 46 | def setPenalty(self, penalty): 47 | self.penalty = penalty 48 | 49 | def setC(self, C): 50 | self.C = C 51 | 52 | def setSolver(self, solver): 53 | self.solver = solver 54 | 55 | def setValData(self, X, Y): 56 | self.val_X = X 57 | self.val_Y = Y 58 | 59 | def train(self): 60 | self.classifier = LogisticRegression(penalty=self.penalty, C=self.C, tol=self.tolerance, solver=self.solver) 61 | self.classifier.fit(self.train_X, self.train_Y) 62 | 63 | def predict(self, X): 64 | return self.classifier.predict(X) 65 | 66 | def getScore(self, X, Y): 67 | #returns accuracy 68 | return self.classifier.score(X, Y) 69 | 70 | def getFPRandTPR(self,X,Y): 71 | probas_ = self.classifier.fit(self.train_X, self.train_Y).predict_proba(X) 72 | fpr, tpr, thresholds = roc_curve(Y, probas_[:, 1]) 73 | return fpr, tpr 74 | 75 | def getAUC(self,X,Y): 76 | fpr, tpr = self.getFPRandTPR(X,Y) 77 | return auc(fpr,tpr) 78 | 79 | def saveClassifierToFile(self, filepath): 80 | s = pickle.dumps(self.classifier) 81 | f = open(filepath, 'w') 82 | f.write(s) 83 | 84 | def loadClassifierFromFile(self, filepath): 85 | f2 = 
open(filepath, 'r') 86 | s2 = f2.read() 87 | self.classifier = pickle.loads(s2) 88 | 89 | -------------------------------------------------------------------------------- /LogisticRegression/LRWrapper.py: -------------------------------------------------------------------------------- 1 | """Performs hyperparameter sweep for the logistic regression (LR) model.""" 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import numpy as np 5 | import pandas as pd 6 | import sys 7 | import os 8 | import copy 9 | from time import time 10 | from sklearn.metrics.pairwise import rbf_kernel 11 | 12 | CODE_PATH = os.path.dirname(os.getcwd()) 13 | sys.path.append(CODE_PATH) 14 | 15 | DEFAULT_RESULTS_PATH = '/Your/path/here/' 16 | DEFAULT_DATASETS_PATH = '/Your/path/here/' 17 | DEFAULT_FIGURES_PATH = '/Your/path/here/' PATH_TO_DROPBOX = '/Your/path/here/' #placeholder root path passed to STLWrapper below 18 | 19 | from generic_wrapper import STLWrapper 20 | import helperFuncs as helper 21 | import LR as lr 22 | 23 | #Parameter values 24 | C_VALS = [ 0.001, 0.01, 0.1, 1.0, 10.0, 100.0] 25 | PENALTIES = ['l1', 'l2'] 26 | SOLVER = 'liblinear' #newton-cg, lbfgs, liblinear, sag 27 | DEFAULT_VALIDATION_TYPE = 'cross' #'cross' for cross-validation, 'val' for single validation 28 | DEFAULT_NUM_CROSS_FOLDS = 5 29 | NUM_BOOTSTRAPS = 5 30 | VERBOSE = True #set to true to see more output 31 | SAVE_RESULTS_EVERY_X_TESTS = 1 32 | 33 | def reload_dependencies(): 34 | reload(helper) 35 | reload(lr) 36 | 37 | class LRWrapper(STLWrapper): 38 | def __init__(self, file_prefix, users_as_tasks=False, cont=False, c_vals=C_VALS, 39 | penalties=PENALTIES, solver=SOLVER, num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, 40 | dropbox_path=PATH_TO_DROPBOX, datasets_path='Data/', 41 | test_csv_filename=None): 42 | self.c_vals = c_vals 43 | self.penalties = penalties 44 | self.solver = solver 45 | 46 | STLWrapper.__init__(self, file_prefix, users_as_tasks=users_as_tasks, cont=cont, 47 | classifier_name='LR', num_cross_folds=num_cross_folds, dropbox_path=dropbox_path, 48 | datasets_path=datasets_path, cant_train_with_one_class=False, 49 | save_results_every_nth=SAVE_RESULTS_EVERY_X_TESTS, test_csv_filename=test_csv_filename) 50 | 51 | self.models = [None] * self.n_tasks 52 | 53 | def define_params(self): 54 | self.params = {} 55 | self.params['C'] = self.c_vals 56 | self.params['penalty'] = self.penalties 57 | 58 | def train_and_predict_task(self, t, train_X, train_y, eval_X, param_dict): 59 | self.models[t] = lr.LR(penalty=param_dict['penalty'], C=param_dict['C'], solver=self.solver) 60 | self.models[t].setTrainData(train_X, train_y) 61 | self.models[t].train() 62 | preds = self.models[t].predict(eval_X) 63 | 64 | return preds 65 | 66 | def predict_task(self, X, t): 67 | if self.models[t] is None: 68 | print "ERROR! No model has been trained!" 
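# Note: predict() returns labels in {-1, +1} (assuming training labels were
# coded as -1/+1, as in the other models in this repo); the (preds + 1.0) / 2
# transform below maps them to {0, 1}, e.g.
#   np.array([-1., 1., 1.])  ->  array([ 0.,  1.,  1.])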
69 | 70 | preds = self.models[t].predict(X) 71 | return (preds + 1.0) / 2 72 | 73 | # use something like the following to test only one set of parameters: 74 | # wrapper.set_params(c_vals=[10], penalties=['l2']) 75 | def set_params(self, c_vals=None, penalties=None, solver=None): 76 | '''does not override existing parameter settings if the parameter is not set''' 77 | self.c_vals = c_vals if c_vals is not None else self.c_vals 78 | self.penalties = penalties if penalties is not None else self.penalties 79 | self.solver = solver if solver is not None else self.solver 80 | self.define_params() 81 | 82 | 83 | if __name__ == "__main__": 84 | print "LOGISTIC REGRESSION (LR) MODEL SELECTION" 85 | print "\tThis code will sweep a set of parameters to find the ideal settings for LR for a single dataset" 86 | 87 | if len(sys.argv) < 3: 88 | print "Error: usage is python LRWrapper.py <file_prefix> <task_type> <continue> <csv_test_file>" 89 | print "\t<file_prefix>: e.g. datasetTaskList-Discard-Future-Group_ - program will look in the following directory for this file", DEFAULT_DATASETS_PATH 90 | print "\t<task_type>: type 'users' for users as tasks, or 'wellbeing' for wellbeing measures as tasks" 91 | print "\t<continue>: optional. If 'True', the wrapper will pick up from where it left off by loading a previous validation results file" 92 | print "\t<csv_test_file>: optional. If you want to get the final test results, provide the name of a csv file to test on" 93 | sys.exit() 94 | file_prefix = sys.argv[1] #get data file from command line argument 95 | print "\nLoading dataset", DEFAULT_DATASETS_PATH + file_prefix 96 | print "" 97 | 98 | if sys.argv[2] == 'users': 99 | users_as_tasks = True 100 | print "Okay, treating users as tasks. Will not print per-task results" 101 | else: 102 | users_as_tasks = False 103 | print "Okay, treating wellbeing measures as tasks. Will save and print per-task results" 104 | 105 | if len(sys.argv) >= 4 and sys.argv[3] == 'True': 106 | cont = True 107 | print "Okay, will continue from a previously saved validation results file for this problem" 108 | else: 109 | cont = False 110 | print "" 111 | 112 | if len(sys.argv) >= 5: 113 | csv_test_file = sys.argv[4] 114 | print "Okay, will get final test results on file", csv_test_file 115 | print "" 116 | else: 117 | csv_test_file = None 118 | 119 | wrapper = LRWrapper(file_prefix, users_as_tasks=users_as_tasks, cont=cont, 120 | test_csv_filename=csv_test_file) 121 | 122 | print "\nThe following parameter settings will be tested:" 123 | print "\tC_VALS: \t", wrapper.c_vals 124 | print "\tPENALTIES: \t", wrapper.penalties 125 | 126 | print "\nOptimization will be performed with the following solver:" 127 | print "\tSolver: \t", wrapper.solver 128 | 129 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv' 130 | print "\nThe validation and testing figures will be saved in:", wrapper.figures_path + wrapper.save_prefix 131 | 132 | wrapper.run() 133 | 134 | -------------------------------------------------------------------------------- /MTMKL/MTMKL.py: -------------------------------------------------------------------------------- 1 | """Implements Multi-task Multi-kernel Learning (MTMKL) 2 | 3 | This multi-task learning (MTL) classifier learns a set of kernels for different 4 | groups of features (or feature modalities). Each task learns to combine these 5 | kernels with a different set of weights. The weights are regularized globally 6 | to share information among the tasks. 
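Concretely, each task t scores points with a convex combination of
per-modality kernels (see constructKernelFunction and createConstraintList
below): k_t(x, x') = sum_m eta[t, m] * k_m(x_m, x'_m) / max|K_m|, with
eta[t, m] >= 0 and sum_m eta[t, m] = 1, where x_m is the block of features
belonging to modality m and K_m is the Gram matrix of kernel k_m (each
kernel is normalized by its largest absolute entry).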
7 | 8 | This model was originally proposed in: 9 | Kandemir, M., Vetek, A., Goenen, M., Klami, A., & Kaski, S. (2014). 10 | Multi-task and multi-view learning of user state. Neurocomputing, 139, 97-106. 11 | """ 12 | import numpy as np 13 | import scipy.optimize as opt 14 | import scipy.linalg as la 15 | from scipy import interp 16 | import math 17 | 18 | from sklearn.metrics import roc_curve, auc 19 | from sklearn.metrics import roc_auc_score 20 | from sklearn.metrics.pairwise import rbf_kernel, euclidean_distances, cosine_similarity from sklearn.svm import SVC #used by getAUC and getAUCOneTask below 21 | import numpy.linalg as LA 22 | 23 | import pandas as pd 24 | import sys 25 | import os 26 | import random 27 | import pickle 28 | import copy 29 | import operator 30 | import datetime 31 | 32 | from scipy.optimize import minimize 33 | 34 | CODE_PATH = os.path.dirname(os.getcwd()) 35 | sys.path.append(CODE_PATH) 36 | 37 | import helperFuncs as helper 38 | from LSSVM import LSSVM 39 | 40 | def reloadFiles(): 41 | reload(helper) 42 | print "Cannot reload LSSVM because of the way it was imported" 43 | 44 | 45 | reloadFiles() 46 | 47 | DEBUG = False 48 | VERBOSE = False 49 | 50 | class MTMKL: 51 | def __init__(self, task_dict_list, C=100.0, V=0.1, kernel_name='rbf', kernel_param=.01, regularizer=None, max_iter=50, 52 | max_iter_internal=-1, tol=0.001, eta_filename=None, debug=DEBUG, verbose=VERBOSE, drop20PercentTrainingData=False): 53 | '''INPUTS: 54 | task_dict_list: a particular format, defined here: https://docs.google.com/document/d/1BlMaluZnPTa0oznWrfy5sku44ydunv_kalGfG_yz49c/edit?usp=sharing''' 55 | #possible kernels: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable 56 | 57 | #data features 58 | self.train_tasks = task_dict_list 59 | self.val_tasks = None 60 | self.test_tasks = None 61 | 62 | self.modality_names, self.modality_start_indices = self.getModalityNamesIndices(task_dict_list) 63 | self.modality_start_indices.append(np.shape(task_dict_list[0]['X'])[1]) #append the number of columns 64 | 65 | self.n_tasks = len(self.train_tasks) #number of tasks 66 | self.n_views = len(self.modality_names) #number of views (one view can be one sensor or feature set, e.g. physiology features) 67 | self.eta = np.array([[1.0/self.n_views] * self.n_views] * self.n_tasks) #a matrix of size number of tasks x number of sensors 68 | self.last_eta = self.eta 69 | 70 | self.eta_filename = eta_filename 71 | if eta_filename is not None: 72 | eta_file = open(self.eta_filename,'w') 73 | #eta_file.write("//Eta matrix") 74 | eta_file.close() 75 | self.save_etas = True 76 | else: 77 | self.save_etas = False 78 | 79 | #MTMKL parameters 80 | self.V = V #V is a weight placed on the regularization. Small corresponds to unrelated tasks. 81 | #Large is enforcing similar kernel weights across tasks 82 | #Kandemir et al. 
recommends testing a range from 10^-4 to 10^4 83 | #V=0 is an independent, multi-kernel learner for each task 84 | self.C = C #C parameter for SVM classifiers 85 | self.regularizer = regularizer 86 | self.max_iter = max_iter #max iterations that MTMKL algorithm will run for 87 | self.regularizer_func= None 88 | self.regularizing_grad = None 89 | self.kernel_name = kernel_name 90 | self.setKernel(kernel_name, kernel_param) 91 | self.setRegularizer(regularizer) 92 | 93 | #internal SVM parameters 94 | self.max_iter_internal = max_iter_internal #max iterations for each scikit learn SVM within MTMKL 95 | self.tolerance = tol #convergence criteria for each scikit learn SVM within MTMKL 96 | 97 | self.classifiers = [0] * self.n_tasks 98 | 99 | self.debug=debug 100 | self.verbose=verbose 101 | self.drop20 = drop20PercentTrainingData 102 | 103 | if self.debug: print "MTMKL class has been initialized with", self.n_tasks, "tasks and", self.n_views, "sensors" 104 | 105 | @staticmethod 106 | def getModalityNamesIndices(task_dict_list): 107 | modality_dict = task_dict_list[0]['ModalityDict'] 108 | sorted_tuples = sorted(modality_dict.items(), key=operator.itemgetter(1)) 109 | names = [n for (n,i) in sorted_tuples] 110 | indices = [i for (n,i) in sorted_tuples] 111 | return names,indices 112 | 113 | def setTrainData(self, task_dict_list): 114 | self.train_tasks = task_dict_list 115 | 116 | def setTestData(self, task_dict_list): 117 | self.test_tasks = task_dict_list 118 | 119 | def setValData(self, task_dict_list): 120 | self.val_tasks = task_dict_list 121 | 122 | def setC(self, c): 123 | self.C = c 124 | 125 | def setV(self, V): 126 | self.V = V 127 | 128 | def setKernel(self, kernel_name, kernel_param): 129 | self.kernel_name = kernel_name 130 | if kernel_name == 'rbf': 131 | def rbf(x1,x2): 132 | return rbf_kernel(x1,x2, gamma=kernel_param) # from sklearn 133 | 134 | self.internal_kernel_func = rbf 135 | else: 136 | def dot_product(x1,x2): 137 | return cosine_similarity(x1,x2) # from sklearn - a normalized version of dot product #np.dot(x1,x2.T) 138 | self.internal_kernel_func = dot_product 139 | 140 | def setRegularizer(self,regularizer): 141 | self.regularizer = regularizer 142 | if regularizer == 'L1': 143 | self.regularizer_func = self.eta_L1 144 | self.regularizing_grad = self.eta_grad_L1 145 | else: 146 | self.regularizer_func = self.eta_L2 147 | self.regularizing_grad = self.eta_grad_L2 148 | 149 | def setAllSettings(self, c, v, kernel, beta, regularizer): 150 | self.setC(c) 151 | self.setV(v) 152 | self.setKernel(kernel,beta) 153 | self.setRegularizer(regularizer) 154 | 155 | #kernel will know which column indices belong to which sensor 156 | def constructKernelFunction(self, task): 157 | task_eta = self.eta[task,:] 158 | 159 | def overallKernel(X1,X2): #change to static 160 | K = np.zeros((len(X1),len(X2))) 161 | 162 | for m in range(self.n_views): 163 | sub_x1 = X1[:,self.modality_start_indices[m]:self.modality_start_indices[m+1]] 164 | sub_x2 = X2[:,self.modality_start_indices[m]:self.modality_start_indices[m+1]] 165 | 166 | internal_K = self.internal_kernel_func(sub_x1,sub_x2) 167 | 168 | K = K + task_eta[m] * internal_K/np.max(abs(internal_K)) 169 | 170 | return K 171 | 172 | return overallKernel 173 | 174 | def eta_L1(self): 175 | return -self.V*np.sum(np.dot(self.eta,self.eta.T)) 176 | 177 | def eta_L2(self): 178 | # Note that V should be positive 179 | return self.V*np.sum(euclidean_distances(self.eta,squared=True)) 180 | 181 | def eta_grad_L1(self, eta_mat,v,task_index): 182 | return 
-v*np.sum(eta_mat,axis=0) 183 | 184 | def eta_grad_L2(self, eta_mat,v,task_index): 185 | # Note that V should be positive 186 | return 2*v*np.sum(eta_mat[task_index,:]-eta_mat,axis=0) 187 | 188 | def computeObjectiveFunction(self,eta_from_fmin): 189 | eta_from_fmin = eta_from_fmin.reshape(self.n_tasks,-1) 190 | #if self.debug: print "eta:", eta_from_fmin 191 | if self.debug: print "sum eta per task:", np.sum(eta_from_fmin,axis=1) 192 | if self.save_etas: 193 | self.saveEtas() 194 | self.eta = eta_from_fmin 195 | 196 | #steps 1 and 2 of Kandemir algorithm 197 | for t in range(self.n_tasks): 198 | if self.debug: 199 | print "Training task", t 200 | print "etas have size", self.eta.shape 201 | sys.stdout.flush() 202 | 203 | X_t, Y_t = self.extractTaskData(self.train_tasks,t,drop20=self.drop20) 204 | 205 | overallKernel = self.constructKernelFunction(t) 206 | 207 | self.classifiers[t] = LSSVM.LSSVM(self.C,kernel_func=overallKernel) 208 | #SVC(C=self.C, kernel=overallKernel, probability=True, max_iter=self.max_iter_internal, tol=self.tolerance) 209 | converged = self.classifiers[t].fit(X_t, Y_t) 210 | assert converged 211 | 212 | 213 | 214 | # Compute the objective function 215 | obj_value = 0 216 | for t in range(self.n_tasks): 217 | X_t, Y_t = self.extractTaskData(self.train_tasks,t,drop20=self.drop20) 218 | 219 | alpha = self.classifiers[t].alphas 220 | 221 | overallKernel = self.constructKernelFunction(t) 222 | K = overallKernel(X_t,X_t) 223 | 224 | obj_value += sum(alpha)-(0.5*1.0/self.C)*sum(alpha**2) -(1.0/2.0)*(np.dot((alpha*Y_t).T,np.dot(K,alpha*Y_t))) 225 | 226 | # add regularizer 227 | obj_value += self.regularizer_func() 228 | 229 | if self.debug: 230 | print "obj function value:", obj_value 231 | print "Eta difference:",self.computeEtaDifference() 232 | print "Training ACC", self.predictAndGetAccuracy(self.train_tasks) 233 | print 234 | 235 | return obj_value 236 | 237 | 238 | # eta_mat has rows for tasks, columns for sensors 239 | def computeMatrixGradient(self,eta_from_fmin): 240 | update = np.zeros((self.n_tasks,self.n_views)) 241 | 242 | for t in range(self.n_tasks): 243 | X_t, Y_t = self.extractTaskData(self.train_tasks,t,drop20=self.drop20) 244 | 245 | alpha = self.classifiers[t].alphas 246 | alphaY = alpha*Y_t 247 | 248 | for m in range(self.n_views): #Used to be numSensors-1 249 | sub_x1 = X_t[:, self.modality_start_indices[m]:self.modality_start_indices[m+1]] 250 | sub_x2 = X_t[:, self.modality_start_indices[m]:self.modality_start_indices[m+1]] 251 | 252 | # Normalize the kernel, could also use k(i, j) = k (i, j) / sqrt(k(i,i) * k(j,j)) 253 | #note, the same procedure for finding the min of sub_x1 and sub_x2 that is used in 254 | #the overall kernel is not required here, since sub_x1 and sub_x2 are guaranteed 255 | #to be the same 256 | internal_K = self.internal_kernel_func(sub_x1,sub_x2) 257 | 258 | update[t,m] = -(1.0/2.0)*(np.dot(alphaY.T,np.dot(internal_K,alphaY))) 259 | 260 | grad_reg = self.regularizing_grad(eta_from_fmin.reshape(self.n_tasks,-1),self.V,t) 261 | 262 | update[t,:] = grad_reg + update[t,:] 263 | 264 | return update.flatten() 265 | 266 | def saveEtas(self): 267 | if self.eta_filename is not None: 268 | eta_file = open(self.eta_filename,'a') 269 | np.savetxt(eta_file,self.eta.flatten()) 270 | eta_file.close() 271 | 272 | def computeEtaDifference(self): 273 | max_diff = 0 274 | for t in range(self.n_tasks): 275 | last_eta_list = self.last_eta[t,:] 276 | eta_list = self.eta[t,:] 277 | 278 | norm = la.norm(last_eta_list - eta_list) 279 | 280 | if norm > 
max_diff: 281 | max_diff = norm 282 | return max_diff 283 | 284 | def createConstraintList(self): 285 | constraints = [] 286 | 287 | # Equality constraints 288 | for t in range(self.n_tasks): 289 | start = t*self.n_views 290 | end = (t+1)*self.n_views 291 | def fun_eq(x,start=start, end=end): 292 | res = np.array([np.sum(x[start:end])-1.0]) 293 | return res 294 | def jac_func(x,start=start,end=end): 295 | jac= np.zeros(self.n_tasks*self.n_views) 296 | jac[start:end] = 1.0 297 | return jac 298 | cons = {'type':'eq', 299 | 'fun':fun_eq, 300 | 'jac':jac_func} 301 | constraints.append(cons) 302 | 303 | # Inequality constraints 304 | for i in range(self.n_tasks*self.n_views): 305 | def jac_func(x,i=i): 306 | jac= np.zeros(self.n_tasks*self.n_views) 307 | jac[i] = 1.0 308 | return jac 309 | cons = {'type':'ineq', 310 | 'fun':lambda x,i=i: np.array([x[i]]), 311 | 'jac':jac_func} 312 | constraints.append(cons) 313 | 314 | return constraints 315 | 316 | def train(self): 317 | init_etas = self.eta.flatten() 318 | cons = self.createConstraintList() 319 | try: 320 | res = minimize(self.computeObjectiveFunction, init_etas, jac=self.computeMatrixGradient,constraints=cons, method='SLSQP', options={'disp': self.verbose,'maxiter':self.max_iter}) 321 | except: 322 | return False 323 | self.eta = res.x.reshape(self.n_tasks,-1) 324 | 325 | if self.verbose: 326 | print "Results of this run!" 327 | print "\t ETA", self.eta 328 | print "\t Training ACC", self.predictAndGetAccuracy(self.train_tasks) 329 | 330 | return True 331 | 332 | 333 | @staticmethod 334 | def extractTaskData(task_dict_list,t,drop20=False): 335 | X_t = task_dict_list[t]['X'] 336 | Y_t = (task_dict_list[t]['Y']).reshape(-1,1) 337 | 338 | if drop20: 339 | keep_indices = task_dict_list[t]['KeepIndices'] 340 | X_t = X_t[keep_indices] 341 | Y_t = Y_t[keep_indices] 342 | 343 | return X_t, Y_t 344 | 345 | def predict(self, task_dict_list): 346 | ''' input: task_dict_list in the usual format. Will not use the 'Y' key 347 | output: predictions for the y values for each task. 
So a list of lists, where each inner list 348 | is the y_hat values for a particular task''' 349 | Y_hat = [0] * len(task_dict_list) 350 | for t in range(len(task_dict_list)): 351 | Y_hat[t] = self.predictOneTask(task_dict_list,t) 352 | return Y_hat 353 | 354 | def predictOneTask(self, task_dict_list, t): 355 | X_t, y_t = self.extractTaskData(task_dict_list,t) 356 | if len(X_t) == 0: 357 | return None 358 | else: 359 | return self.internal_predict(X_t, int(t)) 360 | 361 | def internal_predict(self, X_t, t): 362 | return self.classifiers[t].predict(X_t).reshape(-1,1) 363 | 364 | def predict_01(self, X, t): 365 | preds = self.classifiers[t].predict(X).reshape(-1,1) 366 | return (preds + 1.0) / 2 367 | 368 | def getNumErrors(self, Y, Y_hat): 369 | #returns the number of misclassified points 370 | errors = np.where(Y * Y_hat < 0)[0] 371 | return len(errors) 372 | 373 | def getAccuracy(self, Y, Y_hat): 374 | score = self.getNumErrors(Y,Y_hat) 375 | return 1.0 - (float(score) / float(len(Y_hat))) 376 | 377 | def predictAndGetNumErrors(self,task_dict_list): 378 | Y_hat = self.predict(task_dict_list) 379 | return sum([self.getNumErrors(task_dict_list[t]['Y'], Y_hat[t]) for t in range(len(task_dict_list))]) 380 | 381 | def predictAndGetAccuracy(self,task_dict_list): 382 | Y_hat = self.predict(task_dict_list) 383 | accs = [] 384 | for t in range(len(task_dict_list)): 385 | accs.append(self.getAccuracy(task_dict_list[t]['Y'],Y_hat[t])) 386 | return np.mean(accs) 387 | 388 | def predictAndGetAccuracyOneTask(self,task_dict_list,t): 389 | Y_hat = self.predictOneTask(task_dict_list,t) 390 | return self.getAccuracy(task_dict_list[t]['Y'],Y_hat) 391 | 392 | def getAccuracyAucAllTasks(self, tasks): 393 | all_task_Y = [] 394 | all_preds = [] 395 | for t in range(len(tasks)): 396 | X_t, y_t = self.extractTaskData(tasks,t) 397 | if len(X_t) == 0: 398 | continue 399 | preds = self.internal_predict(X_t, int(t)) 400 | all_task_Y.extend(y_t) 401 | all_preds.extend(preds) 402 | auc = roc_auc_score(all_task_Y, all_preds) 403 | acc = helper.getBinaryAccuracy(all_preds,all_task_Y) 404 | return acc,auc 405 | 406 | def getAccuracyAucOnOneTask(self, task_list, task, debug=False): 407 | X_t, y_t = self.extractTaskData(task_list,task) 408 | if len(X_t) == 0: 409 | return np.nan, np.nan 410 | 411 | preds = self.internal_predict(X_t, int(task)) 412 | 413 | if debug: 414 | print "y_t:", y_t 415 | print "preds:", preds 416 | 417 | acc = helper.getBinaryAccuracy(preds,y_t) 418 | if len(y_t) > 1 and helper.containsEachSVMLabelType(y_t) and helper.containsEachSVMLabelType(preds): 419 | auc = roc_auc_score(y_t, preds) 420 | else: 421 | auc = np.nan 422 | 423 | return acc, auc 424 | 425 | def getAUC(self,test_tasks): 426 | mean_tpr = 0.0 427 | mean_fpr = np.linspace(0, 1, 100) 428 | for t in range(self.n_tasks): 429 | X_t, Y_t = self.extractTaskData(self.train_tasks,t) 430 | X_test_t, Y_test_t = self.extractTaskData(test_tasks, t) 431 | 432 | overallKernel = self.constructKernelFunction(t) 433 | 434 | self.classifiers[t] = SVC(C=self.C, kernel=overallKernel, probability=True, max_iter=self.max_iter_internal, tol=self.tolerance) 435 | probas_ = self.classifiers[t].fit(X_t, Y_t).predict_proba(X_test_t) 436 | fpr, tpr, thresholds = roc_curve(Y_test_t, probas_[:, 1]) 437 | 438 | mean_tpr += interp(mean_fpr, fpr, tpr) 439 | mean_tpr[0] = 0.0 440 | 441 | mean_tpr /= self.n_tasks 442 | mean_tpr[-1] = 1.0 443 | mean_auc = auc(mean_fpr, mean_tpr) 444 | 445 | return mean_auc, mean_fpr, mean_tpr 446 | 447 | def getAUCOneTask(self,test_tasks,t): 448 | 449 | 450 | X_t, Y_t = 
self.extractTaskData(self.train_tasks,t) 451 | X_test_t, Y_test_t = self.extractTaskData(test_tasks, t) 452 | 453 | overallKernel = self.constructKernelFunction(t) 454 | 455 | self.classifiers[t] = SVC(C=self.C, kernel=overallKernel, probability=True, max_iter=self.max_iter_internal, tol=self.tolerance) 456 | probas_ = self.classifiers[t].fit(X_t, Y_t).predict_proba(X_test_t) 457 | fpr, tpr, thresholds = roc_curve(Y_test_t, probas_[:, 1]) 458 | 459 | return auc(fpr, tpr), fpr, tpr 460 | 461 | def saveClassifierToFile(self, filepath): 462 | s = pickle.dumps(self.classifiers) #self.classifiers is the per-task list of trained models 463 | f = open(filepath, 'w') 464 | f.write(s) 465 | 466 | def loadClassifierFromFile(self, filepath): 467 | f2 = open(filepath, 'r') 468 | s2 = f2.read() 469 | self.classifiers = pickle.loads(s2) 470 | 471 | 472 | 473 | 474 | -------------------------------------------------------------------------------- /MTMKL/MTMKLWrapper.py: -------------------------------------------------------------------------------- 1 | """Performs hyperparameter sweep for Multi-task Multi-kernel Learning (MTMKL).""" 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import scipy.optimize as opt 7 | #from cvxopt import matrix, solvers 8 | import scipy.linalg as la 9 | import math 10 | from sklearn.svm import SVC 11 | from sklearn.metrics import roc_curve, auc 12 | from sklearn.metrics.pairwise import rbf_kernel 13 | from scipy import interp 14 | import pandas as pd 15 | import sys 16 | import os 17 | import random 18 | import pickle 19 | import copy 20 | import operator 21 | import datetime 22 | from time import time 23 | 24 | CODE_PATH = os.path.dirname(os.getcwd()) 25 | sys.path.append(CODE_PATH) 26 | 27 | DEFAULT_RESULTS_PATH = '/Your/path/here/' 28 | DEFAULT_DATASETS_PATH = '/Your/path/here/' 29 | DEFAULT_FIGURES_PATH = '/Your/path/here/' 30 | DEFAULT_ETAS_PATH = DEFAULT_RESULTS_PATH + 'etas/' 31 | 32 | import helperFuncs as helper 33 | import MTMKL as mtmkl 34 | 35 | USE_TENSORFLOW = False 36 | 37 | C_VALS = [1.0, 10.0, 100.0] #values for the C parameter of SVM to test 38 | B_VALS = [0.0001, 0.001, 0.01] 39 | V_VALS = [100.0, 10.0, 1.0, .1, .01] #a small V works well for MKL 40 | REGULARIZERS = ['L1','L2'] 41 | KERNELS = ['rbf','linear'] 42 | 43 | VALIDATION_TYPE = 'cross' 44 | DEFAULT_NUM_CROSS_FOLDS = 5 45 | SAVE_RESULTS_EVERY_X_TESTS = 1 46 | 47 | 48 | def reloadFiles(): 49 | reload(helper) 50 | reload(mtmkl) 51 | mtmkl.reloadFiles() 52 | 53 | 54 | class MTMKLWrapper: 55 | def __init__(self, file_prefix, users_as_tasks, user_clusters=True, eta_filename=None, regularizers=REGULARIZERS, tolerance = .0001, 56 | max_iter = 100, val_type=VALIDATION_TYPE, c_vals=C_VALS, beta_vals=B_VALS, 57 | v_vals = V_VALS, kernels=KERNELS, print_iters=False, optimize_labels=None, cont=False, test_run=False, 58 | results_path=DEFAULT_RESULTS_PATH, figures_path=DEFAULT_FIGURES_PATH, datasets_path=DEFAULT_DATASETS_PATH, 59 | etas_path=DEFAULT_ETAS_PATH, num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, drop20=False, 60 | test_csv_filename=None): 61 | self.results_path = results_path 62 | self.figures_path = figures_path 63 | self.datasets_path = datasets_path 64 | self.etas_path = etas_path 65 | self.file_prefix = file_prefix 66 | self.cont=cont 67 | self.val_type = val_type 68 | self.users_as_tasks = users_as_tasks 69 | self.cluster_users = user_clusters 70 | self.drop20=drop20 71 | if test_csv_filename is not None: 72 | self.test_csv_filename = self.datasets_path + 
test_csv_filename 73 | else: 74 | self.test_csv_filename = None 75 | self.save_prefix = self.getSavePrefix(file_prefix, replace=cont) 76 | 77 | self.test_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Test", fix_y=True) 78 | self.train_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Train", fix_y=True) 79 | if self.val_type != 'cross': 80 | self.val_tasks = helper.loadPickledTaskList(datasets_path, file_prefix, "Val", fix_y=True) 81 | 82 | # print dataset sizes 83 | print "Num train points:", sum([len(t['Y']) for t in self.train_tasks]) 84 | if self.val_type != 'cross': 85 | print "Num val points:", sum([len(t['Y']) for t in self.val_tasks]) 86 | print "Num test points:", sum([len(t['Y']) for t in self.test_tasks]) 87 | 88 | if self.val_type != 'cross': 89 | self.initializeMTMKLModel(self.train_tasks) 90 | else: 91 | self.classifier = None 92 | 93 | self.n_feats = helper.calculateNumFeatsInTaskList(self.test_tasks) 94 | self.n_tasks = len(self.test_tasks) 95 | 96 | if optimize_labels is None: 97 | self.optimize_labels = ['tomorrow_Group_Happiness_Evening_Label', 'tomorrow_Group_Health_Evening_Label', 'tomorrow_Group_Calmness_Evening_Label'] 98 | else: 99 | self.optimize_labels = optimize_labels 100 | 101 | self.c_vals = c_vals 102 | self.v_vals = v_vals 103 | self.kernels = kernels 104 | self.beta_vals=beta_vals 105 | self.regularizers = regularizers 106 | 107 | self.tolerance = tolerance 108 | self.max_iter = max_iter 109 | self.print_iters = print_iters 110 | 111 | if test_run: 112 | print "This is only a testing run. Using cheap settings to make it faster" 113 | self.c_vals = [100] 114 | self.beta_vals = [.01] 115 | self.kernels = ['linear'] 116 | self.v_vals = [1.0] 117 | self.regularizers = ['L1'] 118 | self.max_iter = 1 119 | 120 | self.calcNumSettingsDesired() 121 | 122 | #storing the results 123 | self.time_sum = 0 124 | if cont: 125 | self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.save_prefix + '.csv') 126 | print '\nPrevious validation results df loaded. 
It has', len(self.val_results_df), "rows" 127 | self.started_from = len(self.val_results_df) 128 | else: 129 | self.val_results_df = pd.DataFrame() 130 | self.started_from = 0 131 | 132 | self.num_cross_folds = num_cross_folds 133 | if self.val_type == 'cross': 134 | helper.generateCrossValPickleFiles(self.datasets_path, self.file_prefix, self.num_cross_folds) 135 | #helper.addKeepIndicesToCrossValPickleFiles(self.datasets_path, self.file_prefix, self.num_cross_folds, .80) 136 | 137 | def getSavePrefix(self, file_prefix, replace=False): 138 | name_modifier = "" 139 | if '/' in file_prefix: 140 | if "NoLocation" in file_prefix: 141 | name_modifier = "-noloc" 142 | slash_loc = file_prefix.find('/') 143 | path_modifier = file_prefix[0:slash_loc+1] 144 | file_prefix = file_prefix[slash_loc+1:] 145 | self.file_prefix = file_prefix 146 | self.datasets_path += path_modifier 147 | 148 | dash_loc = file_prefix.find('-') 149 | 150 | if self.users_as_tasks: 151 | task_str = '_users' 152 | else: 153 | task_str = '_wellbeing' 154 | 155 | prefix = "MTMKL" + task_str + file_prefix[dash_loc:-1] + name_modifier 156 | 157 | if not replace: 158 | while os.path.exists(self.results_path + prefix + '.csv'): 159 | prefix = prefix + '2' 160 | return prefix 161 | 162 | def calcNumSettingsDesired(self): 163 | self.num_settings = len(self.c_vals) * len(self.beta_vals) * len(self.kernels) \ 164 | * len(self.v_vals) * len(self.regularizers) 165 | 166 | # use something like the following to test only one set of parameters: 167 | # wrapper.setParams(tau10s=[.05], tau20s=[.05], sigma_multipliers=[.1,.01]) 168 | def setParams(self, c_vals=None, beta_vals=None, kernels=None, v_vals=None, regularizers=None): 169 | '''does not override existing parameter settings if the parameter is not set''' 170 | self.c_vals = c_vals if c_vals is not None else self.c_vals 171 | self.beta_vals = beta_vals if beta_vals is not None else self.beta_vals 172 | self.kernels = kernels if kernels is not None else self.kernels 173 | self.v_vals = v_vals if v_vals is not None else self.v_vals 174 | self.regularizers = regularizers if regularizers is not None else self.regularizers 175 | 176 | def settingAlreadyDone(self, C, beta, kernel, v, regularizer): 177 | if kernel == 'linear': 178 | if len(self.val_results_df[(self.val_results_df['C']== C) & \ 179 | (self.val_results_df['kernel']== kernel) & \ 180 | (self.val_results_df['v']== v) & \ 181 | (self.val_results_df['regularizer']== regularizer)]) > 0: 182 | print "setting already tested" 183 | return True 184 | else: 185 | return False 186 | else: 187 | if len(self.val_results_df[(self.val_results_df['C']== C) & \ 188 | (self.val_results_df['beta']== beta) & \ 189 | (self.val_results_df['kernel']== kernel) & \ 190 | (self.val_results_df['v']== v) & \ 191 | (self.val_results_df['regularizer']== regularizer)]) > 0: 192 | print "setting already tested" 193 | return True 194 | else: 195 | return False 196 | 197 | def initializeMTMKLModel(self, train_tasks, verbose=False): 198 | if USE_TENSORFLOW: 199 | self.classifier = mtmkl_tf.MTMKL(train_tasks,verbose=verbose,tol=self.tolerance, debug=False, max_iter=self.max_iter) 200 | else: 201 | self.classifier = mtmkl.MTMKL(train_tasks,verbose=verbose,tol=self.tolerance, debug=False, max_iter=self.max_iter, drop20PercentTrainingData=self.drop20) 202 | 203 | def setClassifierToSetting(self, C, beta, kernel, v, regularizer): 204 | self.classifier.setAllSettings(C, v, kernel, beta, regularizer) 205 | 206 | #must have called setValData for now 207 | def 
initializeAndTrainMTMKL(self, train_tasks, C, beta, kernel, v, regularizer, verbose=False): 208 | self.initializeMTMKLModel(train_tasks,verbose=verbose) 209 | self.setClassifierToSetting(C, beta, kernel, v, regularizer) 210 | converged = self.classifier.train() 211 | return converged 212 | 213 | def getValidationResults(self, results_dict, C, beta, kernel, v, regularizer): 214 | converged = self.initializeAndTrainMTMKL(self.train_tasks, C, beta, kernel, v, regularizer) 215 | 216 | if self.users_as_tasks: 217 | if not converged: 218 | val_acc = np.nan 219 | val_auc = np.nan 220 | else: 221 | val_acc, val_auc = self.classifier.getAccuracyAucAllTasks(self.val_tasks) 222 | results_dict['val_acc'] = val_acc 223 | results_dict['val_auc'] = val_auc 224 | else: 225 | accs = [] 226 | aucs = [] 227 | for t in range(self.n_tasks): 228 | if not converged: 229 | acc = np.nan 230 | auc = np.nan 231 | else: 232 | acc, auc = self.classifier.getAccuracyAucOnOneTask(self.val_tasks, t) 233 | task_name = self.val_tasks[t]['Name'] 234 | results_dict['TaskAcc-' + helper.getFriendlyLabelName(task_name)] = acc 235 | results_dict['TaskAuc-' + helper.getFriendlyLabelName(task_name)] = auc 236 | if self.cluster_users or task_name in self.optimize_labels: 237 | accs.append(acc) 238 | aucs.append(auc) 239 | results_dict['val_acc'] = np.mean(accs) 240 | results_dict['val_auc'] = np.mean(aucs) 241 | return results_dict 242 | 243 | def getCrossValidationResults(self, results_dict, C, beta, kernel, v, regularizer, save_plots=False,print_per_fold=True): 244 | all_acc = [] 245 | all_auc = [] 246 | all_f1 = [] 247 | all_precision = [] 248 | all_recall = [] 249 | if not self.users_as_tasks: 250 | per_task_accs = [[] for i in range(self.n_tasks)] 251 | per_task_aucs = [[] for i in range(self.n_tasks)] 252 | per_task_f1 = [[] for i in range(self.n_tasks)] 253 | per_task_precision = [[] for i in range(self.n_tasks)] 254 | per_task_recall = [[] for i in range(self.n_tasks)] 255 | 256 | for f in range(self.num_cross_folds): 257 | train_tasks, val_tasks = helper.loadCrossValData(self.datasets_path, self.file_prefix, f, reshape=False, fix_y=True) 258 | converged = self.initializeAndTrainMTMKL(train_tasks, C, beta, kernel, v, regularizer) 259 | if not converged: 260 | all_acc.append(np.nan) 261 | all_auc.append(np.nan) 262 | all_f1.append(np.nan) 263 | all_precision.append(np.nan) 264 | all_recall.append(np.nan) 265 | continue 266 | 267 | # Get results! 
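# Within each fold, the predictions from every task are pooled into
# fold_preds/fold_true_y before computing the overall fold metrics, while
# the per_task_* lists accumulate each task's own metrics across folds.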
268 | fold_preds = [] 269 | fold_true_y = [] 270 | for t in range(self.n_tasks): 271 | preds = self.classifier.predictOneTask(val_tasks,t) 272 | true_y = list(val_tasks[t]['Y'].flatten()) 273 | 274 | if not self.users_as_tasks: 275 | # save the per-task results 276 | t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y) 277 | per_task_accs[t].append(t_acc) 278 | per_task_aucs[t].append(t_auc) 279 | per_task_f1[t].append(t_f1) 280 | per_task_precision[t].append(t_precision) 281 | per_task_recall[t].append(t_recall) 282 | if print_per_fold: print "Fold", f, "Task", val_tasks[t]['Name'], "acc", t_acc, "auc", t_auc, "f1", t_f1, "precision",t_precision,"recall",t_recall 283 | 284 | fold_preds.extend(preds) 285 | fold_true_y.extend(true_y) 286 | 287 | 288 | acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(fold_preds, fold_true_y) 289 | all_acc.append(acc) 290 | all_auc.append(auc) 291 | all_f1.append(f1) 292 | all_precision.append(precision) 293 | all_recall.append(recall) 294 | if print_per_fold: print "Fold", f, "acc", acc, "auc", auc, "f1", f1, "precision",precision,"recall",recall 295 | 296 | print "accs for all folds", all_acc 297 | print "aucs for all folds", all_auc 298 | 299 | # Add results to the dictionary 300 | results_dict['val_acc'] = np.nanmean(all_acc) 301 | results_dict['val_auc'] = np.nanmean(all_auc) 302 | results_dict['val_f1'] = np.nanmean(all_f1) 303 | results_dict['val_precision'] = np.nanmean(all_precision) 304 | results_dict['val_recall'] = np.nanmean(all_recall) 305 | 306 | # Add per-task results to the dictionary 307 | if not self.users_as_tasks: 308 | for t in range(self.n_tasks): 309 | task_name = val_tasks[t]['Name'] 310 | results_dict['TaskAcc-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_accs[t]) 311 | results_dict['TaskAuc-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_aucs[t]) 312 | results_dict['TaskF1-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_f1[t]) 313 | results_dict['TaskPrecision-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_precision[t]) 314 | results_dict['TaskRecall-' + helper.getFriendlyLabelName(task_name)] = np.nanmean(per_task_recall[t]) 315 | 316 | return results_dict 317 | 318 | def testOneSetting(self, C, beta, kernel, v, regularizer): 319 | if self.cont: 320 | if self.settingAlreadyDone(C, beta, kernel, v, regularizer): 321 | return 322 | 323 | t0 = time() 324 | 325 | results_dict = {'C':C, 'beta': beta, 'kernel':kernel, 'v':v, 'regularizer':regularizer} 326 | print results_dict 327 | 328 | if self.val_type == 'cross': 329 | results_dict = self.getCrossValidationResults(results_dict, C, beta, kernel, v, regularizer) 330 | else: 331 | results_dict = self.getValidationResults(results_dict, C, beta, kernel, v, regularizer) 332 | 333 | self.val_results_df = self.val_results_df.append(results_dict,ignore_index=True) 334 | 335 | print "\n", self.val_results_df.tail(n=1) 336 | t1 = time() 337 | this_time = t1 - t0 338 | print "It took", this_time, "seconds to obtain this result" 339 | 340 | self.time_sum = self.time_sum + this_time 341 | 342 | self.printTimeEstimate() 343 | sys.stdout.flush() 344 | 345 | #output the file every few iterations for safekeeping 346 | if len(self.val_results_df) % SAVE_RESULTS_EVERY_X_TESTS == 0: 347 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 348 | 349 | def printTimeEstimate(self): 350 | num_done = len(self.val_results_df)-self.started_from 351 | 
num_remaining = self.num_settings - num_done - self.started_from 352 | avg_time = self.time_sum / num_done 353 | total_secs_remaining = int(avg_time * num_remaining) 354 | hours = total_secs_remaining / 60 / 60 355 | mins = (total_secs_remaining % 3600) / 60 356 | secs = (total_secs_remaining % 3600) % 60 357 | 358 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 359 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 360 | 361 | def sweepAllParameters(self): 362 | print "\nSweeping all parameters!" 363 | 364 | self.calcNumSettingsDesired() 365 | print "\nYou have chosen to test a total of", self.num_settings, "settings" 366 | sys.stdout.flush() 367 | 368 | #sweep all possible combinations of parameters 369 | for C in self.c_vals: 370 | for v in self.v_vals: 371 | for regularizer in self.regularizers: 372 | for kernel in self.kernels: 373 | if kernel == 'linear': 374 | self.testOneSetting(C, np.nan, kernel, v, regularizer) 375 | else: 376 | for beta in self.beta_vals: 377 | self.testOneSetting(C, beta, kernel, v, regularizer) 378 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 379 | 380 | def run(self): 381 | self.sweepAllParameters() 382 | return self.findBestSetting(criteria='AUC') 383 | 384 | 385 | def findBestSetting(self, criteria="accuracy", minimize=False, save_final_results=True): 386 | if criteria=="accuracy": 387 | search_col = 'val_acc' 388 | elif criteria=="AUC": 389 | search_col = 'val_auc' 390 | 391 | results = self.val_results_df[search_col].tolist() 392 | if minimize: 393 | best_result = min(results) 394 | opt_word = "minimized" 395 | else: 396 | best_result = max(results) 397 | opt_word = "maximized" 398 | best_idx = results.index(best_result) 399 | 400 | print "BEST SETTING!" 401 | print "Settings which", opt_word, "the", criteria, "were:" 402 | print self.val_results_df.iloc[best_idx] 403 | 404 | if save_final_results: 405 | self.getFinalResultsAndSave(self.val_results_df.iloc[best_idx]) 406 | else: 407 | return self.val_results_df.iloc[best_idx] 408 | 409 | def getFinalResultsAndSave(self, results_dict): 410 | print "\nRetraining on full training data with the best settings..." 411 | self.drop20=False 412 | self.initializeAndTrainMTMKL(self.train_tasks, results_dict['C'], results_dict['beta'], 413 | results_dict['kernel'], results_dict['v'], results_dict['regularizer'], 414 | verbose=True) 415 | 416 | print "\nEvaluating results on held-out test set!! ..." 417 | all_preds = [] 418 | all_true_y = [] 419 | per_task_accs = [np.nan] * self.n_tasks 420 | per_task_aucs = [np.nan] * self.n_tasks 421 | per_task_f1 = [np.nan] * self.n_tasks 422 | per_task_precision = [np.nan] * self.n_tasks 423 | per_task_recall = [np.nan] * self.n_tasks 424 | for t in range(self.n_tasks): 425 | preds = self.classifier.predictOneTask(self.test_tasks,t) 426 | true_y = list(self.test_tasks[t]['Y'].flatten()) 427 | 428 | if len(preds)==0 or len(true_y) == 0: 429 | print "no y for task", t, "... skipping" 430 | continue 431 | 432 | all_preds.extend(preds) 433 | all_true_y.extend(true_y) 434 | 435 | # save the per-task results 436 | t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y) 437 | per_task_accs[t] = t_acc 438 | per_task_aucs[t] = t_auc 439 | per_task_f1[t] = t_f1 440 | per_task_precision[t] = t_precision 441 | per_task_recall[t] = t_recall 442 | 443 | print "\nPlotting cool stuff about the final model..." 
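# self.classifier.eta is an (n_tasks x n_views) matrix of learned kernel
# weights: row t is task t's convex combination over the feature modalities,
# so tasks with similar rows rely on similar views of the data.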
444 | self.saveImagePlot(self.classifier.eta, 'Etas') 445 | pd.DataFrame(self.classifier.eta).to_csv(self.etas_path + self.save_prefix + "-etas.csv") 446 | 447 | print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS" 448 | acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(all_preds, all_true_y) 449 | print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall 450 | 451 | print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS" 452 | avg_acc = np.nanmean(per_task_accs) 453 | avg_auc = np.nanmean(per_task_aucs) 454 | avg_f1 = np.nanmean(per_task_f1) 455 | avg_precision = np.nanmean(per_task_precision) 456 | avg_recall = np.nanmean(per_task_recall) 457 | print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall 458 | 459 | print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK" 460 | if not self.users_as_tasks: 461 | for t in range(self.n_tasks): 462 | task_name = self.test_tasks[t]['Name'] 463 | task_name=helper.getFriendlyLabelName(task_name) 464 | print "\t\t", task_name, "- Acc:", per_task_accs[t], "AUC:", per_task_aucs[t], 'F1:', per_task_f1[t], 'Precision:', per_task_precision[t], 'Recall:', per_task_recall[t] 465 | 466 | if self.test_csv_filename is not None: 467 | print "\tSAVING HELD OUT PREDICTIONS" 468 | if 'Big5GenderKMeansCluster' in self.file_prefix: 469 | task_column = 'Big5GenderKMeansCluster' 470 | tasks_are_ints = True 471 | label_name = helper.getFriendlyLabelName(self.file_prefix) 472 | wanted_label = helper.getOfficialLabelName(label_name) 473 | predictions_df = helper.get_test_predictions_for_df_with_task_column( 474 | self.classifier.predict_01, self.test_csv_filename, task_column, self.test_tasks, 475 | wanted_label=wanted_label, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1], 476 | label_name=label_name, tasks_are_ints=tasks_are_ints) 477 | elif not self.users_as_tasks: 478 | predictions_df = helper.get_test_predictions_for_df_with_no_task_column(self.classifier.predict_01, 479 | self.test_csv_filename, self.test_tasks, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1]) 480 | else: 481 | print "Error! Cannot determine what type of model you are training and therefore cannot save predictions." 482 | return 483 | predictions_df.to_csv(self.results_path + "Preds-" + self.save_prefix + '.csv') 484 | else: 485 | print "Uh oh, the test csv filename was not set, can't save test preds" 486 | 487 | def saveImagePlot(self, matrix, name): 488 | plt.figure() 489 | plt.imshow(matrix) 490 | plt.savefig(self.figures_path + self.save_prefix + "-" + name + ".eps") 491 | plt.close() 492 | 493 | 494 | 495 | if __name__ == "__main__": 496 | print "MTMKL MODEL SELECTION" 497 | print "\tThis code will sweep a set of parameters to find the ideal settings for MTMKL for a single dataset" 498 | 499 | if len(sys.argv) < 3: 500 | print "Error: usage is python MTMKLWrapper.py <file_prefix> <task_type> <continue> <csv_test_file>" 501 | print "\t<file_prefix>: e.g. datasetTaskList-Discard-Future-Group_ - program will look in the following directory for this file", DEFAULT_DATASETS_PATH 502 | print "\t<task_type>: type 'users' for users as tasks, 'wellbeing' for wellbeing measures as tasks, or 'clusters' for user clusters as tasks" 503 | print "\t<continue>: optional. If 'True', the wrapper will pick up from where it left off by loading a previous validation results file" 504 | print "\t<csv_test_file>: optional. If you want to get the final test results, provide the name of a csv file to test on"
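# Example invocation (hypothetical file names, shown only for illustration):
#   python MTMKLWrapper.py datasetTaskList-Discard-Future-Group_ wellbeing True held_out_test.csv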
505 | sys.exit() 506 | filename = sys.argv[1] #get data file from command line argument 507 | print "\nLoading dataset", DEFAULT_DATASETS_PATH + filename 508 | print "" 509 | 510 | if sys.argv[2] == 'users': 511 | users_as_tasks = True 512 | cluster_users = False 513 | print "Okay, treating users as tasks. Will not print per-task results" 514 | elif sys.argv[2] == 'wellbeing': 515 | users_as_tasks = False 516 | cluster_users = False 517 | print "Okay, treating wellbeing measures as tasks. Will save and print per-task results" 518 | elif sys.argv[2] == 'clusters': 519 | users_as_tasks = False 520 | cluster_users = True 521 | print "Okay, treating user clusters as tasks. Will save and print per-task results and optimize for accuracy over all clusters." else: print "Error: <task_type> must be 'users', 'wellbeing', or 'clusters'" sys.exit() 522 | 523 | if len(sys.argv) >= 4 and sys.argv[3] == 'True': 524 | cont = True 525 | print "Okay, will continue from a previously saved validation results file for this problem" 526 | else: 527 | cont = False 528 | print "" 529 | 530 | if len(sys.argv) >= 5: 531 | csv_test_file = sys.argv[4] 532 | print "Okay, will get final test results on file", csv_test_file 533 | print "" 534 | else: 535 | csv_test_file = None 536 | 537 | if USE_TENSORFLOW: 538 | print "\nWill use the TENSORFLOW version of the code\n" 539 | 540 | wrapper = MTMKLWrapper(filename, users_as_tasks=users_as_tasks, user_clusters=cluster_users, cont=cont, 541 | test_csv_filename=csv_test_file) 542 | 543 | print "\nThe following parameter settings will be tested:" 544 | print "\tCs: \t", wrapper.c_vals 545 | print "\tbetas: \t", wrapper.beta_vals 546 | print "\tkernels: \t", wrapper.kernels 547 | print "\tvs: \t", wrapper.v_vals 548 | print "\tregularizers: \t", wrapper.regularizers 549 | 550 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv' 551 | print "\nThe validation and testing figures will be saved in:", wrapper.figures_path + wrapper.save_prefix 552 | 553 | wrapper.run() 554 | -------------------------------------------------------------------------------- /NeuralNetworks/tensorFlowWrapper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import tensorflow as tf 4 | import sys 5 | import os 6 | import pickle 7 | from time import time 8 | 9 | CODE_PATH = os.path.dirname(os.getcwd()) 10 | sys.path.append(CODE_PATH) 11 | 12 | DEFAULT_RESULTS_PATH = '/Your/path/here/' 13 | DEFAULT_DATASETS_PATH = '/Your/path/here/' 14 | DEFAULT_FIGURES_PATH = '/Your/path/here/' 15 | 16 | DEFAULT_VAL_TYPE = 'cross' 17 | OUTPUT_EVERY_NTH = 3 18 | 19 | 20 | import tensorFlowNetwork as tfnet 21 | import tensorFlowNetworkMultiTask as mtltf 22 | import helperFuncs as helper 23 | 24 | def reloadFiles(): 25 | reload(tfnet) 26 | reload(mtltf) 27 | reload(helper) 28 | tfnet.reloadHelper() 29 | mtltf.reloadFiles() 30 | 31 | class TensorFlowWrapper: 32 | def __init__(self, dataset_name, target_label=None, trial_name=None, multilabel=False, multitask=False, 33 | print_per_task=False, test_steps=9001, results_path=DEFAULT_RESULTS_PATH, 34 | datasets_path=DEFAULT_DATASETS_PATH, figures_path=DEFAULT_FIGURES_PATH, val_output_file=None, 35 | val_type=DEFAULT_VAL_TYPE, cont=False, architectures=None, test_csv_filename=None): 36 | assert not(multilabel and multitask) 37 | 38 | self.multilabel = multilabel 39 | self.multitask = multitask 40 | 
self.results_path = results_path 41 | self.figures_path = figures_path 42 | self.datasets_path = datasets_path 43 | self.dataset_name = dataset_name 44 | self.test_steps = test_steps 45 | self.val_type = val_type 46 | self.cont = cont 47 | self.print_per_task = print_per_task 48 | if test_csv_filename is not None: 49 | self.test_csv_filename = self.datasets_path + test_csv_filename 50 | else: 51 | self.test_csv_filename = None 52 | if cont: 53 | replace = True 54 | else: 55 | replace = False 56 | if trial_name is None and target_label is not None: 57 | trial_name = helper.getFriendlyLabelName(target_label) 58 | self.trial_name = trial_name 59 | self.val_output_prefix = self.getValOutputName(val_output_file, dataset_name, trial_name, replace=replace) 60 | 61 | #dataset stuff 62 | if multitask: 63 | train_tasks = pickle.load(open(self.datasets_path + dataset_name + "Train.p","rb")) 64 | val_tasks = pickle.load(open(self.datasets_path + dataset_name + "Val.p","rb")) 65 | test_tasks = pickle.load(open(self.datasets_path + dataset_name + "Test.p","rb")) 66 | 67 | self.net = mtltf.TensorFlowNetworkMTL(train_tasks, val_tasks, test_tasks, verbose=False, 68 | val_type=self.val_type, print_per_task=print_per_task) 69 | self.wanted_labels = self.net.optimize_labels 70 | else: 71 | self.data_df = pd.DataFrame.from_csv(self.datasets_path + self.dataset_name) 72 | self.wanted_feats = [x for x in self.data_df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and '_Label' not in x] 73 | if self.multilabel: 74 | self.wanted_labels = [x for x in self.data_df.columns.values if '_Label' in x and 'tomorrow_' in x and 'Evening' in x and 'Alertness' not in x and 'Energy' not in x] 75 | self.optimize_labels = [x for x in self.wanted_labels if 'tomorrow_' in x and 'Evening_' in x] 76 | else: 77 | self.wanted_labels = [target_label] 78 | 79 | #actual network 80 | self.net = tfnet.TensorFlowNetwork(self.data_df, self.wanted_feats, self.wanted_labels, optimize_labels=self.wanted_labels, 81 | multilabel=self.multilabel, verbose=False, val_type=self.val_type) 82 | 83 | #parameters that can be tuned: 84 | self.l2_regularizers = [1e-2, 1e-4] 85 | self.dropout = [True, False] 86 | self.decay = [True] 87 | self.decay_steps = [1000] 88 | self.decay_rates = [0.95] 89 | self.optimizers = [tf.train.AdamOptimizer] #[tf.train.AdagradOptimizer, tf.train.GradientDescentOptimizer 90 | self.train_steps =[5001] 91 | if multitask: 92 | self.batch_sizes = [20] 93 | self.learning_rates = [.01, .001, .0001] 94 | self.architectures = [[500,50],[300,20,10]] if architectures is None else architectures 95 | else: 96 | self.batch_sizes = [50,75] 97 | self.learning_rates = [.01, .001, .0001] 98 | self.architectures = [[1024,256],[500,50],[1024]] if architectures is None else architectures 99 | 100 | #storing the results 101 | self.time_sum = 0 102 | if cont: 103 | self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.val_output_prefix + '.csv') 104 | print '\nPrevious validation results df loaded. 
It has', len(self.val_results_df), "rows" 105 | self.started_from = len(self.val_results_df) 106 | else: 107 | self.val_results_df = pd.DataFrame() 108 | self.started_from = 0 109 | 110 | def getValOutputName(self, val_output_file, dataset_file, trial_name, replace=False): 111 | if self.multitask: 112 | multilabel_str = 'MTL_' 113 | elif self.multilabel: 114 | multilabel_str = 'multilabel_' 115 | else: 116 | multilabel_str = '' 117 | 118 | name_modifier = "" 119 | if '/' in dataset_file: 120 | if "NoLocation" in dataset_file: 121 | name_modifier = "-noloc" 122 | slash_loc = dataset_file.find('/') 123 | dataset_file = dataset_file[slash_loc+1:] 124 | 125 | if replace or val_output_file is None: 126 | val_output_file = 'nn_' + multilabel_str + dataset_file[0:-4] + name_modifier + "_" 127 | if trial_name is not None: 128 | val_output_file = val_output_file + trial_name 129 | if not replace: 130 | while os.path.exists(self.results_path + val_output_file + '.csv') \ 131 | or os.path.exists(self.figures_path + val_output_file + '.eps'): 132 | val_output_file = val_output_file + '2' 133 | return val_output_file 134 | 135 | def setNetworkArchitecturesToTest(self, architectures): 136 | self.architectures = architectures 137 | 138 | def constructNetwork(self, hidden_layers): 139 | if self.multitask: 140 | hidden_layers_shared = hidden_layers[:-1] 141 | hidden_task_nodes = hidden_layers[-1] 142 | connections_shared = ['full'] * (len(hidden_layers)) 143 | self.net.setUpNetworkStructure(hidden_layers_shared,hidden_task_nodes,connections_shared,['full','full']) 144 | else: 145 | connections = ['full'] * (len(hidden_layers)+1) 146 | self.net.setUpNetworkStructure(hidden_layers,connections) 147 | 148 | # use something like the following to test only one set of parameters: 149 | # wrapper.setParams(l2_regularizers=[1e-4], learning_rates=[.01], dropout=[True], decay=[True], batch_sizes=[50], optimizers=[tf.train.GradientDescentOptimizer]) 150 | def setParams(self, l2_regularizers=None, learning_rates=None, dropout=None, 151 | decay=None, decay_steps=None, decay_rates=None, batch_sizes=None, 152 | optimizers=None, train_steps=None): 153 | '''does not override existing parameter settings if the parameter is not set''' 154 | self.l2_regularizers = l2_regularizers if l2_regularizers is not None else self.l2_regularizers 155 | self.learning_rates = learning_rates if learning_rates is not None else self.learning_rates 156 | self.dropout= dropout if dropout is not None else self.dropout 157 | self.decay= decay if decay is not None else self.decay 158 | self.decay_steps= decay_steps if decay_steps is not None else self.decay_steps 159 | self.decay_rates= decay_rates if decay_rates is not None else self.decay_rates 160 | self.batch_sizes = batch_sizes if batch_sizes is not None else self.batch_sizes 161 | self.optimizers = optimizers if optimizers is not None else self.optimizers 162 | 163 | def settingAlreadyDone(self, hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps): 164 | if len(self.val_results_df[(self.val_results_df['hidden_layers']== str(hidden_layers)) & \ 165 | (self.val_results_df['l2_beta']== l2_beta) & \ 166 | (self.val_results_df['learning_rate']== lrate) & \ 167 | (self.val_results_df['dropout']== dropout) & \ 168 | (self.val_results_df['decay']== decay) & \ 169 | (self.val_results_df['decay_steps']== dsteps) & \ 170 | (self.val_results_df['decay_rate']== drate) & \ 171 | (self.val_results_df['batch_size']== bsize) & \ 172 | (self.val_results_df['optimizer']== 
str(opt))]) > 0: 173 | print "setting already tested" 174 | return True 175 | else: 176 | return False 177 | 178 | def testOneSetting(self, hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps, num_settings): 179 | print "Testing setting with layers", hidden_layers, "beta", l2_beta, "lrate", lrate, "dropout", dropout, "decay", decay, "dsteps", dsteps, "drate", drate, "bsize", bsize, "opt", opt, "tsteps", tsteps 180 | if self.cont: 181 | if self.settingAlreadyDone(hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps): 182 | return 183 | 184 | t0 = time() 185 | self.net.setParams(l2_beta=l2_beta, initial_learning_rate=lrate, decay=decay, 186 | decay_steps=dsteps, decay_rate=drate, batch_size=bsize, 187 | optimizer=opt, n_steps=tsteps, dropout=dropout) 188 | self.constructNetwork(hidden_layers) 189 | if self.val_type == 'cross': 190 | acc, auc, f1, precision, recall = self.net.trainAndCrossValidate() 191 | else: 192 | acc, auc, f1, precision, recall = self.net.trainAndValidate() 193 | 194 | results_dict = {'hidden_layers':hidden_layers, 'l2_beta': l2_beta, 'learning_rate': lrate, 195 | 'dropout': dropout, 'decay': decay, 'decay_steps': dsteps, 196 | 'decay_rate': drate, 'batch_size': bsize, 197 | 'optimizer': opt, 'val_acc': acc, 'val_auc':auc, 198 | 'val_f1':f1, 'val_precision':precision, 'val_recall':recall} 199 | if self.multitask: 200 | results_dict['train_nan_percent'] = self.net.train_nan_percent[-1] 201 | results_dict['val_nan_percent'] = self.net.val_nan_percent[-1] 202 | 203 | if self.multilabel or self.print_per_task: 204 | for label in self.wanted_labels: 205 | friendly_label = helper.getFriendlyLabelName(label) 206 | results_dict[friendly_label + '_acc'] = self.net.training_val_results_per_task['acc'][label][-1] 207 | results_dict[friendly_label + '_auc'] = self.net.training_val_results_per_task['auc'][label][-1] 208 | results_dict[friendly_label + '_f1'] = self.net.training_val_results_per_task['f1'][label][-1] 209 | results_dict[friendly_label + '_precision'] = self.net.training_val_results_per_task['precision'][label][-1] 210 | results_dict[friendly_label + '_recall'] = self.net.training_val_results_per_task['recall'][label][-1] 211 | self.val_results_df = self.val_results_df.append(results_dict,ignore_index=True) 212 | 213 | print self.val_results_df.tail(n=1) 214 | t1 = time() 215 | this_time = t1 - t0 216 | print "It took", this_time, "seconds to obtain this result" 217 | 218 | self.time_sum = self.time_sum + this_time 219 | 220 | self.printTimeEstimate(len(self.val_results_df)-self.started_from, num_settings) 221 | sys.stdout.flush() 222 | 223 | #output the file every few iterations for safekeeping 224 | if len(self.val_results_df) % OUTPUT_EVERY_NTH == 0: 225 | self.val_results_df.to_csv(self.results_path + self.val_output_prefix + '.csv') 226 | 227 | def printTimeEstimate(self, num_done, num_desired): 228 | num_remaining = num_desired - num_done 229 | avg_time = self.time_sum / num_done 230 | total_secs_remaining = int(avg_time * num_remaining) 231 | hours = total_secs_remaining / 60 / 60 232 | mins = (total_secs_remaining % 3600) / 60 233 | secs = (total_secs_remaining % 3600) % 60 234 | 235 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 236 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 237 | 238 | def calcNumSettingsPerStructure(self): 239 | num_settings = len(self.l2_regularizers) * len(self.learning_rates) * len(self.dropout) * len(self.decay) \ 
240 | * len(self.batch_sizes) * len(self.optimizers) * len(self.train_steps) 241 | if True in self.decay and (len(self.decay_steps) > 1 or len(self.decay_rates) > 1): 242 | num_settings = num_settings * ((len(self.decay_steps) * len(self.decay_rates)) / 2.0) 243 | return num_settings 244 | 245 | def sweepParameters(self, hidden_layers, num_settings): 246 | print "\nSweeping all parameters for structure:", hidden_layers 247 | 248 | #sweep all possible combinations of parameters 249 | for l2_beta in self.l2_regularizers: 250 | for lrate in self.learning_rates: 251 | for dropout in self.dropout: 252 | for bsize in self.batch_sizes: 253 | for opt in self.optimizers: 254 | for tsteps in self.train_steps: 255 | for decay in self.decay: 256 | if decay: 257 | for dsteps in self.decay_steps: 258 | for drate in self.decay_rates: 259 | self.testOneSetting(hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps, num_settings) 260 | else: 261 | #decay steps and decay rate don't matter if decay is set to false 262 | self.testOneSetting(hidden_layers, l2_beta, lrate, dropout, decay, 10000, 0.95, bsize, opt, tsteps, num_settings) 263 | self.val_results_df.to_csv(self.results_path + self.val_output_prefix + '.csv') 264 | 265 | def sweepStructuresAndParameters(self): 266 | num_settings = self.calcNumSettingsPerStructure() 267 | num_settings_total = num_settings * len(self.architectures) 268 | 269 | print "\nYou have chosen to test", num_settings, "settings for each of", len(self.architectures), "architectures" 270 | print "This is a total of", num_settings_total, "tests." 271 | for hidden_layers in self.architectures: 272 | self.sweepParameters(hidden_layers,num_settings_total) 273 | 274 | def findBestSetting(self, retrain_and_plot=True, optimize_for='val_auc'): 275 | accuracies = self.val_results_df[optimize_for].tolist() 276 | max_acc = max(accuracies) 277 | max_idx = accuracies.index(max_acc) 278 | best_setting = self.val_results_df.iloc[max_idx] 279 | 280 | print "BEST SETTING!" 281 | print "The highest", optimize_for, "of", max_acc, "was found with the following settings:" 282 | print best_setting 283 | 284 | best_setting = helper.fixSettingDictLoadedFromResultsDf(best_setting) 285 | 286 | if retrain_and_plot: 287 | self.retrainAndPlot(best_setting) 288 | else: 289 | return best_setting 290 | 291 | def retrainAndPlot(self, setting_dict): 292 | print "\nRETRAINING WITH THE BEST SETTINGS:" 293 | 294 | self.net.verbose = True 295 | self.net.setParams(l2_beta=setting_dict['l2_beta'], initial_learning_rate=setting_dict['learning_rate'], decay=setting_dict['decay'], 296 | decay_steps=setting_dict['decay_steps'], decay_rate=setting_dict['decay_rate'], batch_size=setting_dict['batch_size'], 297 | optimizer=setting_dict['optimizer'], dropout=setting_dict['dropout']) 298 | self.constructNetwork(setting_dict['hidden_layers']) 299 | 300 | self.net.setUpGraph() 301 | self.net.runGraph(self.test_steps, print_test=True) 302 | 303 | if self.multilabel: 304 | for label in self.optimize_labels: 305 | friendly_label = helper.getFriendlyLabelName(label) 306 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.eps', label=label) 307 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.png', label=label) 308 | print "Final validation results for", friendly_label,"... 
Acc:", \ 309 | self.net.training_val_results_per_task['acc'][label][-1], "Auc:", self.net.training_val_results_per_task['auc'][label][-1] 310 | elif self.print_per_task: 311 | for label in self.wanted_labels: 312 | friendly_label = helper.getFriendlyLabelName(label) 313 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.eps', label=label) 314 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '-' + friendly_label + '.png', label=label) 315 | print "Final validation results for", friendly_label,"... Acc:", \ 316 | self.net.training_val_results_per_task['acc'][label][-1], "Auc:", self.net.training_val_results_per_task['auc'][label][-1] 317 | else: 318 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '.eps') 319 | self.net.plotValResults(save_path=self.figures_path + self.val_output_prefix + '.png') 320 | print "Final AUC:", self.net.training_val_results['auc'][-1] 321 | 322 | if self.test_csv_filename is not None: 323 | if self.multitask: 324 | task_column = None 325 | if 'Cluster' in self.dataset_name: 326 | print "Guessing the task column is Big5GenderKMeansCluster - if this is incorrect expect errors" 327 | task_column = 'Big5GenderKMeansCluster' 328 | tasks_are_ints = True 329 | 330 | if 'User' in self.dataset_name: 331 | print "Guessing the task column is user_id - if this is incorrect expect errors" 332 | task_column = 'user_id' 333 | tasks_are_ints = False 334 | 335 | if task_column is not None: 336 | label_name = helper.getFriendlyLabelName(self.dataset_name) 337 | wanted_label = helper.getOfficialLabelName(label_name) 338 | test_preds_df = helper.get_test_predictions_for_df_with_task_column( 339 | self.net.predict, self.test_csv_filename, task_column, self.net.test_tasks, 340 | wanted_label=wanted_label, num_feats_expected=np.shape(self.net.test_tasks[0]['X'])[1], 341 | label_name=label_name, tasks_are_ints=tasks_are_ints) 342 | else: 343 | test_preds_df = helper.get_test_predictions_for_df_with_no_task_column(self.net.predict, self.test_csv_filename, 344 | self.net.test_tasks, 345 | num_feats_expected=np.shape(self.net.test_tasks[0]['X'])[1]) 346 | else: 347 | test_preds_df = self.net.get_preds_for_df() 348 | print "Got a test preds df! Saving it to:", self.results_path + "Preds-" + self.val_output_prefix + '.csv' 349 | test_preds_df.to_csv(self.results_path + 'Preds-' + self.val_output_prefix + '.csv') 350 | else: 351 | print "Uh oh, the test csv filename was not set, can't save test preds" 352 | 353 | print "Saving a copy of the final model!" 354 | self.net.save_model(self.val_output_prefix, self.results_path) 355 | 356 | 357 | def run(self): 358 | self.sweepStructuresAndParameters() 359 | self.findBestSetting() 360 | 361 | if __name__ == "__main__": 362 | print "TENSOR FLOW MODEL SELECTION" 363 | print "\tThis code will sweep a set of network architectures and parameters to find the ideal settings for a single dataset" 364 | 365 | if len(sys.argv) < 4: 366 | print "Error: usage is python tensorFlowWrapper.py " 367 | print "\t: e.g. dataset-Simple-Group.csv or datasetTaskList-Discard40-Future-Personal_ ... Program will look in the following directory for this file", DEFAULT_DATASETS_PATH 368 | print "\t:" 369 | print "\t\tFor single task learning, enter the name of the label you would like classify on. E.g. 
Group_Happiness_Evening_Label" 370 | print "\t\tFor multi task learning, in which the same net learns several tasks (like several wellbeing measures) enter: multilabel" 371 | print "\t\tFor multi task learning, in which each task gets its own piece of the network, but the first layers are shared (like users as tasks) enter: multitask" 372 | print "\t For wellbeing-ask-tasks use 'wellbeing', for users-as-tasks use 'users'" 373 | print "\t: optional. If 'True', the neural net will pick up from where it left off by loading a previous validation results file" 374 | print "\t: optional. If you want to get the final test results, provide the name of a csv file to test on" 375 | sys.exit() 376 | filename= sys.argv[1] #get data file from command line argument 377 | print "\nLoading dataset", DEFAULT_DATASETS_PATH + filename 378 | print "" 379 | 380 | multilabel = False 381 | multitask = False 382 | target_label = None 383 | if sys.argv[2] == 'multilabel': 384 | multilabel = True 385 | print "Performing multi-task classification, in which the same net is shared by all tasks" 386 | print "Optimizing for accuracy on tomorrow evening" 387 | elif sys.argv[2] == 'multitask': 388 | multitask = True 389 | print "Performing multi-task classification, in which each task gets it's own private final hidden layer" 390 | else: 391 | target_label = sys.argv[2] 392 | print "Performing single-task classification, classifying on", target_label 393 | 394 | if sys.argv[3] == 'wellbeing': 395 | print_per_task = True 396 | else: 397 | print_per_task = False 398 | 399 | if len(sys.argv) >= 5 and sys.argv[4] == 'True': 400 | cont = True 401 | print "Okay, will continue from a previously saved validation results file for this problem" 402 | else: 403 | cont = False 404 | print "" 405 | 406 | if len(sys.argv) >= 6: 407 | csv_test_file = sys.argv[5] 408 | print "Okay, will get final test results on file", csv_test_file 409 | print "" 410 | else: 411 | csv_test_file = None 412 | 413 | wrapper = TensorFlowWrapper(filename, target_label=target_label, multilabel=multilabel, multitask=multitask, 414 | print_per_task=print_per_task, cont=cont, test_csv_filename=csv_test_file) 415 | 416 | print "\nThe following parameter settings will be tested:" 417 | print "\tl2_regularizers: \t", wrapper.l2_regularizers 418 | print "\tlearning_rates: \t", wrapper.learning_rates 419 | print "\tdropout: \t", wrapper.dropout 420 | print "\tdecay: \t", wrapper.decay 421 | print "\tdecay_steps: \t", wrapper.decay_steps 422 | print "\tdecay_rates: \t", wrapper.decay_rates 423 | print "\tbatch_sizes: \t", wrapper.batch_sizes 424 | print "\toptimizers: \t", wrapper.optimizers 425 | print "\ttrain_steps: \t", wrapper.train_steps 426 | 427 | print "\nThe following network structures will be tested:" 428 | print "\t", wrapper.architectures 429 | 430 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.val_output_prefix + '.csv' 431 | print "\nThe validation accuracy figures will be saved in:", wrapper.figures_path + wrapper.val_output_prefix + '.eps' 432 | 433 | wrapper.run() -------------------------------------------------------------------------------- /NeuralNetworks/tensorFlowWrapperSTL.py: -------------------------------------------------------------------------------- 1 | """Performs a hyperparameter sweep for the Single Task Learning (STL) neural 2 | network.""" 3 | import pandas as pd 4 | import numpy as np 5 | import tensorflow as tf 6 | import sys 7 | import os 8 | import pickle 9 | import copy 10 | 
from time import time 11 | 12 | CODE_PATH = os.path.dirname(os.getcwd()) 13 | sys.path.append(CODE_PATH) 14 | 15 | DEFAULT_RESULTS_PATH = '/Your/path/here/' 16 | DEFAULT_DATASETS_PATH = '/Your/path/here/' 17 | DEFAULT_FIGURES_PATH = '/Your/path/here/' 18 | 19 | import tensorFlowNetwork as tfnet 20 | import helperFuncs as helper 21 | 22 | DEFAULT_VAL_TYPE = 'cross' 23 | DEFAULT_NUM_CROSS_FOLDS = 5 24 | SAVE_RESULTS_EVERY_X_TESTS = 1 25 | 26 | def reloadFiles(): 27 | reload(helper) 28 | reload(tfnet) 29 | tfnet.reloadHelper() 30 | 31 | class TensorFlowSTLWrapper: 32 | 33 | def __init__(self, dataset_name, target_label, users_as_tasks=True, test_steps=9001, val_output_file=None, 34 | val_type=DEFAULT_VAL_TYPE, cont=False, results_path=DEFAULT_RESULTS_PATH, 35 | datasets_path=DEFAULT_DATASETS_PATH, figures_path=DEFAULT_FIGURES_PATH, architectures=None, 36 | num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, test_run=False, redo_test=False): 37 | self.datasets_path = datasets_path 38 | self.cont = cont 39 | self.val_type = val_type 40 | self.num_cross_folds = num_cross_folds 41 | self.test_steps = test_steps 42 | self.redo_test = redo_test 43 | self.users_as_tasks = users_as_tasks 44 | self.target_label = target_label 45 | if self.users_as_tasks: 46 | self.results_path = results_path + 'STL-OneModelPerUser/' 47 | self.figures_path = figures_path + 'STL-OneModelPerUser/' 48 | else: 49 | self.results_path = results_path + 'STL-Wellbeing/' 50 | self.figures_path = figures_path + 'STL-Wellbeing/' 51 | self.save_prefix = self.getSavePrefix(dataset_name, target_label, replace=cont) 52 | 53 | self.dataset_name = dataset_name 54 | self.data_df = pd.DataFrame.from_csv(self.datasets_path + self.dataset_name) 55 | self.wanted_feats = [x for x in self.data_df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and '_Label' not in x] 56 | if self.users_as_tasks: 57 | self.wanted_labels = [target_label] 58 | self.n_tasks = len(self.data_df['user_id'].unique()) 59 | else: 60 | self.wanted_labels = [x for x in self.data_df.columns.values if '_Label' in x and 'tomorrow_' in x and 'Evening' in x and 'Alertness' not in x and 'Energy' not in x] 61 | self.n_tasks = len(self.wanted_labels) 62 | 63 | #parameters that can be tuned: 64 | self.l2_regularizers = [1e-2, 1e-4] 65 | self.dropout = [True, False] 66 | self.decay = [True] 67 | self.decay_steps = [10000] 68 | self.decay_rates = [0.95] 69 | self.optimizers = [tf.train.AdamOptimizer] #[tf.train.AdagradOptimizer, tf.train.GradientDescentOptimizer 70 | self.train_steps =[4001] 71 | self.batch_sizes = [5,10,20] 72 | self.learning_rates = [.01, .001] 73 | self.architectures = [[100],[50,5],[100,10]] if architectures is None else architectures 74 | 75 | self.test_run = test_run 76 | if test_run: 77 | print "This is only a testing run. Using cheap settings to make it faster" 78 | self.l2_regularizers = [1e-2] 79 | self.dropout = [True] 80 | self.decay = [True] 81 | self.decay_steps = [10000] 82 | self.decay_rates = [0.95] 83 | self.optimizers = [tf.train.AdamOptimizer] #[tf.train.AdagradOptimizer, tf.train.GradientDescentOptimizer 84 | self.train_steps =[1001] 85 | self.batch_sizes = [10] 86 | self.learning_rates = [.001] 87 | self.architectures = [[100],[50,5]] if architectures is None else architectures 88 | 89 | self.calcNumSettingsDesired() 90 | 91 | #storing the results 92 | self.time_sum = 0 93 | if cont: 94 | self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.save_prefix + '.csv') 95 | print '\nPrevious validation results df loaded. 
It has', len(self.val_results_df), "rows"
96 | self.started_from = len(self.val_results_df)
97 | else:
98 | self.val_results_df = pd.DataFrame()
99 | self.started_from = 0
100 | 
101 | # store for computing the accuracy/auc the unfair way
102 | self.cumulative_test_preds = []
103 | self.cumulative_test_true = []
104 | 
105 | def getSavePrefix(self, file_name, target_label, replace=False):
106 | if '/' in file_name:
107 | slash_loc = file_name.find('/')
108 | file_name = file_name[slash_loc:]
109 | dash_loc = file_name.find('-')
110 | if self.users_as_tasks:
111 | task_name = "tfSTLUsers"
112 | label_name = '-' + helper.getFriendlyLabelName(target_label)
113 | else:
114 | task_name = "tfSTLWellbeing"
115 | label_name = ""
116 | prefix = task_name + file_name[dash_loc:-4] + label_name
117 | if not replace:
118 | while os.path.exists(self.results_path + prefix + '.csv'):
119 | prefix = prefix + '2'
120 | return prefix
121 | 
122 | def calcNumSettingsDesired(self):
123 | self.num_settings = len(self.l2_regularizers) * len(self.learning_rates) * len(self.dropout) * len(self.decay) \
124 | * len(self.batch_sizes) * len(self.optimizers) * len(self.train_steps) * len(self.architectures)
125 | if True in self.decay and (len(self.decay_steps) > 1 or len(self.decay_rates) > 1):
126 | self.num_settings = self.num_settings * ((len(self.decay_steps) * len(self.decay_rates)) / 2.0)
127 | 
128 | # use something like the following to test only one set of parameters:
129 | # wrapper.setParams(l2_regularizers=[1e-4], learning_rates=[.01], dropout=[True], decay=[True], batch_sizes=[50], optimizers=[tf.train.GradientDescentOptimizer])
130 | def setParams(self, l2_regularizers=None, learning_rates=None, dropout=None,
131 | decay=None, decay_steps=None, decay_rates=None, batch_sizes=None,
132 | optimizers=None, train_steps=None):
133 | '''does not override existing parameter settings if the parameter is not set'''
134 | self.l2_regularizers = l2_regularizers if l2_regularizers is not None else self.l2_regularizers
135 | self.learning_rates = learning_rates if learning_rates is not None else self.learning_rates
136 | self.dropout = dropout if dropout is not None else self.dropout
137 | self.decay = decay if decay is not None else self.decay
138 | self.decay_steps = decay_steps if decay_steps is not None else self.decay_steps
139 | self.decay_rates = decay_rates if decay_rates is not None else self.decay_rates
140 | self.batch_sizes = batch_sizes if batch_sizes is not None else self.batch_sizes
141 | self.optimizers = optimizers if optimizers is not None else self.optimizers
142 | self.train_steps = train_steps if train_steps is not None else self.train_steps
143 | def settingAlreadyDone(self, task):
144 | if len(self.val_results_df[(self.val_results_df['task_name'] == task)]) > 0:
145 | print "setting already tested"
146 | return True
147 | else:
148 | return False
149 | 
150 | def getResultsDictFromRow(self,row_df):
151 | best_results_dict = dict()
152 | for col in row_df.columns.values:
153 | best_results_dict[col] = row_df[col].tolist()[0]
154 | 
155 | for arch in self.architectures:
156 | if str(arch) == best_results_dict['hidden_layers']:
157 | best_results_dict['hidden_layers'] = arch
158 | 
159 | for opt_func in self.optimizers:
160 | if str(opt_func) == best_results_dict['optimizer']:
161 | best_results_dict['optimizer'] = opt_func
162 | 
163 | return best_results_dict
164 | 
165 | def constructNetwork(self, hidden_layers):
166 | connections = ['full'] * (len(hidden_layers)+1)
167 | self.net.setUpNetworkStructure(hidden_layers, connections)
168 | 
169 | def sweepParametersForOneTask(self, task_name, 
target_label):
170 | if self.users_as_tasks:
171 | task_df = self.data_df[self.data_df['user_id'] == task_name]
172 | else:
173 | task_df = self.data_df
174 | self.net = tfnet.TensorFlowNetwork(task_df, copy.deepcopy(self.wanted_feats), self.wanted_labels, verbose=False, val_type=self.val_type)
175 | 
176 | if len(self.net.train_X) == 0 or len(self.net.train_y) == 0:
177 | print "No training data for this task!"
178 | return dict()
179 | if len(self.net.test_X) == 0:
180 | print "No testing data for this task! Skipping"
181 | return dict()
182 | if np.shape(self.net.train_X)[1] == 0:
183 | print "All columns were null, this task has no features left!"
184 | return dict()
185 | if len(self.net.train_X) != len(self.net.train_y):
186 | print "Unequal length of X and Y dataframe!"
187 | return dict()
188 | 
189 | df = pd.DataFrame()
190 | 
191 | #sweep all possible combinations of parameters
192 | print "...sweeping all parameters for this task..."
193 | for hidden_layers in self.architectures:
194 | for l2_beta in self.l2_regularizers:
195 | for lrate in self.learning_rates:
196 | for dropout in self.dropout:
197 | for bsize in self.batch_sizes:
198 | for opt in self.optimizers:
199 | for tsteps in self.train_steps:
200 | for decay in self.decay:
201 | if decay:
202 | for dsteps in self.decay_steps:
203 | for drate in self.decay_rates:
204 | results_dict = self.testOneSettingForOneTask(hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps)
205 | df = df.append(results_dict, ignore_index=True)
206 | else:
207 | #decay steps and decay rate don't matter if decay is set to false
208 | results_dict = self.testOneSettingForOneTask(hidden_layers, l2_beta, lrate, dropout, decay, 10000, 0.95, bsize, opt, tsteps)
209 | df = df.append(results_dict, ignore_index=True)
210 | 
211 | accuracies = df['val_acc'].tolist()
212 | max_acc = max(accuracies)
213 | max_idx = accuracies.index(max_acc)
214 | 
215 | best_results_dict = df.iloc[max_idx]
216 | 
217 | #retrain with the best settings
218 | 
219 | test_acc, test_auc, test_preds = self.getFinalResultsForTask(best_results_dict)
220 | self.cumulative_test_preds.extend(test_preds)
221 | self.cumulative_test_true.extend(self.net.test_y) # the true test labels, not the test_X feature matrix; assumes the network exposes test_y alongside test_X
222 | 
223 | best_results_dict['test_acc'] = test_acc
224 | best_results_dict['test_auc'] = test_auc
225 | return best_results_dict
226 | 
227 | def find_best_setting(self, task):
228 | df = self.val_results_df[self.val_results_df['task_name'] == task]
229 | accuracies = df['val_acc'].tolist()
230 | max_acc = max(accuracies)
231 | max_idx = accuracies.index(max_acc)
232 | 
233 | best_results_dict = df.iloc[max_idx]
234 | return helper.fixSettingDictLoadedFromResultsDf(best_results_dict)
235 | 
236 | def getFinalResultsForTask(self, setting_dict):
237 | if self.users_as_tasks:
238 | task_df = self.data_df[self.data_df['user_id'] == setting_dict['task_name']]
239 | target_label = [self.target_label]
240 | else:
241 | task_df = self.data_df
242 | target_label = [helper.getOfficialLabelName(setting_dict['task_name'])]
243 | self.net = tfnet.TensorFlowNetwork(task_df, copy.deepcopy(self.wanted_feats), target_label, verbose=False, val_type=self.val_type)
244 | self.net.setParams(l2_beta=setting_dict['l2_beta'], initial_learning_rate=setting_dict['learning_rate'], decay=setting_dict['decay'],
245 | decay_steps=setting_dict['decay_steps'], decay_rate=setting_dict['decay_rate'], batch_size=setting_dict['batch_size'],
246 | optimizer=setting_dict['optimizer'], dropout=setting_dict['dropout'])
247 | 
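# Rebuild the graph for the chosen architecture, retrain on this task's full training data, and evaluate once on the held-out test split (runGraph returns the test predictions).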
self.constructNetwork(setting_dict['hidden_layers']) 248 | 249 | self.net.setUpGraph() 250 | preds = self.net.runGraph(self.test_steps, print_test=True, return_test_preds=True) 251 | 252 | preds_df = self.net.get_preds_for_df() 253 | label_name = setting_dict['task_name'] 254 | preds_df.to_csv(self.results_path + "Preds-" + self.save_prefix + label_name + '.csv') 255 | print "Preds df saved to", self.results_path + "Preds-" + self.save_prefix + label_name + '.csv' 256 | 257 | return self.net.final_test_results['acc'], self.net.final_test_results['auc'], preds 258 | 259 | def testOneSettingForOneTask(self, hidden_layers, l2_beta, lrate, dropout, decay, dsteps, drate, bsize, opt, tsteps): 260 | self.net.setParams(l2_beta=l2_beta, initial_learning_rate=lrate, decay=decay, 261 | decay_steps=dsteps, decay_rate=drate, batch_size=bsize, 262 | optimizer=opt, n_steps=tsteps, dropout=dropout) 263 | self.constructNetwork(hidden_layers) 264 | if self.val_type == 'cross': 265 | val_acc, val_auc, val_f1, val_prec, val_recall = self.net.trainAndCrossValidate() 266 | else: 267 | val_acc, val_auc, val_f1, val_prec, val_recall = self.net.trainAndValidate() 268 | 269 | results_dict = {'hidden_layers':hidden_layers, 'l2_beta': l2_beta, 'learning_rate': lrate, 270 | 'dropout': dropout, 'decay': decay, 'decay_steps': dsteps, 271 | 'decay_rate': drate, 'batch_size': bsize, 272 | 'optimizer': opt, 'val_acc': val_acc, 'val_auc':val_auc} 273 | 274 | return results_dict 275 | 276 | 277 | def runOneTask(self, task, target_label): 278 | print "\nRunning task", task 279 | if self.cont: 280 | if self.settingAlreadyDone(task): 281 | if self.redo_test: 282 | self.redoTestResult(task) 283 | best_setting = self.find_best_setting(task) 284 | print "The setting that produced the best validation results for task", task, "was:" 285 | print best_setting 286 | self.getFinalResultsForTask(best_setting) 287 | return 288 | 289 | t0 = time() 290 | 291 | results_dict = self.sweepParametersForOneTask(task, target_label) 292 | results_dict['task_name'] = task 293 | self.val_results_df = self.val_results_df.append(results_dict,ignore_index=True) 294 | 295 | print "\n", self.val_results_df.tail(n=1) 296 | t1 = time() 297 | this_time = t1 - t0 298 | print "It took", this_time, "seconds to obtain this result" 299 | 300 | self.time_sum = self.time_sum + this_time 301 | 302 | self.printTimeEstimate() 303 | sys.stdout.flush() 304 | 305 | #output the file every few iterations for safekeeping 306 | if len(self.val_results_df) % SAVE_RESULTS_EVERY_X_TESTS == 0: 307 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 308 | 309 | def printTimeEstimate(self): 310 | num_done = len(self.val_results_df)-self.started_from 311 | num_remaining = self.n_tasks - num_done - self.started_from 312 | avg_time = self.time_sum / num_done 313 | total_secs_remaining = int(avg_time * num_remaining) 314 | hours = total_secs_remaining / 60 / 60 315 | mins = (total_secs_remaining % 3600) / 60 316 | secs = (total_secs_remaining % 3600) % 60 317 | 318 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 319 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 320 | 321 | def run(self): 322 | print "\nYou have chosen to test a total of", self.num_settings, "settings for each task" 323 | print "There are", self.n_tasks, "tasks, meaning you are training a total of..." 324 | print "\t", self.num_settings * self.n_tasks, "neural networks!!" 
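# For reference: with the default (non-test-run) grid defined in __init__ above (2 L2 betas x 2 learning rates x 2 dropout options x 3 batch sizes x 3 architectures, and a single choice each for decay, optimizer, and train steps), self.num_settings works out to 72 per task.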
325 | sys.stdout.flush()
326 | 
327 | if self.users_as_tasks:
328 | tasks = self.data_df['user_id'].unique()
329 | else:
330 | tasks = [helper.getFriendlyLabelName(x) for x in self.wanted_labels]
331 | 
332 | 
333 | for i in range(len(tasks)):
334 | if self.users_as_tasks:
335 | self.runOneTask(tasks[i], self.target_label)
336 | else:
337 | self.runOneTask(tasks[i], self.wanted_labels[i])
338 | if self.test_run and i > 2:
339 | break
340 | 
341 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv')
342 | 
343 | if self.users_as_tasks:
344 | print "\n\nFINAL RESULTS - Averaging individual models:"
345 | print "\tValidation set: Accuracy =", np.nanmean(self.val_results_df['val_acc']), "AUC = ", np.nanmean(self.val_results_df['val_auc'])
346 | print "\tTest set: Accuracy =", np.nanmean(self.val_results_df['test_acc']), "AUC = ", np.nanmean(self.val_results_df['test_auc'])
347 | print ""
348 | print "FINAL RESULTS - Aggregating predictions of individual models"
349 | agg_auc = helper.computeAuc(self.cumulative_test_preds, self.cumulative_test_true)
350 | agg_acc = helper.getBinaryAccuracy(self.cumulative_test_preds, self.cumulative_test_true)
351 | print "\tTest set: Accuracy =", agg_acc, "AUC = ", agg_auc
352 | 
353 | 
354 | if __name__ == "__main__":
355 | print "TENSOR FLOW STL MODEL SELECTION"
356 | print "\tFor each task individually, this code will sweep a set of network architectures and parameters to find the ideal settings"
357 | print "\tIt will record the settings, validation and test results for each user"
358 | 
359 | if len(sys.argv) < 3:
360 | print "Error: usage is python tensorFlowWrapperSTL.py <data file> <task type> <target label> <continue> <redo>"
361 | print "\t<data file>: e.g. dataset-Simple-Group.csv - program will look in the following directory for this file", DEFAULT_DATASETS_PATH
362 | print "\t<task type>: type 'users' for users as tasks, or 'wellbeing' for wellbeing measures as tasks"
363 | print "\t<target label>: Only required for users-as-tasks. Enter the name of the label you would like to classify on. E.g. tomorrow_Group_Happiness_Evening_Label."
364 | print "\t<continue>: optional. If 'True', the neural net will pick up from where it left off by loading a previous validation results file"
365 | print "\t<redo>: optional. If 'redo' the neural net will go through the saved validation results file and compute test predictions for each user for each setting. It will collect all the preds and only compute AUC at the end"
366 | sys.exit()
367 | filename = sys.argv[1] #get data file from command line argument
368 | task_type = sys.argv[2]
369 | if len(sys.argv) >= 4:
370 | target_label = sys.argv[3]
371 | print "Classifying on target label:", target_label
372 | else:
373 | target_label = None
374 | 
375 | print "\nLoading dataset", DEFAULT_DATASETS_PATH + filename
376 | if task_type == 'wellbeing':
377 | users_as_tasks = False
378 | print "Performing wellbeing-as-tasks classification\n"
379 | else:
380 | users_as_tasks = True
381 | print "Performing users-as-tasks classification\n"
382 | 
383 | if len(sys.argv) >= 5 and sys.argv[4] == 'True':
384 | cont = True
385 | print "Okay, will continue from a previously saved validation results file for this problem"
386 | else:
387 | cont = False
388 | print ""
389 | 
390 | redo = False
391 | if len(sys.argv) >= 6 and sys.argv[5] == 'redo':
392 | redo = True
393 | print "Okay, will redo all the test results to get a better AUC"
394 | 
395 | 
396 | wrapper = TensorFlowSTLWrapper(filename, target_label=target_label, users_as_tasks=users_as_tasks, cont=cont,
397 | results_path=DEFAULT_RESULTS_PATH, datasets_path=DEFAULT_DATASETS_PATH, figures_path=DEFAULT_FIGURES_PATH)
398 | 
399 | if not redo:
400 | print "\nThe following parameter settings will be tested for each task:"
401 | print "\tl2_regularizers: \t", wrapper.l2_regularizers
402 | print "\tlearning_rates: \t", wrapper.learning_rates
403 | print "\tdropout: \t", wrapper.dropout
404 | print "\tdecay: \t", wrapper.decay
405 | print "\tdecay_steps: \t", wrapper.decay_steps
406 | print "\tdecay_rates: \t", wrapper.decay_rates
407 | print "\tbatch_sizes: \t", wrapper.batch_sizes
408 | print "\toptimizers: \t", wrapper.optimizers
409 | print "\ttrain_steps: \t", wrapper.train_steps
410 | 
411 | print "\nThe following network structures will be tested:"
412 | print "\t", wrapper.architectures
413 | 
414 | print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv'
415 | print "\nThe validation accuracy figures will be saved in:", wrapper.figures_path + wrapper.save_prefix + '.eps'
416 | 
417 | wrapper.run()
418 | else:
419 | wrapper.redoAllTestsResults()
420 | 
421 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Personalized Multitask Learning
2 | This repo contains code for 3 multitask machine learning methods: deep neural networks, Multitask Multi-kernel Learning (MTMKL), and a hierarchical Bayesian model (HBLR). These methods can be used to personalize the prediction of outcomes like stress or happiness to each individual, by treating the prediction of each individual's outcome (or each cluster of related individuals' outcomes) as its own task.
3 | 
4 | The code is related to two research papers which explain this approach in further detail:
5 | 
6 | Taylor, S.\*, Jaques, N.\*, Nosakhare, E., Sano, A., Picard, R., "Personalized Multitask Learning for Predicting Tomorrow’s Mood, Stress, and Health", IEEE Transactions on Affective Computing, December 2017. (\*equal contribution) PDF
7 | 
8 | Jaques, N.\*, Taylor, S.\*, Nosakhare, E., Sano, A., Picard, R., "Multi-task Learning for Predicting Health, Stress, and Happiness", NIPS Workshop on Machine Learning for Healthcare, December 2016, Barcelona, Spain. (\*equal contribution) PDF *BEST PAPER AWARD*
9 | 
10 | If you find this code useful, please cite our work!
11 | 
12 | If you have any questions about this code or the associated papers, please email us at jaquesn@mit.edu or sataylor@mit.edu.
13 | 
14 | # Models in this code:
15 | 
16 | ## Multitask Neural Network (MTL-NN)
17 | 
18 | ![image](mtl_nn_clusters.png)
19 | 
20 | The intuition behind the multitask neural network design is that the shared layers will learn to extract information
21 | that is useful for summarizing relevant characteristics of any person’s day into an efficient, generalizable embedding.
22 | The final, task-specific layers are then expected to learn how to map this embedding to a prediction customized for each person or cluster of people.
23 | 
24 | For example, if the shared layers learn to condense all of the relevant smartphone app data about phone calls and
25 | texting into an aggregate measure of social support, the task-specific layers can then learn a unique weighting of this
26 | measure for each cluster of participants. Perhaps a cluster containing participants with high extroversion scores will
27 | be more strongly affected by a lack of social support than another cluster.
28 | 
29 | ## Multitask Multi-kernel Learning (MTMKL)
30 | 
31 | MTMKL (originally developed by Kandemir
32 | et al.) is a modified version of Multi-Kernel Learning (MKL) in which tasks
33 | share information through kernel weights on the modalities. MTMKL uses a least-squares support vector machine (LSSVM)
34 | for each task-specific model. Unlike the canonical SVM, the LSSVM uses a quadratic error on the “slack” variables
35 | instead of an L1 error. As a result, the LSSVM can be learned by solving a series of linear equations, in contrast to
36 | using quadratic programming to learn a canonical SVM model.
37 | 
38 | 
39 | ## Hierarchical Bayesian Logistic Regression (HBLR)
40 | 
41 | In hierarchical Bayesian MTL approaches, the model for each task draws its parameters from a common prior distribution.
42 | As the model is trained, the common prior is updated, allowing information to be shared across tasks. The model we
43 | adopt, which was originally proposed by Xue et al., draws logistic regression (LR) weights for each task
44 | from a shared Dirichlet Process (DP) prior; we call this model Hierarchical Bayesian Logistic Regression (HBLR).
45 | 
46 | In contrast with our prior approaches (MTL-NN and MTMKL), the HBLR model allows us to directly define each task as
47 | predicting a label (e.g. tomorrow's stress level) of a single user, since the model is able to implicitly learn its
48 | own (soft) clustering. This model clusters tasks that are most similar in terms of the relationship between their
49 | input features and their resulting outcome (i.e. their decision boundaries), while simultaneously learning the prediction
50 | function.
51 | 
52 | ## Single Task Learning models
53 | Code to train a logistic regression model, an LSSVM, and a single-task neural network is included for comparison purposes.
54 | 
55 | # Structure
56 | 
57 | ## Code structure
58 | Wrappers are used to perform a grid search over hyperparameters. The file `run_jobs.py` can be used to launch the training of several models in sequence, and send emails after they complete. To see an example of how to run the training code for the models, see `jobs_to_run.txt`.
59 | 
60 | ## Input data format
61 | ### .csv files
62 | Input csvs are assumed to have columns for 'user_id' and 'timestamp', plus columns for the outcome labels whose names contain the string '_Label'. 
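For example, here is a minimal sketch (not part of the repo) of the column filtering the wrapper classes apply to such a csv, using the included `example_data.csv`:

```python
import pandas as pd

df = pd.read_csv('example_data.csv', index_col=0)

# Label columns are identified by the '_Label' naming convention.
label_cols = [c for c in df.columns if '_Label' in c]

# Everything except bookkeeping columns and labels is treated as a feature.
feat_cols = [c for c in df.columns
             if c not in ('user_id', 'timestamp', 'dataset') and '_Label' not in c]

print(label_cols)                      # ['tomorrow_Happiness_Evening_Label', ...]
print(len(feat_cols), 'feature columns')
```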
63 | 
64 | ### 'Task dict list'
65 | For the multi-task algorithms, we use a special data structure saved to a pickle file to represent the data from multiple tasks. The code for generating files in this format given a .csv file is available in make_datasets.py. To run it, use:
66 | 
67 | ```python make_datasets.py --datafile='./example_data.csv' --task_type='users'```
68 | 
69 | #### File Format details
70 | - Data for both labels-as-tasks and users-as-tasks are stored in pickled files as a list of dicts (each list item represents a task)
71 | - Labels-as-tasks:
72 | - The .csv file will be partitioned such that predicting each related outcome is a separate task (e.g. predicting stress is one task and predicting happiness is another)
73 | - Normalization is done based on the training data for the entire group
74 | - Users-as-tasks:
75 | - The .csv file will be partitioned such that predicting the outcome of each user is one task.
76 | - Need to specify which label to target (i.e., the label that you will be predicting)
77 | - Normalization is done per-person
78 | 
79 | - Each task is a dict containing 4 keys:
80 | - ‘Name’: gives the name of the task, e.g. "Group_Happiness_Evening_Label" or a user ID
81 | - ‘X’: the data matrix. Rows are samples, columns are features. Does not contain extraneous columns like ‘user_id’ and ‘timestamp’, and has already been normalized and had empty cells filled
82 | - ‘Y’: the classification labels for this task, in the same order as the rows of X
83 | - ‘ModalityDict’: used for the MTMKL model. Maps modalities like “phys” or “location” to their start index in the feature list (a loading sketch follows the example data below)
84 | 
-------------------------------------------------------------------------------- /__pycache__/helperFuncs.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mitmedialab/PersonalizedMultitaskLearning/2de7d9485f5ac09264bfa624f16c5b05a5a44ada/__pycache__/helperFuncs.cpython-35.pyc -------------------------------------------------------------------------------- /example_data.csv: --------------------------------------------------------------------------------
1 | ,user_id,timestamp,dataset,classifier_friendly_ppt_id,classifier_friendly_day_of_week,location_time_spent_on_campus,location_log_likelihood_of_day,weather_max_precip_intensity,call_0H-24H_total_num_missed,phys_3H-10H:percentHighPeakNoArtifact,phys_3H-10H:percentMedPeakNoArtifact,phys_3H-10H:sumTempWeightedAUC,screen_0H-3H_total_duration,sms_0H-3H_total_num_incoming,sms_17H-24H_unique_num_incoming,sms_17H-24H_unique_num_outgoing,sms_0H-24H_unique_num_outgoing,tomorrow_Happiness_Evening_Label,tomorrow_Health_Evening_Label,tomorrow_Calmness_Evening_Label
0,1,8/22/17 0:00,Val,0,4,0,1434.850264,0,1,0,0,-0.369250141,49,0,4,3,4,1,1,
1,1,8/23/17 0:00,Val,0,5,0,1384.511324,0,0,0,0,-47.11177225,21,0,2,1,1,1,1,0
2,1,8/24/17 0:00,Train,0,6,0,1432.698762,0,0,0,0,-8.383537082,10,1,3,3,5,1,0,1
3,1,8/25/17 0:00,Val,0,0,900,1282.323883,0,0,0,0,-32.14207652,0,0,4,4,6,0,0,0
4,1,8/26/17 0:00,Test,0,1,0,617.3508313,0,0,0,0,-123.1579134,66,0,3,3,3,1,1,0
5,2,8/22/17 0:00,Train,1,4,0,-24.69563937,0,0,0,0,-1.351502791,0,0,5,5,6,1,,0
6,2,8/23/17 0:00,Train,1,5,900,1433.587585,0,1,0.238095238,0.238095238,-65.58436476,0,0,4,3,3,1,,1
7,2,8/24/17 0:00,Train,1,6,0,662.5491124,0,0,0,0,-16.83429783,0,0,5,5,9,1,1,
8,2,8/25/17 0:00,Test,1,0,0,966.1353508,0,0,0,0,-135.3018584,0,0,4,4,9,1,1,1
9,2,8/26/17 0:00,Test,1,1,0,757.6295022,0.03,0,0.476190476,0.476190476,-217.5607483,20,0,1,1,6,1,1,
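Putting this together, here is a minimal sketch of loading and inspecting one of the pickled task lists described above. The file name below is hypothetical; see `make_datasets.py` for the actual naming scheme it produces:

```python
import pickle

# Hypothetical file name -- make_datasets.py determines the real one.
with open('datasetTaskList-Example_Train.p', 'rb') as f:
    tasks = pickle.load(f)

# Each list entry is one task with the four documented keys.
for task in tasks:
    print(task['Name'], task['X'].shape, len(task['Y']),
          sorted(task['ModalityDict'].keys()))
```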
-------------------------------------------------------------------------------- /generic_wrapper.py: -------------------------------------------------------------------------------- 1 | """These abstract wrapper classes are designed to enable hyperparameter sweeps 2 | for a variety of different models that inherit them. 3 | 4 | Note: STL stands for Single-Task-Learning, i.e. normal machine learning 5 | algorithms like SVM, logistic regression, etc.""" 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import os 10 | import sys 11 | import copy 12 | from time import time 13 | 14 | CODE_PATH = os.path.dirname(os.getcwd()) 15 | sys.path.append(CODE_PATH) 16 | 17 | DEFAULT_MAIN_DIRECTORY = '/Your/path/here/' 18 | 19 | DEFAULT_VALIDATION_TYPE = 'cross' #'val' 20 | DEFAULT_NUM_CROSS_FOLDS = 5 21 | 22 | import helperFuncs as helper 23 | 24 | def reload_dependencies(): 25 | reload(helper) 26 | 27 | # This optimizes parameters individually for each task 28 | 29 | class STLWrapper: 30 | """ WARNING: This code only deals with input files in the form of pickled task lists, 31 | and only implements cross validation.""" 32 | def __init__(self, file_prefix, users_as_tasks=False, cont=False, classifier_name='LSSVM', 33 | num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, main_directory=DEFAULT_MAIN_DIRECTORY, 34 | datasets_path='Data/Datasets/Discard20/', cant_train_with_one_class=True, 35 | check_test=False, save_results_every_nth=3, test_csv_filename=None): 36 | """ Initializes the parent model with fields useful for all child wrapper classes 37 | 38 | Args: 39 | file_prefix: The first portion of the name of a set of pickled task lists, e.g. 40 | 'datasetTaskList-Discard-Future-Group_' 41 | users_as_tasks: A boolean. If true, will assume there are many tasks and each task 42 | is one person. Will not print results per task. 43 | cont: A boolean. If true, will try to load a saved results .csv and continue 44 | training on the next unfinished result. 45 | classifier_name: String name of the classifier trained. Used to know where to save 46 | results. 47 | num_cross_folds: An integer number of folds to use in cross validation. 48 | main_directory: The path to the main dropbox directory which contains the results and 49 | data directories. 50 | datasets_path: The path from the main dropbox to the datasets directory. 51 | cant_train_with_one_class: A boolean. If true, if the model encounters a task with 52 | only one type of label in the training data, it will just predict the most 53 | frequent class. 54 | check_test: A boolean. If true, will evaluate final results on held-out test set 55 | after running. 56 | save_results_every_nth: An integer representing the number of settings to test before 57 | writing the results df to a csv file. 
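test_csv_filename: Optional string name of a csv file within datasets_path; if provided, final test-set predictions will be computed for this file and saved to the results directory.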
58 | """ 59 | # memorize arguments and construct paths 60 | self.main_directory = main_directory 61 | self.classifier_name = classifier_name 62 | self.results_path = main_directory + 'Results/' + classifier_name + '/' 63 | self.figures_path = main_directory + 'Figures/' + classifier_name + '/' 64 | self.datasets_path = main_directory + datasets_path 65 | self.cont = cont 66 | self.users_as_tasks = users_as_tasks 67 | self.cant_train_with_one_class = cant_train_with_one_class 68 | self.check_test = check_test 69 | self.save_results_every_nth = save_results_every_nth 70 | self.file_prefix = file_prefix 71 | self.save_prefix = self.get_save_prefix(file_prefix, replace=cont) 72 | if test_csv_filename is not None: 73 | self.test_csv_filename = self.datasets_path + test_csv_filename 74 | else: 75 | self.test_csv_filename = None 76 | 77 | self.params = {} 78 | self.define_params() 79 | 80 | self.load_data() 81 | 82 | self.calc_num_param_settings() 83 | self.construct_list_of_params_to_test() 84 | 85 | #storing the results 86 | self.time_sum = 0 87 | if cont: 88 | self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.save_prefix + '.csv') 89 | print '\nPrevious validation results df loaded. It has', len(self.val_results_df), "rows" 90 | self.started_from = len(self.val_results_df) 91 | else: 92 | self.val_results_df = pd.DataFrame() 93 | self.started_from = 0 94 | 95 | self.num_cross_folds = num_cross_folds 96 | helper.generateCrossValPickleFiles(self.datasets_path, self.file_prefix, self.num_cross_folds) 97 | 98 | # These functions need to be overwritten by the child class 99 | def define_params(self): 100 | """ This function should set self.params to a dict where they keys represent names of parameters 101 | to test (e.g. for SVM, 'C') as they should be saved to the val_results_df, and the values of 102 | self.params should be a list of values for the parameter that need to be tested. An example 103 | dict: 104 | self.params['C'] = [1,10,100] 105 | self.params['beta'] = [.001, .01, .1] 106 | """ 107 | print "Error! define_params should be overwritten in child class" 108 | raise NotImplementedError 109 | 110 | def train_and_predict_task(self, t, train_X, train_y, eval_X, param_dict): 111 | print "Error! train_model_for_task should be overwritten in child class" 112 | raise NotImplementedError 113 | 114 | def predict_task(self, X, t): 115 | print "Error! predict_task should be overwritten in child class" 116 | raise NotImplementedError 117 | 118 | def calc_num_param_settings(self): 119 | self.num_settings = self.n_tasks 120 | for key in self.params: 121 | self.num_settings = self.num_settings * len(self.params[key]) 122 | 123 | def construct_list_of_params_to_test(self): 124 | """Will make a class level variable that is a list of parameter dicts. 125 | Each entry in the list is a dict of parameter settings, 126 | eg. {'C'=1.0, 'beta'=.01, ...}. All tasks can use this list to train 127 | against all settings.""" 128 | self.list_of_param_settings = [] 129 | self.recurse_and_append_params(copy.deepcopy(self.params), {}) 130 | 131 | def recurse_and_append_params(self, param_settings_left, this_param_dict, debug=False): 132 | """param_settings_left is a dictionary of lists. The keys are parameters 133 | (like 'C'), the values are the list of settings for those parameters that 134 | need to be tested (like [1.0, 10.0, 100.0]). this_param_dict is a dictionary 135 | containing a single setting for each parameter. 
If a parameter is not in 136 | this_param_dict's keys, a setting for it has not been chosen yet. 137 | 138 | Performs breadth-first-search""" 139 | if debug: print "Working on a parameter dict containing", this_param_dict 140 | for key in self.params.keys(): 141 | if key in this_param_dict: 142 | continue 143 | else: 144 | this_setting = param_settings_left[key].pop() 145 | if debug: print "Popped", key, "=", this_setting, "off the params left" 146 | if len(param_settings_left[key]) > 0: 147 | if debug: print "Recursing on remaining parameters", param_settings_left 148 | self.recurse_and_append_params(copy.deepcopy(param_settings_left), 149 | copy.deepcopy(this_param_dict)) 150 | if debug: print "Placing the popped setting", key, "=", this_setting, "into the parameter dict" 151 | this_param_dict[key] = this_setting 152 | 153 | self.list_of_param_settings.append(this_param_dict) 154 | if debug: print "Appending parameter dict to list:", this_param_dict, "\n" 155 | 156 | def load_data(self): 157 | self.test_tasks = helper.loadPickledTaskList(self.datasets_path, self.file_prefix, "Test",fix_y=True) 158 | self.train_tasks = helper.loadPickledTaskList(self.datasets_path, self.file_prefix, "Train",fix_y=True) 159 | self.n_tasks = len(self.train_tasks) 160 | 161 | def get_save_prefix(self, file_prefix, replace=False): 162 | name_modifier = "" 163 | if '/' in file_prefix: 164 | if "NoLocation" in file_prefix: 165 | name_modifier = "-noloc" 166 | slash_loc = file_prefix.find('/') 167 | path_modifier = file_prefix[0:slash_loc+1] 168 | file_prefix = file_prefix[slash_loc+1:] 169 | self.file_prefix = file_prefix 170 | self.datasets_path += path_modifier 171 | 172 | dash_loc = file_prefix.find('-') 173 | 174 | if self.users_as_tasks: 175 | task_str = '_users' 176 | else: 177 | task_str = '_wellbeing' 178 | 179 | prefix = self.classifier_name + task_str + file_prefix[dash_loc:-1] + name_modifier 180 | 181 | if not replace: 182 | while os.path.exists(self.results_path + prefix + '.csv'): 183 | prefix = prefix + '2' 184 | return prefix 185 | 186 | def setting_already_done(self, param_dict): 187 | mini_df = self.val_results_df 188 | for key in param_dict.keys(): 189 | mini_df = mini_df[mini_df[key] == param_dict[key]] 190 | if len(mini_df) == 0: 191 | return False 192 | print "Setting already tested" 193 | return True 194 | 195 | def convert_param_dict_for_use(self, param_dict): 196 | """When loading rows from a saved results df in csv format, some 197 | of the settings may end up being converted to a string representation 198 | and need to be converted back to actual numbers and objects. 
199 | 200 | May need to be overwritten in child class.""" 201 | param_dict['task_num'] = int(param_dict['task_num']) 202 | return param_dict 203 | 204 | def get_preds_true_for_task(self,train_tasks, test_tasks, param_dict): 205 | t = param_dict['task_num'] 206 | X = train_tasks[t]['X'] 207 | y = train_tasks[t]['Y'] 208 | 209 | test_X = test_tasks[t]['X'] 210 | true_y = list(test_tasks[t]['Y'].flatten()) 211 | 212 | if len(y)==0 or len(X)==0 or len(test_X) == 0 or len(true_y)==0: 213 | return None, None 214 | 215 | if self.cant_train_with_one_class and len(np.unique(y))==1: 216 | preds = list(np.unique(y)[0]*np.ones(len(true_y))) 217 | else: 218 | preds = self.train_and_predict_task(t, X, y, test_X, param_dict) 219 | 220 | return preds, true_y 221 | 222 | def sweep_all_parameters(self): 223 | print "\nYou have chosen to test a total of", self.num_settings / self.n_tasks, "settings" 224 | print "for each of", self.n_tasks, "tasks, leading to a total of..." 225 | print self.num_settings, "models to train!!" 226 | sys.stdout.flush() 227 | 228 | #sweep all possible combinations of parameters 229 | for t in range(self.n_tasks): 230 | print "\nSweeping all parameters for task t:", self.train_tasks[t]['Name'] 231 | for param_dict in self.list_of_param_settings: 232 | these_params = copy.deepcopy(param_dict) 233 | these_params['task_num'] = t 234 | these_params['task_name'] = self.train_tasks[t]['Name'] 235 | self.test_one_setting(these_params) 236 | 237 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 238 | 239 | def test_one_setting(self, param_dict): 240 | if self.cont and self.setting_already_done(param_dict): 241 | return 242 | t0 = time() 243 | 244 | results_dict = self.get_cross_validation_results(param_dict) 245 | self.val_results_df = self.val_results_df.append(results_dict,ignore_index=True) 246 | 247 | t1 = time() 248 | this_time = t1 - t0 249 | self.time_sum = self.time_sum + this_time 250 | 251 | print "\n", self.val_results_df.tail(n=1) 252 | print "It took", this_time, "seconds to obtain this result" 253 | self.print_time_estimate() 254 | 255 | sys.stdout.flush() 256 | 257 | #output the file every few iterations for safekeeping 258 | if len(self.val_results_df) % self.save_results_every_nth == 0: 259 | self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv') 260 | 261 | def get_cross_validation_results(self, param_dict, print_per_fold=False): 262 | all_acc = [] 263 | all_auc = [] 264 | all_f1 = [] 265 | all_precision = [] 266 | all_recall = [] 267 | 268 | for f in range(self.num_cross_folds): 269 | train_tasks, val_tasks = helper.loadCrossValData(self.datasets_path, self.file_prefix, f, fix_y=True) 270 | 271 | preds, true_y = self.get_preds_true_for_task(train_tasks, val_tasks, param_dict) 272 | if preds is None or true_y is None: 273 | continue 274 | 275 | acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(preds, true_y) 276 | all_acc.append(acc) 277 | all_auc.append(auc) 278 | all_f1.append(f1) 279 | all_precision.append(precision) 280 | all_recall.append(recall) 281 | if print_per_fold: print "Fold", f, "acc", acc, "auc", auc, "f1", f1, "precision",precision,"recall",recall 282 | 283 | if print_per_fold: 284 | print "accs for all folds", all_acc 285 | print "aucs for all folds", all_auc 286 | 287 | # Add results to the dictionary 288 | param_dict['val_acc'] = np.nanmean(all_acc) 289 | param_dict['val_auc'] = np.nanmean(all_auc) 290 | param_dict['val_f1'] = np.nanmean(all_f1) 291 | param_dict['val_precision'] = 
np.nanmean(all_precision) 292 | param_dict['val_recall'] = np.nanmean(all_recall) 293 | 294 | return param_dict 295 | 296 | def print_time_estimate(self): 297 | num_done = len(self.val_results_df)-self.started_from 298 | num_remaining = self.num_settings - num_done - self.started_from 299 | avg_time = self.time_sum / num_done 300 | total_secs_remaining = int(avg_time * num_remaining) 301 | hours = total_secs_remaining / 60 / 60 302 | mins = (total_secs_remaining % 3600) / 60 303 | secs = (total_secs_remaining % 3600) % 60 304 | 305 | print "\n", num_done, "settings processed so far,", num_remaining, "left to go" 306 | print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs" 307 | 308 | def get_baseline(self, Y): 309 | Y = Y.tolist() 310 | percent_true = float(Y.count(1.0)) / float(len(Y)) 311 | if percent_true < 0.5: 312 | return 1.0 - percent_true 313 | else: 314 | return percent_true 315 | 316 | def find_best_setting_for_task(self, task_num, optimize_for='val_acc'): 317 | task_df = self.val_results_df[self.val_results_df['task_num']==task_num] 318 | accuracies = task_df[optimize_for].tolist() 319 | max_acc = max(accuracies) 320 | max_idx = accuracies.index(max_acc) 321 | return task_df.iloc[max_idx] 322 | 323 | def get_final_results(self, optimize_for='val_acc'): 324 | if self.users_as_tasks and not self.check_test: 325 | print "check_test is set to false, Will not evaluate performance on held-out test set." 326 | return 327 | print "\nAbout to evaluate results on held-out test set!!" 328 | print "Will use the settings that produced the best", optimize_for 329 | 330 | all_preds = [] 331 | all_true_y = [] 332 | per_task_accs = [] 333 | per_task_aucs = [] 334 | per_task_f1 = [] 335 | per_task_precision = [] 336 | per_task_recall = [] 337 | 338 | for t in range(self.n_tasks): 339 | task_settings = self.find_best_setting_for_task(t, optimize_for=optimize_for) 340 | assert(task_settings['task_num'] == t) 341 | if not self.users_as_tasks: 342 | print "\nBEST SETTING FOR TASK", t, "-", task_settings['task_name'] 343 | print "The highest", optimize_for, "of", task_settings[optimize_for], "was found with the following settings:" 344 | print task_settings 345 | 346 | task_settings = self.convert_param_dict_for_use(task_settings) 347 | preds, true_y = self.get_preds_true_for_task(self.train_tasks, self.test_tasks, task_settings) 348 | if preds is None or true_y is None: 349 | continue 350 | 351 | all_preds.extend(preds) 352 | all_true_y.extend(true_y) 353 | 354 | # save the per-task results 355 | t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(preds, true_y) 356 | per_task_accs.append(t_acc) 357 | per_task_aucs.append(t_auc) 358 | per_task_f1.append(t_f1) 359 | per_task_precision.append(t_precision) 360 | per_task_recall.append(t_recall) 361 | 362 | if not self.users_as_tasks: 363 | print "\nFINAL TEST RESULTS FOR", helper.getFriendlyLabelName(self.train_tasks[t]['Name']) 364 | print 'Acc:', t_acc, 'AUC:', t_auc, 'F1:', t_f1, 'Precision:', t_precision, 'Recall:', t_recall 365 | 366 | print "\nHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS" 367 | avg_acc = np.nanmean(per_task_accs) 368 | avg_auc = np.nanmean(per_task_aucs) 369 | avg_f1 = np.nanmean(per_task_f1) 370 | avg_precision = np.nanmean(per_task_precision) 371 | avg_recall = np.nanmean(per_task_recall) 372 | print 'Acc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall 373 | 374 | if self.test_csv_filename is not None: 375 | print "\tSAVING 
HELD OUT PREDICTIONS"
376 | if self.users_as_tasks:
377 | task_column = 'user_id'
378 | label_name = helper.getFriendlyLabelName(self.file_prefix)
379 | wanted_label = helper.getOfficialLabelName(label_name)
380 | predictions_df = helper.get_test_predictions_for_df_with_task_column(
381 | self.predict_task, self.test_csv_filename, task_column, self.test_tasks,
382 | wanted_label=wanted_label, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
383 | label_name=label_name, tasks_are_ints=False)
384 | else:
385 | predictions_df = helper.get_test_predictions_for_df_with_no_task_column(self.predict_task,
386 | self.test_csv_filename, self.test_tasks, num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
387 | predictions_df.to_csv(self.results_path + "Preds-" + self.save_prefix + '.csv')
388 | else:
389 | print "Uh oh, the test csv filename was not set, can't save test preds"
390 | 
391 | def run(self):
392 | self.sweep_all_parameters()
393 | self.get_final_results()
394 | 
395 | 
-------------------------------------------------------------------------------- /helperFuncs.py: --------------------------------------------------------------------------------
1 | """Collection of utility functions to support the rest of the code."""
2 | import numpy as np
3 | import pandas as pd
4 | import copy
5 | import os
6 | import pickle
7 | from scipy import stats
8 | from sklearn.metrics import auc, roc_auc_score, f1_score, precision_score, recall_score
9 | import ast
10 | import tensorflow as tf
11 | import matplotlib.pyplot as plt # needed by plotROC below
12 | NAN_FILL_VALUE = 0
13 | 
14 | def computeAuc(preds, true_y):
15 | try:
16 | return roc_auc_score(true_y, preds)
17 | except:
18 | return np.nan
19 | 
20 | def computeF1(preds, true_y):
21 | try:
22 | if (1 not in true_y) or (1 not in preds):
23 | # F-score is ill-defined when there are no true samples
24 | # F-score is ill-defined when there are no predicted samples.
25 | return np.nan
26 | return f1_score(true_y, preds)
27 | except:
28 | return np.nan
29 | 
30 | #The precision is the ratio tp / (tp + fp) where tp is the number of
31 | #true positives and fp the number of false positives.
32 | def computePrecision(preds, true_y):
33 | try:
34 | if (1 not in preds):
35 | #Precision is ill-defined when there are no predicted samples.
36 | return np.nan
37 | return precision_score(true_y, preds)
38 | except:
39 | return np.nan
40 | 
41 | #The recall is the ratio tp / (tp + fn) where tp is the number of true
42 | #positives and fn the number of false negatives. The recall is intuitively
43 | #the ability of the classifier to find all the positive samples.
44 | def computeRecall(preds, true_y):
45 | try:
46 | if 1 not in true_y:
47 | # Recall is ill-defined and being set to 0.0 due to no true samples
48 | return np.nan
49 | return recall_score(true_y, preds)
50 | except:
51 | return np.nan
52 | 
53 | def computeDistanceFromBaseline(preds, true_y):
54 | if len(np.shape(preds)) > 1:
55 | print("ERROR! 
Baseline distance function not defined for multi-dimensional predictions") 56 | return np.nan 57 | baseline = getBaseline(true_y) 58 | acc = getBinaryAccuracy(preds,true_y) 59 | return acc - baseline 60 | 61 | def computeAllMetricsForPreds(preds, true_y): 62 | acc = getBinaryAccuracy(preds,true_y) 63 | auc = computeAuc(preds, true_y) 64 | f1 = computeF1(preds, true_y) 65 | precision = computePrecision(preds, true_y) 66 | recall = computeRecall(preds, true_y) 67 | return acc, auc, f1, precision, recall 68 | 69 | def checkTaskList(train_tasks): 70 | for t in range(len(train_tasks)): 71 | isValidTask(train_tasks,t) 72 | print("...done!") 73 | 74 | def isValidTask(train_tasks, t, print_msgs=True): 75 | if train_tasks[t]['Y'] is None or train_tasks[t]['X'] is None: 76 | if print_msgs: print("Uh oh,", train_tasks[t]['Name'], "is None!!") 77 | return False 78 | elif len(train_tasks[t]['X']) == 0: 79 | if print_msgs: print("Uh oh,", train_tasks[t]['Name'], "has no data!") 80 | return False 81 | elif len(train_tasks[t]['X']) != len(train_tasks[t]['Y']): 82 | if print_msgs: print("Uh oh,", train_tasks[t]['Name'], 83 | "has messed up data! Lengths of X and Y don't match") 84 | return False 85 | return True 86 | 87 | def getBootstrapSample(test_df): 88 | bootstrap_ix = np.random.choice(test_df.index,len(test_df)) 89 | 90 | test_df = test_df.loc[bootstrap_ix] 91 | test_df = test_df.reset_index() 92 | test_df = test_df.drop('index',1) 93 | return test_df 94 | 95 | def plotROC(auc_list,fpr_list,tpr_list): 96 | mean_tpr = 0.0 97 | mean_fpr = np.linspace(0,1,100) 98 | 99 | plt.figure(figsize=(5,5)) 100 | 101 | for i in range(len(fpr_list)): 102 | mean_tpr += np.interp(mean_fpr, fpr_list[i], tpr_list[i]) 103 | mean_tpr[0] = 0.0 104 | plt.plot(fpr_list[i], tpr_list[i], lw=1, label='ROC fold %d (area = %0.2f)' % (i, auc_list[i])) 105 | 106 | plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck') 107 | 108 | mean_tpr /= len(fpr_list) 109 | mean_tpr[-1] = 1.0 110 | mean_auc = auc(mean_fpr, mean_tpr) 111 | plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2) 112 | 113 | plt.xlim([-0.05, 1.05]) 114 | plt.ylim([-0.05, 1.05]) 115 | plt.xlabel('False Positive Rate') 116 | plt.ylabel('True Positive Rate') 117 | plt.title('') 118 | plt.legend(loc="lower right") 119 | plt.show() 120 | 121 | return mean_auc, mean_fpr, mean_tpr 122 | 123 | def getBinaryAccuracy(pred,true_labels): 124 | assert len(pred)==len(true_labels) 125 | 126 | correct_labels = [1 for i in range(len(pred)) if pred[i]==true_labels[i]] 127 | try: 128 | return len(correct_labels)/float(len(pred)) 129 | except: 130 | return np.nan 131 | 132 | def getBaseline(Y): 133 | if type(Y) != list: 134 | Y = Y.tolist() 135 | percentTrue = float(Y.count(1.0)) / float(len(Y)) 136 | if percentTrue < 0.5: 137 | return 1.0 - percentTrue 138 | else: 139 | return percentTrue 140 | 141 | def getTaskListFileCoreName(file_prefix): 142 | dash_loc = file_prefix.find('-') 143 | return file_prefix[dash_loc:-1] 144 | 145 | def loadPickledTaskList(datasets_path, file_prefix, dataset, reshape=False, fix_y=False): 146 | task_list = pickle.load(open(datasets_path + file_prefix + dataset + ".p","rb")) 147 | 148 | task_list = fixTaskListFile(task_list) 149 | 150 | if reshape: 151 | for i in range(len(task_list)): 152 | if task_list[i]["Y"] is not None: 153 | task_list[i]["Y"] = task_list[i]["Y"].reshape(-1,1) 154 | 155 | if fix_y: 156 | for t in range(len(task_list)): 157 | task_list[t]["Y"] = 2*task_list[t]["Y"]-1 158 | 159 | return 
task_list
160 | 
161 | 
162 | def fixTaskListFile(task_list,debug=False):
163 |     num_feats = calculateNumFeatsInTaskList(task_list)
164 |     for i in range(len(task_list)):
165 |         if task_list[i]["Y"] is None:
166 |             if debug: print("Y for task", task_list[i]['Name'],
167 |                             "is None, fixing")
168 |             task_list[i]['Y'] = np.zeros((0))
169 |         if task_list[i]['X'] is None:
170 |             if debug: print("X for task", task_list[i]['Name'],
171 |                             "is None, fixing")
172 |             task_list[i]['X'] = np.zeros((0,num_feats))
173 |     return task_list
174 | 
175 | 
176 | def loadCrossValData(datasets_path, file_prefix, fold, reshape=True, fix_y=False):
177 |     save_prefix = getTaskListFileCoreName(file_prefix)
178 | 
179 |     train_tasks = loadPickledTaskList(datasets_path, "CVFold" + str(fold) + save_prefix, "Train", reshape=reshape, fix_y=fix_y)
180 |     val_tasks = loadPickledTaskList(datasets_path, "CVFold" + str(fold) + save_prefix, "Val", reshape=reshape, fix_y=fix_y)
181 | 
182 |     return train_tasks, val_tasks
183 | 
184 | def generateCrossValPickleFiles(datasets_path, file_prefix, num_cross_folds):
185 |     save_prefix = getTaskListFileCoreName(file_prefix)
186 | 
187 |     if os.path.exists(datasets_path + "CVFold0" + save_prefix + "Train.p"):
188 |         print("\nCross validation folds have already been created")
189 |         return
190 | 
191 |     train_tasks = pickle.load(open(datasets_path + file_prefix + "Train.p","rb"))
192 |     val_tasks = pickle.load(open(datasets_path + file_prefix + "Val.p","rb"))
193 | 
194 |     print("\nGenerating cross validation sets")
195 |     new_train_tasks = [0] * (num_cross_folds+1) # the extra final slot holds the full training set
196 |     new_val_tasks = [0] * num_cross_folds
197 |     for f in range(num_cross_folds):
198 |         new_train_tasks[f] = copy.deepcopy(train_tasks)
199 |         new_val_tasks[f] = copy.deepcopy(val_tasks)
200 |     new_train_tasks[num_cross_folds] = copy.deepcopy(train_tasks)
201 | 
202 |     n_tasks = len(train_tasks)
203 |     for t in range(n_tasks):
204 |         crossVal_X, crossVal_y = generateCrossValSet(train_tasks[t]['X'], train_tasks[t]['Y'], val_tasks[t]['X'], val_tasks[t]['Y'], num_cross_folds, verbose=False)
205 | 
206 |         for f in range(num_cross_folds):
207 |             train_X, train_Y, val_X, val_Y = getTrainAndValDataForCrossValFold(crossVal_X, crossVal_y, f)
208 |             new_train_tasks[f][t]['X'] = train_X
209 |             new_train_tasks[f][t]['Y'] = train_Y
210 |             new_val_tasks[f][t]['X'] = val_X
211 |             new_val_tasks[f][t]['Y'] = val_Y
212 | 
213 |         new_train_tasks[num_cross_folds][t]['X'],new_train_tasks[num_cross_folds][t]['Y'] = getFullTrain(crossVal_X, crossVal_y)
214 | 
215 |     for f in range(num_cross_folds):
216 |         pickle.dump(new_train_tasks[f], open(datasets_path + "CVFold" + str(f) + save_prefix + "Train.p","wb"))
217 |         pickle.dump(new_val_tasks[f], open(datasets_path + "CVFold" + str(f) + save_prefix + "Val.p","wb"))
218 |     pickle.dump(new_train_tasks[num_cross_folds], open(datasets_path + "CVFullTrain" + save_prefix + ".p","wb"))
219 | 
220 | 
221 | def addKeepIndicesToCrossValPickleFiles(datasets_path, file_prefix, num_cross_folds, keep_percent):
222 |     save_prefix = getTaskListFileCoreName(file_prefix)
223 | 
224 |     for f in range(num_cross_folds):
225 |         task_dict_list = pickle.load(open(datasets_path + "CVFold" + str(f) + save_prefix + "Train.p","rb"))
226 |         for t in range(len(task_dict_list)):
227 |             if 'KeepIndices' not in task_dict_list[t] or task_dict_list[t]['KeepIndices'] is None:
228 |                 n = len(task_dict_list[t]['X'])
229 |                 keep_indices = np.random.choice(n, int(n*keep_percent), replace=False) # size argument must be an int
230 |                 task_dict_list[t]['KeepIndices'] = keep_indices
231 |         pickle.dump(task_dict_list, open(datasets_path + 
"CVFold" + str(f) + save_prefix + "Train.p","wb")) 232 | 233 | def getTrainAndValDataForCrossValFold(crossVal_X, crossVal_y, fold, only_train=False): 234 | num_folds = len(crossVal_X) 235 | if fold >= num_folds: 236 | if only_train: 237 | return None, None 238 | else: 239 | return None, None, None, None 240 | 241 | train_folds_X = [crossVal_X[x] for x in range(num_folds) if x != fold] 242 | train_folds_Y = [crossVal_y[x] for x in range(num_folds) if x != fold] 243 | 244 | train_X = train_folds_X[0] 245 | train_Y = train_folds_Y[0] 246 | for i in range(1,len(train_folds_X)): 247 | train_X = np.concatenate((train_X,train_folds_X[i])) 248 | train_Y = np.concatenate((train_Y,train_folds_Y[i])) 249 | 250 | val_X = crossVal_X[fold] 251 | val_Y = crossVal_y[fold] 252 | return train_X, train_Y, val_X, val_Y 253 | 254 | def containsEachLabelType(labels): 255 | ''' Checks if a set of labels contains all labels types (-1, 0, 1)''' 256 | return 1 in labels and 0 in labels 257 | 258 | def containsEachSVMLabelType(labels): 259 | return -1 in labels and 1 in labels 260 | 261 | def getFullTrain(crossVal_X, crossVal_y): 262 | full_X = crossVal_X[0] 263 | full_Y = crossVal_y[0] 264 | for i in range(1,len(crossVal_X)): 265 | full_X = np.concatenate((full_X,crossVal_X[i])) 266 | full_Y = np.concatenate((full_Y,crossVal_y[i])) 267 | return full_X, full_Y 268 | 269 | def getFriendlyLabelName(col): 270 | if col is None: 271 | return "" 272 | if type(col) != str: 273 | return str(col) 274 | 275 | name = "" 276 | if 'Happiness' in col: 277 | name ='Happiness' 278 | elif 'Calmness' in col: 279 | name = 'Calmness' 280 | elif 'Health' in col: 281 | name = 'Health' 282 | if 'Morning' in col: 283 | name = 'Morning-' + name 284 | if 'tomorrow' in col: 285 | name = 'tomorrow-' + name 286 | elif 'yesterday' in col: 287 | name = 'yesterday-' + name 288 | 289 | return name 290 | 291 | def getOfficialLabelName(string): 292 | type_mod = 'Group' 293 | if 'Personal' in string: 294 | type_mod = 'Personal' 295 | 296 | if 'Happiness' in string: 297 | return 'tomorrow_'+type_mod+'_Happiness_Evening_Label' 298 | elif 'Calmness' in string: 299 | return 'tomorrow_'+type_mod+'_Calmness_Evening_Label' 300 | elif 'Health' in string: 301 | return 'tomorrow_'+type_mod+'_Health_Evening_Label' 302 | else: 303 | print("Error! 
Could not determine official label name") 304 | return None 305 | 306 | def getMinutesFromMidnight(df, feature): 307 | time_deltas = pd.to_datetime(df[feature]) - pd.to_datetime(df['timestamp']) 308 | mins = [time / pd.Timedelta('1 minute') for time in time_deltas] 309 | return [time if not pd.isnull(time) else np.nan for time in mins] 310 | 311 | def mergeDataframes(all_df, mod_df, mod_name, merge_type='inner',merge_keys=['user_id','timestamp']): 312 | print("Merging", mod_name) 313 | old_len = len(all_df) 314 | print("\tMerged df started with", old_len, "samples") 315 | print("\t", mod_name, "has", len(mod_df), "samples") 316 | all_df = pd.merge(all_df, mod_df, how=merge_type, on=merge_keys) 317 | print("\tMerged df now has", len(all_df), "samples") 318 | print(mod_name, "is missing at least", old_len - len(all_df), "samples") 319 | 320 | return all_df 321 | 322 | def renameAllColsWithPrefix(df,prefix,remove_len=0): 323 | for feat in df.columns.values: 324 | if feat != 'user_id' and feat != 'timestamp': 325 | df = df.rename(columns={feat:prefix+feat[remove_len:]}) 326 | return df 327 | 328 | def normalizeColumns(df, wanted_feats): 329 | train_df = df[df['dataset']=='Train'] 330 | for feat in wanted_feats: 331 | train_mean = np.mean(train_df[feat].dropna().tolist()) 332 | train_std = np.std(train_df[feat].dropna().tolist()) 333 | zscore = lambda x: (x - train_mean) / train_std 334 | df[feat] = df[feat].apply(zscore) 335 | return df 336 | 337 | def findNullColumns(df, features): 338 | df_len = len(df) 339 | bad_feats = [] 340 | for feat in features: 341 | null_len = len(df[df[feat].isnull()]) 342 | if df_len == null_len: 343 | bad_feats.append(feat) 344 | return bad_feats 345 | 346 | def removeNullCols(df, features): 347 | '''Must check if a column is completely null in any of the datasets. Then it will remove it''' 348 | train_df = df[df['dataset']=='Train'] 349 | test_df = df[df['dataset']=='Test'] 350 | val_df = df[df['dataset']=='Val'] 351 | 352 | null_cols = findNullColumns(train_df,features) 353 | null_cols_test= findNullColumns(test_df,features) 354 | null_cols_val = findNullColumns(val_df,features) 355 | 356 | if len(null_cols) > 0 or len(null_cols_test) > 0 or len(null_cols_val) > 0: 357 | for feat in null_cols_test: 358 | if feat not in null_cols: 359 | null_cols.append(feat) 360 | for feat in null_cols_val: 361 | if feat not in null_cols: 362 | null_cols.append(feat) 363 | print("Found", len(null_cols), 364 | "columns that were completely null. 
Removing", null_cols) 365 | 366 | df = dropCols(df,null_cols) 367 | for col in null_cols: 368 | features.remove(col) 369 | return df, features 370 | 371 | def generateWekaFile(X,Y,features,path,name): 372 | f = open(path + name + '.arff', 'w') 373 | f.write("@relation '" + name + "'\n\n") 374 | 375 | for feat in features: 376 | f.write("@attribute " + feat + " numeric\n") 377 | f.write("@attribute cluster {True,False}\n\n") 378 | 379 | f.write("@data\n\n") 380 | for i in range(X.shape[0]): 381 | for j in range(X.shape[1]): 382 | if np.isnan(X[i,j]): 383 | f.write("?,") 384 | else: 385 | f.write(str(X[i,j]) + ",") 386 | if Y[i] == 1.0 or Y[i] == True: 387 | f.write("True\n") 388 | else: 389 | f.write("False\n") 390 | 391 | f.close() 392 | 393 | def getMatrixData(data_df, wanted_feats, wanted_labels, dataset=None,single_output=False): 394 | if dataset is not None: 395 | set_df = data_df[data_df['dataset']==dataset] 396 | else: 397 | set_df = data_df 398 | 399 | X = set_df[wanted_feats].astype(float).as_matrix() 400 | 401 | if single_output: 402 | y = set_df[wanted_labels[0]].tolist() 403 | else: 404 | y = set_df[wanted_labels].as_matrix() 405 | 406 | return X,y 407 | 408 | def normalizeAndFillDataDf(df, wanted_feats, wanted_labels, suppress_output=False, remove_cols=True): 409 | data_df = normalizeColumns(copy.deepcopy(df), wanted_feats) 410 | if remove_cols: 411 | data_df, wanted_feats = removeNullCols(data_df, wanted_feats) 412 | 413 | if not suppress_output: print("Original data length was", len(data_df)) 414 | data_df = data_df.dropna(subset=wanted_labels, how='any') 415 | if not suppress_output: print( 416 | "After dropping rows with nan in any label column, length is", 417 | len(data_df)) 418 | 419 | data_df = data_df.fillna(NAN_FILL_VALUE) #if dataset is already filled, won't do anything 420 | 421 | return data_df 422 | 423 | def getSvmPartitionDf(data_df, wanted_feats, wanted_labels, dataset='Train'): 424 | set_df = data_df[data_df['dataset']==dataset] 425 | 426 | keep_cols = copy.deepcopy(wanted_feats) 427 | keep_cols.extend(wanted_labels) 428 | set_df = set_df[keep_cols] 429 | 430 | return set_df 431 | 432 | def getTensorFlowMatrixData(data_df, wanted_feats, wanted_labels, dataset='Train',single_output=False): 433 | set_df = data_df[data_df['dataset']==dataset] 434 | 435 | X = set_df[wanted_feats].astype(float).as_matrix() 436 | 437 | if single_output: 438 | y = set_df[wanted_labels[0]].tolist() 439 | else: 440 | y = set_df[wanted_labels].as_matrix() 441 | 442 | X = convertMatrixToTensorFlowFriendlyFormat(X) 443 | y = convertMatrixToTensorFlowFriendlyFormat(y) 444 | 445 | return X,y 446 | 447 | def convertMatrixToTensorFlowFriendlyFormat(X): 448 | X = np.asarray(X) 449 | X = X.astype(np.float32) 450 | return X 451 | 452 | def dropCols(df,cols): 453 | for col in cols: 454 | df = df.drop(col, 1) 455 | return df 456 | 457 | def convertTimestampViaString(row): 458 | return str(row['timestamp']) 459 | 460 | def getMinutesFromMidnight(df, feature): 461 | time_deltas = pd.to_datetime(df[feature]) - pd.to_datetime(df['timestamp']) 462 | mins = [time / pd.Timedelta('1 minute') for time in time_deltas] 463 | return [time if not pd.isnull(time) else np.nan for time in mins] 464 | 465 | def renameAllColsWithPrefix(df,prefix,remove_len=0): 466 | for feat in df.columns.values: 467 | if feat != 'user_id' and feat != 'timestamp': 468 | df = df.rename(columns={feat:prefix+feat[remove_len:]}) 469 | return df 470 | 471 | def combineFilesIntoDf(file_path, filenames, reset_index=False, drop_cols=None): 
472 | df = None 473 | for filename in filenames: 474 | fdf = pd.DataFrame.from_csv(file_path + filename) 475 | 476 | if reset_index: 477 | fdf = fdf.reset_index() 478 | 479 | if df is None: 480 | df = fdf.copy(deep=True) 481 | else: 482 | df = pd.concat([df,fdf]) 483 | 484 | if drop_cols is not None: 485 | for feat in drop_cols: 486 | df = df.drop(feat, 1) 487 | 488 | return df 489 | 490 | def partitionRandomSubset(X, Y, size, replace=False, return_remainder=True): 491 | subset_indices = np.random.choice(len(X), size, replace=replace) 492 | 493 | sub_X = X[subset_indices] 494 | sub_Y = Y[subset_indices] 495 | 496 | if return_remainder: 497 | remainder_indices = [x for x in range(0,len(X)) if x not in subset_indices] 498 | remainder_X = X[remainder_indices] 499 | remainder_Y = Y[remainder_indices] 500 | return sub_X, sub_Y, remainder_X, remainder_Y 501 | else: 502 | return sub_X, sub_Y 503 | 504 | def generateCrossValSet(train_X, train_y, val_X, val_y, num_cross_folds, verbose=True): 505 | if verbose: 506 | print("...generating cross validation folds...") 507 | 508 | fullTrain_X = np.concatenate((train_X,val_X)) 509 | fullTrain_y = np.concatenate((train_y,val_y)) 510 | if len(fullTrain_X) <= 1: 511 | print("LENGTH IS", len(fullTrain_X)) 512 | crossVal_X = [] 513 | crossVal_y = [] 514 | 515 | size = int(len(fullTrain_X) / num_cross_folds) 516 | if size < 1: 517 | size = 1 518 | remainder_X = fullTrain_X 519 | remainder_y = fullTrain_y 520 | for i in range(num_cross_folds-1): 521 | sub_X, sub_y, remainder_X, remainder_y = partitionRandomSubset(remainder_X, remainder_y, size) 522 | crossVal_X.append(sub_X) 523 | crossVal_y.append(sub_y) 524 | if len(remainder_X) == 0: 525 | # Insufficient data to make all folds, returning remaining. 526 | return crossVal_X, crossVal_y 527 | crossVal_X.append(remainder_X) 528 | crossVal_y.append(remainder_y) 529 | 530 | return crossVal_X, crossVal_y 531 | 532 | def discardNans(df,col1,col2): 533 | small_df = df[[col1,col2]] 534 | small_df = small_df.dropna() 535 | x = small_df[col1].tolist() 536 | y = small_df[col2].tolist() 537 | n = len(x) 538 | return x,y,n 539 | 540 | def calcCorrelation(df,col1,col2): 541 | x,y,n = discardNans(df,col1,col2) 542 | return stats.pearsonr(x, y) 543 | 544 | def calculateNumFeatsInTaskList(task_dict_list): 545 | i=0 546 | X = task_dict_list[i]['X'] 547 | while len(X) == 0 and i < len(task_dict_list): 548 | i=i+1 549 | X = task_dict_list[i]['X'] 550 | return np.shape(X)[1] 551 | 552 | def addPredsToPredsDf(df, preds, true, task_name): 553 | assert len(preds) == len(true) 554 | 555 | for i in range(len(preds)): 556 | df = df.append({'task_name':task_name, 'prediction':preds[i], 557 | 'true':true[i]}, ignore_index=True) 558 | 559 | return df 560 | 561 | def fixSettingDictLoadedFromResultsDf(setting_dict): 562 | if 'hidden_layers' in setting_dict.keys(): 563 | if type(setting_dict['hidden_layers']) == str: 564 | setting_dict['hidden_layers'] = ast.literal_eval(setting_dict['hidden_layers']) 565 | 566 | if 'optimizer' in setting_dict.keys(): 567 | if 'GradientDescent' in setting_dict['optimizer']: 568 | setting_dict['optimizer'] = tf.train.GradientDescentOptimizer 569 | elif 'Adagrad' in setting_dict['optimizer']: 570 | setting_dict['optimizer'] = tf.train.AdagradOptimizer 571 | else: 572 | setting_dict['optimizer'] = tf.train.AdamOptimizer 573 | 574 | for setting in ['batch_size','decay_steps']: 575 | if setting in setting_dict.keys(): 576 | setting_dict[setting] = int(setting_dict[setting]) 577 | 578 | return setting_dict 579 | 580 
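# A hedged sketch of what fixSettingDictLoadedFromResultsDf (above) repairs;
# the values are invented for illustration. Settings read back from a results
# csv arrive as strings and floats, and come out as usable Python objects:
#   row = {'hidden_layers': '[50, 10]', 'optimizer': 'AdamOptimizer',
#          'batch_size': 20.0, 'decay_steps': 1000.0}
#   row = fixSettingDictLoadedFromResultsDf(row)
#   # now row['hidden_layers'] == [50, 10], row['batch_size'] == 20, and
#   # row['optimizer'] is tf.train.AdamOptimizer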
| def get_secs_mins_hours_from_secs(total_secs):
581 |     hours = total_secs / 60 / 60
582 |     mins = (total_secs % 3600) / 60
583 |     secs = (total_secs % 3600) % 60
584 | 
585 |     if hours < 1: hours = 0
586 |     if mins < 1: mins = 0
587 | 
588 |     return hours, mins, secs
589 | 
590 | def tf_weight_variable(shape, name):
591 |     """Initializes a tensorflow weight variable with random values
592 |     centered around 0.
593 |     """
594 |     initial = tf.truncated_normal(shape, stddev=1.0 / math.sqrt(float(shape[0])), dtype=tf.float64)
595 |     return tf.Variable(initial, name=name)
596 | 
597 | def tf_bias_variable(shape, name):
598 |     """Initializes a tensorflow bias variable to a small constant value."""
599 |     initial = tf.constant(0.1, shape=shape, dtype=tf.float64)
600 |     return tf.Variable(initial, name=name)
601 | 
602 | def get_test_predictions_for_df_with_task_column(model_predict_func, csv_path, task_column, tasks,
603 |                                                  wanted_label=None, num_feats_expected=None, label_name="",
604 |                                                  tasks_are_ints=True):
605 |     data_df = pd.DataFrame.from_csv(csv_path)
606 | 
607 |     wanted_feats = [x for x in data_df.columns.values if x != 'user_id' and x != 'timestamp' and 'ppt_id' not in x and x!= 'dataset' and '_Label' not in x and 'Cluster' not in x]
608 |     if num_feats_expected is not None and len(wanted_feats) != num_feats_expected:
609 |         print("Error! Found", len(wanted_feats),
610 |               "features but was expecting to find", num_feats_expected)
611 |         return
612 | 
613 |     if wanted_label is not None:
614 |         wanted_labels = [wanted_label]
615 |     else:
616 |         wanted_labels = [x for x in data_df.columns.values if '_Label' in x and 'tomorrow_' in x and 'Evening' in x and 'Alertness' not in x and 'Energy' not in x]
617 | 
618 |     data_df = normalizeAndFillDataDf(data_df, wanted_feats, wanted_labels)
619 | 
620 |     if label_name == "" and wanted_label is not None:
621 |         label_name = getFriendlyLabelName(wanted_label)
622 | 
623 |     for i,task_dict in enumerate(tasks):
624 |         task = task_dict['Name']
625 |         if tasks_are_ints:
626 |             task = int(task)
627 |         task_df = data_df[data_df[task_column]==task]
628 |         X = task_df[wanted_feats].as_matrix()
629 |         preds = model_predict_func(X, i)
630 |         data_df.loc[task_df.index.values,'test_pred_'+label_name] = preds
631 | 
632 |     print("Predictions have been computed and are stored in dataframe.")
633 | 
634 |     if wanted_label is not None and wanted_label in data_df.columns.values:
635 |         test_df = data_df[data_df['dataset']=='Test']
636 |         all_preds = test_df['test_pred_'+label_name].tolist()
637 |         all_true = test_df[wanted_label].tolist()
638 |         print("FINAL METRICS ON TEST SET:",
639 |               computeAllMetricsForPreds(all_preds, all_true))
640 |     else:
641 |         print("Cannot print test results unless wanted_label is set correctly")
642 | 
643 |     return data_df
644 | 
645 | def get_test_predictions_for_df_with_no_task_column(model_predict_func, csv_path, tasks,
646 |                                                     num_feats_expected=None):
647 |     data_df = pd.DataFrame.from_csv(csv_path)
648 | 
649 |     wanted_feats = [x for x in data_df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and '_Label' not in x and 'Cluster' not in x]
650 |     if num_feats_expected is not None and len(wanted_feats) != num_feats_expected:
651 |         print("Error! 
Found", len(wanted_feats), 652 | "features but was expecting to find", num_feats_expected) 653 | return 654 | 655 | for i,task_dict in enumerate(tasks): 656 | wanted_label = task_dict['Name'] 657 | label_name = getFriendlyLabelName(wanted_label) 658 | label_df = normalizeAndFillDataDf(copy.deepcopy(data_df), wanted_feats, [wanted_label]) 659 | 660 | X = label_df[wanted_feats].as_matrix() 661 | preds = model_predict_func(X, i) 662 | data_df.loc[label_df.index.values,'test_pred_'+label_name] = preds 663 | 664 | test_df = data_df[data_df['dataset']=='Test'] 665 | test_df = test_df.dropna(subset=[wanted_label], how='any') 666 | all_preds = test_df['test_pred_'+label_name].tolist() 667 | all_true = test_df[wanted_label].tolist() 668 | print("FINAL METRICS ON TEST SET for label", label_name, ":", 669 | computeAllMetricsForPreds(all_preds, all_true)) 670 | 671 | print("Predictions have been computed and are stored in dataframe.") 672 | 673 | return data_df 674 | -------------------------------------------------------------------------------- /jobs_to_run.txt: -------------------------------------------------------------------------------- 1 | NN job of some type - happiness 2 | python NeuralNetworks/tensorFlowWrapper.py Path/task_list_file-Happiness_ multitask wellbeing 3 | ../outputs/some_result_for_happiness.txt 4 | 5 | NN job of some type - calmness 6 | python NeuralNetworks/tensorFlowWrapper.py Path/task_list_file-Calmness_ multitask wellbeing 7 | ../outputs/some_result_for_calmness.txt -------------------------------------------------------------------------------- /make_datasets.py: -------------------------------------------------------------------------------- 1 | """ This file contains functions for converting a .csv dataset into the 2 | 'task dict list' format used by the rest of the code. The .csv file must 3 | have a particular format, with columns like 'user_id', and outcome columns 4 | containing '_Label'. For an example, see the file 'example_data.csv'. 5 | 6 | How to partition tasks: 7 | 'users-as-tasks': The .csv file will be partioned such that predicting 8 | the outcome of each user is one task. 9 | 'labels-as-tasks': The .csv file will be partitioned such that 10 | predicting related outcomes is each task (e.g. 
predicting stress
11 |         is one task and predicting happiness is another)
12 | """
13 | 
14 | import numpy as np
15 | import pandas as pd
16 | import sklearn as sk
17 | import sys
18 | import os
19 | import pickle
20 | import random
21 | import time
22 | import copy
23 | import argparse
24 | import helperFuncs as helper
25 | from sklearn.cross_validation import StratifiedShuffleSplit
26 | 
27 | CODE_PATH = os.path.dirname(os.getcwd())
28 | sys.path.append(CODE_PATH)
29 | 
30 | parser = argparse.ArgumentParser()
31 | parser.add_argument('--datafile', type=str, default='/Your/path/here/')
32 | parser.add_argument('--task_type', type=str, default='users',
33 |                     help="How to partition related tasks; can be 'users' so "
34 |                          "that predicting the outcome for each user is its own "
35 |                          "task, or 'labels', so that predicting related "
36 |                          "outcomes (like stress, happiness, etc) are their "
37 |                          "own tasks.")
38 | parser.add_argument('--target_label', type=str,
39 |                     default='tomorrow_Happiness_Evening_Label',
40 |                     help="Outcome label to predict for each user in "
41 |                          "users-as-tasks")
42 | parser.add_argument('--group_users_on', type=str,
43 |                     default='user_id',
44 |                     help="Name of column that indicates user or cluster ID "
45 |                          "for partitioning users into tasks.")
46 | 
47 | def getDatasetCoreNameAndPath(datafile):
48 |     core_name = os.path.basename(datafile)
49 |     core_name = os.path.splitext(core_name)[0]
50 |     path = os.path.splitext(datafile)[0].replace(core_name, '')
51 |     return core_name, path
52 | 
53 | def getLabelTaskListFromDataset(datafile, subdivide_phys=True):
54 |     """Partitions a .csv file into a task-dict-list pickle file by separating
55 |     related labels into the different tasks."""
56 |     df = pd.DataFrame.from_csv(datafile)
57 |     wanted_labels = [x for x in df.columns.values if '_Label' in x and 'tomorrow_' in x and 'Evening' in x and 'Alertness' not in x and 'Energy' not in x]
58 |     wanted_feats = [x for x in df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and x!='Cluster' and '_Label' not in x]
59 | 
60 |     core_name, data_path = getDatasetCoreNameAndPath(datafile)
61 | 
62 |     modality_dict = getModalityDict(wanted_feats, subdivide_phys=subdivide_phys)
63 | 
64 |     for dataset in ['Train','Val','Test']:
65 |         task_dict_list = []
66 |         for target_label in wanted_labels:
67 |             mini_df = helper.normalizeAndFillDataDf(df, wanted_feats, [target_label], suppress_output=True)
68 |             mini_df = mini_df.reindex(np.random.permutation(mini_df.index)) # reindex returns a copy; without the assignment the shuffle is lost
69 | 
70 |             X,y = helper.getTensorFlowMatrixData(mini_df, wanted_feats, [target_label], dataset=dataset, single_output=True)
71 |             task_dict = dict()
72 |             task_dict['X'] = X
73 |             task_dict['Y'] = y
74 |             task_dict['Name'] = target_label
75 |             task_dict['ModalityDict'] = modality_dict
76 |             task_dict_list.append(task_dict)
77 |         pickle.dump(task_dict_list, open(data_path + "datasetTaskList-" + core_name + "_" + dataset + ".p","wb"))
78 | 
79 | def getModalityDict(wanted_feats, subdivide_phys=False):
80 |     modalities = list(set([getFeatPrefix(x, subdivide_phys=subdivide_phys) for x in wanted_feats]))
81 |     mod_dict = dict()
82 |     for modality in modalities:
83 |         mod_dict[modality] = getStartIndex(wanted_feats, modality)
84 |     return mod_dict
85 | 
86 | def getStartIndex(wanted_feats, modality):
87 |     for i,s in enumerate(wanted_feats):
88 |         if modality[0:4] == 'phys' and 'H' in modality and modality != 'physTemp':
89 |             if modality + ':' in s:
90 |                 return i
91 |         else:
92 |             if modality + '_' in s:
93 |                 return i
94 | 
95 | def getFeatPrefix(feat_name, 
subdivide_phys=False): 96 | idx = feat_name.find('_') 97 | prefix = feat_name[0:idx] 98 | if not subdivide_phys or prefix != 'phys': 99 | return prefix 100 | else: 101 | idx = feat_name.find(':') 102 | return feat_name[0:idx] 103 | 104 | def getUserTaskListFromDataset(datafile, target_label, suppress_output=False, 105 | group_on='user_id', subdivide_phys=False): 106 | """Partitions a .csv file into a task-dict-list pickle file by separating 107 | different individuals (users) into the different tasks.""" 108 | df = pd.DataFrame.from_csv(datafile) 109 | wanted_feats = [x for x in df.columns.values if x != 'user_id' and x != 'timestamp' and x!= 'dataset' and x!='classifier_friendly_ppt_id' and 'Cluster' not in x and '_Label' not in x] 110 | 111 | df = helper.normalizeAndFillDataDf(df, wanted_feats, [target_label], suppress_output=True) 112 | df = df.reindex(np.random.permutation(df.index)) 113 | 114 | dataset_name, datapath = getDatasetCoreNameAndPath(datafile) 115 | label_name = helper.getFriendlyLabelName(target_label) 116 | 117 | modality_dict = getModalityDict(wanted_feats, subdivide_phys=subdivide_phys) 118 | 119 | train_task_dict_list = [] 120 | val_task_dict_list = [] 121 | test_task_dict_list = [] 122 | for user in df[group_on].unique(): 123 | if not suppress_output: 124 | print("Processing task", user) 125 | mini_df = df[df[group_on] == user] 126 | 127 | train_task_dict_list.append(constructTaskDict(user, mini_df, wanted_feats, target_label, modality_dict, 'Train')) 128 | val_task_dict_list.append(constructTaskDict(user, mini_df, wanted_feats, target_label, modality_dict, 'Val')) 129 | test_task_dict_list.append(constructTaskDict(user, mini_df, wanted_feats, target_label, modality_dict, 'Test')) 130 | 131 | if group_on == 'user_id': 132 | dataset_prefix = "datasetUserTaskList-" 133 | elif group_on == 'Cluster': 134 | dataset_prefix = 'datasetClusterTasks-' 135 | else: 136 | dataset_prefix = group_on 137 | pickle.dump(train_task_dict_list, open(datapath + dataset_prefix + dataset_name + "-" + label_name + "_Train.p","wb")) 138 | pickle.dump(val_task_dict_list, open(datapath + dataset_prefix + dataset_name + "-" + label_name + "_Val.p","wb")) 139 | pickle.dump(test_task_dict_list, open(datapath + dataset_prefix + dataset_name + "-" + label_name + "_Test.p","wb")) 140 | 141 | return dataset_prefix + dataset_name + "-" + label_name 142 | 143 | def constructTaskDict(task_name, mini_df, wanted_feats, target_label, modality_dict, dataset): 144 | X,y = helper.getTensorFlowMatrixData(mini_df, wanted_feats, [target_label], dataset=dataset, single_output=True) 145 | task_dict = dict() 146 | task_dict['X'] = X 147 | task_dict['Y'] = y 148 | task_dict['Name'] = task_name 149 | task_dict['ModalityDict'] = modality_dict 150 | return task_dict 151 | 152 | if __name__ == '__main__': 153 | kwargs = vars(parser.parse_args()) 154 | 155 | if kwargs['task_type'] == 'labels': 156 | print("Creating a label task-dict-list dataset where tasks are " 157 | "predicting related outcome labels.") 158 | getLabelTaskListFromDataset(kwargs['datafile']) 159 | else: 160 | print("Creating a user task-dict-list dataset where tasks are " 161 | "predicting the outcome of each different person (user).") 162 | getUserTaskListFromDataset(kwargs['datafile'], 163 | target_label=kwargs['target_label'], 164 | group_on=kwargs['group_users_on']) -------------------------------------------------------------------------------- /mtl_nn_clusters.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mitmedialab/PersonalizedMultitaskLearning/2de7d9485f5ac09264bfa624f16c5b05a5a44ada/mtl_nn_clusters.png
--------------------------------------------------------------------------------
/run_jobs.py:
--------------------------------------------------------------------------------
1 | """ This file allows multiple jobs to be run on a server. After each job, an
2 | email is sent to notify desired people of its completion.
3 | 
4 | Must specify a text job file that contains the names and commands for each
5 | job. Each job has 4 lines, containing:
6 |     1) the name,
7 |     2) the command,
8 |     3) the location of a file where the job output should be saved,
9 |     4) a blank line.
10 | 
11 | An example job file format is as follows:
12 | 
13 | Job1
14 | python job.py path1 arg1
15 | path/output1.txt
16 | 
17 | Job2
18 | python job.py path2 arg2
19 | path/output2.txt
20 | 
21 | Usage: python run_jobs.py jobs.txt
22 | """
23 | 
24 | import os
25 | import sys
26 | import smtplib
27 | import string
28 | from time import time
29 | import helperFuncs as helper
30 | 
31 | DEFAULT_EMAIL_LIST = ['myemail@gmail.com', 'youremail@gmail.com']
32 | SENDING_ADDRESS = 'myemail@gmail.com'
33 | MINIMUM_JOB_SECONDS = 600 # 10 minutes
34 | PRINT_LAST_X_LINES = 300
35 | ERROR = 1
36 | SUCCESS = 0
37 | WARNING = 2
38 | 
39 | 
40 | def reload_files():
41 |     reload(helper)
42 | 
43 | class Job:
44 |     def __init__(self, name, command, output_file):
45 |         self.name = name
46 |         self.command = command
47 |         self.output_file = output_file.rstrip('\n')
48 | 
49 | 
50 | def send_email(subject, text, to_addr_list=DEFAULT_EMAIL_LIST):
51 |     body = string.join(('From: %s' % SENDING_ADDRESS,
52 |                         'To: %s' % to_addr_list,
53 |                         'Subject: %s' % subject,
54 |                         '',
55 |                         text), '\r\n')
56 | 
57 |     try:
58 |         server = smtplib.SMTP('smtp.gmail.com:587') # NOTE: This is the Gmail SMTP port for STARTTLS.
59 |         server.ehlo() # this line was not required in a previous working version
60 |         server.starttls()
61 |         server.login(SENDING_ADDRESS, 'gmail_password')
62 |         server.sendmail(SENDING_ADDRESS, to_addr_list, body)
63 |         server.quit()
64 |         print "Email sent successfully!"
65 |     except:
66 |         print "Email failed to send!"
67 | 
68 | def load_job_file(filename):
69 |     f = open(filename, 'r')
70 |     lines = f.readlines()
71 | 
72 |     jobs = []
73 | 
74 |     i = 0
75 |     while i < len(lines):
76 |         jobname = lines[i]
77 |         command = lines[i+1]
78 |         output_file = lines[i+2]
79 |         job = Job(jobname, command, output_file)
80 |         jobs.append(job)
81 |         i = i+4
82 | 
83 |     return jobs
84 | 
85 | def run_job(job_obj):
86 |     """ Runs a system command for a job, returns whether it
87 |     succeeded and output text to be emailed.
88 | 
89 |     Inputs:
90 |         job_obj: an instance of the Job class
91 | 
92 |     Returns
93 |         A code indicating whether the job was successful, and
94 |         a string containing text about the job and job output to
95 |         be mailed to the user
96 |     """
97 | 
98 |     print "\nRunning job", job_obj.name
99 | 
100 |     if os.path.exists(job_obj.output_file):
101 |         message = "The desired output file " + job_obj.output_file + " already exists." 
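        # This guard is deliberate: rerunning a job would otherwise silently
        # overwrite the saved output of a finished run. Rename or delete the
        # old output file if the job really should run again.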
102 | print "Error!", message 103 | return ERROR, message 104 | 105 | t0 = time() 106 | 107 | # execute the command 108 | stream = os.popen(job_obj.command) 109 | output = stream.read() 110 | 111 | # save output to desired file 112 | of = open(job_obj.output_file, 'w') 113 | of.write(output) 114 | of.close() 115 | 116 | t1 = time() 117 | total_secs = t1 - t0 118 | 119 | hours, mins, secs = helper.get_secs_mins_hours_from_secs(total_secs) 120 | time_str = "Job ended. Total time taken: " + str(int(hours)) + "h " + str(int(mins)) + "m " + str(int(secs)) + "s" 121 | print time_str 122 | 123 | if not os.path.exists(job_obj.output_file): 124 | message = "Job failed to create the desired output file." 125 | print "Error!", message 126 | code = ERROR 127 | elif total_secs < MINIMUM_JOB_SECONDS: 128 | message = "The total time taken for the job was suspiciously short." 129 | print "Warning!", message 130 | code = WARNING 131 | else: 132 | message = "" 133 | print "Job finished successfully!" 134 | code = SUCCESS 135 | 136 | lines = output.split('\n') 137 | tail = "\n".join(lines[-PRINT_LAST_X_LINES:]) 138 | 139 | message += "\n\n" + time_str + "\n\n" 140 | message += "The last " + str(PRINT_LAST_X_LINES) + " lines of job output were:\n\n" 141 | message += tail 142 | 143 | return code, message 144 | 145 | def email_about_job(job_obj, status, output): 146 | if status == ERROR: 147 | title = "Error! Problem with job " + job_obj.name 148 | elif status == SUCCESS: 149 | title = "Success! Job " + job_obj.name + " is finished" 150 | else: 151 | title = "Warning! Job " + job_obj.name + " finished too quickly" 152 | 153 | send_email(title, output) 154 | 155 | def run_jobs(jobfile): 156 | jobs = load_job_file(filename) 157 | 158 | for job in jobs: 159 | status, output = run_job(job) 160 | email_about_job(job, status, output) 161 | 162 | send_email("ALL JOBS FINISHED!!", "Congratulations, all of the jobs in the file " + jobfile + " have finished running.") 163 | 164 | if __name__ == "__main__": 165 | if len(sys.argv) < 1: 166 | print "Error! Usage is python run_jobs.py jobs.txt" 167 | print "See this file's documentation for required format for jobs.txt" 168 | 169 | filename= sys.argv[1] 170 | jobfile=sys.argv[1] 171 | print "Running all jobs in file", jobfile, ". . ." 172 | 173 | run_jobs(jobfile) 174 | --------------------------------------------------------------------------------