├── README.md
├── nMIL.py
├── nMIL_delta.py
└── nMIL_omega.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# nested_Multi_Instance_Learning

Source code for the KDD 2016 paper "[Modeling Precursors for Event Forecasting via Nested Multi-Instance Learning](http://www.kdd.org/kdd2016/subtopic/view/modeling-precursors-for-event-forecasting-via-nested-multi-instance-learnin)".
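
## Usage (illustrative)

The scripts are Python 2 and expect the preprocessed inputs used in the paper's experiments. A hypothetical invocation of `nMIL.py` (all paths and names below are placeholders; the script composes the full input path from `--path`, `--country`, and `--leadtime`). `nMIL_delta.py` and `nMIL_omega.py` accept the same flags:

```bash
python nMIL.py -p /data/ -c Mexico \
    --train train.json --test test.json \
    --resultfile nmil_scores --outfile nmil_precursors \
    -l 1 -d 10 --beta 3.0 --gamma 0.5 -m0 0.5 -p0 0.5
```

Results are appended under `../result/`, which is assumed to exist. Each line of the train/test files is one JSON record; the field set below is inferred from the loading code (`read_data`/`prepare_data`), with made-up values. The `history` object carries one entry per history day, keyed `"1"` through `"10"` (abridged here):

```json
{"Id": "event-001", "Y": 1, "time": "2014-06-20T00:00:00",
 "history": {"1": {"ids": ["doc-17", "doc-42"], "cross_cosine": 1.0},
             "2": {"ids": ["doc-93"], "cross_cosine": 0.31}}}
```

The document-feature file (`news_deepfeature/news_doc2vec_<country>.json`) holds one JSON object per line with an `Id` and its `doc2vec` vector.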

--------------------------------------------------------------------------------
/nMIL.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Source code for the paper
"Modeling Precursors for Event Forecasting via Nested Multi-Instance Learning"
Authors: Yue Ning, Sathappan Muthiah, Huzefa Rangwala, Naren Ramakrishnan
Published in KDD 2016.
"""

import math
import random
from collections import OrderedDict
from datetime import datetime

import numpy as np
from numpy import linalg as la
from scipy.sparse import issparse, vstack as sparse_vstack
import sklearn.metrics


def sigmoid(x):
    return 1. / (1. + math.exp(-x))


def _vstack(m):
    """Stack a list of matrices vertically, handling sparse input."""
    if issparse(m[0]) or issparse(m[-1]):
        return sparse_vstack(m)
    else:
        return np.vstack(m)
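

# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original release): how the
# nested probability aggregation works.  A "superbag" is one event sample --
# a list of daily bags, each bag an (n_docs x dim) matrix of document
# vectors.  The instance probability is sigmoid(w . x_ijk), the bag (day)
# probability p_ij is the mean over documents, and the superbag (event)
# probability P_i is the mean over days.
def _demo_superbag_probability():
    rng = np.random.RandomState(0)
    w = rng.rand(300)                                  # toy weight vector
    superbag = [rng.rand(4, 300), rng.rand(2, 300)]    # two days of documents
    p_ij = [np.mean([sigmoid(np.dot(w, doc)) for doc in bag])
            for bag in superbag]                       # per-day probabilities
    return np.mean(p_ij)                               # event probability P_i
# ---------------------------------------------------------------------------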


class nMIL:
    def __init__(self, bagsize, beta=3.0, gamma=0.5, m0=0.5, p0=0.5):
        self.bagsize = bagsize
        self.beta = beta
        self.gamma = gamma
        self.m0 = m0
        self.p0 = p0

    def grad_func(self, X, Y, orgX, w):
        """Gradient of the nested multi-instance objective for one mini-batch."""
        beta, gamma, m0, p0 = self.beta, self.gamma, self.m0, self.p0
        first_matrix = []    # likelihood term
        second_matrix = []   # cross-day smoothness term
        third_matrix = []    # instance-level hinge term
        for i, superbag in enumerate(X):
            jk_plus = []     # per bag: indices of documents violating the margin
            P_i_list = []    # per-bag (daily) probabilities p_ij
            P_ijk_list = []  # per-document probabilities p_ijk
            n_ij_list = []   # documents per bag

            for j, bag in enumerate(superbag):
                n_ij = len(bag)
                k_plus = [k for k, doc in enumerate(bag)
                          if np.sign(sigmoid(np.dot(w, doc)) - p0) * np.dot(w, doc) < m0]
                p_ij_list = [sigmoid(np.dot(w, doc)) for doc in bag]
                jk_plus.append(k_plus)
                n_ij_list.append(n_ij)
                P_ijk_list.append(p_ij_list)
                P_i_list.append(np.mean(p_ij_list))

            P_i = np.mean(P_i_list)

            # Likelihood term, accumulated over documents.
            for j, x in enumerate(X[i]):
                if len(x) > 0:
                    row_list = [X[i][j][k] * P_ijk_list[j][k]
                                * (1. - P_ijk_list[j][k])
                                * ((Y[i] - P_i) / (P_i * (1. - P_i)))
                                * (1. / n_ij_list[j])
                                for k in xrange(len(X[i][j]))]
                    first_matrix.append(np.sum(np.array(row_list), axis=0))

            # Smoothness term: penalize jumps between consecutive days,
            # weighted by the cross-day cosine similarity.
            for j, x in enumerate(X[i]):
                if len(x) > 0:
                    row_list = [(X[i][j][k]
                                 * P_ijk_list[j][k]
                                 * (1. - P_ijk_list[j][k])
                                 * (1. / n_ij_list[j]))
                                for k in xrange(len(x))]
                    if j > 0 and len(X[i][j - 1]) > 0:
                        row_list2 = [(X[i][j - 1][k]
                                      * P_ijk_list[j - 1][k]
                                      * (1. - P_ijk_list[j - 1][k])
                                      * (1. / n_ij_list[j - 1]))
                                     for k in xrange(len(X[i][j - 1]))]
                        current_sum = np.sum(np.array(row_list), axis=0)
                        last_sum = np.sum(np.array(row_list2), axis=0)

                        derivative = 2. * (1. / len(X[i])) \
                            * (P_i_list[j] - P_i_list[j - 1]) \
                            * (current_sum - last_sum) \
                            * orgX[i][j]['cross_cosine'] \
                            / (len(X[i][j]) * len(X[i][j - 1]))
                        second_matrix.append(derivative)

            # Hinge term over margin-violating documents.
            for idj, kplus in enumerate(jk_plus):
                if len(kplus) > 0:
                    row_list = [X[i][idj][idk]
                                * np.sign(P_ijk_list[idj][idk] - p0)
                                * (1. / n_ij_list[idj])
                                for idk in kplus]
                    sum_row = np.sum(np.array(row_list), axis=0)
                    third_matrix.append(sum_row * (1. / n_ij_list[idj]))

        first_sum = np.sum(np.array(first_matrix), axis=0) * beta
        second_sum = np.sum(np.array(second_matrix), axis=0) * gamma
        third_sum = np.sum(np.array(third_matrix), axis=0) * gamma

        if len(second_matrix) > 0 and len(third_matrix) > 0:
            return -first_sum + second_sum - third_sum
        else:
            return -first_sum

    def prepare_data(self, docMap, X_docs, dataIndex, X, Y, Z, original):
        pos_count = 0
        allcount = 0
        for cVal in dataIndex.viewvalues():
            Z.append(cVal)
            if cVal['Y']:
                Y.append(1)
                pos_count += 1
            else:
                Y.append(0)

            combinedDoc = []
            featureDoc = []
            # Keep the most recent `bagsize` of the 10 history days.
            for i in range(max(1, 11 - self.bagsize), 11):
                if i == 1:
                    cosine_sim = 1.0
                else:
                    cosine_sim = cVal['history'][str(i)]['cross_cosine']

                doc_vec = X_docs[[docMap[d] for d in
                                  cVal['history'][str(i)]['ids']], :]
                doc_ids = cVal['history'][str(i)]['ids']
                x_dict = {'docvec': doc_vec,
                          'docids': doc_ids, 'cross_cosine': cosine_sim}
                combinedDoc.append(x_dict)
                featureDoc.append(doc_vec)

            original.append(combinedDoc)
            X.append(featureDoc)

            allcount += 1
        return pos_count, allcount

    def read_data(self, **kwargs):
        trainUnsorted = kwargs['trainIndex']
        testUnsorted = kwargs['testIndex']
        docIndex = kwargs['docIndex']
        trainIndex = OrderedDict(sorted(trainUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        testIndex = OrderedDict(sorted(testUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        print "train data start and end"
        print trainIndex[trainIndex.keys()[0]]['time']
        print trainIndex[trainIndex.keys()[-1]]['time']

        print "test data start and end"
        print testIndex[testIndex.keys()[0]]['time']
        print testIndex[testIndex.keys()[-1]]['time']
        print "number of documents:", len(docIndex)

        docItems = docIndex.items()
        self.x_dimension = len(docIndex.values()[0])
        docMap = {val[0]: index for index, val in enumerate(docItems)}
        X_docs = np.array([k[1] for k in docItems])

        self._trainY = []
        self._trainX = []
        self.original_trainX = []
        self._trainZ = []
        pc_train, c_train = self.prepare_data(docMap, X_docs, trainIndex,
                                              self._trainX, self._trainY,
                                              self._trainZ, self.original_trainX)
        print "Training: positive %d, negative %d" % (pc_train, c_train - pc_train)

        self._testY = []
        self._testX = []
        self.original_testX = []
        self._testZ = []
        pc_test, c_test = self.prepare_data(docMap, X_docs, testIndex,
                                            self._testX, self._testY,
                                            self._testZ, self.original_testX)
        print "Testing: positive %d, negative %d" % (pc_test, c_test - pc_test)
        return
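
    # ------------------------------------------------------------------
    # Explanatory note (added): the optimizer below is Pegasos-style SGD.
    # At step t, with learning rate eta_t = 1 / (lambda * (t + 1)), the
    # update over a mini-batch B is
    #     w <- (1 - eta_t * lambda) * w - eta_t * grad / |B|
    # followed by projection onto the L2 ball of radius 1 / sqrt(lambda):
    #     w <- min(1, (1 / sqrt(lambda)) / ||w||) * w
    # ------------------------------------------------------------------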

    def SGD(self):
        train_X = np.array(self._trainX)
        train_Y = np.array(self._trainY)
        print "X: {}".format(train_X.shape)
        print "Y: {}".format(train_Y.shape)
        OriginalTrain_X = self.original_trainX

        test_X = np.array(self._testX)
        test_Y = np.array(self._testY)

        lambd = 0.05
        iteration = 2000
        n_sgd = 10
        test_f1_arr = []
        test_recall_arr = []
        test_prec_arr = []
        test_roc_arr = []
        test_acc_arr = []
        train_score = []
        for expr in range(n_sgd):
            w = np.random.rand(self.x_dimension)
            for t in range(iteration):
                eta = 1. / ((t + 1) * lambd)

                # Mini-batch of 10 superbags.
                kset = random.sample(range(len(train_X)), 10)
                X = np.array([train_X[z] for z in kset])
                Y = np.array([train_Y[z] for z in kset])
                orgX = [OriginalTrain_X[k] for k in kset]
                delta_w = self.grad_func(X, Y, orgX, w)

                new_w = (1 - eta * lambd) * w - eta * delta_w / len(X)

                # Pegasos-style projection onto the ball of radius 1/sqrt(lambd).
                rate = (1. / np.sqrt(lambd)) * (1. / la.norm(new_w))
                if rate < 1.:
                    w = rate * new_w
                else:
                    w = new_w

            pred_Y = []
            gsr_history_probs = {}
            for idx, testx in enumerate(test_X):
                p_ij_list = [[sigmoid(np.dot(w, doc)) for doc in day]
                             for day in testx]
                days_list = [np.mean(p_ijk_list) for p_ijk_list in p_ij_list]
                P_i = np.mean(days_list)
                pred_Y.append(1 if P_i > 0.5 else 0)

                bag = self.original_testX[idx]
                bag_gsrId = self._testZ[idx]['Id']

                # int() cast so the dict stays JSON-serializable.
                gsr_history_probs[bag_gsrId] = {'trueY': int(test_Y[idx])}
                for day_idx, histday in enumerate(bag):
                    doc_vec = histday['docvec']
                    doc_ids = histday['docids']
                    today_probs = [sigmoid(np.dot(w, doc)) for doc in doc_vec]
                    gsr_history_probs[bag_gsrId][day_idx] = zip(doc_ids, today_probs)

            test_f1 = sklearn.metrics.f1_score(test_Y, pred_Y)
            test_recall = sklearn.metrics.recall_score(test_Y, pred_Y)
            test_precision = sklearn.metrics.precision_score(test_Y, pred_Y)
            test_roc = sklearn.metrics.roc_auc_score(test_Y, pred_Y)
            test_acc = sklearn.metrics.accuracy_score(test_Y, pred_Y)

            test_f1_arr.append(test_f1)
            test_recall_arr.append(test_recall)
            test_prec_arr.append(test_precision)
            test_roc_arr.append(test_roc)
            test_acc_arr.append(test_acc)

        test_score = [np.mean(np.array(test_acc_arr)),
                      np.mean(np.array(test_recall_arr)),
                      np.mean(np.array(test_prec_arr)),
                      np.mean(np.array(test_f1_arr)),
                      np.mean(np.array(test_roc_arr))]
        print "test bagsize:", self.bagsize,
        print "accuracy:", np.mean(np.array(test_acc_arr)),
        print "recall:", np.mean(np.array(test_recall_arr)),
        print "precision:", np.mean(np.array(test_prec_arr)),
        print "f1-score:", np.mean(np.array(test_f1_arr))

        return self, test_score, train_score, gsr_history_probs
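

# Minimal numeric sketch (added) of the projection step used in SGD above:
# after the gradient update, w is rescaled onto the L2 ball of radius
# 1/sqrt(lambda) whenever it falls outside.  Illustrative helper only, not
# called by the pipeline.
def _demo_pegasos_projection(w, lambd=0.05):
    rate = 1. / (np.sqrt(lambd) * la.norm(w))
    return rate * w if rate < 1. else w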


def main(args):
    import json

    trainfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.train)
    trainf = args.path + trainfname

    # Testing data
    testfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.test)
    testf = args.path + testfname

    dfname = "news_deepfeature/news_doc2vec_%s.json" % args.country
    docf = args.path + dfname

    # Output files
    resultf = '../result/{}_{}_lt-{}.txt'.format(args.resultfile, args.country, args.leadtime)
    trainMap = {}
    with open(trainf) as infile:
        for line in infile:
            trainMap[len(trainMap)] = json.loads(line.strip())

    testMap = {}
    with open(testf) as infile:
        for line in infile:
            testMap[len(testMap)] = json.loads(line.strip())

    with open(docf) as infile:
        docMap = {j['Id']: j['doc2vec'] for j in
                  (json.loads(l) for l in infile)}

    day = args.historyDays

    start = datetime.now()
    model = nMIL(bagsize=day, beta=args.beta, gamma=args.gamma, m0=args.m0, p0=args.p0)
    print "Learning for Bag Size: %d" % day
    model.read_data(trainIndex=trainMap, testIndex=testMap, docIndex=docMap)
    model, perf1, perf2, gsrHistoryProbs = model.SGD()

    print "run-time: {}".format(datetime.now() - start)
    w1 = open(resultf, 'a')
    w1.write('\t'.join([str(score) for score in perf1]) + '\n')
    w1.close()
    outf1 = "../result/{}_{}_lt-{}_hd-{}_.json".format(args.outfile, args.country, args.leadtime, day)
    w2 = open(outf1, 'w')
    w2.write(json.dumps(gsrHistoryProbs))
    w2.close()

    print "Running nMIL model for forecasting on country %s, leadtime %d" % (args.country, args.leadtime)


if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--path", help="path of data")
    ap.add_argument("-c", "--country", help="country")
    ap.add_argument("--train", help="path of training data")
    ap.add_argument("--test", help="path of testing data")
    ap.add_argument("--resultfile", help="path of result file")
    ap.add_argument("--outfile", help="path of precursor file")
    ap.add_argument("-l", "--leadtime", type=int, default=1, help="k days before GSR to forecast")
    ap.add_argument("-d", "--historyDays", type=int, default=10, help="number of history days to be used for training")
    ap.add_argument("-m0", type=float, default=.5, help="hyper-parameter in hinge loss")
    ap.add_argument("-p0", type=float, default=.5, help="hyper-parameter in hinge loss")
    ap.add_argument("--gamma", type=float, default=.5, help="parameter in SGD")
    ap.add_argument("--beta", type=float, default=3.0, help="parameter in SGD")
    args = ap.parse_args()
    main(args)

--------------------------------------------------------------------------------
/nMIL_delta.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Source code for the paper
"Modeling Precursors for Event Forecasting via Nested Multi-Instance Learning"
Authors: Yue Ning, Sathappan Muthiah, Huzefa Rangwala, Naren Ramakrishnan
Published in KDD 2016.

Vectorized variant of nMIL.py.
"""

import math
import random
import time
from collections import OrderedDict

import numpy as np
from numpy import linalg as la
from scipy.special import expit as scipy_sig
import sklearn.metrics


def sigmoid(x):
    return 1. / (1. + math.exp(-x))
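

# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original release): nMIL_delta
# vectorizes the per-document loop of nMIL.py.  The two forms below compute
# the same per-document probabilities for one bag; the matrix form is what
# grad_func uses.
def _demo_vectorized_bag(bag, w):
    looped = np.array([sigmoid(np.dot(w, doc)) for doc in bag])
    vectorized = scipy_sig(np.dot(bag, w[:, np.newaxis])).ravel()
    assert np.allclose(looped, vectorized)
    return vectorized
# ---------------------------------------------------------------------------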


class nMIL_delta:
    def __init__(self, bagsize, beta=3.0, gamma=0.5, m0=0.5, p0=0.5):
        self.bagsize = bagsize
        self.beta = beta
        self.gamma = gamma
        self.m0 = m0
        self.p0 = p0

    def grad_func(self, X, Y, orgX, w, beta, gamma, m0, p0):
        first_matrix = []    # likelihood term
        second_matrix = []   # cross-day smoothness term
        third_matrix = []    # instance-level hinge term

        for i, superbag in enumerate(X):
            P_i_list = []
            combinedbag_loss = []
            combinedDoc_loss = []
            for j, bag in enumerate(superbag):
                bag_dotp = np.dot(bag, w[:, np.newaxis])
                p_ij_list = scipy_sig(bag_dotp)
                hinge_loss = np.sign(p_ij_list - p0)
                P_i_list.append(np.mean(p_ij_list))
                bagloss = np.dot((p_ij_list * (1 - p_ij_list)).T, bag) / len(bag)
                combinedbag_loss.append(bagloss)
                # Documents violating the margin m0.
                mask = ((hinge_loss * bag_dotp) < m0)
                combinedDoc_loss.append(np.dot((hinge_loss * mask).T, bag)
                                        / len(bag))

            combinedbag_loss = np.concatenate(combinedbag_loss, axis=0)
            combinedDoc_loss = np.concatenate(combinedDoc_loss, axis=0)
            P_i = np.mean(P_i_list)
            neg_loglh = (Y[i] - P_i) / (P_i * (1. - P_i))

            superbagloss = neg_loglh * combinedbag_loss
            first_matrix.append(np.sum(superbagloss, axis=0))

            # Differences of consecutive per-bag losses (day 0 against zero).
            diffbag = np.concatenate([np.zeros((1, self.x_dimension)),
                                      combinedbag_loss], axis=0)
            crossbag_loss = np.diff(diffbag, axis=0)
            if len(superbag) == 1:
                cross_cosine_combined = 1
            else:
                cross_cosine_combined = np.array(
                    [0.] + [(orgX[i][j]['cross_cosine'] /
                             (len(X[i][j]) * len(X[i][j - 1])))
                            for j in range(1, len(superbag))])[:, np.newaxis]

            crossbag_derivative = 2 * ((np.diff([0] + P_i_list)[:, np.newaxis] *
                                        crossbag_loss * cross_cosine_combined))
            crossbag_derivative = (crossbag_derivative / len(X[i])).sum(axis=0)
            second_matrix.append(crossbag_derivative)
            third_matrix.append(combinedDoc_loss.sum(axis=0))

        first_sum = np.sum(np.array(first_matrix), axis=0) * beta
        second_sum = np.sum(np.array(second_matrix), axis=0) * gamma
        third_sum = np.sum(np.array(third_matrix), axis=0)  # * gamma
        if len(second_matrix) > 0 and len(third_matrix) > 0:
            return -first_sum + second_sum - third_sum
        else:
            return -first_sum
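
    # ------------------------------------------------------------------
    # Explanatory note (added): the smoothness term above relies on a
    # diff trick -- a zero row is prepended to the stacked per-bag losses
    # so that np.diff(..., axis=0) yields (loss_j - loss_{j-1}) for every
    # day j, with day 0 differenced against zero; np.diff([0] + P_i_list)
    # does the same for the bag probabilities p_ij.  The [0.] entry in
    # cross_cosine_combined zeroes out that artificial first difference.
    # ------------------------------------------------------------------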

    def prepare_data(self, docMap, X_docs, dataIndex, X, Y, Z, original):
        pos_count = 0
        allcount = 0
        for cVal in dataIndex.viewvalues():
            Z.append(cVal)
            if cVal['Y']:
                Y.append(1)
                pos_count += 1
            else:
                Y.append(0)

            combinedDoc = []
            featureDoc = []
            # Keep the most recent `bagsize` of the 10 history days.
            for i in range(max(1, 11 - self.bagsize), 11):
                if i == 1:
                    cosine_sim = 1.0
                else:
                    cosine_sim = cVal['history'][str(i)]['cross_cosine']

                doc_vec = X_docs[[docMap[d] for d in
                                  cVal['history'][str(i)]['ids']], :]
                doc_ids = cVal['history'][str(i)]['ids']
                x_dict = {'docvec': doc_vec,
                          'docids': doc_ids, 'cross_cosine': cosine_sim}
                combinedDoc.append(x_dict)
                featureDoc.append(doc_vec)

            original.append(combinedDoc)
            X.append(featureDoc)

            allcount += 1
        return pos_count, allcount

    def read_data(self, **kwargs):
        trainUnsorted = kwargs['trainIndex']
        testUnsorted = kwargs['testIndex']
        docIndex = kwargs['docIndex']
        trainIndex = OrderedDict(sorted(trainUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        testIndex = OrderedDict(sorted(testUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        print "train data start and end"
        print trainIndex[trainIndex.keys()[0]]['time']
        print trainIndex[trainIndex.keys()[-1]]['time']

        print "test data start and end"
        print testIndex[testIndex.keys()[0]]['time']
        print testIndex[testIndex.keys()[-1]]['time']
        print "number of documents:", len(docIndex)

        docItems = docIndex.items()
        self.x_dimension = len(docIndex.values()[0])
        docMap = {val[0]: index for index, val in enumerate(docItems)}
        X_docs = np.array([k[1] for k in docItems])

        self._trainY = []
        self._trainX = []
        self.original_trainX = []
        self._trainZ = []
        pc_train, c_train = self.prepare_data(docMap, X_docs, trainIndex,
                                              self._trainX, self._trainY,
                                              self._trainZ, self.original_trainX)
        print "Training: positive %d, negative %d" % (pc_train, c_train - pc_train)

        self._testY = []
        self._testX = []
        self.original_testX = []
        self._testZ = []
        pc_test, c_test = self.prepare_data(docMap, X_docs, testIndex,
                                            self._testX, self._testY,
                                            self._testZ, self.original_testX)
        print "Testing: positive %d, negative %d" % (pc_test, c_test - pc_test)
        return

    def SGD(self, leadtime):
        train_X = np.array(self._trainX)
        train_Y = np.array(self._trainY)
        print "X: {}".format(train_X.shape)
        print "Y: {}".format(train_Y.shape)
        OriginalTrain_X = self.original_trainX

        test_X = np.array(self._testX)
        test_Y = np.array(self._testY)

        lambd = 0.05
        iteration = 2000
        n_sgd = 10
        test_f1_arr = []
        test_recall_arr = []
        test_prec_arr = []
        test_roc_arr = []
        test_acc_arr = []
        train_score = []
        print "using params:\n gamma:{}, beta:{}, m0:{}, p0:{}".format(self.gamma, self.beta,
                                                                       self.m0, self.p0)
        for expr in range(n_sgd):
            w = np.random.rand(self.x_dimension)
            for t in range(iteration):
                eta = 1. / ((t + 1) * lambd)

                # Mini-batch of 10 superbags.
                kset = random.sample(range(len(train_X)), 10)
                X = train_X[kset]
                Y = train_Y[kset]
                orgX = [OriginalTrain_X[k] for k in kset]
                delta_w = self.grad_func(X, Y, orgX, w, self.beta,
                                         self.gamma, self.m0, self.p0)

                new_w = (1 - eta * lambd) * w - eta * delta_w / len(X)

                # Pegasos-style projection onto the ball of radius 1/sqrt(lambd).
                rate = (1. / np.sqrt(lambd)) * (1. / la.norm(new_w))
                if rate < 1.:
                    w = rate * new_w
                else:
                    w = new_w

            pred_Y = []
            pred_probs = []
            gsr_history_probs = {}
            for idx, testx in enumerate(test_X):
                days_list = [np.mean(scipy_sig(np.dot(bag, w[:, np.newaxis])))
                             for bag in testx]

                P_i = np.mean(days_list)
                pred_probs.append(P_i)
                if P_i > 0.5:
                    pred_Y.append(1)
                else:
                    pred_Y.append(0)

                bag = self.original_testX[idx]
                bag_gsrId = self._testZ[idx]['Id']
                # int() cast so the dict stays JSON-serializable.
                gsr_history_probs[bag_gsrId] = {'trueY': int(test_Y[idx])}
                for day_idx, histday in enumerate(bag):
                    doc_vec = histday['docvec']
                    doc_ids = histday['docids']
                    today_probs = [sigmoid(np.dot(w, doc)) for doc in doc_vec]
                    gsr_history_probs[bag_gsrId][day_idx] = zip(doc_ids, today_probs)

            test_f1 = sklearn.metrics.f1_score(test_Y, pred_Y)
            test_recall = sklearn.metrics.recall_score(test_Y, pred_Y)
            test_precision = sklearn.metrics.precision_score(test_Y, pred_Y)
            test_roc = sklearn.metrics.roc_auc_score(test_Y, pred_Y)
            test_acc = sklearn.metrics.accuracy_score(test_Y, pred_Y)

            test_f1_arr.append(test_f1)
            test_recall_arr.append(test_recall)
            test_prec_arr.append(test_precision)
            test_roc_arr.append(test_roc)
            test_acc_arr.append(test_acc)

        test_score = [np.mean(np.array(test_acc_arr)),
                      np.mean(np.array(test_recall_arr)),
                      np.mean(np.array(test_prec_arr)),
                      np.mean(np.array(test_f1_arr)),
                      np.mean(np.array(test_roc_arr))]
        print "test bagsize:", self.bagsize,
        print "accuracy:", np.mean(np.array(test_acc_arr)),
        print "recall:", np.mean(np.array(test_recall_arr)),
        print "precision:", np.mean(np.array(test_prec_arr)),
        print "f1-score:", np.mean(np.array(test_f1_arr))

        return self, test_score, train_score, gsr_history_probs, test_Y, pred_probs
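

# Illustrative prediction sketch (added): given a learned weight vector w and
# one nested test sample (a list of daily doc-vector matrices), the event
# probability is the mean over days of the mean per-document sigmoid score,
# thresholded at 0.5 -- the rule used at the end of SGD above.
def _demo_predict(superbag, w):
    days = [np.mean(scipy_sig(np.dot(bag, w[:, np.newaxis])))
            for bag in superbag]
    P_i = np.mean(days)
    return int(P_i > 0.5), P_i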


def main(args):
    import json

    trainfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.train)
    trainf = args.path + trainfname
    # Testing data
    testfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.test)
    testf = args.path + testfname

    dfname = "news_deepfeature/news_doc2vec_%s.json" % args.country
    docf = args.path + dfname

    # Output files
    resultf = '../result/{}_{}_lt-{}.txt'.format(args.resultfile, args.country, args.leadtime)
    trainMap = {}
    with open(trainf) as infile:
        for line in infile:
            trainMap[len(trainMap)] = json.loads(line.strip())

    testMap = {}
    with open(testf) as infile:
        for line in infile:
            testMap[len(testMap)] = json.loads(line.strip())

    with open(docf) as infile:
        docMap = {j['Id']: j['doc2vec'] for j in
                  (json.loads(l) for l in infile)}

    day = args.historyDays

    start = time.time()
    model = nMIL_delta(day, beta=args.beta, gamma=args.gamma, m0=args.m0, p0=args.p0)
    print "Learning for Bag Size: %d" % day
    model.read_data(trainIndex=trainMap, testIndex=testMap, docIndex=docMap)

    model, perf1, perf2, gsrHistoryProbs, test_Y, pred_probs = model.SGD(args.leadtime)
    w1 = open(resultf, 'a')
    w1.write('\t'.join([str(score) for score in perf1]) + '\n')
    w1.close()
    outf1 = "../result/{}_{}_lt-{}_hd-{}_.json".format(args.outfile, args.country, args.leadtime, day)
    w2 = open(outf1, 'w')
    w2.write(json.dumps(gsrHistoryProbs))
    w2.close()

    print "Running nMIL delta model for forecasting on country %s, leadtime %d" % (args.country, args.leadtime)
    print "run-time:{}s".format(time.time() - start)


if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--path", help="path of data")
    ap.add_argument("-c", "--country", help="country")
    ap.add_argument("--train", help="path of training data")
    ap.add_argument("--test", help="path of testing data")
    ap.add_argument("--resultfile", help="path of result file")
    ap.add_argument("--outfile", help="path of precursor file")
    ap.add_argument("-l", "--leadtime", type=int, default=1, help="k days before events to forecast")
    ap.add_argument("-d", "--historyDays", type=int, default=10, help="number of history days to be used for training")
    ap.add_argument("-m0", type=float, default=.5, help="hyper-parameter in hinge loss")
    ap.add_argument("-p0", type=float, default=.5, help="hyper-parameter in hinge loss")
    ap.add_argument("--gamma", type=float, default=.5, help="parameter in SGD")
    ap.add_argument("--beta", type=float, default=3.0, help="parameter in SGD")
    args = ap.parse_args()
    main(args)

--------------------------------------------------------------------------------
/nMIL_omega.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Source code for the paper
"Modeling Precursors for Event Forecasting via Nested Multi-Instance Learning"
Authors: Yue Ning, Sathappan Muthiah, Huzefa Rangwala, Naren Ramakrishnan
Published in KDD 2016.

Variant with one weight vector per history day.
"""

import math
import random
import time
from collections import OrderedDict

import numpy as np
from numpy import linalg as la
from scipy.special import expit as scipy_sig
import sklearn.metrics


def sigmoid(x):
    return 1. / (1. + math.exp(-x))
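

# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original release): the omega
# variant learns one weight vector per history day, so w is a
# (bagsize x dim) matrix and day j is scored with row w[j].  Toy example
# with two days:
def _demo_per_day_scoring():
    rng = np.random.RandomState(0)
    w = rng.rand(2, 300)                               # one weight row per day
    superbag = [rng.rand(4, 300), rng.rand(3, 300)]    # two days of documents
    days = [np.mean(scipy_sig(np.dot(bag, w[j, :][:, np.newaxis])))
            for j, bag in enumerate(superbag)]
    return np.mean(days)                               # event probability P_i
# ---------------------------------------------------------------------------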


class nMIL_omega:
    def __init__(self, bagsize, beta=3.0, gamma=0.5, m0=0.5, p0=0.5):
        self.bagsize = bagsize
        self.beta = beta
        self.gamma = gamma
        self.m0 = m0
        self.p0 = p0

    def grad_func(self, X, Y, orgX, w, beta, gamma, m0, p0):
        first_matrix = []    # likelihood term
        second_matrix = []   # cross-day smoothness term
        third_matrix = []    # instance-level hinge term

        for i, superbag in enumerate(X):
            P_i_list = []
            combinedbag_loss = []
            combinedDoc_loss = []
            for j, bag in enumerate(superbag):
                # Day j is scored with its own weight row w[j].
                bag_dotp = np.dot(bag, w[j, :][:, np.newaxis])
                p_ij_list = scipy_sig(bag_dotp)
                hinge_loss = np.sign(p_ij_list - p0)
                P_i_list.append(np.mean(p_ij_list))
                bagloss = np.dot((p_ij_list * (1 - p_ij_list)).T, bag) / len(bag)
                combinedbag_loss.append(bagloss)
                mask = ((hinge_loss * bag_dotp) < m0)
                combinedDoc_loss.append(np.dot((hinge_loss * mask).T, bag)
                                        / len(bag))

            combinedbag_loss = np.concatenate(combinedbag_loss, axis=0)
            combinedDoc_loss = np.concatenate(combinedDoc_loss, axis=0)
            P_i = np.mean(P_i_list)
            neg_loglh = (Y[i] - P_i) / (P_i * (1. - P_i))
            superbagloss = neg_loglh * combinedbag_loss
            first_matrix.append(superbagloss)

            # Differences of consecutive per-bag losses (day 0 against zero).
            diffbag = np.concatenate([np.zeros((1, self.x_dimension)),
                                      combinedbag_loss], axis=0)
            crossbag_loss = np.diff(diffbag, axis=0)
            # No cross-day cosine weighting in this variant: every pair of
            # consecutive days gets constant weight 1.
            cross_cosine_combined = np.ones((len(superbag), 1))

            crossbag_derivative = 2 * ((np.diff([0] + P_i_list)[:, np.newaxis] *
                                        crossbag_loss * cross_cosine_combined))
            crossbag_derivative = crossbag_derivative / len(X[i])
            second_matrix.append(crossbag_derivative)
            third_matrix.append(combinedDoc_loss)

        first_sum = np.sum(np.array(first_matrix), axis=0) * beta
        second_sum = np.sum(np.array(second_matrix), axis=0) * gamma
        third_sum = np.sum(np.array(third_matrix), axis=0)  # * gamma
        if len(second_matrix) > 0 and len(third_matrix) > 0:
            return -first_sum + second_sum - third_sum
        else:
            return -first_sum
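
    # ------------------------------------------------------------------
    # Explanatory note (added): unlike nMIL/nMIL_delta, grad_func returns
    # a (bagsize x dim) matrix -- per-day loss rows are kept instead of
    # being summed across days -- matching the per-day weight matrix w
    # that SGD below optimizes.
    # ------------------------------------------------------------------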

    def prepare_data(self, docMap, X_docs, dataIndex, X, Y, Z, original):
        pos_count = 0
        allcount = 0
        for cVal in dataIndex.viewvalues():
            Z.append(cVal)
            if cVal['Y']:
                Y.append(1)
                pos_count += 1
            else:
                Y.append(0)

            combinedDoc = []
            featureDoc = []
            # Keep the most recent `bagsize` of the 10 history days.
            for i in range(max(1, 11 - self.bagsize), 11):
                if i == 1:
                    cosine_sim = 1.0
                else:
                    cosine_sim = cVal['history'][str(i)]['cross_cosine']

                doc_vec = X_docs[[docMap[d] for d in
                                  cVal['history'][str(i)]['ids']], :]
                doc_ids = cVal['history'][str(i)]['ids']
                x_dict = {'docvec': doc_vec,
                          'docids': doc_ids, 'cross_cosine': cosine_sim}
                combinedDoc.append(x_dict)
                featureDoc.append(doc_vec)

            original.append(combinedDoc)
            X.append(featureDoc)

            allcount += 1
        return pos_count, allcount

    def read_data(self, **kwargs):
        trainUnsorted = kwargs['trainIndex']
        testUnsorted = kwargs['testIndex']
        docIndex = kwargs['docIndex']
        trainIndex = OrderedDict(sorted(trainUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        testIndex = OrderedDict(sorted(testUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        print "train data start and end"
        print trainIndex[trainIndex.keys()[0]]['time']
        print trainIndex[trainIndex.keys()[-1]]['time']

        print "test data start and end"
        print testIndex[testIndex.keys()[0]]['time']
        print testIndex[testIndex.keys()[-1]]['time']
        print "number of documents:", len(docIndex)

        docItems = docIndex.items()
        self.x_dimension = len(docIndex.values()[0])
        docMap = {val[0]: index for index, val in enumerate(docItems)}
        X_docs = np.array([k[1] for k in docItems])

        self._trainY = []
        self._trainX = []
        self.original_trainX = []
        self._trainZ = []
        pc_train, c_train = self.prepare_data(docMap, X_docs, trainIndex,
                                              self._trainX, self._trainY,
                                              self._trainZ, self.original_trainX)
        print "Training: positive %d, negative %d" % (pc_train, c_train - pc_train)

        self._testY = []
        self._testX = []
        self.original_testX = []
        self._testZ = []
        pc_test, c_test = self.prepare_data(docMap, X_docs, testIndex,
                                            self._testX, self._testY,
                                            self._testZ, self.original_testX)
        print "Testing: positive %d, negative %d" % (pc_test, c_test - pc_test)
        return
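
    # ------------------------------------------------------------------
    # Explanatory note (added): SGD below also assembles the precursor
    # output gsr_history_probs, mapping each test event Id to its true
    # label plus, per history day, (document id, probability) pairs,
    # e.g. (identifiers made up):
    #     {"event-001": {"trueY": 1,
    #                    0: [("doc-17", 0.83), ("doc-42", 0.12)],
    #                    1: [("doc-93", 0.67)]}}
    # ------------------------------------------------------------------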

    def SGD(self):
        train_X = np.array(self._trainX)
        train_Y = np.array(self._trainY)
        print "X: {}".format(train_X.shape)
        print "Y: {}".format(train_Y.shape)
        OriginalTrain_X = self.original_trainX

        test_X = np.array(self._testX)
        test_Y = np.array(self._testY)

        lambd = 0.05
        iteration = 2000
        n_sgd = 10
        test_f1_arr = []
        test_recall_arr = []
        test_prec_arr = []
        test_roc_arr = []
        test_acc_arr = []
        train_score = []
        print "using params:\n gamma:{}, beta:{}, m0:{}, p0:{}".format(self.gamma, self.beta,
                                                                       self.m0, self.p0)
        for expr in range(n_sgd):
            # One weight row per history day.
            w = np.random.rand(self.bagsize, self.x_dimension)
            for t in range(iteration):
                eta = 1. / ((t + 1) * lambd)

                # Mini-batch of 10 superbags.
                kset = random.sample(range(len(train_X)), 10)
                X = train_X[kset]
                Y = train_Y[kset]
                orgX = [OriginalTrain_X[k] for k in kset]
                delta_w = self.grad_func(X, Y, orgX, w, self.beta,
                                         self.gamma, self.m0, self.p0)

                new_w = (1 - eta * lambd) * w - eta * delta_w / len(X)

                # Pegasos-style projection onto the ball of radius 1/sqrt(lambd).
                rate = (1. / np.sqrt(lambd)) * (1. / la.norm(new_w))
                if rate < 1.:
                    w = rate * new_w
                else:
                    w = new_w

            pred_Y = []
            pred_probs = []
            gsr_history_probs = {}
            for idx, testx in enumerate(test_X):
                days_list = [np.mean(scipy_sig(np.dot(bag, w[j, :][:, np.newaxis])))
                             for j, bag in enumerate(testx)]

                P_i = np.mean(days_list)
                pred_probs.append(P_i)
                if P_i > 0.5:
                    pred_Y.append(1)
                else:
                    pred_Y.append(0)
                bag = self.original_testX[idx]
                bag_gsrId = self._testZ[idx]['Id']

                # int() cast so the dict stays JSON-serializable.
                gsr_history_probs[bag_gsrId] = {'trueY': int(test_Y[idx])}
                for day_idx, histday in enumerate(bag):
                    doc_vec = histday['docvec']
                    doc_ids = histday['docids']
                    # Score each day's documents with that day's weight row.
                    today_probs = [sigmoid(np.dot(w[day_idx], doc)) for doc in doc_vec]
                    gsr_history_probs[bag_gsrId][day_idx] = zip(doc_ids, today_probs)

            test_f1 = sklearn.metrics.f1_score(test_Y, pred_Y)
            test_recall = sklearn.metrics.recall_score(test_Y, pred_Y)
            test_precision = sklearn.metrics.precision_score(test_Y, pred_Y)
            test_roc = sklearn.metrics.roc_auc_score(test_Y, pred_Y)
            test_acc = sklearn.metrics.accuracy_score(test_Y, pred_Y)

            test_f1_arr.append(test_f1)
            test_recall_arr.append(test_recall)
            test_prec_arr.append(test_precision)
            test_roc_arr.append(test_roc)
            test_acc_arr.append(test_acc)

        test_score = [np.mean(np.array(test_acc_arr)),
                      np.mean(np.array(test_recall_arr)),
                      np.mean(np.array(test_prec_arr)),
                      np.mean(np.array(test_f1_arr)),
                      np.mean(np.array(test_roc_arr))]

        print "test bagsize:", self.bagsize,
        print "accuracy:", np.mean(np.array(test_acc_arr)),
        print "recall:", np.mean(np.array(test_recall_arr)),
        print "precision:", np.mean(np.array(test_prec_arr)),
        print "f1-score:", np.mean(np.array(test_f1_arr))

        return self, test_score, train_score, gsr_history_probs, test_Y, pred_probs


def main(args):
    import json

    trainfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.train)
    trainf = args.path + trainfname
    # Testing data
    testfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.test)
    testf = args.path + testfname

    dfname = "news_deepfeature/news_doc2vec_%s.json" % args.country
    docf = args.path + dfname

    # Output files
    resultf = '../result/{}_{}_lt-{}.txt'.format(args.resultfile, args.country, args.leadtime)
    trainMap = {}
    with open(trainf) as infile:
        for line in infile:
            trainMap[len(trainMap)] = json.loads(line.strip())

    testMap = {}
    with open(testf) as infile:
        for line in infile:
            testMap[len(testMap)] = json.loads(line.strip())

    with open(docf) as infile:
        docMap = {j['Id']: j['doc2vec'] for j in
                  (json.loads(l) for l in infile)}

    day = args.historyDays

    start = time.time()
    model = nMIL_omega(day, beta=args.beta, gamma=args.gamma, m0=args.m0, p0=args.p0)
    print "Learning for Bag Size: %d" % day
    model.read_data(trainIndex=trainMap, testIndex=testMap, docIndex=docMap)

    model, perf1, perf2, gsrHistoryProbs, test_Y, pred_probs = model.SGD()
    w1 = open(resultf, 'a')
    w1.write('\t'.join([str(score) for score in perf1]) + '\n')
    w1.close()
    outf = "../result/{}_{}_lt-{}_hd-{}_.json".format(args.outfile, args.country, args.leadtime, day)
    w2 = open(outf, 'w')
    w2.write(json.dumps(gsrHistoryProbs))
    w2.close()
    print "Running nMIL omega model for forecasting on country %s, leadtime %d" % (args.country, args.leadtime)
    print "run-time:{}s".format(time.time() - start)
"--historyDays", type=int, default=10, help="number of history days to be used for training") 336 | ap.add_argument("-m0", type=float, default=.5, help="hyper parameter in hinge loss") 337 | ap.add_argument("-p0", type=float, default=.5, help="hyper parameter in hinge loss") 338 | ap.add_argument("--gamma", type=float, default=.5, help="parameter in SGD") 339 | ap.add_argument("--beta", type=float, default=3.0, help="parameter in SGD") 340 | args = ap.parse_args() 341 | main(args) 342 | --------------------------------------------------------------------------------