├── README.md
├── nMIL.py
├── nMIL_delta.py
└── nMIL_omega.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# nested_Multi_Instance_Learning

Source code for the KDD 2016 paper "[Modeling Precursors for Event Forecasting via Nested Multi-Instance Learning](http://www.kdd.org/kdd2016/subtopic/view/modeling-precursors-for-event-forecasting-via-nested-multi-instance-learnin)".
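
## Usage (illustrative)

The scripts are Python 2 and expect the preprocessed inputs used in the paper's experiments. A hypothetical invocation of `nMIL.py` (all paths and names below are placeholders; the script composes the full input path from `--path`, `--country`, and `--leadtime`). `nMIL_delta.py` and `nMIL_omega.py` accept the same flags:

```bash
python nMIL.py -p /data/ -c Mexico \
    --train train.json --test test.json \
    --resultfile nmil_scores --outfile nmil_precursors \
    -l 1 -d 10 --beta 3.0 --gamma 0.5 -m0 0.5 -p0 0.5
```

Results are appended under `../result/`, which is assumed to exist. Each line of the train/test files is one JSON record; the field set below is inferred from the loading code (`read_data`/`prepare_data`), with made-up values. The `history` object carries one entry per history day, keyed `"1"` through `"10"` (abridged here):

```json
{"Id": "event-001", "Y": 1, "time": "2014-06-20T00:00:00",
 "history": {"1": {"ids": ["doc-17", "doc-42"], "cross_cosine": 1.0},
             "2": {"ids": ["doc-93"], "cross_cosine": 0.31}}}
```

The document-feature file (`news_deepfeature/news_doc2vec_<country>.json`) holds one JSON object per line with an `Id` and its `doc2vec` vector.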

--------------------------------------------------------------------------------
/nMIL.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Source code for the paper
"Modeling Precursors for Event Forecasting via Nested Multi-Instance Learning"
Authors: Yue Ning, Sathappan Muthiah, Huzefa Rangwala, Naren Ramakrishnan
Published in KDD 2016.
"""

import math
import random
from collections import OrderedDict
from datetime import datetime

import numpy as np
from numpy import linalg as la
from scipy.sparse import issparse, vstack as sparse_vstack
import sklearn.metrics


def sigmoid(x):
    return 1. / (1. + math.exp(-x))


def _vstack(m):
    """Stack a list of matrices vertically, handling sparse input."""
    if issparse(m[0]) or issparse(m[-1]):
        return sparse_vstack(m)
    else:
        return np.vstack(m)
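

# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original release): how the
# nested probability aggregation works.  A "superbag" is one event sample --
# a list of daily bags, each bag an (n_docs x dim) matrix of document
# vectors.  The instance probability is sigmoid(w . x_ijk), the bag (day)
# probability p_ij is the mean over documents, and the superbag (event)
# probability P_i is the mean over days.
def _demo_superbag_probability():
    rng = np.random.RandomState(0)
    w = rng.rand(300)                                  # toy weight vector
    superbag = [rng.rand(4, 300), rng.rand(2, 300)]    # two days of documents
    p_ij = [np.mean([sigmoid(np.dot(w, doc)) for doc in bag])
            for bag in superbag]                       # per-day probabilities
    return np.mean(p_ij)                               # event probability P_i
# ---------------------------------------------------------------------------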


class nMIL:
    def __init__(self, bagsize, beta=3.0, gamma=0.5, m0=0.5, p0=0.5):
        self.bagsize = bagsize
        self.beta = beta
        self.gamma = gamma
        self.m0 = m0
        self.p0 = p0

    def grad_func(self, X, Y, orgX, w):
        """Gradient of the nested multi-instance objective for one mini-batch."""
        beta, gamma, m0, p0 = self.beta, self.gamma, self.m0, self.p0
        first_matrix = []    # likelihood term
        second_matrix = []   # cross-day smoothness term
        third_matrix = []    # instance-level hinge term
        for i, superbag in enumerate(X):
            jk_plus = []     # per bag: indices of documents violating the margin
            P_i_list = []    # per-bag (daily) probabilities p_ij
            P_ijk_list = []  # per-document probabilities p_ijk
            n_ij_list = []   # documents per bag

            for j, bag in enumerate(superbag):
                n_ij = len(bag)
                k_plus = [k for k, doc in enumerate(bag)
                          if np.sign(sigmoid(np.dot(w, doc)) - p0) * np.dot(w, doc) < m0]
                p_ij_list = [sigmoid(np.dot(w, doc)) for doc in bag]
                jk_plus.append(k_plus)
                n_ij_list.append(n_ij)
                P_ijk_list.append(p_ij_list)
                P_i_list.append(np.mean(p_ij_list))

            P_i = np.mean(P_i_list)

            # Likelihood term, accumulated over documents.
            for j, x in enumerate(X[i]):
                if len(x) > 0:
                    row_list = [X[i][j][k] * P_ijk_list[j][k]
                                * (1. - P_ijk_list[j][k])
                                * ((Y[i] - P_i) / (P_i * (1. - P_i)))
                                * (1. / n_ij_list[j])
                                for k in xrange(len(X[i][j]))]
                    first_matrix.append(np.sum(np.array(row_list), axis=0))

            # Smoothness term: penalize jumps between consecutive days,
            # weighted by the cross-day cosine similarity.
            for j, x in enumerate(X[i]):
                if len(x) > 0:
                    row_list = [(X[i][j][k]
                                 * P_ijk_list[j][k]
                                 * (1. - P_ijk_list[j][k])
                                 * (1. / n_ij_list[j]))
                                for k in xrange(len(x))]
                    if j > 0 and len(X[i][j - 1]) > 0:
                        row_list2 = [(X[i][j - 1][k]
                                      * P_ijk_list[j - 1][k]
                                      * (1. - P_ijk_list[j - 1][k])
                                      * (1. / n_ij_list[j - 1]))
                                     for k in xrange(len(X[i][j - 1]))]
                        current_sum = np.sum(np.array(row_list), axis=0)
                        last_sum = np.sum(np.array(row_list2), axis=0)

                        derivative = 2. * (1. / len(X[i])) \
                            * (P_i_list[j] - P_i_list[j - 1]) \
                            * (current_sum - last_sum) \
                            * orgX[i][j]['cross_cosine'] \
                            / (len(X[i][j]) * len(X[i][j - 1]))
                        second_matrix.append(derivative)

            # Hinge term over margin-violating documents.
            for idj, kplus in enumerate(jk_plus):
                if len(kplus) > 0:
                    row_list = [X[i][idj][idk]
                                * np.sign(P_ijk_list[idj][idk] - p0)
                                * (1. / n_ij_list[idj])
                                for idk in kplus]
                    sum_row = np.sum(np.array(row_list), axis=0)
                    third_matrix.append(sum_row * (1. / n_ij_list[idj]))

        first_sum = np.sum(np.array(first_matrix), axis=0) * beta
        second_sum = np.sum(np.array(second_matrix), axis=0) * gamma
        third_sum = np.sum(np.array(third_matrix), axis=0) * gamma

        if len(second_matrix) > 0 and len(third_matrix) > 0:
            return -first_sum + second_sum - third_sum
        else:
            return -first_sum

    def prepare_data(self, docMap, X_docs, dataIndex, X, Y, Z, original):
        pos_count = 0
        allcount = 0
        for cVal in dataIndex.viewvalues():
            Z.append(cVal)
            if cVal['Y']:
                Y.append(1)
                pos_count += 1
            else:
                Y.append(0)

            combinedDoc = []
            featureDoc = []
            # Keep the most recent `bagsize` of the 10 history days.
            for i in range(max(1, 11 - self.bagsize), 11):
                if i == 1:
                    cosine_sim = 1.0
                else:
                    cosine_sim = cVal['history'][str(i)]['cross_cosine']

                doc_vec = X_docs[[docMap[d] for d in
                                  cVal['history'][str(i)]['ids']], :]
                doc_ids = cVal['history'][str(i)]['ids']
                x_dict = {'docvec': doc_vec,
                          'docids': doc_ids, 'cross_cosine': cosine_sim}
                combinedDoc.append(x_dict)
                featureDoc.append(doc_vec)

            original.append(combinedDoc)
            X.append(featureDoc)

            allcount += 1
        return pos_count, allcount

    def read_data(self, **kwargs):
        trainUnsorted = kwargs['trainIndex']
        testUnsorted = kwargs['testIndex']
        docIndex = kwargs['docIndex']
        trainIndex = OrderedDict(sorted(trainUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        testIndex = OrderedDict(sorted(testUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        print "train data start and end"
        print trainIndex[trainIndex.keys()[0]]['time']
        print trainIndex[trainIndex.keys()[-1]]['time']

        print "test data start and end"
        print testIndex[testIndex.keys()[0]]['time']
        print testIndex[testIndex.keys()[-1]]['time']
        print "number of documents:", len(docIndex)

        docItems = docIndex.items()
        self.x_dimension = len(docIndex.values()[0])
        docMap = {val[0]: index for index, val in enumerate(docItems)}
        X_docs = np.array([k[1] for k in docItems])

        self._trainY = []
        self._trainX = []
        self.original_trainX = []
        self._trainZ = []
        pc_train, c_train = self.prepare_data(docMap, X_docs, trainIndex,
                                              self._trainX, self._trainY,
                                              self._trainZ, self.original_trainX)
        print "Training: positive %d, negative %d" % (pc_train, c_train - pc_train)

        self._testY = []
        self._testX = []
        self.original_testX = []
        self._testZ = []
        pc_test, c_test = self.prepare_data(docMap, X_docs, testIndex,
                                            self._testX, self._testY,
                                            self._testZ, self.original_testX)
        print "Testing: positive %d, negative %d" % (pc_test, c_test - pc_test)
        return
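
    # ------------------------------------------------------------------
    # Explanatory note (added): the optimizer below is Pegasos-style SGD.
    # At step t, with learning rate eta_t = 1 / (lambda * (t + 1)), the
    # update over a mini-batch B is
    #     w <- (1 - eta_t * lambda) * w - eta_t * grad / |B|
    # followed by projection onto the L2 ball of radius 1 / sqrt(lambda):
    #     w <- min(1, (1 / sqrt(lambda)) / ||w||) * w
    # ------------------------------------------------------------------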

    def SGD(self):
        train_X = np.array(self._trainX)
        train_Y = np.array(self._trainY)
        print "X: {}".format(train_X.shape)
        print "Y: {}".format(train_Y.shape)
        OriginalTrain_X = self.original_trainX

        test_X = np.array(self._testX)
        test_Y = np.array(self._testY)

        lambd = 0.05
        iteration = 2000
        n_sgd = 10
        test_f1_arr = []
        test_recall_arr = []
        test_prec_arr = []
        test_roc_arr = []
        test_acc_arr = []
        train_score = []
        for expr in range(n_sgd):
            w = np.random.rand(self.x_dimension)
            for t in range(iteration):
                eta = 1. / ((t + 1) * lambd)

                # Mini-batch of 10 superbags.
                kset = random.sample(range(len(train_X)), 10)
                X = np.array([train_X[z] for z in kset])
                Y = np.array([train_Y[z] for z in kset])
                orgX = [OriginalTrain_X[k] for k in kset]
                delta_w = self.grad_func(X, Y, orgX, w)

                new_w = (1 - eta * lambd) * w - eta * delta_w / len(X)

                # Pegasos-style projection onto the ball of radius 1/sqrt(lambd).
                rate = (1. / np.sqrt(lambd)) * (1. / la.norm(new_w))
                if rate < 1.:
                    w = rate * new_w
                else:
                    w = new_w

            pred_Y = []
            gsr_history_probs = {}
            for idx, testx in enumerate(test_X):
                p_ij_list = [[sigmoid(np.dot(w, doc)) for doc in day]
                             for day in testx]
                days_list = [np.mean(p_ijk_list) for p_ijk_list in p_ij_list]
                P_i = np.mean(days_list)
                pred_Y.append(1 if P_i > 0.5 else 0)

                bag = self.original_testX[idx]
                bag_gsrId = self._testZ[idx]['Id']

                # int() cast so the dict stays JSON-serializable.
                gsr_history_probs[bag_gsrId] = {'trueY': int(test_Y[idx])}
                for day_idx, histday in enumerate(bag):
                    doc_vec = histday['docvec']
                    doc_ids = histday['docids']
                    today_probs = [sigmoid(np.dot(w, doc)) for doc in doc_vec]
                    gsr_history_probs[bag_gsrId][day_idx] = zip(doc_ids, today_probs)

            test_f1 = sklearn.metrics.f1_score(test_Y, pred_Y)
            test_recall = sklearn.metrics.recall_score(test_Y, pred_Y)
            test_precision = sklearn.metrics.precision_score(test_Y, pred_Y)
            test_roc = sklearn.metrics.roc_auc_score(test_Y, pred_Y)
            test_acc = sklearn.metrics.accuracy_score(test_Y, pred_Y)

            test_f1_arr.append(test_f1)
            test_recall_arr.append(test_recall)
            test_prec_arr.append(test_precision)
            test_roc_arr.append(test_roc)
            test_acc_arr.append(test_acc)

        test_score = [np.mean(np.array(test_acc_arr)),
                      np.mean(np.array(test_recall_arr)),
                      np.mean(np.array(test_prec_arr)),
                      np.mean(np.array(test_f1_arr)),
                      np.mean(np.array(test_roc_arr))]
        print "test bagsize:", self.bagsize,
        print "accuracy:", np.mean(np.array(test_acc_arr)),
        print "recall:", np.mean(np.array(test_recall_arr)),
        print "precision:", np.mean(np.array(test_prec_arr)),
        print "f1-score:", np.mean(np.array(test_f1_arr))

        return self, test_score, train_score, gsr_history_probs
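

# Minimal numeric sketch (added) of the projection step used in SGD above:
# after the gradient update, w is rescaled onto the L2 ball of radius
# 1/sqrt(lambda) whenever it falls outside.  Illustrative helper only, not
# called by the pipeline.
def _demo_pegasos_projection(w, lambd=0.05):
    rate = 1. / (np.sqrt(lambd) * la.norm(w))
    return rate * w if rate < 1. else w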


def main(args):
    import json

    trainfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.train)
    trainf = args.path + trainfname

    # Testing data
    testfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.test)
    testf = args.path + testfname

    dfname = "news_deepfeature/news_doc2vec_%s.json" % args.country
    docf = args.path + dfname

    # Output files
    resultf = '../result/{}_{}_lt-{}.txt'.format(args.resultfile, args.country, args.leadtime)
    trainMap = {}
    with open(trainf) as infile:
        for line in infile:
            trainMap[len(trainMap)] = json.loads(line.strip())

    testMap = {}
    with open(testf) as infile:
        for line in infile:
            testMap[len(testMap)] = json.loads(line.strip())

    with open(docf) as infile:
        docMap = {j['Id']: j['doc2vec'] for j in
                  (json.loads(l) for l in infile)}

    day = args.historyDays

    start = datetime.now()
    model = nMIL(bagsize=day, beta=args.beta, gamma=args.gamma, m0=args.m0, p0=args.p0)
    print "Learning for Bag Size: %d" % day
    model.read_data(trainIndex=trainMap, testIndex=testMap, docIndex=docMap)
    model, perf1, perf2, gsrHistoryProbs = model.SGD()

    print "run-time: {}".format(datetime.now() - start)
    w1 = open(resultf, 'a')
    w1.write('\t'.join([str(score) for score in perf1]) + '\n')
    w1.close()
    outf1 = "../result/{}_{}_lt-{}_hd-{}_.json".format(args.outfile, args.country, args.leadtime, day)
    w2 = open(outf1, 'w')
    w2.write(json.dumps(gsrHistoryProbs))
    w2.close()

    print "Running nMIL model for forecasting on country %s, leadtime %d" % (args.country, args.leadtime)


if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--path", help="path of data")
    ap.add_argument("-c", "--country", help="country")
    ap.add_argument("--train", help="path of training data")
    ap.add_argument("--test", help="path of testing data")
    ap.add_argument("--resultfile", help="path of result file")
    ap.add_argument("--outfile", help="path of precursor file")
    ap.add_argument("-l", "--leadtime", type=int, default=1, help="k days before GSR to forecast")
    ap.add_argument("-d", "--historyDays", type=int, default=10, help="number of history days to be used for training")
    ap.add_argument("-m0", type=float, default=.5, help="hyper-parameter in hinge loss")
    ap.add_argument("-p0", type=float, default=.5, help="hyper-parameter in hinge loss")
    ap.add_argument("--gamma", type=float, default=.5, help="parameter in SGD")
    ap.add_argument("--beta", type=float, default=3.0, help="parameter in SGD")
    args = ap.parse_args()
    main(args)

--------------------------------------------------------------------------------
/nMIL_delta.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Source code for the paper
"Modeling Precursors for Event Forecasting via Nested Multi-Instance Learning"
Authors: Yue Ning, Sathappan Muthiah, Huzefa Rangwala, Naren Ramakrishnan
Published in KDD 2016.

Vectorized variant of nMIL.py.
"""

import math
import random
import time
from collections import OrderedDict

import numpy as np
from numpy import linalg as la
from scipy.special import expit as scipy_sig
import sklearn.metrics


def sigmoid(x):
    return 1. / (1. + math.exp(-x))
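

# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original release): nMIL_delta
# vectorizes the per-document loop of nMIL.py.  The two forms below compute
# the same per-document probabilities for one bag; the matrix form is what
# grad_func uses.
def _demo_vectorized_bag(bag, w):
    looped = np.array([sigmoid(np.dot(w, doc)) for doc in bag])
    vectorized = scipy_sig(np.dot(bag, w[:, np.newaxis])).ravel()
    assert np.allclose(looped, vectorized)
    return vectorized
# ---------------------------------------------------------------------------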


class nMIL_delta:
    def __init__(self, bagsize, beta=3.0, gamma=0.5, m0=0.5, p0=0.5):
        self.bagsize = bagsize
        self.beta = beta
        self.gamma = gamma
        self.m0 = m0
        self.p0 = p0

    def grad_func(self, X, Y, orgX, w, beta, gamma, m0, p0):
        first_matrix = []    # likelihood term
        second_matrix = []   # cross-day smoothness term
        third_matrix = []    # instance-level hinge term

        for i, superbag in enumerate(X):
            P_i_list = []
            combinedbag_loss = []
            combinedDoc_loss = []
            for j, bag in enumerate(superbag):
                bag_dotp = np.dot(bag, w[:, np.newaxis])
                p_ij_list = scipy_sig(bag_dotp)
                hinge_loss = np.sign(p_ij_list - p0)
                P_i_list.append(np.mean(p_ij_list))
                bagloss = np.dot((p_ij_list * (1 - p_ij_list)).T, bag) / len(bag)
                combinedbag_loss.append(bagloss)
                # Documents violating the margin m0.
                mask = ((hinge_loss * bag_dotp) < m0)
                combinedDoc_loss.append(np.dot((hinge_loss * mask).T, bag)
                                        / len(bag))

            combinedbag_loss = np.concatenate(combinedbag_loss, axis=0)
            combinedDoc_loss = np.concatenate(combinedDoc_loss, axis=0)
            P_i = np.mean(P_i_list)
            neg_loglh = (Y[i] - P_i) / (P_i * (1. - P_i))

            superbagloss = neg_loglh * combinedbag_loss
            first_matrix.append(np.sum(superbagloss, axis=0))

            # Differences of consecutive per-bag losses (day 0 against zero).
            diffbag = np.concatenate([np.zeros((1, self.x_dimension)),
                                      combinedbag_loss], axis=0)
            crossbag_loss = np.diff(diffbag, axis=0)
            if len(superbag) == 1:
                cross_cosine_combined = 1
            else:
                cross_cosine_combined = np.array(
                    [0.] + [(orgX[i][j]['cross_cosine'] /
                             (len(X[i][j]) * len(X[i][j - 1])))
                            for j in range(1, len(superbag))])[:, np.newaxis]

            crossbag_derivative = 2 * ((np.diff([0] + P_i_list)[:, np.newaxis] *
                                        crossbag_loss * cross_cosine_combined))
            crossbag_derivative = (crossbag_derivative / len(X[i])).sum(axis=0)
            second_matrix.append(crossbag_derivative)
            third_matrix.append(combinedDoc_loss.sum(axis=0))

        first_sum = np.sum(np.array(first_matrix), axis=0) * beta
        second_sum = np.sum(np.array(second_matrix), axis=0) * gamma
        third_sum = np.sum(np.array(third_matrix), axis=0)  # * gamma
        if len(second_matrix) > 0 and len(third_matrix) > 0:
            return -first_sum + second_sum - third_sum
        else:
            return -first_sum
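
    # ------------------------------------------------------------------
    # Explanatory note (added): the smoothness term above relies on a
    # diff trick -- a zero row is prepended to the stacked per-bag losses
    # so that np.diff(..., axis=0) yields (loss_j - loss_{j-1}) for every
    # day j, with day 0 differenced against zero; np.diff([0] + P_i_list)
    # does the same for the bag probabilities p_ij.  The [0.] entry in
    # cross_cosine_combined zeroes out that artificial first difference.
    # ------------------------------------------------------------------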

    def prepare_data(self, docMap, X_docs, dataIndex, X, Y, Z, original):
        pos_count = 0
        allcount = 0
        for cVal in dataIndex.viewvalues():
            Z.append(cVal)
            if cVal['Y']:
                Y.append(1)
                pos_count += 1
            else:
                Y.append(0)

            combinedDoc = []
            featureDoc = []
            # Keep the most recent `bagsize` of the 10 history days.
            for i in range(max(1, 11 - self.bagsize), 11):
                if i == 1:
                    cosine_sim = 1.0
                else:
                    cosine_sim = cVal['history'][str(i)]['cross_cosine']

                doc_vec = X_docs[[docMap[d] for d in
                                  cVal['history'][str(i)]['ids']], :]
                doc_ids = cVal['history'][str(i)]['ids']
                x_dict = {'docvec': doc_vec,
                          'docids': doc_ids, 'cross_cosine': cosine_sim}
                combinedDoc.append(x_dict)
                featureDoc.append(doc_vec)

            original.append(combinedDoc)
            X.append(featureDoc)

            allcount += 1
        return pos_count, allcount

    def read_data(self, **kwargs):
        trainUnsorted = kwargs['trainIndex']
        testUnsorted = kwargs['testIndex']
        docIndex = kwargs['docIndex']
        trainIndex = OrderedDict(sorted(trainUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        testIndex = OrderedDict(sorted(testUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        print "train data start and end"
        print trainIndex[trainIndex.keys()[0]]['time']
        print trainIndex[trainIndex.keys()[-1]]['time']

        print "test data start and end"
        print testIndex[testIndex.keys()[0]]['time']
        print testIndex[testIndex.keys()[-1]]['time']
        print "number of documents:", len(docIndex)

        docItems = docIndex.items()
        self.x_dimension = len(docIndex.values()[0])
        docMap = {val[0]: index for index, val in enumerate(docItems)}
        X_docs = np.array([k[1] for k in docItems])

        self._trainY = []
        self._trainX = []
        self.original_trainX = []
        self._trainZ = []
        pc_train, c_train = self.prepare_data(docMap, X_docs, trainIndex,
                                              self._trainX, self._trainY,
                                              self._trainZ, self.original_trainX)
        print "Training: positive %d, negative %d" % (pc_train, c_train - pc_train)

        self._testY = []
        self._testX = []
        self.original_testX = []
        self._testZ = []
        pc_test, c_test = self.prepare_data(docMap, X_docs, testIndex,
                                            self._testX, self._testY,
                                            self._testZ, self.original_testX)
        print "Testing: positive %d, negative %d" % (pc_test, c_test - pc_test)
        return

    def SGD(self, leadtime):
        train_X = np.array(self._trainX)
        train_Y = np.array(self._trainY)
        print "X: {}".format(train_X.shape)
        print "Y: {}".format(train_Y.shape)
        OriginalTrain_X = self.original_trainX

        test_X = np.array(self._testX)
        test_Y = np.array(self._testY)

        lambd = 0.05
        iteration = 2000
        n_sgd = 10
        test_f1_arr = []
        test_recall_arr = []
        test_prec_arr = []
        test_roc_arr = []
        test_acc_arr = []
        train_score = []
        print "using params:\n gamma:{}, beta:{}, m0:{}, p0:{}".format(self.gamma, self.beta,
                                                                       self.m0, self.p0)
        for expr in range(n_sgd):
            w = np.random.rand(self.x_dimension)
            for t in range(iteration):
                eta = 1. / ((t + 1) * lambd)

                # Mini-batch of 10 superbags.
                kset = random.sample(range(len(train_X)), 10)
                X = train_X[kset]
                Y = train_Y[kset]
                orgX = [OriginalTrain_X[k] for k in kset]
                delta_w = self.grad_func(X, Y, orgX, w, self.beta,
                                         self.gamma, self.m0, self.p0)

                new_w = (1 - eta * lambd) * w - eta * delta_w / len(X)

                # Pegasos-style projection onto the ball of radius 1/sqrt(lambd).
                rate = (1. / np.sqrt(lambd)) * (1. / la.norm(new_w))
                if rate < 1.:
                    w = rate * new_w
                else:
                    w = new_w

            pred_Y = []
            pred_probs = []
            gsr_history_probs = {}
            for idx, testx in enumerate(test_X):
                days_list = [np.mean(scipy_sig(np.dot(bag, w[:, np.newaxis])))
                             for bag in testx]

                P_i = np.mean(days_list)
                pred_probs.append(P_i)
                if P_i > 0.5:
                    pred_Y.append(1)
                else:
                    pred_Y.append(0)

                bag = self.original_testX[idx]
                bag_gsrId = self._testZ[idx]['Id']
                # int() cast so the dict stays JSON-serializable.
                gsr_history_probs[bag_gsrId] = {'trueY': int(test_Y[idx])}
                for day_idx, histday in enumerate(bag):
                    doc_vec = histday['docvec']
                    doc_ids = histday['docids']
                    today_probs = [sigmoid(np.dot(w, doc)) for doc in doc_vec]
                    gsr_history_probs[bag_gsrId][day_idx] = zip(doc_ids, today_probs)

            test_f1 = sklearn.metrics.f1_score(test_Y, pred_Y)
            test_recall = sklearn.metrics.recall_score(test_Y, pred_Y)
            test_precision = sklearn.metrics.precision_score(test_Y, pred_Y)
            test_roc = sklearn.metrics.roc_auc_score(test_Y, pred_Y)
            test_acc = sklearn.metrics.accuracy_score(test_Y, pred_Y)

            test_f1_arr.append(test_f1)
            test_recall_arr.append(test_recall)
            test_prec_arr.append(test_precision)
            test_roc_arr.append(test_roc)
            test_acc_arr.append(test_acc)

        test_score = [np.mean(np.array(test_acc_arr)),
                      np.mean(np.array(test_recall_arr)),
                      np.mean(np.array(test_prec_arr)),
                      np.mean(np.array(test_f1_arr)),
                      np.mean(np.array(test_roc_arr))]
        print "test bagsize:", self.bagsize,
        print "accuracy:", np.mean(np.array(test_acc_arr)),
        print "recall:", np.mean(np.array(test_recall_arr)),
        print "precision:", np.mean(np.array(test_prec_arr)),
        print "f1-score:", np.mean(np.array(test_f1_arr))

        return self, test_score, train_score, gsr_history_probs, test_Y, pred_probs
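

# Illustrative prediction sketch (added): given a learned weight vector w and
# one nested test sample (a list of daily doc-vector matrices), the event
# probability is the mean over days of the mean per-document sigmoid score,
# thresholded at 0.5 -- the rule used at the end of SGD above.
def _demo_predict(superbag, w):
    days = [np.mean(scipy_sig(np.dot(bag, w[:, np.newaxis])))
            for bag in superbag]
    P_i = np.mean(days)
    return int(P_i > 0.5), P_i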


def main(args):
    import json

    trainfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.train)
    trainf = args.path + trainfname
    # Testing data
    testfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.test)
    testf = args.path + testfname

    dfname = "news_deepfeature/news_doc2vec_%s.json" % args.country
    docf = args.path + dfname

    # Output files
    resultf = '../result/{}_{}_lt-{}.txt'.format(args.resultfile, args.country, args.leadtime)
    trainMap = {}
    with open(trainf) as infile:
        for line in infile:
            trainMap[len(trainMap)] = json.loads(line.strip())

    testMap = {}
    with open(testf) as infile:
        for line in infile:
            testMap[len(testMap)] = json.loads(line.strip())

    with open(docf) as infile:
        docMap = {j['Id']: j['doc2vec'] for j in
                  (json.loads(l) for l in infile)}

    day = args.historyDays

    start = time.time()
    model = nMIL_delta(day, beta=args.beta, gamma=args.gamma, m0=args.m0, p0=args.p0)
    print "Learning for Bag Size: %d" % day
    model.read_data(trainIndex=trainMap, testIndex=testMap, docIndex=docMap)

    model, perf1, perf2, gsrHistoryProbs, test_Y, pred_probs = model.SGD(args.leadtime)
    w1 = open(resultf, 'a')
    w1.write('\t'.join([str(score) for score in perf1]) + '\n')
    w1.close()
    outf1 = "../result/{}_{}_lt-{}_hd-{}_.json".format(args.outfile, args.country, args.leadtime, day)
    w2 = open(outf1, 'w')
    w2.write(json.dumps(gsrHistoryProbs))
    w2.close()

    print "Running nMIL delta model for forecasting on country %s, leadtime %d" % (args.country, args.leadtime)
    print "run-time:{}s".format(time.time() - start)


if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--path", help="path of data")
    ap.add_argument("-c", "--country", help="country")
    ap.add_argument("--train", help="path of training data")
    ap.add_argument("--test", help="path of testing data")
    ap.add_argument("--resultfile", help="path of result file")
    ap.add_argument("--outfile", help="path of precursor file")
    ap.add_argument("-l", "--leadtime", type=int, default=1, help="k days before events to forecast")
    ap.add_argument("-d", "--historyDays", type=int, default=10, help="number of history days to be used for training")
    ap.add_argument("-m0", type=float, default=.5, help="hyper-parameter in hinge loss")
    ap.add_argument("-p0", type=float, default=.5, help="hyper-parameter in hinge loss")
    ap.add_argument("--gamma", type=float, default=.5, help="parameter in SGD")
    ap.add_argument("--beta", type=float, default=3.0, help="parameter in SGD")
    args = ap.parse_args()
    main(args)

--------------------------------------------------------------------------------
/nMIL_omega.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Source code for the paper
"Modeling Precursors for Event Forecasting via Nested Multi-Instance Learning"
Authors: Yue Ning, Sathappan Muthiah, Huzefa Rangwala, Naren Ramakrishnan
Published in KDD 2016.

Variant with one weight vector per history day.
"""

import math
import random
import time
from collections import OrderedDict

import numpy as np
from numpy import linalg as la
from scipy.special import expit as scipy_sig
import sklearn.metrics


def sigmoid(x):
    return 1. / (1. + math.exp(-x))
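

# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original release): the omega
# variant learns one weight vector per history day, so w is a
# (bagsize x dim) matrix and day j is scored with row w[j].  Toy example
# with two days:
def _demo_per_day_scoring():
    rng = np.random.RandomState(0)
    w = rng.rand(2, 300)                               # one weight row per day
    superbag = [rng.rand(4, 300), rng.rand(3, 300)]    # two days of documents
    days = [np.mean(scipy_sig(np.dot(bag, w[j, :][:, np.newaxis])))
            for j, bag in enumerate(superbag)]
    return np.mean(days)                               # event probability P_i
# ---------------------------------------------------------------------------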


class nMIL_omega:
    def __init__(self, bagsize, beta=3.0, gamma=0.5, m0=0.5, p0=0.5):
        self.bagsize = bagsize
        self.beta = beta
        self.gamma = gamma
        self.m0 = m0
        self.p0 = p0

    def grad_func(self, X, Y, orgX, w, beta, gamma, m0, p0):
        first_matrix = []    # likelihood term
        second_matrix = []   # cross-day smoothness term
        third_matrix = []    # instance-level hinge term

        for i, superbag in enumerate(X):
            P_i_list = []
            combinedbag_loss = []
            combinedDoc_loss = []
            for j, bag in enumerate(superbag):
                # Day j is scored with its own weight row w[j].
                bag_dotp = np.dot(bag, w[j, :][:, np.newaxis])
                p_ij_list = scipy_sig(bag_dotp)
                hinge_loss = np.sign(p_ij_list - p0)
                P_i_list.append(np.mean(p_ij_list))
                bagloss = np.dot((p_ij_list * (1 - p_ij_list)).T, bag) / len(bag)
                combinedbag_loss.append(bagloss)
                mask = ((hinge_loss * bag_dotp) < m0)
                combinedDoc_loss.append(np.dot((hinge_loss * mask).T, bag)
                                        / len(bag))

            combinedbag_loss = np.concatenate(combinedbag_loss, axis=0)
            combinedDoc_loss = np.concatenate(combinedDoc_loss, axis=0)
            P_i = np.mean(P_i_list)
            neg_loglh = (Y[i] - P_i) / (P_i * (1. - P_i))
            superbagloss = neg_loglh * combinedbag_loss
            first_matrix.append(superbagloss)

            # Differences of consecutive per-bag losses (day 0 against zero).
            diffbag = np.concatenate([np.zeros((1, self.x_dimension)),
                                      combinedbag_loss], axis=0)
            crossbag_loss = np.diff(diffbag, axis=0)
            # No cross-day cosine weighting in this variant: every pair of
            # consecutive days gets constant weight 1.
            cross_cosine_combined = np.ones((len(superbag), 1))

            crossbag_derivative = 2 * ((np.diff([0] + P_i_list)[:, np.newaxis] *
                                        crossbag_loss * cross_cosine_combined))
            crossbag_derivative = crossbag_derivative / len(X[i])
            second_matrix.append(crossbag_derivative)
            third_matrix.append(combinedDoc_loss)

        first_sum = np.sum(np.array(first_matrix), axis=0) * beta
        second_sum = np.sum(np.array(second_matrix), axis=0) * gamma
        third_sum = np.sum(np.array(third_matrix), axis=0)  # * gamma
        if len(second_matrix) > 0 and len(third_matrix) > 0:
            return -first_sum + second_sum - third_sum
        else:
            return -first_sum
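
    # ------------------------------------------------------------------
    # Explanatory note (added): unlike nMIL/nMIL_delta, grad_func returns
    # a (bagsize x dim) matrix -- per-day loss rows are kept instead of
    # being summed across days -- matching the per-day weight matrix w
    # that SGD below optimizes.
    # ------------------------------------------------------------------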

    def prepare_data(self, docMap, X_docs, dataIndex, X, Y, Z, original):
        pos_count = 0
        allcount = 0
        for cVal in dataIndex.viewvalues():
            Z.append(cVal)
            if cVal['Y']:
                Y.append(1)
                pos_count += 1
            else:
                Y.append(0)

            combinedDoc = []
            featureDoc = []
            # Keep the most recent `bagsize` of the 10 history days.
            for i in range(max(1, 11 - self.bagsize), 11):
                if i == 1:
                    cosine_sim = 1.0
                else:
                    cosine_sim = cVal['history'][str(i)]['cross_cosine']

                doc_vec = X_docs[[docMap[d] for d in
                                  cVal['history'][str(i)]['ids']], :]
                doc_ids = cVal['history'][str(i)]['ids']
                x_dict = {'docvec': doc_vec,
                          'docids': doc_ids, 'cross_cosine': cosine_sim}
                combinedDoc.append(x_dict)
                featureDoc.append(doc_vec)

            original.append(combinedDoc)
            X.append(featureDoc)

            allcount += 1
        return pos_count, allcount

    def read_data(self, **kwargs):
        trainUnsorted = kwargs['trainIndex']
        testUnsorted = kwargs['testIndex']
        docIndex = kwargs['docIndex']
        trainIndex = OrderedDict(sorted(trainUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        testIndex = OrderedDict(sorted(testUnsorted.items(), key=lambda x: x[1]['time'][:10]))
        print "train data start and end"
        print trainIndex[trainIndex.keys()[0]]['time']
        print trainIndex[trainIndex.keys()[-1]]['time']

        print "test data start and end"
        print testIndex[testIndex.keys()[0]]['time']
        print testIndex[testIndex.keys()[-1]]['time']
        print "number of documents:", len(docIndex)

        docItems = docIndex.items()
        self.x_dimension = len(docIndex.values()[0])
        docMap = {val[0]: index for index, val in enumerate(docItems)}
        X_docs = np.array([k[1] for k in docItems])

        self._trainY = []
        self._trainX = []
        self.original_trainX = []
        self._trainZ = []
        pc_train, c_train = self.prepare_data(docMap, X_docs, trainIndex,
                                              self._trainX, self._trainY,
                                              self._trainZ, self.original_trainX)
        print "Training: positive %d, negative %d" % (pc_train, c_train - pc_train)

        self._testY = []
        self._testX = []
        self.original_testX = []
        self._testZ = []
        pc_test, c_test = self.prepare_data(docMap, X_docs, testIndex,
                                            self._testX, self._testY,
                                            self._testZ, self.original_testX)
        print "Testing: positive %d, negative %d" % (pc_test, c_test - pc_test)
        return
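
    # ------------------------------------------------------------------
    # Explanatory note (added): SGD below also assembles the precursor
    # output gsr_history_probs, mapping each test event Id to its true
    # label plus, per history day, (document id, probability) pairs,
    # e.g. (identifiers made up):
    #     {"event-001": {"trueY": 1,
    #                    0: [("doc-17", 0.83), ("doc-42", 0.12)],
    #                    1: [("doc-93", 0.67)]}}
    # ------------------------------------------------------------------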

    def SGD(self):
        train_X = np.array(self._trainX)
        train_Y = np.array(self._trainY)
        print "X: {}".format(train_X.shape)
        print "Y: {}".format(train_Y.shape)
        OriginalTrain_X = self.original_trainX

        test_X = np.array(self._testX)
        test_Y = np.array(self._testY)

        lambd = 0.05
        iteration = 2000
        n_sgd = 10
        test_f1_arr = []
        test_recall_arr = []
        test_prec_arr = []
        test_roc_arr = []
        test_acc_arr = []
        train_score = []
        print "using params:\n gamma:{}, beta:{}, m0:{}, p0:{}".format(self.gamma, self.beta,
                                                                       self.m0, self.p0)
        for expr in range(n_sgd):
            # One weight row per history day.
            w = np.random.rand(self.bagsize, self.x_dimension)
            for t in range(iteration):
                eta = 1. / ((t + 1) * lambd)

                # Mini-batch of 10 superbags.
                kset = random.sample(range(len(train_X)), 10)
                X = train_X[kset]
                Y = train_Y[kset]
                orgX = [OriginalTrain_X[k] for k in kset]
                delta_w = self.grad_func(X, Y, orgX, w, self.beta,
                                         self.gamma, self.m0, self.p0)

                new_w = (1 - eta * lambd) * w - eta * delta_w / len(X)

                # Pegasos-style projection onto the ball of radius 1/sqrt(lambd).
                rate = (1. / np.sqrt(lambd)) * (1. / la.norm(new_w))
                if rate < 1.:
                    w = rate * new_w
                else:
                    w = new_w

            pred_Y = []
            pred_probs = []
            gsr_history_probs = {}
            for idx, testx in enumerate(test_X):
                days_list = [np.mean(scipy_sig(np.dot(bag, w[j, :][:, np.newaxis])))
                             for j, bag in enumerate(testx)]

                P_i = np.mean(days_list)
                pred_probs.append(P_i)
                if P_i > 0.5:
                    pred_Y.append(1)
                else:
                    pred_Y.append(0)
                bag = self.original_testX[idx]
                bag_gsrId = self._testZ[idx]['Id']

                # int() cast so the dict stays JSON-serializable.
                gsr_history_probs[bag_gsrId] = {'trueY': int(test_Y[idx])}
                for day_idx, histday in enumerate(bag):
                    doc_vec = histday['docvec']
                    doc_ids = histday['docids']
                    # Score each day's documents with that day's weight row.
                    today_probs = [sigmoid(np.dot(w[day_idx], doc)) for doc in doc_vec]
                    gsr_history_probs[bag_gsrId][day_idx] = zip(doc_ids, today_probs)

            test_f1 = sklearn.metrics.f1_score(test_Y, pred_Y)
            test_recall = sklearn.metrics.recall_score(test_Y, pred_Y)
            test_precision = sklearn.metrics.precision_score(test_Y, pred_Y)
            test_roc = sklearn.metrics.roc_auc_score(test_Y, pred_Y)
            test_acc = sklearn.metrics.accuracy_score(test_Y, pred_Y)

            test_f1_arr.append(test_f1)
            test_recall_arr.append(test_recall)
            test_prec_arr.append(test_precision)
            test_roc_arr.append(test_roc)
            test_acc_arr.append(test_acc)

        test_score = [np.mean(np.array(test_acc_arr)),
                      np.mean(np.array(test_recall_arr)),
                      np.mean(np.array(test_prec_arr)),
                      np.mean(np.array(test_f1_arr)),
                      np.mean(np.array(test_roc_arr))]

        print "test bagsize:", self.bagsize,
        print "accuracy:", np.mean(np.array(test_acc_arr)),
        print "recall:", np.mean(np.array(test_recall_arr)),
        print "precision:", np.mean(np.array(test_prec_arr)),
        print "f1-score:", np.mean(np.array(test_f1_arr))

        return self, test_score, train_score, gsr_history_probs, test_Y, pred_probs


def main(args):
    import json

    trainfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.train)
    trainf = args.path + trainfname
    # Testing data
    testfname = "input_forClassification/country-%s/leadtime-%d/%s" \
        % (args.country, args.leadtime, args.test)
    testf = args.path + testfname

    dfname = "news_deepfeature/news_doc2vec_%s.json" % args.country
    docf = args.path + dfname

    # Output files
    resultf = '../result/{}_{}_lt-{}.txt'.format(args.resultfile, args.country, args.leadtime)
    trainMap = {}
    with open(trainf) as infile:
        for line in infile:
            trainMap[len(trainMap)] = json.loads(line.strip())

    testMap = {}
    with open(testf) as infile:
        for line in infile:
            testMap[len(testMap)] = json.loads(line.strip())

    with open(docf) as infile:
        docMap = {j['Id']: j['doc2vec'] for j in
                  (json.loads(l) for l in infile)}

    day = args.historyDays

    start = time.time()
    model = nMIL_omega(day, beta=args.beta, gamma=args.gamma, m0=args.m0, p0=args.p0)
    print "Learning for Bag Size: %d" % day
    model.read_data(trainIndex=trainMap, testIndex=testMap, docIndex=docMap)

    model, perf1, perf2, gsrHistoryProbs, test_Y, pred_probs = model.SGD()
    w1 = open(resultf, 'a')
    w1.write('\t'.join([str(score) for score in perf1]) + '\n')
    w1.close()
    outf = "../result/{}_{}_lt-{}_hd-{}_.json".format(args.outfile, args.country, args.leadtime, day)
    w2 = open(outf, 'w')
    w2.write(json.dumps(gsrHistoryProbs))
    w2.close()
    print "Running nMIL omega model for forecasting on country %s, leadtime %d" % (args.country, args.leadtime)
    print "run-time:{}s".format(time.time() - start)
"--historyDays", type=int, default=10, help="number of history days to be used for training") 336 | ap.add_argument("-m0", type=float, default=.5, help="hyper parameter in hinge loss") 337 | ap.add_argument("-p0", type=float, default=.5, help="hyper parameter in hinge loss") 338 | ap.add_argument("--gamma", type=float, default=.5, help="parameter in SGD") 339 | ap.add_argument("--beta", type=float, default=3.0, help="parameter in SGD") 340 | args = ap.parse_args() 341 | main(args) 342 | --------------------------------------------------------------------------------