├── .gitignore
├── Config.py
├── Dataset.py
├── Exp.py
├── README.md
├── dfg.py
├── dfg_minibatch.py
├── factor.py
├── factor_minibatch.py
├── mkdata.py
└── util.py

/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | *.pkl
3 | *.txt
4 | *.pyc
5 |
--------------------------------------------------------------------------------
/Config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | # File Name: Config.py
4 | # Author: Jiezhong Qiu
5 |
6 | import json
7 | import util
8 | import logging
9 | from datetime import datetime, date
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 | #def getDataDir():
14 | # return '/home/jiezhong/prediction/certificate'
15 | #
16 | #def getStart(course):
17 | # title = {}
18 | # title["TsinghuaX/00690242_2015X/2015_T1"] = date(2015, 3, 2)
19 | # title["TsinghuaX/30240184_2015X/2015_T1"] = date(2015, 3, 3)
20 | # return title[course]
21 | #
22 | #def getEnd(course):
23 | # title = {}
24 | # title["TsinghuaX/00690242_2015X/2015_T1"] = date(2015, 7, 5)
25 | # title["TsinghuaX/30240184_2015X/2015_T1"] = date(2015, 7, 5)
26 | # return title[course]
27 | #
28 | def getDDL(course):
29 |     if course == "TsinghuaX/30240184_2015X/2015_T1":
30 |         ddl = []
31 |         with open('../element.json', 'rb') as f:
32 |             element = json.load(f)
33 |         for k,v in element.iteritems():
34 |             if k.find('30240184') > -1 and v['due'] is not None:
35 |                 dt = datetime.strptime(v['due'], '%Y-%m-%dT%H:%M:%S')
36 |                 dt = util.roundTime(dt, 60 * 60)
37 |                 ddl.append(dt.date())
38 |         ddl.sort()
39 |         ddl[-1] = date(2015, 6, 30)
40 |         ddl = [item.strftime("%Y-%m-%d") for item in ddl]
41 |         print json.dumps(ddl)
42 |         return ddl
43 |     else:
44 |         ddl = []
45 |         with open('../Json/element.json', 'rb') as f:
46 |             element = json.load(f)
47 |         for k,v in element.iteritems():
48 |             if k.find('00690242_2015X') > -1 and v['start'] is not None:
49 |                 dt = datetime.strptime(v['start'], '%Y-%m-%dT%H:%M:%S')
50 |                 dt = util.roundTime(dt, 60 * 60)
51 |                 ddl.append(dt.date())
52 |         ddl.sort()
53 |         ddl = [item.strftime("%Y-%m-%d") for item in ddl]
54 |         print json.dumps(ddl)
55 |         return ddl
56 |
57 | #def getThreshold(course):
58 | # return 0.8
59 | #
60 | #def getPklDir():
61 | # return '/home/jiezhong/prediction/certificate/dynamic-factor-graph/data/data.pkl'
62 | #
63 | #def getPredictionResultDir():
64 | # return 'a.txt'
65 |
66 | class Config(object):
67 |     def __init__(self, course, fn):
68 |         self.course = course
69 |         with open(fn, 'rb') as f:
70 |             self.config = json.load(f)
71 |         logger.info('Loading config for %s from file %s', course, fn)
72 |
73 |     def getThreshold(self):
74 |         # return the grade threshold
75 |         return self.config['threshold']
76 |
77 |     def getPklFile(self):
78 |         # return the pkl file which stores the feature vector for each user at every timestamp
79 |         return self.config["pklFile"]
80 |
81 |     def getPredictionResultFile(self):
82 |         # return the prediction result file
83 |         return self.config["predictionResultFile"]
84 |
85 |     def getJsonDir(self):
86 |         # return the json dir which stores the result of Feature.py
87 |         return self.config["jsonDir"]
88 |
89 |     def getStart(self):
90 |         # return the start date of the course
91 |         return util.parseDate(self.config['start'])
92 |
93 |     def getEnd(self):
94 |         # return the end date of the course
95 |         return util.parseDate(self.config['end'])
96 |
97 |     def getDDL(self):
98 |         # return the list of ddls
99 |         return [util.parseDate(item) for item in self.config['ddl']]
100 |
101 | if __name__ == '__main__':
102 |     #getDDL("TsinghuaX/30240184_2015X/2015_T1")
103 |     getDDL("TsinghuaX/00690242_2015X/2015_T1")
104 |     #config = Config("TsinghuaX/30240184_2015X/2015_T1", 'dsa.json')
--------------------------------------------------------------------------------
/Dataset.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | # File Name: Dataset.py
4 | # Author: Jiezhong Qiu
5 |
6 | import sys
7 | import logging
8 | from Config import Config
9 | import os
10 | from bson import json_util
11 | import util
12 | import json
13 | import datetime
14 | import cPickle as pickle
15 | import numpy as np
16 |
17 | EPS=1e-2
18 | logger = logging.getLogger(__name__)
19 |
20 | class Dataset(object):
21 |     def __init__(self, course, configFile):
22 |         '''Generate data in the following format:
23 |         feature[uid][T] is a list of features for user uid at time T.
24 |         The features should be additive.
25 |         We remove register-only students from the dataset.
26 |         '''
27 |         self.course = course
28 |         self.config = Config(course, configFile)
29 |         self.feature = {}
30 |         self.feature_num = 0
31 |         self.path = self.config.getJsonDir()
32 |         self.getUser()
33 |         self.start = self.config.getStart()
34 |         self.end = self.config.getEnd()
35 |         for uid in self.feature:
36 |             for single_date in util.daterange(self.start, self.end):
37 |                 self.feature[uid][single_date] = []
38 |         logger.info('course: %s user: %d start: %s end: %s', self.course,
39 |                 len(self.feature), self.start.isoformat(), self.end.isoformat())
40 |
41 |     def getTimeStamp(self, date_obj):
42 |         for index, item in enumerate(self.ddls):
43 |             if date_obj <= item:
44 |                 return index
45 |         return len(self.ddls)
46 |
47 |     def expand_feature(self, num):
48 |         self.feature_num += num
49 |         for uid in self.feature:
50 |             for single_date in self.feature[uid]:
51 |                 self.feature[uid][single_date] = [0.] * num + self.feature[uid][single_date]
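    # A note on the scheme above (editorial comment; the values below are
    # hypothetical): expand_feature prepends `num` zero-initialized slots to
    # every per-day feature vector, so each extraction method that follows can
    # write into indices 0..num-1 while previously added feature groups shift
    # right. E.g. if feature[uid][d] == [3., 7.], then after expand_feature(2)
    # it becomes [0., 0., 3., 7.].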
52 |     def getUser(self):
53 |         with open(os.path.join(self.path, 'user.json'), 'rb') as f:
54 |             user = json.load(f)
55 |         for uid in user:
56 |             self.feature[uid] = {}
57 |
58 |     def getForumData(self):
59 |         # post, reply, replied, length, upvoted, cert-friend
60 |         self.expand_feature(6)
61 |         with open(os.path.join(self.path, 'forum.json')) as f:
62 |             forum = json.load(f)
63 |         for oid, item in forum.iteritems():
64 |             single_date = util.parseDate(item['date'])
65 |             uid = item['user']
66 |             if uid in self.feature and single_date >= self.start and single_date < self.end:
67 |                 if item['father'] is None:
68 |                     self.feature[uid][single_date][0] += 1
69 |                 else:
70 |                     self.feature[uid][single_date][1] += 1
71 |                     fid = forum[item['father']]['user']
72 |                     if fid in self.feature:
73 |                         self.feature[fid][single_date][2] += 1
74 |                         T = self.getTimeStamp(single_date)
75 |                         if T > 0:
76 |                             self.feature[uid][single_date][5] += self.score[fid][T-1]
77 |                             self.feature[fid][single_date][5] += self.score[uid][T-1]
78 |                 self.feature[uid][single_date][3] += item['length']
79 |                 self.feature[uid][single_date][4] += item['vote_up']
80 |
81 |     def getScore(self):
82 |         fileDir = os.path.join(self.path, 'grade.json')
83 |         with open(fileDir, 'rb') as f:
84 |             self.score = json.load(f)
85 |     def getLearningData(self):
86 |         # video_time assign_time
87 |         # video_day assign_day
88 |         self.expand_feature(4)
89 |         with open(os.path.join(self.path, 'duration.json')) as f:
90 |             learn = json.load(f)
91 |         for uid in learn:
92 |             if uid not in self.feature:
93 |                 continue
94 |             for k, v in learn[uid].iteritems():
95 |                 single_date = util.parseDate(k)
96 |                 if single_date < self.start or single_date >= self.end:
97 |                     continue
98 |                 self.feature[uid][single_date][0] += v[0]
99 |                 self.feature[uid][single_date][1] += v[1]
100 |                 self.feature[uid][single_date][2] += (v[0] > 0)
101 |                 self.feature[uid][single_date][3] += (v[1] > 0)
102 |
103 |     def getBehaviorData(self):
104 |         # video problem sequential chapter ddl_hit
105 |         self.expand_feature(5)
106 |         with open(os.path.join(self.path, 'behavior.json')) as f:
107 |             behavior = json.load(f)
108 |         with open(os.path.join(self.path, 'element.json')) as f:
109 |             element = json.load(f, object_hook=json_util.object_hook)
110 |         for uid in behavior:
111 |             if uid not in self.feature:
112 |                 continue
113 |             for date in behavior[uid]:
114 |                 single_date = util.parseDate(date)
115 |                 if single_date < self.start or single_date >= self.end:
116 |                     continue
117 |                 for log in behavior[uid][date]:
118 |                     course, category = util.parseLog(log)
119 |                     if element[log]['due'] is not None:
120 |                         if single_date <= util.parseDate(element[log]['due']):
121 |                             self.feature[uid][single_date][4] += 1
122 |                     if category == 'video':
123 |                         self.feature[uid][single_date][0] += 1
124 |                     elif category == 'problem':
125 |                         self.feature[uid][single_date][1] += 1
126 |                     elif category == 'sequential':
127 |                         self.feature[uid][single_date][2] += 1
128 |                     elif category == 'chapter':
129 |                         self.feature[uid][single_date][3] += 1
130 |
131 |     def save(self, fpath='.', fname=None):
132 |         """save a json or pickle representation of the data set"""
133 |         fpathstart, fpathext = os.path.splitext(fpath)
134 |         if fpathext == '.json' or fpathext == '.pkl':
135 |             fpath, fname = os.path.split(fpath)
136 |         elif fname is None:
137 |             # generate filename based on date
138 |             date_obj = datetime.datetime.now()
139 |             date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S')
140 |             class_name = self.__class__.__name__
141 |             fname = '%s.%s.pkl' % (class_name,
date_str) 142 | fabspath = os.path.join(fpath, fname) 143 | logger.info('Saving to %s ...' % fabspath) 144 | with open(fabspath, 'wb') as file: 145 | if fpathext == '.json': 146 | json.dump(self.feature, file, 147 | indent=4, separators=(',', ';')) 148 | else: 149 | pickle.dump(self.feature, file, protocol=pickle.HIGHEST_PROTOCOL) 150 | 151 | def save_dataset(self): 152 | fabspath = self.config.getPklFile() 153 | logger.info('Saving dataset to %s shape=(%d, %d, %d)...' % (fabspath, len(self.ddls)+1, len(self.feature), self.feature_num)) 154 | # n_step x n_seq x n_obsv 155 | n_step = len(self.ddls) + 1 156 | n_seq = len(self.feature) 157 | dataset = np.zeros(shape=(n_step, n_seq, self.feature_num)) 158 | user_id = [] 159 | for index, uid in enumerate(self.feature): 160 | assert len(self.feature[uid]) == len(self.ddls) + 1 161 | user_id.append(uid) 162 | for T in xrange(len(self.feature[uid])): 163 | assert len(self.feature[uid][T]) == self.feature_num 164 | for i in xrange(self.feature_num): 165 | dataset[T][index][i] = self.feature[uid][T][i] 166 | X = np.zeros(shape=(n_step, n_seq, self.n_in)) 167 | for index, uid in enumerate(self.feature): 168 | for T in xrange(len(self.X[uid])): 169 | if len(self.X[uid][T]) != self.n_in: 170 | print len(self.X[uid][T]), self.n_in 171 | assert len(self.X[uid][T]) == self.n_in 172 | for i in xrange(self.n_in): 173 | X[T][index][i] = self.X[uid][T][i] 174 | 175 | now = datetime.date.today() + datetime.timedelta(days=1) 176 | now = self.getTimeStamp(now) 177 | with open(fabspath, 'wb') as file: 178 | pickle.dump((dataset, X, user_id, now), file, protocol=pickle.HIGHEST_PROTOCOL) 179 | self.dataset = dataset 180 | 181 | def getDDL(self): 182 | self.ddls = self.config.getDDL() 183 | 184 | def getStageFeature(self): 185 | feature = {} 186 | for uid in self.feature: 187 | feature[uid] = {} 188 | for single_date in self.feature[uid]: 189 | #date_str = single_date.isoformat() 190 | delta = (single_date - self.start).days 191 | feature[uid][delta] = self.feature[uid][single_date] 192 | sample = self.ddls + [self.end - datetime.timedelta(1)] 193 | sample = [(item - self.start).days for item in sample] 194 | print len(sample), sample 195 | self.feature = {} 196 | for uid in feature: 197 | if uid in self.score: 198 | self.score[uid].append(0.) 199 | self.feature[uid] = [] 200 | p = 0 201 | tmp = [0.] * self.feature_num 202 | for T in xrange(0, (self.end-self.start).days): 203 | if T <= sample[p]: 204 | for i in xrange(self.feature_num): 205 | tmp[i] += feature[uid][T][i] 206 | if T == sample[p]: 207 | self.feature[uid].append(tmp) 208 | p += 1 209 | tmp = [0.] 
* self.feature_num
210 |     def filte(self, filter_type='binary', threshold=0.3):
211 |         # first merge self.feature and self.score
212 |         self.feature_num += 1
213 |         for uid in self.score:
214 |             for j in xrange(len(self.ddls) + 1):
215 |                 try:
216 |                     self.feature[uid][j].append(self.score[uid][j])
217 |                 except:
218 |                     print len(self.feature[uid]), j
219 |                     print len(self.score[uid]), j
220 |                     exit()
221 |         for i in xrange(self.feature_num):
222 |             for T in xrange(len(self.ddls) + 1):
223 |                 tmp = sorted([self.feature[uid][T][i] for uid in self.feature], reverse=True)
224 |                 door = tmp[int(len(self.feature) * threshold)]
225 |                 if door == tmp[0]:
226 |                     door -= EPS
227 |                 elif door == tmp[-1]:
228 |                     door += EPS
229 |                 for uid in self.feature:
230 |                     self.feature[uid][T][i] = 1 if self.feature[uid][T][i] > door else 0
231 |     def getDemographics(self):
232 |         # binary features
233 |         # male, female, el, jhs, hs, c, b, m, p, [0,18), [18,23), [23,28), [28,36), [36,51), [51,+inf)
234 |         with open(os.path.join(self.path, 'profile.json')) as f:
235 |             demos = json.load(f)
236 |         self.n_in = 15
237 |         for uid in self.feature:
238 |             tmp = []
239 |             demo = demos[uid]
240 |             for task in ['m', 'f']:
241 |                 tmp.append(1 if demo['gender'] == task else 0)
242 |             for task in ['el', 'jhs', 'hs', 'c', 'b', 'm', 'p']:
243 |                 tmp.append(1 if demo['education'] == task else 0)
244 |             if demo['birth'] is not None:
245 |                 age = 2015 - demo['birth']
246 |                 task = [0, 18, 23, 28, 36, 51, 1000]
247 |                 for i in xrange(len(task)-1):
248 |                     tmp.append(1 if task[i] <= age < task[i+1] else 0)
249 |             else:
250 |                 tmp += [0.] * 6
251 |             for T in xrange(len(self.ddls)+1):
252 |                 self.X[uid][T] += tmp
253 |
254 |     def generate_Y(self):
255 |         self.getDDL()
256 |         self.getScore()
257 |         self.getForumData()
258 |         self.getLearningData()
259 |         self.getBehaviorData()
260 |         self.getStageFeature()
261 |         threshold = self.config.getThreshold()
262 |         self.filte(filter_type='binary', threshold=threshold)
263 |
264 |     def generate_X(self):
265 |         self.X = {}
266 |         for uid in self.feature:
267 |             self.X[uid] = [[] for i in xrange(len(self.ddls)+1)]
268 |         # Demographics Feature
269 |         self.getDemographics()
270 |
271 |     def regenerate(self):
272 |         for uid in self.feature:
273 |             for T in xrange(len(self.ddls) + 1):
274 |                 self.X[uid][T] += self.feature[uid][T][:-1]
275 |                 self.feature[uid][T] = self.feature[uid][T][-1:]
276 |         self.n_in = self.n_in + self.feature_num - 1
277 |         self.feature_num = 1
278 |
279 | if __name__ == "__main__":
280 |     logging.basicConfig(level=logging.INFO)
281 |     dataset = Dataset(sys.argv[1], sys.argv[2])
282 |     dataset.generate_Y()
283 |     dataset.generate_X()
284 |     dataset.regenerate()
285 |     dataset.save_dataset()
286 |
--------------------------------------------------------------------------------
/Exp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | # File Name: Exp.py
4 | # Author: Jiezhong Qiu
5 | # Create Time: 2015/06/23 17:04
6 | # TODO:
7 |
8 | from dfg_minibatch import MetaDFG
9 | from Config import Config
10 | import cPickle as pickle
11 | import datetime
12 | import logging
13 | import sys
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 | def run(config):
18 |     with open(config.getPklFile(), 'rb') as f:
19 |         Y, X, user_id, T = pickle.load(f)
20 |     print X.shape, Y.shape
21 |     n_in = X.shape[2]
22 |     Y_train = Y[:T]
23 |     Y_test = Y[T:]
24 |     X_train = X[:T]
25 |     X_test = X[T:]
26 |     n_step, n_seq, n_obsv = Y_train.shape
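    # Sketch of the data layout (as written by Dataset.save_dataset): Y and X
    # are (time x user x dim) arrays and T is the index of the current time
    # stamp, so the slices above train on the observed prefix [0, T) and hold
    # out the remaining steps as the prediction horizon.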
27 |     logger.info('load from pkl train_step=%d test_step=%d, n_seq=%d n_obsv=%d n_in=%d', n_step, X_test.shape[0], n_seq, n_obsv, n_in)
28 |     start = datetime.datetime.now()
29 |     dfg = MetaDFG(n_in=n_in, n_hidden=2, n_obsv=n_obsv, n_step=n_step, order=2, n_seq=n_seq, learning_rate_Estep=0.5, learning_rate_Mstep=0.1,
30 |             factor_type='MLP', output_type='binary',
31 |             n_epochs=200, batch_size=n_seq, snapshot_every=None, L1_reg=0.00, L2_reg=0.00, smooth_reg=0.00,
32 |             learning_rate_decay=.5, learning_rate_decay_every=100,
33 |             n_iter_low=[n_step / 2], n_iter_high=[n_step + 1], n_iter_change_every=100,
34 |             final_momentum=0.5,
35 |             initial_momentum=0.3, momentum_switchover=1500,
36 |             order_obsv=0,
37 |             hidden_layer_config=[])
38 |     #X_train = np.zeros((n_step, n_seq, n_in))
39 |     #X_test = np.zeros((Y_test.shape[0], n_seq, n_in))
40 |     cert_pred = dfg.fit(Y_train=Y_train, X_train=X_train, Y_test=Y_test, X_test=X_test, validation_frequency=None)
41 |     with open(config.getPredictionResultFile(), 'wb') as f:
42 |         for i in xrange(len(user_id)):
43 |             print >> f, '\t'.join([str(user_id[i]), config.course, str(cert_pred[i])])
44 |     print datetime.datetime.now() - start
45 |
46 |
47 | if __name__ == '__main__':
48 |     logging.basicConfig(level=logging.INFO,
49 |                         format='%(asctime)s %(message)s')  # include timestamp
50 |     config = Config(sys.argv[1], sys.argv[2])
51 |     run(config)
52 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dynamic-factor-graph
2 |
3 | An implementation of dynamic factor graphs, after
4 | [Piotr Mirowski and Yann LeCun](http://yann.lecun.com/exdb/publis/pdf/mirowski-ecml-09.pdf)
5 |
6 | Learned how to use Theano to implement RNN-like structures from
7 | [gwtaylor's RNN implementation](https://github.com/gwtaylor/theano-rnn)
--------------------------------------------------------------------------------
/dfg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # encoding: utf-8
4 | # File Name: dfg.py
5 | # Author: Jiezhong Qiu
6 | # Create Time: 2015/01/26 00:25
7 | # TODO:
8 |
9 | from collections import OrderedDict
10 | import numpy as np
11 | import theano
12 | import theano.tensor as T
13 | from sklearn.base import BaseEstimator
14 | import logging
15 | import time
16 | import json
17 | import datetime
18 | import os
19 | import cPickle as pickle
20 | import factor
21 | import unittest
22 | import matplotlib.pylab as plt
23 |
24 |
25 | logger = logging.getLogger(__name__)
26 | theano.config.exception_verbosity='high'
27 | #mode = theano.Mode(linker='cvm')
28 | #mode = 'DebugMode'
29 | mode = 'FAST_COMPILE'
30 | #mode = 'ProfileMode'
31 | DEBUG = True
32 |
33 | class DFG(object):
34 |     """ Dynamic factor graph class
35 |
36 |     Supported output types:
37 |     real: linear output units, use mean-squared error
38 |     binary: binary output units, use cross-entropy error
39 |     softmax: single softmax output, use cross-entropy error
40 |     """
41 |     def __init__(self, input, n_hidden, n_obsv, n_step, order, start, n_iter,
42 |                  factor_type='FIR'):
43 |         self.input = input
44 |         self.n_hidden = n_hidden
45 |         self.n_obsv = n_obsv
46 |         self.n_step = n_step
47 |         self.order = order
48 |         self.factor_type = factor_type
49 |         # For mini-batch
50 |         self.start = start
51 |         self.n_iter = n_iter
52 |         if self.factor_type == 'FIR':
53 |             self.factor = factor.FIR(n_hidden=self.n_hidden,
54 |                     n_obsv=self.n_obsv, n_step=self.n_step,
55 |                     order=self.order,
start=self.start, n_iter=self.n_iter) 56 | else: 57 | raise NotImplementedError 58 | 59 | self.params_Estep = self.factor.params_Estep 60 | self.params_Mstep = self.factor.params_Mstep 61 | self.L1 = self.factor.L1 62 | self.L2_sqr = self.factor.L2_sqr 63 | 64 | self.y_pred = self.factor.y_pred 65 | self.z_pred = self.factor.z_pred 66 | self.y_next = self.factor.y_next 67 | self.z_next = self.factor.z_next 68 | self.z = self.factor.z 69 | 70 | self.updates = OrderedDict() 71 | for param in self.params_Estep: 72 | init = np.zeros(param.get_value(borrow=True).shape, 73 | dtype=theano.config.floatX) 74 | 75 | self.updates[param] = theano.shared(init) 76 | for param in self.params_Mstep: 77 | init = np.zeros(param.get_value(borrow=True).shape, 78 | dtype=theano.config.floatX) 79 | self.updates[param] = theano.shared(init) 80 | 81 | # Loss = ||Z*(t)-Z(t)||^2 + ||Y*(t) - Y(t)||^2 82 | self.z_std = self.z[self.start + self.order: self.start + self.order + self.n_iter] 83 | self.loss_Estep = lambda y : self.se(self.y_pred, y) + self.se(self.z_pred, self.z[self.order:]) 84 | self.loss_Mstep = lambda y : self.se(self.y_next, y) + self.se(self.z_next, self.z_std) 85 | self.test_loss = lambda y : self.se(self.y_next, y) 86 | 87 | # Smooth Term ||Z(t+1)-Z(t)||^2 88 | # Estep 89 | diag_Estep = np.zeros((self.n_step + self.order, self.n_step + self.order), 90 | dtype=theano.config.floatX) 91 | np.fill_diagonal(diag_Estep[:,1:], 1.) 92 | diag_Estep[-1,-1] = 1. 93 | z_tm1 = T.dot(diag_Estep, self.z) 94 | self.smooth_Estep = self.mse(self.z, z_tm1) 95 | 96 | diag_Mstep = T.eye(self.n_iter, self.n_iter, 1) 97 | diag_Mstep = T.set_subtensor(diag_Mstep[-1, -1], 1) 98 | z_tm1_next = T.dot(diag_Mstep, self.z_next) 99 | self.smooth_Mstep = self.mse(self.z_next, z_tm1_next) 100 | def se(self, y_1, y_2): 101 | return T.sum((y_1 - y_2) ** 2) 102 | def mse(self, y_1, y_2): 103 | # error between output and target 104 | return T.mean((y_1 - y_2) ** 2) 105 | def nmse(self, y_1, y_2): 106 | # treat y_1 as the approximation to y_2 107 | return self.mse(y_1, y_2) / self.mse(y_2, 0) 108 | 109 | 110 | class MetaDFG(BaseEstimator): 111 | def __init__(self, n_hidden, n_obsv, n_step, order, learning_rate_Estep=0.1, learning_rate_Mstep=0.1, 112 | n_epochs=100, batch_size=100, L1_reg=0.00, L2_reg=0.00, smooth_reg=0.00, 113 | learning_rate_decay=1, learning_rate_decay_every=100, 114 | factor_type='FIR', activation='tanh', final_momentum=0.9, 115 | initial_momentum=0.5, momentum_switchover=5, 116 | n_iter_low=[20,], n_iter_high=[50,], n_iter_change_every=50, 117 | snapshot_every=None, snapshot_path='tmp/'): 118 | self.n_hidden = int(n_hidden) 119 | self.n_obsv = int(n_obsv) 120 | self.n_step = int(n_step) 121 | self.order = int(order) 122 | self.learning_rate_Estep = float(learning_rate_Estep) 123 | self.learning_rate_Mstep = float(learning_rate_Mstep) 124 | self.learning_rate_decay = float(learning_rate_decay) 125 | self.learning_rate_decay_every=int(learning_rate_decay_every) 126 | self.n_epochs = int(n_epochs) 127 | self.batch_size = int(batch_size) 128 | self.L1_reg = float(L1_reg) 129 | self.L2_reg = float(L2_reg) 130 | self.smooth_reg = float(smooth_reg) 131 | self.factor_type = factor_type 132 | self.activation = activation 133 | self.initial_momentum = float(initial_momentum) 134 | self.final_momentum = float(final_momentum) 135 | self.momentum_switchover = int(momentum_switchover) 136 | self.n_iter_low = n_iter_low 137 | self.n_iter_high = n_iter_high 138 | assert(len(self.n_iter_low) == len(self.n_iter_high)) 139 | 
self.n_iter_change_every = int(n_iter_change_every) 140 | if snapshot_every is not None: 141 | self.snapshot_every = int(snapshot_every) 142 | else: 143 | self.snapshot_every = None 144 | self.snapshot_path = snapshot_path 145 | self.ready() 146 | 147 | def ready(self): 148 | # observation (where first dimension is time) 149 | self.y = T.matrix(name='y', dtype=theano.config.floatX) 150 | 151 | # learning rate 152 | self.lr = T.scalar() 153 | # For mini-batch 154 | self.start = T.iscalar('start') 155 | self.n_iter = T.iscalar('n_iter') 156 | 157 | if self.activation == 'tanh': 158 | activation = T.tanh 159 | elif self.activation == 'sigmoid': 160 | activation = T.nnet.sigmoid 161 | elif self.activation == 'relu': 162 | activation = lambda x: x * (x > 0) 163 | else: 164 | raise NotImplementedError 165 | 166 | self.dfg = DFG(input=self.y, n_hidden=self.n_hidden, 167 | n_obsv=self.n_obsv, n_step=self.n_step, 168 | order=self.order, start=self.start, 169 | n_iter=self.n_iter, factor_type=self.factor_type) 170 | 171 | def shared_dataset(self, data): 172 | """ Load the dataset into shared variables """ 173 | 174 | shared_data = theano.shared(np.asarray(data, 175 | dtype=theano.config.floatX)) 176 | return shared_data 177 | 178 | def __getstate__(self, jsonobj=False): 179 | params = self.get_params() # all the parameters in self.__init__ 180 | weights_E = [p.get_value().tolist() if jsonobj else p.get_value() for p in self.dfg.params_Estep] 181 | weights_M = [p.get_value().tolist() if jsonobj else p.get_value() for p in self.dfg.params_Mstep] 182 | weights = (weights_E, weights_M) 183 | state = (params, weights) 184 | return state 185 | 186 | def _set_weights(self, weights): 187 | weights_E, weights_M = weights 188 | i = iter(weights_E) 189 | for param in self.dfg.params_Estep: 190 | param.set_value(i.next()) 191 | i = iter(weights_M) 192 | for param in self.dfg.params_Mstep: 193 | param.set_value(i.next()) 194 | 195 | def __setstate__(self, state): 196 | params, weights = state 197 | self.set_params(**params) 198 | self.ready() 199 | self._set_weights(weights) 200 | 201 | def save(self, fpath='.', fname=None): 202 | """Save a pickled representation of model state. """ 203 | fpathstart, fpathext = os.path.splitext(fpath) 204 | if fpathext == '.pkl': 205 | fpath, fname = os.path.split(fpath) 206 | elif fpathext == '.json': 207 | fpath, fname = os.path.split(fpath) 208 | elif fname is None: 209 | # Generate filename based on date 210 | date_obj = datetime.datetime.now() 211 | date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S') 212 | class_name = self.__class__.__name__ 213 | fname = '%s.%s.pkl' % (class_name, date_str) 214 | 215 | fabspath = os.path.join(fpath, fname) 216 | logger.info('Saving to %s ...' % fabspath) 217 | with open(fabspath, 'wb') as file: 218 | if fpathext == '.json': 219 | state = self.__getstate__(jsonobj=True) 220 | json.dump(state, file, 221 | indent=4, separators=(',', ': ')) 222 | else: 223 | state = self.__getstate__() 224 | pickle.dump(state, file, protocol=pickle.HIGHEST_PROTOCOL) 225 | 226 | def load(self, fpath): 227 | """ Load model parameters from fpath. """ 228 | logger.info('Loading from %s ...' 
% fpath)
229 |         with open(fpath, 'rb') as file:
230 |             state = pickle.load(file)
231 |         self.__setstate__(state)
232 |
233 |     def fit(self, Y_train, Y_test=None,
234 |             validation_frequency=100):
235 |         """Fit model
236 |
237 |         Pass in Y_test to compute test error and report during training
238 |         Y_train : ndarray (n_seq, n_step, n_out)
239 |         Y_test : ndarray (n_seq, n_step, n_out)
240 |
241 |         validation_frequency : int
242 |             in terms of number of epochs
243 |         """
244 |
245 |
246 |         if Y_test is not None:
247 |             self.interactive = True
248 |             test_set_y = self.shared_dataset(Y_test)
249 |         else:
250 |             self.interactive = False
251 |         train_set_y = self.shared_dataset(Y_train)
252 |         n_train = train_set_y.get_value(borrow=True).shape[0]
253 |         if self.interactive:
254 |             n_test = test_set_y.get_value(borrow=True).shape[0]
255 |
256 |         logger.info('...building the model')
257 |
258 |         index = T.lscalar('index')    # index to a case
259 |         # learning rate (may change)
260 |         l_r = T.scalar('l_r', dtype=theano.config.floatX)
261 |         mom = T.scalar('mom', dtype=theano.config.floatX)
262 |
263 |         cost_Estep = self.dfg.loss_Estep(self.y) \
264 |                 + self.smooth_reg * self.dfg.smooth_Estep \
265 |                 + self.L1_reg * self.dfg.L1 \
266 |                 + self.L2_reg * self.dfg.L2_sqr
267 |
268 |         cost_Mstep = self.dfg.loss_Mstep(self.y) \
269 |                 + self.smooth_reg * self.dfg.smooth_Mstep \
270 |                 + self.L1_reg * self.dfg.L1 \
271 |                 + self.L2_reg * self.dfg.L2_sqr
272 |
273 |         # compute the gradient of cost with respect to theta = (W, W_in, W_out)
274 |         # gradients on the weights using BPTT
275 |         # E step
276 |         gparams_Estep = []
277 |         for param in self.dfg.params_Estep:
278 |             gparam = T.grad(cost_Estep, param)
279 |             gparams_Estep.append(gparam)
280 |
281 |         updates_Estep = OrderedDict()
282 |         for param, gparam in zip(self.dfg.params_Estep, gparams_Estep):
283 |             weight_update = self.dfg.updates[param]
284 |             upd = mom * weight_update - l_r * gparam
285 |             updates_Estep[weight_update] = upd
286 |             updates_Estep[param] = param + upd
287 |
288 |         # M step
289 |         gparams_Mstep = []
290 |         for param in self.dfg.params_Mstep:
291 |             gparam = T.grad(cost_Mstep, param)
292 |             gparams_Mstep.append(gparam)
293 |
294 |         updates_Mstep = OrderedDict()
295 |         for param, gparam in zip(self.dfg.params_Mstep, gparams_Mstep):
296 |             weight_update = self.dfg.updates[param]
297 |             upd = mom * weight_update - l_r * gparam
298 |             updates_Mstep[weight_update] = upd
299 |             updates_Mstep[param] = param + upd
300 |
301 |         # compiling a Theano function `train_model_Estep` that returns the
302 |         # cost, but at the same time updates the parameters of the
303 |         # model based on the rules defined in `updates_Estep`
304 |         train_model_Estep = theano.function(inputs=[index, l_r, mom],
305 |                 outputs=[cost_Estep, self.dfg.loss_Estep(self.y), self.dfg.y_pred, self.dfg.z_pred],
306 |                 updates=updates_Estep,
307 |                 givens=OrderedDict([(self.y, train_set_y[index])]),
308 |                 mode=mode)
309 |         # updates the parameters of the model based on
310 |         # the rules defined in `updates_Mstep`
311 |         train_model_Mstep = theano.function(inputs=[index, l_r, mom, self.start, self.n_iter],
312 |                 outputs=[cost_Mstep, self.dfg.y_next, self.dfg.z_next] + gparams_Mstep,
313 |                 updates=updates_Mstep,
314 |                 givens=OrderedDict([(self.y, train_set_y[index][self.start: self.start + self.n_iter])]),
315 |                 mode=mode)
316 |         test_model = theano.function(inputs=[self.start, self.n_iter],
317 |                 outputs=[self.dfg.y_next, self.dfg.z_next],
318 |                 #givens=OrderedDict([(self.y, test_set_y[index])]),
319 |                 mode=mode)
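        # Editorial summary of the three compiled functions above:
        # train_model_Estep performs inference, descending the energy w.r.t.
        # the latent states Z over the whole sequence; train_model_Mstep fits
        # the factor weights on a window [start, start + n_iter); test_model
        # rolls the learned dynamics forward to score a held-out continuation.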
320 |         ###############
321 |         # TRAIN MODEL #
322 |         ###############
323 |         logger.info('... training')
324 |         epoch = 0
325 |         history_energy = None
326 |         Estep_on = True
327 |
328 |         while (epoch < self.n_epochs):
329 |             epoch = epoch + 1
330 |             average_cost = 0.
331 |             average_energy = 0.
332 |             effective_momentum = self.final_momentum \
333 |                     if epoch > self.momentum_switchover \
334 |                     else self.initial_momentum
335 |             if Estep_on:
336 |                 for idx in xrange(n_train):
337 |                     example_cost, example_energy, example_y_pred, example_z_pred = train_model_Estep(idx, self.learning_rate_Estep,
338 |                             0.)
339 |                     average_cost += example_cost
340 |                     average_energy += example_energy
341 |                 logger.info('epoch %d E_step cost=%f energy=%f' % (epoch,
342 |                     average_cost / n_train, average_energy / n_train))
343 |                 if not history_energy or average_energy / n_train < history_energy:
344 |                     history_energy = average_energy / n_train
345 |                 if epoch > 10 and history_energy and average_energy / n_train > history_energy:
346 |                     Estep_on = False
347 |             average_cost = []
348 |             for idx in xrange(n_train):
349 |                 for i in xrange(self.n_step):
350 |                     n_iter = np.random.randint(low=self.n_iter_low[0],
351 |                             high=self.n_iter_high[0])
352 |                     head = np.random.randint(self.n_step - n_iter + 1)
353 |                     example_cost, example_y_next, example_z_next, gW_o, gb_o, gW = train_model_Mstep(idx, self.learning_rate_Mstep,
354 |                             effective_momentum, head, n_iter)
355 |                     average_cost.append(example_cost)
356 |                     if np.isnan(example_cost):
357 |                         date_obj = datetime.datetime.now()
358 |                         date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S')
359 |                         logger.warn('epoch=%d batch=%d head=%d n_iter=%d' % (epoch, i, head, n_iter))
360 |                         fname = 'debug_%s_%d_%d.json' % (date_str, epoch, i)
361 |                         fabspath = os.path.join(self.snapshot_path, fname)
362 |                         self.save(fpath=fabspath)
363 |                         print gW_o
364 |                         print gb_o
365 |                         print gW
366 |                         exit()
367 |             logger.info('epoch %d M_step n_iter=%d cost=%f' % (epoch, n_iter, np.mean(average_cost)))
368 |             # Update learning rate
369 |             if self.learning_rate_decay_every is not None:
370 |                 if epoch % self.learning_rate_decay_every == 0:
371 |                     self.learning_rate_Estep *= self.learning_rate_decay
372 |                     self.learning_rate_Mstep *= self.learning_rate_decay
373 |             if epoch % self.n_iter_change_every == 0:
374 |                 if len(self.n_iter_low) > 1:
375 |                     self.n_iter_low = self.n_iter_low[1:]
376 |                     self.n_iter_high = self.n_iter_high[1:]
377 |             # Snapshot
378 |             if self.snapshot_every is not None:
379 |                 if (epoch - 1) % self.snapshot_every == 0:
380 |                     date_obj = datetime.datetime.now()
381 |                     date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S')
382 |                     class_name = self.__class__.__name__
383 |                     fname = '%s.%s-snapshot-%d.png' % (class_name, date_str, epoch)
384 |                     plt.figure()
385 |                     n = Y_train[0].shape[0] + Y_test[0].shape[0]
386 |                     x = np.linspace(0, n, n)
387 |                     len_train = Y_train[0].shape[0]
388 |                     x_train, x_test = x[:len_train], x[len_train:]
389 |                     plt.plot(x_train, np.squeeze(Y_train[0]), 'b', linewidth=2)
390 |                     plt.plot(x_train, np.squeeze(example_y_pred), 'r', linewidth=2)
391 |                     plt.savefig(self.snapshot_path + fname)
392 |                     plt.close()
393 |                     if self.interactive:
394 |                         y_test_next, z_test_next = test_model(self.n_step, Y_test[0].shape[0])
395 |                         #logger.info('epoch %d test loss=%f' % (epoch, test_loss))
396 |                         plt.figure()
397 |                         plt.plot(x_test, np.squeeze(Y_test[0]), 'b', linewidth=2)
398 |                         plt.plot(x_test, np.squeeze(y_test_next), 'r', linewidth=2)
399 |                         fname = '%s.%s-snapshot-%d_test.png' % (class_name, date_str, epoch)
400 |                         plt.ylim(-5, 5)
401 |                         plt.savefig(self.snapshot_path + fname)
402 |                         plt.close()
403 |                     fname = '%s.%s-snapshot-%d.pkl' % (class_name, date_str, epoch)
404 |                     fabspath = os.path.join(self.snapshot_path, fname)
405 |                     self.save(fpath=fabspath)
406 |
407 |
408 | class sinTestCase(unittest.TestCase):
409 |     def runTest(self):
410 |         n = 1500
411 |         x = np.linspace(0, n, n)
412 |         theta = [.2, .331, .42, .51, .74]
413 |         theta = theta[:3]
414 |         y = np.zeros(n)
415 |         for item in theta:
416 |             y += np.sin(item * x)
417 |         # n_seq x n_t x n_in
418 |         n_train = n - 500
419 |         n_test = 500
420 |         y_train = y[:n_train]
421 |         y_test = y[n_train:]
422 |         y_train = y_train.reshape(1, n_train, 1)
423 |         y_test = y_test.reshape(1, n_test, 1)
424 |         dfg = MetaDFG(n_hidden=3, n_obsv=1, n_step=n_train, order=25, learning_rate_Estep=0.01, learning_rate_Mstep=0.001,
425 |                 n_epochs=1000, snapshot_every=1, L1_reg=0.01, L2_reg=0.01, smooth_reg=0.01,
426 |                 learning_rate_decay=.9, learning_rate_decay_every=50,
427 |                 n_iter_low=[20, 20, 20, 20], n_iter_high=[31, 51, 71, 101], n_iter_change_every=15,
428 |                 final_momentum=0.9,
429 |                 initial_momentum=0.5, momentum_switchover=500)
430 |         dfg.fit(y_train, y_test)
431 |         assert True
432 |
433 | if __name__ == "__main__":
434 |     logging.basicConfig(level=logging.INFO)
435 |     unittest.main()
436 |
437 |
--------------------------------------------------------------------------------
/dfg_minibatch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | # File Name: dfg_minibatch.py
4 | # Author: Jiezhong Qiu
5 | # Create Time: 2015/01/26 00:25
6 | # TODO:
7 |
8 | from collections import OrderedDict
9 | import numpy as np
10 | import theano
11 | import theano.tensor as T
12 | from sklearn.base import BaseEstimator
13 | import logging
14 | import json
15 | import datetime
16 | import os
17 | import cPickle as pickle
18 | import factor_minibatch
19 | import unittest
20 | #import matplotlib.pylab as plt
21 | from sklearn.metrics import precision_recall_fscore_support
22 | from sklearn.metrics import roc_auc_score
23 |
24 |
25 | logger = logging.getLogger(__name__)
26 | theano.config.exception_verbosity='high'
27 | mode = theano.Mode(linker='cvm')
28 | #mode = 'DebugMode'
29 | #mode = 'FAST_COMPILE'
30 | #mode = theano.Mode(optimizer=None)
31 | #mode = 'ProfileMode'
32 | DEBUG = True
33 | #THRESHOLD = (1 - 41./513) * 100    # settings for data/circuit.pkl (overridden below)
34 | #WEIGHT = [1./28] * 8 + [5. / 7]
35 | #DATA_DIR = 'data/circuit.pkl'
36 | THRESHOLD = 70.3
37 | WEIGHT = [5.] * 10 + [50.]
38 | DATA_DIR = 'data/fin2.pkl'
39 |
40 | def getBestThreshold(yres, ystd):
41 |     yList = sorted([ (yres[i], ystd[i]) for i in xrange(len(yres)) ], key = lambda item: item[0])
42 |     tn = tp = fn = fp = 0.
43 |     Bestf1 = 0.
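    # Sketch of the sweep below: yList is sorted by predicted score. Start with
    # every example predicted positive (threshold below the minimum score),
    # then move examples one at a time to the predicted-negative side,
    # maintaining the confusion counts incrementally and remembering the
    # threshold that maximizes F1.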
44 |     for y_, y in yList:
45 |         if y == 1:
46 |             tp += 1
47 |         else:
48 |             fp += 1
49 |     for item in yList:
50 |         # this item is now predicted as negative
51 |         if item[1] == 1:
52 |             fn += 1
53 |             tp -= 1
54 |         else:
55 |             tn += 1
56 |             fp -= 1
57 |         if tp + fn == 0 or tp + fp == 0:
58 |             continue
59 |         prec = tp / (tp + fp)
60 |         rec = tp / (tp + fn)
61 |         if prec + rec < 1e-5:
62 |             continue
63 |         f1 = 2 * prec * rec / (prec + rec)
64 |         if f1 > Bestf1:
65 |             Bestf1 = f1
66 |             threshold = item[0]
67 |     return threshold
68 | def metric(y_pred, y_std):
69 |     threshold = getBestThreshold(y_pred, y_std)
70 |     prf = precision_recall_fscore_support(y_std, y_pred > threshold, average='micro')
71 |     return 'auc %f, prec %f, rec %f, f1 %f' % (roc_auc_score(y_std, y_pred), prf[0], prf[1], prf[2])
72 |
73 | class DFG(object):
74 |     """ Dynamic factor graph class
75 |
76 |     Supported output types:
77 |     real: linear output units, use mean-squared error
78 |     binary: binary output units, use cross-entropy error
79 |     softmax: single softmax output, use cross-entropy error
80 |     """
81 |     def __init__(self, n_in, x, y_pad, n_hidden, n_obsv, n_step, order, n_seq, start, n_iter,
82 |                  factor_type='FIR', output_type='real',
83 |                  order_obsv=0, hidden_layer_config=None):
84 |         self.n_in = n_in
85 |         self.x = x
86 |         self.y_pad = y_pad
87 |         self.n_hidden = n_hidden
88 |         self.n_obsv = n_obsv
89 |         self.n_step = n_step
90 |         self.order = order
91 |         self.n_seq = n_seq
92 |         self.factor_type = factor_type
93 |         self.start = start
94 |         self.n_iter = n_iter
95 |         # For mini-batch
96 |         self.index = T.iscalar('index')    # index to a [mini]batch
97 |         self.n_ex = T.iscalar('n_ex')    # the number of examples
98 |         self.batch_size = T.iscalar('batch_size')
99 |         self.batch_start = self.index * self.batch_size
100 |         self.batch_stop = T.minimum(self.n_ex, (self.index + 1) * self.batch_size)
101 |         self.effective_batch_size = self.batch_stop - self.batch_start
102 |         self.order_obsv = order_obsv
103 |         self.hidden_layer_config = hidden_layer_config
104 |         if self.factor_type == 'FIR':
105 |             # FIR factor with n_in > 0 is not implemented
106 |             if self.n_in > 0:
107 |                 raise NotImplementedError
108 |             self.factor = factor_minibatch.FIR(n_hidden=self.n_hidden,
109 |                     n_obsv=self.n_obsv, n_step=self.n_step,
110 |                     order=self.order, n_seq=self.n_seq, start=self.start, n_iter=self.n_iter,
111 |                     batch_start=self.batch_start, batch_stop=self.batch_stop)
112 |         elif self.factor_type == 'MLP':
113 |             self.factor = factor_minibatch.MLP(n_in=self.n_in,
114 |                     x=self.x, y_pad=self.y_pad,
115 |                     n_hidden=self.n_hidden,
116 |                     n_obsv=self.n_obsv, n_step=self.n_step,
117 |                     order=self.order, n_seq=self.n_seq, start=self.start, n_iter=self.n_iter,
118 |                     batch_start=self.batch_start, batch_stop=self.batch_stop,
119 |                     order_obsv=self.order_obsv,
120 |                     hidden_layer_config=self.hidden_layer_config)
121 |         else:
122 |             raise NotImplementedError
123 |         self.output_type = output_type
124 |
125 |         self.params_Estep = self.factor.params_Estep
126 |         self.params_Mstep = self.factor.params_Mstep
127 |         self.L1 = self.factor.L1
128 |         self.L2_sqr = self.factor.L2_sqr
129 |
130 |         self.y_pred_Estep = self.factor.y_pred_Estep
131 |         self.z_pred_Estep = self.factor.z_pred_Estep
132 |         self.y_pred_Mstep = self.factor.y_pred_Mstep
133 |         self.z_pred_Mstep = self.factor.z_pred_Mstep
134 |         self.y_next = self.factor.y_next
135 |         self.z_next = self.factor.z_next
136 |         self.z = self.factor.z
137 |
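        # The factor exposes three prediction modes (see the losses below):
        # the *_Estep outputs cover the whole sequence and drive inference of
        # Z, the *_Mstep outputs cover the sampled training window, and
        # y_next/z_next roll the learned dynamics forward over held-out steps.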
138 |         self.updates = OrderedDict()
139 |         for param in self.params_Estep:
140 |             init = np.zeros(param.get_value(borrow=True).shape,
141 |                     dtype=theano.config.floatX)
142 |
143 |             self.updates[param] = theano.shared(init)
144 |         for param in self.params_Mstep:
145 |             init = np.zeros(param.get_value(borrow=True).shape,
146 |                     dtype=theano.config.floatX)
147 |             self.updates[param] = theano.shared(init)
148 |
149 |         # Loss = ||Z*(t)-Z(t)||^2 + ||Y*(t) - Y(t)||^2
150 |         self.z_std = self.z[self.start+self.order:self.start+self.order+self.n_iter,self.batch_start:self.batch_stop]
151 |         if self.output_type == 'real':
152 |             self.loss_Estep = lambda y : (self.se(self.y_pred_Estep, y) + self.se(self.z_pred_Estep, self.z[self.order:])) / n_seq
153 |             self.loss_Mstep = lambda y : (self.se(self.y_pred_Mstep, y) + self.se(self.z_pred_Mstep, self.z_std)) / self.effective_batch_size
154 |             self.test_loss = lambda y : self.se(self.y_next, y) / self.effective_batch_size
155 |         elif self.output_type == 'binary':
156 |             self.loss_Estep = lambda y : (self.nll_binary(self.y_pred_Estep, y) \
157 |                     + self.se(self.z_pred_Estep, self.z[self.order:]) \
158 |                     + 0 * self.nll_binary(self.y_pred_Estep[:,:,-1], y[:,:,-1])) / n_seq    # last term's weight is currently zero
159 |             self.loss_Mstep = lambda y : (self.nll_binary(self.y_pred_Mstep, y) \
160 |                     + self.se(self.z_pred_Mstep, self.z_std) \
161 |                     + 0 * self.nll_binary(self.y_pred_Mstep[:,:,-1], y[:,:,-1])) / self.effective_batch_size
162 |             self.test_loss = lambda y : self.nll_binary(self.y_next, y) / self.effective_batch_size
163 |         else:
164 |             raise NotImplementedError
165 |
166 |         # Smooth Term ||Z(t+1)-Z(t)||^2
167 |         # Estep
168 |         diag_Estep = np.zeros(((n_step + order)*n_hidden, (n_step+order)*n_hidden),
169 |                 dtype=theano.config.floatX)
170 |         np.fill_diagonal(diag_Estep[n_hidden:,:], 1.)
171 |         np.fill_diagonal(diag_Estep[-n_hidden:,-n_hidden:], 1.)
172 |         # (n_step+order) x n_seq x n_hidden
173 |         z_flatten = T.flatten(self.z.dimshuffle(1, 0, 2), outdim=2)
174 |         z_tm1 = T.dot(z_flatten, diag_Estep)
175 |         self.smooth_Estep = self.se(z_flatten, z_tm1) / n_seq
176 |
177 |         diag_Mstep = T.eye(self.n_iter*n_hidden, self.n_iter*n_hidden, n_hidden)
178 |         for i in xrange(n_hidden):
179 |             diag_Mstep = T.set_subtensor(diag_Mstep[-i-1, -i-1], 1)
180 |         z_next_flatten = T.flatten(self.z_next.dimshuffle(1, 0, 2), outdim=2)
181 |         z_next_tm1 = T.dot(z_next_flatten, diag_Mstep)
182 |         self.smooth_Mstep = self.se(z_next_flatten, z_next_tm1) / self.effective_batch_size
183 |     def se(self, y_1, y_2):
184 |         return T.sum((y_1 - y_2) ** 2)
185 |     def mse(self, y_1, y_2):
186 |         # error between output and target
187 |         return T.mean((y_1 - y_2) ** 2)
188 |     def nmse(self, y_1, y_2):
189 |         # treat y_1 as the approximation to y_2
190 |         return self.mse(y_1, y_2) / self.mse(y_2, 0)
191 |     def nll_binary(self, y_1, y_2):
192 |         return T.sum(T.nnet.binary_crossentropy(y_1, y_2))
193 |
194 |     def prec(self, y, y_pred):
195 |         y_out = T.round(y_pred[-1,:,-1])
196 |         y_std = T.round(y[-1,:,-1])
197 |         true_pos = T.sum(T.eq(y_std, 1) * T.eq(y_out, 1))
198 |         false_pos = T.sum(T.neq(y_std, 1) * T.eq(y_out, 1))
199 |         return (true_pos + 0.) / (true_pos + false_pos)
200 |     def rec(self, y, y_pred):
201 |         y_out = T.round(y_pred[-1,:,-1])
202 |         y_std = T.round(y[-1,:,-1])
203 |         true_pos = T.sum(T.eq(y_std, 1) * T.eq(y_out, 1))
204 |         false_neg = T.sum(T.eq(y_std, 1) * T.neq(y_out, 1))
205 |         return (true_pos + 0.)
/ (true_pos + false_neg) 206 | 207 | class MetaDFG(BaseEstimator): 208 | def __init__(self, n_in, n_hidden, n_obsv, n_step, order, n_seq, learning_rate_Estep=0.1, learning_rate_Mstep=0.1, 209 | n_epochs=100, batch_size=100, L1_reg=0.00, L2_reg=0.00, smooth_reg=0.00, 210 | learning_rate_decay=1, learning_rate_decay_every=100, 211 | factor_type='FIR', output_type='real', activation='tanh', final_momentum=0.9, 212 | initial_momentum=0.5, momentum_switchover=5, 213 | n_iter_low=[20,], n_iter_high=[50,], n_iter_change_every=50, 214 | snapshot_every=None, snapshot_path='tmp/', 215 | order_obsv=0, 216 | hidden_layer_config=None): 217 | self.n_in = int(n_in) 218 | self.n_hidden = int(n_hidden) 219 | self.n_obsv = int(n_obsv) 220 | self.n_step = int(n_step) 221 | self.order = int(order) 222 | self.n_seq = int(n_seq) 223 | self.learning_rate_Estep = float(learning_rate_Estep) 224 | self.learning_rate_Mstep = float(learning_rate_Mstep) 225 | self.learning_rate_decay = float(learning_rate_decay) 226 | self.learning_rate_decay_every=int(learning_rate_decay_every) 227 | self.n_epochs = int(n_epochs) 228 | self.batch_size = int(batch_size) 229 | self.L1_reg = float(L1_reg) 230 | self.L2_reg = float(L2_reg) 231 | self.smooth_reg = float(smooth_reg) 232 | self.factor_type = factor_type 233 | self.output_type = output_type 234 | self.activation = activation 235 | self.initial_momentum = float(initial_momentum) 236 | self.final_momentum = float(final_momentum) 237 | self.momentum_switchover = int(momentum_switchover) 238 | self.n_iter_low = n_iter_low 239 | self.n_iter_high = n_iter_high 240 | assert(len(self.n_iter_low) == len(self.n_iter_high)) 241 | self.n_iter_change_every = int(n_iter_change_every) 242 | if snapshot_every is not None: 243 | self.snapshot_every = int(snapshot_every) 244 | else: 245 | self.snapshot_every = None 246 | self.snapshot_path = snapshot_path 247 | self.order_obsv = int(order_obsv) 248 | self.hidden_layer_config = hidden_layer_config 249 | self.ready() 250 | 251 | def ready(self): 252 | # observation (where first dimension is time) 253 | self.y = T.tensor3(name='y', dtype=theano.config.floatX) 254 | self.y_pad = T.tensor3(name='y_pad', dtype=theano.config.floatX) 255 | self.x = T.tensor3(name='x', dtype=theano.config.floatX) 256 | 257 | # learning rate 258 | self.lr = T.scalar() 259 | # For mini-batch 260 | self.start = T.iscalar('start') 261 | self.n_iter = T.iscalar('n_iter') 262 | 263 | if self.activation == 'tanh': 264 | activation = T.tanh 265 | elif self.activation == 'sigmoid': 266 | activation = T.nnet.sigmoid 267 | elif self.activation == 'relu': 268 | activation = lambda x: x * (x > 0) 269 | else: 270 | raise NotImplementedError 271 | 272 | self.dfg = DFG(n_in=self.n_in, x=self.x, y_pad=self.y_pad, n_hidden=self.n_hidden, 273 | n_obsv=self.n_obsv, n_step=self.n_step, 274 | order=self.order, n_seq=self.n_seq, start=self.start, 275 | n_iter=self.n_iter, factor_type=self.factor_type, 276 | output_type=self.output_type, 277 | order_obsv=self.order_obsv, 278 | hidden_layer_config=self.hidden_layer_config) 279 | 280 | def shared_dataset(self, data): 281 | """ Load the dataset into shared variables """ 282 | 283 | shared_data = theano.shared(np.asarray(data, 284 | dtype=theano.config.floatX)) 285 | return shared_data 286 | 287 | def __getstate__(self, jsonobj=False): 288 | params = self.get_params() # all the parameters in self.__init__ 289 | weights_E = [p.get_value().tolist() if jsonobj else p.get_value() for p in self.dfg.params_Estep] 290 | weights_M = 
[p.get_value().tolist() if jsonobj else p.get_value() for p in self.dfg.params_Mstep]
291 |         weights = (weights_E, weights_M)
292 |         state = (params, weights)
293 |         return state
294 |
295 |     def _set_weights(self, weights):
296 |         weights_E, weights_M = weights
297 |         i = iter(weights_E)
298 |         for param in self.dfg.params_Estep:
299 |             param.set_value(i.next())
300 |         i = iter(weights_M)
301 |         for param in self.dfg.params_Mstep:
302 |             param.set_value(i.next())
303 |
304 |     def __setstate__(self, state):
305 |         params, weights = state
306 |         self.set_params(**params)
307 |         self.ready()
308 |         self._set_weights(weights)
309 |
310 |     def save(self, fpath='.', fname=None):
311 |         """Save a pickled representation of model state. """
312 |         fpathstart, fpathext = os.path.splitext(fpath)
313 |         if fpathext == '.pkl':
314 |             fpath, fname = os.path.split(fpath)
315 |         elif fpathext == '.json':
316 |             fpath, fname = os.path.split(fpath)
317 |         elif fname is None:
318 |             # Generate filename based on date
319 |             date_obj = datetime.datetime.now()
320 |             date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S')
321 |             class_name = self.__class__.__name__
322 |             fname = '%s.%s.pkl' % (class_name, date_str)
323 |
324 |         fabspath = os.path.join(fpath, fname)
325 |         logger.info('Saving to %s ...' % fabspath)
326 |         with open(fabspath, 'wb') as file:
327 |             if fpathext == '.json':
328 |                 state = self.__getstate__(jsonobj=True)
329 |                 json.dump(state, file,
330 |                         indent=4, separators=(',', ': '))
331 |             else:
332 |                 state = self.__getstate__()
333 |                 pickle.dump(state, file, protocol=pickle.HIGHEST_PROTOCOL)
334 |
335 |     def load(self, fpath):
336 |         """ Load model parameters from fpath. """
337 |         logger.info('Loading from %s ...' % fpath)
338 |         with open(fpath, 'rb') as file:
339 |             state = pickle.load(file)
340 |         self.__setstate__(state)
341 |
342 |     def fit(self, Y_train=None, Y_test=None,
343 |             X_train=None, X_test=None,
344 |             validation_frequency=None):
345 |         """Fit model
346 |
347 |         Pass in Y_test to compute test error and report during training
348 |         Y_train : ndarray (n_step, n_seq, n_out)
349 |         Y_test : ndarray (T, n_seq, n_out)
350 |         X_train : ndarray (n_step, n_seq, n_in)
351 |         X_test : ndarray (T, n_seq, n_in)
352 |         validation_frequency : int
353 |             in terms of number of epochs
354 |         """
355 |
356 |
357 |         if Y_test is not None:
358 |             self.interactive = True
359 |             test_set_y = self.shared_dataset(Y_test)
360 |             test_set_x = self.shared_dataset(X_test)
361 |             Y_test_binary = np.zeros_like(Y_test,
362 |                     dtype=theano.config.floatX)
363 |             for t in xrange(Y_test.shape[0]):
364 |                 for i in xrange(self.n_obsv):
365 |                     threshold = np.percentile(Y_test[t,:,i], THRESHOLD)
366 |                     Y_test_binary[t,:,i] = Y_test[t,:,i] >= threshold
367 |             test_set_y_binary = self.shared_dataset(Y_test_binary)
368 |         else:
369 |             self.interactive = False
370 |
371 |         train_set_x = self.shared_dataset(X_train)
372 |         # generate Y_pad
373 |         Y_train_pad = np.zeros(shape=(self.n_step + self.order_obsv, self.n_seq, self.n_obsv),
374 |                 dtype=theano.config.floatX)
375 |         Y_train_pad[self.order_obsv:,:,:] = Y_train
376 |         # generate Y_binary
377 |         Y_train_binary = np.zeros_like(Y_train,
378 |                 dtype=theano.config.floatX)
379 |         for t in xrange(self.n_step):
380 |             for i in xrange(self.n_obsv):
381 |                 threshold = np.percentile(Y_train[t,:,i], THRESHOLD)
382 |                 Y_train_binary[t,:,i] = Y_train[t,:,i] >= threshold
383 |         train_set_y = self.shared_dataset(Y_train)
384 |         train_set_y_pad = self.shared_dataset(Y_train_pad)
385 |         train_set_y_binary = self.shared_dataset(Y_train_binary)
386 |         n_train =
train_set_y.get_value(borrow=True).shape[1] 387 | n_train_batches = int(np.ceil(float(n_train) / self.batch_size)) 388 | if self.interactive: 389 | n_test = test_set_y.get_value(borrow=True).shape[1] 390 | n_test_batches = int(np.ceil(float(n_test) / self.batch_size)) 391 | 392 | logger.info('...building the model') 393 | 394 | index = self.dfg.index 395 | n_ex = self.dfg.n_ex 396 | # learning rate (may change) 397 | l_r = T.scalar('l_r', dtype=theano.config.floatX) 398 | mom = T.scalar('mom', dtype=theano.config.floatX) 399 | 400 | cost_Estep = self.dfg.loss_Estep(self.y) \ 401 | + self.smooth_reg * self.dfg.smooth_Estep \ 402 | + self.L1_reg * self.dfg.L1 \ 403 | + self.L2_reg * self.dfg.L2_sqr 404 | 405 | cost_Mstep = self.dfg.loss_Mstep(self.y) \ 406 | + self.smooth_reg * self.dfg.smooth_Mstep \ 407 | + self.L1_reg * self.dfg.L1 \ 408 | + self.L2_reg * self.dfg.L2_sqr 409 | 410 | # mini-batch implement 411 | batch_size = self.dfg.batch_size 412 | batch_start = self.dfg.batch_start 413 | batch_stop = self.dfg.batch_stop 414 | effective_batch_size = self.dfg.effective_batch_size 415 | get_batch_size = theano.function(inputs=[index, n_ex, batch_size], 416 | outputs=effective_batch_size) 417 | 418 | givens=[(self.y, train_set_y_binary), 419 | (self.x, train_set_x), 420 | (self.y_pad, train_set_y_pad)] 421 | if self.order_obsv == 0: 422 | givens = givens[:-1] 423 | compute_train_error_Estep = theano.function(inputs=[], 424 | outputs=[self.dfg.loss_Estep(self.y), self.dfg.y_pred_Estep, 425 | self.dfg.prec(self.y, self.dfg.y_pred_Estep), self.dfg.rec(self.y, self.dfg.y_pred_Estep)], 426 | givens=OrderedDict(givens), 427 | mode=mode) 428 | 429 | # compute_train_error_Mstep = theano.function(inputs=[index, n_ex, self.start, self.n_iter, batch_size], 430 | # outputs=self.dfg.loss_Mstep(self.y), 431 | # givens=OrderedDict([(self.y, train_set_y[self.start:self.start+self.n_iter, batch_start:batch_stop]), 432 | # (self.y_pad, train_set_y[self.start:self.start+1, batch_start:batch_stop]), 433 | # (self.x, train_set_x[self.start:self.start+self.n_iter, batch_start:batch_stop])]), 434 | # mode=mode) 435 | if self.interactive: 436 | givens=[(self.y, test_set_y_binary[:, batch_start:batch_stop]), 437 | (self.x, test_set_x[:,batch_start:batch_stop]), 438 | (self.y_pad, train_set_y_pad)] 439 | if self.order_obsv == 0: 440 | givens = givens[:-1] 441 | compute_test_error = theano.function(inputs=[index, n_ex, self.start, self.n_iter, batch_size], 442 | outputs=[self.dfg.test_loss(self.y), self.dfg.y_next, self.y], 443 | givens=OrderedDict(givens), 444 | mode=mode) 445 | 446 | 447 | # compute the gradient of cost with respect to theta = (W, W_in, W_out) 448 | # gradients on the weights using BPTT 449 | # E step 450 | gparams_Estep = [] 451 | for param in self.dfg.params_Estep: 452 | gparam = T.grad(cost_Estep, param) 453 | gparams_Estep.append(gparam) 454 | 455 | updates_Estep = OrderedDict() 456 | for param, gparam in zip(self.dfg.params_Estep, gparams_Estep): 457 | weight_update = self.dfg.updates[param] 458 | upd = mom * weight_update - l_r * gparam 459 | updates_Estep[weight_update] = upd 460 | updates_Estep[param] = param + upd 461 | 462 | # M step 463 | gparams_Mstep = [] 464 | for param in self.dfg.params_Mstep: 465 | gparam = T.grad(cost_Mstep, param) 466 | gparams_Mstep.append(gparam) 467 | 468 | updates_Mstep = OrderedDict() 469 | for param, gparam in zip(self.dfg.params_Mstep, gparams_Mstep): 470 | weight_update = self.dfg.updates[param] 471 | upd = mom * weight_update - l_r * gparam 472 | 
updates_Mstep[weight_update] = upd 473 | updates_Mstep[param] = param + upd 474 | 475 | # compiling a Theano function `train_model_Estep` that returns the 476 | # cost, but in the same time updates the parameter of the 477 | # model based on the rules defined in `updates_Estep` 478 | givens=[(self.y, train_set_y_binary), 479 | (self.x, train_set_x), 480 | (self.y_pad, train_set_y_pad)] 481 | if self.order_obsv == 0: 482 | givens = givens[:-1] 483 | train_model_Estep = theano.function(inputs=[l_r, mom], 484 | outputs=[cost_Estep, self.dfg.loss_Estep(self.y), self.dfg.y_pred_Estep, self.dfg.z_pred_Estep], 485 | updates=updates_Estep, 486 | givens=OrderedDict(givens), 487 | mode=mode) 488 | # updates the parameter of the model based on 489 | # the rules defined in `updates_Mstep` 490 | givens=[(self.y, train_set_y_binary[self.start:self.start+self.n_iter, batch_start:batch_stop]), 491 | (self.x, train_set_x[self.start:self.start+self.n_iter, batch_start:batch_stop]), 492 | (self.y_pad, train_set_y_pad) ] 493 | if self.order_obsv == 0: 494 | givens = givens[:-1] 495 | train_model_Mstep = theano.function(inputs=[index, n_ex, l_r, mom, self.start, self.n_iter, batch_size], 496 | outputs=[cost_Mstep, self.dfg.y_pred_Mstep, self.dfg.z_pred_Mstep], 497 | updates=updates_Mstep, 498 | givens=OrderedDict(givens), 499 | mode=mode) 500 | ############### 501 | # TRAIN MODEL # 502 | ############### 503 | logger.info('... training') 504 | epoch = 0 505 | auc = [] 506 | while (epoch < self.n_epochs): 507 | epoch = epoch + 1 508 | effective_momentum = self.final_momentum \ 509 | if epoch > self.momentum_switchover \ 510 | else self.initial_momentum 511 | example_cost, example_energy, example_y_pred_Estep, example_z_pred_Estep = train_model_Estep(self.learning_rate_Estep, 0.) 
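            # Each epoch alternates one full-batch E-step (the call above,
            # with momentum fixed at 0, inferring the latent states Z) with
            # minibatch M-step updates on randomly sampled windows
            # [head, head + n_iter) of the sequence.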
512 |             logger.info('epoch %d E_step cost=%f energy=%f' % (epoch,
513 |                 example_cost, example_energy))
514 |             for minibatch_idx in xrange(n_train_batches):
515 |                 average_cost = []
516 |                 for i in xrange(self.n_step):
517 |                     n_iter = np.random.randint(low=self.n_iter_low[0],
518 |                             high=self.n_iter_high[0])
519 |                     head = np.random.randint(self.n_step - n_iter + 1)
520 |                     example_cost, example_y_pred_Mstep, example_z_pred_Mstep = train_model_Mstep(minibatch_idx, n_train, self.learning_rate_Mstep,
521 |                             effective_momentum, head, n_iter, self.batch_size)
522 |                     average_cost.append(example_cost)
523 |                 '''
524 |                 test_losses, test_precs, test_recs = [], [], []
525 |                 auc_now = []
526 |                 for ii in xrange(n_test_batches):
527 |                     test_loss, y_next, y_std = compute_test_error(ii, n_test, self.n_step, Y_test.shape[0], self.batch_size)
528 |                     for j in xrange(y_next.shape[0]):
529 |                         auc_now.append(roc_auc_score(y_std[j,:,-1], y_next[j,:,-1]))
530 |                 auc.append(np.mean(auc_now))
531 |                 '''
532 |                 logger.info('epoch %d batch %d M_step cost=%f' % (epoch, minibatch_idx, np.mean(average_cost)))
533 |                 #iters = (epoch - 1) * n_train_batches + minibatch_idx + 1
534 |             if validation_frequency is not None and epoch % validation_frequency == 0:
535 |                 # Compute loss on training set (consider E-step loss only)
536 |                 train_loss_Estep, y_pred_Estep, prec, rec = compute_train_error_Estep()
537 |                 if self.interactive:
538 |                     test_losses, test_precs, test_recs = [], [], []
539 |                     for i in xrange(n_test_batches):
540 |                         test_loss, y_next, y_std = compute_test_error(i, n_test, self.n_step, Y_test.shape[0], self.batch_size)
541 |                         logger.info('epoch %d, %s batch tr_loss %f te_loss %f' % \
542 |                                 (epoch, 'valid' if i==0 else 'test', train_loss_Estep, test_loss))
543 |                         for j in xrange(y_next.shape[0]):
544 |                             logger.info('behavior at time stamp %d' % (self.n_step + j + 1))
545 |                         if i == 0:
546 |                             y_historic = Y_train[:,:self.batch_size,:]
547 |                         else:
548 |                             y_historic = Y_train[:,self.batch_size:,:]
549 |                         y_std = np.concatenate([y_historic, y_std])
550 |                         y_next = np.concatenate([y_historic, y_next])
551 |                         #cert_pred = np.squeeze(np.average(y_next, axis=0, weights=WEIGHT))
552 |                         cert_pred = np.squeeze(np.average(y_next, axis=0))
553 |                         #cert_std = np.squeeze(np.average(y_std, axis=0, weights=WEIGHT))
554 |                         cert_std = np.squeeze(np.average(y_std, axis=0))
555 |                         median = np.percentile(cert_std, THRESHOLD)
556 |                         cert_std = cert_std > median
557 |                         logger.info('certificate prediction')
558 |                         logger.info('%s' % \
559 |                                 metric(cert_pred, cert_std))
560 |                 else:
561 |                     logger.info('epoch %d, tr_loss %f tr_prec %f tr_rec %f' % \
562 |                             (epoch, train_loss_Estep, prec, rec))
563 |             # Update learning rate
564 |             if self.learning_rate_decay_every is not None:
565 |                 if epoch % self.learning_rate_decay_every == 0:
566 |                     self.learning_rate_Estep *= self.learning_rate_decay
567 |                     self.learning_rate_Mstep *= self.learning_rate_decay
568 |             if epoch % self.n_iter_change_every == 0:
569 |                 if len(self.n_iter_low) > 1:
570 |                     self.n_iter_low = self.n_iter_low[1:]
571 |                     self.n_iter_high = self.n_iter_high[1:]
572 |
573 |             # Snapshot
574 |             if self.snapshot_every is not None:
575 |                 if (epoch + 1) % self.snapshot_every == 0:
576 |                     date_obj = datetime.datetime.now()
577 |                     date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S')
578 |                     class_name = self.__class__.__name__
579 |                     #fname = '%s.%s-snapshot-%d.pkl' % (class_name, date_str, epoch + 1)    # superseded by the json snapshot below
580 |                     fname = '%s.%s-snapshot-%d.json' % (class_name, date_str, epoch + 1)
581 |                     fabspath = os.path.join(self.snapshot_path, fname)
582 |
583 | for i in xrange(n_test_batches):
584 | test_loss, y_next, y_std = compute_test_error(i, n_test, self.n_step, Y_test.shape[0], self.batch_size)
585 | if i == 0:
586 | y_historic = Y_train[:,:self.batch_size,:]
587 | else:
588 | y_historic = Y_train[:,self.batch_size:,:]
589 | y_std = np.concatenate([y_historic, y_std])
590 | y_next = np.concatenate([y_historic, y_next])
591 | cert_pred = np.squeeze(np.average(y_next, axis=0))
592 | logger.info('dump certificate prediction result')
593 | return cert_pred
594 | 
595 | 
596 | class xtxTestCase(unittest.TestCase):
597 | def runTest(self):
598 | with open(DATA_DIR, 'rb') as file:
599 | Y, X = pickle.load(file)
600 | #X = X[:,:1000,:]
601 | #Y = Y[:,:1000,:]
602 | n_in = X.shape[2]
603 | T = -6
604 | #T = -9
605 | Y_train = Y[:T]
606 | Y_test = Y[T:]
607 | X_train = X[:T]
608 | X_test = X[T:]
609 | #print np.sum(data[-1,:,-1])
610 | n_step, n_seq, n_obsv = Y_train.shape
611 | logger.info('load from pkl train_step=%d test_step=%d, n_seq=%d n_obsv=%d n_in=%d', n_step, X_test.shape[0], n_seq, n_obsv, n_in)
612 | start = datetime.datetime.now()
613 | dfg = MetaDFG(n_in=n_in, n_hidden=2, n_obsv=n_obsv, n_step=n_step, order=2, n_seq=n_seq, learning_rate_Estep=0.5, learning_rate_Mstep=0.1,
614 | factor_type='MLP', output_type='binary',
615 | n_epochs=10, batch_size=n_seq, snapshot_every=None, L1_reg=0.00, L2_reg=0.00, smooth_reg=0.00,
616 | learning_rate_decay=.5, learning_rate_decay_every=100,
617 | n_iter_low=[n_step / 2], n_iter_high=[n_step + 1], n_iter_change_every=100,
618 | final_momentum=0.5,
619 | initial_momentum=0.3, momentum_switchover=1500,
620 | order_obsv=0,
621 | hidden_layer_config=[])
622 | dfg.fit(Y_train=Y_train, X_train=X_train, Y_test=Y_test, X_test=X_test, validation_frequency=2000)
623 | print datetime.datetime.now() - start
624 | 
625 | if __name__ == "__main__":
626 | logging.basicConfig(level=logging.INFO,
627 | format='%(asctime)s %(message)s') # include timestamp
628 | unittest.main()
629 | 
630 | 
631 | 
--------------------------------------------------------------------------------
/factor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # encoding: utf-8
4 | # File Name: factor.py
5 | # Author: Jiezhong Qiu
6 | # Create Time: 2015/01/26 03:05
7 | # TODO:
8 | 
9 | import numpy as np
10 | import theano
11 | import theano.tensor as T
12 | import logging
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
16 | class Factor(object):
17 | def __init__(self, n_hidden, n_obsv, n_step, order, start, n_iter):
18 | self.n_hidden = n_hidden
19 | self.n_obsv = n_obsv
20 | self.n_step = n_step
21 | self.order = order
22 | self.start = start
23 | self.n_iter = n_iter
24 | # initialize np.random
25 | #np.random.seed(20)
26 | # we consider g as a linear observation model
27 | # Y(t) = g(W_o, Z(t))
28 | W_o_bound = n_obsv
29 | W_o_init = np.asarray(np.random.uniform(size=(n_hidden, n_obsv),
30 | low=-1.0 / W_o_bound, high=1.0 / W_o_bound),
31 | dtype=theano.config.floatX)
32 | self.W_o = theano.shared(value=W_o_init, name='W_o')
33 | b_o_init = np.zeros((n_obsv,), dtype=theano.config.floatX)
34 | self.b_o = theano.shared(value=b_o_init, name='b_o')
35 | #z0_init = np.zeros(size=(order, n_hidden), dtype=theano.config.floatX)
36 | #self.z0 = theano.shared(value=z0_init, name='z0')
37 | z_bound = n_hidden
38 | z_init = np.asarray(np.random.uniform(size=(n_step + order, n_hidden),
39 | low=-1.0 / z_bound, high=1.0 / z_bound),
40 | dtype=theano.config.floatX)
41 | self.z = theano.shared(value=z_init, name='z')
42 | self.params_Estep = [self.z]
43 | self.params_Mstep = [self.W_o, self.b_o]
44 | self.L1 = abs(self.W_o).sum()
45 | #+ abs(self.b_o).sum()
46 | #+ abs(self.z).sum()
47 | self.L2_sqr = (self.W_o ** 2).sum()
48 | #+ (self.b_o ** 2).sum()
49 | #+ (self.z ** 2).sum()
50 | 
51 | class FIR(Factor):
52 | def __init__(self, n_hidden, n_obsv, n_step, order, start, n_iter):
53 | Factor.__init__(self, n_hidden, n_obsv, n_step, order, start, n_iter)
54 | W_bound = order
55 | W_init = np.asarray(np.random.uniform(size=(order, n_hidden),
56 | low=-1.0 / W_bound, high=1.0 / W_bound),
57 | dtype=theano.config.floatX)
58 | self.W = theano.shared(value=W_init, name="W")
59 | self.params_Mstep.append(self.W)
60 | self.L1 += abs(self.W).sum()
61 | self.L2_sqr += (self.W ** 2).sum()
62 | 
63 | def step(*args):
64 | """
65 | z_tmp, ..., z_tm1 \in R^{1,n_hidden}
66 | """
67 | z_stack = T.stacklists(args)
68 | z_merge = z_stack * self.W
69 | z_t = T.sum(z_merge, axis=0)
70 | y_t = T.dot(z_t, self.W_o) + self.b_o
71 | return z_t, y_t
72 | 
73 | # z_pred, y_pred for the E-step
74 | [self.z_pred, self.y_pred], _ = theano.scan(step,
75 | sequences=[ dict(input=self.z, taps=range(-order, 0)) ])
76 | 
77 | 
78 | # def onestep(z_tm1, z_tm0):
79 | # z_t = T.sum(z_tm1 * self.W, axis=0)
80 | # y_t = T.dot(z_t, self.W_o) + self.b_o
81 | # return z_t, y_t
82 | # # implement a hmm version, thus order == 1
83 | # [self.z_pred, self.y_pred], _ = theano.scan(onestep,
84 | # sequences=[ dict(input=self.z, taps=[-1, -0]) ])
85 | [self.z_next, self.y_next], _ = theano.scan(step,
86 | n_steps=self.n_iter,
87 | outputs_info = [ dict(initial=self.z[self.start: self.start + order], taps=range(-order, 0)) , None ])
88 | if __name__ == "__main__":
89 | pass
90 | 
91 | 
92 | 
--------------------------------------------------------------------------------
/factor_minibatch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # encoding: utf-8
4 | # File Name: factor_minibatch.py
5 | # Author: Jiezhong Qiu
6 | # Create Time: 2015/01/26 03:05
7 | # TODO:
8 | 
9 | import numpy as np
10 | import theano
11 | import theano.tensor as T
12 | import logging
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
16 | class Factor(object):
17 | def __init__(self, n_in, n_hidden, n_obsv, n_step, order, n_seq, start, n_iter):
18 | self.n_in = n_in
19 | self.n_hidden = n_hidden
20 | self.n_obsv = n_obsv
21 | self.n_step = n_step
22 | self.order = order
23 | self.n_seq = n_seq
24 | self.start = start # symbolic
25 | self.n_iter = n_iter # symbolic
26 | # initialize np.random
27 | #np.random.seed(20)
28 | # we consider g as a linear observation model
29 | # Y(t) = g(W_o, Z(t))
30 | W_o_n_in = n_hidden
31 | W_o_n_out = n_obsv
32 | W_o_bound = 4 * np.sqrt(6. / (W_o_n_in + W_o_n_out)) # Glorot-style init bound for sigmoid units
33 | W_o_init = np.asarray(np.random.uniform(size=(n_hidden, n_obsv),
34 | low=-W_o_bound, high=W_o_bound),
35 | dtype=theano.config.floatX)
36 | self.W_o = theano.shared(value=W_o_init, name='W_o')
37 | b_o_init = np.zeros((n_obsv,), dtype=theano.config.floatX)
38 | self.b_o = theano.shared(value=b_o_init, name='b_o')
39 | #z0_init = np.zeros(size=(order, n_hidden), dtype=theano.config.floatX)
40 | #self.z0 = theano.shared(value=z0_init, name='z0')
41 | z_init = np.asarray(np.random.uniform(size=(n_step + order, n_seq, n_hidden),
42 | low=0., high=1.0),
43 | dtype=theano.config.floatX)
44 | self.z = theano.shared(value=z_init, name='z')
45 | self.params_Estep = [self.z]
46 | self.params_Mstep = [self.W_o, self.b_o]
47 | self.L1 = abs(self.W_o).sum()
48 | #+ abs(self.b_o).sum()
49 | #+ abs(self.z).sum()
50 | self.L2_sqr = (self.W_o ** 2).sum()
51 | #+ (self.b_o ** 2).sum()
52 | #+ (self.z ** 2).sum()
53 | 
54 | class MLP(Factor):
55 | def __init__(self, n_in, x, y_pad, n_hidden, n_obsv, n_step, order, n_seq, start, n_iter, batch_start, batch_stop, order_obsv=0, hidden_layer_config=None):
56 | logger.info('Building an MLP factor with observation order %d and latent-state order %d ...' % (order_obsv, order))
57 | Factor.__init__(self, n_in, n_hidden, n_obsv, n_step, order, n_seq, start, n_iter)
58 | # actual values are supplied through givens=[...] when the Theano functions are compiled
59 | self.x = x
60 | self.y_pad = y_pad
61 | W_n_in = order * n_hidden + n_in + order_obsv * n_obsv
62 | W_n_out = n_hidden
63 | if hidden_layer_config is None:
64 | W_n_hidden = (W_n_in + W_n_out) * 2 / 3 # heuristic hidden-layer size
65 | layer_size = [W_n_in, W_n_hidden, W_n_out]
66 | else:
67 | layer_size = [W_n_in] + hidden_layer_config + [W_n_out]
68 | logger.info('MLP layer sizes: %s' % ' '.join([str(item) for item in layer_size]))
69 | self.Ws, self.bs = [], []
70 | for i in xrange(len(layer_size) - 1):
71 | W_bound = 4 * np.sqrt(6. / (layer_size[i] + layer_size[i+1]))
72 | W_init = np.asarray(np.random.uniform(size=(layer_size[i], layer_size[i+1]),
73 | low=-W_bound, high=W_bound),
74 | dtype=theano.config.floatX)
75 | W = theano.shared(value=W_init, name='W_%d' % i)
76 | b_init = np.zeros((layer_size[i+1],), dtype=theano.config.floatX)
77 | b = theano.shared(value=b_init, name='b_%d' % i)
78 | self.params_Mstep.append(W)
79 | self.params_Mstep.append(b)
80 | self.L1 += abs(W).sum()
81 | self.L2_sqr += (W ** 2).sum()
82 | self.Ws.append(W)
83 | self.bs.append(b)
84 | 
85 | #rng = np.random.RandomState()
86 | #self.srng = T.shared_randomstreams.RandomStreams(
87 | # rng.randint(999999))
88 | 
89 | def step(*args):
90 | """
91 | args include
92 | x_t \in R^{batch_size, n_in}
93 | z_tmp, ..., z_tm1 \in R^{batch_size, n_hidden}
94 | y_pad \in R^{batch_size, n_obsv}
95 | """
96 | # args = list(args)
97 | # for i in xrange(order_obsv):
98 | # args[-i-1] = args[-i-1][:,:-1]
99 | z_t = T.concatenate(args, axis=1) # (batch_size, n_hidden x order + n_in + n_obsv)
100 | for i in xrange(len(layer_size) - 1):
101 | this_W = self.Ws[i]
102 | this_b = self.bs[i]
103 | z_t = T.nnet.sigmoid(T.dot(z_t, this_W) + this_b)
104 | y_t = T.nnet.sigmoid(T.dot(z_t, self.W_o) + self.b_o)
105 | return z_t, y_t
106 | 
107 | # Compute z_pred_Estep, y_pred_Estep for the E-step
108 | # Here x should be T x n_seq x n_in
109 | # and y_pad should be T x n_seq x n_obsv
110 | sequences=[ dict(input=self.x, taps=[0]),
111 | dict(input=self.z, taps=range(-order, 0)),
112 | dict(input=self.y_pad, taps=range(-order_obsv, 0)) ]
113 | if order_obsv == 0:
114 | sequences = sequences[:-1]
115 | [self.z_pred_Estep, self.y_pred_Estep], _ = theano.scan(step,
116 | sequences=sequences)
117 | 
118 | self.z_subtensor = self.z[self.start:self.start+order+n_iter,batch_start:batch_stop]
119 | self.y_subtensor = self.y_pad[self.start:self.start+order_obsv+n_iter, batch_start:batch_stop]
120 | sequences=[ dict(input=self.x, taps=[0]),
121 | dict(input=self.z_subtensor, taps=range(-order, 0)),
122 | dict(input=self.y_subtensor, taps=range(-order_obsv, 0)) ]
123 | if order_obsv == 0:
124 | sequences = sequences[:-1]
125 | [self.z_pred_Mstep, self.y_pred_Mstep], _ = theano.scan(step,
126 | sequences=sequences)
127 | 
128 | # Compute z_next, y_next for either the M-step or performance evaluation
129 | # Here x should be n_iter x effective_batch_size x n_in
130 | # and y_pad should be 1 x effective_batch_size x n_obsv
131 | 
132 | self.z_subtensor = self.z[self.start:self.start+order,batch_start:batch_stop]
133 | self.y_subtensor = self.y_pad[self.start:self.start+order_obsv,batch_start:batch_stop]
134 | outputs_info = [ dict(initial=self.z_subtensor if order > 1 else self.z_subtensor[0], taps=range(-order, 0)),
135 | dict(initial=self.y_subtensor if order_obsv > 1 else self.y_subtensor[0], taps=range(-order_obsv, 0)) ]
136 | if order_obsv == 0:
137 | outputs_info[-1] = None
138 | [self.z_next, self.y_next], _ = theano.scan(step,
139 | sequences=[ dict(input=self.x, taps=[0]) ],
140 | n_steps=self.n_iter,
141 | outputs_info=outputs_info)
142 | if __name__ == "__main__":
143 | pass
144 | 
145 | 
146 | 
--------------------------------------------------------------------------------
/mkdata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | # File Name: mkdata.py
4 | # Author: Jiezhong Qiu
5 | # Create Time: 2015/02/09 14:12
6 | # TODO: generate Y data for dfg
7 | 
8 | import os
9 | import json
10 | import util
11 | import datetime
12 | import logging
13 | import pickle
14 | from xlrd import open_workbook
15 | import numpy as np
16 | from sklearn.metrics import precision_recall_fscore_support
17 | from sklearn.metrics import roc_auc_score
18 | 
19 | logger = logging.getLogger(__name__)
20 | DEV_PATH = "../../xtx/dev/"
21 | RAW_GRADE_DIR = DEV_PATH + "RawData/grades/"
22 | GRADE_DIR = DEV_PATH + "Data/Excel.json"
23 | FORUM_DIR = DEV_PATH + "Data/Forum.json"
24 | COURSE_INFO_DIR = DEV_PATH + "Data/Course_.json"
25 | BEHAVIOR_DIR = DEV_PATH + "Data/LearningBehavior.json"
26 | LEARNING_TIME_DIR = DEV_PATH + 'Data/Trackinglog.json'
27 | MONGO_DIR = DEV_PATH + 'Data/MongoDB.json'
28 | DEMOGRAPHICS_DIR = DEV_PATH + 'Data/Demographics.json'
29 | EPS = 1e-3
30 | 
31 | class mkdata(object):
32 | def __init__(self):
33 | '''generate Y data in the following format:
34 | feature[uid][T] is a list of features for user uid at time T
35 | the features should be additive
36 | we remove register-only students from the dataset
37 | '''
38 | self.feature = {}
39 | self.feature_num = 0
40 | with open(COURSE_INFO_DIR) as f:
41 | courses = json.load(f)
42 | self.getUser()
43 | self.start = util.parseDate(courses[self.course]['start'])
44 | self.end = util.parseDate(courses[self.course]['end'])
45 | for uid in self.feature:
46 | for single_date in util.daterange(self.start, self.end):
47 | self.feature[uid][single_date] = []
48 | logger.info('course: %s user: %d start: %s end: %s', self.course,
49 | len(self.feature), self.start.isoformat(), self.end.isoformat())
50 | 
51 | def expand_feature(self, num):
52 | self.feature_num += num
53 | for uid in self.feature:
54 | for single_date in self.feature[uid]:
55 | self.feature[uid][single_date] = [0.] * num + self.feature[uid][single_date]
56 | def expand_X(self, num):
57 | self.n_in += num
58 | for uid in self.X:
59 | for T in xrange(len(self.ddls) + 1):
60 | self.X[uid][T] = [0.] * num + self.X[uid][T]
61 | def getTimeStamp(self, date_obj):
62 | for index, item in enumerate(self.ddls):
63 | if date_obj <= item:
64 | return index
65 | return len(self.ddls)
66 | def getUser(self):
67 | with open(GRADE_DIR) as f:
68 | grades = json.load(f)[0]
69 | for uid in grades:
70 | if self.course in grades[uid] and grades[uid][self.course] > 0:
71 | self.feature[uid] = {}
72 | def getLearningData(self):
73 | # four additive features per day:
74 | # video_time, assign_time, video_day, assign_day
75 | self.expand_feature(4)
76 | with open(LEARNING_TIME_DIR) as f:
77 | learn = json.load(f)
78 | for uid in learn:
79 | if uid not in self.feature:
80 | continue
81 | if self.course not in learn[uid]:
82 | continue
83 | for item in learn[uid][self.course]:
84 | single_date = util.parseDate(item[0])
85 | if single_date < self.start or single_date >= self.end:
86 | continue
87 | self.feature[uid][single_date][0] += item[1]
88 | self.feature[uid][single_date][1] += item[2]
89 | self.feature[uid][single_date][2] += 1
90 | self.feature[uid][single_date][3] += 1
91 | 
92 | def getBehaviorData(self):
93 | # six additive features per day, indexed 0-5: video, problem, sequential,
94 | # in-time visit (sequential opened within its release period),
95 | # chapter,
96 | #self.expand_feature(3)
97 | # and ddl hit (a visit whose deadline falls in the current period)
98 | self.expand_feature(6)
99 | with open(BEHAVIOR_DIR) as f:
100 | behavior = json.load(f)
101 | with open(MONGO_DIR) as f:
102 | mongo = json.load(f)
103 | for uid in behavior:
104 | if uid not in self.feature:
105 | continue
106 | for date in behavior[uid]:
107 | single_date = util.parseDate(date)
108 | if single_date < self.start or single_date >= self.end:
109 | continue
110 | for log in behavior[uid][date]:
111 | course, catagory = util.parseLog(log)
112 | if course == self.course:
113 | if log in mongo and mongo[log]['due'] is not None:
114 | T_ddl = self.getTimeStamp(util.parseDate(mongo[log]['due']))
115 | T = self.getTimeStamp(single_date)
116 | if T_ddl == T:
117 | self.feature[uid][single_date][5] += 1
118 | if catagory == 'video':
119 | self.feature[uid][single_date][0] += 1
120 | elif catagory == 'problem':
121 | self.feature[uid][single_date][1] += 1
122 | elif catagory == 'sequential':
123 | self.feature[uid][single_date][2] += 1
124 | try:
125 | date_obj = mongo[log]['start']
126 | except KeyError: # log id missing from the MongoDB dump
127 | print log
128 | continue
129 | if date_obj is None:
130 | continue
131 | date_obj = util.parseDate(date_obj)
132 | if self.getTimeStamp(date_obj) == self.getTimeStamp(single_date):
133 | self.feature[uid][single_date][3] += 1
134 | elif catagory == 'chapter':
135 | self.feature[uid][single_date][4] += 1
136 | def save(self, fpath='.', fname=None):
137 | """save a json or pickle representation of the data set"""
138 | fpathstart, fpathext = os.path.splitext(fpath)
139 | if fpathext == '.json' or fpathext == '.pkl':
140 | fpath, fname = os.path.split(fpath)
141 | elif fname is None:
142 | # generate filename based on date
143 | date_obj = datetime.datetime.now()
144 | date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S')
145 | class_name = self.__class__.__name__
146 | fname = '%s.%s.pkl' % (class_name, date_str)
147 | fabspath = os.path.join(fpath, fname)
148 | logger.info('Saving to %s ...' % fabspath)
149 | with open(fabspath, 'wb') as file:
150 | if fpathext == '.json':
151 | json.dump(self.feature, file,
152 | indent=4, separators=(',', ': '))
153 | else:
154 | pickle.dump(self.feature, file, protocol=pickle.HIGHEST_PROTOCOL)
155 | 
156 | def save_dataset(self, fpath='.', fname=None):
157 | fpathstart, fpathext = os.path.splitext(fpath)
158 | if fpathext == '.pkl':
159 | fpath, fname = os.path.split(fpath)
160 | elif fname is None:
161 | # generate filename based on date
162 | date_obj = datetime.datetime.now()
163 | date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S')
164 | class_name = self.__class__.__name__
165 | fname = '%s.%s.pkl' % (class_name, date_str)
166 | fabspath = os.path.join(fpath, fname)
167 | logger.info('Saving dataset to %s shape=(%d, %d, %d)...' % (fabspath, len(self.ddls)+1, len(self.feature), self.feature_num))
168 | # n_step x n_seq x n_obsv
169 | n_step = len(self.ddls) + 1
170 | n_seq = len(self.feature)
171 | dataset = np.zeros(shape=(n_step, n_seq, self.feature_num))
172 | for index, uid in enumerate(self.feature):
173 | assert len(self.feature[uid]) == len(self.ddls) + 1
174 | for T in xrange(len(self.feature[uid])):
175 | assert len(self.feature[uid][T]) == self.feature_num
176 | for i in xrange(self.feature_num):
177 | dataset[T][index][i] = self.feature[uid][T][i]
178 | X = np.zeros(shape=(n_step, n_seq, self.n_in))
179 | for index, uid in enumerate(self.feature):
180 | for T in xrange(len(self.X[uid])):
181 | if len(self.X[uid][T]) != self.n_in:
182 | print len(self.X[uid][T]), self.n_in
183 | assert len(self.X[uid][T]) == self.n_in
184 | for i in xrange(self.n_in):
185 | X[T][index][i] = self.X[uid][T][i]
186 | 
187 | with open(fabspath, 'wb') as file:
188 | pickle.dump((dataset, X), file, protocol=pickle.HIGHEST_PROTOCOL)
189 | self.dataset = dataset
190 | def getDDL(self):
191 | self.ddls = []
192 | with open(MONGO_DIR) as f:
193 | mongo = json.load(f)
194 | for item in mongo:
195 | try:
196 | course, catagory = util.parseLog(item)
197 | except TypeError: # parseLog returns None for keys that do not match the i4x pattern
198 | continue
199 | if course == self.course:
200 | if mongo[item]['due'] is not None:
201 | #print item, mongo[item]['due']
202 | self.ddls.append(util.parseDate(mongo[item]['due']))
203 | self.ddls.sort()
204 | if self.course == "TsinghuaX/20220332_2X/_":
205 | self.ddls = self.ddls[:-1]
206 | for item in self.ddls:
207 | print item, (item - self.start).days / float((self.end - self.start).days)
208 | def getStageFeature(self):
209 | feature = {}
210 | for uid in self.feature:
211 | feature[uid] = {}
212 | for single_date in self.feature[uid]:
213 | #date_str = single_date.isoformat()
214 | delta = (single_date - self.start).days
215 | feature[uid][delta] = self.feature[uid][single_date]
216 | sample = self.ddls + [self.end - datetime.timedelta(1)]
217 | sample = [(item - self.start).days for item in sample]
218 | self.feature = {}
219 | for uid in feature:
220 | self.feature[uid] = []
221 | p = 0
222 | tmp = [0.] * self.feature_num
223 | for T in xrange(0, (self.end-self.start).days):
224 | if T <= sample[p]:
225 | for i in xrange(self.feature_num):
226 | tmp[i] += feature[uid][T][i]
227 | if T == sample[p]:
228 | self.feature[uid].append(tmp)
229 | p += 1
230 | tmp = [0.] * self.feature_num
231 | def filte(self, filter_type='binary', threshold=0.3):
232 | # first merge self.feature and self.score
233 | self.feature_num += 1
234 | for uid in self.score:
235 | for j in xrange(len(self.ddls) + 1):
236 | self.feature[uid][j].append(self.score[uid][j])
237 | for i in xrange(self.feature_num):
238 | for T in xrange(len(self.ddls) + 1):
239 | tmp = sorted([self.feature[uid][T][i] for uid in self.feature], reverse=True)
240 | if tmp[0] == 0:
241 | tmp[0] = EPS
242 | if filter_type == 'binary':
243 | door = tmp[int(len(self.feature) * threshold)] # cutoff: roughly the top `threshold` fraction maps to 1
244 | if door == tmp[0]:
245 | door -= EPS
246 | elif door == tmp[-1]:
247 | door += EPS
248 | for uid in self.feature:
249 | self.feature[uid][T][i] = 1 if self.feature[uid][T][i] > door else 0
250 | elif filter_type == 'real':
251 | for uid in self.feature:
252 | self.feature[uid][T][i] = self.feature[uid][T][i] / float(tmp[0])
253 | 
254 | def getDemographics(self):
255 | # binary features:
256 | # male, female, el, jhs, hs, c, b, m, p, [0,18), [18,23), [23,28), [28,36), [36,51), [> 51]
257 | with open(DEMOGRAPHICS_DIR) as f:
258 | demos = json.load(f)
259 | self.n_in += 15
260 | for uid in self.feature:
261 | tmp = []
262 | demo = demos[uid]
263 | for task in ['m', 'f']:
264 | tmp.append(1 if demo['gender'] == task else 0)
265 | for task in ['el', 'jhs', 'hs', 'c', 'b', 'm', 'p']:
266 | tmp.append(1 if demo['education'] == task else 0)
267 | if demo['age'] is not None:
268 | age = 2014 - demo['age'] # demo['age'] stores a year of birth
269 | task = [0, 18, 23, 28, 36, 51, 1000]
270 | for i in xrange(len(task)-1):
271 | tmp.append(1 if age >= task[i] and age < task[i+1] else 0)
272 | else:
273 | tmp += [0.] * 6
274 | for T in xrange(len(self.ddls)+1):
275 | self.X[uid][T] += tmp
276 | def getForumData(self):
277 | # post, reply, replied-to, length, upvoted, cert-friend
278 | self.expand_feature(6)
279 | with open(FORUM_DIR) as f:
280 | forum = json.load(f)
281 | for oid, item in forum.iteritems():
282 | if item['course'] != self.course:
283 | continue
284 | single_date = util.parseDate(item['date'])
285 | uid = item['user']
286 | if uid in self.feature and single_date >= self.start and single_date < self.end:
287 | if item['father'] is None:
288 | self.feature[uid][single_date][0] += 1
289 | else:
290 | self.feature[uid][single_date][1] += 1
291 | fid = forum[item['father']]['user']
292 | if fid in self.feature:
293 | self.feature[fid][single_date][2] += 1
294 | T = self.getTimeStamp(single_date)
295 | if T > 0 and self.score[fid][T-1] > .5:
296 | self.feature[uid][single_date][5] += 1
297 | if T > 0 and self.score[uid][T-1] > .5:
298 | self.feature[fid][single_date][5] += 1
299 | self.feature[uid][single_date][3] += item['length']
300 | self.feature[uid][single_date][4] += item['vote_up']
301 | def getSequentialRelease(self):
302 | with open(MONGO_DIR) as f:
303 | mongo = json.load(f)
304 | self.expand_X(1)
305 | for item in mongo:
306 | try:
307 | course, catagory = util.parseLog(item)
308 | except TypeError: # parseLog returns None for keys that do not match the i4x pattern
309 | continue
310 | if course == self.course:
311 | if mongo[item]['start'] is not None and item.find('sequential') != -1:
312 | print item, mongo[item]['start']
313 | date_obj = util.parseDate(mongo[item]['start'])
314 | T = self.getTimeStamp(date_obj)
315 | for uid in self.X:
316 | self.X[uid][T][0] += 1
317 | 
318 | def generate_Y(self):
319 | raise NotImplementedError
320 | 
321 | def regenerate(self):
322 | for uid in self.feature:
323 | for T in xrange(len(self.ddls) + 1):
324 | self.X[uid][T] += self.feature[uid][T][:-1]
325 | self.feature[uid][T] = self.feature[uid][T][-1:]
326 | self.n_in = self.n_in + self.feature_num - 1
327 | self.feature_num = 1
328 | 
329 | def generate_X(self):
330 | self.n_in = 0
331 | self.X = {}
332 | for uid in self.feature:
333 | self.X[uid] = [[] for i in xrange(len(self.ddls)+1)]
334 | # Demographics Feature
335 | self.getDemographics()
336 | #self.getSequentialRelease()
337 | # Course Release Feature
338 | def base_line(self): # baseline: use the grade at time stamp 4 to predict every later time stamp
339 | for i in xrange(5, len(self.ddls) + 1):
340 | median_tp1 = np.percentile(self.dataset[4,:,-1], 70.3)
341 | median_t = np.percentile(self.dataset[i,:,-1], 70.3)
342 | if median_tp1 == 1.:
343 | median_tp1 -= EPS
344 | if median_t == 1.:
345 | median_t -= EPS
346 | print roc_auc_score(self.dataset[i,:,-1] > median_t, self.dataset[4,:,-1])
347 | print precision_recall_fscore_support(self.dataset[4,:,-1] > median_tp1, self.dataset[i,:,-1] > median_t, average='micro')
348 | def dumpScoreDotDat(self, fileName):
349 | with open(fileName, 'wb') as f:
350 | title = '\t'.join(['userid'] + ['ps_%d' % i for i in xrange(len(self.score.values()[0]))])
351 | print >> f, title
352 | for k, v in self.score.iteritems():
353 | content = [k] + [str(item) for item in v]
354 | print >> f, '\t'.join(content)
355 | def __get_score__(self, scoreColumn, fname):
356 | book = open_workbook(RAW_GRADE_DIR + fname)
357 | sheet = book.sheet_by_index(0)
358 | scores = [sheet.col_values(util.getExcelColumnId(columnStr))
359 | for columnStr in scoreColumn]
360 | self.score = {}
361 | users = sheet.col_values(0)
362 | for i in xrange(1, len(users)):
363 | user = str(int(users[i]))
364 | if user not in self.feature:
365 | logger.info('excel parsing stopped at unknown user %s' % user)
366 | break
367 | self.score[user] = []
368 | for j in xrange(len(scoreColumn)):
369 | this_score = float(scores[j][i])
370 | #last_score = 0 if j == 0 or j == len(scoreColumn) - 1 else float(self.score[user][-1])
371 | last_score = 0
372 | self.score[user].append(this_score + last_score)
373 | 
374 | class Circuit(mkdata):
375 | def __init__(self):
376 | self.course = "TsinghuaX/20220332_2X/_"
377 | mkdata.__init__(self)
378 | 
379 | def getScore(self):
380 | # Columns F to M, plus BJ
381 | scoreColumn = ['F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'BJ']
382 | fname = 'grades_TsinghuaX-20220332_2X-_.xlsx'
383 | self.__get_score__(scoreColumn, fname)
384 | 
385 | def generate_Y(self):
386 | self.getDDL()
387 | self.getScore()
388 | self.getForumData()
389 | self.getLearningData()
390 | self.getBehaviorData()
391 | self.getStageFeature()
392 | self.filte(filter_type='real', threshold=41. / 411)
393 | 
394 | class DataStructure(mkdata):
395 | def __init__(self):
396 | self.course = "TsinghuaX/30240184_1X/_"
397 | mkdata.__init__(self)
398 | def getUser(self): # unlike mkdata.getUser, also keep users whose grade is zero
399 | with open(GRADE_DIR) as f:
400 | grades = json.load(f)[0]
401 | for uid in grades:
402 | if self.course in grades[uid]:
403 | self.feature[uid] = {}
404 | def getScore(self):
405 | scoreColumn = ['F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O']
406 | fname = 'grades_TsinghuaX-30240184_1X-_.xlsx'
407 | self.__get_score__(scoreColumn, fname)
408 | 
409 | 
410 | class Combin(mkdata):
411 | def __init__(self):
412 | self.course = "TsinghuaX/60240013X/_"
413 | mkdata.__init__(self)
414 | def getUser(self): # unlike mkdata.getUser, also keep users whose grade is zero
415 | with open(GRADE_DIR) as f:
416 | grades = json.load(f)[0]
417 | for uid in grades:
418 | if self.course in grades[uid]:
419 | self.feature[uid] = {}
420 | def getScore(self):
421 | scoreColumn = [chr(ord('D') + i) for i in xrange(ord('Z')-ord('D')+1)] + ['A' + chr(ord('A') + i) for i in xrange(ord('J')-ord('A')+1)] # columns D..Z, then AA..AJ
422 | print scoreColumn
423 | fname = 'grades_TsinghuaX-60240013X-_.xlsx'
424 | self.__get_score__(scoreColumn, fname)
425 | 
426 | class Finance2014(mkdata):
427 | def __init__(self):
428 | self.course = "TsinghuaX/80512073_2014_1X/_2014_"
429 | mkdata.__init__(self)
430 | 
431 | def getScore(self):
432 | # Columns D to M, plus O
433 | scoreColumn = ['D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'O']
434 | fname = 'grades_TsinghuaX-80512073_2014_1X-_.xlsx'
435 | self.__get_score__(scoreColumn, fname)
436 | 
437 | def generate_Y(self):
438 | self.getDDL()
439 | self.getScore()
440 | self.getForumData()
441 | self.getLearningData()
442 | self.getBehaviorData()
443 | self.getStageFeature()
444 | self.filte(filter_type='real', threshold=0.296)
445 | 
446 | 
447 | if __name__ == "__main__":
448 | logging.basicConfig(level=logging.INFO)
449 | '''
450 | fin2 = Finance2014()
451 | fin2.generate_Y()
452 | fin2.generate_X()
453 | fin2.regenerate()
454 | fin2.save_dataset('data/fin2.pkl')
455 | fin2.base_line()
456 | '''
457 | # combin = Combin()
458 | # combin.generate_Y()
459 | # combin.save('combin.pkl')
460 | '''
461 | circuit = Circuit()
462 | circuit.generate_Y()
463 | circuit.generate_X()
464 | circuit.regenerate()
465 | circuit.save_dataset('data/circuit.pkl')
466 | '''
467 | '''
468 | dsa = DataStructure()
469 | dsa.getScore()
470 | dsa.dumpScoreDotDat('data_structure.dat')
471 | '''
472 | com = Combin()
473 | com.getScore()
474 | com.dumpScoreDotDat('combinatorics.dat')
475 | 
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | # File Name: util.py
4 | # Author: Jiezhong Qiu
5 | # Create Time: 2015/02/09 15:11
6 | # TODO:
7 | 
8 | from datetime import datetime
9 | from datetime import timedelta
10 | import re
11 | 
12 | pattern = re.compile(r"i4x://(?P<org>[^/]*)/(?P<course>[^/]*)/(?P<catagory>[^/]*)/(?P<oid>\w{32})") # group names are read via groupdict() in parseLog; 'catagory' spelling is what the code expects, and 'oid' is an assumed name for the final, unused group
13 | 
14 | def roundTime(dt=None, roundTo=60):
15 | # http://stackoverflow.com/questions/3463930/how-to-round-the-minute-of-a-datetime-object-python/10854034#10854034
16 | """
17 | Round a datetime object to any time lapse in seconds
18 | dt : datetime.datetime object, default now.
19 | roundTo : Closest number of seconds to round to, default 1 minute.
20 | Author: Thierry Husson 2012 - Use it as you want but don't blame me.
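Example (illustrative), rounding 10:47 to the nearest hour:
>>> roundTime(datetime(2015, 3, 2, 10, 47), roundTo=60 * 60)
datetime.datetime(2015, 3, 2, 11, 0)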
21 | """ 22 | if dt == None : dt = datetime.now() 23 | seconds = (dt - dt.min).seconds 24 | # // is a floor division, not a comment on following line: 25 | rounding = (seconds+roundTo/2) // roundTo * roundTo 26 | return dt + timedelta(0,rounding-seconds,-dt.microsecond) 27 | 28 | def parseDate(dateStr): 29 | #return dateutil.parser.parse(dateStr).date() 30 | return datetime.strptime(dateStr.split("T")[0], "%Y-%m-%d").date() 31 | 32 | def daterange(start_date, end_date): 33 | for n in range(int ((end_date - start_date).days)): 34 | yield start_date + timedelta(n) 35 | 36 | def parseLog(logStr): 37 | for m in pattern.finditer(logStr): 38 | content = m.groupdict() 39 | course = "%s/%s/_" % (content["org"], content["course"]) 40 | if course == 'TsinghuaX/80512073_2014_1X/_': 41 | course += '2014_' 42 | return course, content['catagory'] 43 | 44 | def getExcelColumnId(columnStr): 45 | tmp, column = columnStr, 0 46 | for ch in tmp: 47 | column = column * 26 + ord(ch) - ord('A') 48 | column += sum([26**i for i in xrange(len(tmp))]) 49 | return column - 1 50 | if __name__ == "__main__": 51 | pass 52 | 53 | 54 | --------------------------------------------------------------------------------