├── CNNModel.py ├── GRUModel.py ├── README.md ├── main.py ├── model.jpg └── utils.py /CNNModel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from utils import NormalInit, OrthogonalInit, add_to_params 3 | import theano 4 | import numpy as np 5 | import theano.tensor as T 6 | 7 | class SentenceEncoder_CNN(): #用CNN学习句子向量表示 8 | def init_params(self, word_embedding_param): 9 | # Initialzie W_emb to given word embeddings 10 | assert(word_embedding_param != None) 11 | self.W_emb = word_embedding_param 12 | 13 | """ sent weights """ 14 | self.Filter1 = add_to_params(self.params, theano.shared(value=NormalInit(self.rng, self.rankdim, self.qdim_encoder), name='Filter1'+self.name)) 15 | self.Filter2 = add_to_params(self.params, theano.shared(value=NormalInit(self.rng, 2*self.rankdim, self.qdim_encoder), name='Filter2'+self.name)) 16 | self.Filter3 = add_to_params(self.params, theano.shared(value=NormalInit(self.rng, 3*self.rankdim, self.qdim_encoder), name='Filter3'+self.name)) 17 | 18 | self.b_1 = add_to_params(self.params, theano.shared(value=np.zeros((self.qdim_encoder,), dtype='float32'), name='cnn_b1'+self.name)) 19 | self.b_2 = add_to_params(self.params, theano.shared(value=np.zeros((self.qdim_encoder,), dtype='float32'), name='cnn_b2'+self.name)) 20 | self.b_3 = add_to_params(self.params, theano.shared(value=np.zeros((self.qdim_encoder,), dtype='float32'), name='cnn_b3'+self.name)) 21 | 22 | # This function takes as input word indices and extracts their corresponding word embeddings 23 | def approx_embedder(self, x): 24 | return self.W_emb[x] 25 | 26 | def ConvLayer1(self, q1): 27 | output = T.dot(q1, self.Filter1) + self.b_1 28 | return output 29 | 30 | def ConvLayer2(self, q1, q2): 31 | output = T.dot(T.concatenate([q1, q2], axis=1), self.Filter2) + self.b_2 32 | return output 33 | 34 | def ConvLayer3(self, q1, q2, q3): 35 | output = T.dot(T.concatenate([q1, q2, q3], axis=1), self.Filter3) + self.b_3 36 | 
return output 37 | 38 | def Convolution(self, x, mask): 39 | xe = self.approx_embedder(x) 40 | _mask = self.tmp[mask] 41 | 42 | _res1, _ = theano.scan(self.ConvLayer1, sequences=[xe]) 43 | _res2, _ = theano.scan(self.ConvLayer2, sequences=[xe[:-1], xe[1:]]) 44 | _res3, _ = theano.scan(self.ConvLayer3, sequences=[xe[:-2],xe[1:-1],xe[2:]]) 45 | 46 | hidden1 = T.tanh(T.max(_res1*_mask, axis=0)).dimshuffle('x',0,1) 47 | hidden2 = T.tanh(T.max(_res2*_mask[:-1], axis=0)).dimshuffle('x',0,1) 48 | hidden3 = T.tanh(T.max(_res3*_mask[:-2], axis=0)).dimshuffle('x',0,1) 49 | 50 | return T.mean(T.concatenate([hidden1, hidden2, hidden3], axis=0), axis=0) 51 | #return hidden3 52 | #return (hidden1 + hidden2 + hidden3)/3.0 53 | #return x[:5] 54 | #return (hidden1 + hidden2)/2.0 55 | 56 | def build_encoder(self, x, mask): #x是一个matrix 57 | res = self.Convolution(x, mask) 58 | 59 | return res 60 | 61 | def __init__(self, word_embedding_param, name, config): 62 | self.name = name 63 | self.rankdim = config.w_dim 64 | self.qdim_encoder = config.h_dim 65 | self.params = [] 66 | self.rng = np.random.RandomState(23333) 67 | self.init_params(word_embedding_param) 68 | a = np.zeros((2, self.qdim_encoder)) 69 | a[1] = 1 70 | self.tmp = theano.shared(value=a) -------------------------------------------------------------------------------- /GRUModel.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from utils import NormalInit, OrthogonalInit, add_to_params 3 | import theano 4 | import numpy as np 5 | import theano.tensor as T 6 | 7 | class SentenceEncoder(): 8 | def init_params(self, word_embedding_param): 9 | # Initialzie W_emb to given word embeddings 10 | assert(word_embedding_param != None) 11 | self.W_emb = word_embedding_param 12 | 13 | """ sent weights """ 14 | self.W_in = add_to_params(self.params, theano.shared(value=NormalInit(self.rng, self.rankdim, self.qdim_encoder), name='W_in'+self.name)) 15 | self.W_hh = 
add_to_params(self.params, theano.shared(value=OrthogonalInit(self.rng, self.qdim_encoder, self.qdim_encoder), name='W_hh'+self.name)) 16 | self.b_hh = add_to_params(self.params, theano.shared(value=np.zeros((self.qdim_encoder,), dtype='float32'), name='b_hh'+self.name)) 17 | 18 | self.W_in_r = add_to_params(self.params, theano.shared(value=NormalInit(self.rng, self.rankdim, self.qdim_encoder), name='W_in_r'+self.name)) 19 | self.W_in_z = add_to_params(self.params, theano.shared(value=NormalInit(self.rng, self.rankdim, self.qdim_encoder), name='W_in_z'+self.name)) 20 | self.W_hh_r = add_to_params(self.params, theano.shared(value=OrthogonalInit(self.rng, self.qdim_encoder, self.qdim_encoder), name='W_hh_r'+self.name)) 21 | self.W_hh_z = add_to_params(self.params, theano.shared(value=OrthogonalInit(self.rng, self.qdim_encoder, self.qdim_encoder), name='W_hh_z'+self.name)) 22 | self.b_z = add_to_params(self.params, theano.shared(value=np.zeros((self.qdim_encoder,), dtype='float32'), name='b_z'+self.name)) 23 | self.b_r = add_to_params(self.params, theano.shared(value=np.zeros((self.qdim_encoder,), dtype='float32'), name='b_r'+self.name)) 24 | 25 | # This function takes as input word indices and extracts their corresponding word embeddings 26 | 27 | def approx_embedder(self, x): 28 | return self.W_emb[x] 29 | 30 | def GRU_sent_step(self, x_t, m_t, ph_t): 31 | hr_tm1 = ph_t 32 | 33 | r_t = T.nnet.sigmoid(T.dot(x_t, self.W_in_r) + T.dot(hr_tm1, self.W_hh_r) + self.b_r) 34 | z_t = T.nnet.sigmoid(T.dot(x_t, self.W_in_z) + T.dot(hr_tm1, self.W_hh_z) + self.b_z) 35 | h_tilde = T.tanh(T.dot(x_t, self.W_in) + T.dot(r_t * hr_tm1, self.W_hh) + self.b_hh) 36 | h_t = z_t * hr_tm1 + (np.float32(1.0) - z_t) * h_tilde 37 | 38 | m_t = m_t.dimshuffle(0, 'x') #make a column out of a 1d vector (N to Nx1) 39 | h_t = (m_t) * h_t + (1 - m_t) * ph_t 40 | 41 | # return both reset state and non-reset state 42 | return h_t, r_t, z_t, h_tilde 43 | 44 | def build_encoder(self, x, mask, 
prev_state): #x是一个matrix 45 | xe = self.approx_embedder(x) 46 | 47 | hs_0 = prev_state 48 | _res, _ = theano.scan(self.GRU_sent_step, 49 | sequences=[xe, mask],\ 50 | outputs_info=[hs_0, None, None, None])#每次循环输入GRU_sent_step是一个矩阵,shape为N*w_dim(N为x的列维度) 51 | 52 | # Get the hidden state sequence 53 | h = _res[0] #返回f_enc函数每次调用的第一个输出值,在RGU中h[i]会作为f_enc第i+1次迭代的输入,得到h[i+1] 54 | return h, mask 55 | 56 | def __init__(self, word_embedding_param, name, config): 57 | self.name = name 58 | self.rankdim = config.w_dim 59 | self.qdim_encoder = config.h_dim 60 | self.params = [] 61 | self.rng = np.random.RandomState(23333) 62 | self.init_params(word_embedding_param) 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GRU-CNN 2 | 本模型利用GRU或CNN对存在某种关系的两个句子进行建模。模型大致结构为利用GRU([Cho et al., 2014b](http://arxiv.org/abs/1406.1078), RNN中的一种)或CNN学习句子表示,两个句子不共享一套参数。然后再用一层神经网络学习两个句子的联合表示,最后利用一个sigmoid层对两个句子进行打分,输入关系强弱的值,训练方法采用正负例训练。模型结构如下图所示 3 | 4 | ![model](model.jpg?raw=true "model") 5 | 6 | 该模型可用于:连贯性任务(相当于窗口取2,只看前后两句话);答案选取任务(针对QA数据集,问-答对正好是具有关联的两个句子);以及对话质量评估(针对单轮对话,有点类似于一问一答那种形式(也是两个句子),模型评价对话的质量,即评价在聊天机器人系统中生成的对话质量如何)。 7 | 8 | ## 输入文件格式 9 | 由于模型目前仅对两个句子进行建模,所以输入文件为两个文件,一个文件存储第一句,另一个文件存储下句(对应存储,对于中文需要分词,按空格隔开)。注意:除了修改main.py中的file1(第一句)和file2(第二句)以外,还需要修改ReadDate函数中的数值来确定训练数据和测试数据的规模。 10 | 11 | ## 模型参数 12 | main.py文件里面有以下参数可以设定: 13 | - margin:正负例得分间隔 14 | - iter:总共迭代次数 15 | - learning_rate:学习率 16 | - test_freq:每迭代多少次进行一次测试 17 | - h_dim:隐层维度,即句子向量的维度 18 | - vocab_size:词表大小,选取最高频的N个词 19 | - w_dim:词向量维度 20 | - neg_sample:负例采样的数目 21 | - up_dim:句子联合表示的向量维度 22 | - CNN_Flag:是否使用CNN模型,为False时不使用(使用GRU模型) 23 | - save_file:保存测试结果的文件名 24 | 25 | ## 运行说明 26 | 在命令行中输入: 27 | 28 | python main.py 29 | 30 | ## 实验结果 31 | 实验所用的数据为100W个对话对,有点类似于QA语料。实验设置为90W用于训练,10W用于测试,测试数据中5W为正例,5W为负例,使用GRU模型。实验结果如下: 32 | 33 | **Iter 0:** 34 | 35 | >cost: 3.025 36 | >cost time: 195146.85 s 
37 | >Test... 38 | >Accuracy: 0.75045 39 | >Test Done 40 | 41 | **Iter 1:** 42 | 43 | >cost: 2.428 44 | >cost time: 190828.23 s 45 | >Test... 46 | >Accuracy: **0.79202** 47 | >Test Done 48 | 49 | **Iter 2:** 50 | 51 | >cost: 2.255 52 | >cost time: 187904.05 s 53 | >Test... 54 | >Accuracy: 0.76932 55 | >Test Done 56 | 57 | **Iter 3:** 58 | 59 | >cost: 2.169 60 | >cost time: 155178.83 s 61 | >Test... 62 | >Accuracy: 0.78361 63 | >Test Done 64 | 65 | 使用CNN模型,实验结果如下: 66 | 67 | **Iter 0:** 68 | 69 | >cost: 0.998 70 | >cost time: 137159.67 s 71 | >Test... 72 | >Accuracy: 0.68731 73 | >Test Done 74 | 75 | **Iter 1:** 76 | 77 | >cost: 0.753 78 | >cost time: 72665.73 s 79 | >Test... 80 | >Accuracy: 0.7221 81 | >Test Done 82 | 83 | **Iter 2:** 84 | 85 | >cost: 0.737 86 | >cost time: 68464.48 s 87 | >Test... 88 | >Accuracy: **0.75117** 89 | >Test Done 90 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import theano, random 3 | import numpy as np 4 | import cPickle,time 5 | import theano.tensor as T 6 | from collections import OrderedDict, Counter 7 | import logging 8 | from utils import compute_updates, NormalInit, add_to_params 9 | from GRUModel import SentenceEncoder 10 | from CNNModel import SentenceEncoder_CNN 11 | 12 | logging.basicConfig(level=logging.DEBUG) 13 | 14 | class Configuration(object): 15 | margin = 0.6 #正负例得分间隔 16 | iter = 6 #迭代次数 17 | learning_rate = 0.0003 18 | test_freq = 1 #每迭代多少次进行一次测试 19 | h_dim = 300 #句子向量维度 20 | vocab_size = 60000 21 | w_dim = 100 #词向量维度 22 | neg_sample = 10 23 | up_dim = 500 #句子联合表示向量维度 24 | CNN_Flag = True #是否使用CNN,为False时使用GRU 25 | save_file = 'test_res' #保存测试结果的文件名 26 | 27 | config = Configuration() 28 | 29 | def ReadDate(file1, file2): #选90W作为训练数据,10W作为测试数据 30 | Que = [] 31 | Ans = [] 32 | allword = [] 33 | with open(file1,'r') as fq, open(file2,'r') as fa: 34 | for line in fq: 35 | 
tmp = line.split() 36 | allword += tmp 37 | if config.CNN_Flag: 38 | while len(tmp) < 3 and len(tmp) > 0: #当使用CNN模型时,需要做padding 39 | tmp.append('OOV') 40 | Que.append(tmp) 41 | else: 42 | Que.append(tmp) 43 | for line in fa: 44 | tmp = line.split() 45 | allword += tmp 46 | if config.CNN_Flag: 47 | while len(tmp) < 3 and len(tmp) > 0: 48 | tmp.append('OOV') 49 | Ans.append(tmp) 50 | else: 51 | Ans.append(tmp) 52 | 53 | assert(len(Que)==len(Ans)) 54 | traindata = [] 55 | testdata = [] 56 | c = Counter(allword) 57 | vocab = [i[0] for i in c.most_common(config.vocab_size-1)] 58 | for q,a in zip(Que[:900000],Ans[:900000]): 59 | traindata.append((q,a)) 60 | for q,a in zip(Que[900000:950000],Ans[900000:950000]): 61 | testdata.append((q,a,1)) 62 | for q in Que[950000:]: 63 | a = Ans[random.randint(0,200000)] 64 | testdata.append((q,a,0)) 65 | 66 | return traindata, testdata, vocab 67 | 68 | 69 | print 'Loading the data...' 70 | traindata, testdata, vocab = ReadDate('100w.q', '100w.a')#'100w.q'全是question,'100w.a'是对应的answers,请替换成自己的文件。 71 | print len(traindata), len(testdata) 72 | print ' Done' 73 | str_to_id = dict([(j,i) for i,j in enumerate(vocab)]+[('OOV',config.vocab_size-1)]) 74 | assert(len(str_to_id)==config.vocab_size) 75 | 76 | 77 | print 'Build model...' 78 | rng = np.random.RandomState(23455) 79 | params = [] 80 | W_emb = add_to_params(params, theano.shared(value=NormalInit(rng, config.vocab_size, config.w_dim), name='W_emb')) 81 | 82 | T_que = T.imatrix('question') 83 | T_ans = T.imatrix('answer') 84 | T_neg = T.imatrix('neg_sample') 85 | M_que = T.imatrix('question') 86 | M_ans = T.imatrix('answer') 87 | M_neg = T.imatrix('neg_sample') 88 | 89 | if config.CNN_Flag == False: 90 | print 'use GRU model...' 
91 | Question_Encoder = SentenceEncoder(W_emb, 'Question', config) 92 | Answer_Encoder = SentenceEncoder(W_emb, 'Answer', config) 93 | 94 | que_ph = theano.shared(value=np.zeros((1, config.h_dim), dtype='float32'), name='que_ph') 95 | ans_ph = theano.shared(value=np.zeros((1, config.h_dim), dtype='float32'), name='ans_ph') 96 | neg_ph = theano.shared(value=np.zeros((config.neg_sample, config.h_dim), dtype='float32'), name='neg_ph') 97 | 98 | que_h, _ = Question_Encoder.build_encoder(T_que, T.eq(M_que,1), que_ph) 99 | ans_h, _ = Answer_Encoder.build_encoder(T_ans, T.eq(M_ans,1), ans_ph) 100 | neg_h, _test_mask = Answer_Encoder.build_encoder(T_neg, T.eq(M_neg,1), neg_ph) 101 | 102 | que_emb = que_h[-1] 103 | ans_emb = ans_h[-1] 104 | neg_emb = neg_h[-1] 105 | 106 | else: 107 | print 'use CNN model...' 108 | Question_Encoder = SentenceEncoder_CNN(W_emb, 'Question', config) 109 | Answer_Encoder = SentenceEncoder_CNN(W_emb, 'Answer', config) 110 | 111 | que_emb = Question_Encoder.build_encoder(T_que, T.eq(M_que,1)) 112 | ans_emb = Answer_Encoder.build_encoder(T_ans, T.eq(M_ans,1)) 113 | neg_emb = Answer_Encoder.build_encoder(T_neg, T.eq(M_neg,1)) 114 | 115 | W_up = add_to_params(params, theano.shared(value=NormalInit(rng, 2*config.h_dim, config.up_dim), name='W_up')) 116 | W_up_b = add_to_params(params, theano.shared(value=np.zeros((config.up_dim,), dtype='float32'), name='W_up_b')) 117 | Sen_U = add_to_params(params, theano.shared(value=NormalInit(rng, config.up_dim, 1), name='Sen_U')) 118 | Sen_b = add_to_params(params, theano.shared(value=np.zeros((1,), dtype='float32'), name='Sen_b')) 119 | 120 | join_emb = T.concatenate([que_emb, ans_emb], axis=1) 121 | join_hidden = T.tanh(T.dot(T.concatenate([que_emb, ans_emb], axis=1), W_up)+W_up_b) 122 | #join_hidden = T.tanh(T.dot(W_up, join_emb.T)+W_up_b) 123 | f_x = T.nnet.sigmoid(T.dot(join_hidden, Sen_U)+Sen_b) 124 | 125 | neg_join_hidden = T.tanh(T.dot(T.concatenate([T.repeat(que_emb, config.neg_sample, axis=0), neg_emb], 
axis=1), W_up)+W_up_b) 126 | f_neg = T.nnet.sigmoid(T.dot(neg_join_hidden, Sen_U)+Sen_b) 127 | 128 | cost = T.maximum(0, config.margin - f_x.sum() + f_neg) 129 | training_cost = cost.sum() 130 | 131 | updates = compute_updates(training_cost, params+Question_Encoder.params+Answer_Encoder.params, config) 132 | 133 | train_model = theano.function([T_que, T_ans, T_neg, M_que, M_ans, M_neg],[training_cost],updates=updates, on_unused_input='ignore', name="train_fn") 134 | #train_model = theano.function([T_que, T_ans, T_neg, M_que, M_ans, M_neg],[que_emb, ans_emb, neg_emb], on_unused_input='ignore', name="train_fn") 135 | test_model = theano.function([T_que, T_ans, M_que, M_ans], [f_x], on_unused_input='ignore', name="train_fn") 136 | print 'function build finish!' 137 | 138 | 139 | print 'Training...' 140 | for step in range(1, config.iter+1): 141 | print 'iter: ',step 142 | cost = 0 143 | length = 0 144 | stime = time.time() 145 | for idx in range(len(traindata)): 146 | if idx % 200000 == 0: 147 | print 'training on ', idx 148 | data = traindata[idx] 149 | que = data[0] 150 | ans = data[1] 151 | #print ' '.join(que) 152 | #print ' '.join(ans) 153 | if not que or not ans: 154 | continue 155 | #_range = range(len(traindata)) 156 | #_range.pop(idx) 157 | #nsample = random.sample(_range, neg_sample) 158 | nsample = [] 159 | n_traindata = len(traindata) 160 | neg_matrix = [] 161 | max_lenght = 0 162 | while len(nsample) < config.neg_sample: 163 | _rand = random.randint(10, n_traindata-10) 164 | if _rand != idx and _rand not in nsample: 165 | tmp = [] 166 | if not traindata[_rand][1]: 167 | continue 168 | for wd in traindata[_rand][1]: 169 | if wd in str_to_id: 170 | tmp.append(str_to_id[wd]) 171 | else: 172 | tmp.append(str_to_id['OOV']) 173 | neg_matrix.append(tmp) 174 | max_lenght = max(max_lenght, len(tmp)) 175 | nsample.append(_rand) 176 | #print len(nsample) 177 | 178 | for i_ in nsample: 179 | tmp = [] 180 | for wd in traindata[i_][1]: 181 | if wd in str_to_id: 182 | 
tmp.append(str_to_id[wd]) 183 | else: 184 | tmp.append(str_to_id['OOV']) 185 | neg_matrix.append(tmp) 186 | max_lenght = max(max_lenght, len(tmp)) 187 | 188 | neg_mask = [] 189 | new_neg_matrix = [] 190 | for i in range(max_lenght): 191 | tmp = [] 192 | tmp_mask = [] 193 | for j in range(config.neg_sample): 194 | if i < len(neg_matrix[j]): 195 | tmp.append(neg_matrix[j][i]) 196 | tmp_mask.append(1) 197 | else: 198 | tmp.append(0) 199 | tmp_mask.append(0) 200 | new_neg_matrix.append(tmp) 201 | neg_mask.append(tmp_mask) 202 | 203 | #print neg_matrix 204 | #print new_neg_matrix 205 | #print neg_mask 206 | 207 | que_array = [] 208 | que_mask = [] 209 | for wd in que: 210 | if wd in str_to_id: 211 | que_array.append([str_to_id[wd]]) 212 | else: 213 | que_array.append([str_to_id['OOV']]) 214 | que_mask.append([1]) 215 | ans_array = [] 216 | ans_mask = [] 217 | for wd in ans: 218 | if wd in str_to_id: 219 | ans_array.append([str_to_id[wd]]) 220 | else: 221 | ans_array.append([str_to_id['OOV']]) 222 | ans_mask.append([1]) 223 | 224 | que_matrix = np.array(que_array, dtype=np.int32) 225 | ans_matrix = np.array(ans_array, dtype=np.int32) 226 | neg_matrix = np.array(new_neg_matrix, dtype=np.int32) 227 | 228 | que_mask = np.array(que_mask, dtype=np.int32) 229 | ans_mask = np.array(ans_mask, dtype=np.int32) 230 | neg_mask = np.array(neg_mask, dtype=np.int32) 231 | 232 | c = train_model(que_matrix, ans_matrix, neg_matrix, que_mask, ans_mask, neg_mask)[0] 233 | #print que_matrix.shape, ans_matrix.shape, neg_matrix.shape 234 | #a, b, c = train_model(que_matrix, ans_matrix, neg_matrix, que_mask, ans_mask, neg_mask) 235 | #print a.shape, b.shape, c.shape 236 | #print c 237 | 238 | if np.isinf(c) or np.isnan(c): 239 | continue 240 | cost += c 241 | length += 1 242 | #break 243 | #f = foo() 244 | etime = time.time() 245 | print 'Cost: ', cost/length 246 | print 'cost time: ', etime-stime,'s' 247 | 248 | if step%config.test_freq == 0: 249 | print 'Test...' 
250 | fw_valid = open(config.save_file+'_%d.txt'%step, 'w') 251 | test_length = 0 252 | test_right = 0 253 | for data in testdata: 254 | que = data[0] 255 | ans = data[1] 256 | label = data[2] 257 | 258 | if not que or not ans: 259 | continue 260 | 261 | test_length += 1 262 | que_array = [] 263 | que_mask = [] 264 | for wd in que: 265 | if wd in str_to_id: 266 | que_array.append([str_to_id[wd]]) 267 | else: 268 | que_array.append([str_to_id['OOV']]) 269 | que_mask.append([1]) 270 | ans_array = [] 271 | ans_mask = [] 272 | for wd in ans: 273 | if wd in str_to_id: 274 | ans_array.append([str_to_id[wd]]) 275 | else: 276 | ans_array.append([str_to_id['OOV']]) 277 | ans_mask.append([1]) 278 | 279 | que_matrix = np.array(que_array, dtype=np.int32) 280 | ans_matrix = np.array(ans_array, dtype=np.int32) 281 | 282 | que_mask = np.array(que_mask, dtype=np.int32) 283 | ans_mask = np.array(ans_mask, dtype=np.int32) 284 | 285 | prob = test_model(que_matrix, ans_matrix, que_mask, ans_mask)[0] 286 | prob = prob[0][0] 287 | 288 | if label == 1 and prob > 0.5: 289 | test_right += 1 290 | if label == 0 and prob < 0.5: 291 | test_right += 1 292 | fw_valid.write('Prob: ' + str(prob) + ' ' + str(label) + '\r\n') 293 | #break 294 | accuracy = 1.0 * test_right / test_length 295 | fw_valid.write('\r\n'+'Accuracy: ' + str(accuracy)) 296 | fw_valid.close() 297 | print 'Accuracy: ', accuracy 298 | #vals = dict([(x.name, x.get_value()) for x in [Wd_out, bd_out]]) 299 | #np.savez('models/model_%d.npz'%step, **vals) 300 | print 'Test Done' 301 | -------------------------------------------------------------------------------- /model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hit-computer/GRU-or-CNN/b5c119c027aca51335475339a3fa163be163c654/model.jpg -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 
#coding:utf-8
import theano, random
import numpy as np
import theano.tensor as T
from collections import OrderedDict


def sharedX(value, name=None, borrow=False, dtype=None):
    """Wrap `value` in a Theano shared variable cast to `dtype`.

    `dtype` defaults to ``theano.config.floatX``.
    """
    if dtype is None:
        dtype = theano.config.floatX
    return theano.shared(theano._asarray(value, dtype=dtype),
                         name=name,
                         borrow=borrow)


def Adam(grads, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    """Build Adam updates for a ``{parameter: gradient}`` mapping.

    Note the parametrization: `b1` and `b2` weight the NEW gradient
    (``m_t = b1 * g + (1 - b1) * m``), i.e. they are one minus the decay
    rates of the moment estimates.

    grads -- mapping from shared parameters to their gradient expressions.
    lr    -- base learning rate.
    e     -- constant added to the denominator.

    Returns a list of ``(shared_variable, new_value)`` pairs covering both
    moment accumulators of every parameter, the parameters themselves and the
    shared step counter.
    """
    updates = []
    i = sharedX(0.)  # step counter shared by all parameters
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in grads.items():
        # Moment accumulators start as zeros shaped like the parameter.
        m = sharedX(p.get_value() * 0.)
        v = sharedX(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates


def compute_updates(training_cost, params, config):
    """Return Adam updates for `params` with global-norm gradient clipping.

    training_cost -- scalar cost expression to differentiate.
    params        -- list of shared variables to train.
    config        -- object providing ``learning_rate``.

    Gradients are rescaled so that their global L2 norm does not exceed 1.
    If that norm is NaN or infinite, each gradient is replaced by
    ``0.1 * parameter``.
    """
    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clipping threshold for the global gradient norm.
    c = np.float32(1.)

    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    clip_grads = [
        (p, T.switch(notfinite, np.float32(.1) * p, g * normalization))
        for p, g in grads.items()
    ]
    grads = OrderedDict(clip_grads)

    # Adam is the update rule.
    return Adam(grads, config.learning_rate)


def NormalInit(rng, sizeX, sizeY, scale=0.01, sparsity=-1):
    """
    Normal Initialization

    Return a ``(sizeX, sizeY)`` floatX array. In every row, `sparsity`
    randomly chosen columns receive samples from N(0, scale); the remaining
    entries stay zero. A negative `sparsity` (the default) fills all columns.
    """
    sizeX = int(sizeX)
    sizeY = int(sizeY)

    if sparsity < 0:
        sparsity = sizeY

    sparsity = np.minimum(sizeY, sparsity)
    values = np.zeros((sizeX, sizeY), dtype=theano.config.floatX)
    # range instead of xrange so the module also imports on Python 3.
    for dx in range(sizeX):
        perm = rng.permutation(sizeY)
        new_vals = rng.normal(loc=0, scale=scale, size=(sparsity,))
        values[dx, perm[:sparsity]] = new_vals

    return values.astype(theano.config.floatX)


def OrthogonalInit(rng, sizeX, sizeY, sparsity=-1, scale=1):
    """
    Orthogonal Initialization

    Draw a random square matrix as in NormalInit and return the left singular
    vectors of its SVD multiplied by `scale`, as a floatX array.
    Requires ``sizeX == sizeY``.
    """
    sizeX = int(sizeX)
    sizeY = int(sizeY)

    assert sizeX == sizeY, 'for orthogonal init, sizeX == sizeY'

    if sparsity < 0:
        sparsity = sizeY
    else:
        sparsity = np.minimum(sizeY, sparsity)

    values = np.zeros((sizeX, sizeY), dtype=theano.config.floatX)
    for dx in range(sizeX):
        perm = rng.permutation(sizeY)
        new_vals = rng.normal(loc=0, scale=scale, size=(sparsity,))
        values[dx, perm[:sparsity]] = new_vals

    # Use SciPy for very large matrices.
    if sizeX*sizeY > 20000000:
        # Import the submodule explicitly: a bare ``import scipy`` does not
        # guarantee that ``scipy.linalg`` is loaded.
        import scipy.linalg
        u,s,v = scipy.linalg.svd(values)
    else:
        u,s,v = np.linalg.svd(values)
    values = u * scale
    return values.astype(theano.config.floatX)


def SoftMax(x):
    """Return the softmax of `x` along its last axis.

    The per-row maximum is subtracted before exponentiating.
    """
    x = T.exp(x - T.max(x, axis=x.ndim-1, keepdims=True))
    return x / T.sum(x, axis=x.ndim-1, keepdims=True)


def add_to_params(params, new_param):
    """Append `new_param` to the list `params` and return `new_param`."""
    params.append(new_param)
    return new_param