├── README.md
├── predict.py
└── train.py


/README.md:
--------------------------------------------------------------------------------
# CCL_CMRC2017

Reference model for the First "iFLYTEK Cup" Chinese Machine Reading Comprehension Evaluation (CMRC 2017).

http://kexue.fm/archives/4564/
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
#! -*- coding:utf-8 -*-

import pickle
import re

import numpy as np
import tensorflow as tf

# id2word/word2id map between words and integer ids; embedding_array holds one
# vector per word (word id i sits at row i-1, since id 0 is reserved for padding).
id2word, word2id, embedding_array = pickle.load(open('model.config', 'rb'))
word_size = embedding_array.shape[1]

# Prepend a trainable padding/unknown vector at index 0; the loaded embeddings
# themselves stay fixed as constants.
padding_vec = tf.Variable(tf.random_uniform([1, word_size], -0.05, 0.05))
embeddings = tf.constant(embedding_array, dtype=tf.float32)
embeddings = tf.concat([padding_vec, embeddings], 0)

# The contexts to the left and right of the blank, as padded batches of word ids.
L_context = tf.placeholder(tf.int32, shape=[None, None])
L_context_length = tf.placeholder(tf.int32, shape=[None])
R_context = tf.placeholder(tf.int32, shape=[None, None])
R_context_length = tf.placeholder(tf.int32, shape=[None])

L_context_vec = tf.nn.embedding_lookup(embeddings, L_context)
R_context_vec = tf.nn.embedding_lookup(embeddings, R_context)


def add_brnn(inputs, rnn_size, seq_lens, name):
    """One bidirectional LSTM layer, with weights shared across all inputs."""
    rnn_cell_fw = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    rnn_cell_bw = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    outputs = []
    with tf.variable_scope(name_or_scope=name) as vs:
        for inp, seq_len in zip(inputs, seq_lens):
            outputs.append(tf.nn.bidirectional_dynamic_rnn(
                rnn_cell_fw, rnn_cell_bw, inp,
                sequence_length=seq_len, dtype=tf.float32))
            vs.reuse_variables()  # reuse the same weights for the next input
    return [tf.concat(o[0], 2) for o in outputs], [o[1] for o in outputs]


# Two stacked BiLSTM layers over both contexts.
[L_outputs, R_outputs], [L_final_state, R_final_state] = add_brnn(
    [L_context_vec, R_context_vec], word_size,
    [L_context_length, R_context_length], name='LSTM_1')
[L_outputs, R_outputs], [L_final_state, R_final_state] = add_brnn(
    [L_outputs, R_outputs], word_size,
    [L_context_length, R_context_length], name='LSTM_2')

# Additive masks: padded positions get -1e12 so the softmax ignores them.
L_context_mask = (1 - tf.cast(tf.sequence_mask(L_context_length), tf.float32)) * (-1e12)
R_context_mask = (1 - tf.cast(tf.sequence_mask(R_context_length), tf.float32)) * (-1e12)
context_mask = tf.concat([L_context_mask, R_context_mask], 1)

# Attention: score every context position against the averaged final hidden
# states; the masked softmax gives a distribution over positions.
outputs = tf.concat([L_outputs, R_outputs], 1)
final_state = (tf.concat([L_final_state[0][1], L_final_state[1][1]], 1)
               + tf.concat([R_final_state[0][1], R_final_state[1][1]], 1)) / 2
attention = context_mask + tf.matmul(outputs, tf.expand_dims(final_state, 2))[:, :, 0]
sample_labels = tf.placeholder(tf.float32, shape=[None, None])
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    labels=sample_labels, logits=attention))
pred = tf.nn.softmax(attention)

train_step = tf.train.AdamOptimizer().minimize(loss)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

saver = tf.train.Saver()
saver.restore(sess, './tk/tk_highest.ckpt')


def split_data(text):
    """Split the word sequence into the parts left and right of the blank."""
    words = re.split('[ \n]+', text)
    idx = words.index('XXXXX')
    return words[:idx], words[idx + 1:]


def cumsum_proba(x, y):
    """Sum the position probabilities per word id; return the best word id."""
    tmp = {}
    for i, j in zip(x, y):
        if i in tmp:
            tmp[i] += j
        else:
            tmp[i] = j
    return tmp.keys()[np.argmax(tmp.values())]


def predict(text):
    # `text` is a single string of space-separated segmented words;
    # the position to fill in is marked by XXXXX.
    text = split_data(text)
    # OOV words fall back to the padding id 0 instead of raising a KeyError.
    text = ([word2id.get(i, 0) for i in text[0]] if text[0] else [0],
            [word2id.get(i, 0) for i in text[1]] if text[1] else [0])
    p = sess.run(pred, feed_dict={L_context: [text[0]],
                                  R_context: [text[1]],
                                  L_context_length: [len(text[0])],
                                  R_context_length: [len(text[1])]})
    return id2word.get(cumsum_proba(text[0] + text[1], p[0]), ' ')


if __name__ == '__main__':

    import codecs
    import os
    import sys

    valid_name = sys.argv[1]
    output_name = sys.argv[2]

    text = codecs.open(valid_name, encoding='utf-8').read()
    valid_x = re.split('', text)  # the split pattern was lost in this dump
    # ... (the code that builds `names` and `valid_result` is missing here) ...
    s = '\n'.join(names[i] + ' ||| ' + id2word.get(j, ' ')
                  for i, j in enumerate(valid_result))
    with codecs.open(output_name, 'w', encoding='utf-8') as f:
        f.write(s)
--------------------------------------------------------------------------------
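For reference, a minimal usage sketch of predict.py's entry point. The sentence below is a made-up, pre-segmented example; it assumes model.config and the ./tk/tk_highest.ckpt checkpoint are present:

    # -*- coding: utf-8 -*-
    # XXXXX marks the blank the model should fill.
    answer = predict(u'今天 天气 很 XXXXX ， 我们 去 公园 散步')
    print answer  # the single word predicted for the blank
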
/train.py:
--------------------------------------------------------------------------------
#! -*- coding:utf-8 -*-

import codecs
import re
import os
import numpy as np


def split_data(text):
    """Split the word sequence into the parts left and right of the blank."""
    words = re.split('[ \n]+', text)
    idx = words.index('XXXXX')
    return words[:idx], words[idx + 1:]


print u'Reading training corpus...'
train_x = codecs.open('../CMRC2017_train/train.doc_query', encoding='utf-8').read()
train_x = re.split('
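train.py is cut off above, before it shows how the corpus is parsed or how the model.config pickle that predict.py loads is produced. As a rough sketch of the structure that pickle must have (the stand-in vocabulary, embedding dimension, and random initialization below are assumptions, not the repository's actual code):

    import pickle
    import numpy as np

    # Stand-in vocabulary; the real one would be collected from the training
    # corpus, and the vectors could come from pretrained word embeddings.
    words = [u'今天', u'天气', u'公园']
    word_size = 128  # assumed embedding dimension

    # Word ids start at 1 because predict.py prepends a padding/unknown
    # vector at index 0 of the embedding matrix.
    id2word = dict((i + 1, w) for i, w in enumerate(words))
    word2id = dict((w, i) for i, w in id2word.items())
    embedding_array = np.random.uniform(-0.05, 0.05, (len(words), word_size))

    with open('model.config', 'wb') as f:
        pickle.dump((id2word, word2id, embedding_array), f)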