├── QA_CNN_pairwise.py ├── README.md ├── __pycache__ ├── config.cpython-37.pyc ├── evaluation.cpython-37.pyc └── helper.cpython-37.pyc ├── badcase ├── config.py ├── data └── nlpcc │ ├── dev.txt │ ├── test.txt │ └── train.txt ├── evaluation.py ├── helper.py ├── models ├── __pycache__ │ ├── basis_model.cpython-37.pyc │ ├── blocks.cpython-37.pyc │ └── cnn_model.cpython-37.pyc ├── basis_model.py ├── blocks.py └── cnn_model.py ├── propressing.py ├── run.py ├── test.py └── train.py /QA_CNN_pairwise.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | # model_type :apn or qacnn 6 | class QA_CNN_extend(object): 7 | def __init__(self,max_input_left,max_input_right,batch_size,vocab_size,embedding_size,filter_sizes,num_filters, 8 | dropout_keep_prob = 1,learning_rate = 0.001,embeddings = None,l2_reg_lambda = 0.0,overlap_needed = False,trainable = True,extend_feature_dim = 10,pooling = 'attentive',position_needed = True,conv = 'narrow'): 9 | 10 | self.dropout_keep_prob = dropout_keep_prob 11 | self.num_filters = num_filters 12 | self.embeddings = embeddings 13 | self.embedding_size = embedding_size 14 | self.batch_size = batch_size 15 | self.filter_sizes = filter_sizes 16 | self.l2_reg_lambda = l2_reg_lambda 17 | self.para = [] 18 | self.extend_feature_dim = extend_feature_dim 19 | self.max_input_left = max_input_left 20 | self.max_input_right = max_input_right 21 | self.overlap_needed = overlap_needed 22 | self.num_filters_total = self.num_filters * len(self.filter_sizes) 23 | self.trainable = trainable 24 | self.vocab_size = vocab_size 25 | self.pooling = pooling 26 | self.position_needed = position_needed 27 | self.conv = conv 28 | if self.overlap_needed: 29 | self.total_embedding_dim = embedding_size + extend_feature_dim 30 | else: 31 | self.total_embedding_dim = embedding_size 32 | #position embedding needed 33 | if self.position_needed: 34 | self.total_embedding_dim = self.total_embedding_dim + extend_feature_dim 35 | self.learning_rate = learning_rate 36 | def create_placeholder(self): 37 | print('Create placeholders') 38 | self.question = tf.placeholder(tf.int32,[None,self.max_input_left],name = 'input_question') 39 | self.answer = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'input_answer') 40 | self.answer_negative = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'input_right') 41 | self.q_pos_overlap = tf.placeholder(tf.int32,[None,self.max_input_left],name = 'q_pos_feature_embed') 42 | self.q_neg_overlap = tf.placeholder(tf.int32,[None,self.max_input_left],name = 'q_neg_feature_embed') 43 | self.a_pos_overlap = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'a_feature_embed') 44 | self.a_neg_overlap = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'a_neg_feature_embed') 45 | self.q_position = tf.placeholder(tf.int32,[None,self.max_input_left],name = 'q_position_embed') 46 | self.a_pos_position = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'a_position_embed') 47 | self.a_neg_position = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'a_neg_postion_embed') 48 | def create_position(self): 49 | print 'add conv position' 50 | self.q_conv_position = tf.Variable(tf.ones([self.max_input_left,1]),name = 'q_conv_position') 51 | self.a_conv_position = tf.Variable(tf.ones([self.max_input_right,1]),name = 'a_conv_position') 52 | def add_embeddings(self): 53 | print 'add embeddings' 54 | if self.embeddings is not None: 55 | print "load 
embedding" 56 | W = tf.Variable(np.array(self.embeddings),name = "W" ,dtype="float32",trainable = self.trainable) 57 | 58 | else: 59 | print "random embedding" 60 | W = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),name="W",trainable = self.trainable) 61 | self.embedding_W = W 62 | self.overlap_W = tf.Variable(tf.random_uniform([3, self.extend_feature_dim], -1.0, 1.0),name="W",trainable = True) 63 | # we suppose the max length of sentence is 300 64 | self.position_W = tf.Variable(tf.random_uniform([300,self.extend_feature_dim], -1.0, 1.0),name = 'W',trainable = True) 65 | # self.overlap_W = tf.Variable(a,name="W",trainable = True) 66 | self.para.append(self.embedding_W) 67 | self.para.append(self.overlap_W) 68 | self.para.append(self.position_W) 69 | #get embedding 70 | self.q_pos_embedding = self.concat_embedding(self.question,self.q_pos_overlap,self.q_position,self.q_conv_position) 71 | print self.q_pos_embedding 72 | self.q_neg_embedding = self.concat_embedding(self.question,self.q_neg_overlap,self.q_position,self.q_conv_position) 73 | self.a_pos_embedding = self.concat_embedding(self.answer, self.a_pos_overlap,self.a_pos_position,self.a_conv_position) 74 | self.a_neg_embedding = self.concat_embedding(self.answer_negative,self.a_neg_overlap,self.a_neg_position,self.a_conv_position) 75 | def convolution(self): 76 | print 'convolution:wide_convolution' 77 | self.kernels = [] 78 | for i,filter_size in enumerate(self.filter_sizes): 79 | with tf.name_scope('conv-max-pool-%s' % filter_size): 80 | filter_shape = [filter_size,self.total_embedding_dim,1,self.num_filters] 81 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev = 0.1), name="W") 82 | b = tf.Variable(tf.constant(0.0, shape=[self.num_filters]), name="b") 83 | self.kernels.append((W,b)) 84 | self.para.append(W) 85 | self.para.append(b) 86 | #convolution 87 | embeddings = [self.q_pos_embedding,self.q_neg_embedding,self.a_pos_embedding,self.a_neg_embedding] 88 | self.q_pos_feature_map,self.q_neg_feature_map,self.a_pos_feature_map,self.a_neg_feature_map = \ 89 | [self.wide_convolution(embedding) for embedding in embeddings] 90 | def pooling_graph(self): 91 | print 'pooling: max pooling or attentive pooling' 92 | #pooling strategy 93 | if self.pooling == 'max': 94 | print self.pooling 95 | self.q_pos_pooling = tf.reshape(self.max_pooling(self.q_pos_feature_map,self.max_input_left),[-1,self.num_filters_total]) 96 | self.q_neg_pooling = tf.reshape(self.max_pooling(self.q_neg_feature_map,self.max_input_left),[-1,self.num_filters_total]) 97 | self.a_pos_pooling = tf.reshape(self.max_pooling(self.a_pos_feature_map,self.max_input_right),[-1,self.num_filters_total]) 98 | self.a_neg_pooling = tf.reshape(self.max_pooling(self.a_neg_feature_map,self.max_input_right),[-1,self.num_filters_total]) 99 | 100 | elif self.pooling == 'attentive': 101 | print self.pooling 102 | with tf.name_scope('attention'): 103 | self.U = tf.Variable(tf.truncated_normal(shape = [self.num_filters_total,self.num_filters_total],stddev = 0.01,name = 'U')) 104 | self.para.append(self.U) 105 | self.q_pos_pooling,self.a_pos_pooling = self.attentive_pooling(self.q_pos_feature_map,self.a_pos_feature_map) 106 | self.q_neg_pooling,self.a_neg_pooling = self.attentive_pooling(self.q_neg_feature_map,self.a_neg_feature_map) 107 | # print self.q_pos_pooling 108 | else: 109 | print 'no implement' 110 | exit(0) 111 | def create_loss(self): 112 | 113 | with tf.name_scope('score'): 114 | self.score12 = 
self.getCosine(self.q_pos_pooling,self.a_pos_pooling) 115 | self.score13 = self.getCosine(self.q_neg_pooling,self.a_neg_pooling) 116 | l2_loss = tf.constant(0.0) 117 | for p in self.para: 118 | l2_loss += tf.nn.l2_loss(p) 119 | with tf.name_scope("loss"): 120 | self.losses = tf.maximum(0.0, tf.subtract(0.05, tf.subtract(self.score12, self.score13))) 121 | self.loss = tf.reduce_sum(self.losses) + self.l2_reg_lambda * l2_loss 122 | tf.summary.scalar('loss', self.loss) 123 | # Accuracy 124 | with tf.name_scope("accuracy"): 125 | self.correct = tf.equal(0.0, self.losses) 126 | self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy") 127 | tf.summary.scalar('accuracy', self.accuracy) 128 | def create_op(self): 129 | self.global_step = tf.Variable(0, name="global_step", trainable = False) 130 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate) 131 | self.grads_and_vars = self.optimizer.compute_gradients(self.loss) 132 | self.train_op = self.optimizer.apply_gradients(self.grads_and_vars, global_step = self.global_step) 133 | 134 | def concat_embedding(self,words_indice,overlap_indice,position_indice,conv_position): 135 | embedded_chars_q = tf.nn.embedding_lookup(self.embedding_W,words_indice) 136 | position_embedding = tf.nn.embedding_lookup(self.position_W,position_indice) 137 | overlap_embedding_q = tf.nn.embedding_lookup(self.overlap_W,overlap_indice) 138 | if not self.overlap_needed : 139 | if not self.position_needed: 140 | all_embedding = embedded_chars_q 141 | # return tf.expand_dims(embedded_chars_q,-1) 142 | else: 143 | all_embedding = tf.concat([embedded_chars_q,position_embedding],2) 144 | # return tf.expand_dims(tf.concat([embedded_chars_q,position_embedding],2),-1) 145 | else: 146 | if not self.position_needed: 147 | all_embedding = tf.concat([embedded_chars_q,overlap_embedding_q],2) 148 | # return tf.expand_dims(tf.concat([embedded_chars_q,overlap_embedding_q],2),-1) 149 | else: 150 | all_embedding = tf.concat([embedded_chars_q,overlap_embedding_q,position_embedding],2) 151 | # return tf.expand_dims(tf.concat([embedded_chars_q,overlap_embedding_q,position_embedding],2),-1) 152 | # all_embedding = tf.multiply(all_embedding,conv_position) 153 | return tf.expand_dims(all_embedding,-1) 154 | 155 | def max_pooling(self,conv,input_length): 156 | pooled = tf.nn.max_pool( 157 | conv, 158 | ksize = [1, input_length, 1, 1], 159 | strides = [1, 1, 1, 1], 160 | padding = 'VALID', 161 | name="pool") 162 | return pooled 163 | def getCosine(self,q,a): 164 | pooled_flat_1 = tf.nn.dropout(q, self.dropout_keep_prob) 165 | pooled_flat_2 = tf.nn.dropout(a, self.dropout_keep_prob) 166 | 167 | pooled_len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_1), 1)) 168 | pooled_len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_2, pooled_flat_2), 1)) 169 | pooled_mul_12 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_2), 1) 170 | score = tf.div(pooled_mul_12, tf.multiply(pooled_len_1, pooled_len_2), name="scores") 171 | return score 172 | 173 | def attentive_pooling(self,input_left,input_right): 174 | Q = tf.reshape(input_left,[-1,self.max_input_left,len(self.filter_sizes) * self.num_filters],name = 'Q') 175 | A = tf.reshape(input_right,[-1,self.max_input_right,len(self.filter_sizes) * self.num_filters],name = 'A') 176 | # G = tf.tanh(tf.matmul(tf.matmul(Q,self.U),\ 177 | # A,transpose_b = True),name = 'G') 178 | 179 | first = tf.matmul(tf.reshape(Q,[-1,len(self.filter_sizes) * self.num_filters]),self.U) 180 | print 
tf.reshape(Q,[-1,len(self.filter_sizes) * self.num_filters]) 181 | print self.U 182 | second_step = tf.reshape(first,[-1,self.max_input_left,len(self.filter_sizes) * self.num_filters]) 183 | result = tf.matmul(second_step,tf.transpose(A,perm = [0,2,1])) 184 | # print 'result',result 185 | G = tf.tanh(result) 186 | 187 | # G = result 188 | # column-wise pooling ,row-wise pooling 189 | row_pooling = tf.reduce_max(G,1,True,name = 'row_pooling') 190 | col_pooling = tf.reduce_max(G,2,True,name = 'col_pooling') 191 | 192 | self.attention_q = tf.nn.softmax(col_pooling,1,name = 'attention_q') 193 | print self.attention_q 194 | self.see = self.attention_q 195 | 196 | self.attention_a = tf.nn.softmax(row_pooling,name = 'attention_a') 197 | R_q = tf.reshape(tf.matmul(Q,self.attention_q,transpose_a = 1),[-1,self.num_filters * len(self.filter_sizes)],name = 'R_q') 198 | R_a = tf.reshape(tf.matmul(self.attention_a,A),[-1,self.num_filters * len(self.filter_sizes)],name = 'R_a') 199 | 200 | return R_q,R_a 201 | 202 | def wide_convolution(self,embedding): 203 | cnn_outputs = [] 204 | for i,filter_size in enumerate(self.filter_sizes): 205 | conv = tf.nn.conv2d( 206 | embedding, 207 | self.kernels[i][0], 208 | strides=[1, 1, self.total_embedding_dim, 1], 209 | padding='SAME', 210 | name="conv-1" 211 | ) 212 | h = tf.nn.relu(tf.nn.bias_add(conv, self.kernels[i][1]), name="relu-1") 213 | cnn_outputs.append(h) 214 | cnn_reshaped = tf.concat(cnn_outputs,3) 215 | return cnn_reshaped 216 | def narrow_convolution_pooling(self): 217 | print 'narrow pooling' 218 | self.kernels = [] 219 | for i,filter_size in enumerate(self.filter_sizes): 220 | with tf.name_scope('conv-max-pool-%s' % filter_size): 221 | filter_shape = [filter_size,self.total_embedding_dim,1,self.num_filters] 222 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev = 0.1), name="W") 223 | b = tf.Variable(tf.constant(0.0, shape=[self.num_filters]), name="b") 224 | self.kernels.append((W,b)) 225 | self.para.append(W) 226 | self.para.append(b) 227 | embeddings = [self.q_pos_embedding,self.q_neg_embedding,self.a_pos_embedding,self.a_neg_embedding] 228 | self.q_pos_pooling,self.q_neg_pooling,self.a_pos_pooling,self.a_neg_pooling = [self.getFeatureMap(embedding,right = i / 2) for i,embedding in enumerate(embeddings) ] 229 | def getFeatureMap(self,embedding,right=True): 230 | if right == 1: 231 | max_length = self.max_input_right 232 | else: 233 | max_length = self.max_input_left 234 | pooled_outputs = [] 235 | for i,filter_size in enumerate(self.filter_sizes): 236 | conv = tf.nn.conv2d( 237 | embedding, 238 | self.kernels[i][0], 239 | strides=[1, 1, 1, 1], 240 | padding='VALID', 241 | name="conv-1" 242 | ) 243 | h = tf.nn.relu(tf.nn.bias_add(conv, self.kernels[i][1]), name="relu-1") 244 | 245 | pooled = tf.nn.max_pool( 246 | h, 247 | ksize=[1, max_length - filter_size + 1, 1, 1], 248 | strides=[1, 1, 1, 1], 249 | padding='VALID', 250 | name="poll-1" 251 | ) 252 | pooled_outputs.append(pooled) 253 | pooled_reshape = tf.reshape(tf.concat(pooled_outputs,3), [-1, self.num_filters_total]) 254 | return pooled_reshape 255 | def variable_summaries(self,var): 256 | with tf.name_scope('summaries'): 257 | mean = tf.reduce_mean(var) 258 | tf.summary.scalar('mean', mean) 259 | with tf.name_scope('stddev'): 260 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 261 | tf.summary.scalar('stddev', stddev) 262 | tf.summary.scalar('max', tf.reduce_max(var)) 263 | tf.summary.scalar('min', tf.reduce_min(var)) 264 | tf.summary.histogram('histogram', var) 265 | 266 | 
def build_graph(self): 267 | self.create_placeholder() 268 | self.create_position() 269 | self.add_embeddings() 270 | if self.conv == 'narrow': 271 | self.narrow_convolution_pooling() 272 | else: 273 | self.convolution() 274 | self.pooling_graph() 275 | self.create_loss() 276 | self.create_op() 277 | self.merged = tf.summary.merge_all() 278 | 279 | 280 | if __name__ == '__main__': 281 | cnn = QA_CNN_extend(max_input_left = 33, 282 | max_input_right = 40, 283 | batch_size = 3, 284 | vocab_size = 5000, 285 | embedding_size = 100, 286 | filter_sizes = [3,4,5], 287 | num_filters = 64, 288 | dropout_keep_prob = 1.0, 289 | embeddings = None, 290 | l2_reg_lambda = 0.0, 291 | overlap_needed = False, 292 | trainable = True, 293 | extend_feature_dim = 10, 294 | position_needed = False, 295 | pooling = 'attentive', 296 | conv = 'wide') 297 | cnn.build_graph() 298 | input_x_1 = np.reshape(np.arange(3 * 33),[3,33]) 299 | input_x_2 = np.reshape(np.arange(3 * 40),[3,40]) 300 | input_x_3 = np.reshape(np.arange(3 * 40),[3,40]) 301 | 302 | q_pos_embedding = np.ones((3,33)) 303 | q_neg_embedding = np.ones((3,33)) 304 | a_pos_embedding = np.ones((3,40)) 305 | a_neg_embedding = np.ones((3,40)) 306 | 307 | q_position = np.ones((3,33)) 308 | a_pos_position = np.ones((3,40)) 309 | a_neg_position = np.ones((3,40)) 310 | 311 | with tf.Session() as sess: 312 | sess.run(tf.global_variables_initializer()) 313 | feed_dict = { 314 | cnn.question:input_x_1, 315 | cnn.answer:input_x_2, 316 | cnn.answer_negative:input_x_3, 317 | # cnn.q_pos_overlap:q_pos_embedding, 318 | # cnn.q_neg_overlap:q_neg_embedding, 319 | # cnn.a_pos_overlap:a_pos_embedding, 320 | # cnn.a_neg_overlap:a_neg_embedding, 321 | # cnn.q_position:q_position, 322 | # cnn.a_pos_position:a_pos_position, 323 | # cnn.a_neg_position:a_neg_position 324 | } 325 | question,answer,score = sess.run([cnn.question,cnn.answer,cnn.score12],feed_dict) 326 | print question.shape,answer.shape 327 | print score 328 | 329 | 330 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is a tensorflow implementation of NLPCC2017 DBQA task. Our result ranks 5th amoung the 21 submission. 2 | 3 | [Enhanced Embedding based Attentive Pooling Network for Answer Selection](http://tcci.ccf.org.cn/conference/2017/) 4 | 5 | We utilize chinese wiki corpus to train our embedding. You can train embedding by youself or contact us to get what we use. 
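The embedding file is read in plain word2vec text format: an optional `vocab_size dim` header line followed by one `word v1 ... vN` line per word (see `helper.get_embedding`). If you want to train the vectors yourself, a minimal sketch using jieba for segmentation and gensim (4.x) for word2vec is shown below; this script is not part of the repository, and the corpus paths are placeholders for your own Chinese Wikipedia dump.

```python
# Sketch only: train 300-d Chinese word vectors from a pre-extracted wiki text file.
# 'zhwiki.txt', 'zhwiki.seg.txt' and 'zh_wiki_300d.txt' are placeholder paths.
import jieba
from gensim.models import Word2Vec

# one article per line in -> space-separated tokens per line out (LineSentence format)
with open('zhwiki.txt', encoding='utf-8') as f_in, \
        open('zhwiki.seg.txt', 'w', encoding='utf-8') as f_out:
    for line in f_in:
        f_out.write(' '.join(jieba.cut(line.strip())) + '\n')

model = Word2Vec(corpus_file='zhwiki.seg.txt', vector_size=300,
                 window=5, min_count=5, workers=4)
# writes a "vocab_size dim" header plus one "word v1 ... vN" line per word,
# which is the format helper.get_embedding expects
model.wv.save_word2vec_format('zh_wiki_300d.txt', binary=False)
```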
6 | 7 | ## Requirements 8 | 9 | - python3 10 | 11 | - Tensorflow = 1.12 12 | 13 | ## Training 14 | 15 | 16 | ``` 17 | ./train.py --overlap_needed True --position_needed True 18 | ``` 19 | 20 | ## 21 | 22 | 23 | 24 | | method | pooling | map(test1) | map(test2) 25 | | :--- | :----: | ----: |:----:| 26 | | CNN-base | max | 0.782 | 0.657 27 | | CNN-base | attentive | 0.772 | 0.646 28 | | +overlap | max | 0.828 | 0.674 29 | | +overlap | attentive | 0.811 | 0.672| 30 | | +position,overlap | attentive | 0.819 | 0.675 31 | | +position,overlap | max | 0.834 | 0.679 32 | 33 | 34 | -------------------------------------------------------------------------------- /__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/evaluation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/__pycache__/evaluation.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/helper.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/__pycache__/helper.cpython-37.pyc -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | # Model Hyperparameters 3 | # flags.DEFINE_integer("embedding_dim",300, "Dimensionality of character embedding (default: 128)") 4 | # flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')") 5 | # flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 6 | # flags.DEFINE_float("dropout_keep_prob", 1, "Dropout keep probability (default: 0.5)") 7 | # flags.DEFINE_float("l2_reg_lambda", 0.000001, "L2 regularizaion lambda (default: 0.0)") 8 | # flags.DEFINE_float("learning_rate", 1e-3, "learn rate( default: 0.0)") 9 | # flags.DEFINE_integer("max_len_left", 40, "max document length of left input") 10 | # flags.DEFINE_integer("max_len_right", 40, "max document length of right input") 11 | # flags.DEFINE_string("loss","pair_wise","loss function (default:point_wise)") 12 | # flags.DEFINE_integer('extend_feature_dim',10,'overlap_feature_dim') 13 | # # Training parameters 14 | # flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 15 | # flags.DEFINE_boolean("trainable", False, "is embedding trainable? 
(default: False)") 16 | # flags.DEFINE_integer("num_epochs", 100, "Number of training epochs (default: 200)") 17 | # flags.DEFINE_integer("evaluate_every", 500, "Evaluate model on dev set after this many steps (default: 100)") 18 | # flags.DEFINE_integer("checkpoint_every", 500, "Save model after this many steps (default: 100)") 19 | # flags.DEFINE_boolean('overlap_needed',False,"is overlap used") 20 | # flags.DEFINE_boolean('position_needed',False,'is position embedding used') 21 | # flags.DEFINE_boolean('dns','False','whether use dns or not') 22 | # flags.DEFINE_string('data','wiki','data set') 23 | # flags.DEFINE_string('pooling','max','max pooling or attentive pooling') 24 | # flags.DEFINE_float('sample_train',1,'sampe my train data') 25 | # flags.DEFINE_boolean('fresh',True,'wheather recalculate the embedding or overlap default is True') 26 | # flags.DEFINE_boolean('clean',True,'whether we clean the data') 27 | # flags.DEFINE_string('conv','wide','wide conv or narrow') 28 | # flags.DEFINE_integer('gpu',0,'gpu number') 29 | # # Misc Parameters 30 | # flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 31 | # flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 32 | 33 | # #data_help parameters 34 | # flags.DEFINE_boolean('isEnglish',True,'whether is data is english') 35 | # flags.DEFINE_string('en_embedding_file','embedding/aquaint+wiki.txt.gz.ndim=50.bin','english embedding') 36 | # flags.DEFINE_string('ch_embedding_file','embedding/','chinese embedding') 37 | # flags.DEFINE_string('ch_stopwords','model/chStopWordsSimple.txt','chinese stopwords') 38 | 39 | flags = tf.app.flags 40 | flags.DEFINE_integer( 41 | "embedding_size", 300, "Dimensionality of character embedding (default: 128)") 42 | flags.DEFINE_string("filter_sizes", "1,2,3,5", 43 | "Comma-separated filter sizes (default: '3,4,5')") 44 | flags.DEFINE_integer( 45 | "num_filters", 64, "Number of filters per filter size (default: 128)") 46 | flags.DEFINE_float("dropout_keep_prob", 1, 47 | "Dropout keep probability (default: 0.5)") 48 | flags.DEFINE_float("l2_reg_lambda", 0.000001, 49 | "L2 regularizaion lambda (default: 0.0)") 50 | flags.DEFINE_float("learning_rate", 0.001, 51 | "learn rate( default: 0.0)") 52 | flags.DEFINE_integer("max_len_left", 40, 53 | "max document length of left input") 54 | flags.DEFINE_integer("max_len_right", 40, 55 | "max document length of right input") 56 | flags.DEFINE_string("loss", "pair_wise", 57 | "loss function (default:point_wise)") 58 | flags.DEFINE_string("model_name", "cnn", "cnn or rnn") 59 | 60 | # Training parameters 61 | flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 62 | flags.DEFINE_boolean("trainable", False, 63 | "is embedding trainable? 
(default: False)") 64 | flags.DEFINE_integer("num_epoches", 100, 65 | "Number of training epochs (default: 100)") 66 | flags.DEFINE_integer( 67 | "evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)") 68 | flags.DEFINE_integer( 69 | "checkpoint_every", 500, "Save model after this many steps (default: 100)") 70 | 71 | flags.DEFINE_string( 72 | 'embedding_file', '../../embedding/glove.6B/glove.6B.300d.txt', None) 73 | flags.DEFINE_string('data_dir', '../data/wiki', 'nlpcc') 74 | flags.DEFINE_string('summaries_dir','log/summary','log/summary') 75 | 76 | flags.DEFINE_string( 77 | 'pooling', 'max', 'max pooling or attentive pooling') 78 | flags.DEFINE_string('attention', 'attentive', 'attention strategy') 79 | flags.DEFINE_boolean('clean', True, 'whether we clean the data') 80 | flags.DEFINE_integer('gpu', 0, 'gpu number') 81 | # Misc Parameters 82 | flags.DEFINE_boolean("debug",False,'debug the model') 83 | flags.DEFINE_boolean("allow_soft_placement", 84 | True, "Allow device soft device placement") 85 | flags.DEFINE_boolean("log_device_placement", 86 | False, "Log placement of ops on devices") 87 | 88 | args = flags.FLAGS -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pandas as pd 3 | import subprocess 4 | import platform,os 5 | import sklearn 6 | import numpy as np 7 | qa_path="data/nlpcc-iccpol-2016.dbqa.testing-data" 8 | 9 | def mrr_metric(group): 10 | group = sklearn.utils.shuffle(group,random_state =132) 11 | candidates=group.sort_values(by='score',ascending=False).reset_index() 12 | rr=candidates[candidates["flag"]==1].index.min()+1 13 | if rr!=rr: 14 | return 0 15 | return 1.0/rr 16 | def map_metric(group): 17 | group = sklearn.utils.shuffle(group,random_state =132) 18 | ap=0 19 | candidates=group.sort_values(by='score',ascending=False).reset_index() 20 | correct_candidates=candidates[candidates["flag"]==1] 21 | if len(correct_candidates)==0: 22 | return 0 23 | for i,index in enumerate(correct_candidates.index): 24 | ap+=1.0* (i+1) /(index+1) 25 | #print( ap/len(correct_candidates)) 26 | return ap/len(correct_candidates) 27 | def mrr_metric_filter(group): 28 | group = sklearn.utils.shuffle(group,random_state =132) 29 | candidates = group.sort_values(by='score',ascending=False).reset_index() 30 | rr=candidates[candidates["flag"]==1].index.min()+1 31 | if rr!=rr: 32 | return False 33 | mrr = 1.0 / rr 34 | return mrr < 0.5 35 | def evaluation_plus(modelfile, groundtruth=qa_path): 36 | answers=pd.read_csv(groundtruth,header=None,sep="\t",names=["question","answer","flag"],quoting =3) 37 | answers["score"]=pd.read_csv(modelfile,header=None,sep="\t",names=["score"],quoting =3) 38 | print( answers.groupby("question").apply(mrr_metric).mean()) 39 | print( answers.groupby("question").apply(map_metric).mean()) 40 | 41 | def eval(predicted,groundtruth=qa_path, file_flag=False): 42 | if 'Windows' in platform.system() and file_flag ==False: 43 | modelfile=write2file(predicted) 44 | evaluationbyFile(modelfile) 45 | return 46 | 47 | if type(groundtruth)!= str : 48 | answers=groundtruth 49 | else: 50 | answers=pd.read_csv(groundtruth,header=None,sep="\t",names=["question","answer","flag"],quoting =3) 51 | answers["score"]=predicted 52 | mrr= answers.groupby("question").apply(mrr_metric).mean() 53 | map= answers.groupby("question").apply(map_metric).mean() 54 | return map,mrr 55 | def 
evaluate(predicted,groundtruth): 56 | filename=write2file(predicted) 57 | evaluationbyFile(filename,groundtruth=groundtruth) 58 | def write2file(datas,filename="train.QApair.TJU_IR_QA.score"): 59 | with open(filename,"w") as f: 60 | for data in datas: 61 | f.write(("%.10f" %data )+"\n") 62 | return filename 63 | 64 | 65 | def evaluationbyFile(modelfile,resultfile="result.text",groundtruth=qa_path): 66 | cmd="test.exe " + " ".join([groundtruth,modelfile,resultfile]) 67 | print( modelfile[19:-6]+":") # ) 68 | subprocess.call(cmd, shell=True) 69 | def evaluationBypandas(df,predicted): 70 | df["score"]=predicted 71 | mrr= df.groupby("question").apply(mrr_metric).mean() 72 | map= df.groupby("question").apply(map_metric).mean() 73 | return map,mrr 74 | def precision_per(group): 75 | group = sklearn.utils.shuffle(group,random_state =132) 76 | candidates=group.sort_values(by='score',ascending=False).reset_index() 77 | rr=candidates[candidates["flag"]==1].index.min() 78 | if rr==0: 79 | return 1 80 | return 0 81 | def precision(df,predicted): 82 | df["score"]=predicted 83 | precision = df.groupby("question").apply(precision_per).mean() 84 | return precision 85 | 86 | def briany_test_file(df_test, predicted=None,mode = 'test'): 87 | N = len(df_test) 88 | 89 | nnet_outdir = 'tmp/' + mode 90 | if not os.path.exists(nnet_outdir): 91 | os.makedirs(nnet_outdir) 92 | question2id=dict() 93 | for index,quesion in enumerate( df_test["question"].unique()): 94 | question2id[quesion]=index 95 | 96 | df_submission = pd.DataFrame(index=np.arange(N), columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id']) 97 | df_submission['qid'] =df_test.apply(lambda row: question2id[row['question']],axis=1) 98 | df_submission['iter'] = 0 99 | df_submission['docno'] = np.arange(N) 100 | df_submission['rank'] = 0 101 | if predicted is None: 102 | df_submission['sim'] = df_test['score'] 103 | else: 104 | df_submission['sim'] = predicted 105 | df_submission['run_id'] = 'nnet' 106 | df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'), header=False, index=False, sep=' ') 107 | 108 | df_gold = pd.DataFrame(index=np.arange(N), columns=['qid', 'iter', 'docno', 'rel']) 109 | df_gold['qid'] = df_test.apply(lambda row: question2id[row['question']],axis=1) 110 | df_gold['iter'] = 0 111 | df_gold['docno'] = np.arange(N) 112 | df_gold['rel'] = df_test['flag'] 113 | df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'), header=False, index=False, sep=' ') 114 | 115 | if __name__ =="__main__": 116 | data_dir="data/"+"wiki" 117 | train_file=os.path.join(data_dir,"train.txt") 118 | test_file=os.path.join(data_dir,"test.txt") 119 | 120 | train=pd.read_csv(train_file,header=None,sep="\t",names=["question","answer","flag"],quoting =3) 121 | train["score"]=np.random.randn(len(train)) 122 | briany_test_file(train) -------------------------------------------------------------------------------- /helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8-*- 2 | import numpy as np 3 | import random,os,math 4 | import pandas as pd 5 | import sklearn 6 | import time 7 | import matplotlib.pyplot as plt 8 | from collections import Counter 9 | import seaborn as sns 10 | import evaluation 11 | import string 12 | import jieba 13 | from nltk import stem 14 | from tqdm import tqdm 15 | import chardet 16 | import re 17 | import config 18 | import logging 19 | from functools import wraps 20 | 21 | # stopwords = { word.decode("utf-8") for word in 
open("model/chStopWordsSimple.txt").read().split()} 22 | # ner_dict = pickle.load(open('ner_dict')) 23 | 24 | #print( tf.__version__) 25 | def log_time_delta(func): 26 | @wraps(func) 27 | def _deco(*args, **kwargs): 28 | start = time.time() 29 | ret = func(*args, **kwargs) 30 | end = time.time() 31 | delta = end - start 32 | print( "%s runed %.2f seconds"% (func.__name__,delta)) 33 | return ret 34 | return _deco 35 | def remove_the_unanswered_sample(df): 36 | """ 37 | clean the dataset 38 | :param df: dataframe 39 | """ 40 | counter = df.groupby("question").apply(lambda group: sum(group["flag"])) 41 | questions_have_correct = counter[counter > 0].index 42 | counter = df.groupby("question").apply( 43 | lambda group: sum(group["flag"] == 0)) 44 | questions_have_uncorrect = counter[counter > 0].index 45 | counter = df.groupby("question").apply(lambda group: len(group["flag"])) 46 | questions_multi = counter[counter > 1].index 47 | 48 | return df[df["question"].isin(questions_have_correct) & df["question"].isin(questions_have_correct) & df["question"].isin(questions_have_uncorrect)].reset_index() 49 | 50 | def load_train_file(data_dir, filter=False): 51 | """ 52 | load the dataset 53 | :param data_dir: the data_dir 54 | :param filter=False: whether clean the dataset 55 | """ 56 | train_df = pd.read_csv(os.path.join(data_dir, 'train.txt'), header=None, sep='\t', names=[ 57 | 'question', 'answer', 'flag'], quoting=3).fillna('') 58 | if filter: 59 | train_df = remove_the_unanswered_sample(train_df) 60 | dev_df = pd.read_csv(os.path.join(data_dir, 'dev.txt'), header=None, sep='\t', names=[ 61 | 'question', 'answer', 'flag'], quoting=3).fillna('') 62 | if filter: 63 | dev_df = remove_the_unanswered_sample(dev_df) 64 | test_df = pd.read_csv(os.path.join(data_dir, 'test.txt'), header=None, sep='\t', names=[ 65 | 'question', 'answer', 'flag'], quoting=3).fillna('') 66 | if filter: 67 | test_df = remove_the_unanswered_sample(test_df) 68 | return train_df, test_df, test_df 69 | 70 | def cut(sentence): 71 | """ 72 | split the sentence to tokens 73 | :param sentence: raw sentence 74 | """ 75 | tokens = sentence.split() 76 | 77 | return tokens 78 | 79 | def get_alphabet(corpuses): 80 | """ 81 | obtain the dict 82 | :param corpuses: 83 | """ 84 | word_counter = Counter() 85 | 86 | for corpus in corpuses: 87 | for texts in [corpus["question"].unique(), corpus["answer"]]: 88 | for sentence in texts: 89 | tokens = cut(sentence) 90 | for token in tokens: 91 | word_counter[token] += 1 92 | print("there are {} words in dict".format(len(word_counter))) 93 | logging.info("there are {} words in dict".format(len(word_counter))) 94 | word_dict = {word: e + 2 for e, word in enumerate(list(word_counter))} 95 | word_dict['UNK'] = 1 96 | word_dict[''] = 0 97 | 98 | return word_dict 99 | 100 | def get_embedding(alphabet, filename="", embedding_size=100): 101 | embedding = np.random.rand(len(alphabet), embedding_size) 102 | if filename is None: 103 | return embedding 104 | with open(filename, encoding='utf-8') as f: 105 | i = 0 106 | for line in f: 107 | i += 1 108 | if i % 100000 == 0: 109 | print('epch %d' % i) 110 | items = line.strip().split(' ') 111 | if len(items) == 2: 112 | vocab_size, embedding_size = items[0], items[1] 113 | print((vocab_size, embedding_size)) 114 | else: 115 | word = items[0] 116 | if word in alphabet: 117 | embedding[alphabet[word]] = items[1:] 118 | 119 | print('done') 120 | return embedding 121 | 122 | 123 | def convert_to_word_ids(sentence,alphabet,max_len = 40): 124 | """ 125 | docstring 
here 126 | :param sentence: 127 | :param alphabet: 128 | :param max_len=40: 129 | """ 130 | indices = [] 131 | tokens = cut(sentence) 132 | 133 | for word in tokens: 134 | if word in alphabet: 135 | indices.append(alphabet[word]) 136 | else: 137 | continue 138 | result = indices + [alphabet['']] * (max_len - len(indices)) 139 | 140 | return result[:max_len] 141 | def gen_with_pair_train(df, alphabet, q_len,a_len): 142 | pairs = [] 143 | for question in df['question'].unique(): 144 | 145 | 146 | group = df[df['question'] == question] 147 | pos_group = group[group['flag'] == 1] # positive answer 148 | neg_group = group[group['flag'] == 0] 149 | neg_group = neg_group.reset_index() 150 | 151 | question_indice = convert_to_word_ids(question,alphabet,max_len = q_len) 152 | 153 | negtive_pool_index = range(len(neg_group)) 154 | 155 | if len(neg_group) > 0: 156 | for pos in pos_group['answer']: 157 | neg_index = np.random.choice(negtive_pool_index) 158 | neg = neg_group.loc[neg_index]['answer'] 159 | 160 | positive_answer_indice = convert_to_word_ids(pos,alphabet,a_len) 161 | negative_answer_indice = convert_to_word_ids(neg,alphabet,a_len) 162 | pairs.append((question_indice,positive_answer_indice,negative_answer_indice)) 163 | return pairs 164 | 165 | def gen_with_pair_test(df,alphabet,q_len,a_len): 166 | pairs = [] 167 | for _,row in df.iterrows(): 168 | question_indice = convert_to_word_ids(row['question'],alphabet,max_len=q_len) 169 | answer_indice = convert_to_word_ids(row['answer'],alphabet,max_len = a_len) 170 | pairs.append((question_indice,answer_indice)) 171 | 172 | return pairs 173 | def batch_iter(data, batch_size, alphabet,shuffle = False,q_len = 33,a_len = 33): 174 | if shuffle: 175 | data = gen_with_pair_train( 176 | data, alphabet,q_len,a_len ) 177 | else: 178 | data = gen_with_pair_test(data,alphabet,q_len,a_len) 179 | data = np.array(data) 180 | data_size = len(data) 181 | 182 | if shuffle: 183 | shuffle_indice = np.random.permutation(np.arange(data_size)) 184 | data = data[shuffle_indice] 185 | 186 | num_batch = int((data_size - 1) / float(batch_size)) + 1 187 | 188 | for i in range(num_batch): 189 | start_index = i * batch_size 190 | end_index = min((i + 1) * batch_size, data_size) 191 | 192 | yield data[start_index:end_index] 193 | 194 | @log_time_delta 195 | def get_overlap_dict(df,alphabet,q_len = 40,a_len = 40): 196 | d = dict() 197 | for question in df['question'].unique(): 198 | group = df[df['question'] == question] 199 | answers = group['answer'] 200 | for ans in answers: 201 | q_overlap,a_overlap = overlap_index(question,ans,q_len,a_len) 202 | d[(question,ans)] = (q_overlap,a_overlap) 203 | return d 204 | # calculate the overlap_index 205 | def overlap_index(question,answer,q_len,a_len,stopwords = []): 206 | qset = set(cut(question)) 207 | aset = set(cut(answer)) 208 | 209 | q_index = np.zeros(q_len) 210 | a_index = np.zeros(a_len) 211 | 212 | overlap = qset.intersection(aset) 213 | for i,q in enumerate(cut(question)[:q_len]): 214 | value = 1 215 | if q in overlap: 216 | value = 2 217 | q_index[i] = value 218 | for i,a in enumerate(cut(answer)[:a_len]): 219 | value = 1 220 | if a in overlap: 221 | value = 2 222 | a_index[i] = value 223 | return q_index,a_index 224 | def position_index(sentence,length): 225 | index = np.zeros(length) 226 | 227 | raw_len = len(cut(sentence)) 228 | index[:min(raw_len,length)] = range(1,min(raw_len + 1,length + 1)) 229 | # print index 230 | return index 231 | def transform(flag): 232 | if flag == 1: 233 | return [0,1] 234 | else: 235 | 
return [1,0] 236 | @log_time_delta 237 | def batch_gen_with_single(df,alphabet,batch_size = 10,q_len = 33,a_len = 40,overlap_dict = None): 238 | pairs=[] 239 | for index,row in df.iterrows(): 240 | quetion = encode_to_split(row["question"],alphabet,max_sentence = q_len) 241 | answer = encode_to_split(row["answer"],alphabet,max_sentence = a_len) 242 | if overlap_dict: 243 | q_pos_overlap,a_pos_overlap = overlap_index(row["question"],row["answer"],q_len,a_len) 244 | else: 245 | q_pos_overlap,a_pos_overlap = overlap_dict[(row["question"],row["answer"])] 246 | 247 | q_position = position_index(row['question'],q_len) 248 | a_pos_position = position_index(row['answer'],a_len) 249 | pairs.append((quetion,answer,q_pos_overlap,a_pos_overlap,q_position,a_pos_position)) 250 | # n_batches= int(math.ceil(df["flag"].sum()*1.0/batch_size)) 251 | # n_batches = int(len(pairs)*1.0/batch_size) 252 | # # pairs = sklearn.utils.shuffle(pairs,random_state =132) 253 | # for i in range(0,n_batches): 254 | # batch = pairs[i*batch_size:(i+1) * batch_size] 255 | num_batches_per_epoch = int((len(pairs)-1)/ batch_size) + 1 256 | for batch_num in range(num_batches_per_epoch): 257 | start_index = batch_num * batch_size 258 | end_index = min((batch_num + 1) * batch_size, len(pairs)) 259 | batch = pairs[start_index:end_index] 260 | yield [[pair[j] for pair in batch] for j in range(6)] 261 | # batch= pairs[n_batches*batch_size:] + [pairs[n_batches*batch_size]] * (batch_size- len(pairs)+n_batches*batch_size ) 262 | # yield [[pair[i] for pair in batch] for i in range(6)] 263 | def overlap_visualize(): 264 | train,test,dev = load("nlpcc",filter = False) 265 | 266 | test = test.reindex(np.random.permutation(test.index)) 267 | df = train 268 | df['qlen'] = df['question'].str.len() 269 | df['alen'] = df['answer'].str.len() 270 | 271 | df['q_n_words'] = df['question'].apply(lambda row:len(row.split(' '))) 272 | df['a_n_words'] = df['answer'].apply(lambda row:len(row.split(' '))) 273 | 274 | def normalized_word_share(row): 275 | w1 = set(map(lambda word: word.lower().strip(), row['question'].split(" "))) 276 | w2 = set(map(lambda word: word.lower().strip(), row['answer'].split(" "))) 277 | return 1.0 * len(w1 & w2)/(len(w1) + len(w2)) 278 | def word_overlap(row): 279 | w1 = set(map(lambda word: word.lower().strip(), row['question'].split(" "))) 280 | w2 = set(map(lambda word: word.lower().strip(), row['answer'].split(" "))) 281 | return w1.intersection(w2) 282 | df['word_share'] = df.apply(normalized_word_share, axis=1) 283 | plt.figure(figsize=(12, 8)) 284 | plt.subplot(1,2,1) 285 | sns.violinplot(x = 'flag', y = 'word_share', data = df[0:50000],hue = 'flag') 286 | plt.subplot(1,2,2) 287 | # sns.distplot(df[df['flag'] == 1.0]['word_share'][0:10000], color = 'green',label = 'not match') 288 | # sns.distplot(df[df['flag'] == 0.0]['word_share'][0:10000], color = 'blue',label = 'match') 289 | 290 | # plt.figure(figsize=(15, 5)) 291 | train_word_match = df.apply(normalized_word_share, axis=1, raw=True) 292 | plt.hist(train_word_match[df['flag'] == 0], bins=20, normed=True, label='flag 0') 293 | plt.hist(train_word_match[df['flag'] == 1], bins=20, normed=True, alpha=0.7, label='flag 1') 294 | plt.legend() 295 | plt.title('Label distribution over word_match_share', fontsize=15) 296 | plt.xlabel('word_match_share', fontsize=15) 297 | 298 | # train_qs = pd.Series(train['question'].tolist() + train['answer'].tolist()) 299 | # print train_qs 300 | plt.show('hold') 301 | def 
dns_sample(df,alphabet,q_len,a_len,sess,model,batch_size,neg_sample_num = 10): 302 | samples = [] 303 | count = 0 304 | pool_answers = df[df.flag == 1]['answer'].tolist() 305 | # pool_answers = df[df['flag'] == 0]['answer'].tolist() 306 | for question in df['question'].unique(): 307 | group = df[df['question'] == question] 308 | pos_answers = group[df["flag"]==1]["answer"].tolist() 309 | # pos_answers_exclude = list(set(pool_answers).difference(set(pos_answers))) 310 | neg_answers = group[df["flag"]==0]["answer"].tolist() 311 | question_indices = encode_to_split(question,alphabet,max_sentence = q_len) 312 | for pos in pos_answers: 313 | # negtive sample 314 | neg_pool = [] 315 | if len(neg_answers) > 0: 316 | # neg_exc = list(np.random.choice(pos_answers_exclude,size = 100 - len(neg_answers))) 317 | neg_answers_sample = neg_answers 318 | # neg_answers = neg_a 319 | # print 'neg_tive answer:{}'.format(len(neg_answers)) 320 | for neg in neg_answers_sample: 321 | neg_pool.append(encode_to_split(neg,alphabet,max_sentence = a_len)) 322 | input_x_1 = [question_indices] * len(neg_answers_sample) 323 | input_x_2 = [encode_to_split(pos,alphabet,max_sentence = a_len)] * len(neg_answers_sample) 324 | input_x_3 = neg_pool 325 | feed_dict = { 326 | model.question: input_x_1, 327 | model.answer: input_x_2, 328 | model.answer_negative:input_x_3 329 | } 330 | predicted = sess.run(model.score13,feed_dict) 331 | # find the max score 332 | index = np.argmax(predicted) 333 | # print len(neg_answers) 334 | # print 'index:{}'.format(index) 335 | # if len(neg_answers)>1: 336 | # print neg_answers[1] 337 | samples.append((question_indices,encode_to_split(pos,alphabet,max_sentence = a_len),input_x_3[index])) 338 | count += 1 339 | if count % 100 == 0: 340 | print ('samples load:{}'.format(count)) 341 | print ('samples finishted len samples:{}'.format(len(samples))) 342 | return samples 343 | @log_time_delta 344 | def batch_gen_with_pair_dns(samples,batch_size,epoches=1): 345 | # n_batches= int(math.ceil(df["flag"].sum()*1.0/batch_size)) 346 | n_batches = int(len(samples) * 1.0 / batch_size) 347 | for j in range(epoches): 348 | pairs = sklearn.utils.shuffle(samples,random_state =132) 349 | for i in range(0,n_batches): 350 | batch = pairs[i*batch_size:(i+1) * batch_size] 351 | yield [[pair[i] for pair in batch] for i in range(3)] 352 | 353 | def data_processing(): 354 | train,test,dev = load('nlpcc',filter = False) 355 | q_max_sent_length = max(map(lambda x:len(x),train['question'].str.split())) 356 | a_max_sent_length = max(map(lambda x:len(x),train['answer'].str.split())) 357 | q_len = map(lambda x:len(x),train['question'].str.split()) 358 | a_len = map(lambda x:len(x),train['answer'].str.split()) 359 | print('Total number of unique question:{}'.format(len(train['question'].unique()))) 360 | print('Total number of question pairs for training: {}'.format(len(train))) 361 | print('Total number of question pairs for test: {}'.format(len(test))) 362 | print('Total number of question pairs for dev: {}'.format(len(dev))) 363 | print('Duplicate pairs: {}%'.format(round(train['flag'].mean()*100, 2))) 364 | print(len(train['question'].unique())) 365 | 366 | #text analysis 367 | train_qs = pd.Series(train['answer'].tolist()) 368 | test_qs = pd.Series(test['answer'].tolist()) 369 | dev_qs = pd.Series(dev['answer'].tolist()) 370 | 371 | dist_train = train_qs.apply(lambda x:len(x.split(' '))) 372 | dist_test = test_qs.apply(lambda x:len(x.split(' '))) 373 | dist_dev = dev_qs.apply(lambda x:len(x.split(' '))) 374 | pal = 
sns.color_palette() 375 | plt.figure(figsize=(15, 10)) 376 | plt.hist(dist_train, bins = 200, range=[0, 200], color=pal[2], normed = True, label='train') 377 | plt.hist(dist_dev, bins = 200, range=[0, 200], color=pal[3], normed = True, alpha = 0.5, label='test1') 378 | plt.hist(dist_test, bins = 200, range=[0, 200], color=pal[1], normed = True, alpha = 0.5, label='test2') 379 | 380 | plt.title('Normalised histogram of tokens count in answers', fontsize = 15) 381 | plt.legend() 382 | plt.xlabel('Number of words', fontsize = 15) 383 | plt.ylabel('Probability', fontsize = 15) 384 | 385 | print('mean-train {:.2f} std-train {:.2f} mean-test {:.2f} std-test {:.2f} max-train {:.2f} max-test {:.2f}'.format(dist_train.mean(), 386 | dist_train.std(), dist_test.mean(), dist_test.std(), dist_train.max(), dist_test.max())) 387 | plt.show('hard') 388 | 389 | qmarks = np.mean(train_qs.apply(lambda x: '?' in x)) 390 | who = np.mean(train_qs.apply(lambda x:'Who' in x)) 391 | where = np.mean(train_qs.apply(lambda x:'Where' in x)) 392 | how_many = np.mean(train_qs.apply(lambda x:'How many' in x)) 393 | fullstop = np.mean(train_qs.apply(lambda x: '.' in x)) 394 | capital_first = np.mean(train_qs.apply(lambda x: x[0].isupper())) 395 | capitals = np.mean(train_qs.apply(lambda x: max([y.isupper() for y in x]))) 396 | numbers = np.mean(train_qs.apply(lambda x: max([y.isdigit() for y in x]))) 397 | print('Questions with question marks: {:.2f}%'.format(qmarks * 100)) 398 | print('Questions with [Who] tags: {:.2f}%'.format(who * 100)) 399 | print('Questions with [where] tags: {:.2f}%'.format(where * 100)) 400 | print('Questions with [How many] tags:{:.2f}%'.format(how_many * 100)) 401 | print('Questions with full stops: {:.2f}%'.format(fullstop * 100)) 402 | print('Questions with capitalised first letters: {:.2f}%'.format(capital_first * 100)) 403 | print('Questions with capital letters: {:.2f}%'.format(capitals * 100)) 404 | print('Questions with numbers: {:.2f}%'.format(numbers * 100)) -------------------------------------------------------------------------------- /models/__pycache__/basis_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/models/__pycache__/basis_model.cpython-37.pyc -------------------------------------------------------------------------------- /models/__pycache__/blocks.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/models/__pycache__/blocks.cpython-37.pyc -------------------------------------------------------------------------------- /models/__pycache__/cnn_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/models/__pycache__/cnn_model.cpython-37.pyc -------------------------------------------------------------------------------- /models/basis_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Description: this is the basis model 3 | @Author: zhansu 4 | @Date: 2019-07-02 20:58:41 5 | @LastEditTime: 2019-07-23 21:21:17 6 | @LastEditors: Please set LastEditors 7 | ''' 8 | # coding:utf-8 9 | 10 | import tensorflow as tf 11 | from tensorflow.python import debug as tf_debug 12 | import numpy as np 13 | from 
tensorflow.contrib import rnn 14 | import models.blocks as blocks 15 | import datetime 16 | from functools import reduce 17 | import abc 18 | import sys 19 | sys.path.append('../') 20 | # tf.set_random_set() 21 | 22 | 23 | class Model(object): 24 | 25 | def __init__(self, opt): 26 | """ 27 | initialize the model by the para 28 | pair_wise model 29 | :param self: 30 | :param opt: para of the model in the config 31 | """ 32 | for key, value in opt.items(): 33 | self.__setattr__(key, value) 34 | 35 | sess_config = tf.ConfigProto() 36 | sess_config.gpu_options.allow_growth = True 37 | self.sess = tf.Session(config=sess_config) 38 | 39 | self.build_graph() 40 | # summary 41 | self.merged = tf.summary.merge_all() 42 | self.train_writer = tf.summary.FileWriter(self.summaries_dir + '/train', 43 | self.sess.graph) 44 | self.test_writer = tf.summary.FileWriter(self.summaries_dir + '/test') 45 | self.saver = tf.train.Saver() 46 | self.sess.run(tf.global_variables_initializer()) 47 | 48 | # whether debug the code 49 | if self.debug: 50 | self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) 51 | 52 | def build_graph(self): 53 | """ 54 | build the graph of the model 55 | :param self: 56 | """ 57 | self.create_placeholder() 58 | self.add_embeddings() 59 | self.encode_sentence() 60 | self.create_loss() 61 | self.create_op() 62 | 63 | def create_placeholder(self): 64 | 65 | print(('Create placeholders')) 66 | # he length of the sentence is varied according to the batch,so the None,None 67 | self.question = tf.placeholder( 68 | tf.int32, [None, None], name='input_question') 69 | 70 | self.answer = tf.placeholder( 71 | tf.int32, [None, None], name='input_answer') 72 | self.answer_negative = tf.placeholder( 73 | tf.int32, [None, None], name='input_right') 74 | 75 | self.batch_size = tf.shape(self.question)[0] 76 | self.q_len, self.q_mask = blocks.length(self.question) 77 | self.a_len, self.a_mask = blocks.length(self.answer) 78 | self.a_neg_len, self.a_neg_mask = blocks.length(self.answer_negative) 79 | self.dropout_keep_prob_holder = tf.placeholder( 80 | tf.float32, name='dropout_keep_prob') 81 | 82 | def add_embeddings(self): 83 | print('add embeddings') 84 | 85 | self.embedding_w = tf.Variable(np.array(self.embeddings), name="embedding", 86 | dtype="float32", trainable=self.trainable) 87 | 88 | self.q_embedding = tf.nn.embedding_lookup( 89 | self.embedding_w, self.question, name="q_embedding") 90 | self.a_embedding = tf.nn.embedding_lookup( 91 | self.embedding_w, self.answer, name="a_embedding") 92 | self.a_neg_embedding = tf.nn.embedding_lookup( 93 | self.embedding_w, self.answer_negative, name="a_neg_embedding") 94 | 95 | def get_cosine(self, q, a, name): 96 | """ 97 | docstring here 98 | :param self: 99 | :param q: [batch, vector_size] 100 | :param a: [batch, vector_size] 101 | """ 102 | if self.dropout_keep_prob_holder != 1.0: 103 | 104 | pooled_flat_1 = tf.nn.dropout(q, self.dropout_keep_prob_holder) 105 | pooled_flat_2 = tf.nn.dropout(a, self.dropout_keep_prob_holder) 106 | 107 | cosine = tf.div( 108 | tf.reduce_sum(pooled_flat_1*pooled_flat_2, 1), 109 | tf.sqrt(tf.reduce_sum(pooled_flat_1*pooled_flat_1, 1)) * 110 | tf.sqrt(tf.reduce_sum(pooled_flat_2*pooled_flat_2, 1)) + 1e-8, 111 | name="cosine") 112 | 113 | return cosine 114 | 115 | # q_normalize = tf.nn.l2_normalize(pooled_flat_1, dim=1) 116 | # a_normalize = tf.nn.l2_normalize(pooled_flat_2, dim=1) 117 | else: 118 | # q_normalize = tf.nn.l2_normalize(q, dim=1) 119 | # a_normalize = tf.nn.l2_normalize(a, dim=1) 120 | 121 | cosine = tf.div( 
122 | tf.reduce_sum(q*a, 1), 123 | tf.sqrt(tf.reduce_sum(q*q, 1)) * 124 | tf.sqrt(tf.reduce_sum(a*a, 1)) + 1e-8, 125 | name="cosine") 126 | 127 | # score = tf.reduce_sum(tf.multiply(q_normalize, a_normalize), 1) 128 | 129 | return cosine 130 | 131 | def create_op(self): 132 | 133 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 134 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate) 135 | self.grads_and_vars = self.optimizer.compute_gradients(self.loss) 136 | self.train_op = self.optimizer.apply_gradients( 137 | self.grads_and_vars, global_step=self.global_step) 138 | 139 | def create_loss(self): 140 | """ 141 | calculate the loss, noting that we don't use the l2_regularizer 142 | :param self: 143 | """ 144 | with tf.name_scope('score'): 145 | self.score12 = self.get_cosine( 146 | self.encode_q_pos, self.encode_a_pos, name="pos_score") 147 | self.score13 = self.get_cosine( 148 | self.encode_q_neg, self.encode_a_neg, name="neg_score") 149 | 150 | with tf.name_scope("loss"): 151 | l2_loss = 0.0 152 | for para in tf.trainable_variables(): 153 | l2_loss += tf.nn.l2_loss(para) 154 | self.losses = tf.maximum(0.0, tf.subtract( 155 | 0.05, tf.subtract(self.score12, self.score13))) 156 | self.loss = tf.reduce_sum(self.losses) + self.l2_reg_lambda * l2_loss 157 | 158 | tf.summary.scalar('loss', self.loss) 159 | # Accuracy 160 | with tf.name_scope("accuracy"): 161 | self.correct = tf.equal(0.0, self.losses) 162 | self.accuracy = tf.reduce_mean( 163 | tf.cast(self.correct, "float"), name="accuracy") 164 | tf.summary.scalar('accuracy', self.accuracy) 165 | 166 | def train(self, data_batch, i): 167 | """ 168 | thain the model 169 | :param self: 170 | :param data_batch: train_dataset databatch 171 | """ 172 | for data in data_batch: 173 | question,pos_answer,neg_answer = zip(*data) 174 | feed_dict = { 175 | self.question: question, 176 | self.answer: pos_answer, 177 | self.answer_negative:neg_answer, 178 | self.dropout_keep_prob_holder: self.dropout_keep_prob 179 | } 180 | _, summary, step, loss, accuracy, score12, score13 = self.sess.run( 181 | [self.train_op, self.merged, self.global_step, self.loss, 182 | self.accuracy, self.score12, self.score13], 183 | feed_dict) 184 | self.train_writer.add_summary(summary, step) 185 | time_str = datetime.datetime.now().isoformat() 186 | print("{}: epoch:{},step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g},score{}".format( 187 | time_str, i, step, loss, accuracy, np.mean(score12), np.mean(score13), np.mean(score12))) 188 | 189 | def predict(self, data_batch): 190 | """ 191 | predict the test_dataset 192 | :param self: 193 | :param data_batch: test_dataset data_batch 194 | """ 195 | scores = [] 196 | for e, data in enumerate(data_batch): 197 | 198 | question,answer = zip(*data) 199 | feed_dict = { 200 | self.question: question, 201 | self.answer:answer, 202 | self.dropout_keep_prob_holder: 1.0 203 | } 204 | score = self.sess.run( 205 | self.score12, feed_dict) 206 | # self.test_writer.add_summary(summary, e) 207 | scores.extend(score) 208 | return scores 209 | 210 | def variable_summaries(self, var): 211 | with tf.name_scope('summaries'): 212 | mean = tf.reduce_mean(var) 213 | tf.summary.scalar('mean', mean) 214 | with tf.name_scope('stddev'): 215 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 216 | tf.summary.scalar('stddev', stddev) 217 | tf.summary.scalar('max', tf.reduce_max(var)) 218 | tf.summary.scalar('min', tf.reduce_min(var)) 219 | tf.summary.histogram('histogram', var) 220 | 221 | @abc.abstractmethod 222 
| def encode_sentence(self): 223 | """ 224 | the method is the implemented by the subclass 225 | :param self: 226 | """ 227 | 228 | @staticmethod 229 | def _model_stats(): 230 | """Print trainable variables and total model size.""" 231 | 232 | def size(v): 233 | return reduce(lambda x, y: x * y, v.get_shape().as_list()) 234 | print("Trainable variables") 235 | for v in tf.trainable_variables(): 236 | print(" %s, %s, %s, %s" % 237 | (v.name, v.device, str(v.get_shape()), size(v))) 238 | print("Total model size: %d" % (sum(size(v) 239 | for v in tf.trainable_variables()))) 240 | -------------------------------------------------------------------------------- /models/blocks.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Functions and components that can be slotted into tensorflow models. 4 | 5 | TODO: Write functions for various types of attention. 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | 11 | 12 | def length(sequence): 13 | """ 14 | Get true length of sequences (without padding), and mask for true-length in max-length. 15 | 16 | Input of shape: (batch_size, max_seq_length, hidden_dim) 17 | Output shapes, 18 | length: (batch_size) 19 | mask: (batch_size, max_seq_length, 1) 20 | """ 21 | populated = tf.sign(tf.abs(sequence)) 22 | length = tf.cast(tf.reduce_sum(populated, axis=1), tf.int32) 23 | mask = tf.cast(populated, tf.int32) 24 | return length, mask 25 | 26 | 27 | 28 | def biLSTM(inputs, dim, seq_len, name): 29 | """ 30 | A Bi-Directional LSTM layer. Returns forward and backward hidden states as a tuple, and cell states as a tuple. 31 | 32 | Ouput of hidden states: [(batch_size, max_seq_length, hidden_dim), (batch_size, max_seq_length, hidden_dim)] 33 | Same shape for cell states. 34 | """ 35 | with tf.name_scope(name): 36 | with tf.variable_scope('forward' + name): 37 | lstm_fwd = tf.contrib.rnn.LSTMCell(num_units=dim) 38 | with tf.variable_scope('backward' + name): 39 | lstm_bwd = tf.contrib.rnn.LSTMCell(num_units=dim) 40 | 41 | hidden_states, cell_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fwd, cell_bw=lstm_bwd, inputs=inputs, sequence_length=seq_len, dtype=tf.float32, scope=name) 42 | 43 | return hidden_states, cell_states 44 | 45 | 46 | def last_output(output, true_length): 47 | """ 48 | To get the last hidden layer form a dynamically unrolled RNN. 49 | Input of shape (batch_size, max_seq_length, hidden_dim). 50 | 51 | true_length: Tensor of shape (batch_size). Such a tensor is given by the length() function. 52 | Output of shape (batch_size, hidden_dim). 53 | """ 54 | max_length = int(output.get_shape()[1]) 55 | length_mask = tf.expand_dims(tf.one_hot(true_length-1, max_length, on_value=1., off_value=0.), -1) 56 | last_output = tf.reduce_sum(tf.multiply(output, length_mask), 1) 57 | return last_output 58 | 59 | 60 | def masked_softmax(scores, mask): 61 | """ 62 | Used to calculcate a softmax score with true sequence length (without padding), rather than max-sequence length. 63 | 64 | Input shape: (batch_size, max_seq_length, hidden_dim). 65 | mask parameter: Tensor of shape (batch_size, max_seq_length). Such a mask is given by the length() function. 
66 | """ 67 | numerator = tf.exp(tf.subtract(scores, tf.reduce_max(scores, 1, keep_dims=True))) * mask 68 | denominator = tf.reduce_sum(numerator, 1, keep_dims=True) 69 | weights = tf.div(numerator, denominator) 70 | return weights 71 | -------------------------------------------------------------------------------- /models/cnn_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Description: this is the attentive pooling network of the question answering 3 | @Author: zhansu 4 | @Date: 2019-07-10 21:50:33 5 | @LastEditTime: 2019-07-23 17:11:59 6 | @LastEditors: Please set LastEditors 7 | ''' 8 | 9 | import tensorflow as tf 10 | from models.basis_model import Model 11 | 12 | 13 | class Attentive_CNN(Model): 14 | 15 | def attentive_pooling(self, input_left, input_right): 16 | """ 17 | docstring here: attentive pooling network 18 | :param self: 19 | :param input_left: question [batch,q_len,vector_size(num_filters * num_of_window)] 20 | :param input_right: answer [batch,a_len,vector_size(num_filters * num_of_window)] 21 | """ 22 | 23 | self.q_len = tf.shape(input_left)[1] 24 | self.a_len = tf.shape(input_right)[1] 25 | self.batch_size = tf.shape(input_left)[0] 26 | Q = tf.reshape(input_left, [self.batch_size, self.q_len, 27 | self.vector_size], name='Q') 28 | A = tf.reshape( 29 | input_right, [self.batch_size, self.a_len, self.vector_size], name='A') 30 | 31 | # [-1,vector_size] * [vector_size,vector_size] noting that * is matrix multiple 32 | first = tf.matmul(tf.reshape(Q, [self.batch_size * self.q_len, self.vector_size]), self.U) 33 | # [-1,vector_size]->[batch,q_len,vector_size] 34 | second_step = tf.reshape(first, [self.batch_size, self.q_len, self.vector_size]) 35 | # [batch,q_len,vector_size]* [batch,vector,a_len]->[batch,q_len,a_len] 36 | 37 | A_transpose = tf.transpose(A, perm=[0, 2, 1]) 38 | result = tf.matmul(second_step, A_transpose) 39 | print(second_step.get_shape().as_list()) 40 | print(A_transpose.get_shape().as_list()) 41 | G = tf.tanh(result) 42 | 43 | # column-wise pooling ,row-wise pooling 44 | # [batch,q_len,a_len]->[batch,1,a_len] 45 | row_pooling = tf.reduce_max(G, axis=1, keepdims = True, name='row_pooling') 46 | # [batch,q_len,a_len]->[batch,q_len,1] 47 | col_pooling = tf.reduce_max(G, axis=2, keepdims = True, name='col_pooling') 48 | 49 | attention_q = tf.nn.softmax( 50 | col_pooling, 1, name='attention_q') # [batch,q_len,1] 51 | attention_a = tf.transpose(tf.nn.softmax( 52 | row_pooling, 2, name='attention_a'),perm = [0,2,1]) # [batch,a_len,1] 53 | 54 | R_q = tf.reduce_sum(tf.multiply(Q, attention_q), axis=1) 55 | R_a = tf.reduce_sum(tf.multiply(A, attention_a), axis=1) 56 | 57 | return R_q, R_a 58 | 59 | def wide_convolution(self, embedding): 60 | """ 61 | docstring here wide convolution of the model 62 | :param self: 63 | :param embedding: embedding representation of the sentence 64 | """ 65 | cnn_outputs = [] 66 | for i, filter_size in enumerate(self.filter_sizes): 67 | conv = tf.nn.conv2d( 68 | embedding, 69 | self.kernels[i][0], 70 | strides=[1, 1, self.embedding_size, 1], 71 | padding='SAME', 72 | name="conv-{}".format(i) 73 | ) 74 | h = tf.nn.relu(tf.nn.bias_add( 75 | conv, self.kernels[i][1]), name="relu-{}".format(i)) 76 | cnn_outputs.append(h) 77 | cnn_reshaped = tf.concat(cnn_outputs, 3) 78 | return cnn_reshaped 79 | 80 | def encode_sentence(self): 81 | """ 82 | encode the sentence with cnn model 83 | :param self: 84 | """ 85 | # pramaters of the attentive pooling 86 | self.vector_size = 
/propressing.py:
--------------------------------------------------------------------------------
1 | '''
2 | @Description: exploratory data analysis of the nlpcc data, to check the details of the dataset
3 | @Author: zhansu
4 | @Date: 2019-07-05 17:26:53
5 | @LastEditTime: 2019-07-23 15:52:32
6 | @LastEditors: Please set LastEditors
7 | '''
8 | import numpy as np
9 | import pandas as pd
10 | import os
11 | import matplotlib.pyplot as plt
12 | import seaborn as sns
13 | pal = sns.color_palette()
14 | print(os.getcwd())
15 | 
16 | df_train = pd.read_csv('data/nlpcc/train.txt', sep='\t',
17 |                        names=['question', 'answer', 'flag'], quoting=3)
18 | 
19 | print(df_train['flag'].head())
20 | 
21 | # basic statistics
22 | print(df_train.info())
23 | print(df_train.shape)
24 | df_train.groupby('flag')['question'].count().plot.bar()
25 | print("dataset size:{}".format(len(df_train)))
26 | print("positive sample rate:{}%".format(
27 |     round(df_train['flag'].mean() * 100, 2)))
28 | print('unique questions:{}'.format(len(df_train['question'].unique())))
29 | 
30 | # text length analysis
31 | df_test = pd.read_csv('data/nlpcc/test.txt', sep='\t',
32 |                       names=['question', 'answer', 'flag'], quoting=3)
33 | 
34 | train_qs = pd.Series(
35 |     df_train['question'].tolist() + df_train['answer'].tolist())
36 | test_qs = pd.Series(df_test['question'].tolist() + df_test['answer'].tolist())
37 | dist_train = train_qs.apply(lambda x: len(x.split(' ')))
38 | dist_test = test_qs.apply(lambda x: len(x.split(' ')))
39 | print('mean-train:{} std-train:{} max-train:{} mean-test:{} std-test:{} max-test:{}'.format(dist_train.mean(),
40 |       dist_train.std(),
41 |       dist_train.max(),
42 |       dist_test.mean(),
43 |       dist_test.std(),
44 |       dist_test.max()))
45 | 
46 | dist_train = train_qs.apply(len)
47 | dist_test = test_qs.apply(len)
48 | plt.figure(figsize=(15, 10))
49 | plt.hist(dist_train, bins=40, range=[0, 40],
50 |          color=pal[2], density=True, label='train')
51 | plt.hist(dist_test, bins=40, range=[
52 |          0, 40], color=pal[1], density=True, alpha=0.5, label='test')
53 | plt.title('Normalised histogram of character count in questions and answers', fontsize=15)
54 | plt.legend()
55 | plt.xlabel('Number of characters', fontsize=15)
56 | plt.ylabel('Probability', fontsize=15)
57 | plt.show()
58 | 
59 | # semantic analysis
60 | 
--------------------------------------------------------------------------------
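The length statistics above feed directly into how the model is padded (run.py pads every batch to the training-set maximum). A percentile-based cut-off is a common alternative when the maximum is dominated by a few very long answers; the snippet below is a hypothetical helper for illustration and is not part of this repository.

import pandas as pd

def percentile_length(series, pct=0.95):
    # token-count percentile of a Series of whitespace-tokenised sentences
    lengths = series.str.split().apply(len)
    return int(lengths.quantile(pct))

df = pd.read_csv('data/nlpcc/train.txt', sep='\t',
                 names=['question', 'answer', 'flag'], quoting=3)
print(df['question'].str.split().apply(len).describe())  # same statistics as above in one call
print('95th-percentile answer length:', percentile_length(df['answer']))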
/run.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Description: 3 | @Author: zhansu 4 | @Date: 2019-06-28 20:14:28 5 | @LastEditTime: 2019-07-23 21:00:37 6 | @LastEditors: Please set LastEditors 7 | ''' 8 | from tensorflow import flags 9 | import tensorflow as tf 10 | from config import args 11 | import helper 12 | import time 13 | import datetime 14 | import os 15 | from models.cnn_model import Attentive_CNN 16 | import numpy as np 17 | import evaluation 18 | import sys 19 | import logging 20 | import os 21 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' 22 | print(os.getcwd()) 23 | 24 | now = int(time.time()) 25 | timeArray = time.localtime(now) 26 | log_filename = "log/" + time.strftime("%Y%m%d", timeArray) 27 | if not os.path.exists(log_filename): 28 | os.makedirs(log_filename) 29 | 30 | program = os.path.basename('QA') 31 | logger = logging.getLogger(program) 32 | 33 | logging.basicConfig(format = '%(asctime)s: %(levelname)s: %(message)s', datefmt='%a, %d %b %Y %H:%M:%S', 34 | filename=log_filename+'/{}_qa.log'.format(time.strftime("%H%M", timeArray)), filemode='w') 35 | logging.root.setLevel(level=logging.INFO) 36 | logger.info("running %s" % ' '.join(sys.argv)) 37 | 38 | 39 | opts = args.flag_values_dict() 40 | for item in opts: 41 | logger.info('{} : {}'.format(item, opts[item])) 42 | 43 | logger.info('load data ...........') 44 | train, test, dev = helper.load_train_file( 45 | opts['data_dir'], filter=args.clean) 46 | 47 | q_max_sent_length = max(map(lambda x: len(x), train['question'].str.split())) 48 | a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split())) 49 | 50 | alphabet = helper.get_alphabet([train, test, dev]) 51 | logger.info('the number of words :%d ' % len(alphabet)) 52 | 53 | embedding = helper.get_embedding( 54 | alphabet, opts['embedding_file'], embedding_size=opts['embedding_size']) 55 | 56 | opts["embeddings"] = embedding 57 | opts["vocab_size"] = len(alphabet) 58 | opts["max_input_right"] = a_max_sent_length 59 | opts["max_input_left"] = q_max_sent_length 60 | opts["filter_sizes"] = list(map(int, args.filter_sizes.split(","))) 61 | 62 | with tf.Graph().as_default(): 63 | 64 | model = Attentive_CNN(opts) 65 | model._model_stats() 66 | for i in range(args.num_epoches): 67 | data_gen = helper.batch_iter(train, args.batch_size,alphabet,shuffle=True,q_len=q_max_sent_length,a_len=a_max_sent_length ) 68 | model.train(data_gen,i) 69 | 70 | test_datas = helper.batch_iter( 71 | test, args.batch_size,alphabet,q_len=q_max_sent_length,a_len=a_max_sent_length ) 72 | 73 | test['score'] = model.predict(test_datas) 74 | map_, mrr_= evaluation.evaluationBypandas(test, test['score'].to_list()) 75 | df_group = test.groupby('question').filter(evaluation.mrr_metric_filter) 76 | df_group[['question','answer','flag','score']].to_csv('badcase',sep = '\t',index = None) 77 | logger.info('map:{}--mrr:{}'.format(map_, mrr_)) 78 | print('map:{}--mrr:{}'.format(map_, mrr_)) 79 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import cPickle as pickle 3 | import numpy as np 4 | # a = tf.Variable(np.ones((3,33,10))) 5 | # b = tf.expand_dims(tf.Variable(np.arange(33) + 0.0),-1) 6 | # print b 7 | # c = tf.transpose(a,perm = [1,0]) * b 8 | # c = tf.multiply(a,b) 9 | # d = tf.ones([10,2]) 10 | a = [23.12,34.23,12.56] 11 | b = tf.nn.l2_normalize(a,0) 12 | c = tf.reduce_sum(b**2) 
13 | # initializer = (np.array(0), np.array(1)) 14 | # fibonaccis = tf.scan(lambda a, _: (a[1], a[0] + a[1]), elems) 15 | with tf.Session() as sess: 16 | 17 | sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())) 18 | # print sess.run(a) 19 | print sess.run(b) 20 | print sess.run(c) 21 | # print sess.run(d) 22 | 23 | import numpy as np 24 | import matplotlib.pyplot as plt 25 | # alpha = ['ABC', 'DEF', 'GHI', 'JKL'] 26 | d = pickle.load(open('attention.file')) 27 | print d[0][0] 28 | exit() 29 | # print len(d) 30 | data = d[0][0] 31 | print data 32 | # print d[0][0] 33 | fig = plt.figure() 34 | ax = fig.add_subplot(111) 35 | cax = ax.matshow(data, cmap = plt.cm.Blues) 36 | fig.colorbar(cax) 37 | 38 | # ax.set_xticklabels(['']+alpha) 39 | # ax.set_yticklabels(['']+alpha) 40 | 41 | plt.show() 42 | 43 | # a = [] 44 | 45 | # b = np.ones((10,10)) 46 | # c = np.random.rand(10,20) 47 | # print c[0] 48 | # for b1,c1 in zip(b,c): 49 | # a.extend((b1,c1)) 50 | 51 | # print a[1] 52 | # import pandas as pd 53 | # file = 'data/nlpcc/train.txt' 54 | # df = pd.read_csv(file,header = None,sep="\t",names=["question","answer","flag"],quoting =3).fillna('') 55 | # df['alen'] = df.apply(lambda x:len(x['answer'].split()),axis = 1) 56 | # print df[df['flag'] == 1]['alen']. 57 | # a = ('a','b') 58 | # print str(a) 59 | 60 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #! /usr/bin/env python3.4 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | # from data_helpers import get_overlap_dict,replace_number,sample_data,batch_gen_with_pair_overlap,batch_gen_with_pair_dns,dns_sample,load,prepare,batch_gen_with_pair,batch_gen_with_single,batch_gen_with_point_wise,getQAIndiceofTest,batch_gen_with_pair_whole 9 | from helper import get_overlap_dict,batch_gen_with_pair_overlap,load,prepare,batch_gen_with_single,dns_sample,batch_gen_with_pair_dns 10 | import operator 11 | from QA_CNN_pairwise import QA_CNN_extend 12 | from QA_CNN_quantum_pairwise import QA_CNN_quantum_extend 13 | from QA_RNN_pairwise import QA_RNN_extend 14 | import random 15 | import evaluation 16 | import cPickle as pickle 17 | import config 18 | from sklearn.model_selection import train_test_split 19 | import pynlpir 20 | pynlpir.open() 21 | 22 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 23 | 24 | now = int(time.time()) 25 | 26 | timeArray = time.localtime(now) 27 | timeStamp = time.strftime("%Y%m%d%H%M%S", timeArray) 28 | timeDay = time.strftime("%Y%m%d", timeArray) 29 | print (timeStamp) 30 | 31 | from functools import wraps 32 | #print( tf.__version__) 33 | def log_time_delta(func): 34 | @wraps(func) 35 | def _deco(*args, **kwargs): 36 | start = time.time() 37 | ret = func(*args, **kwargs) 38 | end = time.time() 39 | delta = end - start 40 | print( "%s runed %.2f seconds"% (func.__name__,delta)) 41 | return ret 42 | return _deco 43 | 44 | FLAGS = config.flags.FLAGS 45 | FLAGS._parse_flags() 46 | print("\nParameters:") 47 | for attr, value in sorted(FLAGS.__flags.items()): 48 | print(("{}={}".format(attr.upper(), value))) 49 | log_dir = 'log/'+ timeDay 50 | if not os.path.exists(log_dir): 51 | os.makedirs(log_dir) 52 | data_file = log_dir + '/test_' + FLAGS.data + timeStamp 53 | precision = data_file + 'precise' 54 | attention = [] 55 | @log_time_delta 56 | def predict(sess,cnn,test,alphabet,batch_size,q_len,a_len): 57 | 
scores = [] 58 | d = get_overlap_dict(test,alphabet,q_len,a_len) 59 | for data in batch_gen_with_single(test,alphabet,batch_size,q_len,a_len,overlap_dict = d): 60 | feed_dict = { 61 | cnn.question: data[0], 62 | cnn.answer: data[1], 63 | cnn.answer_negative:data[1], 64 | cnn.q_pos_overlap: data[2], 65 | cnn.q_neg_overlap:data[2], 66 | cnn.a_pos_overlap: data[3], 67 | cnn.a_neg_overlap:data[3], 68 | cnn.q_position:data[4], 69 | cnn.a_pos_position:data[5], 70 | cnn.a_neg_position:data[5] 71 | } 72 | 73 | score = sess.run(cnn.score12, feed_dict) 74 | # print len(score) 75 | # if batch_size == 20: 76 | # attention.extend((q,a)) 77 | scores.extend(score) 78 | pickle.dump(attention,open('attention.file','w')) 79 | return np.array(scores[:len(test)]) 80 | @log_time_delta 81 | def test_pair_wise(dns = FLAGS.dns): 82 | train,test,dev = load(FLAGS.data,filter = FLAGS.clean) 83 | # train = train[:10000] 84 | # test = test[:10000] 85 | # dev = dev[:10000] 86 | # submit = submit[:1000] 87 | q_max_sent_length = max(map(lambda x:len(x),train['question'].str.split())) 88 | a_max_sent_length = max(map(lambda x:len(x),train['answer'].str.split())) 89 | print 'q_question_length:{} a_question_length:{}'.format(q_max_sent_length,a_max_sent_length) 90 | print 'train question unique:{}'.format(len(train['question'].unique())) 91 | print 'train length',len(train) 92 | print 'test length', len(test) 93 | print 'dev length', len(dev) 94 | alphabet,embeddings = prepare([train,test,dev],dim = FLAGS.embedding_dim,is_embedding_needed = True,fresh = FLAGS.fresh) 95 | # alphabet,embeddings = prepare_300([train,test,dev]) 96 | print 'alphabet:',len(alphabet) 97 | with tf.Graph().as_default(), tf.device("/gpu:" + str(FLAGS.gpu)): 98 | # with tf.device("/cpu:0"): 99 | session_conf = tf.ConfigProto() 100 | session_conf.allow_soft_placement = FLAGS.allow_soft_placement 101 | session_conf.log_device_placement = FLAGS.log_device_placement 102 | session_conf.gpu_options.allow_growth = True 103 | sess = tf.Session(config=session_conf) 104 | with sess.as_default(),open(precision,"w") as log: 105 | log.write(str(FLAGS.__flags) + '\n') 106 | folder = 'runs/' + timeDay + '/' + timeStamp + '/' 107 | out_dir = folder + FLAGS.data 108 | if not os.path.exists(folder): 109 | os.makedirs(folder) 110 | # train,test,dev = load("trec",filter=True) 111 | # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True) 112 | print "start build model" 113 | cnn = QA_RNN_extend( 114 | max_input_left = q_max_sent_length, 115 | max_input_right = a_max_sent_length, 116 | batch_size = FLAGS.batch_size, 117 | vocab_size = len(alphabet), 118 | embedding_size = FLAGS.embedding_dim, 119 | filter_sizes = list(map(int, FLAGS.filter_sizes.split(","))), 120 | num_filters = FLAGS.num_filters, 121 | dropout_keep_prob = FLAGS.dropout_keep_prob, 122 | embeddings = embeddings, 123 | l2_reg_lambda = FLAGS.l2_reg_lambda, 124 | overlap_needed = FLAGS.overlap_needed, 125 | learning_rate=FLAGS.learning_rate, 126 | trainable = FLAGS.trainable, 127 | extend_feature_dim = FLAGS.extend_feature_dim, 128 | pooling = FLAGS.pooling, 129 | position_needed = FLAGS.position_needed, 130 | conv = FLAGS.conv) 131 | cnn.build_graph() 132 | 133 | saver = tf.train.Saver(tf.global_variables(), max_to_keep = 20) 134 | train_writer = tf.summary.FileWriter(log_dir + '/train', sess.graph) 135 | test_writer = tf.summary.FileWriter(log_dir + '/test') 136 | # Initialize all variables 137 | print "build over" 138 | sess.run(tf.global_variables_initializer()) 139 | print 
"variables_initializer" 140 | 141 | map_max = 0.65 142 | for i in range(FLAGS.num_epochs): 143 | if FLAGS.dns == True: 144 | samples = dns_sample(train,alphabet,q_max_sent_length, 145 | a_max_sent_length,sess,cnn,FLAGS.batch_size,neg_sample_num = 10) 146 | datas = batch_gen_with_pair_dns(samples,FLAGS.batch_size) 147 | print 'load dns datas' 148 | for data in datas: 149 | feed_dict = { 150 | cnn.question:data[0], 151 | cnn.answer:data[1], 152 | cnn.answer_negative:data[2] 153 | } 154 | _, step,loss, accuracy,score12,score13 = sess.run( 155 | [cnn.train_op, cnn.global_step,cnn.loss, cnn.accuracy,cnn.score12,cnn.score13], 156 | feed_dict) 157 | time_str = datetime.datetime.now().isoformat() 158 | print("{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}".format(time_str, step, loss, accuracy,np.mean(score12),np.mean(score13))) 159 | line = "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}".format(time_str, step, loss, accuracy,np.mean(score12),np.mean(score13)) 160 | else: 161 | d = get_overlap_dict(train,alphabet,q_len = q_max_sent_length,a_len = a_max_sent_length) 162 | datas = batch_gen_with_pair_overlap(train,alphabet,FLAGS.batch_size, 163 | q_len = q_max_sent_length,a_len = a_max_sent_length,fresh = FLAGS.fresh,overlap_dict = d) 164 | print "load data" 165 | for data in datas: 166 | feed_dict = { 167 | cnn.question: data[0], 168 | cnn.answer: data[1], 169 | cnn.answer_negative:data[2], 170 | cnn.q_pos_overlap:data[3], 171 | cnn.q_neg_overlap:data[4], 172 | cnn.a_pos_overlap:data[5], 173 | cnn.a_neg_overlap:data[6], 174 | cnn.q_position:data[7], 175 | cnn.a_pos_position:data[8], 176 | cnn.a_neg_position:data[9] 177 | } 178 | _, summary,step,loss, accuracy,score12,score13 = sess.run( 179 | [cnn.train_op, cnn.merged,cnn.global_step,cnn.loss, cnn.accuracy,cnn.score12,cnn.score13], 180 | feed_dict) 181 | train_writer.add_summary(summary, i) 182 | time_str = datetime.datetime.now().isoformat() 183 | print("{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}".format(time_str, step, loss, accuracy,np.mean(score12),np.mean(score13))) 184 | line = "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}".format(time_str, step, loss, accuracy,np.mean(score12),np.mean(score13)) 185 | # print loss 186 | if i % 1 == 0: 187 | predicted_dev = predict(sess,cnn,dev,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 188 | map_mrr_dev = evaluation.evaluationBypandas(dev,predicted_dev) 189 | predicted_test = predict(sess,cnn,test,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 190 | map_mrr_test = evaluation.evaluationBypandas(test,predicted_test) 191 | 192 | print "{}:epoch:dev map mrr {}".format(i,map_mrr_dev) 193 | print "{}:epoch:test map mrr {}".format(i,map_mrr_test) 194 | line = " {}:epoch: map_dev{}-------map_mrr_test{}".format(i,map_mrr_dev[0],map_mrr_test) 195 | if map_mrr_dev[0] > map_max: 196 | map_max = map_mrr_dev[0] 197 | # timeStamp = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time()))) 198 | 199 | save_path = saver.save(sess, out_dir) 200 | print "Model saved in file: ", save_path 201 | 202 | log.write(line + '\n') 203 | log.flush() 204 | print 'train over' 205 | saver.restore(sess, out_dir) 206 | predicted = predict(sess,cnn,train,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 207 | train['predicted'] = predicted 208 | train['predicted'].to_csv('train.QApair.TJU_IR_QA2017_train.score',index = False,sep = '\t') 209 | map_mrr_train = evaluation.evaluationBypandas(train,predicted) 210 | 211 | 
predicted_dev = predict(sess,cnn,dev,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 212 | dev['predicted'] = predicted_dev 213 | dev['predicted'].to_csv('train.QApair.TJU_IR_QA2017_dev.score',index = False,sep = '\t') 214 | map_mrr_dev = evaluation.evaluationBypandas(dev,predicted_dev) 215 | 216 | predicted_test = predict(sess,cnn,test,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 217 | 218 | test['predicted'] = predicted_test 219 | test['predicted'].to_csv('train.QApair.TJU_IR_QA2017.score',index = False,sep = '\t') 220 | map_mrr_test = evaluation.evaluationBypandas(test,predicted_test) 221 | 222 | print 'map_mrr train',map_mrr_train 223 | print 'map_mrr dev',map_mrr_dev 224 | print 'map_mrr test',map_mrr_test 225 | log.write(str(map_mrr_train) + '\n') 226 | log.write(str(map_mrr_test) + '\n') 227 | log.write(str(map_mrr_dev) + '\n') 228 | predict(sess,cnn,train[:100],alphabet,20,q_max_sent_length,a_max_sent_length) 229 | 230 | 231 | 232 | 233 | if __name__ == '__main__': 234 | # test_quora() 235 | # predicted_pair() 236 | test_pair_wise() 237 | # test_point_wise() 238 | --------------------------------------------------------------------------------
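Both run.py and train.py report MAP and MRR through evaluation.evaluationBypandas, whose implementation is not shown in this listing. The snippet below is a minimal sketch of how these metrics are typically computed over per-question groups using the same question/flag/score columns; the actual evaluation.py may differ in its details, so treat this as an illustration rather than the repository's code.

import pandas as pd

def average_precision(group):
    # AP for one question: answers ranked by predicted score, 'flag' marks the correct ones
    ranked = group.sort_values('score', ascending=False)['flag'].values
    hits, precisions = 0, []
    for rank, flag in enumerate(ranked, start=1):
        if flag == 1:
            hits += 1
            precisions.append(hits / rank)
    return sum(precisions) / max(hits, 1)

def reciprocal_rank(group):
    ranked = group.sort_values('score', ascending=False)['flag'].values
    for rank, flag in enumerate(ranked, start=1):
        if flag == 1:
            return 1.0 / rank
    return 0.0

df = pd.DataFrame({'question': ['q1'] * 3 + ['q2'] * 3,
                   'flag':     [0, 1, 0, 1, 0, 0],
                   'score':    [0.2, 0.9, 0.4, 0.3, 0.8, 0.1]})
print('MAP:', df.groupby('question').apply(average_precision).mean())   # 0.75
print('MRR:', df.groupby('question').apply(reciprocal_rank).mean())     # 0.75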