├── QA_CNN_pairwise.py ├── README.md ├── __pycache__ ├── config.cpython-37.pyc ├── evaluation.cpython-37.pyc └── helper.cpython-37.pyc ├── badcase ├── config.py ├── data └── nlpcc │ ├── dev.txt │ ├── test.txt │ └── train.txt ├── evaluation.py ├── helper.py ├── models ├── __pycache__ │ ├── basis_model.cpython-37.pyc │ ├── blocks.cpython-37.pyc │ └── cnn_model.cpython-37.pyc ├── basis_model.py ├── blocks.py └── cnn_model.py ├── propressing.py ├── run.py ├── test.py └── train.py /QA_CNN_pairwise.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | # model_type :apn or qacnn 6 | class QA_CNN_extend(object): 7 | def __init__(self,max_input_left,max_input_right,batch_size,vocab_size,embedding_size,filter_sizes,num_filters, 8 | dropout_keep_prob = 1,learning_rate = 0.001,embeddings = None,l2_reg_lambda = 0.0,overlap_needed = False,trainable = True,extend_feature_dim = 10,pooling = 'attentive',position_needed = True,conv = 'narrow'): 9 | 10 | self.dropout_keep_prob = dropout_keep_prob 11 | self.num_filters = num_filters 12 | self.embeddings = embeddings 13 | self.embedding_size = embedding_size 14 | self.batch_size = batch_size 15 | self.filter_sizes = filter_sizes 16 | self.l2_reg_lambda = l2_reg_lambda 17 | self.para = [] 18 | self.extend_feature_dim = extend_feature_dim 19 | self.max_input_left = max_input_left 20 | self.max_input_right = max_input_right 21 | self.overlap_needed = overlap_needed 22 | self.num_filters_total = self.num_filters * len(self.filter_sizes) 23 | self.trainable = trainable 24 | self.vocab_size = vocab_size 25 | self.pooling = pooling 26 | self.position_needed = position_needed 27 | self.conv = conv 28 | if self.overlap_needed: 29 | self.total_embedding_dim = embedding_size + extend_feature_dim 30 | else: 31 | self.total_embedding_dim = embedding_size 32 | #position embedding needed 33 | if self.position_needed: 34 | self.total_embedding_dim = self.total_embedding_dim + extend_feature_dim 35 | self.learning_rate = learning_rate 36 | def create_placeholder(self): 37 | print('Create placeholders') 38 | self.question = tf.placeholder(tf.int32,[None,self.max_input_left],name = 'input_question') 39 | self.answer = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'input_answer') 40 | self.answer_negative = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'input_right') 41 | self.q_pos_overlap = tf.placeholder(tf.int32,[None,self.max_input_left],name = 'q_pos_feature_embed') 42 | self.q_neg_overlap = tf.placeholder(tf.int32,[None,self.max_input_left],name = 'q_neg_feature_embed') 43 | self.a_pos_overlap = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'a_feature_embed') 44 | self.a_neg_overlap = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'a_neg_feature_embed') 45 | self.q_position = tf.placeholder(tf.int32,[None,self.max_input_left],name = 'q_position_embed') 46 | self.a_pos_position = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'a_position_embed') 47 | self.a_neg_position = tf.placeholder(tf.int32,[None,self.max_input_right],name = 'a_neg_postion_embed') 48 | def create_position(self): 49 | print 'add conv position' 50 | self.q_conv_position = tf.Variable(tf.ones([self.max_input_left,1]),name = 'q_conv_position') 51 | self.a_conv_position = tf.Variable(tf.ones([self.max_input_right,1]),name = 'a_conv_position') 52 | def add_embeddings(self): 53 | print 'add embeddings' 54 | if self.embeddings is not None: 55 | print "load 
embedding" 56 | W = tf.Variable(np.array(self.embeddings),name = "W" ,dtype="float32",trainable = self.trainable) 57 | 58 | else: 59 | print "random embedding" 60 | W = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),name="W",trainable = self.trainable) 61 | self.embedding_W = W 62 | self.overlap_W = tf.Variable(tf.random_uniform([3, self.extend_feature_dim], -1.0, 1.0),name="W",trainable = True) 63 | # we suppose the max length of sentence is 300 64 | self.position_W = tf.Variable(tf.random_uniform([300,self.extend_feature_dim], -1.0, 1.0),name = 'W',trainable = True) 65 | # self.overlap_W = tf.Variable(a,name="W",trainable = True) 66 | self.para.append(self.embedding_W) 67 | self.para.append(self.overlap_W) 68 | self.para.append(self.position_W) 69 | #get embedding 70 | self.q_pos_embedding = self.concat_embedding(self.question,self.q_pos_overlap,self.q_position,self.q_conv_position) 71 | print self.q_pos_embedding 72 | self.q_neg_embedding = self.concat_embedding(self.question,self.q_neg_overlap,self.q_position,self.q_conv_position) 73 | self.a_pos_embedding = self.concat_embedding(self.answer, self.a_pos_overlap,self.a_pos_position,self.a_conv_position) 74 | self.a_neg_embedding = self.concat_embedding(self.answer_negative,self.a_neg_overlap,self.a_neg_position,self.a_conv_position) 75 | def convolution(self): 76 | print 'convolution:wide_convolution' 77 | self.kernels = [] 78 | for i,filter_size in enumerate(self.filter_sizes): 79 | with tf.name_scope('conv-max-pool-%s' % filter_size): 80 | filter_shape = [filter_size,self.total_embedding_dim,1,self.num_filters] 81 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev = 0.1), name="W") 82 | b = tf.Variable(tf.constant(0.0, shape=[self.num_filters]), name="b") 83 | self.kernels.append((W,b)) 84 | self.para.append(W) 85 | self.para.append(b) 86 | #convolution 87 | embeddings = [self.q_pos_embedding,self.q_neg_embedding,self.a_pos_embedding,self.a_neg_embedding] 88 | self.q_pos_feature_map,self.q_neg_feature_map,self.a_pos_feature_map,self.a_neg_feature_map = \ 89 | [self.wide_convolution(embedding) for embedding in embeddings] 90 | def pooling_graph(self): 91 | print 'pooling: max pooling or attentive pooling' 92 | #pooling strategy 93 | if self.pooling == 'max': 94 | print self.pooling 95 | self.q_pos_pooling = tf.reshape(self.max_pooling(self.q_pos_feature_map,self.max_input_left),[-1,self.num_filters_total]) 96 | self.q_neg_pooling = tf.reshape(self.max_pooling(self.q_neg_feature_map,self.max_input_left),[-1,self.num_filters_total]) 97 | self.a_pos_pooling = tf.reshape(self.max_pooling(self.a_pos_feature_map,self.max_input_right),[-1,self.num_filters_total]) 98 | self.a_neg_pooling = tf.reshape(self.max_pooling(self.a_neg_feature_map,self.max_input_right),[-1,self.num_filters_total]) 99 | 100 | elif self.pooling == 'attentive': 101 | print self.pooling 102 | with tf.name_scope('attention'): 103 | self.U = tf.Variable(tf.truncated_normal(shape = [self.num_filters_total,self.num_filters_total],stddev = 0.01,name = 'U')) 104 | self.para.append(self.U) 105 | self.q_pos_pooling,self.a_pos_pooling = self.attentive_pooling(self.q_pos_feature_map,self.a_pos_feature_map) 106 | self.q_neg_pooling,self.a_neg_pooling = self.attentive_pooling(self.q_neg_feature_map,self.a_neg_feature_map) 107 | # print self.q_pos_pooling 108 | else: 109 | print 'no implement' 110 | exit(0) 111 | def create_loss(self): 112 | 113 | with tf.name_scope('score'): 114 | self.score12 = 
self.getCosine(self.q_pos_pooling,self.a_pos_pooling) 115 | self.score13 = self.getCosine(self.q_neg_pooling,self.a_neg_pooling) 116 | l2_loss = tf.constant(0.0) 117 | for p in self.para: 118 | l2_loss += tf.nn.l2_loss(p) 119 | with tf.name_scope("loss"): 120 | self.losses = tf.maximum(0.0, tf.subtract(0.05, tf.subtract(self.score12, self.score13))) 121 | self.loss = tf.reduce_sum(self.losses) + self.l2_reg_lambda * l2_loss 122 | tf.summary.scalar('loss', self.loss) 123 | # Accuracy 124 | with tf.name_scope("accuracy"): 125 | self.correct = tf.equal(0.0, self.losses) 126 | self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy") 127 | tf.summary.scalar('accuracy', self.accuracy) 128 | def create_op(self): 129 | self.global_step = tf.Variable(0, name="global_step", trainable = False) 130 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate) 131 | self.grads_and_vars = self.optimizer.compute_gradients(self.loss) 132 | self.train_op = self.optimizer.apply_gradients(self.grads_and_vars, global_step = self.global_step) 133 | 134 | def concat_embedding(self,words_indice,overlap_indice,position_indice,conv_position): 135 | embedded_chars_q = tf.nn.embedding_lookup(self.embedding_W,words_indice) 136 | position_embedding = tf.nn.embedding_lookup(self.position_W,position_indice) 137 | overlap_embedding_q = tf.nn.embedding_lookup(self.overlap_W,overlap_indice) 138 | if not self.overlap_needed : 139 | if not self.position_needed: 140 | all_embedding = embedded_chars_q 141 | # return tf.expand_dims(embedded_chars_q,-1) 142 | else: 143 | all_embedding = tf.concat([embedded_chars_q,position_embedding],2) 144 | # return tf.expand_dims(tf.concat([embedded_chars_q,position_embedding],2),-1) 145 | else: 146 | if not self.position_needed: 147 | all_embedding = tf.concat([embedded_chars_q,overlap_embedding_q],2) 148 | # return tf.expand_dims(tf.concat([embedded_chars_q,overlap_embedding_q],2),-1) 149 | else: 150 | all_embedding = tf.concat([embedded_chars_q,overlap_embedding_q,position_embedding],2) 151 | # return tf.expand_dims(tf.concat([embedded_chars_q,overlap_embedding_q,position_embedding],2),-1) 152 | # all_embedding = tf.multiply(all_embedding,conv_position) 153 | return tf.expand_dims(all_embedding,-1) 154 | 155 | def max_pooling(self,conv,input_length): 156 | pooled = tf.nn.max_pool( 157 | conv, 158 | ksize = [1, input_length, 1, 1], 159 | strides = [1, 1, 1, 1], 160 | padding = 'VALID', 161 | name="pool") 162 | return pooled 163 | def getCosine(self,q,a): 164 | pooled_flat_1 = tf.nn.dropout(q, self.dropout_keep_prob) 165 | pooled_flat_2 = tf.nn.dropout(a, self.dropout_keep_prob) 166 | 167 | pooled_len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_1), 1)) 168 | pooled_len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_2, pooled_flat_2), 1)) 169 | pooled_mul_12 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_2), 1) 170 | score = tf.div(pooled_mul_12, tf.multiply(pooled_len_1, pooled_len_2), name="scores") 171 | return score 172 | 173 | def attentive_pooling(self,input_left,input_right): 174 | Q = tf.reshape(input_left,[-1,self.max_input_left,len(self.filter_sizes) * self.num_filters],name = 'Q') 175 | A = tf.reshape(input_right,[-1,self.max_input_right,len(self.filter_sizes) * self.num_filters],name = 'A') 176 | # G = tf.tanh(tf.matmul(tf.matmul(Q,self.U),\ 177 | # A,transpose_b = True),name = 'G') 178 | 179 | first = tf.matmul(tf.reshape(Q,[-1,len(self.filter_sizes) * self.num_filters]),self.U) 180 | print 
tf.reshape(Q,[-1,len(self.filter_sizes) * self.num_filters]) 181 | print self.U 182 | second_step = tf.reshape(first,[-1,self.max_input_left,len(self.filter_sizes) * self.num_filters]) 183 | result = tf.matmul(second_step,tf.transpose(A,perm = [0,2,1])) 184 | # print 'result',result 185 | G = tf.tanh(result) 186 | 187 | # G = result 188 | # column-wise pooling ,row-wise pooling 189 | row_pooling = tf.reduce_max(G,1,True,name = 'row_pooling') 190 | col_pooling = tf.reduce_max(G,2,True,name = 'col_pooling') 191 | 192 | self.attention_q = tf.nn.softmax(col_pooling,1,name = 'attention_q') 193 | print self.attention_q 194 | self.see = self.attention_q 195 | 196 | self.attention_a = tf.nn.softmax(row_pooling,name = 'attention_a') 197 | R_q = tf.reshape(tf.matmul(Q,self.attention_q,transpose_a = 1),[-1,self.num_filters * len(self.filter_sizes)],name = 'R_q') 198 | R_a = tf.reshape(tf.matmul(self.attention_a,A),[-1,self.num_filters * len(self.filter_sizes)],name = 'R_a') 199 | 200 | return R_q,R_a 201 | 202 | def wide_convolution(self,embedding): 203 | cnn_outputs = [] 204 | for i,filter_size in enumerate(self.filter_sizes): 205 | conv = tf.nn.conv2d( 206 | embedding, 207 | self.kernels[i][0], 208 | strides=[1, 1, self.total_embedding_dim, 1], 209 | padding='SAME', 210 | name="conv-1" 211 | ) 212 | h = tf.nn.relu(tf.nn.bias_add(conv, self.kernels[i][1]), name="relu-1") 213 | cnn_outputs.append(h) 214 | cnn_reshaped = tf.concat(cnn_outputs,3) 215 | return cnn_reshaped 216 | def narrow_convolution_pooling(self): 217 | print 'narrow pooling' 218 | self.kernels = [] 219 | for i,filter_size in enumerate(self.filter_sizes): 220 | with tf.name_scope('conv-max-pool-%s' % filter_size): 221 | filter_shape = [filter_size,self.total_embedding_dim,1,self.num_filters] 222 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev = 0.1), name="W") 223 | b = tf.Variable(tf.constant(0.0, shape=[self.num_filters]), name="b") 224 | self.kernels.append((W,b)) 225 | self.para.append(W) 226 | self.para.append(b) 227 | embeddings = [self.q_pos_embedding,self.q_neg_embedding,self.a_pos_embedding,self.a_neg_embedding] 228 | self.q_pos_pooling,self.q_neg_pooling,self.a_pos_pooling,self.a_neg_pooling = [self.getFeatureMap(embedding,right = i / 2) for i,embedding in enumerate(embeddings) ] 229 | def getFeatureMap(self,embedding,right=True): 230 | if right == 1: 231 | max_length = self.max_input_right 232 | else: 233 | max_length = self.max_input_left 234 | pooled_outputs = [] 235 | for i,filter_size in enumerate(self.filter_sizes): 236 | conv = tf.nn.conv2d( 237 | embedding, 238 | self.kernels[i][0], 239 | strides=[1, 1, 1, 1], 240 | padding='VALID', 241 | name="conv-1" 242 | ) 243 | h = tf.nn.relu(tf.nn.bias_add(conv, self.kernels[i][1]), name="relu-1") 244 | 245 | pooled = tf.nn.max_pool( 246 | h, 247 | ksize=[1, max_length - filter_size + 1, 1, 1], 248 | strides=[1, 1, 1, 1], 249 | padding='VALID', 250 | name="poll-1" 251 | ) 252 | pooled_outputs.append(pooled) 253 | pooled_reshape = tf.reshape(tf.concat(pooled_outputs,3), [-1, self.num_filters_total]) 254 | return pooled_reshape 255 | def variable_summaries(self,var): 256 | with tf.name_scope('summaries'): 257 | mean = tf.reduce_mean(var) 258 | tf.summary.scalar('mean', mean) 259 | with tf.name_scope('stddev'): 260 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 261 | tf.summary.scalar('stddev', stddev) 262 | tf.summary.scalar('max', tf.reduce_max(var)) 263 | tf.summary.scalar('min', tf.reduce_min(var)) 264 | tf.summary.histogram('histogram', var) 265 | 266 | 
def build_graph(self): 267 | self.create_placeholder() 268 | self.create_position() 269 | self.add_embeddings() 270 | if self.conv == 'narrow': 271 | self.narrow_convolution_pooling() 272 | else: 273 | self.convolution() 274 | self.pooling_graph() 275 | self.create_loss() 276 | self.create_op() 277 | self.merged = tf.summary.merge_all() 278 | 279 | 280 | if __name__ == '__main__': 281 | cnn = QA_CNN_extend(max_input_left = 33, 282 | max_input_right = 40, 283 | batch_size = 3, 284 | vocab_size = 5000, 285 | embedding_size = 100, 286 | filter_sizes = [3,4,5], 287 | num_filters = 64, 288 | dropout_keep_prob = 1.0, 289 | embeddings = None, 290 | l2_reg_lambda = 0.0, 291 | overlap_needed = False, 292 | trainable = True, 293 | extend_feature_dim = 10, 294 | position_needed = False, 295 | pooling = 'attentive', 296 | conv = 'wide') 297 | cnn.build_graph() 298 | input_x_1 = np.reshape(np.arange(3 * 33),[3,33]) 299 | input_x_2 = np.reshape(np.arange(3 * 40),[3,40]) 300 | input_x_3 = np.reshape(np.arange(3 * 40),[3,40]) 301 | 302 | q_pos_embedding = np.ones((3,33)) 303 | q_neg_embedding = np.ones((3,33)) 304 | a_pos_embedding = np.ones((3,40)) 305 | a_neg_embedding = np.ones((3,40)) 306 | 307 | q_position = np.ones((3,33)) 308 | a_pos_position = np.ones((3,40)) 309 | a_neg_position = np.ones((3,40)) 310 | 311 | with tf.Session() as sess: 312 | sess.run(tf.global_variables_initializer()) 313 | feed_dict = { 314 | cnn.question:input_x_1, 315 | cnn.answer:input_x_2, 316 | cnn.answer_negative:input_x_3, 317 | # cnn.q_pos_overlap:q_pos_embedding, 318 | # cnn.q_neg_overlap:q_neg_embedding, 319 | # cnn.a_pos_overlap:a_pos_embedding, 320 | # cnn.a_neg_overlap:a_neg_embedding, 321 | # cnn.q_position:q_position, 322 | # cnn.a_pos_position:a_pos_position, 323 | # cnn.a_neg_position:a_neg_position 324 | } 325 | question,answer,score = sess.run([cnn.question,cnn.answer,cnn.score12],feed_dict) 326 | print question.shape,answer.shape 327 | print score 328 | 329 | 330 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is a tensorflow implementation of NLPCC2017 DBQA task. Our result ranks 5th amoung the 21 submission. 2 | 3 | [Enhanced Embedding based Attentive Pooling Network for Answer Selection](http://tcci.ccf.org.cn/conference/2017/) 4 | 5 | We utilize chinese wiki corpus to train our embedding. You can train embedding by youself or contact us to get what we use. 
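The embedding file is read in plain word2vec text format: an optional `vocab_size dim` header line followed by one `word v1 ... vN` line per word (see `helper.get_embedding`). If you want to train the vectors yourself, a minimal sketch using jieba for segmentation and gensim (4.x) for word2vec is shown below; this script is not part of the repository, and the corpus paths are placeholders for your own Chinese Wikipedia dump.

```python
# Sketch only: train 300-d Chinese word vectors from a pre-extracted wiki text file.
# 'zhwiki.txt', 'zhwiki.seg.txt' and 'zh_wiki_300d.txt' are placeholder paths.
import jieba
from gensim.models import Word2Vec

# one article per line in -> space-separated tokens per line out (LineSentence format)
with open('zhwiki.txt', encoding='utf-8') as f_in, \
        open('zhwiki.seg.txt', 'w', encoding='utf-8') as f_out:
    for line in f_in:
        f_out.write(' '.join(jieba.cut(line.strip())) + '\n')

model = Word2Vec(corpus_file='zhwiki.seg.txt', vector_size=300,
                 window=5, min_count=5, workers=4)
# writes a "vocab_size dim" header plus one "word v1 ... vN" line per word,
# which is the format helper.get_embedding expects
model.wv.save_word2vec_format('zh_wiki_300d.txt', binary=False)
```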
6 | 7 | ## Requirements 8 | 9 | - python3 10 | 11 | - Tensorflow = 1.12 12 | 13 | ## Training 14 | 15 | 16 | ``` 17 | ./train.py --overlap_needed True --position_needed True 18 | ``` 19 | 20 | ## 21 | 22 | 23 | 24 | | method | pooling | map(test1) | map(test2) 25 | | :--- | :----: | ----: |:----:| 26 | | CNN-base | max | 0.782 | 0.657 27 | | CNN-base | attentive | 0.772 | 0.646 28 | | +overlap | max | 0.828 | 0.674 29 | | +overlap | attentive | 0.811 | 0.672| 30 | | +position,overlap | attentive | 0.819 | 0.675 31 | | +position,overlap | max | 0.834 | 0.679 32 | 33 | 34 | -------------------------------------------------------------------------------- /__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/evaluation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/__pycache__/evaluation.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/helper.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/__pycache__/helper.cpython-37.pyc -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | # Model Hyperparameters 3 | # flags.DEFINE_integer("embedding_dim",300, "Dimensionality of character embedding (default: 128)") 4 | # flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')") 5 | # flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 6 | # flags.DEFINE_float("dropout_keep_prob", 1, "Dropout keep probability (default: 0.5)") 7 | # flags.DEFINE_float("l2_reg_lambda", 0.000001, "L2 regularizaion lambda (default: 0.0)") 8 | # flags.DEFINE_float("learning_rate", 1e-3, "learn rate( default: 0.0)") 9 | # flags.DEFINE_integer("max_len_left", 40, "max document length of left input") 10 | # flags.DEFINE_integer("max_len_right", 40, "max document length of right input") 11 | # flags.DEFINE_string("loss","pair_wise","loss function (default:point_wise)") 12 | # flags.DEFINE_integer('extend_feature_dim',10,'overlap_feature_dim') 13 | # # Training parameters 14 | # flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 15 | # flags.DEFINE_boolean("trainable", False, "is embedding trainable? 
(default: False)") 16 | # flags.DEFINE_integer("num_epochs", 100, "Number of training epochs (default: 200)") 17 | # flags.DEFINE_integer("evaluate_every", 500, "Evaluate model on dev set after this many steps (default: 100)") 18 | # flags.DEFINE_integer("checkpoint_every", 500, "Save model after this many steps (default: 100)") 19 | # flags.DEFINE_boolean('overlap_needed',False,"is overlap used") 20 | # flags.DEFINE_boolean('position_needed',False,'is position embedding used') 21 | # flags.DEFINE_boolean('dns','False','whether use dns or not') 22 | # flags.DEFINE_string('data','wiki','data set') 23 | # flags.DEFINE_string('pooling','max','max pooling or attentive pooling') 24 | # flags.DEFINE_float('sample_train',1,'sampe my train data') 25 | # flags.DEFINE_boolean('fresh',True,'wheather recalculate the embedding or overlap default is True') 26 | # flags.DEFINE_boolean('clean',True,'whether we clean the data') 27 | # flags.DEFINE_string('conv','wide','wide conv or narrow') 28 | # flags.DEFINE_integer('gpu',0,'gpu number') 29 | # # Misc Parameters 30 | # flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 31 | # flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 32 | 33 | # #data_help parameters 34 | # flags.DEFINE_boolean('isEnglish',True,'whether is data is english') 35 | # flags.DEFINE_string('en_embedding_file','embedding/aquaint+wiki.txt.gz.ndim=50.bin','english embedding') 36 | # flags.DEFINE_string('ch_embedding_file','embedding/','chinese embedding') 37 | # flags.DEFINE_string('ch_stopwords','model/chStopWordsSimple.txt','chinese stopwords') 38 | 39 | flags = tf.app.flags 40 | flags.DEFINE_integer( 41 | "embedding_size", 300, "Dimensionality of character embedding (default: 128)") 42 | flags.DEFINE_string("filter_sizes", "1,2,3,5", 43 | "Comma-separated filter sizes (default: '3,4,5')") 44 | flags.DEFINE_integer( 45 | "num_filters", 64, "Number of filters per filter size (default: 128)") 46 | flags.DEFINE_float("dropout_keep_prob", 1, 47 | "Dropout keep probability (default: 0.5)") 48 | flags.DEFINE_float("l2_reg_lambda", 0.000001, 49 | "L2 regularizaion lambda (default: 0.0)") 50 | flags.DEFINE_float("learning_rate", 0.001, 51 | "learn rate( default: 0.0)") 52 | flags.DEFINE_integer("max_len_left", 40, 53 | "max document length of left input") 54 | flags.DEFINE_integer("max_len_right", 40, 55 | "max document length of right input") 56 | flags.DEFINE_string("loss", "pair_wise", 57 | "loss function (default:point_wise)") 58 | flags.DEFINE_string("model_name", "cnn", "cnn or rnn") 59 | 60 | # Training parameters 61 | flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 62 | flags.DEFINE_boolean("trainable", False, 63 | "is embedding trainable? 
(default: False)") 64 | flags.DEFINE_integer("num_epoches", 100, 65 | "Number of training epochs (default: 100)") 66 | flags.DEFINE_integer( 67 | "evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)") 68 | flags.DEFINE_integer( 69 | "checkpoint_every", 500, "Save model after this many steps (default: 100)") 70 | 71 | flags.DEFINE_string( 72 | 'embedding_file', '../../embedding/glove.6B/glove.6B.300d.txt', None) 73 | flags.DEFINE_string('data_dir', '../data/wiki', 'nlpcc') 74 | flags.DEFINE_string('summaries_dir','log/summary','log/summary') 75 | 76 | flags.DEFINE_string( 77 | 'pooling', 'max', 'max pooling or attentive pooling') 78 | flags.DEFINE_string('attention', 'attentive', 'attention strategy') 79 | flags.DEFINE_boolean('clean', True, 'whether we clean the data') 80 | flags.DEFINE_integer('gpu', 0, 'gpu number') 81 | # Misc Parameters 82 | flags.DEFINE_boolean("debug",False,'debug the model') 83 | flags.DEFINE_boolean("allow_soft_placement", 84 | True, "Allow device soft device placement") 85 | flags.DEFINE_boolean("log_device_placement", 86 | False, "Log placement of ops on devices") 87 | 88 | args = flags.FLAGS -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pandas as pd 3 | import subprocess 4 | import platform,os 5 | import sklearn 6 | import numpy as np 7 | qa_path="data/nlpcc-iccpol-2016.dbqa.testing-data" 8 | 9 | def mrr_metric(group): 10 | group = sklearn.utils.shuffle(group,random_state =132) 11 | candidates=group.sort_values(by='score',ascending=False).reset_index() 12 | rr=candidates[candidates["flag"]==1].index.min()+1 13 | if rr!=rr: 14 | return 0 15 | return 1.0/rr 16 | def map_metric(group): 17 | group = sklearn.utils.shuffle(group,random_state =132) 18 | ap=0 19 | candidates=group.sort_values(by='score',ascending=False).reset_index() 20 | correct_candidates=candidates[candidates["flag"]==1] 21 | if len(correct_candidates)==0: 22 | return 0 23 | for i,index in enumerate(correct_candidates.index): 24 | ap+=1.0* (i+1) /(index+1) 25 | #print( ap/len(correct_candidates)) 26 | return ap/len(correct_candidates) 27 | def mrr_metric_filter(group): 28 | group = sklearn.utils.shuffle(group,random_state =132) 29 | candidates = group.sort_values(by='score',ascending=False).reset_index() 30 | rr=candidates[candidates["flag"]==1].index.min()+1 31 | if rr!=rr: 32 | return False 33 | mrr = 1.0 / rr 34 | return mrr < 0.5 35 | def evaluation_plus(modelfile, groundtruth=qa_path): 36 | answers=pd.read_csv(groundtruth,header=None,sep="\t",names=["question","answer","flag"],quoting =3) 37 | answers["score"]=pd.read_csv(modelfile,header=None,sep="\t",names=["score"],quoting =3) 38 | print( answers.groupby("question").apply(mrr_metric).mean()) 39 | print( answers.groupby("question").apply(map_metric).mean()) 40 | 41 | def eval(predicted,groundtruth=qa_path, file_flag=False): 42 | if 'Windows' in platform.system() and file_flag ==False: 43 | modelfile=write2file(predicted) 44 | evaluationbyFile(modelfile) 45 | return 46 | 47 | if type(groundtruth)!= str : 48 | answers=groundtruth 49 | else: 50 | answers=pd.read_csv(groundtruth,header=None,sep="\t",names=["question","answer","flag"],quoting =3) 51 | answers["score"]=predicted 52 | mrr= answers.groupby("question").apply(mrr_metric).mean() 53 | map= answers.groupby("question").apply(map_metric).mean() 54 | return map,mrr 55 | def 
evaluate(predicted,groundtruth): 56 | filename=write2file(predicted) 57 | evaluationbyFile(filename,groundtruth=groundtruth) 58 | def write2file(datas,filename="train.QApair.TJU_IR_QA.score"): 59 | with open(filename,"w") as f: 60 | for data in datas: 61 | f.write(("%.10f" %data )+"\n") 62 | return filename 63 | 64 | 65 | def evaluationbyFile(modelfile,resultfile="result.text",groundtruth=qa_path): 66 | cmd="test.exe " + " ".join([groundtruth,modelfile,resultfile]) 67 | print( modelfile[19:-6]+":") # ) 68 | subprocess.call(cmd, shell=True) 69 | def evaluationBypandas(df,predicted): 70 | df["score"]=predicted 71 | mrr= df.groupby("question").apply(mrr_metric).mean() 72 | map= df.groupby("question").apply(map_metric).mean() 73 | return map,mrr 74 | def precision_per(group): 75 | group = sklearn.utils.shuffle(group,random_state =132) 76 | candidates=group.sort_values(by='score',ascending=False).reset_index() 77 | rr=candidates[candidates["flag"]==1].index.min() 78 | if rr==0: 79 | return 1 80 | return 0 81 | def precision(df,predicted): 82 | df["score"]=predicted 83 | precision = df.groupby("question").apply(precision_per).mean() 84 | return precision 85 | 86 | def briany_test_file(df_test, predicted=None,mode = 'test'): 87 | N = len(df_test) 88 | 89 | nnet_outdir = 'tmp/' + mode 90 | if not os.path.exists(nnet_outdir): 91 | os.makedirs(nnet_outdir) 92 | question2id=dict() 93 | for index,quesion in enumerate( df_test["question"].unique()): 94 | question2id[quesion]=index 95 | 96 | df_submission = pd.DataFrame(index=np.arange(N), columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id']) 97 | df_submission['qid'] =df_test.apply(lambda row: question2id[row['question']],axis=1) 98 | df_submission['iter'] = 0 99 | df_submission['docno'] = np.arange(N) 100 | df_submission['rank'] = 0 101 | if predicted is None: 102 | df_submission['sim'] = df_test['score'] 103 | else: 104 | df_submission['sim'] = predicted 105 | df_submission['run_id'] = 'nnet' 106 | df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'), header=False, index=False, sep=' ') 107 | 108 | df_gold = pd.DataFrame(index=np.arange(N), columns=['qid', 'iter', 'docno', 'rel']) 109 | df_gold['qid'] = df_test.apply(lambda row: question2id[row['question']],axis=1) 110 | df_gold['iter'] = 0 111 | df_gold['docno'] = np.arange(N) 112 | df_gold['rel'] = df_test['flag'] 113 | df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'), header=False, index=False, sep=' ') 114 | 115 | if __name__ =="__main__": 116 | data_dir="data/"+"wiki" 117 | train_file=os.path.join(data_dir,"train.txt") 118 | test_file=os.path.join(data_dir,"test.txt") 119 | 120 | train=pd.read_csv(train_file,header=None,sep="\t",names=["question","answer","flag"],quoting =3) 121 | train["score"]=np.random.randn(len(train)) 122 | briany_test_file(train) -------------------------------------------------------------------------------- /helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8-*- 2 | import numpy as np 3 | import random,os,math 4 | import pandas as pd 5 | import sklearn 6 | import time 7 | import matplotlib.pyplot as plt 8 | from collections import Counter 9 | import seaborn as sns 10 | import evaluation 11 | import string 12 | import jieba 13 | from nltk import stem 14 | from tqdm import tqdm 15 | import chardet 16 | import re 17 | import config 18 | import logging 19 | from functools import wraps 20 | 21 | # stopwords = { word.decode("utf-8") for word in 
open("model/chStopWordsSimple.txt").read().split()} 22 | # ner_dict = pickle.load(open('ner_dict')) 23 | 24 | #print( tf.__version__) 25 | def log_time_delta(func): 26 | @wraps(func) 27 | def _deco(*args, **kwargs): 28 | start = time.time() 29 | ret = func(*args, **kwargs) 30 | end = time.time() 31 | delta = end - start 32 | print( "%s runed %.2f seconds"% (func.__name__,delta)) 33 | return ret 34 | return _deco 35 | def remove_the_unanswered_sample(df): 36 | """ 37 | clean the dataset 38 | :param df: dataframe 39 | """ 40 | counter = df.groupby("question").apply(lambda group: sum(group["flag"])) 41 | questions_have_correct = counter[counter > 0].index 42 | counter = df.groupby("question").apply( 43 | lambda group: sum(group["flag"] == 0)) 44 | questions_have_uncorrect = counter[counter > 0].index 45 | counter = df.groupby("question").apply(lambda group: len(group["flag"])) 46 | questions_multi = counter[counter > 1].index 47 | 48 | return df[df["question"].isin(questions_have_correct) & df["question"].isin(questions_have_correct) & df["question"].isin(questions_have_uncorrect)].reset_index() 49 | 50 | def load_train_file(data_dir, filter=False): 51 | """ 52 | load the dataset 53 | :param data_dir: the data_dir 54 | :param filter=False: whether clean the dataset 55 | """ 56 | train_df = pd.read_csv(os.path.join(data_dir, 'train.txt'), header=None, sep='\t', names=[ 57 | 'question', 'answer', 'flag'], quoting=3).fillna('') 58 | if filter: 59 | train_df = remove_the_unanswered_sample(train_df) 60 | dev_df = pd.read_csv(os.path.join(data_dir, 'dev.txt'), header=None, sep='\t', names=[ 61 | 'question', 'answer', 'flag'], quoting=3).fillna('') 62 | if filter: 63 | dev_df = remove_the_unanswered_sample(dev_df) 64 | test_df = pd.read_csv(os.path.join(data_dir, 'test.txt'), header=None, sep='\t', names=[ 65 | 'question', 'answer', 'flag'], quoting=3).fillna('') 66 | if filter: 67 | test_df = remove_the_unanswered_sample(test_df) 68 | return train_df, test_df, test_df 69 | 70 | def cut(sentence): 71 | """ 72 | split the sentence to tokens 73 | :param sentence: raw sentence 74 | """ 75 | tokens = sentence.split() 76 | 77 | return tokens 78 | 79 | def get_alphabet(corpuses): 80 | """ 81 | obtain the dict 82 | :param corpuses: 83 | """ 84 | word_counter = Counter() 85 | 86 | for corpus in corpuses: 87 | for texts in [corpus["question"].unique(), corpus["answer"]]: 88 | for sentence in texts: 89 | tokens = cut(sentence) 90 | for token in tokens: 91 | word_counter[token] += 1 92 | print("there are {} words in dict".format(len(word_counter))) 93 | logging.info("there are {} words in dict".format(len(word_counter))) 94 | word_dict = {word: e + 2 for e, word in enumerate(list(word_counter))} 95 | word_dict['UNK'] = 1 96 | word_dict[''] = 0 97 | 98 | return word_dict 99 | 100 | def get_embedding(alphabet, filename="", embedding_size=100): 101 | embedding = np.random.rand(len(alphabet), embedding_size) 102 | if filename is None: 103 | return embedding 104 | with open(filename, encoding='utf-8') as f: 105 | i = 0 106 | for line in f: 107 | i += 1 108 | if i % 100000 == 0: 109 | print('epch %d' % i) 110 | items = line.strip().split(' ') 111 | if len(items) == 2: 112 | vocab_size, embedding_size = items[0], items[1] 113 | print((vocab_size, embedding_size)) 114 | else: 115 | word = items[0] 116 | if word in alphabet: 117 | embedding[alphabet[word]] = items[1:] 118 | 119 | print('done') 120 | return embedding 121 | 122 | 123 | def convert_to_word_ids(sentence,alphabet,max_len = 40): 124 | """ 125 | docstring 
here 126 | :param sentence: 127 | :param alphabet: 128 | :param max_len=40: 129 | """ 130 | indices = [] 131 | tokens = cut(sentence) 132 | 133 | for word in tokens: 134 | if word in alphabet: 135 | indices.append(alphabet[word]) 136 | else: 137 | continue 138 | result = indices + [alphabet['']] * (max_len - len(indices)) 139 | 140 | return result[:max_len] 141 | def gen_with_pair_train(df, alphabet, q_len,a_len): 142 | pairs = [] 143 | for question in df['question'].unique(): 144 | 145 | 146 | group = df[df['question'] == question] 147 | pos_group = group[group['flag'] == 1] # positive answer 148 | neg_group = group[group['flag'] == 0] 149 | neg_group = neg_group.reset_index() 150 | 151 | question_indice = convert_to_word_ids(question,alphabet,max_len = q_len) 152 | 153 | negtive_pool_index = range(len(neg_group)) 154 | 155 | if len(neg_group) > 0: 156 | for pos in pos_group['answer']: 157 | neg_index = np.random.choice(negtive_pool_index) 158 | neg = neg_group.loc[neg_index]['answer'] 159 | 160 | positive_answer_indice = convert_to_word_ids(pos,alphabet,a_len) 161 | negative_answer_indice = convert_to_word_ids(neg,alphabet,a_len) 162 | pairs.append((question_indice,positive_answer_indice,negative_answer_indice)) 163 | return pairs 164 | 165 | def gen_with_pair_test(df,alphabet,q_len,a_len): 166 | pairs = [] 167 | for _,row in df.iterrows(): 168 | question_indice = convert_to_word_ids(row['question'],alphabet,max_len=q_len) 169 | answer_indice = convert_to_word_ids(row['answer'],alphabet,max_len = a_len) 170 | pairs.append((question_indice,answer_indice)) 171 | 172 | return pairs 173 | def batch_iter(data, batch_size, alphabet,shuffle = False,q_len = 33,a_len = 33): 174 | if shuffle: 175 | data = gen_with_pair_train( 176 | data, alphabet,q_len,a_len ) 177 | else: 178 | data = gen_with_pair_test(data,alphabet,q_len,a_len) 179 | data = np.array(data) 180 | data_size = len(data) 181 | 182 | if shuffle: 183 | shuffle_indice = np.random.permutation(np.arange(data_size)) 184 | data = data[shuffle_indice] 185 | 186 | num_batch = int((data_size - 1) / float(batch_size)) + 1 187 | 188 | for i in range(num_batch): 189 | start_index = i * batch_size 190 | end_index = min((i + 1) * batch_size, data_size) 191 | 192 | yield data[start_index:end_index] 193 | 194 | @log_time_delta 195 | def get_overlap_dict(df,alphabet,q_len = 40,a_len = 40): 196 | d = dict() 197 | for question in df['question'].unique(): 198 | group = df[df['question'] == question] 199 | answers = group['answer'] 200 | for ans in answers: 201 | q_overlap,a_overlap = overlap_index(question,ans,q_len,a_len) 202 | d[(question,ans)] = (q_overlap,a_overlap) 203 | return d 204 | # calculate the overlap_index 205 | def overlap_index(question,answer,q_len,a_len,stopwords = []): 206 | qset = set(cut(question)) 207 | aset = set(cut(answer)) 208 | 209 | q_index = np.zeros(q_len) 210 | a_index = np.zeros(a_len) 211 | 212 | overlap = qset.intersection(aset) 213 | for i,q in enumerate(cut(question)[:q_len]): 214 | value = 1 215 | if q in overlap: 216 | value = 2 217 | q_index[i] = value 218 | for i,a in enumerate(cut(answer)[:a_len]): 219 | value = 1 220 | if a in overlap: 221 | value = 2 222 | a_index[i] = value 223 | return q_index,a_index 224 | def position_index(sentence,length): 225 | index = np.zeros(length) 226 | 227 | raw_len = len(cut(sentence)) 228 | index[:min(raw_len,length)] = range(1,min(raw_len + 1,length + 1)) 229 | # print index 230 | return index 231 | def transform(flag): 232 | if flag == 1: 233 | return [0,1] 234 | else: 235 | 
return [1,0] 236 | @log_time_delta 237 | def batch_gen_with_single(df,alphabet,batch_size = 10,q_len = 33,a_len = 40,overlap_dict = None): 238 | pairs=[] 239 | for index,row in df.iterrows(): 240 | quetion = encode_to_split(row["question"],alphabet,max_sentence = q_len) 241 | answer = encode_to_split(row["answer"],alphabet,max_sentence = a_len) 242 | if overlap_dict: 243 | q_pos_overlap,a_pos_overlap = overlap_index(row["question"],row["answer"],q_len,a_len) 244 | else: 245 | q_pos_overlap,a_pos_overlap = overlap_dict[(row["question"],row["answer"])] 246 | 247 | q_position = position_index(row['question'],q_len) 248 | a_pos_position = position_index(row['answer'],a_len) 249 | pairs.append((quetion,answer,q_pos_overlap,a_pos_overlap,q_position,a_pos_position)) 250 | # n_batches= int(math.ceil(df["flag"].sum()*1.0/batch_size)) 251 | # n_batches = int(len(pairs)*1.0/batch_size) 252 | # # pairs = sklearn.utils.shuffle(pairs,random_state =132) 253 | # for i in range(0,n_batches): 254 | # batch = pairs[i*batch_size:(i+1) * batch_size] 255 | num_batches_per_epoch = int((len(pairs)-1)/ batch_size) + 1 256 | for batch_num in range(num_batches_per_epoch): 257 | start_index = batch_num * batch_size 258 | end_index = min((batch_num + 1) * batch_size, len(pairs)) 259 | batch = pairs[start_index:end_index] 260 | yield [[pair[j] for pair in batch] for j in range(6)] 261 | # batch= pairs[n_batches*batch_size:] + [pairs[n_batches*batch_size]] * (batch_size- len(pairs)+n_batches*batch_size ) 262 | # yield [[pair[i] for pair in batch] for i in range(6)] 263 | def overlap_visualize(): 264 | train,test,dev = load("nlpcc",filter = False) 265 | 266 | test = test.reindex(np.random.permutation(test.index)) 267 | df = train 268 | df['qlen'] = df['question'].str.len() 269 | df['alen'] = df['answer'].str.len() 270 | 271 | df['q_n_words'] = df['question'].apply(lambda row:len(row.split(' '))) 272 | df['a_n_words'] = df['answer'].apply(lambda row:len(row.split(' '))) 273 | 274 | def normalized_word_share(row): 275 | w1 = set(map(lambda word: word.lower().strip(), row['question'].split(" "))) 276 | w2 = set(map(lambda word: word.lower().strip(), row['answer'].split(" "))) 277 | return 1.0 * len(w1 & w2)/(len(w1) + len(w2)) 278 | def word_overlap(row): 279 | w1 = set(map(lambda word: word.lower().strip(), row['question'].split(" "))) 280 | w2 = set(map(lambda word: word.lower().strip(), row['answer'].split(" "))) 281 | return w1.intersection(w2) 282 | df['word_share'] = df.apply(normalized_word_share, axis=1) 283 | plt.figure(figsize=(12, 8)) 284 | plt.subplot(1,2,1) 285 | sns.violinplot(x = 'flag', y = 'word_share', data = df[0:50000],hue = 'flag') 286 | plt.subplot(1,2,2) 287 | # sns.distplot(df[df['flag'] == 1.0]['word_share'][0:10000], color = 'green',label = 'not match') 288 | # sns.distplot(df[df['flag'] == 0.0]['word_share'][0:10000], color = 'blue',label = 'match') 289 | 290 | # plt.figure(figsize=(15, 5)) 291 | train_word_match = df.apply(normalized_word_share, axis=1, raw=True) 292 | plt.hist(train_word_match[df['flag'] == 0], bins=20, normed=True, label='flag 0') 293 | plt.hist(train_word_match[df['flag'] == 1], bins=20, normed=True, alpha=0.7, label='flag 1') 294 | plt.legend() 295 | plt.title('Label distribution over word_match_share', fontsize=15) 296 | plt.xlabel('word_match_share', fontsize=15) 297 | 298 | # train_qs = pd.Series(train['question'].tolist() + train['answer'].tolist()) 299 | # print train_qs 300 | plt.show('hold') 301 | def 
dns_sample(df,alphabet,q_len,a_len,sess,model,batch_size,neg_sample_num = 10): 302 | samples = [] 303 | count = 0 304 | pool_answers = df[df.flag == 1]['answer'].tolist() 305 | # pool_answers = df[df['flag'] == 0]['answer'].tolist() 306 | for question in df['question'].unique(): 307 | group = df[df['question'] == question] 308 | pos_answers = group[df["flag"]==1]["answer"].tolist() 309 | # pos_answers_exclude = list(set(pool_answers).difference(set(pos_answers))) 310 | neg_answers = group[df["flag"]==0]["answer"].tolist() 311 | question_indices = encode_to_split(question,alphabet,max_sentence = q_len) 312 | for pos in pos_answers: 313 | # negtive sample 314 | neg_pool = [] 315 | if len(neg_answers) > 0: 316 | # neg_exc = list(np.random.choice(pos_answers_exclude,size = 100 - len(neg_answers))) 317 | neg_answers_sample = neg_answers 318 | # neg_answers = neg_a 319 | # print 'neg_tive answer:{}'.format(len(neg_answers)) 320 | for neg in neg_answers_sample: 321 | neg_pool.append(encode_to_split(neg,alphabet,max_sentence = a_len)) 322 | input_x_1 = [question_indices] * len(neg_answers_sample) 323 | input_x_2 = [encode_to_split(pos,alphabet,max_sentence = a_len)] * len(neg_answers_sample) 324 | input_x_3 = neg_pool 325 | feed_dict = { 326 | model.question: input_x_1, 327 | model.answer: input_x_2, 328 | model.answer_negative:input_x_3 329 | } 330 | predicted = sess.run(model.score13,feed_dict) 331 | # find the max score 332 | index = np.argmax(predicted) 333 | # print len(neg_answers) 334 | # print 'index:{}'.format(index) 335 | # if len(neg_answers)>1: 336 | # print neg_answers[1] 337 | samples.append((question_indices,encode_to_split(pos,alphabet,max_sentence = a_len),input_x_3[index])) 338 | count += 1 339 | if count % 100 == 0: 340 | print ('samples load:{}'.format(count)) 341 | print ('samples finishted len samples:{}'.format(len(samples))) 342 | return samples 343 | @log_time_delta 344 | def batch_gen_with_pair_dns(samples,batch_size,epoches=1): 345 | # n_batches= int(math.ceil(df["flag"].sum()*1.0/batch_size)) 346 | n_batches = int(len(samples) * 1.0 / batch_size) 347 | for j in range(epoches): 348 | pairs = sklearn.utils.shuffle(samples,random_state =132) 349 | for i in range(0,n_batches): 350 | batch = pairs[i*batch_size:(i+1) * batch_size] 351 | yield [[pair[i] for pair in batch] for i in range(3)] 352 | 353 | def data_processing(): 354 | train,test,dev = load('nlpcc',filter = False) 355 | q_max_sent_length = max(map(lambda x:len(x),train['question'].str.split())) 356 | a_max_sent_length = max(map(lambda x:len(x),train['answer'].str.split())) 357 | q_len = map(lambda x:len(x),train['question'].str.split()) 358 | a_len = map(lambda x:len(x),train['answer'].str.split()) 359 | print('Total number of unique question:{}'.format(len(train['question'].unique()))) 360 | print('Total number of question pairs for training: {}'.format(len(train))) 361 | print('Total number of question pairs for test: {}'.format(len(test))) 362 | print('Total number of question pairs for dev: {}'.format(len(dev))) 363 | print('Duplicate pairs: {}%'.format(round(train['flag'].mean()*100, 2))) 364 | print(len(train['question'].unique())) 365 | 366 | #text analysis 367 | train_qs = pd.Series(train['answer'].tolist()) 368 | test_qs = pd.Series(test['answer'].tolist()) 369 | dev_qs = pd.Series(dev['answer'].tolist()) 370 | 371 | dist_train = train_qs.apply(lambda x:len(x.split(' '))) 372 | dist_test = test_qs.apply(lambda x:len(x.split(' '))) 373 | dist_dev = dev_qs.apply(lambda x:len(x.split(' '))) 374 | pal = 
sns.color_palette() 375 | plt.figure(figsize=(15, 10)) 376 | plt.hist(dist_train, bins = 200, range=[0, 200], color=pal[2], normed = True, label='train') 377 | plt.hist(dist_dev, bins = 200, range=[0, 200], color=pal[3], normed = True, alpha = 0.5, label='test1') 378 | plt.hist(dist_test, bins = 200, range=[0, 200], color=pal[1], normed = True, alpha = 0.5, label='test2') 379 | 380 | plt.title('Normalised histogram of tokens count in answers', fontsize = 15) 381 | plt.legend() 382 | plt.xlabel('Number of words', fontsize = 15) 383 | plt.ylabel('Probability', fontsize = 15) 384 | 385 | print('mean-train {:.2f} std-train {:.2f} mean-test {:.2f} std-test {:.2f} max-train {:.2f} max-test {:.2f}'.format(dist_train.mean(), 386 | dist_train.std(), dist_test.mean(), dist_test.std(), dist_train.max(), dist_test.max())) 387 | plt.show('hard') 388 | 389 | qmarks = np.mean(train_qs.apply(lambda x: '?' in x)) 390 | who = np.mean(train_qs.apply(lambda x:'Who' in x)) 391 | where = np.mean(train_qs.apply(lambda x:'Where' in x)) 392 | how_many = np.mean(train_qs.apply(lambda x:'How many' in x)) 393 | fullstop = np.mean(train_qs.apply(lambda x: '.' in x)) 394 | capital_first = np.mean(train_qs.apply(lambda x: x[0].isupper())) 395 | capitals = np.mean(train_qs.apply(lambda x: max([y.isupper() for y in x]))) 396 | numbers = np.mean(train_qs.apply(lambda x: max([y.isdigit() for y in x]))) 397 | print('Questions with question marks: {:.2f}%'.format(qmarks * 100)) 398 | print('Questions with [Who] tags: {:.2f}%'.format(who * 100)) 399 | print('Questions with [where] tags: {:.2f}%'.format(where * 100)) 400 | print('Questions with [How many] tags:{:.2f}%'.format(how_many * 100)) 401 | print('Questions with full stops: {:.2f}%'.format(fullstop * 100)) 402 | print('Questions with capitalised first letters: {:.2f}%'.format(capital_first * 100)) 403 | print('Questions with capital letters: {:.2f}%'.format(capitals * 100)) 404 | print('Questions with numbers: {:.2f}%'.format(numbers * 100)) -------------------------------------------------------------------------------- /models/__pycache__/basis_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/models/__pycache__/basis_model.cpython-37.pyc -------------------------------------------------------------------------------- /models/__pycache__/blocks.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/models/__pycache__/blocks.cpython-37.pyc -------------------------------------------------------------------------------- /models/__pycache__/cnn_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuishen112/NLPCCDBQA/690fbfed1668fc435cbffc6ae8b532843bf28d50/models/__pycache__/cnn_model.cpython-37.pyc -------------------------------------------------------------------------------- /models/basis_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Description: this is the basis model 3 | @Author: zhansu 4 | @Date: 2019-07-02 20:58:41 5 | @LastEditTime: 2019-07-23 21:21:17 6 | @LastEditors: Please set LastEditors 7 | ''' 8 | # coding:utf-8 9 | 10 | import tensorflow as tf 11 | from tensorflow.python import debug as tf_debug 12 | import numpy as np 13 | from 
tensorflow.contrib import rnn 14 | import models.blocks as blocks 15 | import datetime 16 | from functools import reduce 17 | import abc 18 | import sys 19 | sys.path.append('../') 20 | # tf.set_random_set() 21 | 22 | 23 | class Model(object): 24 | 25 | def __init__(self, opt): 26 | """ 27 | initialize the model by the para 28 | pair_wise model 29 | :param self: 30 | :param opt: para of the model in the config 31 | """ 32 | for key, value in opt.items(): 33 | self.__setattr__(key, value) 34 | 35 | sess_config = tf.ConfigProto() 36 | sess_config.gpu_options.allow_growth = True 37 | self.sess = tf.Session(config=sess_config) 38 | 39 | self.build_graph() 40 | # summary 41 | self.merged = tf.summary.merge_all() 42 | self.train_writer = tf.summary.FileWriter(self.summaries_dir + '/train', 43 | self.sess.graph) 44 | self.test_writer = tf.summary.FileWriter(self.summaries_dir + '/test') 45 | self.saver = tf.train.Saver() 46 | self.sess.run(tf.global_variables_initializer()) 47 | 48 | # whether debug the code 49 | if self.debug: 50 | self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) 51 | 52 | def build_graph(self): 53 | """ 54 | build the graph of the model 55 | :param self: 56 | """ 57 | self.create_placeholder() 58 | self.add_embeddings() 59 | self.encode_sentence() 60 | self.create_loss() 61 | self.create_op() 62 | 63 | def create_placeholder(self): 64 | 65 | print(('Create placeholders')) 66 | # he length of the sentence is varied according to the batch,so the None,None 67 | self.question = tf.placeholder( 68 | tf.int32, [None, None], name='input_question') 69 | 70 | self.answer = tf.placeholder( 71 | tf.int32, [None, None], name='input_answer') 72 | self.answer_negative = tf.placeholder( 73 | tf.int32, [None, None], name='input_right') 74 | 75 | self.batch_size = tf.shape(self.question)[0] 76 | self.q_len, self.q_mask = blocks.length(self.question) 77 | self.a_len, self.a_mask = blocks.length(self.answer) 78 | self.a_neg_len, self.a_neg_mask = blocks.length(self.answer_negative) 79 | self.dropout_keep_prob_holder = tf.placeholder( 80 | tf.float32, name='dropout_keep_prob') 81 | 82 | def add_embeddings(self): 83 | print('add embeddings') 84 | 85 | self.embedding_w = tf.Variable(np.array(self.embeddings), name="embedding", 86 | dtype="float32", trainable=self.trainable) 87 | 88 | self.q_embedding = tf.nn.embedding_lookup( 89 | self.embedding_w, self.question, name="q_embedding") 90 | self.a_embedding = tf.nn.embedding_lookup( 91 | self.embedding_w, self.answer, name="a_embedding") 92 | self.a_neg_embedding = tf.nn.embedding_lookup( 93 | self.embedding_w, self.answer_negative, name="a_neg_embedding") 94 | 95 | def get_cosine(self, q, a, name): 96 | """ 97 | docstring here 98 | :param self: 99 | :param q: [batch, vector_size] 100 | :param a: [batch, vector_size] 101 | """ 102 | if self.dropout_keep_prob_holder != 1.0: 103 | 104 | pooled_flat_1 = tf.nn.dropout(q, self.dropout_keep_prob_holder) 105 | pooled_flat_2 = tf.nn.dropout(a, self.dropout_keep_prob_holder) 106 | 107 | cosine = tf.div( 108 | tf.reduce_sum(pooled_flat_1*pooled_flat_2, 1), 109 | tf.sqrt(tf.reduce_sum(pooled_flat_1*pooled_flat_1, 1)) * 110 | tf.sqrt(tf.reduce_sum(pooled_flat_2*pooled_flat_2, 1)) + 1e-8, 111 | name="cosine") 112 | 113 | return cosine 114 | 115 | # q_normalize = tf.nn.l2_normalize(pooled_flat_1, dim=1) 116 | # a_normalize = tf.nn.l2_normalize(pooled_flat_2, dim=1) 117 | else: 118 | # q_normalize = tf.nn.l2_normalize(q, dim=1) 119 | # a_normalize = tf.nn.l2_normalize(a, dim=1) 120 | 121 | cosine = tf.div( 
122 | tf.reduce_sum(q*a, 1), 123 | tf.sqrt(tf.reduce_sum(q*q, 1)) * 124 | tf.sqrt(tf.reduce_sum(a*a, 1)) + 1e-8, 125 | name="cosine") 126 | 127 | # score = tf.reduce_sum(tf.multiply(q_normalize, a_normalize), 1) 128 | 129 | return cosine 130 | 131 | def create_op(self): 132 | 133 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 134 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate) 135 | self.grads_and_vars = self.optimizer.compute_gradients(self.loss) 136 | self.train_op = self.optimizer.apply_gradients( 137 | self.grads_and_vars, global_step=self.global_step) 138 | 139 | def create_loss(self): 140 | """ 141 | calculate the loss, noting that we don't use the l2_regularizer 142 | :param self: 143 | """ 144 | with tf.name_scope('score'): 145 | self.score12 = self.get_cosine( 146 | self.encode_q_pos, self.encode_a_pos, name="pos_score") 147 | self.score13 = self.get_cosine( 148 | self.encode_q_neg, self.encode_a_neg, name="neg_score") 149 | 150 | with tf.name_scope("loss"): 151 | l2_loss = 0.0 152 | for para in tf.trainable_variables(): 153 | l2_loss += tf.nn.l2_loss(para) 154 | self.losses = tf.maximum(0.0, tf.subtract( 155 | 0.05, tf.subtract(self.score12, self.score13))) 156 | self.loss = tf.reduce_sum(self.losses) + self.l2_reg_lambda * l2_loss 157 | 158 | tf.summary.scalar('loss', self.loss) 159 | # Accuracy 160 | with tf.name_scope("accuracy"): 161 | self.correct = tf.equal(0.0, self.losses) 162 | self.accuracy = tf.reduce_mean( 163 | tf.cast(self.correct, "float"), name="accuracy") 164 | tf.summary.scalar('accuracy', self.accuracy) 165 | 166 | def train(self, data_batch, i): 167 | """ 168 | thain the model 169 | :param self: 170 | :param data_batch: train_dataset databatch 171 | """ 172 | for data in data_batch: 173 | question,pos_answer,neg_answer = zip(*data) 174 | feed_dict = { 175 | self.question: question, 176 | self.answer: pos_answer, 177 | self.answer_negative:neg_answer, 178 | self.dropout_keep_prob_holder: self.dropout_keep_prob 179 | } 180 | _, summary, step, loss, accuracy, score12, score13 = self.sess.run( 181 | [self.train_op, self.merged, self.global_step, self.loss, 182 | self.accuracy, self.score12, self.score13], 183 | feed_dict) 184 | self.train_writer.add_summary(summary, step) 185 | time_str = datetime.datetime.now().isoformat() 186 | print("{}: epoch:{},step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g},score{}".format( 187 | time_str, i, step, loss, accuracy, np.mean(score12), np.mean(score13), np.mean(score12))) 188 | 189 | def predict(self, data_batch): 190 | """ 191 | predict the test_dataset 192 | :param self: 193 | :param data_batch: test_dataset data_batch 194 | """ 195 | scores = [] 196 | for e, data in enumerate(data_batch): 197 | 198 | question,answer = zip(*data) 199 | feed_dict = { 200 | self.question: question, 201 | self.answer:answer, 202 | self.dropout_keep_prob_holder: 1.0 203 | } 204 | score = self.sess.run( 205 | self.score12, feed_dict) 206 | # self.test_writer.add_summary(summary, e) 207 | scores.extend(score) 208 | return scores 209 | 210 | def variable_summaries(self, var): 211 | with tf.name_scope('summaries'): 212 | mean = tf.reduce_mean(var) 213 | tf.summary.scalar('mean', mean) 214 | with tf.name_scope('stddev'): 215 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 216 | tf.summary.scalar('stddev', stddev) 217 | tf.summary.scalar('max', tf.reduce_max(var)) 218 | tf.summary.scalar('min', tf.reduce_min(var)) 219 | tf.summary.histogram('histogram', var) 220 | 221 | @abc.abstractmethod 222 
| def encode_sentence(self): 223 | """ 224 | the method is the implemented by the subclass 225 | :param self: 226 | """ 227 | 228 | @staticmethod 229 | def _model_stats(): 230 | """Print trainable variables and total model size.""" 231 | 232 | def size(v): 233 | return reduce(lambda x, y: x * y, v.get_shape().as_list()) 234 | print("Trainable variables") 235 | for v in tf.trainable_variables(): 236 | print(" %s, %s, %s, %s" % 237 | (v.name, v.device, str(v.get_shape()), size(v))) 238 | print("Total model size: %d" % (sum(size(v) 239 | for v in tf.trainable_variables()))) 240 | -------------------------------------------------------------------------------- /models/blocks.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Functions and components that can be slotted into tensorflow models. 4 | 5 | TODO: Write functions for various types of attention. 6 | 7 | """ 8 | 9 | import tensorflow as tf 10 | 11 | 12 | def length(sequence): 13 | """ 14 | Get true length of sequences (without padding), and mask for true-length in max-length. 15 | 16 | Input of shape: (batch_size, max_seq_length, hidden_dim) 17 | Output shapes, 18 | length: (batch_size) 19 | mask: (batch_size, max_seq_length, 1) 20 | """ 21 | populated = tf.sign(tf.abs(sequence)) 22 | length = tf.cast(tf.reduce_sum(populated, axis=1), tf.int32) 23 | mask = tf.cast(populated, tf.int32) 24 | return length, mask 25 | 26 | 27 | 28 | def biLSTM(inputs, dim, seq_len, name): 29 | """ 30 | A Bi-Directional LSTM layer. Returns forward and backward hidden states as a tuple, and cell states as a tuple. 31 | 32 | Ouput of hidden states: [(batch_size, max_seq_length, hidden_dim), (batch_size, max_seq_length, hidden_dim)] 33 | Same shape for cell states. 34 | """ 35 | with tf.name_scope(name): 36 | with tf.variable_scope('forward' + name): 37 | lstm_fwd = tf.contrib.rnn.LSTMCell(num_units=dim) 38 | with tf.variable_scope('backward' + name): 39 | lstm_bwd = tf.contrib.rnn.LSTMCell(num_units=dim) 40 | 41 | hidden_states, cell_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fwd, cell_bw=lstm_bwd, inputs=inputs, sequence_length=seq_len, dtype=tf.float32, scope=name) 42 | 43 | return hidden_states, cell_states 44 | 45 | 46 | def last_output(output, true_length): 47 | """ 48 | To get the last hidden layer form a dynamically unrolled RNN. 49 | Input of shape (batch_size, max_seq_length, hidden_dim). 50 | 51 | true_length: Tensor of shape (batch_size). Such a tensor is given by the length() function. 52 | Output of shape (batch_size, hidden_dim). 53 | """ 54 | max_length = int(output.get_shape()[1]) 55 | length_mask = tf.expand_dims(tf.one_hot(true_length-1, max_length, on_value=1., off_value=0.), -1) 56 | last_output = tf.reduce_sum(tf.multiply(output, length_mask), 1) 57 | return last_output 58 | 59 | 60 | def masked_softmax(scores, mask): 61 | """ 62 | Used to calculcate a softmax score with true sequence length (without padding), rather than max-sequence length. 63 | 64 | Input shape: (batch_size, max_seq_length, hidden_dim). 65 | mask parameter: Tensor of shape (batch_size, max_seq_length). Such a mask is given by the length() function. 
66 | """ 67 | numerator = tf.exp(tf.subtract(scores, tf.reduce_max(scores, 1, keep_dims=True))) * mask 68 | denominator = tf.reduce_sum(numerator, 1, keep_dims=True) 69 | weights = tf.div(numerator, denominator) 70 | return weights 71 | -------------------------------------------------------------------------------- /models/cnn_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Description: this is the attentive pooling network of the question answering 3 | @Author: zhansu 4 | @Date: 2019-07-10 21:50:33 5 | @LastEditTime: 2019-07-23 17:11:59 6 | @LastEditors: Please set LastEditors 7 | ''' 8 | 9 | import tensorflow as tf 10 | from models.basis_model import Model 11 | 12 | 13 | class Attentive_CNN(Model): 14 | 15 | def attentive_pooling(self, input_left, input_right): 16 | """ 17 | docstring here: attentive pooling network 18 | :param self: 19 | :param input_left: question [batch,q_len,vector_size(num_filters * num_of_window)] 20 | :param input_right: answer [batch,a_len,vector_size(num_filters * num_of_window)] 21 | """ 22 | 23 | self.q_len = tf.shape(input_left)[1] 24 | self.a_len = tf.shape(input_right)[1] 25 | self.batch_size = tf.shape(input_left)[0] 26 | Q = tf.reshape(input_left, [self.batch_size, self.q_len, 27 | self.vector_size], name='Q') 28 | A = tf.reshape( 29 | input_right, [self.batch_size, self.a_len, self.vector_size], name='A') 30 | 31 | # [-1,vector_size] * [vector_size,vector_size] noting that * is matrix multiple 32 | first = tf.matmul(tf.reshape(Q, [self.batch_size * self.q_len, self.vector_size]), self.U) 33 | # [-1,vector_size]->[batch,q_len,vector_size] 34 | second_step = tf.reshape(first, [self.batch_size, self.q_len, self.vector_size]) 35 | # [batch,q_len,vector_size]* [batch,vector,a_len]->[batch,q_len,a_len] 36 | 37 | A_transpose = tf.transpose(A, perm=[0, 2, 1]) 38 | result = tf.matmul(second_step, A_transpose) 39 | print(second_step.get_shape().as_list()) 40 | print(A_transpose.get_shape().as_list()) 41 | G = tf.tanh(result) 42 | 43 | # column-wise pooling ,row-wise pooling 44 | # [batch,q_len,a_len]->[batch,1,a_len] 45 | row_pooling = tf.reduce_max(G, axis=1, keepdims = True, name='row_pooling') 46 | # [batch,q_len,a_len]->[batch,q_len,1] 47 | col_pooling = tf.reduce_max(G, axis=2, keepdims = True, name='col_pooling') 48 | 49 | attention_q = tf.nn.softmax( 50 | col_pooling, 1, name='attention_q') # [batch,q_len,1] 51 | attention_a = tf.transpose(tf.nn.softmax( 52 | row_pooling, 2, name='attention_a'),perm = [0,2,1]) # [batch,a_len,1] 53 | 54 | R_q = tf.reduce_sum(tf.multiply(Q, attention_q), axis=1) 55 | R_a = tf.reduce_sum(tf.multiply(A, attention_a), axis=1) 56 | 57 | return R_q, R_a 58 | 59 | def wide_convolution(self, embedding): 60 | """ 61 | docstring here wide convolution of the model 62 | :param self: 63 | :param embedding: embedding representation of the sentence 64 | """ 65 | cnn_outputs = [] 66 | for i, filter_size in enumerate(self.filter_sizes): 67 | conv = tf.nn.conv2d( 68 | embedding, 69 | self.kernels[i][0], 70 | strides=[1, 1, self.embedding_size, 1], 71 | padding='SAME', 72 | name="conv-{}".format(i) 73 | ) 74 | h = tf.nn.relu(tf.nn.bias_add( 75 | conv, self.kernels[i][1]), name="relu-{}".format(i)) 76 | cnn_outputs.append(h) 77 | cnn_reshaped = tf.concat(cnn_outputs, 3) 78 | return cnn_reshaped 79 | 80 | def encode_sentence(self): 81 | """ 82 | encode the sentence with cnn model 83 | :param self: 84 | """ 85 | # pramaters of the attentive pooling 86 | self.vector_size = 
/propressing.py:
--------------------------------------------------------------------------------
1 | '''
2 | @Description: exploratory data analysis of the nlpcc data, to check the details of the dataset
3 | @Author: zhansu
4 | @Date: 2019-07-05 17:26:53
5 | @LastEditTime: 2019-07-23 15:52:32
6 | @LastEditors: Please set LastEditors
7 | '''
8 | import numpy as np
9 | import pandas as pd
10 | import os
11 | import matplotlib.pyplot as plt
12 | import seaborn as sns
13 | pal = sns.color_palette()
14 | print(os.getcwd())
15 | 
16 | df_train = pd.read_csv('data/nlpcc/train.txt', sep='\t',
17 |                        names=['question', 'answer', 'flag'], quoting=3)
18 | 
19 | print(df_train['flag'].head())
20 | 
21 | # basic statistics
22 | print(df_train.info())
23 | print(df_train.shape)
24 | df_train.groupby('flag')['question'].count().plot.bar()
25 | print("dataset size:{}".format(len(df_train)))
26 | print("positive sample rate:{}%".format(
27 |     round(df_train['flag'].mean() * 100, 2)))
28 | print('unique questions:{}'.format(len(df_train['question'].unique())))
29 | 
30 | # text length analysis
31 | df_test = pd.read_csv('data/nlpcc/test.txt', sep='\t',
32 |                       names=['question', 'answer', 'flag'], quoting=3)
33 | 
34 | train_qs = pd.Series(
35 |     df_train['question'].tolist() + df_train['answer'].tolist())
36 | test_qs = pd.Series(df_test['question'].tolist() + df_test['answer'].tolist())
37 | dist_train = train_qs.apply(lambda x: len(x.split(' ')))
38 | dist_test = test_qs.apply(lambda x: len(x.split(' ')))
39 | print('mean-train:{} std-train:{} max-train:{} mean-test:{} std-test:{} max-test:{}'.format(dist_train.mean(),
40 |       dist_train.std(),
41 |       dist_train.max(),
42 |       dist_test.mean(),
43 |       dist_test.std(),
44 |       dist_test.max()))
45 | 
46 | dist_train = train_qs.apply(len)
47 | dist_test = test_qs.apply(len)
48 | plt.figure(figsize=(15, 10))
49 | plt.hist(dist_train, bins=40, range=[0, 40],
50 |          color=pal[2], density=True, label='train')
51 | plt.hist(dist_test, bins=40, range=[
52 |          0, 40], color=pal[1], density=True, alpha=0.5, label='test')
53 | plt.title('Normalised histogram of character count in questions and answers', fontsize=15)
54 | plt.legend()
55 | plt.xlabel('Number of characters', fontsize=15)
56 | plt.ylabel('Probability', fontsize=15)
57 | plt.show()
58 | 
59 | # semantic analysis
60 | 
--------------------------------------------------------------------------------
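The length statistics above feed directly into how the model is padded (run.py pads every batch to the training-set maximum). A percentile-based cut-off is a common alternative when the maximum is dominated by a few very long answers; the snippet below is a hypothetical helper for illustration and is not part of this repository.

import pandas as pd

def percentile_length(series, pct=0.95):
    # token-count percentile of a Series of whitespace-tokenised sentences
    lengths = series.str.split().apply(len)
    return int(lengths.quantile(pct))

df = pd.read_csv('data/nlpcc/train.txt', sep='\t',
                 names=['question', 'answer', 'flag'], quoting=3)
print(df['question'].str.split().apply(len).describe())  # same statistics as above in one call
print('95th-percentile answer length:', percentile_length(df['answer']))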
/run.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Description: 3 | @Author: zhansu 4 | @Date: 2019-06-28 20:14:28 5 | @LastEditTime: 2019-07-23 21:00:37 6 | @LastEditors: Please set LastEditors 7 | ''' 8 | from tensorflow import flags 9 | import tensorflow as tf 10 | from config import args 11 | import helper 12 | import time 13 | import datetime 14 | import os 15 | from models.cnn_model import Attentive_CNN 16 | import numpy as np 17 | import evaluation 18 | import sys 19 | import logging 20 | import os 21 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' 22 | print(os.getcwd()) 23 | 24 | now = int(time.time()) 25 | timeArray = time.localtime(now) 26 | log_filename = "log/" + time.strftime("%Y%m%d", timeArray) 27 | if not os.path.exists(log_filename): 28 | os.makedirs(log_filename) 29 | 30 | program = os.path.basename('QA') 31 | logger = logging.getLogger(program) 32 | 33 | logging.basicConfig(format = '%(asctime)s: %(levelname)s: %(message)s', datefmt='%a, %d %b %Y %H:%M:%S', 34 | filename=log_filename+'/{}_qa.log'.format(time.strftime("%H%M", timeArray)), filemode='w') 35 | logging.root.setLevel(level=logging.INFO) 36 | logger.info("running %s" % ' '.join(sys.argv)) 37 | 38 | 39 | opts = args.flag_values_dict() 40 | for item in opts: 41 | logger.info('{} : {}'.format(item, opts[item])) 42 | 43 | logger.info('load data ...........') 44 | train, test, dev = helper.load_train_file( 45 | opts['data_dir'], filter=args.clean) 46 | 47 | q_max_sent_length = max(map(lambda x: len(x), train['question'].str.split())) 48 | a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split())) 49 | 50 | alphabet = helper.get_alphabet([train, test, dev]) 51 | logger.info('the number of words :%d ' % len(alphabet)) 52 | 53 | embedding = helper.get_embedding( 54 | alphabet, opts['embedding_file'], embedding_size=opts['embedding_size']) 55 | 56 | opts["embeddings"] = embedding 57 | opts["vocab_size"] = len(alphabet) 58 | opts["max_input_right"] = a_max_sent_length 59 | opts["max_input_left"] = q_max_sent_length 60 | opts["filter_sizes"] = list(map(int, args.filter_sizes.split(","))) 61 | 62 | with tf.Graph().as_default(): 63 | 64 | model = Attentive_CNN(opts) 65 | model._model_stats() 66 | for i in range(args.num_epoches): 67 | data_gen = helper.batch_iter(train, args.batch_size,alphabet,shuffle=True,q_len=q_max_sent_length,a_len=a_max_sent_length ) 68 | model.train(data_gen,i) 69 | 70 | test_datas = helper.batch_iter( 71 | test, args.batch_size,alphabet,q_len=q_max_sent_length,a_len=a_max_sent_length ) 72 | 73 | test['score'] = model.predict(test_datas) 74 | map_, mrr_= evaluation.evaluationBypandas(test, test['score'].to_list()) 75 | df_group = test.groupby('question').filter(evaluation.mrr_metric_filter) 76 | df_group[['question','answer','flag','score']].to_csv('badcase',sep = '\t',index = None) 77 | logger.info('map:{}--mrr:{}'.format(map_, mrr_)) 78 | print('map:{}--mrr:{}'.format(map_, mrr_)) 79 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import cPickle as pickle 3 | import numpy as np 4 | # a = tf.Variable(np.ones((3,33,10))) 5 | # b = tf.expand_dims(tf.Variable(np.arange(33) + 0.0),-1) 6 | # print b 7 | # c = tf.transpose(a,perm = [1,0]) * b 8 | # c = tf.multiply(a,b) 9 | # d = tf.ones([10,2]) 10 | a = [23.12,34.23,12.56] 11 | b = tf.nn.l2_normalize(a,0) 12 | c = tf.reduce_sum(b**2) 
13 | # initializer = (np.array(0), np.array(1)) 14 | # fibonaccis = tf.scan(lambda a, _: (a[1], a[0] + a[1]), elems) 15 | with tf.Session() as sess: 16 | 17 | sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())) 18 | # print sess.run(a) 19 | print sess.run(b) 20 | print sess.run(c) 21 | # print sess.run(d) 22 | 23 | import numpy as np 24 | import matplotlib.pyplot as plt 25 | # alpha = ['ABC', 'DEF', 'GHI', 'JKL'] 26 | d = pickle.load(open('attention.file')) 27 | print d[0][0] 28 | exit() 29 | # print len(d) 30 | data = d[0][0] 31 | print data 32 | # print d[0][0] 33 | fig = plt.figure() 34 | ax = fig.add_subplot(111) 35 | cax = ax.matshow(data, cmap = plt.cm.Blues) 36 | fig.colorbar(cax) 37 | 38 | # ax.set_xticklabels(['']+alpha) 39 | # ax.set_yticklabels(['']+alpha) 40 | 41 | plt.show() 42 | 43 | # a = [] 44 | 45 | # b = np.ones((10,10)) 46 | # c = np.random.rand(10,20) 47 | # print c[0] 48 | # for b1,c1 in zip(b,c): 49 | # a.extend((b1,c1)) 50 | 51 | # print a[1] 52 | # import pandas as pd 53 | # file = 'data/nlpcc/train.txt' 54 | # df = pd.read_csv(file,header = None,sep="\t",names=["question","answer","flag"],quoting =3).fillna('') 55 | # df['alen'] = df.apply(lambda x:len(x['answer'].split()),axis = 1) 56 | # print df[df['flag'] == 1]['alen']. 57 | # a = ('a','b') 58 | # print str(a) 59 | 60 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #! /usr/bin/env python3.4 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | # from data_helpers import get_overlap_dict,replace_number,sample_data,batch_gen_with_pair_overlap,batch_gen_with_pair_dns,dns_sample,load,prepare,batch_gen_with_pair,batch_gen_with_single,batch_gen_with_point_wise,getQAIndiceofTest,batch_gen_with_pair_whole 9 | from helper import get_overlap_dict,batch_gen_with_pair_overlap,load,prepare,batch_gen_with_single,dns_sample,batch_gen_with_pair_dns 10 | import operator 11 | from QA_CNN_pairwise import QA_CNN_extend 12 | from QA_CNN_quantum_pairwise import QA_CNN_quantum_extend 13 | from QA_RNN_pairwise import QA_RNN_extend 14 | import random 15 | import evaluation 16 | import cPickle as pickle 17 | import config 18 | from sklearn.model_selection import train_test_split 19 | import pynlpir 20 | pynlpir.open() 21 | 22 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 23 | 24 | now = int(time.time()) 25 | 26 | timeArray = time.localtime(now) 27 | timeStamp = time.strftime("%Y%m%d%H%M%S", timeArray) 28 | timeDay = time.strftime("%Y%m%d", timeArray) 29 | print (timeStamp) 30 | 31 | from functools import wraps 32 | #print( tf.__version__) 33 | def log_time_delta(func): 34 | @wraps(func) 35 | def _deco(*args, **kwargs): 36 | start = time.time() 37 | ret = func(*args, **kwargs) 38 | end = time.time() 39 | delta = end - start 40 | print( "%s runed %.2f seconds"% (func.__name__,delta)) 41 | return ret 42 | return _deco 43 | 44 | FLAGS = config.flags.FLAGS 45 | FLAGS._parse_flags() 46 | print("\nParameters:") 47 | for attr, value in sorted(FLAGS.__flags.items()): 48 | print(("{}={}".format(attr.upper(), value))) 49 | log_dir = 'log/'+ timeDay 50 | if not os.path.exists(log_dir): 51 | os.makedirs(log_dir) 52 | data_file = log_dir + '/test_' + FLAGS.data + timeStamp 53 | precision = data_file + 'precise' 54 | attention = [] 55 | @log_time_delta 56 | def predict(sess,cnn,test,alphabet,batch_size,q_len,a_len): 57 | 
scores = [] 58 | d = get_overlap_dict(test,alphabet,q_len,a_len) 59 | for data in batch_gen_with_single(test,alphabet,batch_size,q_len,a_len,overlap_dict = d): 60 | feed_dict = { 61 | cnn.question: data[0], 62 | cnn.answer: data[1], 63 | cnn.answer_negative:data[1], 64 | cnn.q_pos_overlap: data[2], 65 | cnn.q_neg_overlap:data[2], 66 | cnn.a_pos_overlap: data[3], 67 | cnn.a_neg_overlap:data[3], 68 | cnn.q_position:data[4], 69 | cnn.a_pos_position:data[5], 70 | cnn.a_neg_position:data[5] 71 | } 72 | 73 | score = sess.run(cnn.score12, feed_dict) 74 | # print len(score) 75 | # if batch_size == 20: 76 | # attention.extend((q,a)) 77 | scores.extend(score) 78 | pickle.dump(attention,open('attention.file','w')) 79 | return np.array(scores[:len(test)]) 80 | @log_time_delta 81 | def test_pair_wise(dns = FLAGS.dns): 82 | train,test,dev = load(FLAGS.data,filter = FLAGS.clean) 83 | # train = train[:10000] 84 | # test = test[:10000] 85 | # dev = dev[:10000] 86 | # submit = submit[:1000] 87 | q_max_sent_length = max(map(lambda x:len(x),train['question'].str.split())) 88 | a_max_sent_length = max(map(lambda x:len(x),train['answer'].str.split())) 89 | print 'q_question_length:{} a_question_length:{}'.format(q_max_sent_length,a_max_sent_length) 90 | print 'train question unique:{}'.format(len(train['question'].unique())) 91 | print 'train length',len(train) 92 | print 'test length', len(test) 93 | print 'dev length', len(dev) 94 | alphabet,embeddings = prepare([train,test,dev],dim = FLAGS.embedding_dim,is_embedding_needed = True,fresh = FLAGS.fresh) 95 | # alphabet,embeddings = prepare_300([train,test,dev]) 96 | print 'alphabet:',len(alphabet) 97 | with tf.Graph().as_default(), tf.device("/gpu:" + str(FLAGS.gpu)): 98 | # with tf.device("/cpu:0"): 99 | session_conf = tf.ConfigProto() 100 | session_conf.allow_soft_placement = FLAGS.allow_soft_placement 101 | session_conf.log_device_placement = FLAGS.log_device_placement 102 | session_conf.gpu_options.allow_growth = True 103 | sess = tf.Session(config=session_conf) 104 | with sess.as_default(),open(precision,"w") as log: 105 | log.write(str(FLAGS.__flags) + '\n') 106 | folder = 'runs/' + timeDay + '/' + timeStamp + '/' 107 | out_dir = folder + FLAGS.data 108 | if not os.path.exists(folder): 109 | os.makedirs(folder) 110 | # train,test,dev = load("trec",filter=True) 111 | # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True) 112 | print "start build model" 113 | cnn = QA_RNN_extend( 114 | max_input_left = q_max_sent_length, 115 | max_input_right = a_max_sent_length, 116 | batch_size = FLAGS.batch_size, 117 | vocab_size = len(alphabet), 118 | embedding_size = FLAGS.embedding_dim, 119 | filter_sizes = list(map(int, FLAGS.filter_sizes.split(","))), 120 | num_filters = FLAGS.num_filters, 121 | dropout_keep_prob = FLAGS.dropout_keep_prob, 122 | embeddings = embeddings, 123 | l2_reg_lambda = FLAGS.l2_reg_lambda, 124 | overlap_needed = FLAGS.overlap_needed, 125 | learning_rate=FLAGS.learning_rate, 126 | trainable = FLAGS.trainable, 127 | extend_feature_dim = FLAGS.extend_feature_dim, 128 | pooling = FLAGS.pooling, 129 | position_needed = FLAGS.position_needed, 130 | conv = FLAGS.conv) 131 | cnn.build_graph() 132 | 133 | saver = tf.train.Saver(tf.global_variables(), max_to_keep = 20) 134 | train_writer = tf.summary.FileWriter(log_dir + '/train', sess.graph) 135 | test_writer = tf.summary.FileWriter(log_dir + '/test') 136 | # Initialize all variables 137 | print "build over" 138 | sess.run(tf.global_variables_initializer()) 139 | print 
"variables_initializer" 140 | 141 | map_max = 0.65 142 | for i in range(FLAGS.num_epochs): 143 | if FLAGS.dns == True: 144 | samples = dns_sample(train,alphabet,q_max_sent_length, 145 | a_max_sent_length,sess,cnn,FLAGS.batch_size,neg_sample_num = 10) 146 | datas = batch_gen_with_pair_dns(samples,FLAGS.batch_size) 147 | print 'load dns datas' 148 | for data in datas: 149 | feed_dict = { 150 | cnn.question:data[0], 151 | cnn.answer:data[1], 152 | cnn.answer_negative:data[2] 153 | } 154 | _, step,loss, accuracy,score12,score13 = sess.run( 155 | [cnn.train_op, cnn.global_step,cnn.loss, cnn.accuracy,cnn.score12,cnn.score13], 156 | feed_dict) 157 | time_str = datetime.datetime.now().isoformat() 158 | print("{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}".format(time_str, step, loss, accuracy,np.mean(score12),np.mean(score13))) 159 | line = "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}".format(time_str, step, loss, accuracy,np.mean(score12),np.mean(score13)) 160 | else: 161 | d = get_overlap_dict(train,alphabet,q_len = q_max_sent_length,a_len = a_max_sent_length) 162 | datas = batch_gen_with_pair_overlap(train,alphabet,FLAGS.batch_size, 163 | q_len = q_max_sent_length,a_len = a_max_sent_length,fresh = FLAGS.fresh,overlap_dict = d) 164 | print "load data" 165 | for data in datas: 166 | feed_dict = { 167 | cnn.question: data[0], 168 | cnn.answer: data[1], 169 | cnn.answer_negative:data[2], 170 | cnn.q_pos_overlap:data[3], 171 | cnn.q_neg_overlap:data[4], 172 | cnn.a_pos_overlap:data[5], 173 | cnn.a_neg_overlap:data[6], 174 | cnn.q_position:data[7], 175 | cnn.a_pos_position:data[8], 176 | cnn.a_neg_position:data[9] 177 | } 178 | _, summary,step,loss, accuracy,score12,score13 = sess.run( 179 | [cnn.train_op, cnn.merged,cnn.global_step,cnn.loss, cnn.accuracy,cnn.score12,cnn.score13], 180 | feed_dict) 181 | train_writer.add_summary(summary, i) 182 | time_str = datetime.datetime.now().isoformat() 183 | print("{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}".format(time_str, step, loss, accuracy,np.mean(score12),np.mean(score13))) 184 | line = "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}".format(time_str, step, loss, accuracy,np.mean(score12),np.mean(score13)) 185 | # print loss 186 | if i % 1 == 0: 187 | predicted_dev = predict(sess,cnn,dev,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 188 | map_mrr_dev = evaluation.evaluationBypandas(dev,predicted_dev) 189 | predicted_test = predict(sess,cnn,test,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 190 | map_mrr_test = evaluation.evaluationBypandas(test,predicted_test) 191 | 192 | print "{}:epoch:dev map mrr {}".format(i,map_mrr_dev) 193 | print "{}:epoch:test map mrr {}".format(i,map_mrr_test) 194 | line = " {}:epoch: map_dev{}-------map_mrr_test{}".format(i,map_mrr_dev[0],map_mrr_test) 195 | if map_mrr_dev[0] > map_max: 196 | map_max = map_mrr_dev[0] 197 | # timeStamp = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time()))) 198 | 199 | save_path = saver.save(sess, out_dir) 200 | print "Model saved in file: ", save_path 201 | 202 | log.write(line + '\n') 203 | log.flush() 204 | print 'train over' 205 | saver.restore(sess, out_dir) 206 | predicted = predict(sess,cnn,train,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 207 | train['predicted'] = predicted 208 | train['predicted'].to_csv('train.QApair.TJU_IR_QA2017_train.score',index = False,sep = '\t') 209 | map_mrr_train = evaluation.evaluationBypandas(train,predicted) 210 | 211 | 
predicted_dev = predict(sess,cnn,dev,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 212 | dev['predicted'] = predicted_dev 213 | dev['predicted'].to_csv('train.QApair.TJU_IR_QA2017_dev.score',index = False,sep = '\t') 214 | map_mrr_dev = evaluation.evaluationBypandas(dev,predicted_dev) 215 | 216 | predicted_test = predict(sess,cnn,test,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length) 217 | 218 | test['predicted'] = predicted_test 219 | test['predicted'].to_csv('train.QApair.TJU_IR_QA2017.score',index = False,sep = '\t') 220 | map_mrr_test = evaluation.evaluationBypandas(test,predicted_test) 221 | 222 | print 'map_mrr train',map_mrr_train 223 | print 'map_mrr dev',map_mrr_dev 224 | print 'map_mrr test',map_mrr_test 225 | log.write(str(map_mrr_train) + '\n') 226 | log.write(str(map_mrr_test) + '\n') 227 | log.write(str(map_mrr_dev) + '\n') 228 | predict(sess,cnn,train[:100],alphabet,20,q_max_sent_length,a_max_sent_length) 229 | 230 | 231 | 232 | 233 | if __name__ == '__main__': 234 | # test_quora() 235 | # predicted_pair() 236 | test_pair_wise() 237 | # test_point_wise() 238 | --------------------------------------------------------------------------------
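Both run.py and train.py report MAP and MRR through evaluation.evaluationBypandas, whose implementation is not shown in this listing. The snippet below is a minimal sketch of how these metrics are typically computed over per-question groups using the same question/flag/score columns; the actual evaluation.py may differ in its details, so treat this as an illustration rather than the repository's code.

import pandas as pd

def average_precision(group):
    # AP for one question: answers ranked by predicted score, 'flag' marks the correct ones
    ranked = group.sort_values('score', ascending=False)['flag'].values
    hits, precisions = 0, []
    for rank, flag in enumerate(ranked, start=1):
        if flag == 1:
            hits += 1
            precisions.append(hits / rank)
    return sum(precisions) / max(hits, 1)

def reciprocal_rank(group):
    ranked = group.sort_values('score', ascending=False)['flag'].values
    for rank, flag in enumerate(ranked, start=1):
        if flag == 1:
            return 1.0 / rank
    return 0.0

df = pd.DataFrame({'question': ['q1'] * 3 + ['q2'] * 3,
                   'flag':     [0, 1, 0, 1, 0, 0],
                   'score':    [0.2, 0.9, 0.4, 0.3, 0.8, 0.1]})
print('MAP:', df.groupby('question').apply(average_precision).mean())   # 0.75
print('MRR:', df.groupby('question').apply(reciprocal_rank).mean())     # 0.75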