├── README.md ├── data_iterator.py ├── model.py ├── model_char.py ├── model_word.py ├── train.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This is a variation of the ESIM model combined with a language model, and the language model is very powerful. 3 | # Note 4 | In this competition we do not use any manual features or rules; the model with the language model alone reaches the top 16. 5 | 6 | (More details will be described soon.) 7 | -------------------------------------------------------------------------------- /data_iterator.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import numpy as np 3 | import random 4 | import math 5 | import json 6 | import pandas as pd 7 | random.seed(2018) 8 | class TextIterator: 9 | """Simple Bitext iterator.""" 10 | def __init__(self,mode,hparams,batch_size,file_name): 11 | self.mode=mode 12 | self.hparams=hparams 13 | self.batch_size=batch_size 14 | df=pd.read_csv(file_name) 15 | index=list(range(len(df))) 16 | if self.mode=='train': 17 | index=index[int(len(df)*hparams.idx*1.0/hparams.all_process):]+index[:int(len(df)*hparams.idx*1.0/hparams.all_process)] 18 | self.data=df.iloc[index][['words_1','chars_1','words_2','chars_2','label']].values 19 | self.word_num=df.iloc[index][hparams.word_num_features].values 20 | self.char_num=df.iloc[index][hparams.char_num_features].values 21 | 22 | 23 | 24 | self.idx=0 25 | 26 | def reset(self): 27 | self.idx=0 28 | 29 | def next(self): 30 | if self.idx>=len(self.data): 31 | self.reset() 32 | raise StopIteration 33 | words1=[] 34 | chars1=[] 35 | words_EM1=[] 36 | chars_EM1=[] 37 | words1_len=[] 38 | chars1_len=[] 39 | words2=[] 40 | chars2=[] 41 | words_EM2=[] 42 | chars_EM2=[] 43 | words2_len=[] 44 | chars2_len=[] 45 | label=[] 46 | 47 | word_num=[] 48 | char_num=[] 49 | 50 | while self.idx']*(max_word1_len-words1_len[i]) 74 | words2[i]=['BOS']+words2[i]+['EOS']+['']*(max_word2_len-words2_len[i]) 75 | chars1[i]=['BOS']+chars1[i]+['EOS']+['']*(max_char1_len-chars1_len[i]) 76 | chars2[i]=['BOS']+chars2[i]+['EOS']+['']*(max_char2_len-chars2_len[i]) 77 | 78 | 79 | 80 | 81 | 82 | 83 | return (words1,chars1,words1_len,chars1_len,),(words2,chars2,words2_len,chars2_len),label,word_num,char_num 84 | 85 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import data_iterator 3 | from tensorflow.python.ops import lookup_ops 4 | from tensorflow.python.layers import core as layers_core 5 | import time 6 | import numpy as np 7 | import pickle 8 | import utils 9 | from sklearn.metrics import log_loss 10 | import os 11 | import pandas as pd 12 | from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm 13 | import math 14 | class Model(object): 15 | def __init__(self,hparams,mode): 16 | self.mode=mode 17 | self.hparams=hparams 18 | params = tf.trainable_variables() 19 | #define placeholder 20 | self.vocab_table_word=lookup_ops.index_table_from_file('pre_data/vocab_word.txt', default_value=0) 21 | self.vocab_table_char=lookup_ops.index_table_from_file('pre_data/vocab_char.txt', default_value=0) 22 | self.norm_trainable=tf.placeholder(tf.bool) 23 | self.q1={} 24 | self.q2={} 25 | self.label=tf.placeholder(shape=(None,),dtype=tf.float32) 26 | 27 | for q in [self.q1,self.q2]: 28 | q['words']=tf.placeholder(shape=(None,None), dtype=tf.string)
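# This loop gives each of q1/q2 placeholders for its raw word/char token strings, the corresponding sequence lengths, and the dense numeric feature vectors supplied by data_iterator.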
29 | q['words_len']=tf.placeholder(shape=(None,), dtype=tf.int32) 30 | q['chars']=tf.placeholder(shape=(None,None), dtype=tf.string) 31 | q['chars_len']=tf.placeholder(shape=(None,), dtype=tf.int32) 32 | q['words_num']=tf.placeholder(shape=(None,len(hparams.word_num_features)), dtype=tf.float32) 33 | q['chars_num']=tf.placeholder(shape=(None,len(hparams.char_num_features)), dtype=tf.float32) 34 | 35 | 36 | 37 | 38 | 39 | #build graph 40 | self.build_graph(hparams) 41 | 42 | #build optimizer 43 | self.optimizer(hparams) 44 | params = tf.trainable_variables() 45 | self.saver = tf.train.Saver(tf.global_variables()) 46 | elmo_param=[] 47 | for param in tf.global_variables(): 48 | if 'elmo' in param.name: 49 | elmo_param.append(param) 50 | self.pretrain_saver = tf.train.Saver(elmo_param) 51 | utils.print_out("# Trainable variables") 52 | for param in params: 53 | if hparams.pretrain is False and 'elmo' in param.name: 54 | continue 55 | else: 56 | utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),param.op.device)) 57 | 58 | 59 | def build_graph(self, hparams): 60 | with tf.variable_scope('elmo') as scope: 61 | self.build_embedding_layer(hparams,trainable=False,scope_name='embedding') 62 | self.build_bilstm(hparams,scope_name='bilstm') 63 | if hparams.pretrain: 64 | self.cost=self.build_elmo_logits(hparams) 65 | return 66 | 67 | self.build_encoder(hparams,scope_name='encoder') 68 | self.build_interaction(hparams,scope_name='interaction') 69 | self.build_decoder(hparams,scope_name='decoder') 70 | logits=self.build_mlp(hparams) 71 | self.cost=self.compute_loss(hparams,logits) 72 | 73 | 74 | 75 | def build_embedding_layer(self,hparams,trainable,scope_name): 76 | #create embedding layer 77 | word_vocab={} 78 | char_vocab={} 79 | with open('pre_data/vocab_word.txt','r') as f: 80 | for line in f: 81 | word=line.strip() 82 | word_vocab[word]=len(word_vocab) 83 | 84 | word_embedding=np.random.randn(len(word_vocab), 300)*0.1 85 | hparams.word_vocab_size=len(word_vocab) 86 | if hparams.word_embedding: 87 | with open(hparams.word_embedding, 'r') as f: 88 | for line in f: 89 | temp=line.split() 90 | word=temp[0] 91 | vector=temp[1:] 92 | if word in word_vocab: 93 | word_embedding[word_vocab[word],:]=vector 94 | self.word_embedding=tf.Variable(word_embedding,trainable=trainable,dtype=tf.float32) 95 | 96 | with open('pre_data/vocab_char.txt','r') as f: 97 | for line in f: 98 | char=line.strip() 99 | char_vocab[char]=len(char_vocab) 100 | 101 | char_embedding=np.random.randn(len(char_vocab), 300)*0.1 102 | hparams.char_vocab_size=len(char_vocab) 103 | if hparams.char_embedding: 104 | with open(hparams.char_embedding, 'r') as f: 105 | for line in f: 106 | temp=line.split() 107 | char=temp[0] 108 | vector=temp[1:] 109 | if char in char_vocab: 110 | char_embedding[char_vocab[char],:]=vector 111 | 112 | self.char_embedding=tf.Variable(char_embedding,trainable=trainable,dtype=tf.float32) 113 | 114 | for q in [self.q1,self.q2]: 115 | words_id=self.vocab_table_word.lookup(q['words']) 116 | q['words_id']=words_id 117 | if hparams.maskdropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 118 | mask=tf.ones(tf.shape(words_id)) 119 | mask=tf.cast(tf.minimum(tf.nn.dropout(mask,1-hparams.maskdropout),1),tf.int64) 120 | words_id=tf.cast(words_id*mask,tf.int32) 121 | q['words_inp'] = tf.gather(self.word_embedding, words_id[:,1:-1]) 122 | 123 | for q in [self.q1,self.q2]: 124 | chars_id=self.vocab_table_char.lookup(q['chars']) 125 | q['chars_id']=chars_id 126 | if hparams.maskdropout > 0.0 and 
self.mode==tf.contrib.learn.ModeKeys.TRAIN: 127 | mask=tf.ones(tf.shape(chars_id)) 128 | mask=tf.cast(tf.minimum(tf.nn.dropout(mask,1-hparams.maskdropout),1),tf.int64) 129 | chars_id=tf.cast(chars_id*mask,tf.int32) 130 | q['chars_inp'] = tf.gather(self.char_embedding, chars_id[:,1:-1]) 131 | 132 | def build_bilstm(self,hparams,scope_name): 133 | with tf.variable_scope(scope_name+'_words') as scope: 134 | fw_cell,bw_cell= self._build_encoder_cell(hparams,num_layer=4,num_units=300,encoder_type='bi',dropout=0.5 if hparams.pretrain else 0.0) 135 | W = layers_core.Dense(512,activation=tf.nn.relu, use_bias=False, name="W") 136 | for q in [self.q1,self.q2]: 137 | words_inp = q['words_inp'] 138 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,words_inp,dtype=tf.float32, sequence_length=q['words_len'],time_major=False,swap_memory=True) 139 | q['word_elmo_lstm']=bi_outputs 140 | q['word_elmo_output']=[W(x) for x in bi_outputs] 141 | q['word_elmo_label']=[q['words_id'][:,2:],q['words_id'][:,:-2]] 142 | 143 | with tf.variable_scope(scope_name+'_chars') as scope: 144 | fw_cell,bw_cell= self._build_encoder_cell(hparams,num_layer=4,num_units=300,encoder_type='bi',dropout=0.5 if hparams.pretrain else 0.0) 145 | W = layers_core.Dense(512,activation=tf.nn.relu, use_bias=False, name="W") 146 | for q in [self.q1,self.q2]: 147 | chars_inp = q['chars_inp'] 148 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,chars_inp,dtype=tf.float32, sequence_length=q['chars_len'],time_major=False,swap_memory=True) 149 | q['char_elmo_lstm']=bi_outputs 150 | q['char_elmo_output']=[W(x) for x in bi_outputs] 151 | q['char_elmo_label']=[q['chars_id'][:,2:],q['chars_id'][:,:-2]] 152 | 153 | def build_elmo_logits(self,hparams): 154 | costs=[] 155 | with tf.variable_scope("softmax_words") as scope: 156 | nce_weights= tf.Variable(\ 157 | tf.truncated_normal([hparams.word_vocab_size,512],stddev=1.0/math.sqrt(512))) 158 | nce_biases=tf.Variable(tf.zeros([hparams.word_vocab_size])) 159 | for q in [self.q1,self.q2]: 160 | for i in range(2): 161 | mask = tf.sequence_mask(q['words_len'], tf.shape(q['word_elmo_output'][i])[-2], dtype=tf.float32) 162 | mask=tf.reshape(mask,[-1]) 163 | inputs=tf.reshape(q['word_elmo_output'][i],[-1,512]) 164 | labels=tf.reshape(q['word_elmo_label'][i],[-1,1]) 165 | cost=tf.nn.nce_loss(weights=nce_weights,biases=nce_biases,labels=labels,inputs=inputs,num_sampled=32,num_classes=hparams.word_vocab_size) 166 | cost=tf.reduce_sum(cost*mask)/tf.reduce_sum(mask) 167 | costs.append(cost) 168 | 169 | 170 | with tf.variable_scope("softmax_chars") as scope: 171 | nce_weights= tf.Variable(\ 172 | tf.truncated_normal([hparams.char_vocab_size,512],stddev=1.0/math.sqrt(512))) 173 | nce_biases=tf.Variable(tf.zeros([hparams.char_vocab_size])) 174 | for q in [self.q1,self.q2]: 175 | for i in range(2): 176 | mask = tf.sequence_mask(q['chars_len'], tf.shape(q['char_elmo_output'][i])[-2], dtype=tf.float32) 177 | mask=tf.reshape(mask,[-1]) 178 | inputs=tf.reshape(q['char_elmo_output'][i],[-1,512]) 179 | labels=tf.reshape(q['char_elmo_label'][i],[-1,1]) 180 | cost=tf.nn.nce_loss(weights=nce_weights,biases=nce_biases,labels=labels,inputs=inputs,num_sampled=32,num_classes=hparams.char_vocab_size) 181 | cost=tf.reduce_sum(cost*mask)/tf.reduce_sum(mask) 182 | costs.append(cost) 183 | 184 | loss=tf.reduce_mean(costs) 185 | return loss 186 | 187 | def build_encoder(self,hparams,scope_name): 188 | with tf.variable_scope(scope_name+'_words') as scope: 189 | #encoding words 190 | 
fw_cell,bw_cell= self._build_encoder_cell(hparams) 191 | for q in [self.q1,self.q2]: 192 | words_inp = tf.transpose(tf.concat(q['word_elmo_output']+[q['words_inp']],-1),[1,0,2]) 193 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,words_inp,dtype=tf.float32, sequence_length=q['words_len'],time_major=True,swap_memory=True) 194 | q['word_encoder_output']=tf.transpose(tf.concat(bi_outputs,-1),[1,0,2]) 195 | q['word_encoder_hidden']=bi_state 196 | 197 | 198 | with tf.variable_scope(scope_name+'_chars') as scope: 199 | #encoding chars 200 | fw_cell,bw_cell= self._build_encoder_cell(hparams) 201 | for q in [self.q1,self.q2]: 202 | chars_inp = tf.transpose(tf.concat(q['char_elmo_output']+[q['chars_inp']],-1),[1,0,2]) 203 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,chars_inp,dtype=tf.float32, sequence_length=q['chars_len'],time_major=True,swap_memory=True) 204 | q['char_encoder_output']=tf.transpose(tf.concat(bi_outputs,-1),[1,0,2]) 205 | q['char_encoder_hidden']=bi_state 206 | return 207 | 208 | def build_decoder(self,hparams,scope_name): 209 | with tf.variable_scope(scope_name+'_words') as scope: 210 | fw_cell,bw_cell= self._build_encoder_cell(hparams) 211 | for q in [self.q1,self.q2]: 212 | decoder_inp=tf.transpose(q['word_interaction'],[1,0,2]) 213 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,decoder_inp,dtype=tf.float32, sequence_length=q['words_len'],time_major=True,swap_memory=True) 214 | bi_outputs=tf.concat(bi_outputs,-1) 215 | #bi_outputs=self.HighwayNetwork(bi_outputs) 216 | q['word_decoder_output']=tf.transpose(bi_outputs,[1,0,2]) 217 | 218 | 219 | with tf.variable_scope(scope_name+'_chars') as scope: 220 | fw_cell,bw_cell= self._build_encoder_cell(hparams) 221 | for q in [self.q1,self.q2]: 222 | decoder_inp=tf.transpose(q['char_interaction'],[1,0,2]) 223 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,decoder_inp,dtype=tf.float32, sequence_length=q['chars_len'],time_major=True,swap_memory=True) 224 | bi_outputs=tf.concat(bi_outputs,-1) 225 | #bi_outputs=self.HighwayNetwork(bi_outputs) 226 | q['char_decoder_output']=tf.transpose(bi_outputs,[1,0,2]) 227 | 228 | 229 | 230 | 231 | 232 | def build_interaction(self,hparams,scope_name): 233 | with tf.variable_scope(scope_name+'_words') as scope: 234 | for q in [(self.q1,self.q2),(self.q2,self.q1)]: 235 | encoder_hidden=q[0]['word_encoder_output'] 236 | weight=tf.reduce_sum(encoder_hidden[:,:,None,:]*q[1]['word_encoder_output'][:,None,:,:],-1) 237 | mask = tf.sequence_mask(q[1]['words_len'], tf.shape(weight)[-1], dtype=tf.float32) 238 | weight=tf.nn.softmax(weight)*mask[:,None,:] 239 | weight=weight/(tf.reduce_sum(weight,-1)[:,:,None]+0.000001) 240 | word_inter=tf.reduce_sum(q[1]['word_encoder_output'][:,None,:,:]*weight[:,:,:,None],-2) 241 | q[0]['word_interaction']=tf.concat([encoder_hidden,word_inter,tf.abs(encoder_hidden-word_inter),encoder_hidden*word_inter],-1) 242 | 243 | with tf.variable_scope(scope_name+'_chars') as scope: 244 | for q in [(self.q1,self.q2),(self.q2,self.q1)]: 245 | encoder_hidden=q[0]['char_encoder_output'] 246 | weight=tf.reduce_sum(encoder_hidden[:,:,None,:]*q[1]['char_encoder_output'][:,None,:,:],-1) 247 | mask = tf.sequence_mask(q[1]['chars_len'], tf.shape(weight)[-1], dtype=tf.float32) 248 | weight=tf.nn.softmax(weight)*mask[:,None,:] 249 | weight=weight/(tf.reduce_sum(weight,-1)[:,:,None]+0.000001) 250 | char_inter=tf.reduce_sum(q[1]['char_encoder_output'][:,None,:,:]*weight[:,:,:,None],-2) 251 | 
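# ESIM-style local inference enhancement: each encoding is concatenated with its soft-aligned counterpart, their absolute difference, and their element-wise product.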
q[0]['char_interaction']=tf.concat([encoder_hidden,char_inter,tf.abs(encoder_hidden-char_inter),encoder_hidden*char_inter],-1) 252 | return 253 | 254 | 255 | def build_mlp(self,hparams): 256 | hidden_word=[] 257 | with tf.variable_scope("MLP_words") as scope: 258 | attention_W = layers_core.Dense(hparams.hidden_size,activation=tf.nn.relu, use_bias=False, name="attention_W") 259 | attention_V = layers_core.Dense(1,use_bias=False, name="attention_V") 260 | for q in [self.q1,self.q2]: 261 | weight=tf.nn.softmax(tf.reduce_sum(attention_V(attention_W(q['word_decoder_output'])),-1)) 262 | mask = tf.sequence_mask(q['words_len'], tf.shape(weight)[-1], dtype=tf.float32) 263 | weight=weight*mask 264 | weight=weight/(tf.reduce_sum(weight,-1)[:,None]+0.000001) 265 | context_hidden=tf.reduce_sum(q['word_decoder_output']*weight[:,:,None],1) 266 | q['word_rep']=context_hidden 267 | hidden_word=[self.q1['word_rep'],self.q2['word_rep'],self.q1['word_rep']*self.q2['word_rep']] 268 | 269 | 270 | hidden_word.append(self.q1['words_num']) 271 | 272 | 273 | with tf.variable_scope("MLP_chars") as scope: 274 | attention_W = layers_core.Dense(hparams.hidden_size,activation=tf.nn.relu, use_bias=False, name="attention_W") 275 | attention_V = layers_core.Dense(1,use_bias=False, name="attention_V") 276 | for q in [self.q1,self.q2]: 277 | weight=tf.nn.softmax(tf.reduce_sum(attention_V(attention_W(q['char_decoder_output'])),-1)) 278 | mask = tf.sequence_mask(q['chars_len'], tf.shape(weight)[-1], dtype=tf.float32) 279 | weight=weight*mask 280 | weight=weight/(tf.reduce_sum(weight,-1)[:,None]+0.000001) 281 | context_hidden=tf.reduce_sum(q['char_decoder_output']*weight[:,:,None],1) 282 | q['char_rep']=context_hidden 283 | hidden_char=[self.q1['char_rep'],self.q2['char_rep'],self.q1['char_rep']*self.q2['char_rep']] 284 | 285 | 286 | hidden_char.append(self.q1['chars_num']) 287 | 288 | with tf.variable_scope("MLP_words") as scope: 289 | layer_W = layers_core.Dense(hparams.hidden_size,activation=tf.nn.tanh, use_bias=False, name="ff_layer") 290 | hidden_word=tf.concat(hidden_word,-1) 291 | logits=layer_W(hidden_word) 292 | if hparams.dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 293 | logits = tf.nn.dropout(logits,1-hparams.dropout) 294 | layer_W = layers_core.Dense(1, use_bias=False, name="ff_layer_output") 295 | logits_word=layer_W(logits)[:,0] 296 | with tf.variable_scope("MLP_chars") as scope: 297 | layer_W = layers_core.Dense(hparams.hidden_size,activation=tf.nn.tanh, use_bias=False, name="ff_layer") 298 | hidden_char=tf.concat(hidden_char,-1) 299 | logits=layer_W(hidden_char) 300 | if hparams.dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 301 | logits = tf.nn.dropout(logits,1-hparams.dropout) 302 | layer_W = layers_core.Dense(1, use_bias=False, name="ff_layer_output") 303 | logits_char=layer_W(logits)[:,0] 304 | logits=logits_word+logits_char 305 | return logits 306 | 307 | 308 | def compute_loss(self,hparams,logits): 309 | self.prob=tf.nn.sigmoid(logits) 310 | loss=-tf.reduce_mean(self.label*tf.log(self.prob+0.0001)+(1-self.label)*tf.log(1-self.prob+0.0001),-1) 311 | return loss 312 | 313 | def HighwayNetwork(self,inputs, num_layers=2, function='relu', 314 | keep_prob=0.8, scope='HN'): 315 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 316 | if function == 'relu': 317 | function = tf.nn.relu 318 | elif function == 'tanh': 319 | function = tf.nn.tanh 320 | else: 321 | raise NotImplementedError 322 | hidden_size = inputs.get_shape().as_list()[-1] 323 | memory = inputs 324 | for layer 
in range(num_layers): 325 | with tf.variable_scope('layer_%d' % (layer)): 326 | H = layers_core.Dense(hidden_size,activation=function, use_bias=True, name="h") 327 | T = layers_core.Dense(hidden_size,activation=function, use_bias=True, name="t") 328 | h = H(memory) 329 | t = T(memory) 330 | memory = h * t + (1-t) * memory 331 | if keep_prob > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 332 | outputs = tf.nn.dropout(memory,keep_prob) 333 | else: 334 | outputs = memory 335 | return outputs 336 | 337 | def _build_encoder_cell(self,hparams,num_layer=None,num_units=None,encoder_type=None,dropout=None,forget_bias=None): 338 | num_layer=num_layer or hparams.num_layer 339 | num_units=num_units or hparams.num_units 340 | encoder_type=encoder_type or hparams.encoder_type 341 | dropout=dropout or hparams.dropout 342 | forget_bias=forget_bias or hparams.forget_bias 343 | if encoder_type=="uni": 344 | cell_list = [] 345 | for i in range(num_layer): 346 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,forget_bias=hparams.forget_bias) 347 | # Dropout (= 1 - keep_prob) 348 | if dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 349 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 350 | 351 | cell_list.append(single_cell) 352 | if len(cell_list) == 1: # Single layer. 353 | return cell_list[0] 354 | else: # Multi layers 355 | return tf.contrib.rnn.MultiRNNCell(cell_list) 356 | else: 357 | num_bi_layers = int(num_layer / 2) 358 | fw_cell_list=[] 359 | bw_cell_list=[] 360 | for i in range(num_bi_layers): 361 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,forget_bias=forget_bias) 362 | if dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 363 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 364 | 365 | fw_cell_list.append(single_cell) 366 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,forget_bias=forget_bias) 367 | if dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 368 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 369 | 370 | bw_cell_list.append(single_cell) 371 | 372 | if num_bi_layers == 1: # Single layer. 
373 | fw_cell=fw_cell_list[0] 374 | bw_cell=bw_cell_list[0] 375 | else: # Multi layers 376 | fw_cell=tf.contrib.rnn.MultiRNNCell(fw_cell_list) 377 | bw_cell=tf.contrib.rnn.MultiRNNCell(bw_cell_list) 378 | return fw_cell,bw_cell 379 | def dey_lrate(self,sess,lrate): 380 | sess.run(tf.assign(self.lrate,lrate)) 381 | 382 | def optimizer(self,hparams): 383 | self.lrate=tf.Variable(hparams.learning_rate,trainable=False) 384 | if hparams.op=='sgd': 385 | opt = tf.train.GradientDescentOptimizer(self.lrate) 386 | elif hparams.op=='adam': 387 | opt = tf.train.AdamOptimizer(self.lrate,beta1=0.9, beta2=0.999,epsilon=1e-8) 388 | params = tf.trainable_variables() 389 | 390 | 391 | gradients = tf.gradients(self.cost,params,colocate_gradients_with_ops=True) 392 | clipped_grads, gradient_norm = tf.clip_by_global_norm(gradients, 5.0) 393 | self.grad_norm =gradient_norm 394 | self.update = opt.apply_gradients(zip(clipped_grads, params)) 395 | 396 | def batch_norm_layer(self, x, train_phase, scope_bn): 397 | z = tf.cond(train_phase, lambda: batch_norm(x, decay=self.hparams.batch_norm_decay, center=True, scale=True, updates_collections=None,is_training=True, reuse=None, trainable=True, scope=scope_bn), lambda: batch_norm(x, decay=self.hparams.batch_norm_decay, center=True, scale=True, updates_collections=None,is_training=False, reuse=True, trainable=True, scope=scope_bn)) 398 | return z 399 | 400 | def train(self,sess,iterator): 401 | assert self.mode == tf.contrib.learn.ModeKeys.TRAIN 402 | q1,q2,label,words_num,chars_num=iterator.next() 403 | dic={} 404 | dic[self.q1['words']]=q1[0] 405 | dic[self.q1['chars']]=q1[1] 406 | dic[self.q1['words_len']]=q1[2] 407 | dic[self.q1['chars_len']]=q1[3] 408 | dic[self.q1['words_num']]=words_num 409 | dic[self.q1['chars_num']]=chars_num 410 | dic[self.q2['words']]=q2[0] 411 | dic[self.q2['chars']]=q2[1] 412 | dic[self.q2['words_len']]=q2[2] 413 | dic[self.q2['chars_len']]=q2[3] 414 | dic[self.label]=label 415 | dic[self.norm_trainable]=True 416 | 417 | 418 | 419 | return sess.run([self.cost,self.update,self.grad_norm],feed_dict=dic) 420 | 421 | def pretrain_infer(self,sess,iterator): 422 | assert self.mode == tf.contrib.learn.ModeKeys.INFER 423 | q1,q2,label,words_num,chars_num=iterator.next() 424 | dic={} 425 | dic[self.q1['words']]=q1[0] 426 | dic[self.q1['chars']]=q1[1] 427 | dic[self.q1['words_len']]=q1[2] 428 | dic[self.q1['chars_len']]=q1[3] 429 | dic[self.q2['words']]=q2[0] 430 | dic[self.q2['chars']]=q2[1] 431 | dic[self.q2['words_len']]=q2[2] 432 | dic[self.q2['chars_len']]=q2[3] 433 | dic[self.label]=label 434 | 435 | 436 | 437 | return sess.run(self.cost,feed_dict=dic) 438 | 439 | 440 | def infer(self,sess,iterator): 441 | assert self.mode == tf.contrib.learn.ModeKeys.INFER 442 | q1,q2,label,words_num,chars_num=iterator.next() 443 | dic={} 444 | dic[self.q1['words']]=q1[0] 445 | dic[self.q1['chars']]=q1[1] 446 | dic[self.q1['words_len']]=q1[2] 447 | dic[self.q1['chars_len']]=q1[3] 448 | dic[self.q1['words_num']]=words_num 449 | dic[self.q1['chars_num']]=chars_num 450 | dic[self.q2['words']]=q2[0] 451 | dic[self.q2['chars']]=q2[1] 452 | dic[self.q2['words_len']]=q2[2] 453 | dic[self.q2['chars_len']]=q2[3] 454 | dic[self.norm_trainable]=False 455 | dic[self.label]=label 456 | 457 | 458 | 459 | prob1=sess.run(self.prob,feed_dict=dic) 460 | dic[self.q2['words']]=q1[0] 461 | dic[self.q2['chars']]=q1[1] 462 | dic[self.q2['words_len']]=q1[2] 463 | dic[self.q2['chars_len']]=q1[3] 464 | dic[self.q1['words']]=q2[0] 465 | dic[self.q1['chars']]=q2[1] 466 | 
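# The feeds below finish swapping q1 and q2; prob1 and prob2 are then averaged so the prediction is symmetric in the two questions.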
dic[self.q1['words_len']]=q2[2] 467 | dic[self.q1['chars_len']]=q2[3] 468 | 469 | 470 | dic[self.label]=label 471 | prob2=sess.run(self.prob,feed_dict=dic) 472 | return (prob1+prob2)/2.0 473 | 474 | 475 | def train(hparams): 476 | 477 | hparams.num_units=300 478 | if hparams.pretrain: 479 | hparams.learning_rate=0.001 480 | config_proto = tf.ConfigProto(log_device_placement=0,allow_soft_placement=0) 481 | config_proto.gpu_options.allow_growth = True 482 | train_graph = tf.Graph() 483 | infer_graph = tf.Graph() 484 | 485 | with train_graph.as_default(): 486 | train_model=Model(hparams,tf.contrib.learn.ModeKeys.TRAIN) 487 | train_sess=tf.Session(graph=train_graph,config=config_proto) 488 | train_sess.run(tf.global_variables_initializer()) 489 | train_sess.run(tf.tables_initializer()) 490 | 491 | with infer_graph.as_default(): 492 | infer_model=Model(hparams,tf.contrib.learn.ModeKeys.INFER) 493 | infer_sess=tf.Session(graph=infer_graph,config=config_proto) 494 | infer_sess.run(tf.global_variables_initializer()) 495 | infer_sess.run(tf.tables_initializer()) 496 | 497 | train_model.pretrain_saver.restore(train_sess,'pretrain_model/best_model') 498 | decay=0 499 | pay_attention=0 500 | global_step=0 501 | train_loss=0 502 | train_norm=0 503 | best_score=1000 504 | epoch=0 505 | flag=False 506 | if hparams.pretrain: 507 | train_iterator=data_iterator.TextIterator('train',hparams,32,'pre_data/train.csv') 508 | dev_iterator=data_iterator.TextIterator('dev',hparams,512,'pre_data/dev.csv') 509 | test_iterator=data_iterator.TextIterator('test',hparams,512,'pre_data/test.csv') 510 | while True: 511 | start_time = time.time() 512 | try: 513 | cost,_,norm=train_model.train(train_sess,train_iterator) 514 | global_step+=1 515 | train_loss+=cost 516 | train_norm+=norm 517 | except StopIteration: 518 | continue 519 | if global_step%hparams.num_display_steps==0: 520 | info={} 521 | info['learning_rate']=hparams.learning_rate 522 | info["avg_step_time"]=(time.time()-start_time)/hparams.num_display_steps 523 | start_time = time.time() 524 | info["train_ppl"]= train_loss / hparams.num_display_steps 525 | info["avg_grad_norm"]=train_norm/hparams.num_display_steps 526 | train_loss=0 527 | train_norm=0 528 | utils.print_step_info(" ", global_step, info) 529 | if global_step%hparams.num_eval_steps==0: 530 | train_model.saver.save(train_sess,'pretrain_model/model') 531 | with infer_graph.as_default(): 532 | infer_model.saver.restore(infer_sess,'pretrain_model/model') 533 | loss=[] 534 | while True: 535 | try: 536 | cost=infer_model.pretrain_infer(infer_sess,dev_iterator) 537 | loss.append(cost) 538 | except StopIteration: 539 | break 540 | logloss=round(np.mean(loss),5) 541 | if logloss 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 118 | mask=tf.ones(tf.shape(words_id)) 119 | mask=tf.cast(tf.minimum(tf.nn.dropout(mask,1-hparams.maskdropout),1),tf.int64) 120 | words_id=tf.cast(words_id*mask,tf.int32) 121 | q['words_inp'] = tf.gather(self.word_embedding, words_id[:,1:-1]) 122 | 123 | for q in [self.q1,self.q2]: 124 | chars_id=self.vocab_table_char.lookup(q['chars']) 125 | q['chars_id']=chars_id 126 | if hparams.maskdropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 127 | mask=tf.ones(tf.shape(chars_id)) 128 | mask=tf.cast(tf.minimum(tf.nn.dropout(mask,1-hparams.maskdropout),1),tf.int64) 129 | chars_id=tf.cast(chars_id*mask,tf.int32) 130 | q['chars_inp'] = tf.gather(self.char_embedding, chars_id[:,1:-1]) 131 | 132 | def build_bilstm(self,hparams,scope_name): 133 | with 
tf.variable_scope(scope_name+'_words') as scope: 134 | fw_cell,bw_cell= self._build_encoder_cell(hparams,num_layer=4,num_units=300,encoder_type='bi',dropout=0.5 if hparams.pretrain else 0.0) 135 | W = layers_core.Dense(512,activation=tf.nn.relu, use_bias=False, name="W") 136 | for q in [self.q1,self.q2]: 137 | words_inp = q['words_inp'] 138 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,words_inp,dtype=tf.float32, sequence_length=q['words_len'],time_major=False,swap_memory=True) 139 | q['word_elmo_lstm']=bi_outputs 140 | q['word_elmo_output']=[W(x) for x in bi_outputs] 141 | q['word_elmo_label']=[q['words_id'][:,2:],q['words_id'][:,:-2]] 142 | 143 | with tf.variable_scope(scope_name+'_chars') as scope: 144 | fw_cell,bw_cell= self._build_encoder_cell(hparams,num_layer=4,num_units=300,encoder_type='bi',dropout=0.5 if hparams.pretrain else 0.0) 145 | W = layers_core.Dense(512,activation=tf.nn.relu, use_bias=False, name="W") 146 | for q in [self.q1,self.q2]: 147 | chars_inp = q['chars_inp'] 148 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,chars_inp,dtype=tf.float32, sequence_length=q['chars_len'],time_major=False,swap_memory=True) 149 | q['char_elmo_lstm']=bi_outputs 150 | q['char_elmo_output']=[W(x) for x in bi_outputs] 151 | q['char_elmo_label']=[q['chars_id'][:,2:],q['chars_id'][:,:-2]] 152 | 153 | def build_elmo_logits(self,hparams): 154 | costs=[] 155 | with tf.variable_scope("softmax_chars") as scope: 156 | nce_weights= tf.Variable(\ 157 | tf.truncated_normal([hparams.char_vocab_size,512],stddev=1.0/math.sqrt(512))) 158 | nce_biases=tf.Variable(tf.zeros([hparams.char_vocab_size])) 159 | for q in [self.q1,self.q2]: 160 | for i in range(2): 161 | mask = tf.sequence_mask(q['chars_len'], tf.shape(q['char_elmo_output'][i])[-2], dtype=tf.float32) 162 | mask=tf.reshape(mask,[-1]) 163 | inputs=tf.reshape(q['char_elmo_output'][i],[-1,512]) 164 | labels=tf.reshape(q['char_elmo_label'][i],[-1,1]) 165 | cost=tf.nn.nce_loss(weights=nce_weights,biases=nce_biases,labels=labels,inputs=inputs,num_sampled=32,num_classes=hparams.char_vocab_size) 166 | cost=tf.reduce_sum(cost*mask)/tf.reduce_sum(mask) 167 | costs.append(cost) 168 | 169 | loss=tf.reduce_mean(costs) 170 | return loss 171 | 172 | def build_encoder(self,hparams,scope_name): 173 | with tf.variable_scope(scope_name+'_chars') as scope: 174 | #encoding chars 175 | fw_cell,bw_cell= self._build_encoder_cell(hparams) 176 | for q in [self.q1,self.q2]: 177 | chars_inp = tf.transpose(tf.concat(q['char_elmo_output']+[q['chars_inp']],-1),[1,0,2]) 178 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,chars_inp,dtype=tf.float32, sequence_length=q['chars_len'],time_major=True,swap_memory=True) 179 | q['char_encoder_output']=tf.transpose(tf.concat(bi_outputs,-1),[1,0,2]) 180 | q['char_encoder_hidden']=bi_state 181 | return 182 | 183 | def build_decoder(self,hparams,scope_name): 184 | with tf.variable_scope(scope_name+'_chars') as scope: 185 | fw_cell,bw_cell= self._build_encoder_cell(hparams) 186 | for q in [self.q1,self.q2]: 187 | decoder_inp=tf.transpose(q['char_interaction'],[1,0,2]) 188 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,decoder_inp,dtype=tf.float32, sequence_length=q['chars_len'],time_major=True,swap_memory=True) 189 | q['char_decoder_output']=tf.transpose(tf.concat(bi_outputs,-1),[1,0,2]) 190 | 191 | 192 | 193 | 194 | 195 | def build_interaction(self,hparams,scope_name): 196 | with tf.variable_scope(scope_name+'_chars') as scope: 197 | for q 
in [(self.q1,self.q2),(self.q2,self.q1)]: 198 | encoder_hidden=q[0]['char_encoder_output'] 199 | weight=tf.reduce_sum(encoder_hidden[:,:,None,:]*q[1]['char_encoder_output'][:,None,:,:],-1) 200 | mask = tf.sequence_mask(q[1]['chars_len'], tf.shape(weight)[-1], dtype=tf.float32) 201 | weight=tf.nn.softmax(weight)*mask[:,None,:] 202 | weight=weight/(tf.reduce_sum(weight,-1)[:,:,None]+0.000001) 203 | char_inter=tf.reduce_sum(q[1]['char_encoder_output'][:,None,:,:]*weight[:,:,:,None],-2) 204 | q[0]['char_interaction']=tf.concat([encoder_hidden,char_inter,tf.abs(encoder_hidden-char_inter),encoder_hidden*char_inter],-1) 205 | return 206 | 207 | 208 | def build_mlp(self,hparams): 209 | with tf.variable_scope("MLP_chars") as scope: 210 | attention_W = layers_core.Dense(hparams.hidden_size,activation=tf.nn.relu, use_bias=False, name="attention_W") 211 | attention_V = layers_core.Dense(1,use_bias=False, name="attention_V") 212 | for q in [self.q1,self.q2]: 213 | weight=tf.nn.softmax(tf.reduce_sum(attention_V(attention_W(q['char_decoder_output'])),-1)) 214 | mask = tf.sequence_mask(q['chars_len'], tf.shape(weight)[-1], dtype=tf.float32) 215 | weight=weight*mask 216 | weight=weight/(tf.reduce_sum(weight,-1)[:,None]+0.000001) 217 | context_hidden=tf.reduce_sum(q['char_decoder_output']*weight[:,:,None],1) 218 | q['char_rep']=context_hidden 219 | hidden_char=[self.q1['char_rep'],self.q2['char_rep'],self.q1['char_rep']*self.q2['char_rep']] 220 | 221 | 222 | hidden_char.append(self.q1['chars_num']) 223 | 224 | with tf.variable_scope("MLP_chars") as scope: 225 | layer_W = layers_core.Dense(hparams.hidden_size,activation=tf.nn.tanh, use_bias=False, name="ff_layer") 226 | hidden_char=tf.concat(hidden_char,-1) 227 | logits=layer_W(hidden_char) 228 | if hparams.dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 229 | logits = tf.nn.dropout(logits,1-hparams.dropout) 230 | layer_W = layers_core.Dense(1, use_bias=False, name="ff_layer_output") 231 | logits_char=layer_W(logits)[:,0] 232 | logits=logits_char 233 | return logits 234 | 235 | 236 | def compute_loss(self,hparams,logits): 237 | self.prob=tf.nn.sigmoid(logits) 238 | loss=-tf.reduce_mean(self.label*tf.log(self.prob+0.0001)+(1-self.label)*tf.log(1-self.prob+0.0001),-1) 239 | return loss 240 | 241 | 242 | def _build_encoder_cell(self,hparams,num_layer=None,num_units=None,encoder_type=None,dropout=None,forget_bias=None): 243 | num_layer=num_layer or hparams.num_layer 244 | num_units=num_units or hparams.num_units 245 | encoder_type=encoder_type or hparams.encoder_type 246 | dropout=dropout or hparams.dropout 247 | forget_bias=forget_bias or hparams.forget_bias 248 | if encoder_type=="uni": 249 | cell_list = [] 250 | for i in range(num_layer): 251 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,forget_bias=hparams.forget_bias) 252 | # Dropout (= 1 - keep_prob) 253 | if dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 254 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 255 | 256 | cell_list.append(single_cell) 257 | if len(cell_list) == 1: # Single layer. 
258 | return cell_list[0] 259 | else: # Multi layers 260 | return tf.contrib.rnn.MultiRNNCell(cell_list) 261 | else: 262 | num_bi_layers = int(num_layer / 2) 263 | fw_cell_list=[] 264 | bw_cell_list=[] 265 | for i in range(num_bi_layers): 266 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,forget_bias=forget_bias) 267 | if dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 268 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 269 | 270 | fw_cell_list.append(single_cell) 271 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,forget_bias=forget_bias) 272 | if dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 273 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 274 | 275 | bw_cell_list.append(single_cell) 276 | 277 | if num_bi_layers == 1: # Single layer. 278 | fw_cell=fw_cell_list[0] 279 | bw_cell=bw_cell_list[0] 280 | else: # Multi layers 281 | fw_cell=tf.contrib.rnn.MultiRNNCell(fw_cell_list) 282 | bw_cell=tf.contrib.rnn.MultiRNNCell(bw_cell_list) 283 | return fw_cell,bw_cell 284 | def dey_lrate(self,sess,lrate): 285 | sess.run(tf.assign(self.lrate,lrate)) 286 | 287 | def optimizer(self,hparams): 288 | self.lrate=tf.Variable(hparams.learning_rate,trainable=False) 289 | if hparams.op=='sgd': 290 | opt = tf.train.GradientDescentOptimizer(self.lrate) 291 | elif hparams.op=='adam': 292 | opt = tf.train.AdamOptimizer(self.lrate,beta1=0.9, beta2=0.999,epsilon=1e-8) 293 | params = tf.trainable_variables() 294 | 295 | 296 | gradients = tf.gradients(self.cost,params,colocate_gradients_with_ops=True) 297 | clipped_grads, gradient_norm = tf.clip_by_global_norm(gradients, 5.0) 298 | self.grad_norm =gradient_norm 299 | self.update = opt.apply_gradients(zip(clipped_grads, params)) 300 | 301 | def batch_norm_layer(self, x, train_phase, scope_bn): 302 | z = tf.cond(train_phase, lambda: batch_norm(x, decay=self.hparams.batch_norm_decay, center=True, scale=True, updates_collections=None,is_training=True, reuse=None, trainable=True, scope=scope_bn), lambda: batch_norm(x, decay=self.hparams.batch_norm_decay, center=True, scale=True, updates_collections=None,is_training=False, reuse=True, trainable=True, scope=scope_bn)) 303 | return z 304 | 305 | def train(self,sess,iterator): 306 | assert self.mode == tf.contrib.learn.ModeKeys.TRAIN 307 | q1,q2,label,words_num,chars_num=iterator.next() 308 | dic={} 309 | dic[self.q1['chars']]=q1[1] 310 | dic[self.q1['chars_len']]=q1[3] 311 | dic[self.q1['chars_num']]=chars_num 312 | dic[self.q2['chars']]=q2[1] 313 | dic[self.q2['chars_len']]=q2[3] 314 | dic[self.label]=label 315 | dic[self.norm_trainable]=True 316 | 317 | 318 | 319 | return sess.run([self.cost,self.update,self.grad_norm],feed_dict=dic) 320 | 321 | def pretrain_infer(self,sess,iterator): 322 | assert self.mode == tf.contrib.learn.ModeKeys.INFER 323 | q1,q2,label,words_num,chars_num=iterator.next() 324 | dic={} 325 | dic[self.q1['chars']]=q1[1] 326 | dic[self.q1['chars_len']]=q1[3] 327 | dic[self.q2['chars']]=q2[1] 328 | dic[self.q2['chars_len']]=q2[3] 329 | dic[self.label]=label 330 | 331 | 332 | 333 | return sess.run(self.cost,feed_dict=dic) 334 | 335 | 336 | def infer(self,sess,iterator): 337 | assert self.mode == tf.contrib.learn.ModeKeys.INFER 338 | q1,q2,label,words_num,chars_num=iterator.next() 339 | dic={} 340 | dic[self.q1['chars']]=q1[1] 341 | dic[self.q1['chars_len']]=q1[3] 342 | dic[self.q1['chars_num']]=chars_num 343 | dic[self.q2['chars']]=q2[1] 344 | 
dic[self.q2['chars_len']]=q2[3] 345 | dic[self.norm_trainable]=False 346 | dic[self.label]=label 347 | 348 | 349 | 350 | prob1=sess.run(self.prob,feed_dict=dic) 351 | dic[self.q2['chars']]=q1[1] 352 | dic[self.q2['chars_len']]=q1[3] 353 | dic[self.q1['chars']]=q2[1] 354 | dic[self.q1['chars_len']]=q2[3] 355 | 356 | 357 | dic[self.label]=label 358 | prob2=sess.run(self.prob,feed_dict=dic) 359 | return (prob1+prob2)/2.0 360 | 361 | 362 | def train(hparams): 363 | 364 | 365 | if hparams.pretrain: 366 | hparams.learning_rate=0.001 367 | config_proto = tf.ConfigProto(log_device_placement=0,allow_soft_placement=0) 368 | config_proto.gpu_options.allow_growth = True 369 | train_graph = tf.Graph() 370 | infer_graph = tf.Graph() 371 | 372 | with train_graph.as_default(): 373 | train_model=Model(hparams,tf.contrib.learn.ModeKeys.TRAIN) 374 | train_sess=tf.Session(graph=train_graph,config=config_proto) 375 | train_sess.run(tf.global_variables_initializer()) 376 | train_sess.run(tf.tables_initializer()) 377 | 378 | with infer_graph.as_default(): 379 | infer_model=Model(hparams,tf.contrib.learn.ModeKeys.INFER) 380 | infer_sess=tf.Session(graph=infer_graph,config=config_proto) 381 | infer_sess.run(tf.global_variables_initializer()) 382 | infer_sess.run(tf.tables_initializer()) 383 | 384 | train_model.pretrain_saver.restore(train_sess,'pretrain_model/best_model') 385 | decay=0 386 | pay_attention=0 387 | global_step=0 388 | train_loss=0 389 | train_norm=0 390 | best_score=1000 391 | epoch=0 392 | flag=False 393 | if hparams.pretrain: 394 | train_iterator=data_iterator.TextIterator('train',hparams,32,'pre_data/train.csv') 395 | dev_iterator=data_iterator.TextIterator('dev',hparams,512,'pre_data/dev.csv') 396 | test_iterator=data_iterator.TextIterator('test',hparams,512,'pre_data/test.csv') 397 | while True: 398 | start_time = time.time() 399 | try: 400 | cost,_,norm=train_model.train(train_sess,train_iterator) 401 | global_step+=1 402 | train_loss+=cost 403 | train_norm+=norm 404 | except StopIteration: 405 | continue 406 | if global_step%hparams.num_display_steps==0: 407 | info={} 408 | info['learning_rate']=hparams.learning_rate 409 | info["avg_step_time"]=(time.time()-start_time)/hparams.num_display_steps 410 | start_time = time.time() 411 | info["train_ppl"]= train_loss / hparams.num_display_steps 412 | info["avg_grad_norm"]=train_norm/hparams.num_display_steps 413 | train_loss=0 414 | train_norm=0 415 | utils.print_step_info(" ", global_step, info) 416 | if global_step%hparams.num_eval_steps==0: 417 | train_model.saver.save(train_sess,'pretrain_model/model') 418 | with infer_graph.as_default(): 419 | infer_model.saver.restore(infer_sess,'pretrain_model/model') 420 | loss=[] 421 | while True: 422 | try: 423 | cost=infer_model.pretrain_infer(infer_sess,dev_iterator) 424 | loss.append(cost) 425 | except StopIteration: 426 | break 427 | logloss=round(np.mean(loss),5) 428 | if logloss 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 117 | mask=tf.ones(tf.shape(words_id)) 118 | mask=tf.cast(tf.minimum(tf.nn.dropout(mask,1-hparams.maskdropout),1),tf.int64) 119 | words_id=tf.cast(words_id*mask,tf.int32) 120 | q['words_inp'] = tf.gather(self.word_embedding, words_id[:,1:-1]) 121 | 122 | for q in [self.q1,self.q2]: 123 | chars_id=self.vocab_table_char.lookup(q['chars']) 124 | q['chars_id']=chars_id 125 | if hparams.maskdropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 126 | mask=tf.ones(tf.shape(chars_id)) 127 | mask=tf.cast(tf.minimum(tf.nn.dropout(mask,1-hparams.maskdropout),1),tf.int64) 128 | 
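# maskdropout: the 0/1 mask obtained from tf.nn.dropout randomly zeroes a fraction of token ids during training (a form of token-level dropout); id 0 is the lookup table's default/OOV index.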
chars_id=tf.cast(chars_id*mask,tf.int32) 129 | q['chars_inp'] = tf.gather(self.char_embedding, chars_id[:,1:-1]) 130 | 131 | def build_bilstm(self,hparams,scope_name): 132 | with tf.variable_scope(scope_name+'_words') as scope: 133 | fw_cell,bw_cell= self._build_encoder_cell(hparams,num_layer=4,num_units=300,encoder_type='bi',dropout=0.5 if hparams.pretrain else 0.0) 134 | W = layers_core.Dense(512,activation=tf.nn.relu, use_bias=False, name="W") 135 | for q in [self.q1,self.q2]: 136 | words_inp = q['words_inp'] 137 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,words_inp,dtype=tf.float32, sequence_length=q['words_len'],time_major=False,swap_memory=True) 138 | q['word_elmo_lstm']=bi_outputs 139 | q['word_elmo_output']=[W(x) for x in bi_outputs] 140 | q['word_elmo_label']=[q['words_id'][:,2:],q['words_id'][:,:-2]] 141 | 142 | with tf.variable_scope(scope_name+'_chars') as scope: 143 | fw_cell,bw_cell= self._build_encoder_cell(hparams,num_layer=4,num_units=300,encoder_type='bi',dropout=0.5 if hparams.pretrain else 0.0) 144 | W = layers_core.Dense(512,activation=tf.nn.relu, use_bias=False, name="W") 145 | for q in [self.q1,self.q2]: 146 | chars_inp = q['chars_inp'] 147 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,chars_inp,dtype=tf.float32, sequence_length=q['chars_len'],time_major=False,swap_memory=True) 148 | q['char_elmo_lstm']=bi_outputs 149 | q['char_elmo_output']=[W(x) for x in bi_outputs] 150 | q['char_elmo_label']=[q['chars_id'][:,2:],q['chars_id'][:,:-2]] 151 | def build_elmo_logits(self,hparams): 152 | costs=[] 153 | with tf.variable_scope("softmax_words") as scope: 154 | nce_weights= tf.Variable(\ 155 | tf.truncated_normal([hparams.word_vocab_size,512],stddev=1.0/math.sqrt(512))) 156 | nce_biases=tf.Variable(tf.zeros([hparams.word_vocab_size])) 157 | for q in [self.q1,self.q2]: 158 | for i in range(2): 159 | mask = tf.sequence_mask(q['words_len'], tf.shape(q['word_elmo_output'][i])[-2], dtype=tf.float32) 160 | mask=tf.reshape(mask,[-1]) 161 | inputs=tf.reshape(q['word_elmo_output'][i],[-1,512]) 162 | labels=tf.reshape(q['word_elmo_label'][i],[-1,1]) 163 | cost=tf.nn.nce_loss(weights=nce_weights,biases=nce_biases,labels=labels,inputs=inputs,num_sampled=32,num_classes=hparams.word_vocab_size) 164 | cost=tf.reduce_sum(cost*mask)/tf.reduce_sum(mask) 165 | costs.append(cost) 166 | 167 | 168 | 169 | 170 | loss=tf.reduce_mean(costs) 171 | return loss 172 | 173 | def build_encoder(self,hparams,scope_name): 174 | with tf.variable_scope(scope_name+'_words') as scope: 175 | #encoding words 176 | fw_cell,bw_cell= self._build_encoder_cell(hparams) 177 | for q in [self.q1,self.q2]: 178 | inputs=tf.concat(q['word_elmo_output']+[q['words_inp']],-1) 179 | words_inp = tf.transpose(inputs,[1,0,2]) 180 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,words_inp,dtype=tf.float32, sequence_length=q['words_len'],time_major=True,swap_memory=True) 181 | bi_outputs=tf.concat(bi_outputs,-1) 182 | 183 | q['word_encoder_output']=tf.transpose(tf.concat(bi_outputs,-1),[1,0,2]) 184 | q['word_encoder_hidden']=bi_state 185 | 186 | 187 | return 188 | 189 | def build_decoder(self,hparams,scope_name): 190 | with tf.variable_scope(scope_name+'_words') as scope: 191 | fw_cell,bw_cell= self._build_encoder_cell(hparams) 192 | for q in [self.q1,self.q2]: 193 | decoder_inp=tf.transpose(q['word_interaction'],[1,0,2]) 194 | bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(fw_cell,bw_cell,decoder_inp,dtype=tf.float32, 
sequence_length=q['words_len'],time_major=True,swap_memory=True) 195 | bi_outputs=tf.concat(bi_outputs,-1) 196 | #bi_outputs=self.HighwayNetwork(bi_outputs) 197 | q['word_decoder_output']=tf.transpose(bi_outputs,[1,0,2]) 198 | 199 | 200 | 201 | 202 | 203 | 204 | def build_interaction(self,hparams,scope_name): 205 | with tf.variable_scope(scope_name+'_words') as scope: 206 | for q in [(self.q1,self.q2),(self.q2,self.q1)]: 207 | encoder_hidden=q[0]['word_encoder_output'] 208 | weight=tf.reduce_sum(encoder_hidden[:,:,None,:]*q[1]['word_encoder_output'][:,None,:,:],-1) 209 | mask = tf.sequence_mask(q[1]['words_len'], tf.shape(weight)[-1], dtype=tf.float32) 210 | weight=tf.nn.softmax(weight)*mask[:,None,:] 211 | weight=weight/(tf.reduce_sum(weight,-1)[:,:,None]+0.000001) 212 | word_inter=tf.reduce_sum(q[1]['word_encoder_output'][:,None,:,:]*weight[:,:,:,None],-2) 213 | q[0]['word_interaction']=tf.concat([encoder_hidden,word_inter,tf.abs(encoder_hidden-word_inter),encoder_hidden*word_inter],-1) 214 | 215 | 216 | return 217 | 218 | 219 | def build_mlp(self,hparams): 220 | hidden_word=[] 221 | with tf.variable_scope("MLP_words") as scope: 222 | attention_W = layers_core.Dense(hparams.hidden_size,activation=tf.nn.relu, use_bias=False, name="attention_W") 223 | attention_V = layers_core.Dense(1,use_bias=False, name="attention_V") 224 | for q in [self.q1,self.q2]: 225 | weight=tf.nn.softmax(tf.reduce_sum(attention_V(attention_W(q['word_decoder_output'])),-1)) 226 | mask = tf.sequence_mask(q['words_len'], tf.shape(weight)[-1], dtype=tf.float32) 227 | weight=weight*mask 228 | weight=weight/(tf.reduce_sum(weight,-1)[:,None]+0.000001) 229 | context_hidden=tf.reduce_sum(q['word_decoder_output']*weight[:,:,None],1) 230 | q['word_rep']=context_hidden 231 | hidden_word=[self.q1['word_rep'],self.q2['word_rep'],self.q1['word_rep']*self.q2['word_rep']] 232 | 233 | 234 | hidden_word.append(self.q1['words_num']) 235 | 236 | 237 | 238 | 239 | with tf.variable_scope("MLP_words") as scope: 240 | layer_W = layers_core.Dense(hparams.hidden_size,activation=tf.nn.tanh, use_bias=False, name="ff_layer") 241 | hidden_word=tf.concat(hidden_word,-1) 242 | logits=layer_W(hidden_word) 243 | if hparams.dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 244 | logits = tf.nn.dropout(logits,1-hparams.dropout) 245 | layer_W = layers_core.Dense(1, use_bias=False, name="ff_layer_output") 246 | logits_word=layer_W(logits)[:,0] 247 | 248 | logits=logits_word 249 | return logits 250 | 251 | 252 | def compute_loss(self,hparams,logits): 253 | self.prob=tf.nn.sigmoid(logits) 254 | loss=-tf.reduce_mean(self.label*tf.log(self.prob+0.0001)+(1-self.label)*tf.log(1-self.prob+0.0001),-1) 255 | return loss 256 | 257 | def HighwayNetwork(self,inputs, num_layers=2, function='relu', 258 | keep_prob=0.8, scope='HN'): 259 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 260 | if function == 'relu': 261 | function = tf.nn.relu 262 | elif function == 'tanh': 263 | function = tf.nn.tanh 264 | else: 265 | raise NotImplementedError 266 | hidden_size = inputs.get_shape().as_list()[-1] 267 | memory = inputs 268 | for layer in range(num_layers): 269 | with tf.variable_scope('layer_%d' % (layer)): 270 | H = layers_core.Dense(hidden_size,activation=function, use_bias=True, name="h") 271 | T = layers_core.Dense(hidden_size,activation=function, use_bias=True, name="t") 272 | h = H(memory) 273 | t = T(memory) 274 | memory = h * t + (1-t) * memory 275 | if keep_prob > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 276 | outputs = 
tf.nn.dropout(memory,keep_prob) 277 | else: 278 | outputs = memory 279 | return outputs 280 | 281 | def _build_encoder_cell(self,hparams,num_layer=None,num_units=None,encoder_type=None,dropout=None,forget_bias=None): 282 | num_layer=num_layer or hparams.num_layer 283 | num_units=num_units or hparams.num_units 284 | encoder_type=encoder_type or hparams.encoder_type 285 | dropout=dropout or hparams.dropout 286 | forget_bias=forget_bias or hparams.forget_bias 287 | if encoder_type=="uni": 288 | cell_list = [] 289 | for i in range(num_layer): 290 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,forget_bias=hparams.forget_bias) 291 | # Dropout (= 1 - keep_prob) 292 | if dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 293 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 294 | 295 | cell_list.append(single_cell) 296 | if len(cell_list) == 1: # Single layer. 297 | return cell_list[0] 298 | else: # Multi layers 299 | return tf.contrib.rnn.MultiRNNCell(cell_list) 300 | else: 301 | num_bi_layers = int(num_layer / 2) 302 | fw_cell_list=[] 303 | bw_cell_list=[] 304 | for i in range(num_bi_layers): 305 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,forget_bias=forget_bias) 306 | if dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 307 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 308 | 309 | fw_cell_list.append(single_cell) 310 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units,forget_bias=forget_bias) 311 | if dropout > 0.0 and self.mode==tf.contrib.learn.ModeKeys.TRAIN: 312 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 313 | 314 | bw_cell_list.append(single_cell) 315 | 316 | if num_bi_layers == 1: # Single layer. 
317 | fw_cell=fw_cell_list[0] 318 | bw_cell=bw_cell_list[0] 319 | else: # Multi layers 320 | fw_cell=tf.contrib.rnn.MultiRNNCell(fw_cell_list) 321 | bw_cell=tf.contrib.rnn.MultiRNNCell(bw_cell_list) 322 | return fw_cell,bw_cell 323 | def dey_lrate(self,sess,lrate): 324 | sess.run(tf.assign(self.lrate,lrate)) 325 | 326 | def optimizer(self,hparams): 327 | self.lrate=tf.Variable(hparams.learning_rate,trainable=False) 328 | if hparams.op=='sgd': 329 | opt = tf.train.GradientDescentOptimizer(self.lrate) 330 | elif hparams.op=='adam': 331 | opt = tf.train.AdamOptimizer(self.lrate,beta1=0.9, beta2=0.999,epsilon=1e-8) 332 | params = tf.trainable_variables() 333 | 334 | 335 | gradients = tf.gradients(self.cost,params,colocate_gradients_with_ops=True) 336 | clipped_grads, gradient_norm = tf.clip_by_global_norm(gradients, 5.0) 337 | self.grad_norm =gradient_norm 338 | self.update = opt.apply_gradients(zip(clipped_grads, params)) 339 | 340 | def batch_norm_layer(self, x, train_phase, scope_bn): 341 | z = tf.cond(train_phase, lambda: batch_norm(x, decay=self.hparams.batch_norm_decay, center=True, scale=True, updates_collections=None,is_training=True, reuse=None, trainable=True, scope=scope_bn), lambda: batch_norm(x, decay=self.hparams.batch_norm_decay, center=True, scale=True, updates_collections=None,is_training=False, reuse=True, trainable=True, scope=scope_bn)) 342 | return z 343 | 344 | def train(self,sess,iterator): 345 | assert self.mode == tf.contrib.learn.ModeKeys.TRAIN 346 | q1,q2,label,words_num,chars_num=iterator.next() 347 | dic={} 348 | dic[self.q1['words']]=q1[0] 349 | dic[self.q1['words_len']]=q1[2] 350 | dic[self.q1['words_num']]=words_num 351 | dic[self.q2['words']]=q2[0] 352 | dic[self.q2['words_len']]=q2[2] 353 | dic[self.label]=label 354 | dic[self.norm_trainable]=True 355 | 356 | 357 | 358 | return sess.run([self.cost,self.update,self.grad_norm],feed_dict=dic) 359 | 360 | def pretrain_infer(self,sess,iterator): 361 | assert self.mode == tf.contrib.learn.ModeKeys.INFER 362 | q1,q2,label,words_num,chars_num=iterator.next() 363 | dic={} 364 | dic[self.q1['words']]=q1[0] 365 | dic[self.q1['words_len']]=q1[2] 366 | dic[self.q2['words']]=q2[0] 367 | dic[self.q2['words_len']]=q2[2] 368 | dic[self.label]=label 369 | 370 | 371 | 372 | return sess.run(self.cost,feed_dict=dic) 373 | 374 | 375 | def infer(self,sess,iterator): 376 | assert self.mode == tf.contrib.learn.ModeKeys.INFER 377 | q1,q2,label,words_num,chars_num=iterator.next() 378 | dic={} 379 | dic[self.q1['words']]=q1[0] 380 | dic[self.q1['words_len']]=q1[2] 381 | dic[self.q1['words_num']]=words_num 382 | dic[self.q2['words']]=q2[0] 383 | dic[self.q2['words_len']]=q2[2] 384 | dic[self.norm_trainable]=False 385 | dic[self.label]=label 386 | 387 | 388 | 389 | prob1=sess.run(self.prob,feed_dict=dic) 390 | dic[self.q2['words']]=q1[0] 391 | dic[self.q2['words_len']]=q1[2] 392 | dic[self.q1['words']]=q2[0] 393 | dic[self.q1['words_len']]=q2[2] 394 | 395 | 396 | dic[self.label]=label 397 | prob2=sess.run(self.prob,feed_dict=dic) 398 | return (prob1+prob2)/2.0 399 | 400 | 401 | def train(hparams): 402 | 403 | 404 | if hparams.pretrain: 405 | hparams.learning_rate=0.001 406 | config_proto = tf.ConfigProto(log_device_placement=0,allow_soft_placement=0) 407 | config_proto.gpu_options.allow_growth = True 408 | train_graph = tf.Graph() 409 | infer_graph = tf.Graph() 410 | 411 | with train_graph.as_default(): 412 | train_model=Model(hparams,tf.contrib.learn.ModeKeys.TRAIN) 413 | train_sess=tf.Session(graph=train_graph,config=config_proto) 414 | 
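# Training and inference run in separate graphs and sessions; parameters move between them only through the checkpoints saved under pretrain_model/.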
train_sess.run(tf.global_variables_initializer()) 415 | train_sess.run(tf.tables_initializer()) 416 | 417 | with infer_graph.as_default(): 418 | infer_model=Model(hparams,tf.contrib.learn.ModeKeys.INFER) 419 | infer_sess=tf.Session(graph=infer_graph,config=config_proto) 420 | infer_sess.run(tf.global_variables_initializer()) 421 | infer_sess.run(tf.tables_initializer()) 422 | 423 | train_model.pretrain_saver.restore(train_sess,'pretrain_model/best_model') 424 | decay=0 425 | pay_attention=0 426 | global_step=0 427 | train_loss=0 428 | train_norm=0 429 | best_score=1000 430 | epoch=0 431 | flag=False 432 | if hparams.pretrain: 433 | train_iterator=data_iterator.TextIterator('train',hparams,32,'pre_data/train.csv') 434 | dev_iterator=data_iterator.TextIterator('dev',hparams,512,'pre_data/dev.csv') 435 | test_iterator=data_iterator.TextIterator('test',hparams,512,'pre_data/test.csv') 436 | while True: 437 | start_time = time.time() 438 | try: 439 | cost,_,norm=train_model.train(train_sess,train_iterator) 440 | global_step+=1 441 | train_loss+=cost 442 | train_norm+=norm 443 | except StopIteration: 444 | continue 445 | if global_step%hparams.num_display_steps==0: 446 | info={} 447 | info['learning_rate']=hparams.learning_rate 448 | info["avg_step_time"]=(time.time()-start_time)/hparams.num_display_steps 449 | start_time = time.time() 450 | info["train_ppl"]= train_loss / hparams.num_display_steps 451 | info["avg_grad_norm"]=train_norm/hparams.num_display_steps 452 | train_loss=0 453 | train_norm=0 454 | utils.print_step_info(" ", global_step, info) 455 | if global_step%hparams.num_eval_steps==0: 456 | train_model.saver.save(train_sess,'pretrain_model/model') 457 | with infer_graph.as_default(): 458 | infer_model.saver.restore(infer_sess,'pretrain_model/model') 459 | loss=[] 460 | while True: 461 | try: 462 | cost=infer_model.pretrain_infer(infer_sess,dev_iterator) 463 | loss.append(cost) 464 | except StopIteration: 465 | break 466 | logloss=round(np.mean(loss),5) 467 | if logloss=hparams.vocab_threshold]) 62 | word2index[s]={} 63 | for v in vals: 64 | word2index[s][v]=len(word2index[s])+2 65 | 66 | print("done!") 67 | return word2index 68 | 69 | train_df=pd.read_csv('pre_data/train.csv') 70 | hparams=create_hparams() 71 | utils.print_hparams(hparams) 72 | if hparams.word_num_features is None: 73 | hparams.word_num_features=[] 74 | if hparams.word_single_features is None: 75 | hparams.word_single_features=[] 76 | if hparams.char_num_features is None: 77 | hparams.char_num_features=[] 78 | if hparams.char_single_features is None: 79 | hparams.char_single_features=[] 80 | hparams.word2index=build_vocabulary(train_df,hparams) 81 | 82 | 83 | 84 | if hparams.model=='model_word': 85 | print('model_word') 86 | preds=model_word.train(hparams) 87 | elif hparams.model=='model_char': 88 | print('model_char') 89 | preds=model_char.train(hparams) 90 | else: 91 | preds=model_all.train(hparams) 92 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Generally useful utility functions.""" 17 | from __future__ import print_function 18 | 19 | import codecs 20 | import collections 21 | import json 22 | import math 23 | import os 24 | import sys 25 | import time 26 | import numpy as np 27 | 28 | def print_out(s, f=None, new_line=True): 29 | """Similar to print but with support to flush and output to a file.""" 30 | if isinstance(s, bytes): 31 | s = s.decode("utf-8") 32 | 33 | if f: 34 | f.write(s.encode("utf-8")) 35 | if new_line: 36 | f.write(b"\n") 37 | 38 | # stdout 39 | out_s = s.encode("utf-8") 40 | if not isinstance(out_s, str): 41 | out_s = out_s.decode("utf-8") 42 | print(out_s, end="", file=sys.stdout) 43 | 44 | if new_line: 45 | sys.stdout.write("\n") 46 | sys.stdout.flush() 47 | 48 | 49 | def print_hparams(hparams, skip_patterns=None, header=None): 50 | """Print hparams, can skip keys based on pattern.""" 51 | if header: print_out("%s" % header) 52 | values = hparams.values() 53 | for key in sorted(values.keys()): 54 | if not skip_patterns or all( 55 | [skip_pattern not in key for skip_pattern in skip_patterns]): 56 | print_out(" %s=%s" % (key, str(values[key]))) 57 | 58 | 59 | def print_step_info(prefix, global_step, info): 60 | """Print all info at the current global step.""" 61 | print_out( 62 | "%sstep %d lr %g step-time %.2fs loss %.5f gN %.2f, %s" % 63 | (prefix, global_step, info["learning_rate"], info["avg_step_time"], 64 | info["train_ppl"], info["avg_grad_norm"], time.ctime())) 65 | def mrr(dist_list, gold): 66 | """ 67 | dist_list: list of list of label probability for all labels. 68 | gold: list of gold indexes. 69 | 70 | Get mean reciprocal rank. (this is slow, as have to sort for 10K vocab) 71 | """ 72 | mrr_per_example = [] 73 | dist_arrays = np.array(dist_list) 74 | dist_sorted = np.argsort(-dist_arrays, axis=1) 75 | for ind, gold_i in enumerate(gold): 76 | rr_per_array = [] 77 | sorted_index = dist_sorted[ind, :] 78 | for k in range(len(sorted_index)): 79 | if sorted_index[k] in gold_i : 80 | rr_per_array.append(1.0 / (k + 1)) 81 | mrr_per_example.append(np.mean(rr_per_array)) 82 | return sum(mrr_per_example) * 1.0 / len(mrr_per_example) --------------------------------------------------------------------------------
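As a reading aid, here is a minimal NumPy sketch of the masked soft-alignment (ESIM local-inference) step that model.py, model_char.py and model_word.py implement in build_interaction; the function name soft_align and the toy shapes are illustrative assumptions, not part of the repository.

import numpy as np

def soft_align(enc1, enc2, len2):
    # enc1: [T1, H] encoded question 1; enc2: [T2, H] encoded question 2;
    # len2: number of valid (non-padded) positions in enc2.
    scores = enc1 @ enc2.T                                  # [T1, T2] dot-product attention
    weights = np.exp(scores - scores.max(-1, keepdims=True))
    weights = weights / weights.sum(-1, keepdims=True)      # softmax over question-2 positions
    mask = (np.arange(enc2.shape[0]) < len2).astype(float)  # zero out padded positions
    weights = weights * mask
    weights = weights / (weights.sum(-1, keepdims=True) + 1e-6)
    aligned = weights @ enc2                                 # [T1, H] aligned representation
    # ESIM enhancement: [a ; align ; |a - align| ; a * align]
    return np.concatenate([enc1, aligned, np.abs(enc1 - aligned), enc1 * aligned], -1)

# toy usage: a 5-token question aligned against a 7-token question with 4 valid positions
print(soft_align(np.random.randn(5, 8), np.random.randn(7, 8), len2=4).shape)  # (5, 32)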