├── BaselineModel ├── config.py ├── data │ ├── cnli │ │ ├── cnli_dev_1.0_seg.txt │ │ └── cnli_train_1.0_seg.txt │ └── embedding │ │ ├── cnli_embedding.npy │ │ ├── cnli_vocab.txt │ │ ├── convert_embedding.py │ │ └── run_embedding.sh ├── data_reader.py ├── decomposable_att.py ├── esim.py ├── myutils.py ├── ops_cudnn_rnn.py ├── run.sh └── train.py ├── CCL2018中文文本蕴含评测总结.pdf ├── CCL2018中文文本蕴含识别系统报告集合.pdf ├── CNLI2018 Evaluation Result.md ├── CNLI_Data ├── cnli_dev_1.0.txt ├── cnli_test_1.0.txt ├── cnli_test_labeled.txt └── cnli_train_1.0.txt ├── Codalab Example ├── answer.zip └── readme └── README.md /BaselineModel/config.py: -------------------------------------------------------------------------------- 1 | class SmallConfig(object): 2 | """Small config.""" 3 | init_scale = 0.1 4 | learning_rate = 0.0003 5 | 6 | max_grad_norm = 5 7 | xmaxlen=32 8 | ymaxlen=30 9 | num_classes=3 10 | hidden_units = 300 11 | embedding_size =300 12 | MAXITER=70 13 | keep_prob = 0.8 14 | 15 | batch_size = 32 16 | l2_strength=0.0003 17 | 18 | early_stopping=5 19 | 20 | train_file='./data/cnli/cnli_train_1.0_seg.txt' 21 | dev_file='./data/cnli/cnli_dev_1.0_seg.txt' 22 | 23 | cnli_embedding_dir= './data/embedding/cnli_embedding.npy' 24 | 25 | -------------------------------------------------------------------------------- /BaselineModel/data/embedding/cnli_embedding.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blcunlp/CNLI/604638ecf50201c15da6420ceb14aa7a43bd1463/BaselineModel/data/embedding/cnli_embedding.npy -------------------------------------------------------------------------------- /BaselineModel/data/embedding/convert_embedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # read a vocab and precompute a .npy embedding matrix. 4 | # if a vocab entry is in the provided glove embeddings then use the glove data. 
5 | # if it's not, generate a random vector but scale it to the median length of the glove embeddings. 6 | # reserve row 0 in the matrix for the PAD embedding (always set to {0}) 7 | # reserve row 1 in the matrix for the UNK embedding (given a random value) 8 | import argparse 9 | import numpy as np 10 | import sys 11 | from sklearn import random_projection 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--vocab", required=True, help="reference vocab of non glove data; token \t idx") 15 | parser.add_argument("--glove-data", required=True, help="glove data. ssv, token, e_d1, e_d2, ...") 16 | parser.add_argument("--npy", required=True, help="npy output") 17 | parser.add_argument("--random-projection-dimensionality", default=None, type=float, 18 | help="if set we randomly project the glove data to a smaller dimensionality") 19 | opts = parser.parse_args() 20 | 21 | # slurp vocab entries. assume idxs are valid, ie 1 < i < |v|, no dups, no gaps, etc 22 | # (recall reserving 0 for UNK) 23 | # TODO: use vocab.py 24 | vocab = {} # token => idx 25 | for line in open(opts.vocab, "r"): 26 | token, idx = line.strip().split("\t") 27 | if idx == 0: 28 | assert token == '_PAD', "expecting to reserve 0 for _PAD" 29 | elif idx == 1: 30 | assert token == '_UNK', "expecting to reserve 1 for _UNK" 31 | elif idx ==2: 32 | assert token == '_GO', "expecting to reverse 2 for _GO" 33 | elif idx ==3: 34 | assert token == '_EOS', "expecting to reverse 3 for _EOS" 35 | else: 36 | vocab[token] = int(idx) 37 | print "vocab has", len(vocab), "entries (not _PAD or _UNK or _GO or _EOS)" 38 | 39 | # alloc output after we see first glove embedding (so we know it's dimensionality) 40 | embeddings = None 41 | glove_dimensionality = None 42 | 43 | # pass over glove data copying data into embedddings array 44 | # for the cases where the token is in the reference vocab. 
45 | tokens_requiring_random = set(vocab.keys()) 46 | glove_embedding_norms = [] 47 | for line in open(opts.glove_data, "r"): 48 | cols = line.strip().split(" ") 49 | token = cols[0] 50 | if token in vocab: 51 | glove_embedding = np.array(cols[1:], dtype=np.float32) 52 | if embeddings is None: 53 | glove_dimensionality = len(glove_embedding) 54 | embeddings = np.empty((len(vocab), glove_dimensionality), dtype=np.float32) # +1 for pad & unk 55 | assert len(glove_embedding) == glove_dimensionality, "differing dimensionality in glove data?" 56 | embeddings[vocab[token]] = glove_embedding 57 | tokens_requiring_random.remove(token) 58 | glove_embedding_norms.append(np.linalg.norm(glove_embedding)) 59 | 60 | # given these embeddings we can calculate the median norm of the glove data 61 | median_glove_embedding_norm = np.median(glove_embedding_norms) 62 | 63 | print >>sys.stderr, "build .npy file" 64 | print >>sys.stderr, "after passing over glove there are", len(tokens_requiring_random), \ 65 | "tokens requiring a random alloc" 66 | 67 | # return a random embedding with the same norm as the glove data median norm 68 | def random_embedding(): 69 | random_embedding = np.random.randn(1, glove_dimensionality) 70 | random_embedding /= np.linalg.norm(random_embedding) 71 | random_embedding *= median_glove_embedding_norm 72 | return random_embedding 73 | 74 | # assign PAD and UNK random embeddings (pre projection) 75 | embeddings[0] = random_embedding() # PAD 76 | embeddings[1] = random_embedding() # UNK 77 | 78 | # assign random projections for every other fields requiring it 79 | for token in tokens_requiring_random: 80 | embeddings[vocab[token]] = random_embedding() 81 | 82 | # randomly project (if configured to do so) 83 | if opts.random_projection_dimensionality is not None: 84 | # assign a temp random embedding for PAD before projection (and zero it after) 85 | p = random_projection.GaussianRandomProjection(n_components=opts.random_projection_dimensionality) 86 | 
# Reserved vocabulary symbols. Their order in _START_VOCAB fixes their ids,
# which must agree with the *_ID constants below.
_PAD = "_PAD"
_UNK = "_UNK"
_GO = "_GO"
_EOS = "_EOS"
_START_VOCAB = [_PAD, _UNK, _GO, _EOS]

PAD_ID = 0
UNK_ID = 1
GO_ID = 2
EOS_ID = 3

# Default label -> class-id mapping used by load_data (never mutated).
_DEFAULT_LABELS = {'neutral': 0, 'entailment': 1, 'contradiction': 2}

# Vocabulary file written by get_vocab.
_VOCAB_PATH = "./data/embedding/cnli_vocab.txt"


def filter_length(seq, maxlen):
    """Return `seq` cut down to its first `maxlen` items.

    A sequence that already fits is returned unchanged (same object).
    """
    if len(seq) > maxlen:
        return seq[:maxlen]
    return seq


def load_data(train, vocab, labels=None, xmaxlen=32, ymaxlen=30):
    """Convert (premise, hypothesis, label) rows into id sequences.

    Args:
        train: iterable of 3-item rows `(premise, hypothesis, label)`.
            A row with a different number of items raises ValueError
            when it is unpacked.
        vocab: token -> id mapping passed to `map_to_idx`.
        labels: label string -> class id. Rows whose label is not a key
            are skipped. Defaults to neutral/entailment/contradiction.
        xmaxlen: maximum premise length (tokens plus trailing EOS).
        ymaxlen: maximum hypothesis length (GO + tokens + EOS).

    Returns:
        `(X, Y, Z)`: premise id lists, hypothesis id lists and class ids,
        all of the same length. Sequences longer than the limit keep only
        their leading items, so a truncated sequence loses its EOS.
    """
    if labels is None:
        labels = _DEFAULT_LABELS

    X, Y, Z = [], [], []
    for p, h, l in train:
        # Check the label first so unlabeled rows cost no tokenization.
        if l not in labels:
            continue
        p_ids = map_to_idx(tokenize(p), vocab) + [EOS_ID]
        h_ids = [GO_ID] + map_to_idx(tokenize(h), vocab) + [EOS_ID]
        X.append(filter_length(p_ids, xmaxlen))
        Y.append(filter_length(h_ids, ymaxlen))
        Z.append(labels[l])
    return X, Y, Z


def get_vocab(data):
    """Build the vocabulary from `data`, write it to disk and rebuild embeddings.

    Tokens from the first two fields of every row are counted and ordered
    by descending frequency, ties broken by token. The four reserved
    symbols come first, so they receive ids 0-3.

    Side effects: rewrites ./data/embedding/cnli_vocab.txt (one
    "token<TAB>id" line per entry) and then runs
    ./data/embedding/run_embedding.sh through the shell on every call.

    Returns:
        dict mapping token -> id.
    """
    counts = Counter()
    for ex in data:
        counts.update(tokenize(ex[0]))
        counts.update(tokenize(ex[1]))

    ordered = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    # Every counted token has a positive count, so no count filter is needed.
    lst = _START_VOCAB + [token for token, _ in ordered]

    print("build cnli_vocab.txt")
    with open(_VOCAB_PATH, "w+") as f:
        for idx, token in enumerate(lst):
            f.write(str(token) + "\t" + str(idx) + "\n")

    # The script path is a fixed literal; no external input reaches the shell.
    os.system('./data/embedding/run_embedding.sh')
    return {token: idx for idx, token in enumerate(lst)}


class DataSet(object):
    """Holds the padded arrays of one data split and serves fixed-size batches.

    All arrays are expected to share their first dimension (one row per
    example); `x` must expose `.shape`.
    """

    def __init__(self, x, y, labels, x_len, y_len, X_mask, Y_mask):
        self._data_len = len(x)
        self._x = x
        self._y = y
        self._labels = labels
        self._x_len = x_len
        self._y_len = y_len
        self._epochs_completed = 0
        self._index_in_epoch = 0
        self._num_examples = x.shape[0]
        self._x_mask = X_mask
        self._y_mask = Y_mask

    def next_batch(self, batch_size):
        """Return the next `batch_size` examples from this data set.

        Batches are taken in storage order. When fewer than `batch_size`
        examples remain, the remainder is dropped, the epoch counter is
        incremented and the batch restarts from index 0, so every batch
        has exactly `batch_size` rows.

        Returns:
            `(x, y, labels, x_mask, y_mask, x_len, y_len)` slices.

        Raises:
            AssertionError: if `batch_size` exceeds the number of examples.
        """
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch > self._num_examples:
            # Finished epoch
            self._epochs_completed += 1

            # Start next epoch
            start = 0
            self._index_in_epoch = batch_size
            assert batch_size <= self._num_examples

        end = self._index_in_epoch
        window = slice(start, end)

        return (self._x[window], self._y[window], self._labels[window],
                self._x_mask[window], self._y_mask[window],
                self._x_len[window], self._y_len[window])

    @property
    def get_x(self):
        return self._x

    @property
    def get_y(self):
        # Fixed: this used to read the nonexistent attribute `self.y`.
        return self._y

    @property
    def labels(self):
        return self._labels

    @property
    def get_x_len(self):
        return self._x_len

    @property
    def get_y_len(self):
        return self._y_len

    @property
    def get_data_num(self):
        return self._data_len

    def get_epoch_size(self, batch_size):
        """Return the number of full batches in one pass over the data."""
        return self._data_len // batch_size


def singlefile2seqid(data, vocab, config):
    """Turn raw rows into a padded, masked `DataSet`.

    Args:
        data: rows accepted by `load_data`.
        vocab: token -> id mapping; must contain `_PAD`.
        config: object providing `xmaxlen`, `ymaxlen` and `num_classes`.

    Returns:
        A `DataSet` holding padded id arrays, one-hot labels, true
        lengths and 0/1 masks (1 on real positions, 0 on padding).
    """
    # Truncate to the configured lengths so lengths, masks and padded
    # arrays always agree with each other.
    X_data, Y_data, Z_data = load_data(
        data, vocab, xmaxlen=config.xmaxlen, ymaxlen=config.ymaxlen)

    X_data_lengths = np.asarray([len(x) for x in X_data])
    # Masks are handed over as a plain list of arrays: wrapping ragged
    # rows in np.asarray fails on current NumPy and is not needed.
    X_data_mask = pad_sequences([np.ones(n) for n in X_data_lengths],
                                maxlen=config.xmaxlen, value=0, padding='post')
    X_data = pad_sequences(X_data, maxlen=config.xmaxlen,
                           value=vocab[_PAD], padding='post')

    Y_data_lengths = np.asarray([len(y) for y in Y_data])
    Y_data_mask = pad_sequences([np.ones(n) for n in Y_data_lengths],
                                maxlen=config.ymaxlen, value=0, padding='post')
    Y_data = pad_sequences(Y_data, maxlen=config.ymaxlen,
                           value=vocab[_PAD], padding='post')

    Z_data = to_categorical(Z_data, num_classes=config.num_classes)
    return DataSet(X_data, Y_data, Z_data,
                   X_data_lengths, Y_data_lengths,
                   X_data_mask, Y_data_mask)


def _read_rows(path):
    """Read a tab-separated file into a list of field lists."""
    with open(path) as f:
        return [line.strip().split('\t') for line in f]


def file2seqid(config):
    """Load the train and dev files named in `config`.

    The vocabulary is built from the training rows only (which also
    rewrites the vocabulary file, see `get_vocab`).

    Returns:
        `(Train, Dev, vocab)`: two `DataSet` objects and the token -> id map.
    """
    train = _read_rows(config.train_file)
    dev = _read_rows(config.dev_file)
    vocab = get_vocab(train)

    Train = singlefile2seqid(train, vocab, config)
    Dev = singlefile2seqid(dev, vocab, config)
    return Train, Dev, vocab
class MyModel(object):
    """The decomposable model.

    Constructing an instance builds the whole graph in this order:
    placeholders, embedding lookup, per-token encoding, soft alignment
    (attend), comparison, aggregation, accuracy and loss. The training
    op is added only when `is_training` is true.

    `config` must provide: batch_size, xmaxlen, ymaxlen, num_classes,
    hidden_units, keep_prob, l2_strength, max_grad_norm, learning_rate
    and cnli_embedding_dir.
    """

    def __init__(self, is_training, config):
        self.config = config
        self.is_training = is_training
        self.global_step = tf.Variable(0, trainable=False)

        self.add_placeholder()
        self.add_embedding()
        self.input_encoding()
        self.attend()
        self.compare()
        self.aggregate()

        self.compute_accuracy()
        self.compute_loss()

        if not is_training:
            return
        self.optimization()

    def add_placeholder(self):
        """Create the input placeholders; all shapes use the fixed batch size."""
        cfg = self.config
        self.x = tf.placeholder(tf.int32, [cfg.batch_size, cfg.xmaxlen])
        self.y = tf.placeholder(tf.int32, [cfg.batch_size, cfg.ymaxlen])

        # The mask and length attributes end up holding the float32 casts,
        # not the int32 placeholders they were created from.
        self.x_mask = tf.placeholder(tf.int32, [cfg.batch_size, cfg.xmaxlen])
        self.y_mask = tf.placeholder(tf.int32, [cfg.batch_size, cfg.ymaxlen])
        self.x_mask = tf.cast(self.x_mask, tf.float32)
        self.y_mask = tf.cast(self.y_mask, tf.float32)

        self.x_len = tf.placeholder(tf.int32, [cfg.batch_size, ])
        self.y_len = tf.placeholder(tf.int32, [cfg.batch_size, ])
        self.x_len = tf.cast(self.x_len, tf.float32)
        self.y_len = tf.cast(self.y_len, tf.float32)

        # One-hot labels (see compute_accuracy, which takes their argmax).
        self.label = tf.placeholder(tf.int32, [cfg.batch_size, cfg.num_classes])

    def add_embedding(self):
        """Look up the frozen pretrained embeddings for both sentences."""
        with tf.device("/cpu:0"):
            embedding_matrix = np.load(self.config.cnli_embedding_dir)
            embedding = tf.Variable(embedding_matrix, trainable=False,
                                    name="embedding")

            self.input_xemb = tf.nn.embedding_lookup(embedding, self.x)
            self.input_yemb = tf.nn.embedding_lookup(embedding, self.y)

            if self.is_training and self.config.keep_prob < 1:
                self.input_xemb = tf.nn.dropout(self.input_xemb,
                                                self.config.keep_prob)
                self.input_yemb = tf.nn.dropout(self.input_yemb,
                                                self.config.keep_prob)

    def input_encoding(self):
        """Encode x and y separately with a two-layer feed-forward net.

        Outputs are zeroed on padded positions before dropout.
        """
        with tf.variable_scope("encode_x"):
            self.x_output = self.two_layer_dense(
                self.input_xemb, self.config.hidden_units, scope="x_fnn",
                regularizer=l2_regularizer(self.config.l2_strength))
            self.x_output = self.x_output * self.x_mask[:, :, None]

            if self.is_training and self.config.keep_prob < 1:
                self.x_output = tf.nn.dropout(self.x_output,
                                              self.config.keep_prob)

        with tf.variable_scope("encode_y"):
            self.y_output = self.two_layer_dense(
                self.input_yemb, self.config.hidden_units, scope="y_fnn",
                regularizer=l2_regularizer(self.config.l2_strength))
            self.y_output = self.y_output * self.y_mask[:, :, None]

            if self.is_training and self.config.keep_prob < 1:
                self.y_output = tf.nn.dropout(self.y_output,
                                              self.config.keep_prob)

    def attend(self):
        """Softly align each sentence against the other one."""
        self.weighted_y, self.weighted_x = self.attention(
            x_sen=self.x_output,
            y_sen=self.y_output,
            x_len=self.config.xmaxlen,
            y_len=self.config.ymaxlen)

    def compare(self):
        """Compare every token with its aligned summary of the other sentence."""
        with tf.variable_scope("compare"):
            with tf.variable_scope("compare-xy"):
                co_xy = tf.concat([self.x_output, self.weighted_y], axis=-1)
                v_co_xy = self.two_layer_dense(
                    co_xy, self.config.hidden_units, scope="compare_xy",
                    regularizer=l2_regularizer(self.config.l2_strength))
                self.v_co_xy = v_co_xy * self.x_mask[:, :, None]

                if self.is_training and self.config.keep_prob < 1:
                    self.v_co_xy = tf.nn.dropout(self.v_co_xy,
                                                 self.config.keep_prob)

            with tf.variable_scope("compare-yx"):
                co_yx = tf.concat([self.y_output, self.weighted_x], axis=-1)
                v_co_yx = self.two_layer_dense(
                    co_yx, self.config.hidden_units, scope="compare_yx",
                    regularizer=l2_regularizer(self.config.l2_strength))
                self.v_co_yx = v_co_yx * self.y_mask[:, :, None]

                if self.is_training and self.config.keep_prob < 1:
                    self.v_co_yx = tf.nn.dropout(self.v_co_yx,
                                                 self.config.keep_prob)

    def aggregate(self):
        """Sum-pool both comparison sequences and classify the result.

        Sets `self.v` (pooled features), `self.logits` (unnormalized
        class scores) and `self.pred` (softmax probabilities).
        """
        with tf.variable_scope("pooling"):
            v1 = tf.reduce_sum(self.v_co_xy, axis=1)
            v2 = tf.reduce_sum(self.v_co_yx, axis=1)

            self.v = tf.concat([v1, v2], axis=-1)

        with tf.variable_scope("pred-layer"):
            dense1 = tf.layers.dense(
                inputs=self.v,
                units=self.config.hidden_units,
                activation=tf.nn.tanh,
                use_bias=True,
                kernel_regularizer=l2_regularizer(self.config.l2_strength),
                name="dense-pred-W")

            if self.is_training and self.config.keep_prob < 1:
                dense1 = tf.nn.dropout(dense1, self.config.keep_prob)

            W_pred = tf.get_variable(
                "W_pred",
                shape=[self.config.hidden_units, self.config.num_classes],
                regularizer=l2_regularizer(self.config.l2_strength))

            # Keep the logits so the loss does not have to take the log of
            # already-normalized probabilities.
            self.logits = tf.matmul(dense1, W_pred)
            self.pred = tf.nn.softmax(self.logits, name="pred")

    def compute_accuracy(self):
        """Fraction of the batch whose arg-max prediction matches the label."""
        correct = tf.equal(tf.argmax(self.pred, 1), tf.argmax(self.label, 1))
        self.acc = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")

    def compute_loss(self):
        """Cross-entropy summed over the batch, plus all regularization losses.

        The cross-entropy is computed from the logits. This is the same
        quantity as -sum(label * log(softmax(logits))) but stays finite
        when a class probability underflows to zero.
        """
        per_example = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.cast(self.label, tf.float32), logits=self.logits)
        self.loss_term = tf.reduce_sum(per_example, name="loss_term")
        self.reg_term = tf.reduce_sum(
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES),
            name="reg_term")
        self.loss = tf.add(self.loss_term, self.reg_term, name="loss")

    def optimization(self):
        """Add the Adam update with gradients clipped by global norm."""
        with tf.variable_scope("bp_layer"):
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                              self.config.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
            self.optim = optimizer.apply_gradients(
                zip(grads, tvars),
                global_step=self.global_step)

    def attention(self, x_sen, y_sen, x_len, y_len):
        """Dot-product soft alignment between two encoded sentences.

        :param x_sen: 3D tensor (b, x_len, h)
        :param y_sen: 3D tensor (b, y_len, h)
        :param x_len: unused; kept for interface compatibility
        :param y_len: unused; kept for interface compatibility
        :return: (1) weighted_y: for every x position, the attention-weighted
                     sum of y_sen, shape (b, x_len, h)
                 (2) weighted_x: for every y position, the attention-weighted
                     sum of x_sen, shape (b, y_len, h)

        The softmax is written out by hand so padded positions can be
        masked out (via self.x_mask / self.y_mask) before normalizing.
        """
        # (b,x_len,h) x (b,h,y_len) -> (b,x_len,y_len)
        weight_matrix = tf.matmul(x_sen, tf.transpose(y_sen, perm=[0, 2, 1]))

        # Subtract the row/column maximum before exp for numerical stability.
        weight_matrix_y = tf.exp(
            weight_matrix
            - tf.reduce_max(weight_matrix, axis=2, keep_dims=True))  # (b,x_len,y_len)
        weight_matrix_x = tf.exp(tf.transpose(
            (weight_matrix
             - tf.reduce_max(weight_matrix, axis=1, keep_dims=True)),
            perm=[0, 2, 1]))  # (b,y_len,x_len)

        weight_matrix_y = weight_matrix_y * self.y_mask[:, None, :]  # (b,x_len,y_len)*(b,1,y_len)
        weight_matrix_x = weight_matrix_x * self.x_mask[:, None, :]  # (b,y_len,x_len)*(b,1,x_len)

        # The small constant guards against an all-masked row.
        alpha = weight_matrix_y / (
            tf.reduce_sum(weight_matrix_y, 2, keep_dims=True) + 1e-8)  # (b,x_len,y_len)
        beta = weight_matrix_x / (
            tf.reduce_sum(weight_matrix_x, 2, keep_dims=True) + 1e-8)  # (b,y_len,x_len)

        # (b,1,y_len,h)*(b,x_len,y_len,1) => (b,x_len,y_len,h) => (b,x_len,h)
        weighted_y = tf.reduce_sum(
            tf.expand_dims(y_sen, 1) * tf.expand_dims(alpha, -1), 2)

        # (b,1,x_len,h)*(b,y_len,x_len,1) => (b,y_len,x_len,h) => (b,y_len,h)
        weighted_x = tf.reduce_sum(
            tf.expand_dims(x_sen, 1) * tf.expand_dims(beta, -1), 2)

        return weighted_y, weighted_x

    def two_layer_dense(self, inp, out_dim, scope, regularizer=None):
        """Apply two stacked ReLU dense layers of width `out_dim` under `scope`."""
        with tf.variable_scope(scope):
            dense1 = tf.layers.dense(inputs=inp,
                                     units=out_dim,
                                     activation=tf.nn.relu,
                                     kernel_regularizer=regularizer,
                                     use_bias=True)

            dense2 = tf.layers.dense(inputs=dense1,
                                     units=out_dim,
                                     activation=tf.nn.relu,
                                     kernel_regularizer=regularizer,
                                     use_bias=True)
            return dense2
class MyModel(object):
    """The ESIM model.

    Constructing an instance builds the whole graph in this order:
    placeholders, embedding lookup, recurrent encoding, soft alignment
    (attend), recurrent composition (compare), pooling and classification
    (aggregate), accuracy and loss. The training op is added only when
    `is_training` is true.

    `config` must provide: batch_size, xmaxlen, ymaxlen, num_classes,
    hidden_units, keep_prob, l2_strength, max_grad_norm, learning_rate
    and cnli_embedding_dir.
    """

    def __init__(self, is_training, config):
        self.config = config
        self.is_training = is_training
        self.global_step = tf.Variable(0, trainable=False)

        self.add_placeholder()
        self.add_embedding()
        self.input_encoding()
        self.attend()
        self.compare()
        self.aggregate()

        self.compute_accuracy()
        self.compute_loss()

        if not is_training:
            return
        self.optimization()

    def add_placeholder(self):
        """Create the input placeholders; all shapes use the fixed batch size."""
        cfg = self.config
        self.x = tf.placeholder(tf.int32, [cfg.batch_size, cfg.xmaxlen])
        self.y = tf.placeholder(tf.int32, [cfg.batch_size, cfg.ymaxlen])

        # The mask and length attributes end up holding the float32 casts,
        # not the int32 placeholders they were created from.
        self.x_mask = tf.placeholder(tf.int32, [cfg.batch_size, cfg.xmaxlen])
        self.y_mask = tf.placeholder(tf.int32, [cfg.batch_size, cfg.ymaxlen])
        self.x_mask = tf.cast(self.x_mask, tf.float32)
        self.y_mask = tf.cast(self.y_mask, tf.float32)

        self.x_len = tf.placeholder(tf.int32, [cfg.batch_size, ])
        self.y_len = tf.placeholder(tf.int32, [cfg.batch_size, ])
        self.x_len = tf.cast(self.x_len, tf.float32)
        self.y_len = tf.cast(self.y_len, tf.float32)

        # One-hot labels (see compute_accuracy, which takes their argmax).
        self.label = tf.placeholder(tf.int32, [cfg.batch_size, cfg.num_classes])

    def add_embedding(self):
        """Look up the frozen pretrained embeddings for both sentences."""
        with tf.device("/cpu:0"):
            embedding_matrix = np.load(self.config.cnli_embedding_dir)
            embedding = tf.Variable(embedding_matrix, trainable=False,
                                    name="embedding")

            self.input_xemb = tf.nn.embedding_lookup(embedding, self.x)
            self.input_yemb = tf.nn.embedding_lookup(embedding, self.y)

            if self.is_training and self.config.keep_prob < 1:
                self.input_xemb = tf.nn.dropout(self.input_xemb,
                                                self.config.keep_prob)
                self.input_yemb = tf.nn.dropout(self.input_yemb,
                                                self.config.keep_prob)

    def input_encoding(self):
        """Encode x and y with one `cudnn_lstm` layer whose variables are shared.

        The second call runs after `scope.reuse_variables()`, so both
        sentences go through the same recurrent weights. Outputs are
        zeroed on padded positions before dropout.
        """
        with tf.variable_scope("encode_xy") as scope:
            self.x_output = cudnn_lstm(inputs=self.input_xemb,
                                       num_layers=1,
                                       hidden_size=self.config.hidden_units,
                                       is_training=self.is_training)
            self.x_output = self.x_output * self.x_mask[:, :, None]

            scope.reuse_variables()
            self.y_output = cudnn_lstm(inputs=self.input_yemb,
                                       num_layers=1,
                                       hidden_size=self.config.hidden_units,
                                       is_training=self.is_training)
            self.y_output = self.y_output * self.y_mask[:, :, None]

            if self.is_training and self.config.keep_prob < 1:
                self.x_output = tf.nn.dropout(self.x_output,
                                              self.config.keep_prob)
                self.y_output = tf.nn.dropout(self.y_output,
                                              self.config.keep_prob)

    def attend(self):
        """Softly align each sentence against the other one."""
        self.weighted_y, self.weighted_x = self.attention(
            x_sen=self.x_output,
            y_sen=self.y_output,
            x_len=self.config.xmaxlen,
            y_len=self.config.ymaxlen)

    def compare(self):
        """Compose local inference features for both directions.

        Each token is concatenated with its aligned summary, their
        difference and their element-wise product, projected by a ReLU
        dense layer and run through `cudnn_lstm`.
        """
        with tf.variable_scope("compare"):
            with tf.variable_scope("compare-xy") as scope:
                co_xy = tf.concat([self.x_output,
                                   self.weighted_y,
                                   self.x_output - self.weighted_y,
                                   self.x_output * self.weighted_y], axis=-1)
                co_xy_dense = tf.layers.dense(
                    inputs=co_xy, units=self.config.hidden_units,
                    activation=tf.nn.relu,
                    kernel_regularizer=l2_regularizer(self.config.l2_strength),
                    use_bias=True)

                v_co_xy = cudnn_lstm(inputs=co_xy_dense,
                                     num_layers=1,
                                     hidden_size=self.config.hidden_units,
                                     is_training=self.is_training)
                self.v_co_xy = v_co_xy * self.x_mask[:, :, None]

                scope.reuse_variables()
                co_yx = tf.concat([self.y_output,
                                   self.weighted_x,
                                   self.y_output - self.weighted_x,
                                   self.y_output * self.weighted_x], axis=-1)
                # NOTE(review): this layer is unnamed and passes
                # reuse=tf.AUTO_REUSE; whether it shares weights with the
                # projection above depends on layer auto-naming - confirm.
                co_yx_dense = tf.layers.dense(
                    inputs=co_yx, units=self.config.hidden_units,
                    activation=tf.nn.relu,
                    kernel_regularizer=l2_regularizer(self.config.l2_strength),
                    use_bias=True, reuse=tf.AUTO_REUSE)

                v_co_yx = cudnn_lstm(inputs=co_yx_dense,
                                     num_layers=1,
                                     hidden_size=self.config.hidden_units,
                                     is_training=self.is_training)
                self.v_co_yx = v_co_yx * self.y_mask[:, :, None]

            if self.is_training and self.config.keep_prob < 1:
                self.v_co_xy = tf.nn.dropout(self.v_co_xy,
                                             self.config.keep_prob)
                self.v_co_yx = tf.nn.dropout(self.v_co_yx,
                                             self.config.keep_prob)

    def aggregate(self):
        """Average- and max-pool both composed sequences, then classify.

        Sets `self.v` (pooled features), `self.logits` (unnormalized
        class scores) and `self.pred` (softmax probabilities).
        """
        with tf.variable_scope("pooling"):
            # Averages divide by the true sentence length, not the padded one.
            v_xyave = tf.div(tf.reduce_sum(self.v_co_xy, 1),
                             tf.expand_dims(self.x_len, -1))
            v_yxave = tf.div(tf.reduce_sum(self.v_co_yx, 1),
                             tf.expand_dims(self.y_len, -1))
            # NOTE(review): padded positions were zeroed by the mask, so
            # each max below can never be smaller than 0 for a padded row.
            v_xymax = tf.reduce_max(self.v_co_xy, axis=1)
            v_yxmax = tf.reduce_max(self.v_co_yx, axis=1)

            self.v = tf.concat([v_xyave, v_xymax, v_yxave, v_yxmax], axis=-1)

        with tf.variable_scope("pred-layer"):
            dense1 = tf.layers.dense(
                inputs=self.v,
                units=self.config.hidden_units,
                activation=tf.nn.tanh,
                use_bias=True,
                kernel_regularizer=l2_regularizer(self.config.l2_strength),
                name="dense-pred-W")

            if self.is_training and self.config.keep_prob < 1:
                dense1 = tf.nn.dropout(dense1, self.config.keep_prob)

            W_pred = tf.get_variable(
                "W_pred",
                shape=[self.config.hidden_units, self.config.num_classes],
                regularizer=l2_regularizer(self.config.l2_strength))

            # Keep the logits so the loss does not have to take the log of
            # already-normalized probabilities.
            self.logits = tf.matmul(dense1, W_pred)
            self.pred = tf.nn.softmax(self.logits, name="pred")

    def compute_accuracy(self):
        """Fraction of the batch whose arg-max prediction matches the label."""
        correct = tf.equal(tf.argmax(self.pred, 1), tf.argmax(self.label, 1))
        self.acc = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")

    def compute_loss(self):
        """Cross-entropy summed over the batch, plus all regularization losses.

        The cross-entropy is computed from the logits. This is the same
        quantity as -sum(label * log(softmax(logits))) but stays finite
        when a class probability underflows to zero.
        """
        per_example = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.cast(self.label, tf.float32), logits=self.logits)
        self.loss_term = tf.reduce_sum(per_example, name="loss_term")
        self.reg_term = tf.reduce_sum(
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES),
            name="reg_term")
        self.loss = tf.add(self.loss_term, self.reg_term, name="loss")

    def optimization(self):
        """Add the Adam update with gradients clipped by global norm."""
        with tf.variable_scope("bp_layer"):
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                              self.config.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
            self.optim = optimizer.apply_gradients(
                zip(grads, tvars),
                global_step=self.global_step)

    def attention(self, x_sen, y_sen, x_len, y_len):
        """Dot-product soft alignment between two encoded sentences.

        :param x_sen: 3D tensor (b, x_len, d)
        :param y_sen: 3D tensor (b, y_len, d)
        :param x_len: unused; kept for interface compatibility
        :param y_len: unused; kept for interface compatibility
        :return: (1) weighted_y: for every x position, the attention-weighted
                     sum of y_sen, shape (b, x_len, d)
                 (2) weighted_x: for every y position, the attention-weighted
                     sum of x_sen, shape (b, y_len, d)

        `d` is whatever width the encoder produced. The softmax is written
        out by hand so padded positions can be masked out (via
        self.x_mask / self.y_mask) before normalizing.
        """
        # (b,x_len,d) x (b,d,y_len) -> (b,x_len,y_len)
        weight_matrix = tf.matmul(x_sen, tf.transpose(y_sen, perm=[0, 2, 1]))

        # Subtract the row/column maximum before exp for numerical stability.
        weight_matrix_y = tf.exp(
            weight_matrix
            - tf.reduce_max(weight_matrix, axis=2, keep_dims=True))  # (b,x_len,y_len)
        weight_matrix_x = tf.exp(tf.transpose(
            (weight_matrix
             - tf.reduce_max(weight_matrix, axis=1, keep_dims=True)),
            perm=[0, 2, 1]))  # (b,y_len,x_len)

        weight_matrix_y = weight_matrix_y * self.y_mask[:, None, :]  # (b,x_len,y_len)*(b,1,y_len)
        weight_matrix_x = weight_matrix_x * self.x_mask[:, None, :]  # (b,y_len,x_len)*(b,1,x_len)

        # The small constant guards against an all-masked row.
        alpha = weight_matrix_y / (
            tf.reduce_sum(weight_matrix_y, 2, keep_dims=True) + 1e-8)  # (b,x_len,y_len)
        beta = weight_matrix_x / (
            tf.reduce_sum(weight_matrix_x, 2, keep_dims=True) + 1e-8)  # (b,y_len,x_len)

        # (b,1,y_len,d)*(b,x_len,y_len,1) => (b,x_len,y_len,d) => (b,x_len,d)
        weighted_y = tf.reduce_sum(
            tf.expand_dims(y_sen, 1) * tf.expand_dims(alpha, -1), 2)

        # (b,1,x_len,d)*(b,y_len,x_len,1) => (b,y_len,x_len,d) => (b,y_len,d)
        weighted_x = tf.reduce_sum(
            tf.expand_dims(x_sen, 1) * tf.expand_dims(beta, -1), 2)

        return weighted_y, weighted_x

    def two_layer_dense(self, inp, out_dim, scope, regularizer=None):
        """Apply two stacked ReLU dense layers of width `out_dim` under `scope`."""
        with tf.variable_scope(scope):
            dense1 = tf.layers.dense(inputs=inp,
                                     units=out_dim,
                                     activation=tf.nn.relu,
                                     kernel_regularizer=regularizer,
                                     use_bias=True)

            dense2 = tf.layers.dense(inputs=dense1,
                                     units=out_dim,
                                     activation=tf.nn.relu,
                                     kernel_regularizer=regularizer,
                                     use_bias=True)
            return dense2
def tokenize(sent):
    r'''
    Split a sentence into lower-cased tokens.

    The sentence is split on runs of non-word characters (regex ``\W+``).
    The separators themselves are kept as tokens; pieces that are empty or
    whitespace-only are dropped.

    >>> tokenize('a#b')
    ['a', '#', 'b']
    '''
    # Raw string on purpose: '\W' inside a normal literal is an invalid escape
    # sequence (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12).
    return [x.strip().lower() for x in re.split(r'(\W+)', sent) if x.strip()]


def map_to_idx(x, vocab):
    '''
    Map a sequence of tokens to their vocabulary ids.

    x is a sequence of tokens; vocab maps token -> id.
    Tokens missing from vocab are mapped to 1 (UNK); 0 is reserved for PAD.
    '''
    return [vocab[w] if w in vocab else 1 for w in x]


def to_categorical(y, num_classes=None):
    """from keras.utils.np_utils import to_categorical

    Converts a class vector (integers) to binary class matrix.
    E.g. for use with categorical_crossentropy.
    # Arguments
        y: class vector to be converted into a matrix
            (integers from 0 to num_classes).
        num_classes: total number of classes. When falsy (None or 0) it is
            inferred as max(y) + 1.
    # Returns
        A binary matrix representation of the input, shape (len(y), num_classes).
    """
    y = np.array(y, dtype='int').ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes))
    # One fancy-indexed assignment sets exactly one 1 per row.
    categorical[np.arange(n), y] = 1

    return categorical


def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
    """from keras.preprocessing.sequence.pad_sequences
    Pads each sequence to the same length (length of the longest sequence).

    If maxlen is provided, any sequence longer
    than maxlen is truncated to maxlen.
    Truncation happens off either the beginning (default) or
    the end of the sequence.

    Supports post-padding and pre-padding (default).

    # Arguments
        sequences: list of lists where each element is a sequence
        maxlen: int, maximum length
        dtype: type to cast the resulting sequence.
        padding: 'pre' or 'post', pad either before or after each sequence.
        truncating: 'pre' or 'post', remove values from sequences larger than
            maxlen either in the beginning or in the end of the sequence
        value: float, value to pad the sequences to the desired value.

    # Returns
        x: numpy array with dimensions (number_of_sequences, maxlen)

    # Raises
        ValueError: in case of invalid values for `truncating` or `padding`,
            or in case of invalid shape for a `sequences` entry.
    """
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    lengths = []
    for x in sequences:
        if not hasattr(x, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))
        lengths.append(len(x))

    num_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    # Start from an array filled with the padding value, then copy each
    # (possibly truncated) sequence into its row.
    x = (np.ones((num_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x
19 | - direction: indicate 'bidirectional' or 'unidirectional' 20 | - is_training: tf.bool indicating whether training mode is enabled. 21 | Return a tuple of (outputs, init_state, final_state). 22 | """ 23 | input_size = inputs.get_shape()[-1].value 24 | if input_size is None: 25 | raise ValueError("Number of input dimensions to CuDNN RNNs must be " 26 | "known, but was None.") 27 | 28 | # CUDNN expects the inputs to be time major 29 | inputs = tf.transpose(inputs, [1, 0, 2]) 30 | 31 | cudnn_cell = tf.contrib.cudnn_rnn.CudnnLSTM( 32 | num_layers, hidden_size, input_size, 33 | input_mode="linear_input", direction=direction) 34 | 35 | est_size = estimate_cudnn_lstm_parameter_size( 36 | num_layers=num_layers, 37 | hidden_size=hidden_size, 38 | input_size=input_size, 39 | input_mode="linear_input", 40 | direction=direction) 41 | 42 | cudnn_params = tf.get_variable( 43 | "RNNParams", 44 | shape=[est_size], 45 | initializer=tf.contrib.layers.variance_scaling_initializer(), 46 | regularizer=regularizer) 47 | 48 | num_dir = direction_to_num_directions(direction) 49 | # initial_state: a tuple of tensor(s) of shape`[num_layers * num_dirs, batch_size, num_units] 50 | init_state = tf.tile( 51 | tf.zeros([num_dir * num_layers, 1, hidden_size], dtype=tf.float32), 52 | [1, tf.shape(inputs)[1], 1]) # [num_dir * num_layers, batch_size, hidden_size] 53 | ''' 54 | Args: 55 | inputs: `3-D` tensor with shape `[time_len, batch_size, input_size]`. 56 | initial_state: a tuple of tensor(s) of shape 57 | `[num_layers * num_dirs, batch_size, num_units]`. If not provided, use 58 | zero initial states. The tuple size is 2 for LSTM and 1 for other RNNs. 59 | training: whether this operation will be used in training or inference. 60 | Returns: 61 | output: a tensor of shape `[time_len, batch_size, num_dirs * num_units]`. 62 | It is a `concat([fwd_output, bak_output], axis=2)`. 63 | output_states: a tuple of tensor(s) of the same shape and structure as 64 | `initial_state`. 
def cudnn_gru(inputs, num_layers, hidden_size, is_training, direction='bidirectional',scope=None):
    """Run a CuDNN GRU over a batch-major input and return its per-step outputs.

    Arguments:
        - inputs: A tensor of shape [batch, length, input_size] of inputs.
        - num_layers: Number of RNN layers.
        - hidden_size: Number of units in each layer.
        - is_training: accepted for interface compatibility but NOT used;
          the cell below is always called with is_training=True.
        - direction: indicate 'bidirectional' or 'unidirectional'
        - scope: not used by this function.
    Returns only the output sequence, transposed back to batch major. Per the
    CudnnGRU notes quoted inline below this is
    [batch, length, num_dirs * hidden_size]. The final hidden state is
    computed but discarded.
    Raises ValueError when the last (feature) dimension of `inputs` is not
    statically known.
    ref: https://github.com/tensorflow/tensorflow/issues/13860
    """
    input_size = inputs.get_shape()[-1].value
    if input_size is None:
        raise ValueError("Number of input dimensions to CuDNN RNNs must be "
                         "known, but was None.")

    # CUDNN expects the inputs to be time major
    inputs = tf.transpose(inputs, [1, 0, 2])

    cudnn_cell = tf.contrib.cudnn_rnn.CudnnGRU(
        num_layers, hidden_size, input_size,
        input_mode="linear_input", direction=direction)

    # All weights and biases are held in one flat variable whose length is
    # computed by estimate_cudnn_gru_parameter_size.
    est_size = estimate_cudnn_gru_parameter_size(
        num_layers=num_layers,
        hidden_size=hidden_size,
        input_size=input_size,
        input_mode="linear_input",
        direction=direction)

    # NOTE(review): the variable name is fixed ("RNNParams"), so callers
    # presumably wrap each call in its own variable scope -- confirm.
    cudnn_params = tf.get_variable(
        "RNNParams",
        shape=[est_size],
        initializer=tf.contrib.layers.variance_scaling_initializer())

    num_dir = direction_to_num_directions(direction)
    # initial_state: a tuple of tensor(s) of shape`[num_layers * num_dirs, batch_size, num_units]
    # Zero state built with batch size 1, then tiled to the dynamic batch size
    # (axis 1 of the time-major input).
    init_state = tf.tile(
        tf.zeros([num_dir * num_layers, 1, hidden_size], dtype=tf.float32),
        [1, tf.shape(inputs)[1], 1])  # [num_dir * num_layers, batch_size, hidden_size]
    '''
    Args:
      inputs: `3-D` tensor with shape `[time_len, batch_size, input_size]`.
      initial_state: a tuple of tensor(s) of shape
        `[num_layers * num_dirs, batch_size, num_units]`. If not provided, use
        zero initial states. The tuple size is 2 for LSTM and 1 for other RNNs.
      training: whether this operation will be used in training or inference.
    Returns:
      output: a tensor of shape `[time_len, batch_size, num_dirs * num_units]`.
        It is a `concat([fwd_output, bak_output], axis=2)`.
      output_states: a tuple of tensor(s) of the same shape and structure as
        `initial_state`.
    '''
    # A GRU has no cell state, so the cell returns (outputs, final_h) only.
    # NOTE(review): is_training is hard-coded to True here, so the caller's
    # `is_training` argument has no effect -- confirm whether that is intended.
    hiddens, output_h = cudnn_cell(
        inputs,
        input_h=init_state,
        params=cudnn_params,
        is_training=True)

    # Convert to batch major
    hiddens = tf.transpose(hiddens, [1, 0, 2])
    output_h = tf.transpose(output_h, [1, 0, 2])  # transposed but never returned

    return hiddens
def cudnn_lstm_parameter_size(input_size, hidden_size):
    """Number of parameters in a single CuDNN LSTM cell (one layer, one direction).

    Counts 4 input-to-hidden and 4 hidden-to-hidden weight matrices plus
    8 bias vectors of length hidden_size.
    """
    biases = 8 * hidden_size
    weights = 4 * (hidden_size * input_size) + 4 * (hidden_size * hidden_size)
    return biases + weights


def estimate_cudnn_gru_parameter_size(num_layers,
                                      input_size,
                                      hidden_size,
                                      input_mode,
                                      direction):
    """
    Compute the number of parameters needed to
    construct a stack of GRUs. Assumes the hidden states
    of bidirectional GRUs are concatenated before being
    sent to the next layer up.

    Arguments:
        - num_layers: number of stacked layers.
        - input_size: feature size fed to the first layer.
        - hidden_size: number of units per layer and direction.
        - input_mode: accepted for signature compatibility; not used here.
        - direction: 'unidirectional' or 'bidirectional'.
    Raises ValueError (from direction_to_num_directions) for any other direction.
    """
    num_directions = direction_to_num_directions(direction)
    params = 0
    isize = input_size
    for _ in range(num_layers):
        # Every direction of a layer has the same shape, so multiply instead of
        # looping (the old inner loop variable also shadowed `direction`).
        params += num_directions * cudnn_gru_parameter_size(isize, hidden_size)
        # The next layer consumes the concatenated outputs of all directions.
        isize = hidden_size * num_directions
    return params


def cudnn_gru_parameter_size(input_size, hidden_size):
    """Number of parameters in a single CuDNN GRU cell (one layer, one direction).

    Counts 3 input-to-hidden and 3 hidden-to-hidden weight matrices plus
    6 bias vectors of length hidden_size.
    """
    biases = 6 * hidden_size
    weights = 3 * (hidden_size * input_size) + 3 * (hidden_size * hidden_size)
    return biases + weights


def direction_to_num_directions(direction):
    """Return 1 for "unidirectional" and 2 for "bidirectional"; raise ValueError otherwise."""
    if direction == "unidirectional":
        return 1
    elif direction == "bidirectional":
        return 2
    else:
        raise ValueError("Unknown direction: %r." % (direction,))


def parameter_count():
    """Return the total number of parameters in all Tensorflow-defined
    variables, using `tf.trainable_variables()` to get the list of
    variables."""
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    return sum(np.prod(var.get_shape().as_list())
               for var in tf.trainable_variables())
def data_type():
    """Return tf.float16 when --use_fp16 is set, tf.float32 otherwise."""
    return tf.float16 if FLAGS.use_fp16 else tf.float32


def fill_placeholder(data, model, config):
    """Draw the next batch from `data` and map it onto the model's placeholders.

    `data.next_batch(config.batch_size)` must return, in this order:
    x, y, label, x_mask, y_mask, x_len, y_len.
    Returns a feed dict keyed by the corresponding `model` attributes.
    """
    (batch_x, batch_y, batch_label,
     batch_x_mask, batch_y_mask,
     batch_x_len, batch_y_len) = data.next_batch(config.batch_size)
    feed_dict = {model.x: batch_x,
                 model.y: batch_y,
                 model.label: batch_label,
                 model.x_mask: batch_x_mask,
                 model.y_mask: batch_y_mask,
                 model.x_len: batch_x_len,
                 model.y_len: batch_y_len,
                 }

    return feed_dict


def run_epoch(session, data, model, config, eval_op=None, verbose=False):
    """Runs the model on the given data for one epoch.

    Arguments:
        session: object with a `run(fetches, feed_dict)` method.
        data: batch source providing `get_epoch_size(batch_size)` and
            `next_batch(batch_size)`.
        model: exposes acc, loss, global_step, pred, label and the placeholders
            used by fill_placeholder.
        config: provides `batch_size`.
        eval_op: optional extra op to run with every batch (the train op
            during training); nothing extra is run when None.
        verbose: currently unused.
    Returns:
        (mean accuracy, mean loss, global_step, pred, label). The last three
        come from the LAST batch of the epoch only.
    Raises:
        ValueError: when the epoch contains no batches.
    """
    fetches = {
        "acc": model.acc,
        "loss": model.loss,
        "global_step": model.global_step,
        "pred": model.pred,
        "label": model.label,
    }
    if eval_op is not None:
        fetches["eval_op"] = eval_op

    epoch_size = data.get_epoch_size(config.batch_size)
    if epoch_size <= 0:
        # Previously this surfaced as a bare ZeroDivisionError when averaging.
        raise ValueError("run_epoch needs at least one batch, got epoch_size=%s" % epoch_size)

    losses = 0.0
    acc_total = 0.0
    iters = 0
    for _ in range(epoch_size):
        feed_dict = fill_placeholder(data, model, config)
        vals = session.run(fetches, feed_dict)

        global_step = vals["global_step"]
        pred = vals["pred"]
        label = vals["label"]

        losses += vals["loss"]
        acc_total += vals["acc"]
        iters += 1

    acc_average = acc_total / iters
    loss_average = losses / iters
    return acc_average, loss_average, global_step, pred, label


def get_config():
    """Return the config object selected by --model (only "small" is supported).

    Raises ValueError for any other value.
    """
    if FLAGS.model == "small":
        return SmallConfig()
    else:
        # Format the message explicitly; passing the value as a second
        # argument to ValueError left the "%s" unexpanded.
        raise ValueError("Invalid model: %s" % FLAGS.model)


def main(_):
    """Train the model chosen by --model_type with early stopping on dev accuracy.

    Builds a training model and a weight-sharing validation model (batch size 1),
    trains for up to config.MAXITER epochs, saves a checkpoint whenever the dev
    accuracy matches or beats the best seen so far, and stops once more than
    config.early_stopping epochs have passed without such an improvement.
    """
    config = get_config()
    # Command-line flags override the defaults from the config class.
    config.learning_rate = FLAGS.learning_rate
    config.keep_prob = FLAGS.keep_prob
    config.l2_strength = FLAGS.l2_strength
    config.batch_size = FLAGS.batch_size

    eval_config = copy.deepcopy(config)
    eval_config.batch_size = 1
    print("config", vars(config))
    print("eval_config", vars(eval_config))

    Train, Dev, vocab = reader.file2seqid(config)

    # The model class lives in a module named after --model_type
    # (e.g. esim.py or decomposable_att.py), each exposing `MyModel`.
    module = importlib.import_module(FLAGS.model_type)
    MyModel = getattr(module, 'MyModel')
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)

        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = MyModel(is_training=True, config=config)

        with tf.name_scope("Valid"):
            # reuse=True: the validation model shares the training variables.
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = MyModel(is_training=False, config=eval_config)

        sv = tf.train.Supervisor()
        with sv.managed_session() as session:
            # np.prod instead of np.product: the alias was removed in NumPy 2.0.
            print("model params", np.sum([np.prod([xi.value for xi in x.get_shape()]) for x in tf.trainable_variables()]))
            t0 = time.time()
            best_dev_acc = 0.0
            best_val_epoch = 0

            for i in range(config.MAXITER):
                start_time = time.time()
                train_acc, train_loss, train_global_step, train_pred, train_label = run_epoch(
                    session, data=Train, model=m, config=config, eval_op=m.optim, verbose=True)
                print("Epoch: %d train_acc: %.4f train_loss %.4f train_global_step:%s" % (i, train_acc, train_loss, train_global_step))

                dev_acc, dev_loss, _, dev_pred, dev_label = run_epoch(
                    session, data=Dev, model=mvalid, config=eval_config)
                print("Epoch: %d dev_acc: %.4f dev_loss %.4f" % (i, dev_acc, dev_loss))

                sys.stdout.flush()
                if best_dev_acc <= dev_acc:
                    best_dev_acc = dev_acc
                    best_val_epoch = i
                    if FLAGS.save_path:
                        print("train_global_step:%s.  Saving %d model to %s." % (train_global_step, i, FLAGS.save_path))
                        sv.saver.save(session, FLAGS.save_path + "/model", global_step=train_global_step)
                        # NOTE(review): nesting of this timestamp print was
                        # reconstructed from a whitespace-collapsed source -- confirm.
                        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

                end_time = time.time()
                print("################# all_training time: %s one_epoch time: %s ############### " % ((end_time - t0) // 60, (end_time - start_time) // 60))
                if i - best_val_epoch > config.early_stopping:
                    print("best_val_epoch:%d  best_val_accuracy:%.4f" % (best_val_epoch, best_dev_acc))
                    logging.info("Normal Early stop")
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                    break
                elif i == config.MAXITER - 1:
                    print("best_val_epoch:%d  best_val_accuracy:%.4f" % (best_val_epoch, best_dev_acc))
                    logging.info("Finishe Training")


if __name__ == "__main__":
    tf.app.run()
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/blcunlp/CNLI/604638ecf50201c15da6420ceb14aa7a43bd1463/CCL2018中文文本蕴含识别系统报告集合.pdf -------------------------------------------------------------------------------- /CNLI2018 Evaluation Result.md: -------------------------------------------------------------------------------- 1 | # CNLI2018 Evaluation Result 2 | #### CNLI2018 已在近日结束。 3 | #### 我们在Codalab上收到了12份提交答案,排名如下: 4 | 5 | 排名 | 团队名 | 提交者 | 模型名称 | 准确度 | Github链接 6 | ---|---|---|---|---|---| 7 | 1 | water | water123 | cnn+lstm |0.8238 | 8 | 2 | zzunlp2018 | nlpc | decomposable_att_t | 0.7828 | 9 | 3 | 百度智珠团队 | ShawnNg | Excalibur | 0.7692 | 10 | 4 | GDUFSER | Kunxun_Qi | - | 0.7618 | 11 | 5 | ray_li | ray_li | - | 0.7425 | 12 | 6 | INTSIG_AI | eedanny | - | 0.7303 | 13 | 7 | Yonseiiii | Parkhaeju | decom-att | 0.7242 | 14 | 8 | **Baseline** | **BLCU-nlp** | **ESIM** | **0.7222** | 15 | 9 | 狂奔 | friend2 | lstm+cnn | 0.6952 | 16 | 10 | _503 | _503 | bi | 0.6848 | 17 | 11 | 遵义医学院医学信息工程学院 | lyb3b | BiLSTM | 0.6203 | 18 | 12 | Hiter | oliver_arrow | DAM | 0.6090 | 19 | 20 | 21 | #### 在征得参赛团队及个人同意后,我们会放上模型代码的Github链接,供大家研究参考。 22 | #### 评测现处于Post Competition 阶段,且不会关闭。可以继续提交结果,刷新SOA。 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /Codalab Example/answer.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blcunlp/CNLI/604638ecf50201c15da6420ceb14aa7a43bd1463/Codalab Example/answer.zip -------------------------------------------------------------------------------- /Codalab Example/readme: -------------------------------------------------------------------------------- 1 | This is an example of result submission for our CNLI competition on Codalab:https://competitions.codalab.org/competitions/19911. 
2 | 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Baseline for Chinese Natural Language Inference (CNLI) dataset 2 | 3 | ## Description 4 | This repository provides the official training and development dataset for [the Chinese Natural Language Inference (CNLI) shared task](http://www.cips-cl.org/static/CCL2018/call-evaluation.html). 5 | We evaluate the cnli\_1.0 corpus on two baseline models. 6 | 7 | 8 | ## Data 9 | 10 | The CNLI dataset can be downloaded [here](https://github.com/blcunlp/CNLI/tree/master/CNLI_Data). 11 | 12 | Both the train and dev sets are in **tab-separated** format. 13 | Each line in the train (or dev) file corresponds to an instance, and it is arranged as: 14 | >sentence-id premise hypothesis label 15 | 16 | 17 | 18 | ## Model 19 | 20 | This repository includes baseline models for the Chinese Natural Language Inference (CNLI) dataset. 21 | We provide two baseline models. 22 | (1) The [Decomposable Attention Model](https://arxiv.org/pdf/1606.01933.pdf), which uses feed-forward neural networks (FNNs) and an inter-sentence attention mechanism. More details about the model can be found in the [original paper](https://arxiv.org/pdf/1606.01933.pdf). 23 | (2) The [ESIM Model](https://arxiv.org/pdf/1609.06038.pdf), which is a strong baseline model for the SNLI dataset. 24 | 25 | ## Requirements 26 | * python 3.5 27 | * tensorflow '1.4.0' 28 | * jieba 0.39 29 | 30 | ## Training 31 | 32 | 33 | **Data Preprocessing** 34 | We use jieba to tokenize the sentences. During training, we use the pre-trained SGNS embedding introduced in [Analogical Reasoning on Chinese Morphological and Semantic Relations](https://arxiv.org/abs/1805.06504). You can download the sgns.merge.word from [here](https://pan.baidu.com/s/1kwxiPouou6ecxyJdYmnkvw). 35 | 36 | **Main Scripts** 37 | config.py: the parameter configuration.
38 | decomposable_att.py: implementation of the Decomposable Attention Model. 39 | data_reader.py: preparing data for the model. 40 | train.py: training script for both baseline models (selected with --model_type). 41 | 42 | **Running Model** 43 | You can train the decomposable attention model and the ESIM model with the following commands: 44 | > python3 train.py --model_type decomposable_att 45 | > python3 train.py --model_type esim 46 | 47 | 48 | 49 | ## Results 50 | We provide the whole training data, which comprises 90,000 items in the training set and 10,000 items in the dev set. 51 | We adopt early stopping on the dev set. The best results are shown in the following table: 52 | 53 | |Model |train-acc(%)|dev-acc(%) 54 | |:-:|:-:|:-: 55 | | Decomposable-Att|76.91 |69.35 56 | |ESIM | 76.82| 73.57 57 | 58 | 59 | 60 | ## Reporting issues 61 | Please let us know if you encounter any problems. 62 | --------------------------------------------------------------------------------