├── BaselineModel
    ├── config.py
    ├── data
    │   ├── cnli
    │   │   ├── cnli_dev_1.0_seg.txt
    │   │   └── cnli_train_1.0_seg.txt
    │   └── embedding
    │   │   ├── cnli_embedding.npy
    │   │   ├── cnli_vocab.txt
    │   │   ├── convert_embedding.py
    │   │   └── run_embedding.sh
    ├── data_reader.py
    ├── decomposable_att.py
    ├── esim.py
    ├── myutils.py
    ├── ops_cudnn_rnn.py
    ├── run.sh
    └── train.py
├── CCL2018中文文本蕴含评测总结.pdf
├── CCL2018中文文本蕴含识别系统报告集合.pdf
├── CNLI2018 Evaluation Result.md
├── CNLI_Data
    ├── cnli_dev_1.0.txt
    ├── cnli_test_1.0.txt
    ├── cnli_test_labeled.txt
    └── cnli_train_1.0.txt
├── Codalab Example
    ├── answer.zip
    └── readme
└── README.md


/BaselineModel/config.py:
--------------------------------------------------------------------------------
 1 | class SmallConfig(object):
 2 |   """Small config."""
 3 |   init_scale = 0.1
 4 |   learning_rate = 0.0003
 5 | 
 6 |   max_grad_norm = 5
 7 |   xmaxlen=32
 8 |   ymaxlen=30
 9 |   num_classes=3
10 |   hidden_units = 300
11 |   embedding_size =300
12 |   MAXITER=70
13 |   keep_prob = 0.8
14 |               
15 |   batch_size = 32
16 |   l2_strength=0.0003
17 | 
18 |   early_stopping=5
19 |  
20 |   train_file='./data/cnli/cnli_train_1.0_seg.txt'
21 |   dev_file='./data/cnli/cnli_dev_1.0_seg.txt'
22 | 
23 |   cnli_embedding_dir= './data/embedding/cnli_embedding.npy'
24 | 
25 | 


--------------------------------------------------------------------------------
/BaselineModel/data/embedding/cnli_embedding.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blcunlp/CNLI/604638ecf50201c15da6420ceb14aa7a43bd1463/BaselineModel/data/embedding/cnli_embedding.npy


--------------------------------------------------------------------------------
/BaselineModel/data/embedding/convert_embedding.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # read a vocab and precompute a .npy embedding matrix.
 4 | # if a vocab entry is in the provided glove embeddings then use the glove data. 
 5 | # if it's not, generate a random vector but scale it to the median length of the glove embeddings.
 6 | # reserve row 0 in the matrix for the PAD embedding (always set to {0}) 
 7 | # reserve row 1 in the matrix for the UNK embedding (given a random value)
 8 | import argparse
 9 | import numpy as np
10 | import sys
11 | from sklearn import random_projection
12 | 
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument("--vocab", required=True, help="reference vocab of non glove data; token \t idx")
15 | parser.add_argument("--glove-data", required=True, help="glove data. ssv, token, e_d1, e_d2, ...")
16 | parser.add_argument("--npy", required=True, help="npy output")
17 | parser.add_argument("--random-projection-dimensionality", default=None, type=float, 
18 |                     help="if set we randomly project the glove data to a smaller dimensionality")
19 | opts = parser.parse_args()
20 | 
21 | # slurp vocab entries. assume idxs are valid, ie 1 < i < |v|, no dups, no gaps, etc
22 | # (recall reserving 0 for UNK)
23 | # TODO: use vocab.py
24 | vocab = {}  # token => idx
25 | for line in open(opts.vocab, "r"):
26 |     token, idx = line.strip().split("\t")
27 |     if idx == 0:
28 |         assert token == '_PAD', "expecting to reserve 0 for _PAD"
29 |     elif idx == 1:
30 |         assert token == '_UNK', "expecting to reserve 1 for _UNK"
31 |     elif idx ==2:
32 |         assert token == '_GO',  "expecting to reverse 2 for _GO"
33 |     elif idx ==3:
34 |         assert token == '_EOS',  "expecting to reverse 3 for _EOS"
35 |     else:
36 |         vocab[token] = int(idx)
37 | print "vocab has", len(vocab), "entries (not _PAD or _UNK or _GO or _EOS)"
38 | 
39 | # alloc output after we see first glove embedding (so we know it's dimensionality)
40 | embeddings = None
41 | glove_dimensionality = None
42 | 
43 | # pass over glove data copying data into embedddings array
44 | # for the cases where the token is in the reference vocab.
45 | tokens_requiring_random = set(vocab.keys())
46 | glove_embedding_norms = []
47 | for line in open(opts.glove_data, "r"):
48 |     cols = line.strip().split(" ")
49 |     token = cols[0]
50 |     if token in vocab:
51 |         glove_embedding = np.array(cols[1:], dtype=np.float32)
52 |         if embeddings is None:
53 |             glove_dimensionality = len(glove_embedding)
54 |             embeddings = np.empty((len(vocab), glove_dimensionality), dtype=np.float32)  # +1 for pad & unk
55 |         assert len(glove_embedding) == glove_dimensionality, "differing dimensionality in glove data?"
56 |         embeddings[vocab[token]] = glove_embedding
57 |         tokens_requiring_random.remove(token)
58 |         glove_embedding_norms.append(np.linalg.norm(glove_embedding))
59 | 
60 | # given these embeddings we can calculate the median norm of the glove data
61 | median_glove_embedding_norm = np.median(glove_embedding_norms)
62 | 
63 | print >>sys.stderr, "build .npy file" 
64 | print >>sys.stderr, "after passing over glove there are", len(tokens_requiring_random), \
65 |     "tokens requiring a random alloc"
66 | 
67 | # return a random embedding with the same norm as the glove data median norm
68 | def random_embedding():
69 |     random_embedding = np.random.randn(1, glove_dimensionality)
70 |     random_embedding /= np.linalg.norm(random_embedding)
71 |     random_embedding *= median_glove_embedding_norm
72 |     return random_embedding
73 | 
74 | # assign PAD and UNK random embeddings (pre projection)
75 | embeddings[0] = random_embedding()  # PAD
76 | embeddings[1] = random_embedding()  # UNK
77 | 
78 | # assign random projections for every other fields requiring it
79 | for token in tokens_requiring_random:
80 |     embeddings[vocab[token]] = random_embedding()
81 | 
82 | # randomly project (if configured to do so)
83 | if opts.random_projection_dimensionality is not None:
84 |     # assign a temp random embedding for PAD before projection (and zero it after)
85 |     p = random_projection.GaussianRandomProjection(n_components=opts.random_projection_dimensionality)
86 |     embeddings = p.fit_transform(embeddings)
87 | 
88 | # zero out PAD embedding
89 | embeddings[0] = [0] * embeddings.shape[1]
90 | 
91 | # write embeddings npy to disk
92 | np.save(opts.npy, embeddings)
93 | 
94 | 
95 | 
96 | 


--------------------------------------------------------------------------------
/BaselineModel/data/embedding/run_embedding.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | #time cat ../snli_1.0/snli_1.0_train.jsonl | ./generate_vocab_from_snli.py  > glove/vocab.tsv
 3 | 
 4 | 
 5 | time ./data/embedding/convert_embedding.py \
 6 |  --vocab ./data/embedding/cnli_vocab.txt \
 7 |  --glove-data ./data/embedding/sgns.merge.word \
 8 |  --npy ./data/embedding/cnli_embedding.npy \
 9 | 
10 | #
11 | # --random-projection-dimensionality 100
12 | 


--------------------------------------------------------------------------------
/BaselineModel/data_reader.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import os
  3 | import json
  4 | from myutils import *
  5 | from collections import Counter
  6 | 
  7 | from six.moves import xrange
  8 | import numpy as np
  9 | _PAD="_PAD"
 10 | _UNK= "_UNK"
 11 | _GO= "_GO"
 12 | _EOS= "_EOS"
 13 | _START_VOCAB=[_PAD,_UNK,_GO,_EOS]
 14 | 
 15 | PAD_ID=0
 16 | UNK_ID=1
 17 | GO_ID =2
 18 | EOS_ID =3
 19 | 
 20 | def filter_length(seq,maxlen):
 21 |   if len(seq)>maxlen:
 22 |     new_seq=seq[:maxlen]
 23 |   else:
 24 |     new_seq=seq
 25 |   return new_seq
 26 | 
 27 | def load_data(train,vocab,labels={'neutral':0,'entailment':1,'contradiction':2}):
 28 |     X,Y,Z=[],[],[]
 29 |     for p,h,l in train:
 30 |         p=map_to_idx(tokenize(p),vocab)+ [EOS_ID]
 31 |         h=[GO_ID]+map_to_idx(tokenize(h),vocab)+ [EOS_ID]
 32 |         p=filter_length(p,32)
 33 |         h=filter_length(h,30)
 34 |         if l in labels:        
 35 |             X+=[p]
 36 |             Y+=[h]
 37 |             Z+=[labels[l]]
 38 |     return X,Y,Z
 39 | 
 40 | def get_vocab(data):
 41 |     vocab=Counter()
 42 |     for ex in data:
 43 |         tokens=tokenize(ex[0])
 44 |         tokens+=tokenize(ex[1])
 45 |         vocab.update(tokens)
 46 |     vocab_sorted = sorted(vocab.items(), key=lambda x: (-x[1], x[0]))
 47 |     lst = _START_VOCAB + [ x for x, y in vocab_sorted if y > 0]
 48 | 
 49 |     vocab_exist=os.path.isfile("./data/embedding/cnli_vocab.txt")
 50 | 
 51 |     #if not vocab_exist:
 52 |     print ("build cnli_vocab.txt")
 53 |     f =open("./data/embedding/cnli_vocab.txt","w+")
 54 |     for x,y in enumerate(lst):
 55 |       x_y = str(y) +"\t"+ str(x)+"\n"
 56 |       f.write(x_y)
 57 |     f.close()
 58 | 
 59 |     os.system('./data/embedding/run_embedding.sh') 
 60 |     vocab = dict([ (y,x) for x,y in enumerate(lst)])
 61 |     return vocab
 62 | 
 63 | 
 64 | class DataSet(object):
 65 |   def __init__(self,x,y,labels,x_len,y_len,X_mask,Y_mask):
 66 |     self._data_len=len(x)
 67 |     self._x =x
 68 |     self._y =y
 69 |     self._labels =labels
 70 |     self._x_len = x_len
 71 |     self._y_len = y_len
 72 |     self._epochs_completed = 0
 73 |     self._index_in_epoch = 0
 74 |     self._num_examples = x.shape[0]
 75 |     self._x_mask=X_mask
 76 |     self._y_mask=Y_mask
 77 | 
 78 |   def next_batch(self, batch_size):
 79 |     """Return the next `batch_size` examples from this data set."""
 80 | 
 81 |     start = self._index_in_epoch
 82 |     self._index_in_epoch += batch_size
 83 |     if self._index_in_epoch > self._num_examples:
 84 |       # Finished epoch
 85 |       self._epochs_completed += 1
 86 | 
 87 |       # Start next epoch
 88 |       start = 0
 89 |       self._index_in_epoch = batch_size
 90 |       assert batch_size <= self._num_examples
 91 | 
 92 |     end = self._index_in_epoch
 93 | 
 94 |     batch_x, batch_x_mask, batch_x_len = self._x[start:end], self._x_mask[start:end], self._x_len[start:end]
 95 |     batch_y,batch_y_mask, batch_y_len = self._y[start:end], self._y_mask[start:end], self._y_len[start:end]
 96 |     batch_labels = self._labels[start:end]
 97 |     
 98 |     return batch_x,batch_y, batch_labels,batch_x_mask,batch_y_mask,batch_x_len,batch_y_len
 99 | 
100 |   @property
101 |   def get_x(self):
102 |     return self._x
103 |   
104 |   @property
105 |   def get_y(self):
106 |     return self.y
107 | 
108 |   @property
109 |   def labels(self):
110 |     return self._labels
111 | 
112 |   @property
113 |   def get_x_len(self):
114 |     return self._x_len
115 |   
116 |   @property
117 |   def get_y_len(self):
118 |     return self._y_len
119 | 
120 |   @property
121 |   def get_data_num(self):
122 |     return self._data_len
123 |   
124 |   def get_epoch_size(self,batch_size):
125 |     epoch_size = self._data_len //batch_size
126 |     return epoch_size
127 | 
def singlefile2seqid(data,vocab, config):
  """Turn raw rows of one split into a padded `DataSet`.

  Args:
    data: rows accepted by `load_data` (premise, hypothesis, label).
    vocab: token -> id mapping; must contain `_PAD`.
    config: provides `xmaxlen`, `ymaxlen` and `num_classes`.

  Returns:
    DataSet holding padded id matrices, one-hot labels, the unpadded
    lengths and 1/0 masks marking real vs. padded positions.

  NOTE(review): `load_data` truncates with its own default lengths, not
  with config.xmaxlen / config.ymaxlen -- keep the two in sync.
  """
  X_data, Y_data,  Z_data = load_data(data, vocab)

  X_data_lengths=np.asarray([len(x) for x in X_data])
  # One all-ones row per example, as a plain list: wrapping the ragged rows
  # in np.asarray(...).reshape(n) breaks when every row has the same length
  # (the array becomes 2-D) and on NumPy versions that reject ragged input.
  # Assumes pad_sequences accepts any sequence of sequences -- TODO confirm in myutils.
  X_data_mask = [np.ones(n) for n in X_data_lengths]
  # vocab[_PAD] is the pad id; for the masks it doubles as the "padded" flag.
  X_data_mask=pad_sequences(X_data_mask, maxlen=config.xmaxlen, value=vocab[_PAD], padding='post')
  X_data=pad_sequences(X_data, maxlen=config.xmaxlen, value=vocab[_PAD], padding='post')

  Y_data_lengths = np.asarray([len(y) for y in Y_data])
  Y_data_mask = [np.ones(n) for n in Y_data_lengths]
  Y_data_mask = pad_sequences(Y_data_mask, maxlen=config.ymaxlen, value=vocab[_PAD], padding='post')
  Y_data = pad_sequences(Y_data, maxlen=config.ymaxlen, value=vocab[_PAD], padding='post')

  Z_data = to_categorical(Z_data, num_classes=config.num_classes)
  dataset = DataSet(X_data,Y_data,Z_data,
                    X_data_lengths,Y_data_lengths,
                    X_data_mask,Y_data_mask)

  return dataset
149 | 
def file2seqid(config):
  """Load the train and dev files named in `config` as `DataSet` objects.

  Each line is stripped and split on tabs. The vocabulary is built from the
  training rows only (which also rewrites the vocab file and re-runs the
  embedding script, see `get_vocab`).

  Returns:
    (Train, Dev, vocab)
  """
  # Context managers close the files; the handles used to be leaked.
  with open(config.train_file) as train_fh:
    train = [l.strip().split('\t') for l in train_fh]
  with open(config.dev_file) as dev_fh:
    dev = [l.strip().split('\t') for l in dev_fh]
  vocab = get_vocab(train)

  Train = singlefile2seqid(train,vocab, config)
  Dev = singlefile2seqid(dev,vocab, config)
  return Train,Dev,vocab
161 |  
162 |   
163 | 
164 |  
if __name__=="__main__":
    # Manual smoke test: expects train.txt, dev.txt and test.txt (tab
    # separated rows) in the current directory.

    def _read_rows(path):
        # Read one file into a list of tab-split rows, closing the handle.
        with open(path) as fh:
            return [l.strip().split('\t') for l in fh]

    train=_read_rows('train.txt')[:20000]
    dev=_read_rows('dev.txt')
    test=_read_rows('test.txt')  # read but not used below

    vocab=get_vocab(train)
    X_dev,Y_dev,Z_dev=load_data(dev,vocab)
    print (len(X_dev),X_dev[0])
    print (len(Y_dev),Y_dev[0])
    print (len(Z_dev),Z_dev[0])
179 | 


--------------------------------------------------------------------------------
/BaselineModel/decomposable_att.py:
--------------------------------------------------------------------------------
  1 | ###############
  2 | #20180615
  3 | #implementation of decomposable attention on cnli
  4 | ################
  5 | 
  6 | from __future__ import absolute_import
  7 | from __future__ import division
  8 | from __future__ import print_function
  9 | 
 10 | import sys
 11 | import time
 12 | import inspect
 13 | import logging
 14 | import numpy as np
 15 | import tensorflow as tf
 16 | from tensorflow.contrib.rnn.python.ops import core_rnn_cell
 17 | from tensorflow.contrib.layers import batch_norm,l2_regularizer
 18 | from tensorflow.python.ops import variable_scope
 19 | 
 20 | 
 21 | class MyModel(object):
 22 |   """The decomposable model."""
 23 | 
 24 |   def __init__(self, is_training, config):
 25 | 
 26 |     batch_size = config.batch_size
 27 |     self.config = config
 28 |     self.is_training = is_training
 29 |     self.global_step = tf.Variable(0, trainable=False)
 30 |    
 31 |     self.add_placeholder() 
 32 |     self.add_embedding() 
 33 |     self.input_encoding()
 34 |     self.attend()
 35 |     self.compare() 
 36 |     self.aggregate() 
 37 | 
 38 |     self.compute_accuracy()
 39 |     self.compute_loss()   
 40 | 
 41 |     if not is_training:
 42 |         return
 43 |     self.optimization()
 44 | 
 45 |   def add_placeholder(self):
 46 |     '''
 47 |     add_placeholder for inputs
 48 |     '''
 49 |     self.x = tf.placeholder(tf.int32, [self.config.batch_size, self.config.xmaxlen])
 50 |     self.y = tf.placeholder(tf.int32, [self.config.batch_size, self.config.ymaxlen])
 51 | 
 52 |     self.x_mask = tf.placeholder(tf.int32, [self.config.batch_size, self.config.xmaxlen])
 53 |     self.y_mask = tf.placeholder(tf.int32, [self.config.batch_size, self.config.ymaxlen])
 54 |     self.x_mask = tf.cast(self.x_mask,tf.float32)
 55 |     self.y_mask = tf.cast(self.y_mask,tf.float32)
 56 | 
 57 |     self.x_len = tf.placeholder(tf.int32, [self.config.batch_size,])
 58 |     self.y_len = tf.placeholder(tf.int32, [self.config.batch_size,])
 59 |     self.x_len = tf.cast(self.x_len,tf.float32)
 60 |     self.y_len = tf.cast(self.y_len,tf.float32)
 61 | 
 62 |     self.label = tf.placeholder(tf.int32, [self.config.batch_size,self.config.num_classes])
 63 |   
 64 | 
 65 |   def add_embedding(self):
 66 |     '''
 67 |     add pretrained embedding
 68 |     '''
 69 |     with tf.device("/cpu:0"):
 70 |       embedding_matrix=np.load(self.config.cnli_embedding_dir)
 71 |       embedding = tf.Variable(embedding_matrix,trainable=False, name="embedding")
 72 |       
 73 |       self.input_xemb = tf.nn.embedding_lookup(embedding, self.x)
 74 |       self.input_yemb = tf.nn.embedding_lookup(embedding, self.y)
 75 |     
 76 |       if self.is_training and self.config.keep_prob < 1:
 77 |         self.input_xemb = tf.nn.dropout(self.input_xemb, self.config.keep_prob)
 78 |         self.input_yemb = tf.nn.dropout(self.input_yemb, self.config.keep_prob)
 79 | 
 80 |   def input_encoding(self):
 81 |     '''
 82 |     encode the x and y with a two-layer fnn seperately
 83 |     '''
 84 |     with tf.variable_scope("encode_x"):
 85 |       self.x_output=self.two_layer_dense(self.input_xemb,self.config.hidden_units,
 86 |                                         scope="x_fnn",regularizer=l2_regularizer(self.config.l2_strength) )
 87 |       self.x_output=self.x_output*self.x_mask[:,:,None]
 88 | 
 89 |       if self.is_training and self.config.keep_prob < 1:
 90 |         self.x_output = tf.nn.dropout(self.x_output,self.config.keep_prob)  # its length must be x_length
 91 | 
 92 |     with tf.variable_scope("encode_y"):
 93 |       self.y_output=self.two_layer_dense(self.input_yemb,self.config.hidden_units,
 94 |                                          scope="y_fnn",regularizer=l2_regularizer(self.config.l2_strength))
 95 |       self.y_output=self.y_output*self.y_mask[:,:,None]
 96 | 
 97 |       if self.is_training and self.config.keep_prob < 1:
 98 |         self.y_output = tf.nn.dropout(self.y_output, self.config.keep_prob)
 99 | 
100 | 
101 |   def attend(self):
102 |       self.weighted_y, self.weighted_x =self.attention(x_sen= self.x_output,
103 |                                                        y_sen= self.y_output,
104 |                                                        x_len= self.config.xmaxlen,
105 |                                                        y_len= self.config.ymaxlen)
106 | 
107 | 
108 |   def compare(self):
109 | 
110 |     with tf.variable_scope("compare"):
111 |       with tf.variable_scope("compare-xy"):
112 |         co_xy = tf.concat([self.x_output,self.weighted_y],axis=-1) 
113 |         v_co_xy=self.two_layer_dense(co_xy,self.config.hidden_units,
114 |                                      scope="compare_xy",regularizer=l2_regularizer(self.config.l2_strength))
115 |         self.v_co_xy=v_co_xy*self.x_mask[:,:,None]
116 | 
117 |         if self.is_training and self.config.keep_prob < 1:
118 |           self.v_co_xy = tf.nn.dropout(self.v_co_xy,self.config.keep_prob)  
119 | 
120 |       with tf.variable_scope("compare-yx"):
121 |         co_yx = tf.concat([self.y_output,self.weighted_x],axis=-1) 
122 |         v_co_yx=self.two_layer_dense(co_yx,self.config.hidden_units,
123 |                                      scope="compare_yx",regularizer=l2_regularizer(self.config.l2_strength))
124 |         self.v_co_yx=v_co_yx*self.y_mask[:,:,None]
125 | 
126 |         if self.is_training and self.config.keep_prob < 1:
127 |           self.v_co_yx = tf.nn.dropout(self.v_co_yx,self.config.keep_prob)  
128 | 
129 | 
130 |   def aggregate(self):
131 |     '''
132 |     1. sum pooling   2. fnn
133 |     ''' 
134 |     with tf.variable_scope("pooling"):
135 |       v1=tf.reduce_sum(self.v_co_xy,axis=1)
136 |       v2=tf.reduce_sum(self.v_co_yx,axis=1)
137 | 
138 |       self.v = tf.concat([v1,v2],axis=-1) 
139 | 
140 |     with tf.variable_scope("pred-layer"):
141 |   
142 |       dense1 = tf.layers.dense(inputs=self.v,
143 |                              units=self.config.hidden_units, 
144 |                              activation=tf.nn.tanh,
145 |                              use_bias=True,
146 |                              kernel_regularizer= l2_regularizer(self.config.l2_strength),
147 |                              name="dense-pred-W")
148 | 
149 |       if self.is_training and self.config.keep_prob < 1:
150 |         dense1 = tf.nn.dropout(dense1, self.config.keep_prob)
151 | 
152 |       W_pred = tf.get_variable("W_pred", shape=[self.config.hidden_units, self.config.num_classes],regularizer=l2_regularizer(self.config.l2_strength))
153 | 
154 |       self.pred = tf.nn.softmax(tf.matmul(dense1, W_pred), name="pred")
155 | 
156 |   def compute_accuracy(self):
157 |     correct = tf.equal(tf.argmax(self.pred,1),tf.argmax(self.label,1))
158 |     self.acc = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")
159 | 
160 |   def compute_loss(self):
161 |     
162 |     self.loss_term = -tf.reduce_sum(tf.cast(self.label,tf.float32) * tf.log(self.pred),name="loss_term")
163 |     self.reg_term = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES),name="reg_term")
164 |     self.loss = tf.add(self.loss_term,self.reg_term,name="loss")
165 | 
166 | 
167 |   def optimization(self):
168 |   
169 |     with tf.variable_scope("bp_layer"):
170 |       tvars = tf.trainable_variables()
171 |       grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
172 |                                       self.config.max_grad_norm)
173 |       optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
174 |       self.optim = optimizer.apply_gradients(
175 |           zip(grads, tvars),
176 |           global_step=self.global_step)
177 | 
178 | 
179 | 
180 |   def attention(self,x_sen,y_sen,x_len,y_len):
181 |     '''
182 |     function: use the dot-production of left_sen and right_sen to compute the attention weight matrix
183 |     :param left_sen: a list of 2D tensor (x_len,hidden_units)
184 |     :param right_sen: a list of 2D tensor (y_len,hidden_units)
185 |     :return: (1) weighted_y: the weightd sum of y_sen, a 3D tensor with shape (b,x_len,2*h)
186 |              (2) weghted_x:  the weighted sum of x_sen, a 3D tensor with shape (b,y_len,2*h)
187 |     '''
188 |     
189 |     weight_matrix =tf.matmul(x_sen, tf.transpose(y_sen,perm=[0,2,1])) #(b,x_len,h) x (b,h,y_len)->(b,x_len,y_len)
190 | 
191 |     weight_matrix_y =tf.exp(weight_matrix - tf.reduce_max(weight_matrix,axis=2,keep_dims=True))  #(b,x_len,y_len)
192 |     weight_matrix_x =tf.exp(tf.transpose((weight_matrix - tf.reduce_max(weight_matrix,axis=1,keep_dims=True)),perm=[0,2,1]))  #(b,y_len,x_len)
193 | 
194 |     weight_matrix_y=weight_matrix_y*self.y_mask[:,None,:]#(b,x_len,y_len)*(b,1,y_len)
195 |     weight_matrix_x=weight_matrix_x*self.x_mask[:,None,:]#(b,y_len,x_len)*(b,1,x_len)
196 |     
197 |     alpha=weight_matrix_y/(tf.reduce_sum(weight_matrix_y,2,keep_dims=True)+1e-8)#(b,x_len,y_len)
198 |     beta=weight_matrix_x/(tf.reduce_sum(weight_matrix_x,2,keep_dims=True)+1e-8)#(b,y_len,x_len)
199 | 
200 |     #(b,1,y_len,2*h)*(b,x_len,y_len,1)*=>(b,x_len,y_len,2*h) =>(b,x_len,2*h)
201 |     weighted_y =tf.reduce_sum(tf.expand_dims(y_sen,1) *tf.expand_dims(alpha,-1),2)
202 | 
203 |     #(b,1,x_len,2*h)*(b,y_len,x_len,1) =>(b,y_len,x_len,2*h) =>(b,y_len,2*h)
204 |     weighted_x =tf.reduce_sum(tf.expand_dims(x_sen,1) * tf.expand_dims(beta,-1),2)
205 | 
206 |     return weighted_y,weighted_x
207 | 
208 | 
209 |   def two_layer_dense(self,inp,out_dim,scope,regularizer=None):
210 |     with tf.variable_scope(scope):
211 |       dense1 = tf.layers.dense(inputs=inp,
212 |                              units=out_dim, 
213 |                              activation=tf.nn.relu,
214 |                              kernel_regularizer= regularizer,
215 |                              use_bias=True)
216 | 
217 |       dense2 = tf.layers.dense(inputs=dense1,
218 |                              units=out_dim, 
219 |                              activation=tf.nn.relu,
220 |                              kernel_regularizer= regularizer,
221 |                              use_bias=True)
222 |       return dense2
223 | 
224 | 


--------------------------------------------------------------------------------
/BaselineModel/esim.py:
--------------------------------------------------------------------------------
  1 | ###############
  2 | #20180615
  3 | #implementation of ESIM (enhanced sequential inference model) on cnli
  4 | ################
  5 | 
  6 | from __future__ import absolute_import
  7 | from __future__ import division
  8 | from __future__ import print_function
  9 | 
 10 | import sys
 11 | import time
 12 | import inspect
 13 | import logging
 14 | import numpy as np
 15 | import tensorflow as tf
 16 | from tensorflow.contrib.rnn.python.ops import core_rnn_cell
 17 | from tensorflow.contrib.layers import batch_norm,l2_regularizer
 18 | from tensorflow.python.ops import variable_scope
 19 | from tensorflow.contrib.rnn.python.ops import core_rnn_cell
 20 | from ops_cudnn_rnn import cudnn_lstm
 21 | 
 22 | 
 23 | class MyModel(object):
 24 |   """The ESIM model."""
 25 | 
 26 |   def __init__(self, is_training, config):
 27 | 
 28 |     batch_size = config.batch_size
 29 |     self.config = config
 30 |     self.is_training = is_training
 31 |     self.global_step = tf.Variable(0, trainable=False)
 32 |    
 33 |     self.add_placeholder() 
 34 |     self.add_embedding() 
 35 |     self.input_encoding()
 36 |     self.attend()
 37 |     self.compare() 
 38 |     self.aggregate() 
 39 | 
 40 |     self.compute_accuracy()
 41 |     self.compute_loss()   
 42 | 
 43 |     if not is_training:
 44 |         return
 45 |     self.optimization()
 46 | 
 47 |   def add_placeholder(self):
 48 |     '''
 49 |     add_placeholder for inputs
 50 |     '''
 51 |     self.x = tf.placeholder(tf.int32, [self.config.batch_size, self.config.xmaxlen])
 52 |     self.y = tf.placeholder(tf.int32, [self.config.batch_size, self.config.ymaxlen])
 53 | 
 54 |     self.x_mask = tf.placeholder(tf.int32, [self.config.batch_size, self.config.xmaxlen])
 55 |     self.y_mask = tf.placeholder(tf.int32, [self.config.batch_size, self.config.ymaxlen])
 56 |     self.x_mask = tf.cast(self.x_mask,tf.float32)
 57 |     self.y_mask = tf.cast(self.y_mask,tf.float32)
 58 | 
 59 |     self.x_len = tf.placeholder(tf.int32, [self.config.batch_size,])
 60 |     self.y_len = tf.placeholder(tf.int32, [self.config.batch_size,])
 61 |     self.x_len = tf.cast(self.x_len,tf.float32)
 62 |     self.y_len = tf.cast(self.y_len,tf.float32)
 63 | 
 64 |     self.label = tf.placeholder(tf.int32, [self.config.batch_size,self.config.num_classes])
 65 |   
 66 | 
 67 |   def add_embedding(self):
 68 |     '''
 69 |     add pretrained embedding
 70 |     '''
 71 |     with tf.device("/cpu:0"):
 72 |       embedding_matrix=np.load(self.config.cnli_embedding_dir)
 73 |       embedding = tf.Variable(embedding_matrix,trainable=False, name="embedding")
 74 |       
 75 |       self.input_xemb = tf.nn.embedding_lookup(embedding, self.x)
 76 |       self.input_yemb = tf.nn.embedding_lookup(embedding, self.y)
 77 |     
 78 |       if self.is_training and self.config.keep_prob < 1:
 79 |         self.input_xemb = tf.nn.dropout(self.input_xemb, self.config.keep_prob)
 80 |         self.input_yemb = tf.nn.dropout(self.input_yemb, self.config.keep_prob)
 81 | 
 82 | 
 83 | 
 84 |   def input_encoding(self):
 85 |     '''
 86 |     encode the x and y with a two-layer fnn seperately
 87 |     '''
 88 |     with tf.variable_scope("encode_xy") as scope:
 89 |       self.x_output = cudnn_lstm(inputs=self.input_xemb,num_layers=1,hidden_size=self.config.hidden_units,is_training=self.is_training)    
 90 |       self.x_output=self.x_output*self.x_mask[:,:,None]
 91 | 
 92 |       scope.reuse_variables()
 93 |       self.y_output = cudnn_lstm(inputs=self.input_yemb,num_layers=1,hidden_size=self.config.hidden_units,is_training=self.is_training)    
 94 |       self.y_output=self.y_output*self.y_mask[:,:,None]
 95 | 
 96 |       if self.is_training and self.config.keep_prob < 1:
 97 |         self.x_output = tf.nn.dropout(self.x_output,self.config.keep_prob)  # its length must be x_length
 98 |         self.y_output = tf.nn.dropout(self.y_output, self.config.keep_prob)
 99 | 
100 | 
101 |   def attend(self):
102 |       self.weighted_y, self.weighted_x =self.attention(x_sen= self.x_output,
103 |                                                        y_sen= self.y_output,
104 |                                                        x_len= self.config.xmaxlen,
105 |                                                        y_len= self.config.ymaxlen)
106 | 
107 | 
108 |   def compare(self):
109 | 
110 |     with tf.variable_scope("compare"):
111 |       with tf.variable_scope("compare-xy") as scope:
112 |         co_xy = tf.concat([self.x_output,self.weighted_y, self.x_output-self.weighted_y, self.x_output*self.weighted_y],axis=-1) 
113 |         co_xy_dense = tf.layers.dense(inputs=co_xy,units=self.config.hidden_units, activation=tf.nn.relu,
114 |                                       kernel_regularizer=l2_regularizer(self.config.l2_strength),  use_bias=True)
115 | 
116 |         v_co_xy = cudnn_lstm(inputs=co_xy_dense,num_layers=1,hidden_size=self.config.hidden_units,is_training=self.is_training)    
117 |         self.v_co_xy=v_co_xy*self.x_mask[:,:,None]
118 | 
119 | 
120 |         scope.reuse_variables()
121 |         co_yx = tf.concat([self.y_output,self.weighted_x, self.y_output-self.weighted_x, self.y_output*self.weighted_x],axis=-1) 
122 |         co_yx_dense = tf.layers.dense(inputs=co_yx,units=self.config.hidden_units, activation=tf.nn.relu,
123 |                                       kernel_regularizer=l2_regularizer(self.config.l2_strength),  use_bias=True,reuse=tf.AUTO_REUSE)
124 | 
125 |         v_co_yx = cudnn_lstm(inputs=co_yx_dense,num_layers=1,hidden_size=self.config.hidden_units,is_training=self.is_training)    
126 |         self.v_co_yx=v_co_yx*self.y_mask[:,:,None]
127 | 
128 |         if self.is_training and self.config.keep_prob < 1:
129 |           self.v_co_xy = tf.nn.dropout(self.v_co_xy,self.config.keep_prob)  
130 |           self.v_co_yx = tf.nn.dropout(self.v_co_yx,self.config.keep_prob)  
131 | 
132 | 
133 |   def aggregate(self):
134 |     '''
135 |     1. sum pooling   2. fnn
136 |     ''' 
137 |     with tf.variable_scope("pooling"):
138 | 
139 |       v_xyave = tf.div(tf.reduce_sum(self.v_co_xy, 1), tf.expand_dims(self.x_len, -1)) #div true length
140 |       v_yxave = tf.div(tf.reduce_sum(self.v_co_yx, 1), tf.expand_dims(self.y_len,  -1)) #div true length
141 |       v_xymax = tf.reduce_max(self.v_co_xy,axis=1)  #(b,2h)    
142 |       v_yxmax = tf.reduce_max(self.v_co_yx,axis=1)  #(b,2h)
143 | 
144 |       self.v = tf.concat([v_xyave, v_xymax, v_yxave, v_yxmax],axis=-1) 
145 | 
146 |     with tf.variable_scope("pred-layer"):
147 |   
148 |       dense1 = tf.layers.dense(inputs=self.v,
149 |                              units=self.config.hidden_units, 
150 |                              activation=tf.nn.tanh,
151 |                              use_bias=True,
152 |                              kernel_regularizer= l2_regularizer(self.config.l2_strength),
153 |                              name="dense-pred-W")
154 | 
155 |       if self.is_training and self.config.keep_prob < 1:
156 |         dense1 = tf.nn.dropout(dense1, self.config.keep_prob)
157 | 
158 |       W_pred = tf.get_variable("W_pred", shape=[self.config.hidden_units, self.config.num_classes],regularizer=l2_regularizer(self.config.l2_strength))
159 | 
160 |       self.pred = tf.nn.softmax(tf.matmul(dense1, W_pred), name="pred")
161 | 
162 |   def compute_accuracy(self):
163 |     correct = tf.equal(tf.argmax(self.pred,1),tf.argmax(self.label,1))
164 |     self.acc = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")
165 | 
166 |   def compute_loss(self):
167 |     
168 |     self.loss_term = -tf.reduce_sum(tf.cast(self.label,tf.float32) * tf.log(self.pred),name="loss_term")
169 |     self.reg_term = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES),name="reg_term")
170 |     self.loss = tf.add(self.loss_term,self.reg_term,name="loss")
171 | 
172 | 
173 |   def optimization(self):
174 |   
175 |     with tf.variable_scope("bp_layer"):
176 |       tvars = tf.trainable_variables()
177 |       grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
178 |                                       self.config.max_grad_norm)
179 |       optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
180 |       self.optim = optimizer.apply_gradients(
181 |           zip(grads, tvars),
182 |           global_step=self.global_step)
183 | 
184 | 
185 | 
186 |   def attention(self,x_sen,y_sen,x_len,y_len):
187 |     '''
188 |     function: use the dot-production of left_sen and right_sen to compute the attention weight matrix
189 |     :param left_sen: a list of 2D tensor (x_len,hidden_units)
190 |     :param right_sen: a list of 2D tensor (y_len,hidden_units)
191 |     :return: (1) weighted_y: the weightd sum of y_sen, a 3D tensor with shape (b,x_len,2*h)
192 |              (2) weghted_x:  the weighted sum of x_sen, a 3D tensor with shape (b,y_len,2*h)
193 |     '''
194 |     
195 |     weight_matrix =tf.matmul(x_sen, tf.transpose(y_sen,perm=[0,2,1])) #(b,x_len,h) x (b,h,y_len)->(b,x_len,y_len)
196 | 
197 |     weight_matrix_y =tf.exp(weight_matrix - tf.reduce_max(weight_matrix,axis=2,keep_dims=True))  #(b,x_len,y_len)
198 |     weight_matrix_x =tf.exp(tf.transpose((weight_matrix - tf.reduce_max(weight_matrix,axis=1,keep_dims=True)),perm=[0,2,1]))  #(b,y_len,x_len)
199 | 
200 |     weight_matrix_y=weight_matrix_y*self.y_mask[:,None,:]#(b,x_len,y_len)*(b,1,y_len)
201 |     weight_matrix_x=weight_matrix_x*self.x_mask[:,None,:]#(b,y_len,x_len)*(b,1,x_len)
202 |     
203 |     alpha=weight_matrix_y/(tf.reduce_sum(weight_matrix_y,2,keep_dims=True)+1e-8)#(b,x_len,y_len)
204 |     beta=weight_matrix_x/(tf.reduce_sum(weight_matrix_x,2,keep_dims=True)+1e-8)#(b,y_len,x_len)
205 | 
206 |     #(b,1,y_len,2*h)*(b,x_len,y_len,1)*=>(b,x_len,y_len,2*h) =>(b,x_len,2*h)
207 |     weighted_y =tf.reduce_sum(tf.expand_dims(y_sen,1) *tf.expand_dims(alpha,-1),2)
208 | 
209 |     #(b,1,x_len,2*h)*(b,y_len,x_len,1) =>(b,y_len,x_len,2*h) =>(b,y_len,2*h)
210 |     weighted_x =tf.reduce_sum(tf.expand_dims(x_sen,1) * tf.expand_dims(beta,-1),2)
211 | 
212 |     return weighted_y,weighted_x
213 | 
214 | 
215 |   def two_layer_dense(self,inp,out_dim,scope,regularizer=None):
216 |     with tf.variable_scope(scope):
217 |       dense1 = tf.layers.dense(inputs=inp,
218 |                              units=out_dim, 
219 |                              activation=tf.nn.relu,
220 |                              kernel_regularizer= regularizer,
221 |                              use_bias=True)
222 | 
223 |       dense2 = tf.layers.dense(inputs=dense1,
224 |                              units=out_dim, 
225 |                              activation=tf.nn.relu,
226 |                              kernel_regularizer= regularizer,
227 |                              use_bias=True)
228 |       return dense2
229 | 
230 | 


--------------------------------------------------------------------------------
/BaselineModel/myutils.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import re
  3 | import numpy as np
  4 | import argparse
  5 | import random
  6 | import string
  7 | 
  8 | 
  9 | 
 10 | def tokenize(sent):
 11 |     '''
 12 |     data_reader.tokenize('a#b')
 13 |     ['a', '#', 'b']
 14 |     '''
 15 |     #return [x.strip().lower() for x in re.split('(\W+)?', sent) if x.strip()]
 16 |     return [x.strip().lower() for x in re.split('(\W+)', sent) if x.strip()]
 17 | 
 18 | 
 19 | def map_to_idx(x, vocab):
 20 |     '''
 21 |     x is a sequence of tokens
 22 |     '''
 23 |     # 1 is for UNK,0 is for PAD
 24 |     return [ vocab[w] if w in vocab else 1 for w in x  ]
 25 | 
 26 | 
 27 | def to_categorical(y, num_classes=None):
 28 |     """from keras.utils.np_utils import to_categorical
 29 | 
 30 |     Converts a class vector (integers) to binary class matrix.
 31 |     E.g. for use with categorical_crossentropy.
 32 |     # Arguments
 33 |         y: class vector to be converted into a matrix
 34 |             (integers from 0 to num_classes).
 35 |         num_classes: total number of classes.
 36 |     # Returns
 37 |         A binary matrix representation of the input.
 38 |     """
 39 |     y = np.array(y, dtype='int').ravel()
 40 |     if not num_classes:
 41 |         num_classes = np.max(y) + 1
 42 |     n = y.shape[0]
 43 |     categorical = np.zeros((n, num_classes))
 44 |     categorical[np.arange(n), y] = 1
 45 | 
 46 |     return categorical
 47 | 
 48 | def pad_sequences(sequences, maxlen=None, dtype='int32',
 49 |                   padding='pre', truncating='pre', value=0.):
 50 | 
 51 |     """from keras.preprocessing.sequence.pad_sequences
 52 |     Pads each sequence to the same length (length of the longest sequence).
 53 | 
 54 |     If maxlen is provided, any sequence longer
 55 |     than maxlen is truncated to maxlen.
 56 |     Truncation happens off either the beginning (default) or
 57 |     the end of the sequence.
 58 | 
 59 |     Supports post-padding and pre-padding (default).
 60 | 
 61 |     # Arguments
 62 |         sequences: list of lists where each element is a sequence
 63 |         maxlen: int, maximum length
 64 |         dtype: type to cast the resulting sequence.
 65 |         padding: 'pre' or 'post', pad either before or after each sequence.
 66 |         truncating: 'pre' or 'post', remove values from sequences larger than
 67 |             maxlen either in the beginning or in the end of the sequence
 68 |         value: float, value to pad the sequences to the desired value.
 69 | 
 70 |     # Returns
 71 |         x: numpy array with dimensions (number_of_sequences, maxlen)
 72 | 
 73 |     # Raises
 74 |         ValueError: in case of invalid values for `truncating` or `padding`,
 75 |             or in case of invalid shape for a `sequences` entry.
 76 |     """
 77 |     if not hasattr(sequences, '__len__'):
 78 |         raise ValueError('`sequences` must be iterable.')
 79 |     lengths = []
 80 |     for x in sequences:
 81 |         if not hasattr(x, '__len__'):
 82 |             raise ValueError('`sequences` must be a list of iterables. '
 83 |                              'Found non-iterable: ' + str(x))
 84 |         lengths.append(len(x))
 85 | 
 86 |     num_samples = len(sequences)
 87 |     if maxlen is None:
 88 |         maxlen = np.max(lengths)
 89 | 
 90 |     # take the sample shape from the first non empty sequence
 91 |     # checking for consistency in the main loop below.
 92 |     sample_shape = tuple()
 93 |     for s in sequences:
 94 |         if len(s) > 0:
 95 |             sample_shape = np.asarray(s).shape[1:]
 96 |             break
 97 | 
 98 |     x = (np.ones((num_samples, maxlen) + sample_shape) * value).astype(dtype)
 99 |     for idx, s in enumerate(sequences):
100 |         if not len(s):
101 |             continue  # empty list/array was found
102 |         if truncating == 'pre':
103 |             trunc = s[-maxlen:]
104 |         elif truncating == 'post':
105 |             trunc = s[:maxlen]
106 |         else:
107 |             raise ValueError('Truncating type "%s" not understood' % truncating)
108 | 
109 |         # check `trunc` has expected shape
110 |         trunc = np.asarray(trunc, dtype=dtype)
111 |         if trunc.shape[1:] != sample_shape:
112 |             raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
113 |                              (trunc.shape[1:], idx, sample_shape))
114 | 
115 |         if padding == 'post':
116 |             x[idx, :len(trunc)] = trunc
117 |         elif padding == 'pre':
118 |             x[idx, -len(trunc):] = trunc
119 |         else:
120 |             raise ValueError('Padding type "%s" not understood' % padding)
121 |     return x
122 | 
123 | 
124 | 
125 | if __name__=="__main__":
126 |     pass
127 | 


--------------------------------------------------------------------------------
/BaselineModel/ops_cudnn_rnn.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | forked from https://github.com/baidu-research/GloballyNormalizedReader/blob/master/ops.py
  3 | '''
  4 | 
  5 | import tensorflow as tf
  6 | import tensorflow.contrib.cudnn_rnn as cudnn_rnn
  7 | from itertools import zip_longest
  8 | import queue
  9 | import threading
 10 | import numpy as np
 11 | 
 12 | #######cudnn_lstm##########
def cudnn_lstm(inputs, num_layers, hidden_size, is_training, direction='bidirectional',regularizer=None,scope=None):
    """Run a CuDNN LSTM stack over a batch-major input and return its outputs.

    Arguments:
        - inputs:   A tensor of shape [batch, length, input_size] of inputs.
        - num_layers:   Number of RNN layers.
        - hidden_size:  Number of units in each layer.
        - is_training:  Accepted for API symmetry but NOT forwarded: the
                        CuDNN cell below is always called with
                        is_training=True.
        - direction: indicate 'bidirectional' or 'unidirectional'
        - regularizer: optional regularizer attached to the flat "RNNParams"
                       variable.
        - scope: not used inside this function.
    Returns:
        Only the per-step outputs, converted back to batch major
        ([batch, length, num_dirs * hidden_size] per the CuDNN notes kept
        further down). The final h/c states are computed but not returned.
    Raises:
        ValueError: if the last dimension of `inputs` is not statically
        known, or (via direction_to_num_directions) if `direction` is not
        recognised.
    """
    input_size = inputs.get_shape()[-1].value
    if input_size is None:
        raise ValueError("Number of input dimensions to CuDNN RNNs must be "
                         "known, but was None.")

    # CUDNN expects the inputs to be time major
    inputs = tf.transpose(inputs, [1, 0, 2])

    cudnn_cell = tf.contrib.cudnn_rnn.CudnnLSTM(
        num_layers, hidden_size, input_size,
        input_mode="linear_input", direction=direction)

    # All weights and biases live in one flat variable whose length is
    # computed by the helper defined later in this module.
    est_size = estimate_cudnn_lstm_parameter_size(
        num_layers=num_layers,
        hidden_size=hidden_size,
        input_size=input_size,
        input_mode="linear_input",
        direction=direction)

    cudnn_params = tf.get_variable(
        "RNNParams",
        shape=[est_size],
        initializer=tf.contrib.layers.variance_scaling_initializer(),
        regularizer=regularizer)

    num_dir = direction_to_num_directions(direction)
    # initial_state: a tuple of tensor(s) of shape`[num_layers * num_dirs, batch_size, num_units]
    # Zero state, tiled along axis 1 to the dynamic batch size (axis 1 of the
    # time-major inputs). The same tensor is used for both h and c.
    init_state = tf.tile(
        tf.zeros([num_dir * num_layers, 1, hidden_size], dtype=tf.float32),
        [1, tf.shape(inputs)[1], 1])  # [num_dir * num_layers, batch_size, hidden_size]
    '''
    Args:
      inputs: `3-D` tensor with shape `[time_len, batch_size, input_size]`.
      initial_state: a tuple of tensor(s) of shape
        `[num_layers * num_dirs, batch_size, num_units]`. If not provided, use
        zero initial states. The tuple size is 2 for LSTM and 1 for other RNNs.
      training: whether this operation will be used in training or inference.
    Returns:
      output: a tensor of shape `[time_len, batch_size, num_dirs * num_units]`.
        It is a `concat([fwd_output, bak_output], axis=2)`.
      output_states: a tuple of tensor(s) of the same shape and structure as
        `initial_state`.
    '''
    # NOTE(review): is_training is hard-coded to True here, so the
    # `is_training` argument of this function has no effect - confirm that
    # this is intended for inference graphs.
    hiddens, output_h, output_c = cudnn_cell(
        inputs,
        input_h=init_state,
        input_c=init_state,
        params=cudnn_params,
        is_training=True)

    # Convert to batch major
    hiddens = tf.transpose(hiddens, [1, 0, 2])
    output_h = tf.transpose(output_h, [1, 0, 2])
    output_c = tf.transpose(output_c, [1, 0, 2])

    # Only the per-step outputs are handed back; output_h / output_c are
    # transposed above but never leave this function.
    return hiddens
 80 | 
 81 | #######cudnn_gru##########
 82 | 
def cudnn_gru(inputs, num_layers, hidden_size, is_training, direction='bidirectional',scope=None):
    """Run a CuDNN GRU stack over a batch-major input and return its outputs.

    Arguments:
        - inputs:   A tensor of shape [batch, length, input_size] of inputs.
        - num_layers:   Number of RNN layers.
        - hidden_size:  Number of units in each layer.
        - is_training:  Accepted for API symmetry but NOT forwarded: the
                        CuDNN cell below is always called with
                        is_training=True.
        - direction: indicate 'bidirectional' or 'unidirectional'
        - scope: not used inside this function.
    Returns:
        Only the per-step outputs, converted back to batch major
        ([batch, length, num_dirs * hidden_size] per the CuDNN notes kept
        further down). The final hidden state is computed but not returned.
    Raises:
        ValueError: if the last dimension of `inputs` is not statically
        known, or (via direction_to_num_directions) if `direction` is not
        recognised.
    ref: https://github.com/tensorflow/tensorflow/issues/13860
    """
    input_size = inputs.get_shape()[-1].value
    if input_size is None:
        raise ValueError("Number of input dimensions to CuDNN RNNs must be "
                         "known, but was None.")

    # CUDNN expects the inputs to be time major
    inputs = tf.transpose(inputs, [1, 0, 2])
   
    cudnn_cell = tf.contrib.cudnn_rnn.CudnnGRU(
        num_layers, hidden_size, input_size,
        input_mode="linear_input", direction=direction)

    # All weights and biases live in one flat variable whose length is
    # computed by the helper defined later in this module.
    est_size = estimate_cudnn_gru_parameter_size(
        num_layers=num_layers,
        hidden_size=hidden_size,
        input_size=input_size,
        input_mode="linear_input",
        direction=direction)

    # Unlike cudnn_lstm, no regularizer is attached to this variable.
    cudnn_params = tf.get_variable(
        "RNNParams",
        shape=[est_size],
        initializer=tf.contrib.layers.variance_scaling_initializer())

    num_dir = direction_to_num_directions(direction)
    # initial_state: a tuple of tensor(s) of shape`[num_layers * num_dirs, batch_size, num_units]
    # Zero state, tiled along axis 1 to the dynamic batch size (axis 1 of the
    # time-major inputs).
    init_state = tf.tile(
        tf.zeros([num_dir * num_layers, 1, hidden_size], dtype=tf.float32),
        [1, tf.shape(inputs)[1], 1])  # [num_dir * num_layers, batch_size, hidden_size]
    '''
    Args:
      inputs: `3-D` tensor with shape `[time_len, batch_size, input_size]`.
      initial_state: a tuple of tensor(s) of shape
        `[num_layers * num_dirs, batch_size, num_units]`. If not provided, use
        zero initial states. The tuple size is 2 for LSTM and 1 for other RNNs.
      training: whether this operation will be used in training or inference.
    Returns:
      output: a tensor of shape `[time_len, batch_size, num_dirs * num_units]`.
        It is a `concat([fwd_output, bak_output], axis=2)`.
      output_states: a tuple of tensor(s) of the same shape and structure as
        `initial_state`.
    '''
    # A GRU has no cell state, so the cell takes only input_h and returns
    # two values instead of the three returned by the LSTM cell.
    # NOTE(review): is_training is hard-coded to True here, so the
    # `is_training` argument of this function has no effect - confirm that
    # this is intended for inference graphs.
    hiddens, output_h = cudnn_cell(
        inputs,
        input_h=init_state,
        params=cudnn_params,
        is_training=True)

    # Convert to batch major
    hiddens = tf.transpose(hiddens, [1, 0, 2])
    output_h = tf.transpose(output_h, [1, 0, 2])

    # Only the per-step outputs are handed back; output_h is transposed
    # above but never leaves this function.
    return hiddens
150 | 
def estimate_cudnn_lstm_parameter_size(num_layers,
                                  input_size,
                                  hidden_size,
                                  input_mode,
                                  direction):
    """
    Return the total parameter count of a stack of CuDNN LSTM layers.

    Every layer holds one LSTM cell per direction. The first layer reads
    `input_size` features; each following layer reads the concatenated
    outputs of all directions of the layer below
    (hidden_size * num_directions). `input_mode` is accepted but does not
    influence the result.
    """
    num_directions = direction_to_num_directions(direction)
    total = 0
    layer_input = input_size
    for _ in range(num_layers):
        # all directions of a layer have identically sized cells
        total += num_directions * cudnn_lstm_parameter_size(layer_input,
                                                            hidden_size)
        layer_input = hidden_size * num_directions
    return total
172 | 
def cudnn_lstm_parameter_size(input_size, hidden_size):
    """Number of parameters in a single CuDNN LSTM cell.

    Four gates, each with an input weight matrix, a recurrent weight matrix
    and two bias vectors:
    4 * hidden * input + 4 * hidden * hidden + 8 * hidden.
    """
    per_gate = hidden_size * (input_size + hidden_size + 2)
    return 4 * per_gate
178 | 
179 | 
def estimate_cudnn_gru_parameter_size(num_layers,
                                  input_size,
                                  hidden_size,
                                  input_mode,
                                  direction):
    """
    Return the total parameter count of a stack of CuDNN GRU layers.

    Every layer holds one GRU cell per direction. The first layer reads
    `input_size` features; each following layer reads the concatenated
    outputs of all directions of the layer below
    (hidden_size * num_directions). `input_mode` is accepted but does not
    influence the result.
    """
    num_directions = direction_to_num_directions(direction)
    total = 0
    layer_input = input_size
    for _ in range(num_layers):
        # all directions of a layer have identically sized cells
        total += num_directions * cudnn_gru_parameter_size(layer_input,
                                                           hidden_size)
        layer_input = hidden_size * num_directions
    return total
201 |  
202 | 
def cudnn_gru_parameter_size(input_size, hidden_size):
    """Number of parameters in a single CuDNN GRU cell.

    Three gates, each with an input weight matrix, a recurrent weight matrix
    and two bias vectors:
    3 * hidden * input + 3 * hidden * hidden + 6 * hidden.
    """
    per_gate = hidden_size * (input_size + hidden_size + 2)
    return 3 * per_gate
208 | 
def direction_to_num_directions(direction):
    """Map a direction name to its number of directions.

    "unidirectional" -> 1, "bidirectional" -> 2; any other value raises
    ValueError.
    """
    known_directions = (("unidirectional", 1), ("bidirectional", 2))
    for label, count in known_directions:
        if direction == label:
            return count
    raise ValueError("Unknown direction: %r." % (direction,))
216 | 
def parameter_count():
    """Return the total number of parameters in all Tensorflow-defined
    variables, using `tf.trainable_variables()` to get the list of
    variables.

    The size of each variable is the product of its static shape.
    """
    # np.prod is the canonical name; the np.product alias is deprecated and
    # no longer exists in NumPy 2.0.
    return sum(np.prod(var.get_shape().as_list())
               for var in tf.trainable_variables())
223 | 


--------------------------------------------------------------------------------
/BaselineModel/run.sh:
--------------------------------------------------------------------------------
1 | python3 train.py  --model_type  decomposable_att 2>f2 1>f1_decompsable_0808
2 | python3 train.py  --model_type  esim 2>f2 1>f1_esim_0808
3 | 


--------------------------------------------------------------------------------
/BaselineModel/train.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import division
  3 | from __future__ import print_function
  4 | 
  5 | import sys
  6 | import time
  7 | import inspect
  8 | import logging
  9 | import copy
 10 | import importlib
 11 | import numpy as np
 12 | import tensorflow as tf
 13 | from tensorflow.contrib.rnn.python.ops import core_rnn_cell
 14 | from tensorflow.contrib.layers import batch_norm,l2_regularizer
 15 | from tensorflow.python.ops import variable_scope
 16 | 
 17 | from myutils import *
 18 | import data_reader as reader
 19 | #from decomposable_att import MyModel 
 20 | #from esim import MyModel 
 21 | from config import SmallConfig
 22 | 
 23 | flags = tf.flags
 24 | logging = tf.logging
 25 | 
 26 | flags.DEFINE_string(
 27 |     "model", "small",
 28 |     "A type of model. Possible options are: small, medium, large.")
 29 | flags.DEFINE_string('model_type', "esim", 'esim or decomposable-att')
 30 | flags.DEFINE_string("data_path", "",
 31 |                     "Where the training/test data is stored.")
 32 | flags.DEFINE_string("save_path","model_saved",
 33 |                     "Model output directory.")
 34 | flags.DEFINE_bool("use_fp16", False,
 35 |                   "Train using 16-bit floats instead of 32bit floats")
 36 | flags.DEFINE_float('learning_rate', 0.0004, 'Initial learning rate.')  
 37 | flags.DEFINE_float('keep_prob', 0.8, 'keep_prob for dropout.')  
 38 | flags.DEFINE_float('l2_strength', 0.0002, 'l2 rate for l2 loss.') 
 39 | flags.DEFINE_integer('batch_size', 32,'batch_size ') 
 40 | 
 41 | FLAGS = flags.FLAGS
 42 | 
 43 | def data_type():
 44 |   return tf.float16 if FLAGS.use_fp16 else tf.float32
 45 | 
 46 | def fill_placeholder(data, model,config):
 47 |   batch_x,batch_y,batch_label,batch_x_mask,batch_y_mask, batch_x_len,batch_y_len= data.next_batch(config.batch_size)
 48 |   feed_dict = {model.x:batch_x , 
 49 |                 model.y:batch_y,
 50 |                 model.label:batch_label,
 51 |                 model.x_mask:batch_x_mask,
 52 |                 model.y_mask:batch_y_mask, 
 53 |                 model.x_len :batch_x_len,
 54 |                 model.y_len :batch_y_len,
 55 |                 }
 56 | 
 57 |   return feed_dict
 58 | 
 59 | def run_epoch(session, data,model,config, eval_op=None, verbose=False):
 60 |   """Runs the model on the given data."""
 61 |   start_time = time.time()
 62 |   losses = 0.0
 63 |   iters = 0
 64 |   acc_total=0.0
 65 |   fetches = {
 66 |       "acc":model.acc,
 67 |       "loss": model.loss,
 68 |       "global_step":model.global_step,
 69 |       "pred": model.pred,
 70 |       "label": model.label,
 71 |   }
 72 |   if eval_op is not None:
 73 |     fetches["eval_op"] = eval_op
 74 |   
 75 |   start_time = time.time()
 76 |   epoch_size = data.get_epoch_size(config.batch_size)
 77 |   for step in range(epoch_size):
 78 |     feed_dict = fill_placeholder(data,model,config)
 79 |     
 80 |     vals = session.run(fetches, feed_dict)
 81 |     acc = vals["acc"]
 82 |     loss = vals["loss"]
 83 |     global_step=vals["global_step"]
 84 | 
 85 |     
 86 |     pred = vals["pred"]
 87 |     label = vals["label"]
 88 | 
 89 |     losses += loss
 90 |     iters= iters+1
 91 |     acc_total += acc
 92 |     #if verbose and step %10 == 0:
 93 |     #  print('global_step: %s train_acc: %s  batch_train_loss: %s' % (global_step,acc,loss))
 94 |     acc_average=acc_total/iters
 95 |     loss_average = losses/iters
 96 |   return acc_average,loss_average,global_step,pred,label
 97 | 
 98 | 
 99 | def get_config():
100 |   if FLAGS.model == "small":
101 |     return SmallConfig()
102 |   else:
103 |     raise ValueError("Invalid model: %s", FLAGS.model)
104 | 
105 | 
def main(_):
  """Train the model selected by --model_type.

  Each epoch runs one pass over the training set followed by one pass over
  the dev set. A checkpoint is written whenever the dev accuracy is at least
  as good as the best one so far. Training stops after config.MAXITER epochs
  or once more than config.early_stopping epochs passed without a new best.
  """
  config = get_config()
  # Command-line flags take precedence over the values of the config class.
  config.learning_rate = FLAGS.learning_rate
  config.keep_prob = FLAGS.keep_prob
  config.l2_strength = FLAGS.l2_strength
  config.batch_size = FLAGS.batch_size

  # Evaluation uses the same settings, one example per batch.
  eval_config= copy.deepcopy(config)
  eval_config.batch_size=1
  print("config",vars(config))
  print("eval_config",vars(eval_config))

  Train,Dev,vocab = reader.file2seqid(config)

  # The model class is looked up in the module named after --model_type.
  module = importlib.import_module(FLAGS.model_type)
  MyModel = getattr(module, 'MyModel')

  # The saver does not create missing directories; make sure the checkpoint
  # directory exists before a full epoch of training is spent.
  if FLAGS.save_path:
    tf.gfile.MakeDirs(FLAGS.save_path)

  with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale,config.init_scale)

    with tf.name_scope("Train"):
      with tf.variable_scope("Model", reuse=None, initializer=initializer):
        m = MyModel(is_training=True, config=config)
    
    # Same variable scope with reuse=True: the dev model shares the weights.
    with tf.name_scope("Valid"):
      with tf.variable_scope("Model", reuse=True, initializer=initializer):
        mvalid = MyModel(is_training=False,config=eval_config)

    
    sv = tf.train.Supervisor()
    with sv.managed_session() as session:
      print ("model params",np.sum([np.prod([xi.value for xi in x.get_shape()]) for x in tf.trainable_variables()]))
      t0=time.time()
      best_dev_acc = 0.0
      best_val_epoch = 0 


      for i in range(config.MAXITER):
        start_time=time.time()
        train_acc,train_loss,train_global_step,train_pred,train_label= run_epoch(session,data=Train, model=m,config=config, eval_op=m.optim, verbose=True)
        print("Epoch: %d train_acc: %.4f train_loss %.4f train_global_step:%s" % (i ,train_acc,train_loss,train_global_step))

        dev_acc,dev_loss,_,dev_pred,dev_label= run_epoch(session,data=Dev,model=mvalid,config=eval_config)
        print("Epoch: %d dev_acc: %.4f dev_loss %.4f" % (i , dev_acc,dev_loss))


        sys.stdout.flush()
        # "<=" on purpose: a tie also counts as a new best and is saved.
        if best_dev_acc <= dev_acc:
          best_dev_acc = dev_acc
          best_val_epoch = i
          if FLAGS.save_path:
            print("train_global_step:%s.  Saving %d model to %s." % (train_global_step,i,FLAGS.save_path))
            sv.saver.save(session,FLAGS.save_path+"/model", global_step=train_global_step)
            print (time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()))

        
        end_time=time.time()
        # Both durations are printed in whole minutes.
        print("################# all_training time: %s one_epoch time: %s ############### " % ((end_time-t0)//60, (end_time-start_time)//60))
        if i - best_val_epoch > config.early_stopping:
          print ("best_val_epoch:%d  best_val_accuracy:%.4f"%(best_val_epoch,best_dev_acc))
          logging.info("Normal Early stop")
          print (time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()))
          break        
        elif i == config.MAXITER-1:
          print ("best_val_epoch:%d  best_val_accuracy:%.4f"%(best_val_epoch,best_dev_acc))
          logging.info("Finished Training")
173 |       
174 | if __name__ == "__main__":
175 |   tf.app.run()
176 | 


--------------------------------------------------------------------------------
/CCL2018中文文本蕴含评测总结.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blcunlp/CNLI/604638ecf50201c15da6420ceb14aa7a43bd1463/CCL2018中文文本蕴含评测总结.pdf


--------------------------------------------------------------------------------
/CCL2018中文文本蕴含识别系统报告集合.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blcunlp/CNLI/604638ecf50201c15da6420ceb14aa7a43bd1463/CCL2018中文文本蕴含识别系统报告集合.pdf


--------------------------------------------------------------------------------
/CNLI2018 Evaluation Result.md:
--------------------------------------------------------------------------------
 1 | # CNLI2018 Evaluation Result 
 2 | #### CNLI2018 已在近日结束。
 3 | #### 我们在Codalab上收到了12份提交答案，排名如下：
 4 | 
 5 | 排名 | 团队名 | 提交者 | 模型名称 | 准确度 | Github链接
 6 | ---|---|---|---|---|---|
 7 | 1 | water | water123 | cnn+lstm |0.8238 |
 8 | 2 | zzunlp2018 | nlpc | decomposable_att_t | 0.7828 |
 9 | 3 | 百度智珠团队 | ShawnNg | Excalibur | 0.7692 |
10 | 4 | GDUFSER | Kunxun_Qi | - | 0.7618 |
11 | 5 | ray_li | ray_li | - | 0.7425 |
12 | 6 | INTSIG_AI | eedanny | - | 0.7303 |
13 | 7 | Yonseiiii | Parkhaeju | decom-att | 0.7242 |
14 | 8 | **Baseline** | **BLCU-nlp** | **ESIM** | **0.7222** |
15 | 9 | 狂奔 | friend2 | lstm+cnn | 0.6952 |
16 | 10 | _503 | _503 | bi | 0.6848 |
17 | 11 | 遵义医学院医学信息工程学院 | lyb3b | BiLSTM | 0.6203 |
18 | 12 | Hiter | oliver_arrow | DAM | 0.6090 |
19 | 
20 | 
21 | #### 在征得参赛团队及个人同意后，我们会放上模型代码的Github链接，供大家研究参考。
 22 | #### 评测现处于Post Competition 阶段，且不会关闭。可以继续提交结果，刷新SOTA。
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/Codalab Example/answer.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blcunlp/CNLI/604638ecf50201c15da6420ceb14aa7a43bd1463/Codalab Example/answer.zip


--------------------------------------------------------------------------------
/Codalab Example/readme:
--------------------------------------------------------------------------------
1 | This is an example of a result submission for our CNLI competition on Codalab: https://competitions.codalab.org/competitions/19911.
2 | 
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Baseline for Chinese Natural Language Inference (CNLI)  dataset
 2 | 
 3 | ## Description
 4 | This repository provides the official training and development datasets for [the Chinese Natural Language Inference (CNLI) shared task](http://www.cips-cl.org/static/CCL2018/call-evaluation.html).
 5 | We evaluate the cnli\_1.0 corpus on two baseline models. 
 6 | 
 7 | 
 8 | ## Data
 9 | 
10 | The CNLI dataset can be downloaded from [here](https://github.com/blcunlp/CNLI/tree/master/CNLI_Data).
11 | 
12 | Both the train and dev sets are in **tab-separated** format.
13 | Each line in the train (or dev) file corresponds to an instance, and it is arranged as：  
14 | >sentence-id premise   hypothesis  label
15 | 
16 | 
17 | 
18 | ## Model
19 | 
20 | This repository includes the baseline model for Chinese Natural Language Inference (CNLI) dataset. 
21 | We provide two baseline models. 
22 | (1) The [Decomposable Attention Model](https://arxiv.org/pdf/1606.01933.pdf), which uses FNNs and an inter-attention mechanism. More details about the model can be found in the [original paper](https://arxiv.org/pdf/1606.01933.pdf). 
23 | (2) The [ESIM Model](https://arxiv.org/pdf/1609.06038.pdf), which is a strong baseline model for the SNLI dataset. 
24 | 
25 | ## Requirements
26 | * python 3.5
27 | * tensorflow      '1.4.0'
28 | * jieba 0.39
29 | 
30 | ## Training
31 | 
32 | 
33 | **Data Preprocessing**  
34 | We use jieba to tokenize the sentences. During training, we use the pre-trained SGNS embedding introduced in [Analogical Reasoning on Chinese Morphological and Semantic Relations](https://arxiv.org/abs/1805.06504). You can download the sgns.merge.word from [here](https://pan.baidu.com/s/1kwxiPouou6ecxyJdYmnkvw).
35 | 
36 | **Main Scripts**  
37 | config.py：the parameter configuration.  
38 | decomposable_att.py: implementation of the Decomposable Attention Model.   
39 | data_reader.py: preparing data for the model.    
40 | train.py: training script for both baseline models (selected with --model_type). 
41 | 
42 | **Running Model**  
43 | You can train the decomposable attention model and the esim model by the following command lines: 
44 | > python3 train.py  --model_type  decomposable_att 
45 | > python3 train.py  --model_type  esim
46 | 
47 | 
48 | 
49 | ## Results 
50 | We provide the whole training data, which comprises 90,000 items in the training set and 10,000 items in the dev set. 
51 | We adopt early stopping on dev set. The best results are shown in the following table: 
52 | 
53 | |Model |train-acc(%)|dev-acc(%)
54 | |:-:|:-:|:-:
55 | | Decomposable-Att|76.91 |69.35
56 | |ESIM |  76.82| 73.57
57 | 
58 | 
59 | 
60 | ## Reporting issues
61 | Please let us know if you encounter any problems.
62 | 


--------------------------------------------------------------------------------