├── README.md ├── amodel.py ├── myutils.py ├── reader.py ├── tf_model.py └── train10.txt /README.md: -------------------------------------------------------------------------------- 1 | Implementations of an attention model for entailment from [this paper](http://arxiv.org/abs/1509.06664) in keras and tensorflow. 2 | 3 | Compatible with keras v1.0.6 and tensorflow 0.11.0rc2. 4 | 5 | I implemented the model to learn the APIs of keras and tensorflow, so I have not really tuned it for performance. The keras implementation is slightly different, as keras does not expose a method to set an LSTM's state. 6 | 7 | To train: 8 | 9 | * Download the [snli dataset](http://nlp.stanford.edu/projects/snli/). 10 | * Create train, dev and test files with tab-separated premise, hypothesis and label (see the example file train10.txt). You can find snippets for this in `reader.py`; a data-preparation sketch is also included at the end of this listing. 11 | * Train by running either 12 | 13 | ``` 14 | python amodel.py -train train.txt -dev dev.txt -test test.txt 15 | ``` 16 | to use the keras implementation, or 17 | ``` 18 | python tf_model.py -train train.txt -dev dev.txt -test test.txt 19 | ``` 20 | to use the tensorflow implementation. Look at the `get_params()` method in both scripts to see how to specify other parameters. 21 | 22 | 23 | A log is written to a *.log file by a callback that reports train/dev/test accuracy after each epoch. 24 | 25 | For comments, improvements, bug reports and suggestions for tuning, email shyamupa@gmail.com -------------------------------------------------------------------------------- /amodel.py: -------------------------------------------------------------------------------- 1 | # from __future__ import print_function 2 | import numpy as np 3 | 4 | np.random.seed(1337) # for reproducibility 5 | import os 6 | from keras.regularizers import l2 7 | from keras.callbacks import * 8 | # from visualizer import * 9 | from keras.models import * 10 | from keras.optimizers import * 11 | from keras.utils.np_utils import to_categorical, accuracy 12 | from keras.layers.core import * 13 | from keras.layers import Input, Embedding, LSTM, Dense, merge, TimeDistributed 14 | # from keras.utils.visualize_util import plot # THIS IS BAD 15 | # from data_reader import * 16 | from reader import * 17 | from myutils import * 18 | import logging 19 | from datetime import datetime 20 | 21 | 22 | # from myconfig import DATAPATH,MYPATH 23 | 24 | def get_params(): 25 | parser = argparse.ArgumentParser(description='Short sample app') 26 | parser.add_argument('-lstm', action="store", default=150, dest="lstm_units", type=int) 27 | parser.add_argument('-epochs', action="store", default=20, dest="epochs", type=int) 28 | parser.add_argument('-batch', action="store", default=32, dest="batch_size", type=int) 29 | parser.add_argument('-emb', action="store", default=100, dest="emb", type=int) 30 | parser.add_argument('-xmaxlen', action="store", default=20, dest="xmaxlen", type=int) 31 | parser.add_argument('-ymaxlen', action="store", default=20, dest="ymaxlen", type=int) 32 | parser.add_argument('-maxfeat', action="store", default=35000, dest="max_features", type=int) 33 | parser.add_argument('-classes', action="store", default=351, dest="num_classes", type=int) 34 | parser.add_argument('-sample', action="store", default=1, dest="samples", type=int) 35 | parser.add_argument('-nopad', action="store", default=False, dest="no_padding", type=bool) 36 | parser.add_argument('-lr', action="store", default=0.001, dest="lr", type=float) 37 | parser.add_argument('-load', action="store", default=False, dest="load_save", type=bool) 38 | parser.add_argument('-verbose', 
action="store", default=False, dest="verbose", type=bool) 39 | parser.add_argument('-train', action="store", default="train_all.txt", dest="train") 40 | parser.add_argument('-test', action="store", default="test_all.txt", dest="test") 41 | parser.add_argument('-dev', action="store", default="dev.txt", dest="dev") 42 | opts = parser.parse_args(sys.argv[1:]) 43 | print "lstm_units", opts.lstm_units 44 | print "epochs", opts.epochs 45 | print "batch_size", opts.batch_size 46 | print "emb", opts.emb 47 | print "samples", opts.samples 48 | print "xmaxlen", opts.xmaxlen 49 | print "ymaxlen", opts.ymaxlen 50 | print "max_features", opts.max_features 51 | print "no_padding", opts.no_padding 52 | return opts 53 | 54 | 55 | class AccCallBack(Callback): 56 | def __init__(self, xtrain, ytrain, xdev, ydev, xtest, ytest, vocab, opts): 57 | self.xtrain = xtrain 58 | self.ytrain = ytrain 59 | self.xdev = xdev 60 | self.ydev = ydev 61 | self.xtest = xtest 62 | self.ytest = ytest 63 | self.vocab = vocab 64 | self.opts = opts 65 | 66 | def on_epoch_end(self, epoch, logs={}): 67 | train_acc = compute_acc(self.xtrain, self.ytrain, self.vocab, self.model, self.opts) 68 | dev_acc = compute_acc(self.xdev, self.ydev, self.vocab, self.model, self.opts) 69 | test_acc = compute_acc(self.xtest, self.ytest, self.vocab, self.model, self.opts) 70 | logging.info('----------------------------------') 71 | logging.info('Epoch ' + str(epoch) + ' train loss:' + str(logs.get('loss')) + ' - Validation loss: ' + str( 72 | logs.get('val_loss')) + ' train acc: ' + str(train_acc[0]) + '/' + str(train_acc[1]) + ' dev acc: ' + str( 73 | dev_acc[0]) + '/' + str(dev_acc[1]) + ' test acc: ' + str(test_acc[0]) + '/' + str(test_acc[1])) 74 | logging.info('----------------------------------') 75 | 76 | 77 | def get_H_n(X): 78 | ans = X[:, -1, :] # get last element from time dim 79 | return ans 80 | 81 | 82 | def get_Y(X, xmaxlen): 83 | return X[:, :xmaxlen, :] # get first xmaxlen elem from time dim 84 | 85 | 86 | def get_R(X): 87 | Y, alpha = X[0], X[1] 88 | ans = K.T.batched_dot(Y, alpha) 89 | return ans 90 | 91 | 92 | def build_model(opts, verbose=False): 93 | k = 2 * opts.lstm_units # 300 94 | L = opts.xmaxlen # 20 95 | N = opts.xmaxlen + opts.ymaxlen + 1 # for delim 96 | print "x len", L, "total len", N 97 | print "k", k, "L", L 98 | 99 | main_input = Input(shape=(N,), dtype='int32', name='main_input') 100 | x = Embedding(output_dim=opts.emb, input_dim=opts.max_features, input_length=N, name='x')(main_input) 101 | drop_out = Dropout(0.1, name='dropout')(x) 102 | lstm_fwd = LSTM(opts.lstm_units, return_sequences=True, name='lstm_fwd')(drop_out) 103 | lstm_bwd = LSTM(opts.lstm_units, return_sequences=True, go_backwards=True, name='lstm_bwd')(drop_out) 104 | bilstm = merge([lstm_fwd, lstm_bwd], name='bilstm', mode='concat') 105 | drop_out = Dropout(0.1)(bilstm) 106 | h_n = Lambda(get_H_n, output_shape=(k,), name="h_n")(drop_out) 107 | Y = Lambda(get_Y, arguments={"xmaxlen": L}, name="Y", output_shape=(L, k))(drop_out) 108 | Whn = Dense(k, W_regularizer=l2(0.01), name="Wh_n")(h_n) 109 | Whn_x_e = RepeatVector(L, name="Wh_n_x_e")(Whn) 110 | WY = TimeDistributed(Dense(k, W_regularizer=l2(0.01)), name="WY")(Y) 111 | merged = merge([Whn_x_e, WY], name="merged", mode='sum') 112 | M = Activation('tanh', name="M")(merged) 113 | 114 | alpha_ = TimeDistributed(Dense(1, activation='linear'), name="alpha_")(M) 115 | flat_alpha = Flatten(name="flat_alpha")(alpha_) 116 | alpha = Dense(L, activation='softmax', name="alpha")(flat_alpha) 117 | 118 | 
Y_trans = Permute((2, 1), name="y_trans")(Y) # of shape (None,300,20) 119 | 120 | r_ = merge([Y_trans, alpha], output_shape=(k, 1), name="r_", mode=get_R) 121 | 122 | r = Reshape((k,), name="r")(r_) 123 | 124 | Wr = Dense(k, W_regularizer=l2(0.01))(r) 125 | Wh = Dense(k, W_regularizer=l2(0.01))(h_n) 126 | merged = merge([Wr, Wh], mode='sum') 127 | h_star = Activation('tanh')(merged) 128 | out = Dense(3, activation='softmax')(h_star) 129 | output = out 130 | model = Model(input=[main_input], output=output) 131 | if verbose: 132 | model.summary() 133 | # plot(model, 'model.png') 134 | # # model.compile(loss={'output':'binary_crossentropy'}, optimizer=Adam()) 135 | # model.compile(loss={'output':'categorical_crossentropy'}, optimizer=Adam(options.lr)) 136 | model.compile(loss='categorical_crossentropy', optimizer=Adam(opts.lr)) # use the opts passed to build_model, not the global options 137 | return model 138 | 139 | 140 | def compute_acc(X, Y, vocab, model, opts): 141 | scores = model.predict(X, batch_size=opts.batch_size) 142 | prediction = np.zeros(scores.shape) 143 | for i in range(scores.shape[0]): 144 | l = np.argmax(scores[i]) 145 | prediction[i][l] = 1.0 146 | assert np.array_equal(np.ones(prediction.shape[0]), np.sum(prediction, axis=1)) 147 | plabels = np.argmax(prediction, axis=1) 148 | tlabels = np.argmax(Y, axis=1) 149 | acc = accuracy(tlabels, plabels) 150 | return acc, acc 151 | 152 | 153 | def getConfig(opts): 154 | conf = [opts.xmaxlen, 155 | opts.ymaxlen, 156 | opts.batch_size, 157 | opts.emb, 158 | opts.lr, 159 | opts.samples, 160 | opts.lstm_units, 161 | opts.epochs] 162 | if opts.no_padding: 163 | conf.append("no-pad") 164 | return "_".join(map(lambda x: str(x), conf)) 165 | 166 | 167 | def save_model(model, wtpath, archpath, mode='yaml'): 168 | if mode == 'yaml': 169 | yaml_string = model.to_yaml() 170 | open(archpath, 'w').write(yaml_string) 171 | else: 172 | with open(archpath, 'w') as f: 173 | f.write(model.to_json()) 174 | model.save_weights(wtpath) 175 | 176 | 177 | def load_model(wtpath, archpath, mode='yaml'): 178 | if mode == 'yaml': 179 | model = model_from_yaml(open(archpath).read()) # ,custom_objects={"MyEmbedding": MyEmbedding}) 180 | else: 181 | with open(archpath) as f: 182 | model = model_from_json(f.read()) # , custom_objects={"MyEmbedding": MyEmbedding}) 183 | model.load_weights(wtpath) 184 | return model 185 | 186 | 187 | def concat_in_out(X, Y, vocab): 188 | numex = X.shape[0] # num examples 189 | glue = vocab["delimiter"] * np.ones(numex).reshape(numex, 1) 190 | inp_train = np.concatenate((X, glue, Y), axis=1) 191 | return inp_train 192 | 193 | 194 | def setup_logger(config_str): 195 | logging.basicConfig(level=logging.DEBUG, 196 | format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', 197 | datefmt='%m-%d %H:%M', 198 | filename=datetime.now().strftime('mylogfile_%H_%M_%d_%m_%Y.log'), 199 | filemode='w') 200 | 201 | 202 | if __name__ == "__main__": 203 | options = get_params() 204 | train = [l.strip().split('\t') for l in open(options.train)] 205 | dev = [l.strip().split('\t') for l in open(options.dev)] 206 | test = [l.strip().split('\t') for l in open(options.test)] 207 | vocab = get_vocab(train) 208 | print "vocab (incr. maxfeatures accordingly):",len(vocab) 209 | X_train,Y_train,Z_train=load_data(train,vocab) 210 | X_dev,Y_dev,Z_dev=load_data(dev,vocab) 211 | X_test,Y_test,Z_test=load_data(test,vocab) 212 | print 'Build model...' 
213 | model = build_model(options) 214 | 215 | config_str = getConfig(options) 216 | MODEL_ARCH = "arch_att" + config_str + ".yaml" 217 | MODEL_WGHT = "weights_att" + config_str + ".weights" 218 | 219 | MAXLEN = options.xmaxlen 220 | X_train = pad_sequences(X_train, maxlen=MAXLEN, value=vocab["unk"], padding='pre') 221 | X_dev = pad_sequences(X_dev, maxlen=MAXLEN, value=vocab["unk"], padding='pre') 222 | X_test = pad_sequences(X_test, maxlen=MAXLEN, value=vocab["unk"], padding='pre') 223 | Y_train = pad_sequences(Y_train, maxlen=MAXLEN, value=vocab["unk"], padding='post') 224 | Y_dev = pad_sequences(Y_dev, maxlen=MAXLEN, value=vocab["unk"], padding='post') 225 | Y_test = pad_sequences(Y_test, maxlen=MAXLEN, value=vocab["unk"], padding='post') 226 | 227 | net_train = concat_in_out(X_train, Y_train, vocab) 228 | net_dev = concat_in_out(X_dev, Y_dev, vocab) 229 | net_test = concat_in_out(X_test, Y_test, vocab) 230 | 231 | Z_train = to_categorical(Z_train, nb_classes=3) 232 | Z_dev = to_categorical(Z_dev, nb_classes=3) 233 | Z_test = to_categorical(Z_test, nb_classes=3) 234 | 235 | print X_train.shape, Y_train.shape, net_train.shape 236 | print map_to_txt(net_train[0], vocab), Z_train[0] 237 | print map_to_txt(net_train[1], vocab), Z_train[1] 238 | setup_logger(config_str) 239 | 240 | assert net_train[0][options.xmaxlen] == 1 241 | train_dict = {'input': net_train, 'output': Z_train} 242 | dev_dict = {'input': net_dev, 'output': Z_dev} 243 | print 'Build model...' 244 | model = build_model(options) 245 | 246 | logging.info(vars(options)) 247 | logging.info( 248 | "train size: " + str(len(net_train)) + " dev size: " + str(len(net_dev)) + " test size: " + str(len(net_test))) 249 | if options.load_save and os.path.exists(MODEL_ARCH) and os.path.exists(MODEL_WGHT): 250 | print "Loading pre-trained model from", MODEL_WGHT 251 | model = load_model(MODEL_WGHT, MODEL_ARCH, 'yaml') # keep the loaded model; the architecture is saved as yaml by save_model 252 | train_acc = compute_acc(net_train, Z_train, vocab, model, options) 253 | dev_acc = compute_acc(net_dev, Z_dev, vocab, model, options) 254 | test_acc = compute_acc(net_test, Z_test, vocab, model, options) 255 | print train_acc, dev_acc, test_acc 256 | else: 257 | # history = model.fit(train_dict, 258 | # batch_size=options.batch_size, 259 | # nb_epoch=options.epochs, 260 | # validation_data=dev_dict, 261 | # callbacks=[ 262 | # AccCallBack(net_train, Z_train, net_dev, Z_dev, net_test, Z_test, vocab, options)] 263 | # ) 264 | history = model.fit(net_train,Z_train, 265 | batch_size=options.batch_size, 266 | nb_epoch=options.epochs, 267 | validation_data=(net_dev,Z_dev), 268 | callbacks=[ 269 | AccCallBack(net_train, Z_train, net_dev, Z_dev, net_test, Z_test, vocab, options)] 270 | ) 271 | save_model(model, MODEL_WGHT, MODEL_ARCH) 272 | -------------------------------------------------------------------------------- /myutils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import numpy as np 4 | import argparse 5 | import random 6 | # from keras import backend as K 7 | def tokenize(sent): 8 | ''' 9 | data_reader.tokenize('a#b') 10 | ['a', '#', 'b'] 11 | ''' 12 | return [x.strip().lower() for x in re.split('(\W+)?', sent) if x.strip()] 13 | 14 | def map_to_idx(x, vocab): 15 | ''' 16 | x is a sequence of tokens 17 | ''' 18 | # 0 is for UNK 19 | return [ vocab[w] if w in vocab else 0 for w in x ] 20 | 21 | def map_to_txt(x,vocab): 22 | textify=map_to_idx(x,inverse_map(vocab)) 23 | return ' '.join(textify) 24 | 25 | def inverse_map(vocab): 26 | return {v: k for k, v 
in vocab.items()} 27 | 28 | def inverse_ids2txt(X_inp,Y_inp,vocabx,vocaby,outp=None): 29 | ''' 30 | takes x,y int seqs and maps them back to strings 31 | ''' 32 | inv_map_x = inverse_map(vocabx) 33 | inv_map_y = inverse_map(vocaby) 34 | if outp: 35 | for x,y,z in zip(X_inp,Y_inp,outp): 36 | print(' '.join(map_to_idx(x,inv_map_x))) 37 | print(' '.join(map_to_idx(y,inv_map_y))) 38 | print(z) 39 | else: 40 | for x,y in zip(X_inp,Y_inp): 41 | print(' '.join(map_to_idx(x,inv_map_x))) 42 | print(' '.join(map_to_idx(y,inv_map_y))) 43 | 44 | 45 | def create_train_examples(X, Y, yspace, num=-1, balanced=True): 46 | ''' 47 | :param X: X seq 48 | :param Y: Y seq 49 | :param yspace: from which to sample 50 | :param num: how many negs, -1 means all of it 51 | :return: x,y,z such that if x,y in X,Y then z=1 else 0 52 | ''' 53 | X_inp = [] 54 | Y_inp = [] 55 | outp = [] 56 | for x, y in zip(X, Y): 57 | neg_samples=yspace[:] # copy 58 | neg_samples.remove(y) 59 | if num == -1: 60 | pass 61 | else: 62 | neg_samples=[i for i in random.sample(neg_samples,num)] 63 | 64 | if not balanced: 65 | X_inp.append(x) 66 | Y_inp.append(y) 67 | outp.append([1.0, 0.0]) 68 | 69 | for yn in neg_samples: 70 | if balanced: 71 | X_inp.append(x) 72 | Y_inp.append(y) 73 | outp.append([1.0, 0.0]) 74 | X_inp.append(x) 75 | Y_inp.append(yn) 76 | outp.append([0.0, 1.0]) 77 | 78 | return X_inp, Y_inp, outp 79 | 80 | # def load_word2vec_embeddings(vocab_dim,index_dict,word_vectors,output_dim): 81 | # vocab_dim = 300 # dimensionality of your word vectors 82 | # n_symbols = len(index_dict) + 1 # adding 1 to account for 0th index (for masking) 83 | # embedding_weights = np.zeros((n_symbols+1,vocab_dim)) 84 | # for word,index in index_dict.items(): 85 | # embedding_weights[index,:] = word_vectors[word] 86 | # 87 | # return Embedding(output_dim=output_dim, input_dim=n_symbols + 1, mask_zero=True, weights=[embedding_weights]) # note you have to put embedding weights in a list by convention 88 | 89 | def check_layer_output_shape(layer, input_data): 90 | ndim = len(input_data.shape) 91 | layer.input = K.placeholder(ndim=ndim) 92 | layer.set_input_shape(input_data.shape) 93 | expected_output_shape = layer.output_shape[1:] 94 | 95 | function = K.function([layer.input], [layer.get_output()]) 96 | output = function([input_data])[0] 97 | print(output.shape,expected_output_shape) 98 | assert output.shape[1:] == expected_output_shape 99 | 100 | def mytest(): 101 | input_data = np.random.random((10, 142, 200)) 102 | Y = np.random.random((10, 142, 200)) 103 | alpha = np.random.random((10, 142)) 104 | print(input_data.shape) 105 | # layer = Reshape(dims=(2, 3)) 106 | # layer = Lambda(get_H_n, output_shape=(200,)) 107 | # Y = Layer() 108 | # alpha= Layer() 109 | # Y.set_input_shape((None,142,200)) 110 | # alpha.set_input_shape((None,142)) 111 | # ll=Merge([Y,alpha],mode='join') 112 | # layer=Lambda(get_R, output_shape=(200,1)) 113 | # layer.set_previous(ll) 114 | # print(layer.input) 115 | # func = K.function([layer.input], [layer.get_output()]) 116 | # layer=Lambda(get_Y, output_shape=(110, 200)) 117 | # check_layer_output_shape(layer, input_data) 118 | sys.exit(0) 119 | 120 | def show_weights(model,node_name,indices=[0]): 121 | Wr=model.nodes[node_name].get_weights() 122 | for i in indices: 123 | print(Wr[i][0:5]) 124 | 125 | 126 | def show_output(model,node_name,input_data_dict): 127 | lout= K.function([model.inputs[i].input for i in model.input_order], 128 | [model.nodes[node_name].get_output(train=False)]) 129 | output= lout([input_data_dict[i] 
for i in model.input_order])[0] 130 | print('input', input_data_dict['input'][0][0:10]) 131 | print(node_name, output[0][0:5]) 132 | print('input', input_data_dict['input'][1][0:10]) 133 | print(node_name, output[1][0:5]) 134 | 135 | 136 | def categorize(ll): 137 | new_y_train = [] 138 | for y in ll: 139 | if y == 1: 140 | new_y_train += [[0, 1]] 141 | else: 142 | new_y_train += [[1, 0]] 143 | return np.asarray(new_y_train) 144 | 145 | if __name__=="__main__": 146 | pass 147 | -------------------------------------------------------------------------------- /reader.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | from myutils import * 4 | from keras.preprocessing.sequence import pad_sequences 5 | from collections import Counter 6 | 7 | def get_data(data,vocab): 8 | for d in data: 9 | prem=map_to_idx(tokenize(d["sentence1"]),vocab) 10 | hyp=map_to_idx(tokenize(d["sentence2"]),vocab) 11 | label=d["gold_label"] 12 | yield prem, hyp , label 13 | 14 | def load_data(train,vocab,labels={'neutral':0,'entailment':1,'contradiction':2}): 15 | X,Y,Z=[],[],[] 16 | for p,h,l in train: 17 | p=map_to_idx(tokenize(p),vocab) 18 | h=map_to_idx(tokenize(h),vocab) 19 | # print 'P:',map_to_txt(p,vocab) 20 | # print 'H:',map_to_txt(h,vocab) 21 | # print p+" DELIMITER "+h 22 | # ph=map_to_idx(tokenize(p+" delimiter "+h),vocab) 23 | # print 'PH:',map_to_txt(ph,vocab) 24 | # print 'L:',l 25 | if l in labels: # get rid of '-' 26 | X+=[p] 27 | Y+=[h] 28 | Z+=[labels[l]] 29 | return X,Y,Z 30 | 31 | 32 | def get_vocab(data): 33 | vocab=Counter() 34 | for ex in data: 35 | tokens=tokenize(ex[0]) 36 | tokens+=tokenize(ex[1]) 37 | vocab.update(tokens) 38 | lst = ["unk", "delimiter"] + [ x for x, y in vocab.iteritems() if y > 0] 39 | vocab = dict([ (y,x) for x,y in enumerate(lst) ]) 40 | return vocab 41 | 42 | def convert2simple(data,out): 43 | ''' 44 | get the good stuff out of json into a tsv file 45 | ''' 46 | for d in data: 47 | print>>out, d["sentence1"]+"\t"+d["sentence2"]+"\t"+d["gold_label"] 48 | out.close() 49 | 50 | 51 | if __name__=="__main__": 52 | 53 | train=[l.strip().split('\t') for l in open('train.txt')][:20000] 54 | dev=[l.strip().split('\t') for l in open('dev.txt')] 55 | test=[l.strip().split('\t') for l in open('test.txt')] 56 | labels={'contradiction':-1,'neutral':0,'entailment':1} 57 | 58 | vocab=get_vocab(train) 59 | X_train,Y_train,Z_train=load_data(train,vocab) 60 | X_dev,Y_dev,Z_dev=load_data(dev,vocab) 61 | print len(X_train),X_train[0] 62 | print len(X_dev),X_dev[0] 63 | -------------------------------------------------------------------------------- /tf_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tensorflow as tf 4 | import numpy as np 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | from keras.preprocessing.sequence import pad_sequences 9 | from keras.utils.np_utils import to_categorical, accuracy 10 | import time 11 | 12 | V = 10000 13 | dim = 20 14 | k = 100 # opts.lstm_units 15 | 16 | 17 | def get_params(): 18 | parser = argparse.ArgumentParser(description='Short sample app') 19 | parser.add_argument('-lstm', action="store", default=150, dest="lstm_units", type=int) 20 | parser.add_argument('-epochs', action="store", default=20, dest="epochs", type=int) 21 | parser.add_argument('-batch', action="store", default=32, dest="batch_size", type=int) 22 | parser.add_argument('-emb', 
action="store", default=100, dest="emb", type=int) 23 | parser.add_argument('-xmaxlen', action="store", default=120, dest="xmaxlen", type=int) 24 | parser.add_argument('-ymaxlen', action="store", default=70, dest="ymaxlen", type=int) 25 | parser.add_argument('-maxfeat', action="store", default=35000, dest="max_features", type=int) 26 | parser.add_argument('-classes', action="store", default=3, dest="num_classes", type=int) 27 | parser.add_argument('-sample', action="store", default=1, dest="samples", type=int) 28 | parser.add_argument('-nopad', action="store", default=False, dest="no_padding", type=bool) 29 | parser.add_argument('-lr', action="store", default=0.001, dest="lr", type=float) 30 | parser.add_argument('-load', action="store", default=False, dest="load_save", type=bool) 31 | parser.add_argument('-verbose', action="store", default=False, dest="verbose", type=bool) 32 | parser.add_argument('-train', action="store", default="train_all.txt", dest="train") 33 | parser.add_argument('-test', action="store", default="test_all.txt", dest="test") 34 | parser.add_argument('-dev', action="store", default="dev.txt", dest="dev") 35 | opts = parser.parse_args(sys.argv[1:]) 36 | print ("lstm_units", opts.lstm_units) 37 | print ("epochs", opts.epochs) 38 | print ("batch_size", opts.batch_size) 39 | print ("emb", opts.emb) 40 | print ("samples", opts.samples) 41 | print ("xmaxlen", opts.xmaxlen) 42 | print ("ymaxlen", opts.ymaxlen) 43 | print ("max_features", opts.max_features) 44 | print ("no_padding", opts.no_padding) 45 | return opts 46 | 47 | 48 | class CustomModel: 49 | def __init__(self, opts, sess, XMAXLEN, YMAXLEN, vocab, batch_size=1000): 50 | self.dim = 100 51 | self.sess = sess 52 | self.h_dim = opts.lstm_units 53 | self.batch_size = batch_size 54 | self.vocab_size = len(vocab) 55 | self.XMAXLEN = XMAXLEN 56 | self.YMAXLEN = YMAXLEN 57 | 58 | # def last_relevant(output, length): 59 | # batch_size = tf.shape(output)[0] 60 | # max_length = tf.shape(output)[1] 61 | # out_size = int(output.get_shape()[2]) 62 | # index = tf.range(0, batch_size) * max_length + (length - 1) 63 | # flat = tf.reshape(output, [-1, out_size]) 64 | # relevant = tf.gather(flat, index) 65 | # return relevant 66 | 67 | # def repeat(x, n): 68 | # '''Repeats a 2D tensor: 69 | # if x has shape (samples, dim) and n=2, 70 | # the output will have shape (samples, 2, dim) 71 | # ''' 72 | # x = tf.expand_dims(x, 1) 73 | # pattern = tf.pack([1, n, 1]) 74 | # return tf.tile(x, pattern) 75 | 76 | def build_model(self): 77 | self.x = tf.placeholder(tf.int32, [self.batch_size, self.XMAXLEN], name="premise") 78 | self.x_length = tf.placeholder(tf.int32, [self.batch_size], name="premise_len") 79 | self.y = tf.placeholder(tf.int32, [self.batch_size, self.YMAXLEN], name="hypothesis") 80 | self.y_length = tf.placeholder(tf.int32, [self.batch_size], name="hyp_len") 81 | self.target = tf.placeholder(tf.float32, [self.batch_size,3], name="label") # change this to int32 and it breaks. 
82 | 83 | # DO NOT DO THIS 84 | # self.batch_size = tf.shape(self.x)[0] # batch size 85 | # self.x_length = tf.shape(self.x)[1] # batch size 86 | # print self.batch_size,self.x_length 87 | 88 | self.embed_matrix = tf.get_variable("embeddings", [self.vocab_size, self.dim]) 89 | self.x_emb = tf.nn.embedding_lookup(self.embed_matrix, self.x) 90 | self.y_emb = tf.nn.embedding_lookup(self.embed_matrix, self.y) 91 | 92 | print self.x_emb, self.y_emb 93 | with tf.variable_scope("encode_x"): 94 | self.fwd_lstm = tf.nn.rnn_cell.BasicLSTMCell(self.h_dim, state_is_tuple=True) 95 | self.x_output, self.x_state = tf.nn.dynamic_rnn(cell=self.fwd_lstm, inputs=self.x_emb, dtype=tf.float32) 96 | # self.x_output, self.x_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.fwd_lstm,cell_bw=self.bwd_lstm,inputs=self.x_emb,dtype=tf.float32) 97 | print self.x_output 98 | # print self.x_state 99 | # print tf.shape(self.x) 100 | with tf.variable_scope("encode_y"): 101 | self.fwd_lstm = tf.nn.rnn_cell.BasicLSTMCell(self.h_dim, state_is_tuple=True) 102 | self.y_output, self.y_state = tf.nn.dynamic_rnn(cell=self.fwd_lstm, inputs=self.y_emb, 103 | initial_state=self.x_state, dtype=tf.float32) 104 | # print self.y_output 105 | # print self.y_state 106 | 107 | self.Y = self.x_output # its length must be x_length 108 | 109 | # self.h_n = self.last_relevant(self.y_output,self.x_length) # TODO 110 | tmp5= tf.transpose(self.y_output, [1, 0, 2]) 111 | self.h_n = tf.gather(tmp5, int(tmp5.get_shape()[0]) - 1) 112 | print self.h_n 113 | 114 | # self.h_n_repeat = self.repeat(self.h_n,self.x_length) # TODO 115 | self.h_n_repeat = tf.expand_dims(self.h_n, 1) 116 | pattern = tf.pack([1, self.XMAXLEN, 1]) 117 | self.h_n_repeat = tf.tile(self.h_n_repeat, pattern) 118 | 119 | self.W_Y = tf.get_variable("W_Y", shape=[self.h_dim, self.h_dim]) 120 | self.W_h = tf.get_variable("W_h", shape=[self.h_dim, self.h_dim]) 121 | 122 | # TODO compute M = tanh(W*Y + W*[h_n...]) 123 | tmp1 = tf.matmul(tf.reshape(self.Y, shape=[self.batch_size * self.XMAXLEN, self.h_dim]), self.W_Y, 124 | name="Wy") 125 | self.Wy = tf.reshape(tmp1, shape=[self.batch_size, self.XMAXLEN, self.h_dim]); 126 | tmp2 = tf.matmul(tf.reshape(self.h_n_repeat, shape=[self.batch_size * self.XMAXLEN, self.h_dim]), self.W_h) 127 | self.Whn = tf.reshape(tmp2, shape=[self.batch_size, self.XMAXLEN, self.h_dim], name="Whn"); 128 | self.M = tf.tanh(tf.add(self.Wy, self.Whn), name="M") 129 | # print "M",self.M 130 | 131 | # use attention 132 | self.W_att = tf.get_variable("W_att",shape=[self.h_dim,1]) # h x 1 133 | tmp3 = tf.matmul(tf.reshape(self.M,shape=[self.batch_size*self.XMAXLEN,self.h_dim]),self.W_att) 134 | # need 1 here so that later can do multiplication with h x L 135 | self.att = tf.nn.softmax(tf.reshape(tmp3,shape=[self.batch_size,1, self.XMAXLEN],name="att")) # nb x 1 x Xmax 136 | # print "att",self.att 137 | 138 | # COMPUTE WEIGHTED 139 | self.r = tf.reshape(tf.batch_matmul(self.att, self.Y, name="r"),shape=[self.batch_size,self.h_dim]) # (nb,1,L) X (nb,L,k) = (nb,1,k) 140 | # get last step of Y as r which is (batch,k) 141 | # tmp4 = tf.transpose(self.Y, [1, 0, 2]) 142 | # self.r = tf.gather(tmp4, int(tmp4.get_shape()[0]) - 1) 143 | # print "r",self.r 144 | 145 | self.W_p, self.b_p= tf.get_variable("W_p", shape=[self.h_dim, self.h_dim]), tf.get_variable("b_p",shape=[self.h_dim],initializer=tf.constant_initializer()) 146 | self.W_x, self.b_x = tf.get_variable("W_x", shape=[self.h_dim, self.h_dim]), 
tf.get_variable("b_x",shape=[self.h_dim],initializer=tf.constant_initializer()) 147 | self.Wpr = tf.matmul(self.r, self.W_p, name="Wy") + self.b_p 148 | self.Wxhn = tf.matmul(self.h_n, self.W_x, name="Wxhn") + self.b_x 149 | self.hstar = tf.tanh(tf.add(self.Wpr, self.Wxhn), name="hstar") 150 | # print "Wpr",self.Wpr 151 | # print "Wxhn",self.Wxhn 152 | # print "hstar",self.hstar 153 | 154 | self.W_pred = tf.get_variable("W_pred", shape=[self.h_dim, 3]) 155 | self.pred = tf.nn.softmax(tf.matmul(self.hstar, self.W_pred), name="pred_layer") 156 | # print "pred",self.pred,"target",self.target 157 | correct = tf.equal(tf.argmax(self.pred,1),tf.argmax(self.target,1)) 158 | self.acc = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy") 159 | # self.H_n = self.last_relevant(self.en_output) 160 | self.loss = -tf.reduce_sum(self.target * tf.log(self.pred), name="loss") 161 | # print self.loss 162 | self.optimizer = tf.train.AdamOptimizer() 163 | self.optim = self.optimizer.minimize(self.loss, var_list=tf.trainable_variables()) 164 | _ = tf.scalar_summary("loss", self.loss) 165 | 166 | def train(self,\ 167 | xdata, ydata, zdata, x_lengths, y_lengths,\ 168 | xxdata, yydata, zzdata, xx_lengths, yy_lengths,\ 169 | MAXITER): 170 | merged_sum = tf.merge_all_summaries() 171 | # writer = tf.train.SummaryWriter("./logs/%s" % "modeldir", self.sess.graph_def) 172 | tf.initialize_all_variables().run() 173 | start_time = time.time() 174 | for ITER in range(MAXITER): 175 | # xdata, ydata, zdata, x_lengths, y_lengths = joint_shuffle(xdata, ydata, zdata, x_lengths, y_lengths) 176 | for i in xrange(0, len(l), self.batch_size): 177 | x,y,z,xlen,ylen=xdata[i:i + self.batch_size],\ 178 | ydata[i:i + self.batch_size],\ 179 | zdata[i:i + self.batch_size],\ 180 | x_lengths[i:i + self.batch_size],\ 181 | y_lengths[i:i + self.batch_size] 182 | feed_dict = {self.x: x,\ 183 | self.y: y,\ 184 | self.target: z,\ 185 | self.x_length:xlen,\ 186 | self.y_length:ylen} 187 | att, _ , loss, acc, summ = self.sess.run([self.att,self.optim, self.loss, self.acc, merged_sum],feed_dict=feed_dict) 188 | # print "att for 0th",att[0] 189 | print "loss",loss, "acc on train", acc 190 | total_test_acc=[] 191 | for i in xrange(0, len(l), self.batch_size): 192 | x,y,z,xlen,ylen=xxdata[i:i + self.batch_size],\ 193 | yydata[i:i + self.batch_size],\ 194 | zzdata[i:i + self.batch_size],\ 195 | xx_lengths[i:i + self.batch_size],\ 196 | yy_lengths[i:i + self.batch_size] 197 | tfeed_dict = {self.x: x,\ 198 | self.y: y,\ 199 | self.target: z,\ 200 | self.x_length:xlen,\ 201 | self.y_length:ylen} 202 | att, _ , test_loss, test_acc, summ = self.sess.run([self.att,self.optim, self.loss, self.acc, merged_sum],feed_dict=tfeed_dict) 203 | total_test_acc.append(test_acc) 204 | print "acc on test",np.mean(total_test_acc) 205 | # for x, y, z in zip(xdata, ydata, zdata): 206 | # print x, y, z 207 | # feeddict = {self.x: x, self.y: y, self.target: z, self.x_length:x_lengths, self.y_length:y_lengths} 208 | # self.sess.run([self.optim, self.loss, merged_sum],feed_dict=feeddict); 209 | elapsed_time = time.time() - start_time 210 | print "total time",elapsed_time 211 | 212 | def joint_shuffle(xdata, ydata, zdata, x_lengths, y_lengths): 213 | tmp=list(zip(xdata, ydata, zdata, x_lengths, y_lengths)) 214 | random.shuffle(tmp) 215 | xdata, ydata, zdata, x_lengths, y_lengths = zip(*tmp) 216 | return xdata, ydata, zdata, x_lengths, y_lengths 217 | if __name__ == "__main__": 218 | from reader import * 219 | from myutils import * 220 | 221 | options = get_params() 222 | 
train = [l.strip().split('\t') for l in open(options.train)] 223 | dev = [l.strip().split('\t') for l in open(options.dev)] 224 | test = [l.strip().split('\t') for l in open(options.test)] 225 | vocab = get_vocab(train) 226 | 227 | X_train, Y_train, Z_train = load_data(train, vocab) 228 | X_dev, Y_dev, Z_dev = load_data(dev, vocab) 229 | X_test, Y_test, Z_test = load_data(test, vocab) 230 | # print Z_train[1] 231 | # sys.exit() 232 | 233 | X_train_lengths = [len(x) for x in X_train] 234 | X_dev_lengths = np.asarray([len(x) for x in X_dev]).reshape(len(X_dev)) 235 | X_test_lengths = np.asarray([len(x) for x in X_test]).reshape(len(X_test)) 236 | # print len(X_test_lengths) 237 | 238 | Y_train_lengths = np.asarray([len(x) for x in Y_train]).reshape(len(Y_train)) 239 | Y_dev_lengths = np.asarray([len(x) for x in Y_dev]).reshape(len(Y_dev)) 240 | Y_test_lengths = np.asarray([len(x) for x in Y_test]).reshape(len(Y_test)) 241 | # print len(Y_test_lengths) 242 | 243 | Z_train = to_categorical(Z_train, nb_classes=options.num_classes) 244 | Z_dev = to_categorical(Z_dev, nb_classes=options.num_classes) 245 | Z_test = to_categorical(Z_test, nb_classes=options.num_classes) 246 | # print Z_train[0] 247 | 248 | XMAXLEN = options.xmaxlen 249 | YMAXLEN = options.ymaxlen 250 | MAXITER = 1000 251 | X_train = pad_sequences(X_train, maxlen=XMAXLEN, value=vocab["unk"], padding='post') ## NO NEED TO GO TO NUMPY , CAN GIVE LIST OF PADDED LIST 252 | X_dev = pad_sequences(X_dev, maxlen=XMAXLEN, value=vocab["unk"], padding='post') 253 | X_test = pad_sequences(X_test, maxlen=XMAXLEN, value=vocab["unk"], padding='post') 254 | Y_train = pad_sequences(Y_train, maxlen=YMAXLEN, value=vocab["unk"], padding='post') 255 | Y_dev = pad_sequences(Y_dev, maxlen=YMAXLEN, value=vocab["unk"], padding='post') 256 | Y_test = pad_sequences(Y_test, maxlen=YMAXLEN, value=vocab["unk"], padding='post') 257 | print X_test.shape, X_test_lengths.shape 258 | vocab = get_vocab(train) 259 | with tf.Session() as sess: 260 | model = CustomModel(options, sess, XMAXLEN, YMAXLEN, vocab, batch_size=200) 261 | model.build_model() 262 | model.train(X_train,Y_train,Z_train,X_train_lengths,Y_train_lengths,\ 263 | X_test,Y_test,Z_test,X_test_lengths,Y_test_lengths,\ 264 | MAXITER) 265 | -------------------------------------------------------------------------------- /train10.txt: -------------------------------------------------------------------------------- 1 | A person on a horse jumps over a broken down airplane. A person is training his horse for a competition. neutral 2 | A person on a horse jumps over a broken down airplane. A person is at a diner, ordering an omelette. contradiction 3 | A person on a horse jumps over a broken down airplane. A person is outdoors, on a horse. entailment 4 | Children smiling and waving at camera They are smiling at their parents neutral 5 | Children smiling and waving at camera There are children present entailment 6 | Children smiling and waving at camera The kids are frowning contradiction 7 | A boy is jumping on skateboard in the middle of a red bridge. The boy skates down the sidewalk. contradiction 8 | A boy is jumping on skateboard in the middle of a red bridge. The boy does a skateboarding trick. entailment 9 | A boy is jumping on skateboard in the middle of a red bridge. The boy is wearing safety equipment. neutral 10 | An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background. 
An older man drinks his juice as he waits for his daughter to get off work. neutral 11 | --------------------------------------------------------------------------------
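Data preparation (referenced in the README above): a minimal, hypothetical sketch of how the tab-separated train/dev/test files can be produced from the SNLI download using `convert2simple` from `reader.py`. The `snli_1.0_*.jsonl` file names and the output file names are assumptions, not part of this repository.

```
# Hypothetical data-preparation script; assumes the standard SNLI jsonl files
# have been unpacked into the working directory.
import json

from reader import convert2simple

splits = [("snli_1.0_train.jsonl", "train.txt"),
          ("snli_1.0_dev.jsonl", "dev.txt"),
          ("snli_1.0_test.jsonl", "test.txt")]

for jsonl_name, tsv_name in splits:
    # each line of the SNLI jsonl files is one example with
    # "sentence1", "sentence2" and "gold_label" fields
    examples = (json.loads(line) for line in open(jsonl_name))
    # writes premise \t hypothesis \t label per line and closes the output file
    convert2simple(examples, open(tsv_name, "w"))
```

Examples with the unusable gold label "-" are kept by `convert2simple`; `load_data` in `reader.py` filters them out when the files are read back in.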