├── README.md
├── data
│   ├── download.py
│   ├── fetch_and_preprocess.sh
│   ├── filter_glove.py
│   ├── preprocess_data.py
│   └── preprocess_vocab.py
├── model
│   ├── __init__.py
│   ├── data_helpers.py
│   ├── eval.py
│   ├── model_ESIM.py
│   └── train.py
└── scripts
    ├── test.sh
    └── train.sh

/README.md:
--------------------------------------------------------------------------------
1 | # Enhanced LSTM for Natural Language Inference
2 | A TensorFlow implementation of the ESIM model for natural language inference.
3 | 
4 | This repository contains a TensorFlow implementation of the sequential model presented in the paper ["Enhanced LSTM for Natural Language Inference"](http://www.aclweb.org/anthology/P17-1152) by Chen et al. (2017).
5 | 
6 | # Dependencies
7 | Python 2.7
8 | TensorFlow 1.4.0
9 | 
10 | # Running the scripts
11 | ## Download and preprocess
12 | ```
13 | cd data
14 | bash fetch_and_preprocess.sh
15 | ```
16 | 
17 | ## Train and test a new model
18 | ```
19 | cd scripts
20 | bash train.sh
21 | ```
22 | Training progress and results are written to log.txt.
23 | 
24 | ## Test a trained model
25 | ```
26 | bash test.sh
27 | ```
28 | The test results are written to log_test.txt.
29 | 
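## Inspect the preprocessed data (optional)
The snippet below is a minimal sketch, not one of the provided scripts, for sanity-checking the output of `fetch_and_preprocess.sh` with the helpers in `model/data_helpers.py`. It assumes the default output paths and is run from the repository root.
```
from model import data_helpers

# vocab maps each token to its integer id; the second return value (idf) is left empty
vocab, idf = data_helpers.loadVocab('data/word_sequence/vocab.txt')
print('vocabulary size: {}'.format(len(vocab)))

# each entry: (premise_tokens, premise_vec, premise_len, label,
#              hypothesis_tokens, hypothesis_vec, hypothesis_len)
dev = data_helpers.loadDataset('data/word_sequence/premise_snli_1.0_dev.txt',
                               'data/word_sequence/hypothesis_snli_1.0_dev.txt',
                               'data/word_sequence/label_snli_1.0_dev.txt',
                               vocab, 100)
print('dev examples: {}'.format(len(dev)))

p_tokens, p_vec, p_len, label, h_tokens, h_vec, h_len = dev[0]
print('{} / {} -> label {}'.format(' '.join(p_tokens), ' '.join(h_tokens), label))
```
Labels follow the mapping used in `data/preprocess_data.py`: 0 = entailment, 1 = neutral, 2 = contradiction.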
--------------------------------------------------------------------------------
/data/download.py:
--------------------------------------------------------------------------------
1 | """
2 | Downloads the following:
3 | - GloVe vectors
4 | - Stanford Natural Language Inference (SNLI) Corpus
5 | 
6 | """
7 | 
8 | import sys
9 | import os
10 | import zipfile
11 | import gzip
12 | 
13 | def download(url, dirpath):
14 |     filename = url.split('/')[-1]
15 |     filepath = os.path.join(dirpath, filename)
16 |     os.system('wget {} -O {}'.format(url, filepath))
17 |     return filepath
18 | 
19 | def unzip(filepath):
20 |     print("Extracting: " + filepath)
21 |     dirpath = os.path.dirname(filepath)
22 |     with zipfile.ZipFile(filepath) as zf:
23 |         zf.extractall(dirpath)
24 |     os.remove(filepath)
25 | 
26 | def download_wordvecs(dirpath):
27 |     if os.path.exists(dirpath):
28 |         print('Found GloVe vectors - skip')
29 |         return
30 |     else:
31 |         os.makedirs(dirpath)
32 |     url = 'http://www-nlp.stanford.edu/data/glove.840B.300d.zip'
33 |     unzip(download(url, dirpath))
34 | 
35 | def download_snli(dirpath):
36 |     if os.path.exists(dirpath):
37 |         print('Found SNLI dataset - skip')
38 |         return
39 |     else:
40 |         os.makedirs(dirpath)
41 |     url = 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip'
42 |     unzip(download(url, dirpath))
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     base_dir = os.path.dirname(os.path.realpath(__file__))
47 |     snli_dir = os.path.join(base_dir, 'snli')
48 |     wordvec_dir = os.path.join(base_dir, 'glove')
49 |     download_snli(snli_dir)
50 |     download_wordvecs(wordvec_dir)
51 | 
52 | 
--------------------------------------------------------------------------------
/data/fetch_and_preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | python download.py
4 | python preprocess_data.py
5 | python preprocess_vocab.py
6 | python filter_glove.py
--------------------------------------------------------------------------------
/data/filter_glove.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | base_dir = os.path.dirname(os.path.realpath(__file__))
4 | 
5 | vocab_file = os.path.join(base_dir, 'word_sequence/vocab.txt')
6 | vocab = []
7 | with open(vocab_file, 'rb') as f:
8 |     for line in f:
9 |         line = line.decode('utf-8').strip()
10 |         vocab.append(line)
11 | print("Vocabulary size: {}".format(len(vocab)))
12 | 
13 | 
14 | print("Filtering GloVe embeddings ...")
15 | glove_file = os.path.join(base_dir, 'glove/glove.840B.300d.txt')
16 | vectors = {}
17 | with open(glove_file, 'rt') as f:
18 |     for line in f:
19 |         items = line.strip().split(' ')
20 |         if len(items[0]) <= 0 or len(items) != 300 + 1:  # skip malformed entries (a few GloVe 840B tokens contain spaces)
21 |             continue
22 |         vec = [float(items[i]) for i in range(1, 300+1)]
23 |         vectors[items[0]] = vec
24 | print("GloVe size: {}".format(len(vectors)))
25 | 
26 | 
27 | filtered_vectors = {}
28 | NOT = 0
29 | for word in vocab:
30 |     if word in vectors:
31 |         filtered_vectors[word] = vectors[word]
32 |     else:
33 |         NOT += 1
34 | print("Filtered vectors size: {}".format(len(filtered_vectors)))
35 | print("Words not in GloVe: {}".format(NOT))
size: {}".format(NOT)) 36 | 37 | 38 | filtered_glove_file = os.path.join(base_dir, 'glove/filtered_glove_840B_300d.txt') 39 | with open(filtered_glove_file, 'w') as f: 40 | for word,vector in filtered_vectors.items(): 41 | to_write = [] 42 | to_write.append(word) 43 | vector = [str(ele) for ele in vector] 44 | to_write.extend(vector) 45 | f.write(" ".join(to_write)) 46 | f.write("\n") 47 | print("Write to {} finished.".format(filtered_glove_file)) 48 | -------------------------------------------------------------------------------- /data/preprocess_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import os 4 | import numpy 5 | import cPickle as pkl 6 | 7 | from collections import OrderedDict 8 | 9 | dic = {'entailment': '0', 'neutral': '1', 'contradiction': '2'} 10 | 11 | def build_dictionary(filepaths, dst_path, lowercase=False): 12 | word_freqs = OrderedDict() 13 | for filepath in filepaths: 14 | print 'Processing', filepath 15 | with open(filepath, 'r') as f: 16 | for line in f: 17 | if lowercase: 18 | line = line.lower() 19 | words_in = line.strip().split(' ') 20 | for w in words_in: 21 | if w not in word_freqs: 22 | word_freqs[w] = 0 23 | word_freqs[w] += 1 24 | 25 | words = word_freqs.keys() 26 | freqs = word_freqs.values() 27 | 28 | sorted_idx = numpy.argsort(freqs) 29 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 30 | 31 | worddict = OrderedDict() 32 | worddict['_PAD_'] = 0 # default, padding 33 | worddict['_UNK_'] = 1 # out-of-vocabulary 34 | worddict['_BOS_'] = 2 # begin of sentence token 35 | worddict['_EOS_'] = 3 # end of sentence token 36 | 37 | for ii, ww in enumerate(sorted_words): 38 | worddict[ww] = ii + 4 39 | 40 | with open(dst_path, 'wb') as f: 41 | pkl.dump(worddict, f) 42 | 43 | print 'Dict size', len(worddict) 44 | print 'Done' 45 | 46 | 47 | def build_sequence(filepath, dst_dir): 48 | filename = os.path.basename(filepath) 49 | print filename 50 | len_p = [] 51 | len_h = [] 52 | with open(filepath) as f, \ 53 | open(os.path.join(dst_dir, 'premise_%s'%filename), 'w') as f1, \ 54 | open(os.path.join(dst_dir, 'hypothesis_%s'%filename), 'w') as f2, \ 55 | open(os.path.join(dst_dir, 'label_%s'%filename), 'w') as f3: 56 | next(f) # skip the header row 57 | for line in f: 58 | sents = line.strip().split('\t') 59 | if sents[0] is '-': 60 | continue 61 | 62 | words_in = sents[1].strip().split(' ') 63 | words_in = [x for x in words_in if x not in ('(',')')] 64 | f1.write(' '.join(words_in) + '\n') 65 | len_p.append(len(words_in)) 66 | 67 | words_in = sents[2].strip().split(' ') 68 | words_in = [x for x in words_in if x not in ('(',')')] 69 | f2.write(' '.join(words_in) + '\n') 70 | len_h.append(len(words_in)) 71 | 72 | f3.write(dic[sents[0]] + '\n') 73 | 74 | print 'max min len premise', max(len_p), min(len_p) 75 | print 'max min len hypothesis', max(len_h), min(len_h) 76 | 77 | 78 | def make_dirs(dirs): 79 | for d in dirs: 80 | if not os.path.exists(d): 81 | os.makedirs(d) 82 | 83 | if __name__ == '__main__': 84 | print('=' * 80) 85 | print('Preprocessing snli_1.0 dataset') 86 | print('=' * 80) 87 | base_dir = os.path.dirname(os.path.realpath(__file__)) 88 | dst_dir = os.path.join(base_dir, 'word_sequence') 89 | snli_dir = os.path.join(base_dir, 'snli/snli_1.0') 90 | make_dirs([dst_dir]) 91 | 92 | build_sequence(os.path.join(snli_dir, 'snli_1.0_dev.txt'), dst_dir) 93 | build_sequence(os.path.join(snli_dir, 'snli_1.0_test.txt'), dst_dir) 94 | build_sequence(os.path.join(snli_dir, 
'snli_1.0_train.txt'), dst_dir) 95 | 96 | build_dictionary([os.path.join(dst_dir, 'premise_snli_1.0_train.txt'), 97 | os.path.join(dst_dir, 'hypothesis_snli_1.0_train.txt')], 98 | os.path.join(dst_dir, 'vocab_cased.pkl')) 99 | 100 | -------------------------------------------------------------------------------- /data/preprocess_vocab.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cPickle 3 | 4 | base_dir = os.path.dirname(os.path.realpath(__file__)) 5 | dictionary = os.path.join(base_dir, 'word_sequence/vocab_cased.pkl') 6 | vocab = os.path.join(base_dir, 'word_sequence/vocab.txt') 7 | 8 | with open(dictionary, 'rb') as f: 9 | worddicts = cPickle.load(f) 10 | 11 | with open(vocab, 'w') as f: 12 | for k, v in worddicts.items(): 13 | f.write(k) 14 | f.write('\n') 15 | print("Preprocess vocab done.") 16 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /model/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | def loadVocab(fname): 6 | ''' 7 | vocab = {"": 0, ...} 8 | idf = { 0: log(total_doc/doc_freq)} 9 | ''' 10 | vocab={} 11 | idf={} 12 | with open(fname, 'rt') as f: 13 | for index, word in enumerate(f): 14 | word = word.decode('utf-8').strip() 15 | vocab[word] = index 16 | return vocab, idf 17 | 18 | def toVec(tokens, vocab, maxlen): 19 | ''' 20 | length: length of the input sequence 21 | vec: map the token to the vocab_id, return a varied-length array [3, 6, 4, 3, ...] 
22 | ''' 23 | n = len(tokens) 24 | length = 0 25 | vec=[] 26 | for i in range(n): 27 | length += 1 28 | if tokens[i] in vocab: 29 | vec.append(vocab[tokens[i]]) 30 | else: 31 | vec.append(vocab["_UNK_"]) 32 | 33 | return length, np.array(vec) 34 | 35 | 36 | def loadDataset(premise_file, hypothesis_file, label_file, vocab, maxlen): 37 | 38 | # premise 39 | premise_tokens = [] 40 | premise_vec = [] 41 | premise_len = [] 42 | with open(premise_file, 'rt') as f1: 43 | for line in f1: 44 | line = line.decode('utf-8').strip() 45 | p_tokens = line.split(' ')[:maxlen] 46 | p_len, p_vec = toVec(p_tokens, vocab, maxlen) 47 | premise_tokens.append(p_tokens) 48 | premise_vec.append(p_vec) 49 | premise_len.append(p_len) 50 | 51 | # hypothesis 52 | hypothesis_tokens = [] 53 | hypothesis_vec = [] 54 | hypothesis_len = [] 55 | with open(hypothesis_file, 'rt') as f2: 56 | for line in f2: 57 | line = line.decode('utf-8').strip() 58 | h_tokens = line.split(' ')[:maxlen] 59 | h_len, h_vec = toVec(h_tokens, vocab, maxlen) 60 | hypothesis_tokens.append(h_tokens) 61 | hypothesis_vec.append(h_vec) 62 | hypothesis_len.append(h_len) 63 | 64 | # label 65 | label = [] 66 | with open(label_file, 'rt') as f3: 67 | for line in f3: 68 | line = line.decode('utf-8').strip() 69 | label.append(int(line)) 70 | 71 | assert len(premise_tokens) == len(hypothesis_tokens) 72 | assert len(hypothesis_tokens) == len(label) 73 | 74 | # dataset 75 | dataset = [] 76 | for i in range(len(label)): 77 | dataset.append( (premise_tokens[i], premise_vec[i], premise_len[i], 78 | label[i], 79 | hypothesis_tokens[i], hypothesis_vec[i], hypothesis_len[i]) ) 80 | 81 | return dataset 82 | 83 | 84 | def word_count(q_vec, a_vec, q_len, a_len, idf): 85 | q_set = set([q_vec[i] for i in range(q_len) if q_vec[i] > 100]) 86 | a_set = set([a_vec[i] for i in range(a_len) if a_vec[i] > 100]) 87 | new_q_len = float(max(len(q_set), 1)) 88 | count1 = 0.0 89 | count2 = 0.0 90 | for id1 in q_set: 91 | if id1 in a_set: 92 | count1 += 1.0 93 | if id1 in idf: 94 | count2 += idf[id1] 95 | return count1/new_q_len, count2/new_q_len 96 | 97 | def common_words(q_vec, a_vec, q_len, a_len): 98 | q_set = set([q_vec[i] for i in range(q_len) if q_vec[i] > 100]) 99 | a_set = set([a_vec[i] for i in range(a_len) if a_vec[i] > 100]) 100 | return q_set.intersection(a_set) 101 | 102 | def tfidf_feature(id_list, common_id_set, idf): 103 | word_freq={} 104 | for t in id_list: 105 | if t in common_id_set: 106 | if t in word_freq: 107 | word_freq[t] += 1 108 | else: 109 | word_freq[t] = 1 110 | tfidf_feature={} 111 | for t in common_id_set: 112 | if t in idf: 113 | tfidf_feature[t] = word_freq[t] * idf[t] 114 | else: 115 | tfidf_feature[t] = word_freq[t] 116 | return tfidf_feature 117 | 118 | def word_feature(id_list, tfidf): 119 | len1 = len(id_list) 120 | features = np.zeros((len1, 2), dtype='float32') 121 | for idx, t in enumerate(id_list): 122 | if t in tfidf: 123 | features[idx, 0] = 1 124 | features[idx, 1] = tfidf[t] 125 | return features 126 | 127 | def normalize_vec(vec, maxlen): 128 | ''' 129 | pad the original vec to the same maxlen 130 | [3, 4, 7] maxlen=5 --> [3, 4, 7, 0, 0] 131 | ''' 132 | if len(vec) == maxlen: 133 | return vec 134 | 135 | new_vec = np.zeros(maxlen, dtype='int32') 136 | for i in range(len(vec)): 137 | new_vec[i] = vec[i] 138 | return new_vec 139 | 140 | 141 | def batch_iter(data, batch_size, num_epochs, idf, maxlen, shuffle=True): 142 | """ 143 | Generates a batch iterator for a dataset. 
144 | """ 145 | data_size = len(data) 146 | num_batches_per_epoch = int(len(data)/batch_size) + 1 147 | for epoch in range(num_epochs): 148 | # Shuffle the data at each epoch 149 | if shuffle: 150 | random.shuffle(data) 151 | for batch_num in range(num_batches_per_epoch): 152 | start_index = batch_num * batch_size 153 | end_index = min((batch_num + 1) * batch_size, data_size) 154 | 155 | x_premise = [] 156 | x_hypothesis = [] 157 | x_premise_len = [] 158 | x_hypothesis_len = [] 159 | 160 | targets = [] 161 | p_features=[] 162 | h_features=[] 163 | extra_feature =[] 164 | 165 | for rowIdx in range(start_index, end_index): 166 | premise_tokens, premise_vec, premise_len,\ 167 | label, \ 168 | hypothesis_tokens, hypothesis_vec, hypothesis_len = data[rowIdx] 169 | 170 | # feature 1 171 | word_count_feature1, word_count_feature2 = word_count(premise_vec, hypothesis_vec, premise_len, hypothesis_len, idf) # scalar feature 172 | common_ids = common_words(premise_vec, hypothesis_vec, premise_len, hypothesis_len) # list: q_set.intersection(a_set) when word_id > 100 173 | tfidf = tfidf_feature(premise_vec, common_ids, idf) # dict: { id: scalar feature } 174 | 175 | # normalize premise_vec and hypothesis_vec 176 | new_premise_vec = normalize_vec(premise_vec, maxlen) # pad the original vec to the same maxlen 177 | new_hypothesis_vec = normalize_vec(hypothesis_vec, maxlen) 178 | 179 | # feature 2 180 | p_word_feature = word_feature(new_premise_vec, tfidf) # feature of np.array( maxlen, 2 ) 181 | h_word_feature = word_feature(new_hypothesis_vec, tfidf) 182 | 183 | x_premise.append(new_premise_vec) 184 | x_premise_len.append(premise_len) 185 | x_hypothesis.append(new_hypothesis_vec) 186 | x_hypothesis_len.append(hypothesis_len) 187 | targets.append(label) 188 | 189 | p_features.append(p_word_feature) 190 | h_features.append(h_word_feature) 191 | 192 | extra_feature.append(np.array([word_count_feature1, word_count_feature2], dtype="float32") ) 193 | 194 | yield np.array(x_premise), np.array(x_hypothesis), np.array(x_premise_len), np.array(x_hypothesis_len),\ 195 | np.array(targets), np.array(extra_feature), np.array(p_features), np.array(h_features) 196 | 197 | -------------------------------------------------------------------------------- /model/eval.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from model import data_helpers 4 | 5 | 6 | # Files 7 | tf.flags.DEFINE_string("test_premise_file", "", "test premise file") 8 | tf.flags.DEFINE_string("test_hypothesis_file", "", "test hypothesis file") 9 | tf.flags.DEFINE_string("test_label_file", "", "test label file") 10 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file (map word to integer)") 11 | 12 | # Data Parameters 13 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)") 14 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 15 | tf.flags.DEFINE_integer("max_sequence_length", 100, "max sequence length") 16 | 17 | # Misc Parameters 18 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 19 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 20 | 21 | 22 | FLAGS = tf.flags.FLAGS 23 | FLAGS._parse_flags() 24 | print("\nParameters:") 25 | for attr, value in sorted(FLAGS.__flags.items()): 26 | print("{}={}".format(attr.upper(), value)) 27 | print("") 28 | 29 | vocab, idf = data_helpers.loadVocab(FLAGS.vocab_file) 30 | 
print('vocabulary size: {}'.format(len(vocab))) 31 | 32 | SEQ_LEN = FLAGS.max_sequence_length 33 | test_dataset = data_helpers.loadDataset(FLAGS.test_premise_file, FLAGS.test_hypothesis_file, FLAGS.test_label_file, vocab, SEQ_LEN) 34 | print('test_dataset: {}'.format(len(test_dataset))) 35 | 36 | print("\nEvaluating...\n") 37 | 38 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 39 | print(checkpoint_file) 40 | 41 | graph = tf.Graph() 42 | with graph.as_default(): 43 | session_conf = tf.ConfigProto( 44 | allow_soft_placement=FLAGS.allow_soft_placement, 45 | log_device_placement=FLAGS.log_device_placement) 46 | sess = tf.Session(config=session_conf) 47 | with sess.as_default(): 48 | # Load the saved meta graph and restore variables 49 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 50 | saver.restore(sess, checkpoint_file) 51 | 52 | # Get the placeholders from the graph by name 53 | premise = graph.get_operation_by_name("premise").outputs[0] 54 | hypothesis = graph.get_operation_by_name("hypothesis").outputs[0] 55 | 56 | premise_len = graph.get_operation_by_name("premise_len").outputs[0] 57 | hypothesis_len = graph.get_operation_by_name("hypothesis_len").outputs[0] 58 | 59 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 60 | model_extra_feature = graph.get_operation_by_name("extra_feature").outputs[0] 61 | 62 | premise_word_feature = graph.get_operation_by_name("premise_word_feature").outputs[0] 63 | hypothesis_word_feature = graph.get_operation_by_name("hypothesis_word_feature").outputs[0] 64 | 65 | # Tensors we want to evaluate 66 | prob = graph.get_operation_by_name("prediction_layer/prob").outputs[0] 67 | 68 | num_test = 0 69 | prob_list = [] 70 | target_list = [] 71 | test_batches = data_helpers.batch_iter(test_dataset, FLAGS.batch_size, 1, idf, SEQ_LEN, shuffle=False) 72 | for test_batch in test_batches: 73 | x_premise, x_hypothesis, x_premise_len, x_hypothesis_len, \ 74 | targets, extra_feature, p_features, h_features = test_batch 75 | feed_dict = { 76 | premise: x_premise, 77 | hypothesis: x_hypothesis, 78 | premise_len: x_premise_len, 79 | hypothesis_len: x_hypothesis_len, 80 | dropout_keep_prob: 1.0, 81 | model_extra_feature: extra_feature, 82 | premise_word_feature: p_features, 83 | hypothesis_word_feature: h_features, 84 | } 85 | predicted_prob = sess.run(prob, feed_dict) 86 | prob_list.append(predicted_prob) 87 | target_list.append(targets) 88 | num_test += len(predicted_prob) 89 | print('num_test_sample={}'.format(num_test)) 90 | 91 | probs_aggre = np.concatenate(prob_list, axis=0) 92 | labels_aggre = np.concatenate(target_list, axis=0) 93 | 94 | prediction = np.argmax(probs_aggre, axis=1) 95 | accuracy = np.equal(prediction, labels_aggre) 96 | accuracy = np.mean(accuracy) 97 | 98 | print('num_test_samples: {} accuracy: {}'.format(num_test, round(accuracy, 3))) 99 | -------------------------------------------------------------------------------- /model/model_ESIM.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | FLAGS = tf.flags.FLAGS 5 | 6 | def get_embeddings(vocab): 7 | print("get_embedding") 8 | initializer = load_word_embeddings(vocab, FLAGS.embedding_dim) 9 | return tf.constant(initializer, name="word_embedding") 10 | # return tf.get_variable(initializer=initializer, name="word_embedding") 11 | 12 | def load_embed_vectors(fname, dim): 13 | vectors = {} 14 | for line in open(fname, 'rt'): 15 | items = 
line.strip().split(' ') 16 | if len(items[0]) <= 0: 17 | continue 18 | vec = [float(items[i]) for i in range(1, dim+1)] 19 | vectors[items[0]] = vec 20 | 21 | return vectors 22 | 23 | def load_word_embeddings(vocab, dim): 24 | vectors = load_embed_vectors(FLAGS.embedded_vector_file, dim) 25 | vocab_size = len(vocab) 26 | embeddings = np.zeros((vocab_size, dim), dtype='float32') 27 | for word, code in vocab.items(): 28 | if word in vectors: 29 | embeddings[code] = vectors[word] 30 | else: 31 | embeddings[code] = np.random.uniform(-0.25, 0.25, dim) 32 | 33 | return embeddings 34 | 35 | 36 | def lstm_layer(inputs, input_seq_len, rnn_size, dropout_keep_prob, scope, scope_reuse=False): 37 | with tf.variable_scope(scope, reuse=scope_reuse) as vs: 38 | fw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 39 | fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=dropout_keep_prob) 40 | bw_cell = tf.contrib.rnn.LSTMCell(rnn_size, forget_bias=1.0, state_is_tuple=True, reuse=scope_reuse) 41 | bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=dropout_keep_prob) 42 | rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell, cell_bw=bw_cell, 43 | inputs=inputs, 44 | sequence_length=input_seq_len, 45 | dtype=tf.float32) 46 | return rnn_outputs, rnn_states 47 | 48 | # output = tanh( xW + b ) 49 | def ffnn_layer(inputs, output_size, dropout_keep_prob, scope, scope_reuse=False): 50 | with tf.variable_scope(scope, reuse=scope_reuse): 51 | input_size = inputs.get_shape()[-1].value 52 | W = tf.get_variable("W_trans", shape=[input_size, output_size], initializer=tf.orthogonal_initializer()) 53 | b = tf.get_variable("b_trans", shape=[output_size, ], initializer=tf.zeros_initializer()) 54 | outputs = tf.nn.relu(tf.einsum('aij,jk->aik', inputs, W) + b) 55 | outputs = tf.nn.dropout(outputs, keep_prob=dropout_keep_prob) 56 | return outputs 57 | 58 | def premise_hypothesis_similarity_matrix(premise, hypothesis): 59 | #[batch_size, dim, p_len] 60 | p2 = tf.transpose(premise, perm=[0,2,1]) 61 | 62 | #[batch_size, h_len, p_len] 63 | similarity = tf.matmul(hypothesis, p2, name='similarity_matrix') 64 | 65 | return similarity 66 | 67 | def self_attended(similarity_matrix, inputs): 68 | #similarity_matrix: [batch_size, len, len] 69 | #inputs: [batch_size, len, dim] 70 | 71 | attended_w = tf.nn.softmax(similarity_matrix, dim=-1) 72 | 73 | #[batch_size, len, dim] 74 | attended_out = tf.matmul(attended_w, inputs) 75 | return attended_out 76 | 77 | def attend_hypothesis(similarity_matrix, premise, premise_len, maxlen): 78 | #similarity_matrix: [batch_size, h_len, p_len] 79 | #premise: [batch_size, p_len, dim] 80 | 81 | # masked similarity_matrix 82 | mask_p = tf.sequence_mask(premise_len, maxlen, dtype=tf.float32) # [batch_size, p_len] 83 | mask_p = tf.expand_dims(mask_p, 1) # [batch_size, 1, p_len] 84 | similarity_matrix = similarity_matrix * mask_p + -1e9 * (1-mask_p) # [batch_size, h_len, p_len] 85 | 86 | #[batch_size, h_len, p_len] 87 | attention_weight_for_p = tf.nn.softmax(similarity_matrix, dim=-1) 88 | 89 | #[batch_size, a_len, dim] 90 | attended_hypothesis = tf.matmul(attention_weight_for_p, premise) 91 | return attended_hypothesis 92 | 93 | def attend_premise(similarity_matrix, hypothesis, hypothesis_len, maxlen): 94 | #similarity_matrix: [batch_size, h_len, p_len] 95 | #hypothesis: [batch_size, h_len, dim] 96 | 97 | # masked similarity_matrix 98 | mask_h = tf.sequence_mask(hypothesis_len, maxlen, dtype=tf.float32) # 
[batch_size, h_len] 99 | mask_h = tf.expand_dims(mask_h, 2) # [batch_size, h_len, 1] 100 | similarity_matrix = similarity_matrix * mask_h + -1e9 * (1-mask_h) # [batch_size, h_len, p_len] 101 | 102 | #[batch_size, p_len, h_len] 103 | attention_weight_for_h = tf.nn.softmax(tf.transpose(similarity_matrix, perm=[0,2,1]), dim=-1) 104 | 105 | #[batch_size, p_len, dim] 106 | attended_premise = tf.matmul(attention_weight_for_h, hypothesis) 107 | return attended_premise 108 | 109 | 110 | class ESIM(object): 111 | def __init__( 112 | self, sequence_length, vocab_size, embedding_size, vocab, rnn_size, l2_reg_lambda=0.0): 113 | 114 | self.premise = tf.placeholder(tf.int32, [None, sequence_length], name="premise") 115 | self.hypothesis = tf.placeholder(tf.int32, [None, sequence_length], name="hypothesis") 116 | 117 | self.premise_len = tf.placeholder(tf.int32, [None], name="premise_len") 118 | self.hypothesis_len = tf.placeholder(tf.int32, [None], name="hypothesis_len") 119 | 120 | self.target = tf.placeholder(tf.int64, [None], name="target") 121 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 122 | self.extra_feature = tf.placeholder(tf.float32, [None, 2], name="extra_feature") 123 | 124 | self.p_word_feature = tf.placeholder(tf.float32, [None, sequence_length, 2], name="premise_word_feature") 125 | self.h_word_feature = tf.placeholder(tf.float32, [None, sequence_length, 2], name="hypothesis_word_feature") 126 | 127 | l2_loss = tf.constant(0.0) 128 | 129 | # =============================== Embedding layer =============================== 130 | # 1. word embedding layer 131 | with tf.name_scope("embedding"): 132 | W = get_embeddings(vocab) # tf.constant( np.array(vocab_size of task_dataset, dim) ) 133 | premise_embedded = tf.nn.embedding_lookup(W, self.premise) # [batch_size, q_len, word_dim] 134 | hypothesis_embedded = tf.nn.embedding_lookup(W, self.hypothesis) 135 | 136 | premise_embedded = tf.nn.dropout(premise_embedded, keep_prob=self.dropout_keep_prob) 137 | hypothesis_embedded = tf.nn.dropout(hypothesis_embedded, keep_prob=self.dropout_keep_prob) 138 | print("shape of premise_embedded: {}".format(premise_embedded.get_shape())) 139 | print("shape of hypothesis_embedded: {}".format(hypothesis_embedded.get_shape())) 140 | 141 | # =============================== Encoding layer =============================== 142 | with tf.variable_scope("encoding_layer") as vs: 143 | rnn_scope_name = "bidirectional_rnn" 144 | p_rnn_output, p_rnn_states = lstm_layer(premise_embedded, self.premise_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=False) # [batch_size, sequence_length, rnn_size(200)] 145 | premise_output = tf.concat(axis=2, values=p_rnn_output) # [batch_size, maxlen, rnn_size*2] 146 | h_rnn_output, h_rnn_states = lstm_layer(hypothesis_embedded, self.hypothesis_len, rnn_size, self.dropout_keep_prob, rnn_scope_name, scope_reuse=True) 147 | hypothesis_output = tf.concat(axis=2, values=h_rnn_output) # [batch_size, maxlen, rnn_size*2] 148 | print('Incorporate single_lstm_layer successfully.') 149 | 150 | # =============================== Matching layer =============================== 151 | with tf.variable_scope("matching_layer") as vs: 152 | similarity = premise_hypothesis_similarity_matrix(premise_output, hypothesis_output) #[batch_size, answer_len, question_len] 153 | attended_premise = attend_premise(similarity, hypothesis_output, self.hypothesis_len, sequence_length) #[batch_size, maxlen, dim] 154 | attended_hypothesis = attend_hypothesis(similarity, 
premise_output, self.premise_len, sequence_length) #[batch_size, maxlen, dim] 155 | 156 | m_p = tf.concat(axis=2, values=[premise_output, attended_premise, tf.multiply(premise_output, attended_premise), premise_output-attended_premise]) 157 | m_h = tf.concat(axis=2, values=[hypothesis_output, attended_hypothesis, tf.multiply(hypothesis_output, attended_hypothesis), hypothesis_output-attended_hypothesis]) 158 | 159 | # m_ffnn 160 | m_input_size = m_p.get_shape()[-1].value 161 | m_output_size = m_input_size 162 | m_p = ffnn_layer(m_p, m_output_size, self.dropout_keep_prob, "m_ffnn", scope_reuse=False) 163 | m_h = ffnn_layer(m_h, m_output_size, self.dropout_keep_prob, "m_ffnn", scope_reuse=True) 164 | print('Incorporate ffnn_layer after cross attention successfully.') 165 | 166 | rnn_scope_cross = 'bidirectional_rnn_cross' 167 | rnn_size_layer_2 = rnn_size 168 | rnn_output_p_2, rnn_states_p_2 = lstm_layer(m_p, self.premise_len, rnn_size_layer_2, self.dropout_keep_prob, rnn_scope_cross, scope_reuse=False) 169 | rnn_output_h_2, rnn_states_h_2 = lstm_layer(m_h, self.hypothesis_len, rnn_size_layer_2, self.dropout_keep_prob, rnn_scope_cross, scope_reuse=True) 170 | 171 | premise_output_cross = tf.concat(axis=2, values=rnn_output_p_2) # [batch_size, sequence_length, 2*rnn_size(400)] 172 | hypothesis_output_cross = tf.concat(axis=2, values=rnn_output_h_2) 173 | 174 | # =============================== Aggregation layer =============================== 175 | with tf.variable_scope("aggregation_layer") as vs: 176 | premise_max = tf.reduce_max(premise_output_cross, axis=1) # [batch_size, 2*rnn_size(400)] 177 | hypothesis_max = tf.reduce_max(hypothesis_output_cross, axis=1) 178 | 179 | premise_mean = tf.reduce_mean(premise_output_cross, axis=1) # [batch_size, 2*rnn_size(400)] 180 | hypothesis_mean = tf.reduce_mean(hypothesis_output_cross, axis=1) 181 | 182 | # premise_state = tf.concat(axis=1, values=[rnn_states_p_2[0].h, rnn_states_p_2[1].h]) # [batch_size, 2*rnn_size(400)] 183 | # hypothesis_state = tf.concat(axis=1, values=[rnn_states_h_2[0].h, rnn_states_h_2[1].h]) 184 | 185 | joined_feature = tf.concat(axis=1, values=[premise_max, hypothesis_max, premise_mean, hypothesis_mean]) # [batch_size, 8*rnn_size(1600)] 186 | print("shape of joined feature: {}".format(joined_feature.get_shape())) 187 | 188 | # =============================== Prediction layer =============================== 189 | with tf.variable_scope("prediction_layer") as vs: 190 | hidden_input_size = joined_feature.get_shape()[1].value 191 | hidden_output_size = 256 192 | regularizer = tf.contrib.layers.l2_regularizer(l2_reg_lambda) 193 | #regularizer = None 194 | joined_feature = tf.nn.dropout(joined_feature, keep_prob=self.dropout_keep_prob) 195 | full_out = tf.contrib.layers.fully_connected(joined_feature, hidden_output_size, 196 | activation_fn=tf.nn.relu, 197 | reuse=False, 198 | trainable=True, 199 | scope="projected_layer") # [batch_size, hidden_output_size(256)] 200 | full_out = tf.nn.dropout(full_out, keep_prob=self.dropout_keep_prob) 201 | #full_out = tf.concat(axis=1, values=[full_out, self.extra_feature]) 202 | 203 | last_weight_dim = full_out.get_shape()[1].value 204 | print("last_weight_dim: {}".format(last_weight_dim)) 205 | bias = tf.Variable(tf.constant(0.1, shape=[3]), name="bias") 206 | s_w = tf.get_variable("s_w", shape=[last_weight_dim, 3], initializer=tf.contrib.layers.xavier_initializer()) 207 | logits = tf.matmul(full_out, s_w) + bias # [batch_size, 3] 208 | print("shape of logits: {}".format(logits.get_shape())) 209 
| 210 | self.probs = tf.nn.softmax(logits, name="prob") # [batch_size, n_class(3)] 211 | 212 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.target) 213 | self.mean_loss = tf.reduce_mean(losses, name="mean_loss") + l2_reg_lambda * l2_loss + sum( 214 | tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)) 215 | 216 | with tf.name_scope("accuracy"): 217 | correct_prediction = tf.equal(tf.argmax(self.probs, 1), self.target) 218 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name="accuracy") 219 | -------------------------------------------------------------------------------- /model/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import time 5 | import datetime 6 | from model import data_helpers 7 | from model.model_ESIM import ESIM 8 | import operator 9 | from collections import defaultdict 10 | 11 | # Files 12 | tf.flags.DEFINE_string("train_premise_file", "", "train premise file") 13 | tf.flags.DEFINE_string("train_hypothesis_file", "", "train hypothesis file") 14 | tf.flags.DEFINE_string("train_label_file", "", "train label file") 15 | tf.flags.DEFINE_string("dev_premise_file", "", "dev premise file") 16 | tf.flags.DEFINE_string("dev_hypothesis_file", "", "dev hypothesis file") 17 | tf.flags.DEFINE_string("dev_label_file", "", "dev label file") 18 | tf.flags.DEFINE_string("test_premise_file", "", "test premise file") 19 | tf.flags.DEFINE_string("test_hypothesis_file", "", "test hypothesis file") 20 | tf.flags.DEFINE_string("test_label_file", "", "test label file") 21 | tf.flags.DEFINE_string("embedded_vector_file", "", "pre-trained embedded word vector") 22 | tf.flags.DEFINE_string("vocab_file", "", "vocabulary file (map word to integer)") 23 | 24 | # Training parameters 25 | tf.flags.DEFINE_integer("batch_size", 1024, "Batch Size (default: 64)") 26 | tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)") 27 | tf.flags.DEFINE_integer("evaluate_every", 1000, "Evaluate model on dev set after this many steps (default: 100)") 28 | 29 | # Model Hyperparameters 30 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)") 31 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 0.5)") 32 | tf.flags.DEFINE_float("l2_reg_lambda", 0.000005, "L2 regularizaion lambda (default: 0.0)") 33 | tf.flags.DEFINE_integer("max_sequence_length", 200, "max sequence length") 34 | tf.flags.DEFINE_integer("rnn_size", 200, "number of RNN units") 35 | 36 | # Misc Parameters 37 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 38 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 39 | 40 | FLAGS = tf.flags.FLAGS 41 | FLAGS._parse_flags() 42 | print("\nParameters:") 43 | for attr, value in sorted(FLAGS.__flags.items()): 44 | print("{}={}".format(attr.upper(), value)) 45 | print("") 46 | 47 | 48 | # Data Preparation 49 | print("Loading data...") 50 | 51 | # vocab = {"": 0, ...} 52 | vocab, idf = data_helpers.loadVocab(FLAGS.vocab_file) 53 | print('vocabulary size: {}'.format(len(vocab))) 54 | 55 | SEQ_LEN = FLAGS.max_sequence_length 56 | train_dataset = data_helpers.loadDataset(FLAGS.train_premise_file, FLAGS.train_hypothesis_file, FLAGS.train_label_file, vocab, SEQ_LEN) 57 | print('train_dataset: {}'.format(len(train_dataset))) 58 | dev_dataset = 
data_helpers.loadDataset(FLAGS.dev_premise_file, FLAGS.dev_hypothesis_file, FLAGS.dev_label_file, vocab, SEQ_LEN) 59 | print('dev_dataset: {}'.format(len(dev_dataset))) 60 | test_dataset = data_helpers.loadDataset(FLAGS.test_premise_file, FLAGS.test_hypothesis_file, FLAGS.test_label_file, vocab, SEQ_LEN) 61 | print('test_dataset: {}'.format(len(test_dataset))) 62 | 63 | 64 | with tf.Graph().as_default(): 65 | session_conf = tf.ConfigProto( 66 | allow_soft_placement=FLAGS.allow_soft_placement, 67 | log_device_placement=FLAGS.log_device_placement) 68 | sess = tf.Session(config=session_conf) 69 | with sess.as_default(): 70 | esim = ESIM( 71 | sequence_length=SEQ_LEN, 72 | vocab_size=len(vocab), 73 | embedding_size=FLAGS.embedding_dim, 74 | vocab=vocab, 75 | rnn_size=FLAGS.rnn_size, 76 | l2_reg_lambda=FLAGS.l2_reg_lambda) 77 | # Define Training procedure 78 | global_step = tf.Variable(0, name="global_step", trainable=False) 79 | starter_learning_rate = 0.001 80 | learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 81 | 5000, 0.96, staircase=True) 82 | optimizer = tf.train.AdamOptimizer(learning_rate) 83 | grads_and_vars = optimizer.compute_gradients(esim.mean_loss) 84 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 85 | 86 | # Keep track of gradient values and sparsity (optional) 87 | """ 88 | grad_summaries = [] 89 | for g, v in grads_and_vars: 90 | if g is not None: 91 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) 92 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 93 | grad_summaries.append(grad_hist_summary) 94 | grad_summaries.append(sparsity_summary) 95 | grad_summaries_merged = tf.merge_summary(grad_summaries) 96 | """ 97 | 98 | # Output directory for models and summaries 99 | timestamp = str(int(time.time())) 100 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 101 | print("Writing to {}\n".format(out_dir)) 102 | 103 | # Summaries for loss and accuracy 104 | """ 105 | loss_summary = tf.scalar_summary("loss", esim.mean_loss) 106 | acc_summary = tf.scalar_summary("accuracy", esim.accuracy) 107 | 108 | # Train Summaries 109 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) 110 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 111 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph_def) 112 | 113 | # Dev summaries 114 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) 115 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 116 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph_def) 117 | """ 118 | 119 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 120 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 121 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 122 | if not os.path.exists(checkpoint_dir): 123 | os.makedirs(checkpoint_dir) 124 | saver = tf.train.Saver(tf.global_variables()) 125 | 126 | # Initialize all variables 127 | sess.run(tf.global_variables_initializer()) 128 | 129 | def train_step(x_premise, x_hypothesis, x_premise_len, x_hypothesis_len, 130 | targets, extra_feature, p_features, h_features): 131 | """ 132 | A single training step 133 | """ 134 | feed_dict = { 135 | esim.premise: x_premise, 136 | esim.hypothesis: x_hypothesis, 137 | esim.premise_len: x_premise_len, 138 | esim.hypothesis_len: x_hypothesis_len, 139 | esim.target: targets, 140 | esim.dropout_keep_prob: FLAGS.dropout_keep_prob, 141 | esim.extra_feature: extra_feature, 142 | esim.p_word_feature: p_features, 143 | esim.h_word_feature: h_features 144 | } 145 | 146 | _, step, loss, accuracy, predicted_prob = sess.run( 147 | [train_op, global_step, esim.mean_loss, esim.accuracy, esim.probs], 148 | feed_dict) 149 | 150 | time_str = datetime.datetime.now().isoformat() 151 | if step % 100 == 0: 152 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 153 | #train_summary_writer.add_summary(summaries, step) 154 | 155 | 156 | def check_step(dataset, shuffle=False): 157 | results = defaultdict(list) 158 | num_test = 0 159 | num_correct = 0.0 160 | batches = data_helpers.batch_iter(dataset, FLAGS.batch_size, 1, idf, SEQ_LEN, shuffle=shuffle) 161 | for batch in batches: 162 | x_premise, x_hypothesis, x_premise_len, x_hypothesis_len, \ 163 | targets, extra_feature, p_features, h_features = batch 164 | feed_dict = { 165 | esim.premise: x_premise, 166 | esim.hypothesis: x_hypothesis, 167 | esim.premise_len: x_premise_len, 168 | esim.hypothesis_len: x_hypothesis_len, 169 | esim.target: targets, 170 | esim.dropout_keep_prob: 1.0, 171 | esim.extra_feature: extra_feature, 172 | esim.p_word_feature: p_features, 173 | esim.h_word_feature: h_features 174 | } 175 | batch_accuracy, predicted_prob = sess.run([esim.accuracy, esim.probs], feed_dict) 176 | num_test += len(predicted_prob) 177 | if num_test % 1000 == 0: 178 | print(num_test) 179 | 180 | num_correct += len(predicted_prob) * batch_accuracy 181 | 182 | # calculate Accuracy 183 | acc = num_correct / num_test 184 | print('num_test_samples: {} accuracy: {}'.format(num_test, acc)) 185 | 186 | return acc 187 | 188 | best_acc = 0.0 189 | EPOCH = 0 190 | batches = data_helpers.batch_iter(train_dataset, FLAGS.batch_size, FLAGS.num_epochs, idf, SEQ_LEN, shuffle=True) 191 | for batch in batches: 192 | x_premise, x_hypothesis, x_premise_len, x_hypothesis_len, \ 193 | targets, extra_feature, p_features, h_features = batch 194 | train_step(x_premise, x_hypothesis, x_premise_len, x_hypothesis_len, targets, extra_feature, p_features, h_features) 195 | current_step = tf.train.global_step(sess, global_step) 196 | if current_step % FLAGS.evaluate_every == 0: 197 | EPOCH += 1 198 | print("\nEPOCH: {}".format(EPOCH)) 199 | print("Evaluation on dev:") 200 | valid_acc = check_step(dev_dataset, shuffle=True) 201 | print("\nEvaluation on test:") 202 | test_acc = check_step(test_dataset, shuffle=False) 203 | if valid_acc > best_acc: 204 | best_acc = valid_acc 205 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 206 | print("Saved model checkpoint to {}\n".format(path)) 207 | 208 | 
--------------------------------------------------------------------------------
/scripts/test.sh:
--------------------------------------------------------------------------------
1 | cur_dir=`pwd`
2 | parentdir="$(dirname $cur_dir)"
3 | 
4 | DATA_DIR=${parentdir}/data
5 | 
6 | latest_run=`ls -dt runs/* |head -n 1`
7 | latest_checkpoint=${latest_run}/checkpoints
8 | # latest_checkpoint=runs/1541064267/checkpoints  # or set this to the checkpoint directory of the model you want to test
9 | echo $latest_checkpoint
10 | 
11 | test_premise_file=$DATA_DIR/word_sequence/premise_snli_1.0_test.txt
12 | test_hypothesis_file=$DATA_DIR/word_sequence/hypothesis_snli_1.0_test.txt
13 | test_label_file=$DATA_DIR/word_sequence/label_snli_1.0_test.txt
14 | vocab_file=$DATA_DIR/word_sequence/vocab.txt
15 | 
16 | batch_size=128
17 | max_sequence_length=100
18 | 
19 | PKG_DIR=${parentdir}
20 | 
21 | PYTHONPATH=${PKG_DIR}:$PYTHONPATH CUDA_VISIBLE_DEVICES=1 python -u ${PKG_DIR}/model/eval.py \
22 |     --test_premise_file $test_premise_file \
23 |     --test_hypothesis_file $test_hypothesis_file \
24 |     --test_label_file $test_label_file \
25 |     --vocab_file $vocab_file \
26 |     --max_sequence_length $max_sequence_length \
27 |     --batch_size $batch_size \
28 |     --checkpoint_dir $latest_checkpoint > log_test.txt 2>&1 &
29 | 
--------------------------------------------------------------------------------
/scripts/train.sh:
--------------------------------------------------------------------------------
1 | cur_dir=`pwd`
2 | parentdir="$(dirname $cur_dir)"
3 | 
4 | DATA_DIR=${parentdir}/data
5 | 
6 | train_premise_file=$DATA_DIR/word_sequence/premise_snli_1.0_train.txt
7 | train_hypothesis_file=$DATA_DIR/word_sequence/hypothesis_snli_1.0_train.txt
8 | train_label_file=$DATA_DIR/word_sequence/label_snli_1.0_train.txt
9 | 
10 | dev_premise_file=$DATA_DIR/word_sequence/premise_snli_1.0_dev.txt
11 | dev_hypothesis_file=$DATA_DIR/word_sequence/hypothesis_snli_1.0_dev.txt
12 | dev_label_file=$DATA_DIR/word_sequence/label_snli_1.0_dev.txt
13 | 
14 | test_premise_file=$DATA_DIR/word_sequence/premise_snli_1.0_test.txt
15 | test_hypothesis_file=$DATA_DIR/word_sequence/hypothesis_snli_1.0_test.txt
16 | test_label_file=$DATA_DIR/word_sequence/label_snli_1.0_test.txt
17 | 
18 | embedded_vector_file=$DATA_DIR/glove/filtered_glove_840B_300d.txt
19 | vocab_file=$DATA_DIR/word_sequence/vocab.txt
20 | 
21 | lambda=0
22 | dropout_keep_prob=0.8
23 | batch_size=128
24 | max_sequence_length=100
25 | DIM=300
26 | rnn_size=300
27 | evaluate_every=4292
28 | 
29 | PKG_DIR=${parentdir}
30 | 
31 | PYTHONPATH=${PKG_DIR}:$PYTHONPATH CUDA_VISIBLE_DEVICES=1 python -u ${PKG_DIR}/model/train.py \
32 |     --train_premise_file $train_premise_file \
33 |     --train_hypothesis_file $train_hypothesis_file \
34 |     --train_label_file $train_label_file \
35 |     --dev_premise_file $dev_premise_file \
36 |     --dev_hypothesis_file $dev_hypothesis_file \
37 |     --dev_label_file $dev_label_file \
38 |     --test_premise_file $test_premise_file \
39 |     --test_hypothesis_file $test_hypothesis_file \
40 |     --test_label_file $test_label_file \
41 |     --embedded_vector_file $embedded_vector_file \
42 |     --vocab_file $vocab_file \
43 |     --max_sequence_length $max_sequence_length \
44 |     --embedding_dim $DIM \
45 |     --l2_reg_lambda $lambda \
46 |     --dropout_keep_prob $dropout_keep_prob \
47 |     --batch_size $batch_size \
48 |     --rnn_size $rnn_size \
49 |     --evaluate_every $evaluate_every > log.txt 2>&1 &
--------------------------------------------------------------------------------