├── README.md
├── data
    ├── dev-v1.1.json
    └── train-v1.1.json
├── main.py
├── parse_data.py
├── plot.py
└── preprocess_data.py

/README.md:
--------------------------------------------------------------------------------
# RaSoR-in-Tensorflow
A TensorFlow implementation of RaSoR (Recurrent Span Representations), one of the solutions to the SQuAD question-answering task.
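
## Usage (sketch)

A rough end-to-end recipe, untested; the file names simply follow what the scripts themselves reference: `data/train-v1.1.json` and `data/dev-v1.1.json`, a `word2vec_from_glove_300.vec` embedding file and a local Stanford CoreNLP install for `preprocess_data.py`, and the `data/train_data.pkl` / `data/valid_data.pkl` pickles that `main.py` loads.

```bash
# flatten the raw SQuAD JSON into context/question/answer records
python parse_data.py data/train-v1.1.json --output_destination data/train_data.json
python parse_data.py data/dev-v1.1.json --output_destination data/valid_data.json

# tokenize with CoreNLP and replace tokens by 300-d word vectors
python preprocess_data.py data/train_data.json --output_destination data/train_data.pkl
python preprocess_data.py data/valid_data.json --output_destination data/valid_data.pkl

# train, keeping the console output so plot.py can draw the curves
python main.py --batch_size 10 --epochs 10 --lr 0.01 | tee log.txt
python plot.py --log_file log.txt
```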
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from __future__ import print_function

import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np
import cPickle as pickle
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=10, help='Batch size', type=int)
parser.add_argument('--dropout', default=0.1, help='Dropout in LSTMs', type=float)
parser.add_argument('--epochs', default=10, help='Number of epochs', type=int)
parser.add_argument('--test_every', default=100, help='Number of iterations before validation testing', type=int)
parser.add_argument('--lr', default=0.01, help='Learning rate', type=float)
args = parser.parse_args()


learning_rate = args.lr
epochs = args.epochs
dropout = args.dropout
batch_size = args.batch_size
test_iter = args.test_every

print("Reading train data...")
with open('data/train_data.pkl', 'rb') as fd:
    train_data = pickle.load(fd)
print("Done!")

print("Reading val data...")
with open('data/valid_data.pkl', 'rb') as fd:
    val_data = pickle.load(fd)
print("Done!")

###############################################################################################################

# train_data = [[train_data[0][0][:1000], train_data[0][1][:1000]], [train_data[1][0][:1000], train_data[1][1][:1000]]]
# val_data = [[val_data[0][0][:1000], val_data[0][1][:1000]], [val_data[1][0][:1000], val_data[1][1][:1000]]]

################################################################################################################

max_span_length = 30
n_hidden = 50
word_vec_size = 300

def data_generator(data, is_train=True, batch_size=batch_size, shuffle=False):
    n_samples = len(data[0][0])
    if shuffle:
        perm = np.random.permutation(n_samples)
    else:
        perm = np.arange(n_samples)
    for i in range(0, n_samples, batch_size):
        indices = perm[i:i+batch_size]
        bs = len(indices)
        max_plen = max([data[0][0][j].shape[0] for j in indices])
        max_qlen = max([data[0][1][j].shape[0] for j in indices])
        p_mask = np.ones((bs, max_plen, 1), dtype=np.float32)
        q_mask = np.ones((bs, max_qlen, 1), dtype=np.float32)
        p_s = []
        q_s = []
        for j in range(bs):
            ind = indices[j]
            l_p = data[0][0][ind].shape[0]
            l_q = data[0][1][ind].shape[0]
            p_s.append(np.lib.pad(data[0][0][ind], ((0, max_plen - l_p), (0, 0)), 'constant', constant_values=(0,0)))
            q_s.append(np.lib.pad(data[0][1][ind], ((0, max_qlen - l_q), (0, 0)), 'constant', constant_values=(0,0)))
            p_mask[j, l_p:, 0] = 0
            q_mask[j, l_q:, 0] = 0
        p = np.stack(p_s)
        q = np.stack(q_s)


        n_s = np.zeros((bs), dtype=np.int32)

        for j in range(bs):
            ind = indices[j]
            l_p = data[0][0][ind].shape[0]
            if l_p >= max_span_length:
                n_s[j] = (max_span_length + 1) * max_span_length / 2 + (l_p - max_span_length) * max_span_length
            else:
                n_s[j] = (l_p + 1) * l_p / 2
        max_n_s = n_s.max()
        y = np.zeros((bs, max_n_s))
        i_p = np.zeros((bs, max_n_s, 2), dtype=np.int32)
        i_p_mask = np.ones((bs, max_n_s, 1), dtype=np.float32)
        for j in range(bs):
            ind = indices[j]
            l_p = data[0][0][ind].shape[0]
            k = 0
            a1 = data[1][0][ind]
            a2 = data[1][1][ind]
            for m in range(l_p):
                for n in range(m, min(m+max_span_length, l_p)):
                    i_p[j, k, 0] = m
                    i_p[j, k, 1] = n
                    if is_train and m == a1 and n == a2:
                        y[j, k] = 1
                    k += 1
            assert k <= n_s[j]
            i_p_mask[j, n_s[j]:, 0] = 0
        if is_train:
            yield ((p, q, i_p), (p_mask, q_mask, i_p_mask), y)
        else:
            yield ((p, q, i_p), (p_mask, q_mask, i_p_mask))
    return

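# Shapes of the batches produced by data_generator above and fed into the
# placeholders below (B = batch size, T_p / T_q = padded passage / question
# length, S = padded number of candidate answer spans):
#   p, q              : (B, T_p, word_vec_size) and (B, T_q, word_vec_size) word vectors
#   p_mask, q_mask    : (B, T_p, 1) and (B, T_q, 1), 1.0 for real tokens, 0.0 for padding
#   index_pairs       : (B, S, 2), (start, end) token indices of each candidate span
#   index_pairs_mask  : (B, S, 1), 1.0 for real candidates, 0.0 for padding
#   y                 : (B, S), 1.0 at the gold answer span (training batches)
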
p = tf.placeholder("float", [None, None, word_vec_size])
q = tf.placeholder("float", [None, None, word_vec_size])
p_mask = tf.placeholder("float", [None, None, 1])
q_mask = tf.placeholder("float", [None, None, 1])
index_pairs = tf.placeholder("int32", [None, None, 2])
index_pairs_mask = tf.placeholder("float", [None, None, 1])
y = tf.placeholder("float", [None, None])

def softmax_with_mask(input, mask, dim=-1):
    m = tf.reduce_max(input, axis=dim, keep_dims=True)
    e = tf.exp(input - m) * mask
    s = tf.reduce_sum(e, axis=dim, keep_dims=True)
    s = tf.clip_by_value(s, np.finfo(np.float32).eps, np.finfo(np.float32).max)
    return e / s

def FFNN(input, input_mask, name, layer_shapes=[n_hidden]):
    # A feed-forward neural network with ReLU layers; padded positions are zeroed out
    x = input
    for i in range(len(layer_shapes)):
        s = layer_shapes[i]
        with tf.variable_scope('{}_{}'.format(name, i)):
            x = tf.layers.dense(inputs=x, units=s, activation=tf.nn.relu)
            x = x * input_mask

    return x


def BiLSTM(input, input_mask, name):
    with tf.variable_scope(name):
        lstm_fw_cell = rnn.LSTMCell(n_hidden, forget_bias=1.0)
        lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(lstm_fw_cell, state_keep_prob=1.0-dropout,
                                                     # input_keep_prob=1.0-dropout, input_size=tf.shape(input)[1:],
                                                     variational_recurrent=True, dtype=tf.float32)
        lstm_bw_cell = rnn.LSTMCell(n_hidden, forget_bias=1.0)
        lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(lstm_bw_cell, state_keep_prob=1.0-dropout,
                                                     # input_keep_prob=1.0-dropout, input_size=tf.shape(input)[1:],
                                                     variational_recurrent=True, dtype=tf.float32)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, input, dtype=tf.float32)
        outputs = tf.concat(outputs, axis=-1) * input_mask
    return outputs

def q_align(p, q, p_mask, q_mask):
    p_n = FFNN(p, p_mask, 'align_p')
    q_n = FFNN(q, q_mask, 'align_q')
    s = tf.matmul(p_n, q_n, transpose_b=True)  # (B, passage_len, question_len)
    # the softmax runs over the question axis, so mask out padded question
    # positions there, then zero the rows that belong to passage padding
    a = softmax_with_mask(s, tf.transpose(q_mask, perm=[0, 2, 1]))
    return tf.matmul(a, q) * p_mask

def q_indep(q, q_mask):
    q_s = q
    for i in range(2):
        q_s = BiLSTM(q_s, q_mask, 'BiLSTM_q_indep_{}'.format(i))
    w_q = tf.Variable(tf.random_normal([1, n_hidden]))
    s = tf.tensordot(FFNN(q_s, q_mask, 'FFNN_q_s'), w_q, axes=[[-1],[-1]])
    a = softmax_with_mask(s, q_mask, dim=1)
    return tf.matmul(a, q_s, transpose_a=True)

def concat(p, q_a, q_i):
    # q_i has shape (B, 1, 2*n_hidden); adding p_tmp * 0 broadcasts it across
    # the passage length so it can be concatenated to every passage position
    p_tmp = tf.reduce_sum(p, axis=-1, keep_dims=True)
    q_i = q_i + p_tmp * 0
    return tf.concat([p, q_a, q_i], axis=-1)

def question_focused_passage(p, q, p_mask, q_mask):
    q_a = q_align(p, q, p_mask, q_mask)
    q_i = q_indep(q, q_mask)
    h_a = concat(p, q_a, q_i)
    return h_a

p_qf = question_focused_passage(p, q, p_mask, q_mask)

for i in range(2):
    p_qf = BiLSTM(p_qf, p_mask, 'BiLSTM_p_qf_{}'.format(i))


# Getting answer span representation

start_indices = index_pairs[:, :, 0]
start_indices = tf.expand_dims(start_indices, -1)
end_indices = index_pairs[:, :, 1]
end_indices = tf.expand_dims(end_indices, -1)
symbolic_batch_size = tf.shape(index_pairs)[0]
b_s = tf.range(0, symbolic_batch_size, dtype=tf.int32)
b_s = tf.expand_dims(b_s, -1)
b_s = tf.expand_dims(b_s, -1)  # b_s.shape == (B, 1, 1)
b_s = start_indices * 0 + b_s  # b_s broadcasts to shape (batch_size, n_spans, 1) == shape of start_indices

start_indices = tf.concat((b_s, start_indices), axis=-1)
end_indices = tf.concat((b_s, end_indices), axis=-1)

start_vectors = tf.gather_nd(p_qf, start_indices)
end_vectors = tf.gather_nd(p_qf, end_indices)

spans = tf.concat((start_vectors, end_vectors), axis=-1)  # spans.shape == (batch_size, n_spans, 4 * n_hidden)
spans = spans * index_pairs_mask

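# Worked example of the index construction above: if candidate k of batch
# element j covers tokens m..n, then start_indices[j, k] == [j, m] and
# end_indices[j, k] == [j, n], so tf.gather_nd picks p_qf[j, m, :] and
# p_qf[j, n, :] as the boundary representations of that span.
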
def span_score_logits(spans, spans_mask):
    w_a = tf.Variable(tf.random_normal([n_hidden]))
    h_a = FFNN(spans, spans_mask, 'spans')
    s_a = tf.tensordot(h_a, w_a, axes=[[-1],[-1]])
    return s_a * spans_mask[:, :, 0]

logits = span_score_logits(spans, index_pairs_mask)
probs = softmax_with_mask(logits, index_pairs_mask[:, :, 0])

def cross_entropy(y_, y):
    y_ = tf.clip_by_value(y_, np.finfo(np.float32).eps, np.finfo(np.float32).max)
    return tf.reduce_mean(-tf.reduce_sum(y * tf.log(y_), reduction_indices=[1]))


#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
cost = cross_entropy(probs, y)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

correct_pred = tf.equal(tf.argmax(logits, -1), tf.argmax(y, -1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=epochs)

def test_on_validation_set(val_data, global_iter):
    acc = 0
    loss = 0
    iter = 0
    counter = 0.0
    data_len = len(val_data[1][1])
    print("Running on Validation Set")

    for ((batch_p, batch_q, batch_i_p), (batch_p_mask, batch_q_mask, batch_i_p_mask), batch_y) in data_generator(val_data, is_train=True, batch_size=batch_size, shuffle=False):
        f_dict = {p: batch_p, q: batch_q, index_pairs: batch_i_p, p_mask: batch_p_mask, q_mask: batch_q_mask, index_pairs_mask: batch_i_p_mask, y: batch_y}
        acc += sess.run(accuracy, feed_dict=f_dict)
        loss += sess.run(cost, feed_dict=f_dict)
        counter += len(batch_p)
        iter += 1
        print("{:.4f}%".format(counter * 100 / data_len), end='\r')

    print("\nIter: {:4d} Val Loss: {:.4f} Val Acc: {:.4f}".format(global_iter, loss/iter, acc/iter))

with tf.Session() as sess:
    sess.run(init)
    global_iter = 0

    for e in range(epochs):

        for ((batch_p, batch_q, batch_i_p), (batch_p_mask, batch_q_mask, batch_i_p_mask), batch_y) in data_generator(train_data, is_train=True, batch_size=batch_size, shuffle=True):
            f_dict = {p: batch_p, q: batch_q, index_pairs: batch_i_p, p_mask: batch_p_mask, q_mask: batch_q_mask, index_pairs_mask: batch_i_p_mask, y: batch_y}
            if global_iter % test_iter == 0:
                test_on_validation_set(val_data, global_iter)
            train_loss = sess.run(cost, feed_dict=f_dict)
            train_acc = sess.run(accuracy, feed_dict=f_dict)
            print("Iter: {:4d} Train Loss: {:.4f} Train Acc: {:.4f}".format(global_iter, train_loss, train_acc))
            sess.run(optimizer, feed_dict=f_dict)
            global_iter += 1

        save_path = saver.save(sess, "models/model", global_step=e)
        print("Model saved in file: {}".format(save_path))

    print("Optimization Finished!")
--------------------------------------------------------------------------------
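
The closed-form candidate count used in data_generator (every span of at most max_span_length tokens) can be checked against the enumeration loop with a few lines of standalone Python; this is an illustration only, not part of the repository (integer division is written as // so it also runs under Python 3):

    max_span_length = 30

    def n_spans_closed_form(l_p):
        if l_p >= max_span_length:
            return (max_span_length + 1) * max_span_length // 2 + (l_p - max_span_length) * max_span_length
        return (l_p + 1) * l_p // 2

    def n_spans_enumerated(l_p):
        return sum(1 for m in range(l_p) for n in range(m, min(m + max_span_length, l_p)))

    for l_p in range(1, 200):
        assert n_spans_closed_form(l_p) == n_spans_enumerated(l_p)

The pickle that main.py loads is simply the nested-list layout written by preprocess_data.py; a quick way to inspect it (Python 2, matching the scripts, and assuming the file was written to data/train_data.pkl):

    import cPickle as pickle

    with open('data/train_data.pkl', 'rb') as fd:
        data = pickle.load(fd)

    contexts, questions = data[0]          # lists of (n_tokens, 300) float32 arrays
    answer_starts, answer_ends = data[1]   # inclusive token indices of the answer span

    print len(contexts), contexts[0].shape, questions[0].shape
    print answer_starts[0], answer_ends[0]
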
/parse_data.py:
--------------------------------------------------------------------------------
"""
Usage: python parse_data.py dataset_file --output_destination outfile.json --train_ratio 1.
"""
# -*- coding: utf-8 -*-
import json
import argparse
import os
import random
random.seed(20)

parser = argparse.ArgumentParser()
parser.add_argument('data', help='Path to the dataset file', type=str)
parser.add_argument('--output_destination', default='data/tmp.json',
                    help='Desired path to output json', type=str)
parser.add_argument('--train_ratio', default=1., help='ratio for train/val split', type=float)
args = parser.parse_args()


file_path = args.data
outfile = args.output_destination
train_ratio = args.train_ratio

json_data = open(file_path, 'r').read()
data = json.loads(json_data)

print "Keys of json are:", data.keys()
data = data['data']
print "Dataset is a list of %d topics, each topic contains some paragraphs" % len(data)
print "Keys of topics are", data[0].keys()
topics = [data[i]['title'] for i in range(len(data))]
#print "The topics are:", topics

cnt_paragraphs_in_topic = dict([(data[i]['title'], len(data[i]['paragraphs'])) for i in range(len(data))])
print "Keys of paragraphs are:", data[0]['paragraphs'][0].keys()
print "Dataset contains %d paragraphs in total" % sum(cnt_paragraphs_in_topic[x] for x in cnt_paragraphs_in_topic)
print "Each paragraph has some questions and answers associated with it"
print "Keys of qas sections are:", data[0]['paragraphs'][0]['qas'][0].keys()
print "Keys of answers are:", data[0]['paragraphs'][0]['qas'][0]['answers'][0].keys()

train_cqas = []  # ContextQuestionAnswer
val_cqas = []

for topic_id in range(len(data)):
    paragraphs = data[topic_id]['paragraphs']
    if random.random() < train_ratio:
        train = True
    else:
        train = False
    for paragraph in paragraphs:
        context = paragraph['context']
        for qa in paragraph['qas']:
            # assert len(qa['answers']) == 1  # holds for the train set; the dev set has several answers per question

            question = qa['question']
            _id = qa['id']
            answer = qa['answers'][0]['text']
            answer_start = qa['answers'][0]['answer_start']
            answer_end = answer_start + len(answer) - 1  # answer == context[answer_start : answer_end + 1]
            if train:
                train_cqas.append({"context": context, "question": question, "answer": answer,
                                   'answer_start': answer_start, 'answer_end': answer_end,
                                   'id': _id, 'topic': topics[topic_id]
                                   })
            else:
                val_cqas.append({"context": context, "question": question, "answer": answer,
                                 'answer_start': answer_start, 'answer_end': answer_end,
                                 'id': _id, 'topic': topics[topic_id]
                                 })


print "Saving dataset to outfile..."
if train_ratio == 1.:
    with open(outfile, 'w') as fd:
        json.dump(train_cqas, fd)
else:
    print "Train/Val ratio is %f" % (1. * len(train_cqas) / len(val_cqas))
    # prepend the split name to the file name, keeping any directory prefix
    train_file = os.path.join(os.path.dirname(outfile), 'train_' + os.path.basename(outfile))
    val_file = os.path.join(os.path.dirname(outfile), 'val_' + os.path.basename(outfile))
    with open(train_file, 'w') as fd:
        json.dump(train_cqas, fd)
    with open(val_file, 'w') as fd:
        json.dump(val_cqas, fd)
--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
from __future__ import print_function

import numpy as np
import argparse
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

parser = argparse.ArgumentParser()
parser.add_argument('--log_file', default='log.txt', help='Log file', type=str)
parser.add_argument('--loss_lim', default=10, help='Upper limit for the loss plot', type=float)
parser.add_argument('--acc_lim', default=0, help='Lower limit for the accuracy plot', type=float)
parser.add_argument('--window_size', default=15, help='The size of the smoothing window', type=int)
args = parser.parse_args()

file_name = args.log_file
ws = args.window_size
loss_lim = args.loss_lim
acc_lim = args.acc_lim

def smoothen(arr, window):
    # running mean over a trailing window; the first points are averaged over
    # however many values are available so far
    to_return = []
    s = 0.
    for i in range(min(len(arr), window)):
        s += arr[i]
        to_return.append(s/(i+1))
    for i in range(window, len(arr)):
        s += arr[i]
        s -= arr[i-window]
        to_return.append(s/window)
    return to_return

with open(file_name, 'r') as log:
    lines = log.readlines()
    train_iters = [float(line.split()[1]) for line in lines if "Train Loss" in line]
    train_losses = [float(line.split()[4]) for line in lines if "Train Loss" in line]
    train_acc = [float(line.split()[7]) for line in lines if "Train Acc" in line]

    val_iters = [float(line.split()[1]) for line in lines if "Val Loss" in line]
    val_losses = [float(line.split()[4]) for line in lines if "Val Loss" in line]
    val_acc = [float(line.split()[7]) for line in lines if "Val Acc" in line]


plt.plot(train_iters, smoothen(train_losses, window=ws))
plt.plot(np.array(val_iters), val_losses)
plt.ylim([0, loss_lim])
#plt.show()
plt.savefig('loss.png')

plt.figure()  # start a fresh figure so the accuracy curves are not drawn on top of the loss plot
plt.plot(train_iters, smoothen(train_acc, window=ws))
plt.plot(np.array(val_iters), val_acc)
plt.ylim([acc_lim, 1])
#plt.show()
plt.savefig('acc.png')
--------------------------------------------------------------------------------
/preprocess_data.py:
--------------------------------------------------------------------------------
"""
Usage: python preprocess_data.py parsed_file.json --output_destination outfile
"""
# -*- coding: utf-8 -*-
import json
import argparse
import gensim
import numpy as np
import random
import cPickle as pickle
from tqdm import tqdm

random.seed(20)

parser = argparse.ArgumentParser()
parser.add_argument('data', help='Data json', type=str)
parser.add_argument('--output_destination', default='data/tmp.pkl', help='Desired path to output pickle', type=str)
args = parser.parse_args()

file_path = args.data
outfile = args.output_destination
if not outfile.endswith('.pkl'):
    outfile += '.pkl'

print "Reading SQuAD data... ",
with open(file_path) as fd:
    samples = json.load(fd)
print "Done!"

print "Reading word2vec data... ",
word_vec_size = 300
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('word2vec_from_glove_300.vec')
vocab = w2v_model.vocab
print "Done!"

def get_word_vector(word):
    if word in vocab:
        return w2v_model[word]
    else:
        return np.zeros(word_vec_size)

print "Initiating CoreNLP service connection... ",
from stanford_corenlp_pywrapper import CoreNLP
proc = CoreNLP(configdict={'annotators': "tokenize,ssplit"}, corenlp_jars=["/home/tigrann/Documents/stanford-corenlp-full-2017-06-09/*"])
print "Done!"

def parse_sample(context, question, answer_start, answer_end, **kwargs):
    context = proc.parse_doc(context)
    tokens = []
    char_offsets = []
    for s in context['sentences']:
        tokens += s['tokens']
        char_offsets += s['char_offsets']

    try:
        # map the character-level answer span onto token indices
        answer_start = [answer_start >= s and answer_start < e for s, e in char_offsets].index(True)
        answer_end = [answer_end >= s and answer_end < e for s, e in char_offsets].index(True)
    except ValueError:
        # print(char_offsets)
        # print(answer_start, answer_end)
        return None

    # print('context', tokens)
    context_vecs = [get_word_vector(token) for token in tokens]
    context_vecs = np.vstack(context_vecs).astype(np.float32)

    question = proc.parse_doc(question)
    tokens = []
    for s in question['sentences']:
        tokens += s['tokens']
    # print('question', tokens)
    question_vecs = [get_word_vector(token) for token in tokens]
    question_vecs = np.vstack(question_vecs).astype(np.float32)
    # print('ans', answer_start, answer_end)
    # exit(0)
    return [[context_vecs, question_vecs],
            [answer_start, answer_end]]

print "Parsing samples... ",
samples = [parse_sample(**sample) for sample in tqdm(samples)]
#samples = [sample for sample in samples if sample is not None]


print len(samples), "=>",
samples = [sample for sample in samples if sample is not None]
print len(samples)

print "Done!"

# Transpose into [[contexts, questions], [answer_starts, answer_ends]]
data = [[[], []],
        [[], []]]
for sample in samples:
    data[0][0].append(sample[0][0])
    data[0][1].append(sample[0][1])
    data[1][0].append(sample[1][0])
    data[1][1].append(sample[1][1])

print "Writing data to file '{}'... ".format(outfile)
with open(outfile, 'wb') as fd:
    pickle.dump(data, fd, protocol=pickle.HIGHEST_PROTOCOL)
print "Done!"

print "Bye!"
--------------------------------------------------------------------------------