├── .gitignore
├── Data
│   └── download.sh
├── Models
│   ├── config.json
│   └── model_rnet.py
├── rnet.py
├── Results
│   └── evaluate-v1.1.py
├── evaluate.py
├── README.md
└── preprocess.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__
Models/__pycache__
--------------------------------------------------------------------------------
/Data/download.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Download SQuAD
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json

# Download GloVe
wget http://nlp.stanford.edu/data/glove.6B.zip
wget http://nlp.stanford.edu/data/glove.840B.300d.zip
unzip glove.6B.zip
unzip glove.840B.300d.zip
--------------------------------------------------------------------------------
/Models/config.json:
--------------------------------------------------------------------------------
{
    "rnet": {
        "train": {
            "glove": "300",
            "share_context_LSTM": true,
            "char_emb": false,
            "in_keep_prob": 0.8,
            "batch_size": 60,
            "state_size": 75,
            "emb_dim": 300,
            "word_emb_dim": 300,
            "char_max_length": 37,
            "char_vocab_size": 1368,
            "char_emb_mat_dim": 8,
            "p_length": 300,
            "q_length": 30,
            "a_length": 20,
            "span_length": 20
        },
        "dev": {
            "glove": "300",
            "share_context_LSTM": true,
            "char_emb": false,
            "in_keep_prob": 1.0,
            "batch_size": 60,
            "state_size": 75,
            "emb_dim": 300,
            "word_emb_dim": 300,
            "char_max_length": 37,
            "char_vocab_size": 1368,
            "char_emb_mat_dim": 8,
            "p_length": 300,
            "q_length": 30,
            "a_length": 20,
            "span_length": 20
        }
    }
}
--------------------------------------------------------------------------------
/rnet.py:
--------------------------------------------------------------------------------
import preprocess
from Models import model_rnet
import numpy as np
import tensorflow as tf
import argparse
import random
import string
import os
import json

def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=12, help='Epochs')
    # store_true instead of type=bool: bool('False') is truthy, so type=bool
    # flags can never be switched off from the command line
    parser.add_argument('--debug', action='store_true', help='print debug msgs')
    parser.add_argument('--load', action='store_true', help='load model')
    parser.add_argument('--save_dir', type=str, default='Models/save/', help='save directory')

    args = parser.parse_args()

    modOpts = json.load(open('Models/config.json', 'r'))['rnet']['train']

    print('Reading data')
    dp = preprocess.read_data('train', modOpts)
    num_batches = int(np.floor(dp.num_samples / modOpts['batch_size'])) - 1

    rnet_model = model_rnet.R_NET(modOpts)
    input_tensors, loss, acc, pred_si, pred_ei = rnet_model.build_model()
    #train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(loss)
    train_op = tf.train.AdadeltaOptimizer(1.0, rho=0.95, epsilon=1e-06).minimize(loss)

    # saver
    saver = tf.train.Saver()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    if args.load:
        PATH = 'Models/save/rnet_model0.ckpt'
        start_epoch = 1
        saver.restore(sess, PATH)
        f = open('Results/rnet_training_result.txt', 'a')
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        f = open('Results/rnet_training_result.txt', 'w')
        start_epoch = 0

    for i in range(start_epoch, args.epochs):
        rl = random.sample(range(num_batches), num_batches)
        batch_no = 0
        LOSS = 0.0
        EM = 0.0
        while batch_no < num_batches:
            tensor_dict, idxs = dp.get_training_batch(rl[batch_no])
            feed_dict = {
                input_tensors['p']: tensor_dict['paragraph'],
                input_tensors['q']: tensor_dict['question'],
                input_tensors['a_si']: tensor_dict['answer_si'],
                input_tensors['a_ei']: tensor_dict['answer_ei'],
            }
            if modOpts['char_emb']:
                feed_dict.update({
                    input_tensors['pc']: tensor_dict['paragraph_c'],
                    input_tensors['qc']: tensor_dict['question_c'],
                })
            _, loss_value, accuracy, predictions_si, predictions_ei = sess.run(
                [train_op, loss, acc, pred_si, pred_ei], feed_dict=feed_dict)
            batch_no += 1
            LOSS += loss_value
            EM += accuracy
            print("{} epoch {} batch, Loss:{:.2f}, Acc:{:.2f}".format(i, batch_no, loss_value, accuracy))
        save_path = saver.save(sess, os.path.join(args.save_dir, "rnet_model{}.ckpt".format(i)))
        f.write(' '.join(("Loss", str(LOSS / dp.num_samples), str(i), '\n')))
        f.write(' '.join(("EM", str(EM / num_batches), '\n')))
        f.write("---------------\n")
        f.flush()
        print("---------------")
    f.close()
    save_path = saver.save(sess, os.path.join(args.save_dir, "rnet_model_final.ckpt"))
    print('save path:', save_path)

def f1_score(prediction, ground_truth):
    from collections import Counter

    prediction_tokens = prediction
    ground_truth_tokens = ground_truth
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

if __name__ == '__main__':
    run()
--------------------------------------------------------------------------------
/Results/evaluate-v1.1.py:
--------------------------------------------------------------------------------
""" Official evaluation script for v1.1 of the SQuAD dataset. """
""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}


if __name__ == '__main__':
    expected_version = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + expected_version)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    args = parser.parse_args()
    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        if (dataset_json['version'] != expected_version):
            print('Evaluation expects v-' + expected_version +
                  ', but got dataset with v-' + dataset_json['version'],
                  file=sys.stderr)
        dataset = dataset_json['data']
    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)
    print(json.dumps(evaluate(dataset, predictions)))
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
import preprocess
from Models import model_rnet
import numpy as np
import tensorflow as tf
import argparse
import random
import json
from pprint import pprint

def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='rnet', help='Model: match_lstm, bidaf, rnet')
    parser.add_argument('--debug', action='store_true', help='print debug msgs')
    parser.add_argument('--dataset', type=str, default='dev', help='dataset')
    parser.add_argument('--model_path', type=str, default='Models/save/rnet_model0.ckpt', help='saved model path')

    args = parser.parse_args()
    if not args.model == 'rnet':
        raise NotImplementedError

    modOpts = json.load(open('Models/config.json', 'r'))[args.model]['dev']
    print('Model Configs:')
    pprint(modOpts)

    print('Reading data')
    if args.dataset == 'train':
        raise NotImplementedError
    elif args.dataset == 'dev':
        dp = preprocess.read_data(args.dataset, modOpts)

    model = model_rnet.R_NET(modOpts)
    input_tensors, loss, acc, pred_si, pred_ei = model.build_model()
    saved_model = args.model_path

    num_batches = int(np.ceil(dp.num_samples / modOpts['batch_size']))
    print(num_batches, 'batches')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    new_saver = tf.train.Saver()
    sess = tf.InteractiveSession(config=config)
    new_saver.restore(sess, saved_model)

    pred_data = {}

    EM = 0.0
    F1 = 0.0
    # dummy start/end labels: the answers are unknown at test time
    empty_answer_idx = np.zeros((modOpts['batch_size'], modOpts['p_length']))
    for batch_no in range(num_batches):
        if args.model == 'rnet':
            context, context_original, paragraph, question, paragraph_c, question_c, answer_si, answer_ei, ID, n = dp.get_testing_batch(batch_no)
        feed_dict = {
            input_tensors['p']: paragraph,
            input_tensors['q']: question,
            input_tensors['a_si']: empty_answer_idx,
            input_tensors['a_ei']: empty_answer_idx,
        }
        if modOpts['char_emb']:
            # these come from get_testing_batch; the previous tensor_dict[...]
            # references were leftovers from rnet.py and raised a NameError
            feed_dict.update({
                input_tensors['pc']: paragraph_c,
                input_tensors['qc']: question_c,
            })
        predictions_si, predictions_ei = sess.run([pred_si, pred_ei], feed_dict=feed_dict)
        for i in range(n):
            parag = context[i]
            f1 = []
            p_tokens = []
            for j in range(len(answer_si[i])):
                if answer_si[i][j] == answer_ei[i][j]:  # single word answer
                    truth_tokens = [parag[int(answer_si[i][j])]]
                    pred_tokens = [parag[int(predictions_si[i])]]
                else:
                    truth_tokens = parag[int(answer_si[i][j]):int(answer_ei[i][j])+1]
                    pred_tokens = parag[int(predictions_si[i]):int(predictions_ei[i])+1]
                f1.append(f1_score(pred_tokens, truth_tokens))
                p_tokens.append(pred_tokens)
            idx = np.argmax(f1)
            if answer_si[i][idx] == int(predictions_si[i]) and answer_ei[i][idx] == int(predictions_ei[i]):
                EM += 1.0
            F1 += f1[idx]
            pred_data[ID[i]] = ' '.join(p_tokens[idx])
        print(batch_no, 'EM', '{:.5f}'.format(EM/(batch_no+1)/modOpts['batch_size']), 'F1', F1/(batch_no+1)/modOpts['batch_size'])
    print("---------------")
    print("EM", EM/dp.num_samples)
    print("F1", F1/dp.num_samples)
    with open('Results/'+args.model+'_prediction.txt', 'w') as outfile:
        json.dump(pred_data, outfile)

def f1_score(prediction, ground_truth):
    from collections import Counter

    prediction_tokens = prediction
    ground_truth_tokens = ground_truth
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

if __name__ == '__main__':
    run()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# R-NET in Tensorflow

* This repository is a Tensorflow implementation of [R-NET](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf), a neural network designed to solve the Question Answering (QA) task.
* This implementation is specifically designed for [SQuAD](stanford-qa.com), a large-scale dataset that has recently drawn wide attention in the field of QA.
* If you have any questions, contact b03902012@ntu.edu.tw.

## Updates and Acknowledgements

### 17.12.30
- As some have requested recently, I have released a set of trained model weights. Details can be found in the Current Results section below.

### 17.12.12
- I'd like to thank _Fan Yang_ for pointing out several bugs in model evaluation. First, the model to be evaluated needs to be explicitly specified when executing the `evaluate.py` program. See the Usage section below. I also fixed some problems with character loading.

### 17.11.10
- I'd like to thank _Elías Jónsson_ for pointing out a problem in the mapping between characters and their indices. Previously, the indices for training and testing (dev set) were inconsistent. In fact, no mapping should be constructed for testing at all: if, during testing, the machine sees a character it has not seen in the training set, it should mark it as OOV. The table is therefore now constructed using only the training set, and is used in both training and testing. A minimal sketch of this policy is shown below.
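  For illustration, a minimal sketch of the idea (the table construction mirrors `get_char_vocab` in `preprocess.py`, which reserves index 0; the explicit `oov_idx` fallback shown here is an assumption for illustration, not the exact repository code):
```
def build_char_table(train_words):
    """Character-to-index table built from the *training* vocabulary only."""
    char2idx = {' ': 0}  # index 0 reserved, as in preprocess.py
    for word in train_words:
        for ch in word:
            char2idx.setdefault(ch, len(char2idx))
    return char2idx

def char_index(char2idx, ch, oov_idx=0):
    """Characters unseen during training fall back to an OOV index at test time."""
    return char2idx.get(ch, oov_idx)
```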
- As some are asking how to turn the character embeddings off: this can now be done by setting the `char_emb` hyperparameter in `Models/config.json` to `false`.
- I applied dropout to various components in the model, including all LSTM cells, passage & question encoding, question-passage matching, self-attention, and question representation (see the sketch below). This led to an improvement of about 3%.
- Reading the original paper more carefully, I found that the authors used Adadelta as the optimizer, and that 3 layers of bi-GRU were used to encode both passage and question. Changing from Adam to Adadelta led to roughly 1% improvement. In my experiments, stacking layers increased the number of epochs required for convergence, and 2 layers performed better than 3. Details are given in the Current Results section.
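For reference, this is roughly how the dropout wrapping and the optimizer switch look in code (a sketch mirroring `DropoutWrappedGRUCell` in `Models/model_rnet.py` and `train_op` in `rnet.py`; TF 1.x `tf.contrib` API):
```
import tensorflow as tf

def dropout_wrapped_cell(hidden_size, in_keep_prob):
    # Input dropout on every recurrent cell; despite the historical name in
    # the repo, a BasicLSTMCell is currently used instead of a GRUCell.
    cell = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=1.0, state_is_tuple=True)
    return tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=in_keep_prob)

# Adam -> Adadelta, as in the paper (see rnet.py):
# train_op = tf.train.AdadeltaOptimizer(1.0, rho=0.95, epsilon=1e-06).minimize(loss)
```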

## Dependency
* Python 3.6
* Tensorflow-gpu 1.2.1
* Numpy 1.13.1
* NLTK

## Usage
1. First we need to download [SQuAD](stanford-qa.com) as well as the pre-trained [GloVe](nlp.stanford.edu/projects/glove/) word embeddings. This should take roughly 30 minutes, depending on network speed.
```
cd Data
sh download.sh
cd ..
```
2. Data preprocessing, including tokenizing and collection of pre-trained word embeddings, takes about 15 minutes.
Two kinds of files, `{data/shared}_{train/dev}.json`, will be generated and stored in `Data`.
    * shared: the original and tokenized articles, GloVe word embeddings and character dictionaries.
    * data: the question ID, corresponding article id, tokenized question and the answer indices.
```
python preprocess.py --gen_seq
```
3. Train R-NET by simply executing the following. The program will
    1. Read the training data, and then build the model. This should take around an hour, depending on hardware.
    2. Train for 12 epochs, by default.

Hyper-parameters can be specified in `Models/config.json`. The training procedure, including the mean loss and mean EM score for each epoch, will be stored in `Results/rnet_training_result.txt`. Note that the scores appearing during training can be lower than the scores given by the official evaluator. The models will be stored in `Models/save/`.
```
python rnet.py
```

4. The evaluation of the model on the dev set can be generated by executing the following. The result will be stored in `Results/rnet_prediction.txt`. Note that the scores appearing during evaluation can be lower than the scores given by the official evaluator.
**Note:** The model to be evaluated has to be specified explicitly. For example, if 12 epochs were trained (the default), then `Models/save/` should contain the 5 most recent saved models (`tf.train.Saver` keeps 5 checkpoints by default):
```
rnet_model8.ckpt.meta
rnet_model8.ckpt.data-00000-of-00001
rnet_model8.ckpt.index
...
rnet_model11.ckpt.meta
rnet_model11.ckpt.data-00000-of-00001
rnet_model11.ckpt.index
rnet_model_final.ckpt.meta
rnet_model_final.ckpt.data-00000-of-00001
rnet_model_final.ckpt.index
```
Here, `rnet_model11` and `rnet_model_final` are the same. If, for example, one wishes to evaluate `rnet_model_final`, the following would do it:
```
python evaluate.py --model_path Models/save/rnet_model_final.ckpt
```

5. To get the final official score, you need to use the official evaluation script, which is in the `Results` directory.
```
python Results/evaluate-v1.1.py Data/dev-v1.1.json Results/rnet_prediction.txt
```

## Current Results

| Model | Dev EM Score | Dev F1 Score |
| -------- | -------- | -------- |
| Original Paper | 71.1 | 79.5 |
| My (Adadelta, 2 layer, dropouts, w/o char emb) | 62.6 | 71.5 |
| My (Adadelta, 1 layer, dropouts, w/o char emb) | 61.0 | 70.3 |
| My (Adam, 1 layer, dropouts, w/o char emb) | 60.8 | 70.5 |
| My (Adam, 1 layer, w/o char emb) | 57.8 | 67.9 |
| My (Adam, 1 layer, w/ char emb) | 60.1 | 68.9 |

You can find the [current leaderboard](https://rajpurkar.github.io/SQuAD-explorer/) and compare with other models.

### Trained model weights
As some have requested recently, a set of trained model weights can be downloaded [here](http://slam.iis.sinica.edu.tw/demo/RNet/release.zip). Unzip it and you will find 3 files. Put the 3 files in `Models/save/` and evaluate on them by following the instructions above. This set of parameters was obtained by training for 28 epochs with the current settings, and achieved 62.2/71.5 on the dev set. I didn't save each set of model weights when I originally ran the experiments, so I reran the experiment, causing a slight degradation compared with the best score in the table above. The difference may simply come from random initialization, so feel free to train your own model weights.

## Discussion

### Reproduction

As shown above, I still fail to reproduce the results. A few technical details concern me:

1. Data preprocessing. I have tried two preprocessing approaches: the one used in the implementation of [Match-LSTM](https://github.com/shuohangwang/SeqMatchSeq/blob/master/preprocess.py), and the one used in the implementation of [Bi-DAF](https://github.com/allenai/bi-att-flow/blob/master/squad/prepro.py). While the latter includes lots of reasonable processing, I chose the former empirically since it yields better performance.
2. As pointed out in another [implementation of R-NET in Keras](https://github.com/YerevaNN/R-NET-in-Keras),
> The first formula in (11) of the report contains a strange summand `W_v^Q V_r^Q`. Both tensors are trainable and are not used anywhere else in the network. We have replaced this product with a single trainable vector.

However, instead of replacing the product with a single trainable vector, I followed the notation and kept both tensors (see the sketch after this list).
3. Variable sharing. The notation in the original paper was very confusing to me. For example, `W_v^P` appears in both equations (4) and (8). In my opinion, they should not be the same variable, since they multiply vectors from entirely different spaces. As a result, I treat them as different variables.
4. Hyper-parameter ambiguity. Some hyper-parameters weren't specified in the original paper, including the character embedding matrix dimension, the truncation of articles and questions, and the length of the answer span during inference. I set my own hyper-parameters empirically, mostly following the settings of [Match-LSTM](https://arxiv.org/pdf/1608.07905.pdf) and [Bi-DAF](https://arxiv.org/pdf/1611.01603.pdf).
5. Any other implementation mistakes and bugs.
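To make item 2 concrete, here is a minimal NumPy sketch of the `r_Q` attention pooling with both tensors kept (random made-up values; dimensions follow `Models/config.json`, names mirror `W_VrQ`/`W_vQ` in `Models/model_rnet.py`; NumPy is used instead of TensorFlow for brevity):
```
import numpy as np

state, q_len = 75, 30
u_Q   = np.random.randn(q_len, 2 * state)   # encoded question (one example)
W_ruQ = np.random.randn(2 * state, 2 * state)
V_rQ  = np.random.randn(q_len, state)       # trainable, used nowhere else
W_vQ  = np.random.randn(state, 2 * state)   # trainable, used nowhere else
v     = np.random.randn(2 * state)

# s_j = v^T tanh(W_ruQ u_j^Q + W_vQ V_r^Q); both products kept, per the paper
s = np.tanh(u_Q @ W_ruQ + V_rQ @ W_vQ) @ v
a = np.exp(s) / np.exp(s).sum()             # softmax over question positions
r_Q = a @ u_Q                               # attention-pooled question vector
print(r_Q.shape)                            # (150,)
```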
### OOM

The full model could not be trained on an NVIDIA Tesla K40m with 12GiB memory; TensorFlow will report serious OOM problems. There are a few possible solutions.

1. Run with CPU. This can be achieved by assigning a device mask on the command line as follows. In fact, the results shown in the previous section were generated by a model trained on CPU. However, this makes training extremely slow; in my experience, it can cost roughly _24 hours per epoch_.
```
CUDA_VISIBLE_DEVICES="" python rnet.py
```
2. Reduce hyperparameters. Modifying these parameters might help:
    * `p_length`
    * Word embedding dimension: change from 300d GloVe vectors to 100d.

3. Don't use character embeddings. According to [Bi-DAF](https://arxiv.org/pdf/1611.01603.pdf), character embeddings don't help much. However, Bi-DAF uses 1D-CNNs to generate the character embeddings, while R-NET uses RNNs. As shown in the previous section, the performance dropped by 2% without them. Further investigation is needed for this part.
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
# coding=utf-8
import os
import re
from collections import Counter
import json
import numpy as np

def sublist_exists(sl, l):
    n = len(sl)
    return any((sl == l[i:i+n]) for i in range(len(l)-n+1))

def sublist_idx(sl, l):
    sll = len(sl)
    for ind in (i for i, e in enumerate(l) if e == sl[0]):
        if l[ind:ind+sll] == sl:
            return ind, ind+sll

class DataProcessor:
    def __init__(self, data_type, opts):
        self.data_type = data_type
        self.opts = opts
        data_path = os.path.join('Data', "data_{}.json".format(data_type))
        shared_path = os.path.join('Data', "shared_{}.json".format(data_type))
        idx_path = os.path.join('Data', "idx_table.json")
        self.data = self.load_data(data_path)
        self.shared = self.load_data(shared_path)
        self.idx_table = self.load_data(idx_path)

        # paragraph length filter (train only): drop samples whose answer end
        # index falls beyond the truncated paragraph length
        if self.data_type == 'train':
            self.data = [sample for sample in self.data if sample['answer'][0][-1] < self.opts['p_length']]
        self.num_samples = self.get_data_size()
        print("Loaded {} examples from {}".format(self.num_samples, data_type))

    def load_data(self, path):
        with open(path, 'r') as fh:
            data = json.load(fh)
        return data

    def get_data_size(self):
        return len(self.data)

    def get_training_batch(self, batch_no):
        opts = self.opts
        si = (batch_no * opts['batch_size'])
        ei = min(self.num_samples, si + opts['batch_size'])
        n = ei - si

        tensor_dict = {}
        paragraph = np.zeros((n, opts['p_length'], opts['word_emb_dim']))
        question = np.zeros((n, opts['q_length'], opts['word_emb_dim']))
        paragraph_c = np.zeros((n, opts['p_length'], opts['char_max_length']))
        question_c = np.zeros((n, opts['q_length'], opts['char_max_length']))
        answer_si = np.zeros((n, opts['p_length']))
        answer_ei = np.zeros((n, opts['p_length']))
        idxs = []

        count = 0
        for i in range(si, ei):
            idxs.append(i)
            sample = self.data[i]
            aipi = sample['aipi']
            p = self.shared['paragraphs'][aipi[0]][aipi[1]]
            p_sent = self.shared['paragraphs_sent'][aipi[0]][aipi[1]]
            q = sample['question']

            for j in range(len(p)):
                if j >= opts['p_length']:
                    break
                try:
                    paragraph[count][j][:opts['word_emb_dim']] = self.shared['glove'+opts['glove']][p[j]]
                except KeyError:
                    pass
                for k, char in enumerate(p[j]):
                    paragraph_c[count][j][k] = self.idx_table['char2idx'][char]

            for j in range(len(q)):
                if j >= opts['q_length']:
                    break
                try:
                    question[count][j] = self.shared['glove'+opts['glove']][q[j]]
                except KeyError:
                    pass
                for k, char in enumerate(q[j]):
                    question_c[count][j][k] = self.idx_table['char2idx'][char]

            # one-hot start/end labels (renamed from si/ei so the batch
            # boundaries above are no longer shadowed inside the loop)
            ans_si, ans_ei = sample['answer'][0][0], sample['answer'][0][-1]
            answer_si[count][ans_si] = 1.0
            answer_ei[count][ans_ei] = 1.0

            count += 1

        tensor_dict['paragraph'] = paragraph
        tensor_dict['question'] = question
        tensor_dict['paragraph_c'] = paragraph_c
        tensor_dict['question_c'] = question_c
        tensor_dict['answer_si'] = answer_si
        tensor_dict['answer_ei'] = answer_ei
        return tensor_dict, idxs

    def get_testing_batch(self, batch_no):
        opts = self.opts
        si = (batch_no * opts['batch_size'])
        ei = min(self.num_samples, si + opts['batch_size'])
        n = ei - si

        paragraph = np.zeros((opts['batch_size'], opts['p_length'], opts['word_emb_dim']))
        question = np.zeros((opts['batch_size'], opts['q_length'], opts['word_emb_dim']))
        paragraph_c = np.zeros((opts['batch_size'], opts['p_length'], opts['char_max_length']))
        question_c = np.zeros((opts['batch_size'], opts['q_length'], opts['char_max_length']))
        context = [None for _ in range(n)]
        context_original = [None for _ in range(n)]
        answer_si = [None for _ in range(n)]
        answer_ei = [None for _ in range(n)]
        ID = [None for _ in range(n)]

        count = 0
        for i in range(si, ei):
            sample = self.data[i]
            aipi = sample['aipi']
            p = self.shared['paragraphs'][aipi[0]][aipi[1]]
            p_o = self.shared['paragraphs_original'][aipi[0]][aipi[1]]
            q = sample['question']

            context[count] = p
            context_original[count] = p_o
            for j in range(len(p)):
                if j >= opts['p_length']:
                    break
                try:
                    paragraph[count][j][:opts['word_emb_dim']] = self.shared['glove'+opts['glove']][p[j]]
                    for k, char in enumerate(p[j]):
                        paragraph_c[count][j][k] = self.idx_table['char2idx'][char]
                except KeyError:
                    #print('{} not in GloVe'.format(p[j]))
                    pass

            for j in range(len(q)):
                if j >= opts['q_length']:
                    break
                try:
                    question[count][j] = self.shared['glove'+opts['glove']][q[j]]
                    for k, char in enumerate(q[j]):
                        question_c[count][j][k] = self.idx_table['char2idx'][char]
                except KeyError:
                    pass
                    #print('{} not in GloVe'.format(triplet['question'][j].lower()))

            answer_si[count] = [ans[0] for ans in sample['answer']]
            answer_ei[count] = [ans[-1] for ans in sample['answer']]
            ID[count] = sample['id']
            count += 1

        return context, context_original, paragraph, question, paragraph_c, question_c, answer_si, answer_ei, ID, n

def get_word2vec(glove_path, word_counter):
    word2vec_dict = {}
    with open(glove_path, 'r', encoding='utf-8') as fh:
        for line in fh:
            array = line.lstrip().rstrip().split(" ")
            word = array[0]
            vector = list(map(float, array[1:]))
            # also index common capitalization variants of each GloVe entry
            if word in word_counter:
                word2vec_dict[word] = vector
            if word.capitalize() in word_counter:
                word2vec_dict[word.capitalize()] = vector
            if word.lower() in word_counter:
                word2vec_dict[word.lower()] = vector
            if word.upper() in word_counter:
                word2vec_dict[word.upper()] = vector
    print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
    return word2vec_dict
def get_char_vocab(word_counter):
    char2idx = {' ': 0}
    idx2char = [' ']
    max_word_length = 0
    word_count = [0 for _ in range(37)]  # histogram of word lengths, up to char_max_length

    for word in word_counter:
        word_count[len(word)-1] += 1
        max_word_length = max(max_word_length, len(word))
        for char in word:
            if not char in char2idx:
                idx2char.append(char)
                char2idx[char] = len(idx2char) - 1
    print('max word length:', max_word_length)
    print(len(char2idx), 'chars read')
    print(word_count)

    return char2idx, idx2char

def read_local_word2vec():
    # fixed: was os.join.path, and the function returned the path instead of the data
    local_w2v_dir = os.path.join('Data', 'local_w2v', 'local_w2v.json')
    local_w2v = json.load(open(local_w2v_dir, 'r'))
    return local_w2v

def generate_seq(data_type):
    import nltk
    nltk.download('punkt')
    from nltk.tokenize import word_tokenize, sent_tokenize
    #def word_tokenize(tokens):
    #    return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]

    fpr = open(os.path.join('Data', data_type+'-v1.1.json'), 'r')
    source_data = json.load(fpr)

    data = []
    articles = []
    articles_sent = []
    articles_original = []
    articles_original_sent = []
    word_counter = Counter()

    fpw = open(os.path.join('Data', 'data_'+data_type+".json"), 'w')
    for ai, article in enumerate(source_data["data"]):
        if ai % 20 == 0:
            print('processing article', ai)
        paragraphs = []
        paragraphs_sent = []
        paragraphs_original = []
        paragraphs_original_sent = []

        for pi, p in enumerate(article["paragraphs"]):
            context = p["context"]
            #context = context.replace("''", '" ')
            #context = context.replace("``", '" ')
            paragraph = word_tokenize(context)
            context_sent = sent_tokenize(context)
            paragraph_sent = [word_tokenize(sent) for sent in context_sent]
            paragraphs.append(paragraph)  # word level paragraph
            paragraphs_sent.append(paragraph_sent)  # sentence_word level paragraph
            paragraphs_original.append(context)  # original paragraph
            paragraphs_original_sent.append(context_sent)  # sentence_tokenized original paragraph
            for w in paragraph:
                word_counter[w] += len(p['qas'])

            for qa in p["qas"]:
                question = word_tokenize(qa["question"])
                answers = []
                answers_sent = []
                for w in question:
                    word_counter[w] += 1

                for a in qa['answers']:
                    answer = a['text'].strip()
                    answer_start = int(a['answer_start'])
                    # add '.' here, just because NLTK is not good enough in some cases
                    answer_words = word_tokenize(answer+'.')
                    if answer_words[-1] == '.':
                        answer_words = answer_words[:-1]
                    else:
                        answer_words = word_tokenize(answer)

                    # word level
                    prev_context_words = word_tokenize(context[:answer_start])
                    left_context_words = word_tokenize(context[answer_start:])
                    pos_list = []
                    for i in range(len(answer_words)):
                        if i < len(left_context_words):
                            pos_list.append(len(prev_context_words)+i)
                    assert(len(pos_list) > 0)

                    # sent level
                    # [sent_idx, word_idx]
                    for idx, sent in enumerate(paragraph_sent):
                        if sublist_exists(answer_words, sent):
                            sent_idx = idx
                            try:
                                si, ei = sublist_idx(answer_words, sent)
                            except:
                                print(answer)
                                print(answer_words)
                                print(sent)
                                exit()
                            pos_list_sent = [[idx, i] for i in range(si, ei)]
                            break

                    answers.append(pos_list)
                    answers_sent.append(pos_list_sent)

                sample = {'aipi': [ai, pi],
                          'question': question,
                          'answer': answers,
                          'answer_sent': answers_sent,
                          'id': str(qa['id']),
                          }
                data.append(sample)
        articles.append(paragraphs)
        articles_sent.append(paragraphs_sent)
        articles_original.append(paragraphs_original)
        articles_original_sent.append(paragraphs_original_sent)

    w2v_100 = get_word2vec('Data/glove.6B.100d.txt', word_counter)
    w2v_300 = get_word2vec('Data/glove.840B.300d.txt', word_counter)
    char2idx, idx2char = get_char_vocab(word_counter)

    print(len(data))
    print(len(articles), len(articles_sent))
    shared = {'paragraphs': articles,
              'paragraphs_sent': articles_sent,
              'paragraphs_original': articles_original,
              'paragraphs_original_sent': articles_original_sent,
              'glove100': w2v_100,
              'glove300': w2v_300,
              }
    print('Saving...')
    with open(os.path.join('Data', 'data_'+data_type+".json"), 'w') as f:
        json.dump(data, f)
    with open(os.path.join('Data', 'shared_'+data_type+".json"), 'w') as f:
        json.dump(shared, f)

    if data_type == 'train':
        idx_table = {'char2idx': char2idx,
                     'idx2char': idx2char,
                     }
        with open(os.path.join('Data', 'idx_table.json'), 'w') as f:
            json.dump(idx_table, f)

    print('SQuAD '+data_type+' preprocessing finished!')
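# Worked example of the word-level answer localization in generate_seq above
# (hypothetical sentence, shown for illustration only):
#   context                      = "The quick brown fox jumps over the lazy dog."
#   answer, answer_start         = "brown fox", 10
#   word_tokenize(context[:10])  -> ['The', 'quick']   (2 tokens precede the answer)
#   word_tokenize("brown fox")   -> ['brown', 'fox']
#   pos_list                     -> [2, 3]             (token span of the answer)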
def read_data(data_type, opts):
    return DataProcessor(data_type, opts)

def read_data_old(filename, PATH, p_thres):
    import json
    from pprint import pprint

    data = []
    if filename == 'train':
        with open(PATH, 'r', encoding='utf-8') as f:
            for l in f:
                triplet = l.strip('\n').split("\t")
                paragraph = triplet[0].split(' ')
                question = triplet[1].split(' ')
                d = {'paragraph': paragraph,
                     'question': question,
                     'answer_si': int(triplet[2].split(' ')[0]),
                     'answer_ei': int(triplet[2].split(' ')[-1]),
                     }
                if d['answer_ei'] >= p_thres:
                    continue
                data.append(d)
    else:
        with open(PATH, 'r', encoding='utf-8') as f:
            for l in f:
                triplet = l.strip('\n').split("\t")
                paragraph = triplet[0].split(' ')
                question = triplet[1].split(' ')
                try:
                    answer_si = [int(seq.split(' ')[0]) for seq in triplet[2:-1]]
                    answer_ei = [int(seq.split(' ')[-1]) for seq in triplet[2:-1]]
                except:
                    print(triplet[2:])
                    exit()
                d = {'paragraph': paragraph,
                     'question': question,
                     'answer_si': answer_si,
                     'answer_ei': answer_ei,
                     'ID': triplet[-1]
                     }
                data.append(d)
    print(filename, '{} triplets read'.format(len(data)))
    return data

def run():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--gen_seq', action='store_true', help='original data to seq')
    args = parser.parse_args()

    if args.gen_seq:
        print('Generating Sequences...')
        generate_seq('train')
        generate_seq('dev')

if __name__ == "__main__":
    run()
--------------------------------------------------------------------------------
/Models/model_rnet.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import math

class R_NET:
    def random_weight(self, dim_in, dim_out, name=None, stddev=1.0):
        return tf.Variable(tf.truncated_normal([dim_in, dim_out], stddev=stddev/math.sqrt(float(dim_in))), name=name)

    def random_bias(self, dim, name=None):
        return tf.Variable(tf.truncated_normal([dim]), name=name)

    def random_scalar(self, name=None):
        return tf.Variable(0.0, name=name)

    def DropoutWrappedGRUCell(self, hidden_size, in_keep_prob, name=None):
        # NOTE: despite the name, a BasicLSTMCell is currently used; the GRU
        # variant is kept commented out below.
        # cell = tf.contrib.rnn.GRUCell(hidden_size)
        cell = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=1.0, state_is_tuple=True)
        cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=in_keep_prob)
        return cell

    def mat_weight_mul(self, mat, weight):
        # [batch_size, n, m] * [m, p] = [batch_size, n, p]
        mat_shape = mat.get_shape().as_list()
        weight_shape = weight.get_shape().as_list()
        assert(mat_shape[-1] == weight_shape[0])
        mat_reshape = tf.reshape(mat, [-1, mat_shape[-1]])  # [batch_size * n, m]
        mul = tf.matmul(mat_reshape, weight)  # [batch_size * n, p]
        return tf.reshape(mul, [-1, mat_shape[1], weight_shape[-1]])

    def __init__(self, options):
        with tf.device('/cpu:0'):
            self.options = options

            # Char embeddings
            if options['char_emb']:
                self.char_emb_mat = self.random_weight(self.options['char_vocab_size'],
                    self.options['char_emb_mat_dim'], name='char_emb_matrix')

            # Weights
            self.W_uQ = self.random_weight(2 * options['state_size'], options['state_size'], name='W_uQ')
            self.W_uP = self.random_weight(2 * options['state_size'], options['state_size'], name='W_uP')
            self.W_vP = self.random_weight(options['state_size'], options['state_size'], name='W_vP')
            self.W_g_QP = self.random_weight(4 * options['state_size'], 4 * options['state_size'], name='W_g_QP')
            self.W_smP1 = self.random_weight(options['state_size'], options['state_size'], name='W_smP1')
            self.W_smP2 = self.random_weight(options['state_size'], options['state_size'], name='W_smP2')
            self.W_g_SM = self.random_weight(2 * options['state_size'], 2 * options['state_size'], name='W_g_SM')
            self.W_ruQ = self.random_weight(2 * options['state_size'], 2 * options['state_size'], name='W_ruQ')
            self.W_vQ = self.random_weight(options['state_size'], 2 * options['state_size'], name='W_vQ')
            self.W_VrQ = self.random_weight(options['q_length'], options['state_size'], name='W_VrQ')  # has same size as u_Q
            self.W_hP = self.random_weight(2 * options['state_size'], options['state_size'], name='W_hP')
            self.W_ha = self.random_weight(2 * options['state_size'], options['state_size'], name='W_ha')

            # Biases
            self.B_v_QP = self.random_bias(options['state_size'], name='B_v_QP')
            self.B_v_SM = self.random_bias(options['state_size'], name='B_v_SM')
            self.B_v_rQ = self.random_bias(2 * options['state_size'], name='B_v_rQ')
            self.B_v_ap = self.random_bias(options['state_size'], name='B_v_ap')

            # QP_match
            with tf.variable_scope('QP_match') as scope:
                self.QPmatch_cell = self.DropoutWrappedGRUCell(self.options['state_size'], self.options['in_keep_prob'])
                self.QPmatch_state = self.QPmatch_cell.zero_state(self.options['batch_size'], dtype=tf.float32)

            # Ans Ptr
            with tf.variable_scope('Ans_ptr') as scope:
                self.AnsPtr_cell = self.DropoutWrappedGRUCell(2 * self.options['state_size'], self.options['in_keep_prob'])

    def build_model(self):
        opts = self.options

        # placeholders
        paragraph = tf.placeholder(tf.float32, [opts['batch_size'], opts['p_length'], opts['emb_dim']])
        question = tf.placeholder(tf.float32, [opts['batch_size'], opts['q_length'], opts['emb_dim']])
        answer_si = tf.placeholder(tf.float32, [opts['batch_size'], opts['p_length']])
        answer_ei = tf.placeholder(tf.float32, [opts['batch_size'], opts['p_length']])
        if opts['char_emb']:
            paragraph_c = tf.placeholder(tf.int32, [opts['batch_size'], opts['p_length'], opts['char_max_length']])
            question_c = tf.placeholder(tf.int32, [opts['batch_size'], opts['q_length'], opts['char_max_length']])

        print('Question and Passage Encoding')
        if opts['char_emb']:
            # char embedding -> word level char embedding
            paragraph_c_emb = tf.nn.embedding_lookup(self.char_emb_mat, paragraph_c)  # [batch_size, p_length, char_max_length, char_emb_dim]
            question_c_emb = tf.nn.embedding_lookup(self.char_emb_mat, question_c)
            paragraph_c_list = [tf.squeeze(w, [1]) for w in tf.split(paragraph_c_emb, opts['p_length'], axis=1)]
            question_c_list = [tf.squeeze(w, [1]) for w in tf.split(question_c_emb, opts['q_length'], axis=1)]

            c_Q = []
            c_P = []
            with tf.variable_scope('char_emb_rnn') as scope:
                char_emb_fw_cell = self.DropoutWrappedGRUCell(opts['emb_dim'], 1.0)
                char_emb_bw_cell = self.DropoutWrappedGRUCell(opts['emb_dim'], 1.0)
                for t in range(opts['q_length']):
                    unstacked_q_c = tf.unstack(question_c_list[t], opts['char_max_length'], 1)
                    if t > 0:
                        tf.get_variable_scope().reuse_variables()
                    q_c_e_outputs, q_c_e_final_fw, q_c_e_final_bw = tf.contrib.rnn.static_bidirectional_rnn(
                        char_emb_fw_cell, char_emb_bw_cell, unstacked_q_c, dtype=tf.float32, scope='char_emb')
                    c_q_t = tf.concat([q_c_e_final_fw[1], q_c_e_final_bw[1]], 1)
                    c_Q.append(c_q_t)
                for t in range(opts['p_length']):
                    unstacked_p_c = tf.unstack(paragraph_c_list[t], opts['char_max_length'], 1)
                    p_c_e_outputs, p_c_e_final_fw, p_c_e_final_bw = tf.contrib.rnn.static_bidirectional_rnn(
                        char_emb_fw_cell, char_emb_bw_cell, unstacked_p_c, dtype=tf.float32, scope='char_emb')
                    c_p_t = tf.concat([p_c_e_final_fw[1], p_c_e_final_bw[1]], 1)
                    c_P.append(c_p_t)
            c_Q = tf.stack(c_Q, 1)
            c_P = tf.stack(c_P, 1)
            print('c_Q', c_Q)
            print('c_P', c_P)

            # Concat e and c
            eQcQ = tf.concat([question, c_Q], 2)
            ePcP = tf.concat([paragraph, c_P], 2)
        else:
            eQcQ = question
            ePcP = paragraph

        unstacked_eQcQ = tf.unstack(eQcQ, opts['q_length'], 1)
        unstacked_ePcP = tf.unstack(ePcP, opts['p_length'], 1)
        with tf.variable_scope('encoding') as scope:
            stacked_enc_fw_cells = [self.DropoutWrappedGRUCell(opts['state_size'], opts['in_keep_prob']) for _ in range(2)]
            stacked_enc_bw_cells = [self.DropoutWrappedGRUCell(opts['state_size'], opts['in_keep_prob']) for _ in range(2)]
            q_enc_outputs, q_enc_final_fw, q_enc_final_bw = tf.contrib.rnn.stack_bidirectional_rnn(
                stacked_enc_fw_cells, stacked_enc_bw_cells, unstacked_eQcQ, dtype=tf.float32, scope='context_encoding')
            tf.get_variable_scope().reuse_variables()
            p_enc_outputs, p_enc_final_fw, p_enc_final_bw = tf.contrib.rnn.stack_bidirectional_rnn(
                stacked_enc_fw_cells, stacked_enc_bw_cells, unstacked_ePcP, dtype=tf.float32, scope='context_encoding')
            u_Q = tf.stack(q_enc_outputs, 1)
            u_P = tf.stack(p_enc_outputs, 1)
        u_Q = tf.nn.dropout(u_Q, opts['in_keep_prob'])
        u_P = tf.nn.dropout(u_P, opts['in_keep_prob'])
        print(u_Q)
        print(u_P)
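
        # Gated attention-based recurrent network (question-passage matching).
        # For each passage position t (paraphrasing the paper's equations):
        #   s_j   = v^T tanh(W_uQ u_j^Q + W_uP u_t^P + W_vP v_{t-1}^P)
        #   a     = softmax(s),  c_t = sum_j a_j u_j^Q
        #   g     = sigmoid(W_g [u_t^P, c_t])          (input gate)
        #   v_t^P = RNN(v_{t-1}^P, g .* [u_t^P, c_t])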
        v_P = []
        print('Question-Passage Matching')
        for t in range(opts['p_length']):
            # Calculate c_t
            W_uQ_u_Q = self.mat_weight_mul(u_Q, self.W_uQ)  # [batch_size, q_length, state_size]
            tiled_u_tP = tf.concat([tf.reshape(u_P[:, t, :], [opts['batch_size'], 1, -1])] * opts['q_length'], 1)
            W_uP_u_tP = self.mat_weight_mul(tiled_u_tP, self.W_uP)

            if t == 0:
                tanh = tf.tanh(W_uQ_u_Q + W_uP_u_tP)
            else:
                tiled_v_t1P = tf.concat([tf.reshape(v_P[t-1], [opts['batch_size'], 1, -1])] * opts['q_length'], 1)
                W_vP_v_t1P = self.mat_weight_mul(tiled_v_t1P, self.W_vP)
                tanh = tf.tanh(W_uQ_u_Q + W_uP_u_tP + W_vP_v_t1P)
            s_t = tf.squeeze(self.mat_weight_mul(tanh, tf.reshape(self.B_v_QP, [-1, 1])))
            a_t = tf.nn.softmax(s_t, 1)
            tiled_a_t = tf.concat([tf.reshape(a_t, [opts['batch_size'], -1, 1])] * 2 * opts['state_size'], 2)
            c_t = tf.reduce_sum(tf.multiply(tiled_a_t, u_Q), 1)  # [batch_size, 2 * state_size]

            # gate
            u_tP_c_t = tf.expand_dims(tf.concat([tf.squeeze(u_P[:, t, :]), c_t], 1), 1)
            g_t = tf.sigmoid(self.mat_weight_mul(u_tP_c_t, self.W_g_QP))
            u_tP_c_t_star = tf.squeeze(tf.multiply(u_tP_c_t, g_t))

            # QP_match
            with tf.variable_scope("QP_match"):
                if t > 0:
                    tf.get_variable_scope().reuse_variables()
                output, self.QPmatch_state = self.QPmatch_cell(u_tP_c_t_star, self.QPmatch_state)
            v_P.append(output)
        v_P = tf.stack(v_P, 1)
        v_P = tf.nn.dropout(v_P, opts['in_keep_prob'])
        print('v_P', v_P)

        print('Self-Matching Attention')
        SM_star = []
        for t in range(opts['p_length']):
            # Calculate s_t
            W_p1_v_P = self.mat_weight_mul(v_P, self.W_smP1)  # [batch_size, p_length, state_size]
            tiled_v_tP = tf.concat([tf.reshape(v_P[:, t, :], [opts['batch_size'], 1, -1])] * opts['p_length'], 1)
            W_p2_v_tP = self.mat_weight_mul(tiled_v_tP, self.W_smP2)

            tanh = tf.tanh(W_p1_v_P + W_p2_v_tP)
            s_t = tf.squeeze(self.mat_weight_mul(tanh, tf.reshape(self.B_v_SM, [-1, 1])))
            a_t = tf.nn.softmax(s_t, 1)
            tiled_a_t = tf.concat([tf.reshape(a_t, [opts['batch_size'], -1, 1])] * opts['state_size'], 2)
            c_t = tf.reduce_sum(tf.multiply(tiled_a_t, v_P), 1)  # [batch_size, state_size]

            # gate
            v_tP_c_t = tf.expand_dims(tf.concat([tf.squeeze(v_P[:, t, :]), c_t], 1), 1)
            g_t = tf.sigmoid(self.mat_weight_mul(v_tP_c_t, self.W_g_SM))
            v_tP_c_t_star = tf.squeeze(tf.multiply(v_tP_c_t, g_t))
            SM_star.append(v_tP_c_t_star)
        SM_star = tf.stack(SM_star, 1)
        unstacked_SM_star = tf.unstack(SM_star, opts['p_length'], 1)
        with tf.variable_scope('Self_match') as scope:
            SM_fw_cell = self.DropoutWrappedGRUCell(opts['state_size'], opts['in_keep_prob'])
            SM_bw_cell = self.DropoutWrappedGRUCell(opts['state_size'], opts['in_keep_prob'])
            SM_outputs, SM_final_fw, SM_final_bw = tf.contrib.rnn.static_bidirectional_rnn(SM_fw_cell, SM_bw_cell, unstacked_SM_star, dtype=tf.float32)
        h_P = tf.stack(SM_outputs, 1)
        h_P = tf.nn.dropout(h_P, opts['in_keep_prob'])
        print('h_P', h_P)
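
        # Output layer (pointer network): the question encoding u_Q is pooled
        # into r_Q by attention; r_Q then initializes a two-step pointer RNN
        # whose attention distributions over the passage, p[0] and p[1],
        # serve as P(start) and P(end) of the answer span.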
        print('Output Layer')
        # calculate r_Q
        W_ruQ_u_Q = self.mat_weight_mul(u_Q, self.W_ruQ)  # [batch_size, q_length, 2 * state_size]
        W_vQ_V_rQ = tf.matmul(self.W_VrQ, self.W_vQ)
        W_vQ_V_rQ = tf.stack([W_vQ_V_rQ] * opts['batch_size'], 0)  # stack -> [batch_size, q_length, 2 * state_size]

        tanh = tf.tanh(W_ruQ_u_Q + W_vQ_V_rQ)
        s_t = tf.squeeze(self.mat_weight_mul(tanh, tf.reshape(self.B_v_rQ, [-1, 1])))
        a_t = tf.nn.softmax(s_t, 1)
        tiled_a_t = tf.concat([tf.reshape(a_t, [opts['batch_size'], -1, 1])] * 2 * opts['state_size'], 2)
        r_Q = tf.reduce_sum(tf.multiply(tiled_a_t, u_Q), 1)  # [batch_size, 2 * state_size]
        r_Q = tf.nn.dropout(r_Q, opts['in_keep_prob'])
        print('r_Q', r_Q)

        # r_Q as initial state of ans ptr
        h_a = None
        p = [None for _ in range(2)]
        for t in range(2):
            W_hP_h_P = self.mat_weight_mul(h_P, self.W_hP)  # [batch_size, p_length, state_size]

            if t == 0:
                h_t1a = r_Q
            else:
                h_t1a = h_a
            print('h_t1a', h_t1a)
            tiled_h_t1a = tf.concat([tf.reshape(h_t1a, [opts['batch_size'], 1, -1])] * opts['p_length'], 1)
            W_ha_h_t1a = self.mat_weight_mul(tiled_h_t1a, self.W_ha)

            tanh = tf.tanh(W_hP_h_P + W_ha_h_t1a)
            s_t = tf.squeeze(self.mat_weight_mul(tanh, tf.reshape(self.B_v_ap, [-1, 1])))
            a_t = tf.nn.softmax(s_t, 1)
            p[t] = a_t

            tiled_a_t = tf.concat([tf.reshape(a_t, [opts['batch_size'], -1, 1])] * 2 * opts['state_size'], 2)
            c_t = tf.reduce_sum(tf.multiply(tiled_a_t, h_P), 1)  # [batch_size, 2 * state_size]

            if t == 0:
                AnsPtr_state = self.AnsPtr_cell.zero_state(opts['batch_size'], dtype=tf.float32)
                h_a, _ = self.AnsPtr_cell(c_t, (AnsPtr_state, r_Q))
                h_a = h_a[1]
        print(h_a)
        print(p)
        p1 = p[0]
        p2 = p[1]

        answer_si_idx = tf.cast(tf.argmax(answer_si, 1), tf.int32)
        answer_ei_idx = tf.cast(tf.argmax(answer_ei, 1), tf.int32)

        """
        ce_si = tf.nn.softmax_cross_entropy_with_logits(labels=answer_si, logits=p1)
        ce_ei = tf.nn.softmax_cross_entropy_with_logits(labels=answer_ei, logits=p2)
        print(ce_si, ce_ei)
        loss_si = tf.reduce_sum(ce_si)
        loss_ei = tf.reduce_sum(ce_ei)
        loss = loss_si + loss_ei
        """

        batch_idx = tf.reshape(tf.range(0, opts['batch_size']), [-1, 1])
        answer_si_re = tf.reshape(answer_si_idx, [-1, 1])
        batch_idx_si = tf.concat([batch_idx, answer_si_re], 1)
        answer_ei_re = tf.reshape(answer_ei_idx, [-1, 1])
        batch_idx_ei = tf.concat([batch_idx, answer_ei_re], 1)

        log_prob = tf.multiply(tf.gather_nd(p1, batch_idx_si), tf.gather_nd(p2, batch_idx_ei))
        loss = -tf.reduce_sum(tf.log(log_prob + 0.0000001))
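
        # Inference: score every span [i, i+j] with j < span_length by
        # p1[i] * p2[i+j]; the argmax over the flattened (i, j) grid is
        # decoded back into (start, end) via div/mod by span_length.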
        # Search
        prob = []
        search_range = opts['p_length'] - opts['span_length']
        for i in range(search_range):
            for j in range(opts['span_length']):
                prob.append(tf.multiply(p1[:, i], p2[:, i+j]))
        prob = tf.stack(prob, axis=1)
        argmax_idx = tf.argmax(prob, axis=1)
        # integer div/mod decode (previously a float true-division that relied
        # on downstream casts to truncate back to an index)
        pred_si = tf.floordiv(argmax_idx, opts['span_length'])
        pred_ei = pred_si + tf.mod(argmax_idx, opts['span_length'])
        correct = tf.logical_and(tf.equal(tf.cast(pred_si, tf.int64), tf.cast(answer_si_idx, tf.int64)),
                                 tf.equal(tf.cast(pred_ei, tf.int64), tf.cast(answer_ei_idx, tf.int64)))
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        input_tensors = {
            'p': paragraph,
            'q': question,
            'a_si': answer_si,
            'a_ei': answer_ei,
        }
        if opts['char_emb']:
            input_tensors.update({'pc': paragraph_c, 'qc': question_c})

        print('Model built')
        for v in tf.global_variables():
            print(v.name, v.shape)

        return input_tensors, loss, accuracy, pred_si, pred_ei
--------------------------------------------------------------------------------