├── .gitignore
├── Data
│   └── download.sh
├── Models
│   ├── config.json
│   └── model_rnet.py
├── rnet.py
├── Results
│   └── evaluate-v1.1.py
├── evaluate.py
├── README.md
└── preprocess.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__
Models/__pycache__
--------------------------------------------------------------------------------
/Data/download.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Download SQuAD
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json

# Download GloVe
wget http://nlp.stanford.edu/data/glove.6B.zip
wget http://nlp.stanford.edu/data/glove.840B.300d.zip
unzip glove.6B.zip
unzip glove.840B.300d.zip
--------------------------------------------------------------------------------
/Models/config.json:
--------------------------------------------------------------------------------
{
    "rnet": {
        "train": {
            "glove": "300",
            "share_context_LSTM": true,
            "char_emb": false,
            "in_keep_prob": 0.8,
            "batch_size": 60,
            "state_size": 75,
            "emb_dim": 300,
            "word_emb_dim": 300,
            "char_max_length": 37,
            "char_vocab_size": 1368,
            "char_emb_mat_dim": 8,
            "p_length": 300,
            "q_length": 30,
            "a_length": 20,
            "span_length": 20
        },
        "dev": {
            "glove": "300",
            "share_context_LSTM": true,
            "char_emb": false,
            "in_keep_prob": 1.0,
            "batch_size": 60,
            "state_size": 75,
            "emb_dim": 300,
            "word_emb_dim": 300,
            "char_max_length": 37,
            "char_vocab_size": 1368,
            "char_emb_mat_dim": 8,
            "p_length": 300,
            "q_length": 30,
            "a_length": 20,
            "span_length": 20
        }
    }
}
--------------------------------------------------------------------------------
/rnet.py:
--------------------------------------------------------------------------------
import preprocess
from Models import model_rnet
import numpy as np
import tensorflow as tf
import argparse
import random
import string
import os
import json

def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=12, help='Epochs')
    # store_true instead of type=bool: bool('False') is truthy, so type=bool
    # flags can never be switched off from the command line
    parser.add_argument('--debug', action='store_true', help='print debug msgs')
    parser.add_argument('--load', action='store_true', help='load model')
    parser.add_argument('--save_dir', type=str, default='Models/save/', help='save directory')

    args = parser.parse_args()

    modOpts = json.load(open('Models/config.json', 'r'))['rnet']['train']

    print('Reading data')
    dp = preprocess.read_data('train', modOpts)
    num_batches = int(np.floor(dp.num_samples / modOpts['batch_size'])) - 1

    rnet_model = model_rnet.R_NET(modOpts)
    input_tensors, loss, acc, pred_si, pred_ei = rnet_model.build_model()
    #train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(loss)
    train_op = tf.train.AdadeltaOptimizer(1.0, rho=0.95, epsilon=1e-06).minimize(loss)

    # saver
    saver = tf.train.Saver()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    if args.load:
        PATH = 'Models/save/rnet_model0.ckpt'
        start_epoch = 1
        saver.restore(sess, PATH)
        f = open('Results/rnet_training_result.txt', 'a')
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        f = open('Results/rnet_training_result.txt', 'w')
        start_epoch = 0

    for i in range(start_epoch, args.epochs):
        rl = random.sample(range(num_batches), num_batches)
        batch_no = 0
        LOSS = 0.0
        EM = 0.0
        while batch_no < num_batches:
            tensor_dict, idxs = dp.get_training_batch(rl[batch_no])
            feed_dict = {
                input_tensors['p']: tensor_dict['paragraph'],
                input_tensors['q']: tensor_dict['question'],
                input_tensors['a_si']: tensor_dict['answer_si'],
                input_tensors['a_ei']: tensor_dict['answer_ei'],
            }
            if modOpts['char_emb']:
                feed_dict.update({
                    input_tensors['pc']: tensor_dict['paragraph_c'],
                    input_tensors['qc']: tensor_dict['question_c'],
                })
            _, loss_value, accuracy, predictions_si, predictions_ei = sess.run(
                [train_op, loss, acc, pred_si, pred_ei], feed_dict=feed_dict)
            batch_no += 1
            LOSS += loss_value
            EM += accuracy
            print("{} epoch {} batch, Loss:{:.2f}, Acc:{:.2f}".format(i, batch_no, loss_value, accuracy))
        save_path = saver.save(sess, os.path.join(args.save_dir, "rnet_model{}.ckpt".format(i)))
        f.write(' '.join(("Loss", str(LOSS / dp.num_samples), str(i), '\n')))
        f.write(' '.join(("EM", str(EM / num_batches), '\n')))
        f.write("---------------\n")
        f.flush()
        print("---------------")
    f.close()
    save_path = saver.save(sess, os.path.join(args.save_dir, "rnet_model_final.ckpt"))
    print('save path:', save_path)

def f1_score(prediction, ground_truth):
    from collections import Counter

    prediction_tokens = prediction
    ground_truth_tokens = ground_truth
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

if __name__ == '__main__':
    run()
--------------------------------------------------------------------------------
/Results/evaluate-v1.1.py:
--------------------------------------------------------------------------------
""" Official evaluation script for v1.1 of the SQuAD dataset. """
""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}


if __name__ == '__main__':
    expected_version = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + expected_version)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    args = parser.parse_args()
    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        if (dataset_json['version'] != expected_version):
            print('Evaluation expects v-' + expected_version +
                  ', but got dataset with v-' + dataset_json['version'],
                  file=sys.stderr)
        dataset = dataset_json['data']
    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)
    print(json.dumps(evaluate(dataset, predictions)))
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
import preprocess
from Models import model_rnet
import numpy as np
import tensorflow as tf
import argparse
import random
import json
from pprint import pprint

def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='rnet', help='Model: match_lstm, bidaf, rnet')
    parser.add_argument('--debug', action='store_true', help='print debug msgs')
    parser.add_argument('--dataset', type=str, default='dev', help='dataset')
    parser.add_argument('--model_path', type=str, default='Models/save/rnet_model0.ckpt', help='saved model path')

    args = parser.parse_args()
    if not args.model == 'rnet':
        raise NotImplementedError

    modOpts = json.load(open('Models/config.json', 'r'))[args.model]['dev']
    print('Model Configs:')
    pprint(modOpts)

    print('Reading data')
    if args.dataset == 'train':
        raise NotImplementedError
    elif args.dataset == 'dev':
        dp = preprocess.read_data(args.dataset, modOpts)

    model = model_rnet.R_NET(modOpts)
    input_tensors, loss, acc, pred_si, pred_ei = model.build_model()
    saved_model = args.model_path

    num_batches = int(np.ceil(dp.num_samples / modOpts['batch_size']))
    print(num_batches, 'batches')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    new_saver = tf.train.Saver()
    sess = tf.InteractiveSession(config=config)
    new_saver.restore(sess, saved_model)

    pred_data = {}

    EM = 0.0
    F1 = 0.0
    # dummy start/end labels: the answers are unknown at test time
    empty_answer_idx = np.zeros((modOpts['batch_size'], modOpts['p_length']))
    for batch_no in range(num_batches):
        if args.model == 'rnet':
            context, context_original, paragraph, question, paragraph_c, question_c, answer_si, answer_ei, ID, n = dp.get_testing_batch(batch_no)
        feed_dict = {
            input_tensors['p']: paragraph,
            input_tensors['q']: question,
            input_tensors['a_si']: empty_answer_idx,
            input_tensors['a_ei']: empty_answer_idx,
        }
        if modOpts['char_emb']:
            # these come from get_testing_batch; the previous tensor_dict[...]
            # references were leftovers from rnet.py and raised a NameError
            feed_dict.update({
                input_tensors['pc']: paragraph_c,
                input_tensors['qc']: question_c,
            })
        predictions_si, predictions_ei = sess.run([pred_si, pred_ei], feed_dict=feed_dict)
        for i in range(n):
            parag = context[i]
            f1 = []
            p_tokens = []
            for j in range(len(answer_si[i])):
                if answer_si[i][j] == answer_ei[i][j]:  # single word answer
                    truth_tokens = [parag[int(answer_si[i][j])]]
                    pred_tokens = [parag[int(predictions_si[i])]]
                else:
                    truth_tokens = parag[int(answer_si[i][j]):int(answer_ei[i][j])+1]
                    pred_tokens = parag[int(predictions_si[i]):int(predictions_ei[i])+1]
                f1.append(f1_score(pred_tokens, truth_tokens))
                p_tokens.append(pred_tokens)
            idx = np.argmax(f1)
            if answer_si[i][idx] == int(predictions_si[i]) and answer_ei[i][idx] == int(predictions_ei[i]):
                EM += 1.0
            F1 += f1[idx]
            pred_data[ID[i]] = ' '.join(p_tokens[idx])
        print(batch_no, 'EM', '{:.5f}'.format(EM/(batch_no+1)/modOpts['batch_size']), 'F1', F1/(batch_no+1)/modOpts['batch_size'])
    print("---------------")
    print("EM", EM/dp.num_samples)
    print("F1", F1/dp.num_samples)
    with open('Results/'+args.model+'_prediction.txt', 'w') as outfile:
        json.dump(pred_data, outfile)

def f1_score(prediction, ground_truth):
    from collections import Counter

    prediction_tokens = prediction
    ground_truth_tokens = ground_truth
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

if __name__ == '__main__':
    run()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# R-NET in Tensorflow

* This repository is a Tensorflow implementation of [R-NET](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf), a neural network designed to solve the Question Answering (QA) task.
* This implementation is specifically designed for [SQuAD](stanford-qa.com), a large-scale dataset that has recently drawn wide attention in the field of QA.
* If you have any questions, contact b03902012@ntu.edu.tw.

## Updates and Acknowledgements

### 17.12.30
- As some have requested recently, I have released a set of trained model weights. Details can be found in the Current Results section below.

### 17.12.12
- I'd like to thank _Fan Yang_ for pointing out several bugs in model evaluation. First, the model to be evaluated needs to be explicitly specified when executing the `evaluate.py` program. See the Usage section below. I also fixed some problems with character loading.

### 17.11.10
- I'd like to thank _Elías Jónsson_ for pointing out a problem in the mapping between characters and their indices. Previously, the indices for training and testing (dev set) were inconsistent. In fact, no mapping should be constructed for testing at all: if, during testing, the machine sees a character it has not seen in the training set, it should mark it as OOV. The table is therefore now constructed using only the training set, and is used in both training and testing. A minimal sketch of this policy is shown below.
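  For illustration, a minimal sketch of the idea (the table construction mirrors `get_char_vocab` in `preprocess.py`, which reserves index 0; the explicit `oov_idx` fallback shown here is an assumption for illustration, not the exact repository code):
```
def build_char_table(train_words):
    """Character-to-index table built from the *training* vocabulary only."""
    char2idx = {' ': 0}  # index 0 reserved, as in preprocess.py
    for word in train_words:
        for ch in word:
            char2idx.setdefault(ch, len(char2idx))
    return char2idx

def char_index(char2idx, ch, oov_idx=0):
    """Characters unseen during training fall back to an OOV index at test time."""
    return char2idx.get(ch, oov_idx)
```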
- As some are asking how to turn the character embeddings off: this can now be done by setting the `char_emb` hyperparameter in `Models/config.json` to `false`.
- I applied dropout to various components in the model, including all LSTM cells, passage & question encoding, question-passage matching, self-attention, and question representation (see the sketch below). This led to an improvement of about 3%.
- Reading the original paper more carefully, I found that the authors used Adadelta as the optimizer, and that 3 layers of bi-GRU were used to encode both passage and question. Changing from Adam to Adadelta led to roughly 1% improvement. In my experiments, stacking layers increased the number of epochs required for convergence, and 2 layers performed better than 3. Details are given in the Current Results section.
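For reference, this is roughly how the dropout wrapping and the optimizer switch look in code (a sketch mirroring `DropoutWrappedGRUCell` in `Models/model_rnet.py` and `train_op` in `rnet.py`; TF 1.x `tf.contrib` API):
```
import tensorflow as tf

def dropout_wrapped_cell(hidden_size, in_keep_prob):
    # Input dropout on every recurrent cell; despite the historical name in
    # the repo, a BasicLSTMCell is currently used instead of a GRUCell.
    cell = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=1.0, state_is_tuple=True)
    return tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=in_keep_prob)

# Adam -> Adadelta, as in the paper (see rnet.py):
# train_op = tf.train.AdadeltaOptimizer(1.0, rho=0.95, epsilon=1e-06).minimize(loss)
```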

## Dependency
* Python 3.6
* Tensorflow-gpu 1.2.1
* Numpy 1.13.1
* NLTK

## Usage
1. First we need to download [SQuAD](stanford-qa.com) as well as the pre-trained [GloVe](nlp.stanford.edu/projects/glove/) word embeddings. This should take roughly 30 minutes, depending on network speed.
```
cd Data
sh download.sh
cd ..
```
2. Data preprocessing, including tokenizing and collection of pre-trained word embeddings, takes about 15 minutes.
Two kinds of files, `{data/shared}_{train/dev}.json`, will be generated and stored in `Data`.
    * shared: the original and tokenized articles, GloVe word embeddings and character dictionaries.
    * data: the question ID, corresponding article id, tokenized question and the answer indices.
```
python preprocess.py --gen_seq
```
3. Train R-NET by simply executing the following. The program will
    1. Read the training data, and then build the model. This should take around an hour, depending on hardware.
    2. Train for 12 epochs, by default.

Hyper-parameters can be specified in `Models/config.json`. The training procedure, including the mean loss and mean EM score for each epoch, will be stored in `Results/rnet_training_result.txt`. Note that the scores appearing during training can be lower than the scores given by the official evaluator. The models will be stored in `Models/save/`.
```
python rnet.py
```

4. The evaluation of the model on the dev set can be generated by executing the following. The result will be stored in `Results/rnet_prediction.txt`. Note that the scores appearing during evaluation can be lower than the scores given by the official evaluator.
**Note:** The model to be evaluated has to be specified explicitly. For example, if 12 epochs were trained (the default), then `Models/save/` should contain the 5 most recent saved models (`tf.train.Saver` keeps 5 checkpoints by default):
```
rnet_model8.ckpt.meta
rnet_model8.ckpt.data-00000-of-00001
rnet_model8.ckpt.index
...
rnet_model11.ckpt.meta
rnet_model11.ckpt.data-00000-of-00001
rnet_model11.ckpt.index
rnet_model_final.ckpt.meta
rnet_model_final.ckpt.data-00000-of-00001
rnet_model_final.ckpt.index
```
Here, `rnet_model11` and `rnet_model_final` are the same. If, for example, one wishes to evaluate `rnet_model_final`, the following would do it:
```
python evaluate.py --model_path Models/save/rnet_model_final.ckpt
```

5. To get the final official score, you need to use the official evaluation script, which is in the `Results` directory.
```
python Results/evaluate-v1.1.py Data/dev-v1.1.json Results/rnet_prediction.txt
```

## Current Results

| Model | Dev EM Score | Dev F1 Score |
| -------- | -------- | -------- |
| Original Paper | 71.1 | 79.5 |
| My (Adadelta, 2 layer, dropouts, w/o char emb) | 62.6 | 71.5 |
| My (Adadelta, 1 layer, dropouts, w/o char emb) | 61.0 | 70.3 |
| My (Adam, 1 layer, dropouts, w/o char emb) | 60.8 | 70.5 |
| My (Adam, 1 layer, w/o char emb) | 57.8 | 67.9 |
| My (Adam, 1 layer, w/ char emb) | 60.1 | 68.9 |

You can find the [current leaderboard](https://rajpurkar.github.io/SQuAD-explorer/) and compare with other models.

### Trained model weights
As some have requested recently, a set of trained model weights can be downloaded [here](http://slam.iis.sinica.edu.tw/demo/RNet/release.zip). Unzip it and you will find 3 files. Put the 3 files in `Models/save/` and evaluate on them by following the instructions above. This set of parameters was obtained by training for 28 epochs with the current settings, and achieved 62.2/71.5 on the dev set. I didn't save each set of model weights when I originally ran the experiments, so I reran the experiment, causing a slight degradation compared with the best score in the table above. The difference may simply come from random initialization, so feel free to train your own model weights.

## Discussion

### Reproduction

As shown above, I still fail to reproduce the results. A few technical details concern me:

1. Data preprocessing. I have tried two preprocessing approaches: the one used in the implementation of [Match-LSTM](https://github.com/shuohangwang/SeqMatchSeq/blob/master/preprocess.py), and the one used in the implementation of [Bi-DAF](https://github.com/allenai/bi-att-flow/blob/master/squad/prepro.py). While the latter includes lots of reasonable processing, I chose the former empirically since it yields better performance.
2. As pointed out in another [implementation of R-NET in Keras](https://github.com/YerevaNN/R-NET-in-Keras),
> The first formula in (11) of the report contains a strange summand `W_v^Q V_r^Q`. Both tensors are trainable and are not used anywhere else in the network. We have replaced this product with a single trainable vector.

However, instead of replacing the product with a single trainable vector, I followed the notation and kept both tensors (see the sketch after this list).
3. Variable sharing. The notation in the original paper was very confusing to me. For example, `W_v^P` appears in both equations (4) and (8). In my opinion, they should not be the same variable, since they multiply vectors from entirely different spaces. As a result, I treat them as different variables.
4. Hyper-parameter ambiguity. Some hyper-parameters weren't specified in the original paper, including the character embedding matrix dimension, the truncation of articles and questions, and the length of the answer span during inference. I set my own hyper-parameters empirically, mostly following the settings of [Match-LSTM](https://arxiv.org/pdf/1608.07905.pdf) and [Bi-DAF](https://arxiv.org/pdf/1611.01603.pdf).
5. Any other implementation mistakes and bugs.
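To make item 2 concrete, here is a minimal NumPy sketch of the `r_Q` attention pooling with both tensors kept (random made-up values; dimensions follow `Models/config.json`, names mirror `W_VrQ`/`W_vQ` in `Models/model_rnet.py`; NumPy is used instead of TensorFlow for brevity):
```
import numpy as np

state, q_len = 75, 30
u_Q   = np.random.randn(q_len, 2 * state)   # encoded question (one example)
W_ruQ = np.random.randn(2 * state, 2 * state)
V_rQ  = np.random.randn(q_len, state)       # trainable, used nowhere else
W_vQ  = np.random.randn(state, 2 * state)   # trainable, used nowhere else
v     = np.random.randn(2 * state)

# s_j = v^T tanh(W_ruQ u_j^Q + W_vQ V_r^Q); both products kept, per the paper
s = np.tanh(u_Q @ W_ruQ + V_rQ @ W_vQ) @ v
a = np.exp(s) / np.exp(s).sum()             # softmax over question positions
r_Q = a @ u_Q                               # attention-pooled question vector
print(r_Q.shape)                            # (150,)
```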
### OOM

The full model could not be trained on an NVIDIA Tesla K40m with 12GiB memory; TensorFlow will report serious OOM problems. There are a few possible solutions.

1. Run with CPU. This can be achieved by assigning a device mask on the command line as follows. In fact, the results shown in the previous section were generated by a model trained on CPU. However, this makes training extremely slow; in my experience, it can cost roughly _24 hours per epoch_.
```
CUDA_VISIBLE_DEVICES="" python rnet.py
```
2. Reduce hyperparameters. Modifying these parameters might help:
    * `p_length`
    * Word embedding dimension: change from 300d GloVe vectors to 100d.

3. Don't use character embeddings. According to [Bi-DAF](https://arxiv.org/pdf/1611.01603.pdf), character embeddings don't help much. However, Bi-DAF uses 1D-CNNs to generate the character embeddings, while R-NET uses RNNs. As shown in the previous section, the performance dropped by 2% without them. Further investigation is needed for this part.
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
# coding=utf-8
import os
import re
from collections import Counter
import json
import numpy as np

def sublist_exists(sl, l):
    n = len(sl)
    return any((sl == l[i:i+n]) for i in range(len(l)-n+1))

def sublist_idx(sl, l):
    sll = len(sl)
    for ind in (i for i, e in enumerate(l) if e == sl[0]):
        if l[ind:ind+sll] == sl:
            return ind, ind+sll

class DataProcessor:
    def __init__(self, data_type, opts):
        self.data_type = data_type
        self.opts = opts
        data_path = os.path.join('Data', "data_{}.json".format(data_type))
        shared_path = os.path.join('Data', "shared_{}.json".format(data_type))
        idx_path = os.path.join('Data', "idx_table.json")
        self.data = self.load_data(data_path)
        self.shared = self.load_data(shared_path)
        self.idx_table = self.load_data(idx_path)

        # paragraph length filter (train only): drop samples whose answer end
        # index falls beyond the truncated paragraph length
        if self.data_type == 'train':
            self.data = [sample for sample in self.data if sample['answer'][0][-1] < self.opts['p_length']]
        self.num_samples = self.get_data_size()
        print("Loaded {} examples from {}".format(self.num_samples, data_type))

    def load_data(self, path):
        with open(path, 'r') as fh:
            data = json.load(fh)
        return data

    def get_data_size(self):
        return len(self.data)

    def get_training_batch(self, batch_no):
        opts = self.opts
        si = (batch_no * opts['batch_size'])
        ei = min(self.num_samples, si + opts['batch_size'])
        n = ei - si

        tensor_dict = {}
        paragraph = np.zeros((n, opts['p_length'], opts['word_emb_dim']))
        question = np.zeros((n, opts['q_length'], opts['word_emb_dim']))
        paragraph_c = np.zeros((n, opts['p_length'], opts['char_max_length']))
        question_c = np.zeros((n, opts['q_length'], opts['char_max_length']))
        answer_si = np.zeros((n, opts['p_length']))
        answer_ei = np.zeros((n, opts['p_length']))
        idxs = []

        count = 0
        for i in range(si, ei):
            idxs.append(i)
            sample = self.data[i]
            aipi = sample['aipi']
            p = self.shared['paragraphs'][aipi[0]][aipi[1]]
            p_sent = self.shared['paragraphs_sent'][aipi[0]][aipi[1]]
            q = sample['question']

            for j in range(len(p)):
                if j >= opts['p_length']:
                    break
                try:
                    paragraph[count][j][:opts['word_emb_dim']] = self.shared['glove'+opts['glove']][p[j]]
                except KeyError:
                    pass
                for k, char in enumerate(p[j]):
                    paragraph_c[count][j][k] = self.idx_table['char2idx'][char]

            for j in range(len(q)):
                if j >= opts['q_length']:
                    break
                try:
                    question[count][j] = self.shared['glove'+opts['glove']][q[j]]
                except KeyError:
                    pass
                for k, char in enumerate(q[j]):
                    question_c[count][j][k] = self.idx_table['char2idx'][char]

            # one-hot start/end labels (renamed from si/ei so the batch
            # boundaries above are no longer shadowed inside the loop)
            ans_si, ans_ei = sample['answer'][0][0], sample['answer'][0][-1]
            answer_si[count][ans_si] = 1.0
            answer_ei[count][ans_ei] = 1.0

            count += 1

        tensor_dict['paragraph'] = paragraph
        tensor_dict['question'] = question
        tensor_dict['paragraph_c'] = paragraph_c
        tensor_dict['question_c'] = question_c
        tensor_dict['answer_si'] = answer_si
        tensor_dict['answer_ei'] = answer_ei
        return tensor_dict, idxs

    def get_testing_batch(self, batch_no):
        opts = self.opts
        si = (batch_no * opts['batch_size'])
        ei = min(self.num_samples, si + opts['batch_size'])
        n = ei - si

        paragraph = np.zeros((opts['batch_size'], opts['p_length'], opts['word_emb_dim']))
        question = np.zeros((opts['batch_size'], opts['q_length'], opts['word_emb_dim']))
        paragraph_c = np.zeros((opts['batch_size'], opts['p_length'], opts['char_max_length']))
        question_c = np.zeros((opts['batch_size'], opts['q_length'], opts['char_max_length']))
        context = [None for _ in range(n)]
        context_original = [None for _ in range(n)]
        answer_si = [None for _ in range(n)]
        answer_ei = [None for _ in range(n)]
        ID = [None for _ in range(n)]

        count = 0
        for i in range(si, ei):
            sample = self.data[i]
            aipi = sample['aipi']
            p = self.shared['paragraphs'][aipi[0]][aipi[1]]
            p_o = self.shared['paragraphs_original'][aipi[0]][aipi[1]]
            q = sample['question']

            context[count] = p
            context_original[count] = p_o
            for j in range(len(p)):
                if j >= opts['p_length']:
                    break
                try:
                    paragraph[count][j][:opts['word_emb_dim']] = self.shared['glove'+opts['glove']][p[j]]
                    for k, char in enumerate(p[j]):
                        paragraph_c[count][j][k] = self.idx_table['char2idx'][char]
                except KeyError:
                    #print('{} not in GloVe'.format(p[j]))
                    pass

            for j in range(len(q)):
                if j >= opts['q_length']:
                    break
                try:
                    question[count][j] = self.shared['glove'+opts['glove']][q[j]]
                    for k, char in enumerate(q[j]):
                        question_c[count][j][k] = self.idx_table['char2idx'][char]
                except KeyError:
                    pass
                    #print('{} not in GloVe'.format(triplet['question'][j].lower()))

            answer_si[count] = [ans[0] for ans in sample['answer']]
            answer_ei[count] = [ans[-1] for ans in sample['answer']]
            ID[count] = sample['id']
            count += 1

        return context, context_original, paragraph, question, paragraph_c, question_c, answer_si, answer_ei, ID, n

def get_word2vec(glove_path, word_counter):
    word2vec_dict = {}
    with open(glove_path, 'r', encoding='utf-8') as fh:
        for line in fh:
            array = line.lstrip().rstrip().split(" ")
            word = array[0]
            vector = list(map(float, array[1:]))
            # also index common capitalization variants of each GloVe entry
            if word in word_counter:
                word2vec_dict[word] = vector
            if word.capitalize() in word_counter:
                word2vec_dict[word.capitalize()] = vector
            if word.lower() in word_counter:
                word2vec_dict[word.lower()] = vector
            if word.upper() in word_counter:
                word2vec_dict[word.upper()] = vector
    print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
    return word2vec_dict
def get_char_vocab(word_counter):
    char2idx = {' ': 0}
    idx2char = [' ']
    max_word_length = 0
    word_count = [0 for _ in range(37)]  # histogram of word lengths, up to char_max_length

    for word in word_counter:
        word_count[len(word)-1] += 1
        max_word_length = max(max_word_length, len(word))
        for char in word:
            if not char in char2idx:
                idx2char.append(char)
                char2idx[char] = len(idx2char) - 1
    print('max word length:', max_word_length)
    print(len(char2idx), 'chars read')
    print(word_count)

    return char2idx, idx2char

def read_local_word2vec():
    # fixed: was os.join.path, and the function returned the path instead of the data
    local_w2v_dir = os.path.join('Data', 'local_w2v', 'local_w2v.json')
    local_w2v = json.load(open(local_w2v_dir, 'r'))
    return local_w2v

def generate_seq(data_type):
    import nltk
    nltk.download('punkt')
    from nltk.tokenize import word_tokenize, sent_tokenize
    #def word_tokenize(tokens):
    #    return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]

    fpr = open(os.path.join('Data', data_type+'-v1.1.json'), 'r')
    source_data = json.load(fpr)

    data = []
    articles = []
    articles_sent = []
    articles_original = []
    articles_original_sent = []
    word_counter = Counter()

    fpw = open(os.path.join('Data', 'data_'+data_type+".json"), 'w')
    for ai, article in enumerate(source_data["data"]):
        if ai % 20 == 0:
            print('processing article', ai)
        paragraphs = []
        paragraphs_sent = []
        paragraphs_original = []
        paragraphs_original_sent = []

        for pi, p in enumerate(article["paragraphs"]):
            context = p["context"]
            #context = context.replace("''", '" ')
            #context = context.replace("``", '" ')
            paragraph = word_tokenize(context)
            context_sent = sent_tokenize(context)
            paragraph_sent = [word_tokenize(sent) for sent in context_sent]
            paragraphs.append(paragraph)  # word level paragraph
            paragraphs_sent.append(paragraph_sent)  # sentence_word level paragraph
            paragraphs_original.append(context)  # original paragraph
            paragraphs_original_sent.append(context_sent)  # sentence_tokenized original paragraph
            for w in paragraph:
                word_counter[w] += len(p['qas'])

            for qa in p["qas"]:
                question = word_tokenize(qa["question"])
                answers = []
                answers_sent = []
                for w in question:
                    word_counter[w] += 1

                for a in qa['answers']:
                    answer = a['text'].strip()
                    answer_start = int(a['answer_start'])
                    # add '.' here, just because NLTK is not good enough in some cases
                    answer_words = word_tokenize(answer+'.')
                    if answer_words[-1] == '.':
                        answer_words = answer_words[:-1]
                    else:
                        answer_words = word_tokenize(answer)

                    # word level
                    prev_context_words = word_tokenize(context[:answer_start])
                    left_context_words = word_tokenize(context[answer_start:])
                    pos_list = []
                    for i in range(len(answer_words)):
                        if i < len(left_context_words):
                            pos_list.append(len(prev_context_words)+i)
                    assert(len(pos_list) > 0)

                    # sent level
                    # [sent_idx, word_idx]
                    for idx, sent in enumerate(paragraph_sent):
                        if sublist_exists(answer_words, sent):
                            sent_idx = idx
                            try:
                                si, ei = sublist_idx(answer_words, sent)
                            except:
                                print(answer)
                                print(answer_words)
                                print(sent)
                                exit()
                            pos_list_sent = [[idx, i] for i in range(si, ei)]
                            break

                    answers.append(pos_list)
                    answers_sent.append(pos_list_sent)

                sample = {'aipi': [ai, pi],
                          'question': question,
                          'answer': answers,
                          'answer_sent': answers_sent,
                          'id': str(qa['id']),
                          }
                data.append(sample)
        articles.append(paragraphs)
        articles_sent.append(paragraphs_sent)
        articles_original.append(paragraphs_original)
        articles_original_sent.append(paragraphs_original_sent)

    w2v_100 = get_word2vec('Data/glove.6B.100d.txt', word_counter)
    w2v_300 = get_word2vec('Data/glove.840B.300d.txt', word_counter)
    char2idx, idx2char = get_char_vocab(word_counter)

    print(len(data))
    print(len(articles), len(articles_sent))
    shared = {'paragraphs': articles,
              'paragraphs_sent': articles_sent,
              'paragraphs_original': articles_original,
              'paragraphs_original_sent': articles_original_sent,
              'glove100': w2v_100,
              'glove300': w2v_300,
              }
    print('Saving...')
    with open(os.path.join('Data', 'data_'+data_type+".json"), 'w') as f:
        json.dump(data, f)
    with open(os.path.join('Data', 'shared_'+data_type+".json"), 'w') as f:
        json.dump(shared, f)

    if data_type == 'train':
        idx_table = {'char2idx': char2idx,
                     'idx2char': idx2char,
                     }
        with open(os.path.join('Data', 'idx_table.json'), 'w') as f:
            json.dump(idx_table, f)

    print('SQuAD '+data_type+' preprocessing finished!')
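# Worked example of the word-level answer localization in generate_seq above
# (hypothetical sentence, shown for illustration only):
#   context                      = "The quick brown fox jumps over the lazy dog."
#   answer, answer_start         = "brown fox", 10
#   word_tokenize(context[:10])  -> ['The', 'quick']   (2 tokens precede the answer)
#   word_tokenize("brown fox")   -> ['brown', 'fox']
#   pos_list                     -> [2, 3]             (token span of the answer)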
def read_data(data_type, opts):
    return DataProcessor(data_type, opts)

def read_data_old(filename, PATH, p_thres):
    import json
    from pprint import pprint

    data = []
    if filename == 'train':
        with open(PATH, 'r', encoding='utf-8') as f:
            for l in f:
                triplet = l.strip('\n').split("\t")
                paragraph = triplet[0].split(' ')
                question = triplet[1].split(' ')
                d = {'paragraph': paragraph,
                     'question': question,
                     'answer_si': int(triplet[2].split(' ')[0]),
                     'answer_ei': int(triplet[2].split(' ')[-1]),
                     }
                if d['answer_ei'] >= p_thres:
                    continue
                data.append(d)
    else:
        with open(PATH, 'r', encoding='utf-8') as f:
            for l in f:
                triplet = l.strip('\n').split("\t")
                paragraph = triplet[0].split(' ')
                question = triplet[1].split(' ')
                try:
                    answer_si = [int(seq.split(' ')[0]) for seq in triplet[2:-1]]
                    answer_ei = [int(seq.split(' ')[-1]) for seq in triplet[2:-1]]
                except:
                    print(triplet[2:])
                    exit()
                d = {'paragraph': paragraph,
                     'question': question,
                     'answer_si': answer_si,
                     'answer_ei': answer_ei,
                     'ID': triplet[-1]
                     }
                data.append(d)
    print(filename, '{} triplets read'.format(len(data)))
    return data

def run():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--gen_seq', action='store_true', help='original data to seq')
    args = parser.parse_args()

    if args.gen_seq:
        print('Generating Sequences...')
        generate_seq('train')
        generate_seq('dev')

if __name__ == "__main__":
    run()
--------------------------------------------------------------------------------
/Models/model_rnet.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import math

class R_NET:
    def random_weight(self, dim_in, dim_out, name=None, stddev=1.0):
        return tf.Variable(tf.truncated_normal([dim_in, dim_out], stddev=stddev/math.sqrt(float(dim_in))), name=name)

    def random_bias(self, dim, name=None):
        return tf.Variable(tf.truncated_normal([dim]), name=name)

    def random_scalar(self, name=None):
        return tf.Variable(0.0, name=name)

    def DropoutWrappedGRUCell(self, hidden_size, in_keep_prob, name=None):
        # NOTE: despite the name, a BasicLSTMCell is currently used; the GRU
        # variant is kept commented out below.
        # cell = tf.contrib.rnn.GRUCell(hidden_size)
        cell = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=1.0, state_is_tuple=True)
        cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=in_keep_prob)
        return cell

    def mat_weight_mul(self, mat, weight):
        # [batch_size, n, m] * [m, p] = [batch_size, n, p]
        mat_shape = mat.get_shape().as_list()
        weight_shape = weight.get_shape().as_list()
        assert(mat_shape[-1] == weight_shape[0])
        mat_reshape = tf.reshape(mat, [-1, mat_shape[-1]])  # [batch_size * n, m]
        mul = tf.matmul(mat_reshape, weight)  # [batch_size * n, p]
        return tf.reshape(mul, [-1, mat_shape[1], weight_shape[-1]])

    def __init__(self, options):
        with tf.device('/cpu:0'):
            self.options = options

            # Char embeddings
            if options['char_emb']:
                self.char_emb_mat = self.random_weight(self.options['char_vocab_size'],
                    self.options['char_emb_mat_dim'], name='char_emb_matrix')

            # Weights
            self.W_uQ = self.random_weight(2 * options['state_size'], options['state_size'], name='W_uQ')
            self.W_uP = self.random_weight(2 * options['state_size'], options['state_size'], name='W_uP')
            self.W_vP = self.random_weight(options['state_size'], options['state_size'], name='W_vP')
            self.W_g_QP = self.random_weight(4 * options['state_size'], 4 * options['state_size'], name='W_g_QP')
            self.W_smP1 = self.random_weight(options['state_size'], options['state_size'], name='W_smP1')
            self.W_smP2 = self.random_weight(options['state_size'], options['state_size'], name='W_smP2')
            self.W_g_SM = self.random_weight(2 * options['state_size'], 2 * options['state_size'], name='W_g_SM')
            self.W_ruQ = self.random_weight(2 * options['state_size'], 2 * options['state_size'], name='W_ruQ')
            self.W_vQ = self.random_weight(options['state_size'], 2 * options['state_size'], name='W_vQ')
            self.W_VrQ = self.random_weight(options['q_length'], options['state_size'], name='W_VrQ')  # has same size as u_Q
            self.W_hP = self.random_weight(2 * options['state_size'], options['state_size'], name='W_hP')
            self.W_ha = self.random_weight(2 * options['state_size'], options['state_size'], name='W_ha')

            # Biases
            self.B_v_QP = self.random_bias(options['state_size'], name='B_v_QP')
            self.B_v_SM = self.random_bias(options['state_size'], name='B_v_SM')
            self.B_v_rQ = self.random_bias(2 * options['state_size'], name='B_v_rQ')
            self.B_v_ap = self.random_bias(options['state_size'], name='B_v_ap')

            # QP_match
            with tf.variable_scope('QP_match') as scope:
                self.QPmatch_cell = self.DropoutWrappedGRUCell(self.options['state_size'], self.options['in_keep_prob'])
                self.QPmatch_state = self.QPmatch_cell.zero_state(self.options['batch_size'], dtype=tf.float32)

            # Ans Ptr
            with tf.variable_scope('Ans_ptr') as scope:
                self.AnsPtr_cell = self.DropoutWrappedGRUCell(2 * self.options['state_size'], self.options['in_keep_prob'])

    def build_model(self):
        opts = self.options

        # placeholders
        paragraph = tf.placeholder(tf.float32, [opts['batch_size'], opts['p_length'], opts['emb_dim']])
        question = tf.placeholder(tf.float32, [opts['batch_size'], opts['q_length'], opts['emb_dim']])
        answer_si = tf.placeholder(tf.float32, [opts['batch_size'], opts['p_length']])
        answer_ei = tf.placeholder(tf.float32, [opts['batch_size'], opts['p_length']])
        if opts['char_emb']:
            paragraph_c = tf.placeholder(tf.int32, [opts['batch_size'], opts['p_length'], opts['char_max_length']])
            question_c = tf.placeholder(tf.int32, [opts['batch_size'], opts['q_length'], opts['char_max_length']])

        print('Question and Passage Encoding')
        if opts['char_emb']:
            # char embedding -> word level char embedding
            paragraph_c_emb = tf.nn.embedding_lookup(self.char_emb_mat, paragraph_c)  # [batch_size, p_length, char_max_length, char_emb_dim]
            question_c_emb = tf.nn.embedding_lookup(self.char_emb_mat, question_c)
            paragraph_c_list = [tf.squeeze(w, [1]) for w in tf.split(paragraph_c_emb, opts['p_length'], axis=1)]
            question_c_list = [tf.squeeze(w, [1]) for w in tf.split(question_c_emb, opts['q_length'], axis=1)]

            c_Q = []
            c_P = []
            with tf.variable_scope('char_emb_rnn') as scope:
                char_emb_fw_cell = self.DropoutWrappedGRUCell(opts['emb_dim'], 1.0)
                char_emb_bw_cell = self.DropoutWrappedGRUCell(opts['emb_dim'], 1.0)
                for t in range(opts['q_length']):
                    unstacked_q_c = tf.unstack(question_c_list[t], opts['char_max_length'], 1)
                    if t > 0:
                        tf.get_variable_scope().reuse_variables()
                    q_c_e_outputs, q_c_e_final_fw, q_c_e_final_bw = tf.contrib.rnn.static_bidirectional_rnn(
                        char_emb_fw_cell, char_emb_bw_cell, unstacked_q_c, dtype=tf.float32, scope='char_emb')
                    c_q_t = tf.concat([q_c_e_final_fw[1], q_c_e_final_bw[1]], 1)
                    c_Q.append(c_q_t)
                for t in range(opts['p_length']):
                    unstacked_p_c = tf.unstack(paragraph_c_list[t], opts['char_max_length'], 1)
                    p_c_e_outputs, p_c_e_final_fw, p_c_e_final_bw = tf.contrib.rnn.static_bidirectional_rnn(
                        char_emb_fw_cell, char_emb_bw_cell, unstacked_p_c, dtype=tf.float32, scope='char_emb')
                    c_p_t = tf.concat([p_c_e_final_fw[1], p_c_e_final_bw[1]], 1)
                    c_P.append(c_p_t)
            c_Q = tf.stack(c_Q, 1)
            c_P = tf.stack(c_P, 1)
            print('c_Q', c_Q)
            print('c_P', c_P)

            # Concat e and c
            eQcQ = tf.concat([question, c_Q], 2)
            ePcP = tf.concat([paragraph, c_P], 2)
        else:
            eQcQ = question
            ePcP = paragraph

        unstacked_eQcQ = tf.unstack(eQcQ, opts['q_length'], 1)
        unstacked_ePcP = tf.unstack(ePcP, opts['p_length'], 1)
        with tf.variable_scope('encoding') as scope:
            stacked_enc_fw_cells = [self.DropoutWrappedGRUCell(opts['state_size'], opts['in_keep_prob']) for _ in range(2)]
            stacked_enc_bw_cells = [self.DropoutWrappedGRUCell(opts['state_size'], opts['in_keep_prob']) for _ in range(2)]
            q_enc_outputs, q_enc_final_fw, q_enc_final_bw = tf.contrib.rnn.stack_bidirectional_rnn(
                stacked_enc_fw_cells, stacked_enc_bw_cells, unstacked_eQcQ, dtype=tf.float32, scope='context_encoding')
            tf.get_variable_scope().reuse_variables()
            p_enc_outputs, p_enc_final_fw, p_enc_final_bw = tf.contrib.rnn.stack_bidirectional_rnn(
                stacked_enc_fw_cells, stacked_enc_bw_cells, unstacked_ePcP, dtype=tf.float32, scope='context_encoding')
            u_Q = tf.stack(q_enc_outputs, 1)
            u_P = tf.stack(p_enc_outputs, 1)
        u_Q = tf.nn.dropout(u_Q, opts['in_keep_prob'])
        u_P = tf.nn.dropout(u_P, opts['in_keep_prob'])
        print(u_Q)
        print(u_P)
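
        # Gated attention-based recurrent network (question-passage matching).
        # For each passage position t (paraphrasing the paper's equations):
        #   s_j   = v^T tanh(W_uQ u_j^Q + W_uP u_t^P + W_vP v_{t-1}^P)
        #   a     = softmax(s),  c_t = sum_j a_j u_j^Q
        #   g     = sigmoid(W_g [u_t^P, c_t])          (input gate)
        #   v_t^P = RNN(v_{t-1}^P, g .* [u_t^P, c_t])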
        v_P = []
        print('Question-Passage Matching')
        for t in range(opts['p_length']):
            # Calculate c_t
            W_uQ_u_Q = self.mat_weight_mul(u_Q, self.W_uQ)  # [batch_size, q_length, state_size]
            tiled_u_tP = tf.concat([tf.reshape(u_P[:, t, :], [opts['batch_size'], 1, -1])] * opts['q_length'], 1)
            W_uP_u_tP = self.mat_weight_mul(tiled_u_tP, self.W_uP)

            if t == 0:
                tanh = tf.tanh(W_uQ_u_Q + W_uP_u_tP)
            else:
                tiled_v_t1P = tf.concat([tf.reshape(v_P[t-1], [opts['batch_size'], 1, -1])] * opts['q_length'], 1)
                W_vP_v_t1P = self.mat_weight_mul(tiled_v_t1P, self.W_vP)
                tanh = tf.tanh(W_uQ_u_Q + W_uP_u_tP + W_vP_v_t1P)
            s_t = tf.squeeze(self.mat_weight_mul(tanh, tf.reshape(self.B_v_QP, [-1, 1])))
            a_t = tf.nn.softmax(s_t, 1)
            tiled_a_t = tf.concat([tf.reshape(a_t, [opts['batch_size'], -1, 1])] * 2 * opts['state_size'], 2)
            c_t = tf.reduce_sum(tf.multiply(tiled_a_t, u_Q), 1)  # [batch_size, 2 * state_size]

            # gate
            u_tP_c_t = tf.expand_dims(tf.concat([tf.squeeze(u_P[:, t, :]), c_t], 1), 1)
            g_t = tf.sigmoid(self.mat_weight_mul(u_tP_c_t, self.W_g_QP))
            u_tP_c_t_star = tf.squeeze(tf.multiply(u_tP_c_t, g_t))

            # QP_match
            with tf.variable_scope("QP_match"):
                if t > 0:
                    tf.get_variable_scope().reuse_variables()
                output, self.QPmatch_state = self.QPmatch_cell(u_tP_c_t_star, self.QPmatch_state)
            v_P.append(output)
        v_P = tf.stack(v_P, 1)
        v_P = tf.nn.dropout(v_P, opts['in_keep_prob'])
        print('v_P', v_P)

        print('Self-Matching Attention')
        SM_star = []
        for t in range(opts['p_length']):
            # Calculate s_t
            W_p1_v_P = self.mat_weight_mul(v_P, self.W_smP1)  # [batch_size, p_length, state_size]
            tiled_v_tP = tf.concat([tf.reshape(v_P[:, t, :], [opts['batch_size'], 1, -1])] * opts['p_length'], 1)
            W_p2_v_tP = self.mat_weight_mul(tiled_v_tP, self.W_smP2)

            tanh = tf.tanh(W_p1_v_P + W_p2_v_tP)
            s_t = tf.squeeze(self.mat_weight_mul(tanh, tf.reshape(self.B_v_SM, [-1, 1])))
            a_t = tf.nn.softmax(s_t, 1)
            tiled_a_t = tf.concat([tf.reshape(a_t, [opts['batch_size'], -1, 1])] * opts['state_size'], 2)
            c_t = tf.reduce_sum(tf.multiply(tiled_a_t, v_P), 1)  # [batch_size, state_size]

            # gate
            v_tP_c_t = tf.expand_dims(tf.concat([tf.squeeze(v_P[:, t, :]), c_t], 1), 1)
            g_t = tf.sigmoid(self.mat_weight_mul(v_tP_c_t, self.W_g_SM))
            v_tP_c_t_star = tf.squeeze(tf.multiply(v_tP_c_t, g_t))
            SM_star.append(v_tP_c_t_star)
        SM_star = tf.stack(SM_star, 1)
        unstacked_SM_star = tf.unstack(SM_star, opts['p_length'], 1)
        with tf.variable_scope('Self_match') as scope:
            SM_fw_cell = self.DropoutWrappedGRUCell(opts['state_size'], opts['in_keep_prob'])
            SM_bw_cell = self.DropoutWrappedGRUCell(opts['state_size'], opts['in_keep_prob'])
            SM_outputs, SM_final_fw, SM_final_bw = tf.contrib.rnn.static_bidirectional_rnn(SM_fw_cell, SM_bw_cell, unstacked_SM_star, dtype=tf.float32)
        h_P = tf.stack(SM_outputs, 1)
        h_P = tf.nn.dropout(h_P, opts['in_keep_prob'])
        print('h_P', h_P)
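
        # Output layer (pointer network): the question encoding u_Q is pooled
        # into r_Q by attention; r_Q then initializes a two-step pointer RNN
        # whose attention distributions over the passage, p[0] and p[1],
        # serve as P(start) and P(end) of the answer span.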
        print('Output Layer')
        # calculate r_Q
        W_ruQ_u_Q = self.mat_weight_mul(u_Q, self.W_ruQ)  # [batch_size, q_length, 2 * state_size]
        W_vQ_V_rQ = tf.matmul(self.W_VrQ, self.W_vQ)
        W_vQ_V_rQ = tf.stack([W_vQ_V_rQ] * opts['batch_size'], 0)  # stack -> [batch_size, q_length, 2 * state_size]

        tanh = tf.tanh(W_ruQ_u_Q + W_vQ_V_rQ)
        s_t = tf.squeeze(self.mat_weight_mul(tanh, tf.reshape(self.B_v_rQ, [-1, 1])))
        a_t = tf.nn.softmax(s_t, 1)
        tiled_a_t = tf.concat([tf.reshape(a_t, [opts['batch_size'], -1, 1])] * 2 * opts['state_size'], 2)
        r_Q = tf.reduce_sum(tf.multiply(tiled_a_t, u_Q), 1)  # [batch_size, 2 * state_size]
        r_Q = tf.nn.dropout(r_Q, opts['in_keep_prob'])
        print('r_Q', r_Q)

        # r_Q as initial state of ans ptr
        h_a = None
        p = [None for _ in range(2)]
        for t in range(2):
            W_hP_h_P = self.mat_weight_mul(h_P, self.W_hP)  # [batch_size, p_length, state_size]

            if t == 0:
                h_t1a = r_Q
            else:
                h_t1a = h_a
            print('h_t1a', h_t1a)
            tiled_h_t1a = tf.concat([tf.reshape(h_t1a, [opts['batch_size'], 1, -1])] * opts['p_length'], 1)
            W_ha_h_t1a = self.mat_weight_mul(tiled_h_t1a, self.W_ha)

            tanh = tf.tanh(W_hP_h_P + W_ha_h_t1a)
            s_t = tf.squeeze(self.mat_weight_mul(tanh, tf.reshape(self.B_v_ap, [-1, 1])))
            a_t = tf.nn.softmax(s_t, 1)
            p[t] = a_t

            tiled_a_t = tf.concat([tf.reshape(a_t, [opts['batch_size'], -1, 1])] * 2 * opts['state_size'], 2)
            c_t = tf.reduce_sum(tf.multiply(tiled_a_t, h_P), 1)  # [batch_size, 2 * state_size]

            if t == 0:
                AnsPtr_state = self.AnsPtr_cell.zero_state(opts['batch_size'], dtype=tf.float32)
                h_a, _ = self.AnsPtr_cell(c_t, (AnsPtr_state, r_Q))
                h_a = h_a[1]
        print(h_a)
        print(p)
        p1 = p[0]
        p2 = p[1]

        answer_si_idx = tf.cast(tf.argmax(answer_si, 1), tf.int32)
        answer_ei_idx = tf.cast(tf.argmax(answer_ei, 1), tf.int32)

        """
        ce_si = tf.nn.softmax_cross_entropy_with_logits(labels=answer_si, logits=p1)
        ce_ei = tf.nn.softmax_cross_entropy_with_logits(labels=answer_ei, logits=p2)
        print(ce_si, ce_ei)
        loss_si = tf.reduce_sum(ce_si)
        loss_ei = tf.reduce_sum(ce_ei)
        loss = loss_si + loss_ei
        """

        batch_idx = tf.reshape(tf.range(0, opts['batch_size']), [-1, 1])
        answer_si_re = tf.reshape(answer_si_idx, [-1, 1])
        batch_idx_si = tf.concat([batch_idx, answer_si_re], 1)
        answer_ei_re = tf.reshape(answer_ei_idx, [-1, 1])
        batch_idx_ei = tf.concat([batch_idx, answer_ei_re], 1)

        log_prob = tf.multiply(tf.gather_nd(p1, batch_idx_si), tf.gather_nd(p2, batch_idx_ei))
        loss = -tf.reduce_sum(tf.log(log_prob + 0.0000001))
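
        # Inference: score every span [i, i+j] with j < span_length by
        # p1[i] * p2[i+j]; the argmax over the flattened (i, j) grid is
        # decoded back into (start, end) via div/mod by span_length.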
        # Search
        prob = []
        search_range = opts['p_length'] - opts['span_length']
        for i in range(search_range):
            for j in range(opts['span_length']):
                prob.append(tf.multiply(p1[:, i], p2[:, i+j]))
        prob = tf.stack(prob, axis=1)
        argmax_idx = tf.argmax(prob, axis=1)
        # integer div/mod decode (previously a float true-division that relied
        # on downstream casts to truncate back to an index)
        pred_si = tf.floordiv(argmax_idx, opts['span_length'])
        pred_ei = pred_si + tf.mod(argmax_idx, opts['span_length'])
        correct = tf.logical_and(tf.equal(tf.cast(pred_si, tf.int64), tf.cast(answer_si_idx, tf.int64)),
                                 tf.equal(tf.cast(pred_ei, tf.int64), tf.cast(answer_ei_idx, tf.int64)))
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        input_tensors = {
            'p': paragraph,
            'q': question,
            'a_si': answer_si,
            'a_ei': answer_ei,
        }
        if opts['char_emb']:
            input_tensors.update({'pc': paragraph_c, 'qc': question_c})

        print('Model built')
        for v in tf.global_variables():
            print(v.name, v.shape)

        return input_tensors, loss, accuracy, pred_si, pred_ei
--------------------------------------------------------------------------------