├── reading_comprehension ├── external │ ├── __init__.py │ ├── squad.py │ ├── bleu.py │ └── rouge.py ├── model │ ├── __init__.py │ └── base_model.py ├── document │ ├── BiDAF.metric.png │ ├── QANet.metric.png │ ├── SQuAD.example.png │ ├── BiDAF.architecture.png │ ├── QANet.architecture.png │ └── R-Net.architecture.png ├── layer │ ├── __init__.py │ ├── pooling.py │ ├── basic.py │ ├── embedding.py │ ├── position.py │ ├── highway.py │ ├── recurrent.py │ └── dense.py ├── util │ ├── __init__.py │ ├── debug_logger.py │ ├── result_writer.py │ ├── summary_writer.py │ ├── default_util.py │ ├── eval_util.py │ ├── train_logger.py │ ├── reading_comprehension_util.py │ └── eval_logger.py ├── hparam_search.py ├── squad │ ├── evaluate-v1.py │ ├── preprocess.py │ └── evaluate-v2.py └── config │ ├── config_mrc_template.rnet.json │ ├── config_mrc_template.qanet.json │ ├── config_mrc_template.bidaf.json │ ├── config_search_template.qanet.json │ └── config_search_template.bidaf.json ├── docs ├── BiDAF.metric.png ├── QANet.metric.png ├── SQuAD.example.png ├── BiDAF.architecture.png ├── QANet.architecture.png ├── R-Net.architecture.png ├── _config.yml └── index.md ├── .gitignore ├── README.md └── LICENSE /reading_comprehension/external/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["bleu", "rouge", "squad"] -------------------------------------------------------------------------------- /reading_comprehension/model/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["base_model", "bidaf", "qanet", "rnet"] -------------------------------------------------------------------------------- /docs/BiDAF.metric.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/BiDAF.metric.png -------------------------------------------------------------------------------- /docs/QANet.metric.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/QANet.metric.png -------------------------------------------------------------------------------- /docs/SQuAD.example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/SQuAD.example.png -------------------------------------------------------------------------------- /docs/BiDAF.architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/BiDAF.architecture.png -------------------------------------------------------------------------------- /docs/QANet.architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/QANet.architecture.png -------------------------------------------------------------------------------- /docs/R-Net.architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/R-Net.architecture.png -------------------------------------------------------------------------------- /reading_comprehension/document/BiDAF.metric.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/BiDAF.metric.png -------------------------------------------------------------------------------- /reading_comprehension/document/QANet.metric.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/QANet.metric.png -------------------------------------------------------------------------------- /reading_comprehension/document/SQuAD.example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/SQuAD.example.png -------------------------------------------------------------------------------- /reading_comprehension/layer/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["basic", "embedding", "position", "convolution", "pooling", 2 | "dense", "highway", "recurrent", "attention"] -------------------------------------------------------------------------------- /reading_comprehension/document/BiDAF.architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/BiDAF.architecture.png -------------------------------------------------------------------------------- /reading_comprehension/document/QANet.architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/QANet.architecture.png -------------------------------------------------------------------------------- /reading_comprehension/document/R-Net.architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/R-Net.architecture.png -------------------------------------------------------------------------------- /reading_comprehension/util/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["debug_logger", "train_logger", "eval_logger", "summary_writer", "result_writer", 2 | "default_util", "param_util", "data_util", "model_util", "eval_util", "layer_util", "reading_comprehension_util"] -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman 2 | title: Machine Reading Comprehension 3 | description: This project is a Machine Reading Comprehension (MRC) framework in TensorFlow. It also contains re-implementations of several classic models (e.g. QANet, BiDAF, etc.) and their benchmarks on the SQuAD dataset.
4 | -------------------------------------------------------------------------------- /reading_comprehension/util/debug_logger.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os.path 3 | import time 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | __all__ = ["DebugLogger"] 9 | 10 | class DebugLogger(object): 11 | """debug logger""" 12 | def __init__(self, 13 | output_dir): 14 | """initialize debug logger""" 15 | if not tf.gfile.Exists(output_dir): 16 | tf.gfile.MakeDirs(output_dir) 17 | self.log_file = os.path.join(output_dir, "debug_{0}.log".format(time.time())) 18 | self.log_writer = codecs.getwriter("utf-8")(tf.gfile.GFile(self.log_file, mode="a")) 19 | 20 | def log_print(self, 21 | message): 22 | """log and print debugging message""" 23 | time_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) 24 | log_line = "{0}: {1}".format(time_stamp, message).encode('utf-8') 25 | self.log_writer.write("{0}\r\n".format(log_line)) 26 | print(log_line) 27 | -------------------------------------------------------------------------------- /reading_comprehension/util/result_writer.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os.path 3 | import json 4 | import time 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | __all__ = ["ResultWriter"] 10 | 11 | class ResultWriter(object): 12 | """result writer""" 13 | def __init__(self, 14 | output_dir): 15 | """initialize result writer""" 16 | self.output_dir = output_dir 17 | if not tf.gfile.Exists(self.output_dir): 18 | tf.gfile.MakeDirs(self.output_dir) 19 | 20 | def write_result(self, 21 | results, 22 | result_tag, 23 | result_id): 24 | """write result to file""" 25 | result_file = os.path.join(self.output_dir, "{0}_{1}_{2}.result".format(result_tag, result_id, time.time())) 26 | with codecs.getwriter("utf-8")(tf.gfile.GFile(result_file, mode="w")) as result_writer: 27 | for result in results: 28 | result_writer.write("{0}\r\n".format(json.dumps(result))) 29 | -------------------------------------------------------------------------------- /reading_comprehension/hparam_search.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from util.param_util import * 7 | 8 | def add_arguments(parser): 9 | parser.add_argument("--base-config", help="path to base config", required=True) 10 | parser.add_argument("--search-config", help="path to search config", required=True) 11 | parser.add_argument("--num-group", help="num of hyperparam group", type=int, required=True) 12 | parser.add_argument("--random-seed", help="random seed", type=int, required=True) 13 | parser.add_argument("--output-dir", help="path to output dir", required=True) 14 | 15 | def main(args): 16 | hyperparams = load_hyperparams(args.base_config) 17 | hyperparams_group = search_hyperparams(hyperparams, 18 | args.search_config, args.num_group, args.random_seed) 19 | create_hyperparams_file(hyperparams_group, args.output_dir) 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser() 23 | add_arguments(parser) 24 | args = parser.parse_args() 25 | main(args) 26 | -------------------------------------------------------------------------------- /reading_comprehension/util/summary_writer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow 
as tf 3 | 4 | __all__ = ["SummaryWriter"] 5 | 6 | class SummaryWriter(object): 7 | """summary writer""" 8 | def __init__(self, 9 | graph, 10 | output_dir): 11 | """initialize summary writer""" 12 | if not tf.gfile.Exists(output_dir): 13 | tf.gfile.MakeDirs(output_dir) 14 | self.summary_writer = tf.summary.FileWriter(output_dir, graph) 15 | 16 | def add_summary(self, 17 | summary, 18 | global_step): 19 | """add new summary""" 20 | self.summary_writer.add_summary(summary, global_step) 21 | 22 | def add_value_summary(self, 23 | summary_tag, 24 | summary_value, 25 | global_step): 26 | """add new value summary""" 27 | summary = tf.Summary(value=[tf.Summary.Value(tag=summary_tag, simple_value=summary_value)]) 28 | self.summary_writer.add_summary(summary, global_step) 29 | 30 | def close_writer(self): 31 | """close summary writer""" 32 | self.summary_writer.close() 33 | 34 | def reopen_writer(self): 35 | """re-open summary writer""" 36 | self.summary_writer.reopen() 37 | -------------------------------------------------------------------------------- /reading_comprehension/util/default_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | __all__ = ["EPSILON", "MAX_INT", "MIN_FLOAT", "check_tensorflow_version", "safe_exp", "get_config_proto", "get_device_spec"] 5 | 6 | EPSILON = 1e-30 7 | MAX_INT = 2147483647 8 | MIN_FLOAT = -1e30 9 | 10 | def check_tensorflow_version(): 11 | """check tensorflow version in current environment""" 12 | min_tf_version = "1.12.0" 13 | curr_tf_version = tf.__version__ 14 | if tuple(int(v) for v in curr_tf_version.split("-")[0].split(".")) < tuple(int(v) for v in min_tf_version.split(".")): # compare numerically, not lexicographically 15 | raise EnvironmentError("tensorflow version must be >= {0}".format(min_tf_version)) 16 | return curr_tf_version 17 | 18 | def safe_exp(value): 19 | """handle overflow exception for exp""" 20 | try: 21 | res = np.exp(value) 22 | except OverflowError: 23 | res = float("inf") 24 | return res 25 | 26 | def get_config_proto(log_device_placement, 27 | allow_soft_placement, 28 | allow_growth, 29 | per_process_gpu_memory_fraction): 30 | """get config proto for device setting""" 31 | config_proto = tf.ConfigProto(log_device_placement=log_device_placement, 32 | allow_soft_placement=allow_soft_placement) 33 | config_proto.gpu_options.allow_growth = allow_growth 34 | config_proto.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction 35 | 36 | return config_proto 37 | 38 | def get_device_spec(device_id, num_gpus): 39 | """get device specification""" 40 | if num_gpus == 0: 41 | device_spec = "/device:CPU:0" 42 | else: 43 | device_spec = "/device:GPU:{0}".format(device_id % num_gpus) 44 | 45 | return device_spec 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /reading_comprehension/layer/pooling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from util.default_util import * 5 | from util.reading_comprehension_util import * 6 | 7 | __all__ = ["MaxPooling", "AveragePooling"] 8 | 9 | class MaxPooling(object): 10 | """max pooling layer""" 11 | def __init__(self, 12 | num_gpus=1, 13 | default_gpu_id=0, 14 | scope="max_pool"): 15 | """initialize max pooling layer""" 16 | self.scope = scope 17 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 18 | 19 | def __call__(self, 20 | input_data, 21 | input_mask): 22 | """call max pooling layer""" 23 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 24 | output_mask = tf.squeeze(tf.reduce_max(input_mask, axis=-2, keepdims=True), axis=-2) 25 | output_pool = tf.reduce_max(input_data * input_mask + MIN_FLOAT * (1 - input_mask), axis=-2) * output_mask 26 | output_pool = output_pool + tf.reduce_max(input_data, axis=-2) * (1 - output_mask) 27 | 28 | return output_pool, output_mask 29 | 30 | class AveragePooling(object): 31 | """average pooling layer""" 32 | def __init__(self, 33 | num_gpus=1, 34 | default_gpu_id=0, 35 | scope="avg_pool"): 36 | """initialize average pooling layer""" 37 | self.scope = scope 38 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 39 | 40 | def __call__(self, 41 | input_data, 42 | input_mask): 43 | """call average pooling layer""" 44 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 45 | input_sum = tf.reduce_sum(input_data * input_mask, axis=-2) 46 | input_count = tf.count_nonzero(input_mask, axis=-2, dtype=tf.float32) 47 | output_mask = tf.squeeze(tf.reduce_max(input_mask, axis=-2, keepdims=True), axis=-2) 48 | output_pool = 1.0 * input_sum / (input_count - output_mask + 1.0) 49 | 50 | return output_pool, output_mask 51 | -------------------------------------------------------------------------------- /reading_comprehension/external/squad.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import string 3 | import re 4 | import sys 5 | 6 | __all__ = ["eval_exact_match_score", "eval_f1_score"] 7 | 8 | def normalize_answer(s): 9 | """Lower text and remove punctuation, articles and extra 
whitespace.""" 10 | def remove_articles(text): 11 | return re.sub(r'\b(a|an|the)\b', ' ', text) 12 | 13 | def white_space_fix(text): 14 | return ' '.join(text.split()) 15 | 16 | def remove_punc(text): 17 | exclude = set(string.punctuation) 18 | return ''.join(ch for ch in text if ch not in exclude) 19 | 20 | def lower(text): 21 | return text.lower() 22 | 23 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 24 | 25 | def f1_score(prediction, ground_truth): 26 | prediction_tokens = normalize_answer(prediction).split() 27 | ground_truth_tokens = normalize_answer(ground_truth).split() 28 | common = collections.Counter(prediction_tokens) & collections.Counter(ground_truth_tokens) 29 | num_same = sum(common.values()) 30 | if num_same == 0: 31 | return 0 32 | precision = 1.0 * num_same / len(prediction_tokens) 33 | recall = 1.0 * num_same / len(ground_truth_tokens) 34 | f1 = (2 * precision * recall) / (precision + recall) 35 | return f1 36 | 37 | def exact_match_score(prediction, ground_truth): 38 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 39 | 40 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 41 | scores_for_ground_truths = [] 42 | for ground_truth in ground_truths: 43 | score = metric_fn(prediction, ground_truth) 44 | scores_for_ground_truths.append(score) 45 | return max(scores_for_ground_truths) 46 | 47 | def eval_exact_match_score(predicts, labels): 48 | exact_match = total = 0 49 | for (predict, label_list) in zip(predicts, labels): 50 | total += 1 51 | exact_match += metric_max_over_ground_truths(exact_match_score, predict, label_list) 52 | 53 | exact_match = 100.0 * exact_match / total 54 | 55 | return exact_match 56 | 57 | def eval_f1_score(predicts, labels): 58 | f1 = total = 0 59 | for (predict, label_list) in zip(predicts, labels): 60 | total += 1 61 | f1 += metric_max_over_ground_truths(f1_score, predict, label_list) 62 | 63 | f1 = 100.0 * f1 / total 64 | 65 | return f1 66 | -------------------------------------------------------------------------------- /reading_comprehension/util/eval_util.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import numpy as np 3 | import tensorflow as tf 4 | from external.bleu import * 5 | from external.rouge import * 6 | from external.squad import * 7 | 8 | __all__ = ["evaluate_from_data", "evaluate_from_file"] 9 | 10 | def _bleu(pred_data, ref_data): 11 | """BLEU score for translation task""" 12 | max_order = 4 13 | smooth = False 14 | score, _, _, _, _, _ = compute_bleu(ref_data, pred_data, max_order, smooth) 15 | bleu_score = 100 * score 16 | return bleu_score 17 | 18 | def _rouge(pred_data, ref_data): 19 | """ROUGE score for summarization task""" 20 | score_map = rouge(pred_data, ref_data) 21 | rouge_score = 100 * score_map["rouge_l/f_score"] 22 | return rouge_score 23 | 24 | def _squad_em(pred_data, ref_data): 25 | """EM score for reading comprehension task""" 26 | em_score = eval_exact_match_score(pred_data, ref_data) 27 | return em_score 28 | 29 | def _squad_f1(pred_data, ref_data): 30 | """F1 score for reading comprehension task""" 31 | f1_score = eval_f1_score(pred_data, ref_data) 32 | return f1_score 33 | 34 | def evaluate_from_data(pred_data, ref_data, metric): 35 | """compute evaluation score based on selected metric""" 36 | pred_and_ref = [(pred, ref_list) for pred, ref_list in zip(pred_data, ref_data) if pred and ref_list] 37 | pred_data = [pred for (pred, _) in pred_and_ref] 38 | ref_data = [ref_list for (_, ref_list) in
pred_and_ref] 39 | 40 | if len(pred_data) == 0 or len(ref_data) == 0: 41 | return 0.0 42 | 43 | if metric == "bleu": 44 | eval_score = _bleu(pred_data, ref_data) 45 | elif metric == "rouge": 46 | eval_score = _rouge(pred_data, ref_data) 47 | elif metric == "exact": 48 | eval_score = _squad_em(pred_data, ref_data) 49 | elif metric == "f1": 50 | eval_score = _squad_f1(pred_data, ref_data) 51 | else: 52 | raise ValueError("unsupported metric {0}".format(metric)) 53 | 54 | return eval_score 55 | 56 | def evaluate_from_file(pred_file, ref_file, metric): 57 | predict = [] 58 | with codecs.getreader("utf-8")(tf.gfile.GFile(pred_file, "rb")) as file_p: 59 | for line in file_p: 60 | predict.append(line.strip()) 61 | reference = [] 62 | with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as file_r: 63 | for line in file_r: 64 | reference.append(line.strip()) 65 | 66 | eval_score = evaluate_from_data(predict, reference, metric) 67 | return eval_score 68 | -------------------------------------------------------------------------------- /reading_comprehension/util/train_logger.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os.path 3 | import time 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | __all__ = ["TrainLogger"] 9 | 10 | class TrainLogger(object): 11 | """train logger""" 12 | def __init__(self, 13 | output_dir): 14 | """initialize train logger""" 15 | self.loss = 0.0 16 | self.learning_rate = 0.0 17 | self.global_step = 0 18 | self.epoch = 0 19 | self.step_in_epoch = 0 20 | self.train_time = 0.0 21 | self.sample_size = 0 22 | self.prev_check_loss = 0.0 23 | self.prev_check_train_time = 0.0 24 | self.prev_check_sample_size = 0 25 | 26 | if not tf.gfile.Exists(output_dir): 27 | tf.gfile.MakeDirs(output_dir) 28 | self.log_file = os.path.join(output_dir, "train_{0}.log".format(time.time())) 29 | self.log_writer = codecs.getwriter("utf-8")(tf.gfile.GFile(self.log_file, mode="a")) 30 | 31 | def update(self, 32 | train_result, 33 | epoch, 34 | step_in_epoch, 35 | time_per_step): 36 | """update train logger based on train result""" 37 | self.loss += train_result.loss * train_result.batch_size 38 | self.learning_rate = train_result.learning_rate 39 | self.global_step = train_result.global_step 40 | self.epoch = epoch 41 | self.step_in_epoch = step_in_epoch 42 | self.train_time += time_per_step 43 | self.sample_size += train_result.batch_size 44 | 45 | def check(self): 46 | """check train statistic""" 47 | loss_delta = self.loss - self.prev_check_loss 48 | train_time_delta = self.train_time - self.prev_check_train_time 49 | sample_size_delta = self.sample_size - self.prev_check_sample_size 50 | 51 | if self.sample_size <= 0: 52 | raise ValueError("current sample size is less than or equal to 0") 53 | 54 | if sample_size_delta <= 0: 55 | return 56 | 57 | avg_loss = loss_delta / sample_size_delta 58 | curr_loss = self.loss / self.sample_size 59 | 60 | log_line = "epoch={0}, step={1}, global step={2}, train time={3}, avg.
loss={4}, curr loss={5}".format( 61 | self.epoch, self.step_in_epoch, self.global_step, train_time_delta, avg_loss, curr_loss).encode('utf-8') 62 | self.log_writer.write("{0}\r\n".format(log_line)) 63 | print(log_line) 64 | 65 | self.prev_check_loss = self.loss 66 | self.prev_check_train_time = self.train_time 67 | self.prev_check_sample_size = self.sample_size 68 | -------------------------------------------------------------------------------- /reading_comprehension/layer/basic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from util.default_util import * 5 | from util.reading_comprehension_util import * 6 | 7 | __all__ = ["Dropout", "LayerNorm"] 8 | 9 | class Dropout(object): 10 | """dropout layer""" 11 | def __init__(self, 12 | rate, 13 | num_gpus=1, 14 | default_gpu_id=0, 15 | random_seed=0, 16 | scope="dropout"): 17 | """initialize dropout layer""" 18 | self.rate = rate 19 | self.random_seed = random_seed 20 | self.scope = scope 21 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 22 | 23 | def __call__(self, 24 | input_data, 25 | input_mask): 26 | """call dropout layer""" 27 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 28 | if self.rate > 0.0: 29 | output_dropout = tf.nn.dropout(input_data, 1.0 - self.rate, seed=self.random_seed) 30 | else: 31 | output_dropout = input_data 32 | 33 | output_mask = input_mask 34 | 35 | return output_dropout, output_mask 36 | 37 | class LayerNorm(object): 38 | """layer norm layer""" 39 | def __init__(self, 40 | layer_dim, 41 | num_gpus=1, 42 | default_gpu_id=0, 43 | regularizer=None, 44 | random_seed=0, 45 | trainable=True, 46 | scope="layer_norm"): 47 | """initialize layer norm layer""" 48 | self.layer_dim = layer_dim 49 | self.regularizer = regularizer 50 | self.random_seed = random_seed 51 | self.trainable = trainable 52 | self.scope = scope 53 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 54 | 55 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 56 | gamma_initializer = create_variable_initializer("one") 57 | beta_initializer = create_variable_initializer("zero") 58 | self.gamma = tf.get_variable("gamma", shape=[self.layer_dim], initializer=gamma_initializer, 59 | regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32) 60 | self.beta = tf.get_variable("beta", shape=[self.layer_dim], initializer=beta_initializer, 61 | regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32) 62 | 63 | def __call__(self, 64 | input_data, 65 | input_mask): 66 | """call layer norm layer""" 67 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 68 | input_mean, input_variance = tf.nn.moments(input_data, axes=[-1], keep_dims=True) 69 | output_norm = (input_data - input_mean) / tf.sqrt(input_variance + EPSILON) 70 | output_norm = output_norm * self.gamma + self.beta 71 | output_mask = input_mask 72 | 73 | return output_norm, output_mask 74 | -------------------------------------------------------------------------------- /reading_comprehension/layer/embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from util.default_util import * 5 | from util.reading_comprehension_util import * 6 | 7 | __all__ = ["Embedding", "PretrainedEmbedding"] 8 | 9 | class Embedding(object): 10 | """Embedding layer""" 11 | 
def __init__(self, 12 | vocab_size, 13 | embed_dim, 14 | num_gpus=1, 15 | default_gpu_id=0, 16 | regularizer=None, 17 | random_seed=0, 18 | trainable=True, 19 | scope="embedding"): 20 | """initialize embedding layer""" 21 | self.vocab_size = vocab_size 22 | self.embed_dim = embed_dim 23 | self.regularizer = regularizer if trainable == True else None 24 | self.random_seed = random_seed 25 | self.trainable = trainable 26 | self.scope = scope 27 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 28 | 29 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 30 | initializer = create_variable_initializer("glorot_uniform", self.random_seed) 31 | self.embedding = tf.get_variable("embedding", shape=[self.vocab_size, self.embed_dim], 32 | initializer=initializer, regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32) 33 | 34 | def __call__(self, 35 | input_data): 36 | """call embedding layer""" 37 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 38 | output_embedding = tf.nn.embedding_lookup(self.embedding, input_data) 39 | 40 | return output_embedding 41 | 42 | class PretrainedEmbedding(object): 43 | """Pretrained Embedding layer""" 44 | def __init__(self, 45 | vocab_size, 46 | embed_dim, 47 | embed_data, 48 | num_gpus=1, 49 | default_gpu_id=0, 50 | regularizer=None, 51 | trainable=True, 52 | scope="pretrained_embedding"): 53 | """initialize pretrained embedding layer""" 54 | self.vocab_size = vocab_size 55 | self.embed_dim = embed_dim 56 | self.embed_data = embed_data 57 | self.regularizer = regularizer if trainable == True else None 58 | self.trainable = trainable 59 | self.scope = scope 60 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 61 | 62 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 63 | initializer = tf.constant_initializer(self.embed_data) 64 | self.embedding = tf.get_variable("pretrained_embedding", shape=[self.vocab_size, self.embed_dim], 65 | initializer=initializer, regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32) 66 | 67 | def __call__(self, 68 | input_data): 69 | """call pretrained embedding layer""" 70 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 71 | output_embedding = tf.nn.embedding_lookup(self.embedding, input_data) 72 | 73 | return output_embedding 74 | -------------------------------------------------------------------------------- /reading_comprehension/squad/evaluate-v1.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. 
""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 
63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 66 | prediction = predictions[qa['id']] 67 | exact_match += metric_max_over_ground_truths( 68 | exact_match_score, prediction, ground_truths) 69 | f1 += metric_max_over_ground_truths( 70 | f1_score, prediction, ground_truths) 71 | 72 | exact_match = 100.0 * exact_match / total 73 | f1 = 100.0 * f1 / total 74 | 75 | return {'exact_match': exact_match, 'f1': f1} 76 | 77 | 78 | if __name__ == '__main__': 79 | expected_version = '1.1' 80 | parser = argparse.ArgumentParser( 81 | description='Evaluation for SQuAD ' + expected_version) 82 | parser.add_argument('dataset_file', help='Dataset file') 83 | parser.add_argument('prediction_file', help='Prediction File') 84 | args = parser.parse_args() 85 | with open(args.dataset_file) as dataset_file: 86 | dataset_json = json.load(dataset_file) 87 | if (dataset_json['version'] != expected_version): 88 | print('Evaluation expects v-' + expected_version + 89 | ', but got dataset with v-' + dataset_json['version'], 90 | file=sys.stderr) 91 | dataset = dataset_json['data'] 92 | with open(args.prediction_file) as prediction_file: 93 | predictions = json.load(prediction_file) 94 | print(json.dumps(evaluate(dataset, predictions))) 95 | -------------------------------------------------------------------------------- /reading_comprehension/layer/position.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from util.default_util import * 5 | from util.reading_comprehension_util import * 6 | 7 | __all__ = ["SinusoidPosition", "AbsolutePosition"] 8 | 9 | class SinusoidPosition(object): 10 | """sinusoid position layer""" 11 | def __init__(self, 12 | min_time_scale, 13 | max_time_scale, 14 | num_gpus=1, 15 | default_gpu_id=0, 16 | scope="sin_pos"): 17 | """initialize sinusoid position layer""" 18 | self.min_time_scale = min_time_scale 19 | self.max_time_scale = max_time_scale 20 | self.scope = scope 21 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 22 | 23 | def __call__(self, 24 | input_data, 25 | input_mask): 26 | """call sinusoid position layer""" 27 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 28 | input_shape = tf.shape(input_data) 29 | length = input_shape[-2] 30 | channel = input_shape[-1] 31 | num_time_scale = channel // 2 32 | position = tf.to_float(tf.range(length)) 33 | log_time_scale = tf.log(float(self.max_time_scale) / float(self.min_time_scale)) / (tf.to_float(num_time_scale) - 1) 34 | inv_time_scale = float(self.min_time_scale) * tf.exp(-1.0 * log_time_scale * tf.to_float(tf.range(num_time_scale))) 35 | scaled_time = tf.expand_dims(position, axis=1) * tf.expand_dims(inv_time_scale, axis=0) 36 | signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) 37 | signal = tf.pad(signal, paddings=[[0, 0], [0, tf.mod(channel, 2)]]) 38 | signal = tf.reshape(signal, shape=[1, length, channel]) 39 | 40 | output_signal = input_data + signal 41 | output_mask = input_mask 42 | 43 | return output_signal, output_mask 44 | 45 | class AbsolutePosition(object): 46 | """absolute position layer""" 47 | def __init__(self, 48 | unit_dim, 49 | max_length, 50 | num_gpus=1, 51 | default_gpu_id=0, 52 | regularizer=None, 53 | random_seed=0, 54 | trainable=True, 55 | scope="abs_pos"): 56 | """initialize absolute position layer""" 57 | self.unit_dim = unit_dim 58 | self.max_length = max_length 59 | 
self.random_seed = random_seed 60 | self.regularizer = regularizer 61 | self.trainable = trainable 62 | self.scope = scope 63 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 64 | 65 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 66 | weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed) 67 | self.position_embedding = tf.get_variable("position_embedding", shape=[1, self.max_length, self.unit_dim], 68 | initializer=weight_initializer, regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32) 69 | 70 | def __call__(self, 71 | input_data, 72 | input_mask): 73 | """call absolute position layer""" 74 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 75 | input_shape = tf.shape(input_data) 76 | max_length = input_shape[-2] 77 | position_embedding = self.position_embedding[:,:max_length,:] 78 | output_signal = input_data + position_embedding 79 | output_mask = input_mask 80 | 81 | return output_signal, output_mask 82 | -------------------------------------------------------------------------------- /reading_comprehension/util/reading_comprehension_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from util.default_util import * 5 | 6 | __all__ = ["create_variable_initializer", "create_weight_regularizer", "create_activation_function", 7 | "softmax_with_mask", "generate_masked_data", "generate_onehot_label"] 8 | 9 | def create_variable_initializer(initializer_type, 10 | random_seed=None, 11 | data_type=tf.float32): 12 | """create variable initializer""" 13 | if initializer_type == "zero": 14 | initializer = tf.zeros_initializer 15 | elif initializer_type == "one": 16 | initializer = tf.ones_initializer 17 | elif initializer_type == "orthogonal": 18 | initializer = tf.orthogonal_initializer(seed=random_seed, dtype=data_type) 19 | elif initializer_type == "random_uniform": 20 | initializer = tf.random_uniform_initializer(seed=random_seed, dtype=data_type) 21 | elif initializer_type == "glorot_uniform": 22 | initializer = tf.glorot_uniform_initializer(seed=random_seed, dtype=data_type) 23 | elif initializer_type == "xavier_uniform": 24 | initializer = tf.contrib.layers.xavier_initializer(uniform=True, seed=random_seed, dtype=tf.float32) 25 | elif initializer_type == "random_normal": 26 | initializer = tf.random_normal_initializer(seed=random_seed, dtype=data_type) 27 | elif initializer_type == "truncated_normal": 28 | initializer = tf.truncated_normal_initializer(seed=random_seed, dtype=data_type) 29 | elif initializer_type == "glorot_normal": 30 | initializer = tf.glorot_normal_initializer(seed=random_seed, dtype=data_type) 31 | elif initializer_type == "xavier_normal": 32 | initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=random_seed, dtype=tf.float32) 33 | elif initializer_type == "variance_scaling": 34 | initializer = tf.contrib.layers.variance_scaling_initializer(factor=2.0, 35 | mode='FAN_IN', uniform=False, seed=random_seed, dtype=tf.float32) 36 | else: 37 | initializer = None 38 | 39 | return initializer 40 | 41 | def create_weight_regularizer(regularizer_type, 42 | scale): 43 | """create weight regularizer""" 44 | if regularizer_type == "l1": 45 | regularizer = tf.contrib.layers.l1_regularizer(scale) 46 | elif regularizer_type == "l2": 47 | regularizer = tf.contrib.layers.l2_regularizer(scale) 48 | else: 49 | regularizer = None 50 | 51 
| return regularizer 52 | 53 | def create_activation_function(activation): 54 | """create activation function""" 55 | if activation == "relu": 56 | activation_function = tf.nn.relu 57 | elif activation == "relu6": 58 | activation_function = tf.nn.relu6 59 | elif activation == "leaky_relu": 60 | activation_function = tf.nn.leaky_relu 61 | elif activation == "elu": 62 | activation_function = tf.nn.elu 63 | elif activation == "crelu": 64 | activation_function = tf.nn.crelu 65 | elif activation == "selu": 66 | activation_function = tf.nn.selu 67 | elif activation == "gelu": 68 | activation_function = gelu 69 | elif activation == "tanh": 70 | activation_function = tf.nn.tanh 71 | elif activation == "sigmoid": 72 | activation_function = tf.nn.sigmoid 73 | elif activation == "softplus": 74 | activation_function = tf.nn.softplus 75 | else: 76 | activation_function = None 77 | 78 | return activation_function 79 | 80 | def softmax_with_mask(input_data, 81 | input_mask, 82 | axis=-1): 83 | """compute softmax with masking""" 84 | return tf.nn.softmax(input_data * input_mask + MIN_FLOAT * (1 - input_mask), axis=axis) 85 | 86 | def generate_masked_data(input_data, 87 | input_mask): 88 | """generate masked data""" 89 | return input_data * input_mask + MIN_FLOAT * (1 - input_mask) 90 | 91 | def generate_onehot_label(input_data, 92 | input_depth): 93 | """generate one-hot label""" 94 | return tf.one_hot(input_data, depth=input_depth, on_value=1.0, off_value=0.0, dtype=tf.float32) 95 | 96 | def gelu(input_tensor): 97 | """Gaussian Error Linear Unit""" 98 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) 99 | return input_tensor * cdf 100 | -------------------------------------------------------------------------------- /reading_comprehension/external/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Python implementation of BLEU and smooth-BLEU. 17 | 18 | This module provides a Python implementation of BLEU and smooth-BLEU. 19 | Smooth BLEU is computed following the method outlined in the paper: 20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic 21 | evaluation metrics for machine translation. COLING 2004. 22 | """ 23 | 24 | import collections 25 | import math 26 | 27 | __all__ = ["compute_bleu"] 28 | 29 | def _get_ngrams(segment, max_order): 30 | """Extracts all n-grams upto a given maximum order from an input segment. 31 | 32 | Args: 33 | segment: text segment from which n-grams will be extracted. 34 | max_order: maximum length in tokens of the n-grams returned by this 35 | methods. 36 | 37 | Returns: 38 | The Counter containing all n-grams upto max_order in segment 39 | with a count of how many times each n-gram occurred. 
40 | """ 41 | ngram_counts = collections.Counter() 42 | for order in range(1, max_order + 1): 43 | for i in range(0, len(segment) - order + 1): 44 | ngram = tuple(segment[i:i+order]) 45 | ngram_counts[ngram] += 1 46 | return ngram_counts 47 | 48 | 49 | def compute_bleu(reference_corpus, translation_corpus, max_order=4, 50 | smooth=False): 51 | """Computes BLEU score of translated segments against one or more references. 52 | 53 | Args: 54 | reference_corpus: list of lists of references for each translation. Each 55 | reference should be tokenized into a list of tokens. 56 | translation_corpus: list of translations to score. Each translation 57 | should be tokenized into a list of tokens. 58 | max_order: Maximum n-gram order to use when computing BLEU score. 59 | smooth: Whether or not to apply Lin et al. 2004 smoothing. 60 | 61 | Returns: 62 | 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram 63 | precisions and brevity penalty. 64 | """ 65 | matches_by_order = [0] * max_order 66 | possible_matches_by_order = [0] * max_order 67 | reference_length = 0 68 | translation_length = 0 69 | for (references, translation) in zip(reference_corpus, 70 | translation_corpus): 71 | reference_length += min(len(r) for r in references) 72 | translation_length += len(translation) 73 | 74 | merged_ref_ngram_counts = collections.Counter() 75 | for reference in references: 76 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order) 77 | translation_ngram_counts = _get_ngrams(translation, max_order) 78 | overlap = translation_ngram_counts & merged_ref_ngram_counts 79 | for ngram in overlap: 80 | matches_by_order[len(ngram)-1] += overlap[ngram] 81 | for order in range(1, max_order+1): 82 | possible_matches = len(translation) - order + 1 83 | if possible_matches > 0: 84 | possible_matches_by_order[order-1] += possible_matches 85 | 86 | precisions = [0] * max_order 87 | for i in range(0, max_order): 88 | if smooth: 89 | precisions[i] = ((matches_by_order[i] + 1.) / 90 | (possible_matches_by_order[i] + 1.)) 91 | else: 92 | if possible_matches_by_order[i] > 0: 93 | precisions[i] = (float(matches_by_order[i]) / 94 | possible_matches_by_order[i]) 95 | else: 96 | precisions[i] = 0.0 97 | 98 | if min(precisions) > 0: 99 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 100 | geo_mean = math.exp(p_log_sum) 101 | else: 102 | geo_mean = 0 103 | 104 | ratio = float(translation_length) / reference_length 105 | 106 | if ratio > 1.0: 107 | bp = 1. 108 | else: 109 | bp = math.exp(1 - 1. 
/ ratio) 110 | 111 | bleu = geo_mean * bp 112 | 113 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 114 | -------------------------------------------------------------------------------- /reading_comprehension/util/eval_logger.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import collections 3 | import os.path 4 | import time 5 | import json 6 | 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | __all__ = ["BasicInfoEvalLog", "ExtrinsicEvalLog", "DecodingEvalLog", "EvalLogger"] 11 | 12 | class BasicInfoEvalLog(collections.namedtuple("BasicInfoEvalLog", ("epoch", "global_step"))): 13 | pass 14 | 15 | class ExtrinsicEvalLog(collections.namedtuple("ExtrinsicEvalLog", ("metric", "score", "sample_output", "sample_size"))): 16 | pass 17 | 18 | class DecodingEvalLog(collections.namedtuple("DecodingEvalLog", ("sample_input", "sample_output", "sample_reference"))): 19 | pass 20 | 21 | class EvalLogger(object): 22 | """evaluation logger""" 23 | def __init__(self, 24 | output_dir): 25 | """extrinsic evaluation result""" 26 | self.extrinsic_eval = None 27 | self.extrinsic_eval_info = None 28 | self.extrinsic_eval_detail = None 29 | self.extrinsic_eval_detail_info = None 30 | 31 | """decoding evaluation result""" 32 | self.decoding_eval = None 33 | self.decoding_eval_info = None 34 | 35 | """initialize evaluation logger""" 36 | self.output_dir = output_dir 37 | if not tf.gfile.Exists(self.output_dir): 38 | tf.gfile.MakeDirs(self.output_dir) 39 | self.log_file = os.path.join(self.output_dir, "eval_{0}.log".format(time.time())) 40 | self.log_writer = codecs.getwriter("utf-8")(tf.gfile.GFile(self.log_file, mode="a")) 41 | 42 | def update_extrinsic_eval(self, 43 | eval_result_list, 44 | basic_info): 45 | """update evaluation logger with extrinsic evaluation result""" 46 | self.extrinsic_eval = eval_result_list 47 | self.extrinsic_eval_info = basic_info 48 | 49 | def update_extrinsic_eval_detail(self, 50 | eval_result_detail, 51 | basic_info): 52 | """update evaluation logger with extrinsic evaluation result detail""" 53 | self.extrinsic_eval_detail = eval_result_detail 54 | self.extrinsic_eval_detail_info = basic_info 55 | 56 | def check_extrinsic_eval(self): 57 | """check extrinsic evaluation result""" 58 | for eval_result in self.extrinsic_eval: 59 | log_line = "epoch={0}, global step={1}, {2}={3}, sample size={4}".format(self.extrinsic_eval_info.epoch, 60 | self.extrinsic_eval_info.global_step, eval_result.metric, eval_result.score, eval_result.sample_size).encode('utf-8') 61 | self.log_writer.write("{0}\r\n".format(log_line)) 62 | print(log_line) 63 | 64 | def check_extrinsic_eval_detail(self): 65 | """check extrinsic evaluation detail result""" 66 | eval_detail_file = os.path.join(self.output_dir, "eval_{0}_{1}_{2}.detail".format(self.extrinsic_eval_detail_info.epoch, 67 | self.extrinsic_eval_detail_info.global_step, time.time())) 68 | with codecs.getwriter("utf-8")(tf.gfile.GFile(eval_detail_file, mode="w")) as eval_detail_writer: 69 | if self.extrinsic_eval_detail is None: 70 | return 71 | sample_output = json.dumps(self.extrinsic_eval_detail.sample_output, indent=4) 72 | eval_detail_writer.write(sample_output) 73 | 74 | def update_decoding_eval(self, 75 | eval_result_list, 76 | basic_info): 77 | """update evaluation logger with decoding evaluation result""" 78 | self.decoding_eval = eval_result_list 79 | self.decoding_eval_info = basic_info 80 | 81 | def check_decoding_eval(self): 82 | """check
decoding evaluation result""" 83 | sample_size = len(self.decoding_eval) 84 | log_line = "epoch={0}, global step={1}, sample size={2}".format(self.decoding_eval_info.epoch, 85 | self.decoding_eval_info.global_step, sample_size).encode('utf-8') 86 | self.log_writer.write("{0}\r\n".format(log_line)) 87 | print(log_line) 88 | 89 | for i in range(sample_size): 90 | eval_result = self.decoding_eval[i] 91 | log_line = "=====================================" 92 | self.log_writer.write("{0}\r\n".format(log_line)) 93 | print(log_line) 94 | log_line = "sample {0} - input: {1}".format(i+1, eval_result.sample_input).encode('utf-8') 95 | self.log_writer.write("{0}\r\n".format(log_line)) 96 | print(log_line) 97 | log_line = "sample {0} - output: {1}".format(i+1, eval_result.sample_output).encode('utf-8') 98 | self.log_writer.write("{0}\r\n".format(log_line)) 99 | print(log_line) 100 | log_line = "sample {0} - reference: {1}".format(i+1, eval_result.sample_reference).encode('utf-8') 101 | self.log_writer.write("{0}\r\n".format(log_line)) 102 | print(log_line) 103 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | Machine reading comprehension (MRC), a task that asks a machine to read a given context and then answer questions based on its understanding, is considered one of the key problems in artificial intelligence and has attracted significant interest from both academia and industry. Over the past few years, great progress has been made in this field, thanks to various end-to-end trained neural models and to the high-quality datasets with large numbers of examples that have been proposed. 3 | ![squad_example]({{ site.url }}/reading_comprehension_tf/SQuAD.example.png){:width="800px"} 4 | *Figure 1: MRC example from SQuAD 2.0 dev set* 5 | 6 | ## DataSet 7 | * [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) is a reading comprehension dataset, consisting of questions posed by crowd-workers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable. 8 | * [GloVe](https://nlp.stanford.edu/projects/glove/) is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. 9 | 10 | ## Experiment 11 | ### QANet 12 | [QANet](https://github.com/google-research/google-research/tree/master/qanet) is an MRC architecture proposed by Google Brain, which does not require recurrent networks: its encoder consists exclusively of convolution and self-attention, where convolution models local interactions and self-attention models global interactions. 13 | 14 | ![qanet_arch]({{ site.url }}/reading_comprehension_tf/QANet.architecture.png){:width="700px"} 15 | 16 | *Figure 2: An overview of the QANet architecture* 17 | 18 | ![qanet_metric]({{ site.url }}/reading_comprehension_tf/QANet.metric.png){:width="1000px"} 19 | 20 | *Figure 3: The experiment details are reported on SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from both train & dev sets. EM results for QANet model with/without EMA are shown on left.
F1 results for QANet model with/without EMA are shown on right* 21 | 22 | | Model | # Epoch | # Train Steps | Batch Size | Data Size | # Head | # Dim | EM | F1 | 23 | |:-------------------:|:-------:|:-------------:|:----------:|:-------------:|:------:|:-----:|:------:|:------:| 24 | | This implementation | 13 | ~70,000 | 16 | 87k (no aug) | 8 | 128 | 70.2 | 80.0 | 25 | | Original Paper | ~13 | 35,000 | 32 | 87k (no aug) | 8 | 128 | N/A | 77.0 | 26 | | Original Paper | ~55 | 150,000 | 32 | 87k (no aug) | 8 | 128 | 73.6 | 82.7 | 27 | 28 | *Table 1: The performance results are reported on SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from train set only. Settings for this QANet implementation are selected to be comparable with the settings in the original paper* 29 | 30 | ### BiDAF 31 | [BiDAF](https://allenai.github.io/bi-att-flow/) (Bi-Directional Attention Flow) is an MRC architecture proposed by Allen Institute for Artificial Intelligence (AI2), which consists of a multi-stage hierarchical process that represents the context at different levels of granularity and uses a bidirectional attention flow mechanism to obtain a query-aware context representation without early summarization. 32 | 33 | ![bidaf_arch]({{ site.url }}/reading_comprehension_tf/BiDAF.architecture.png){:width="700px"} 34 | 35 | *Figure 4: An overview of the BiDAF architecture* 36 | 37 | ![bidaf_metric]({{ site.url }}/reading_comprehension_tf/BiDAF.metric.png){:width="1000px"} 38 | 39 | *Figure 5: The experiment details are reported on SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from both train & dev sets. EM results for BiDAF model with/without EMA are shown on left. F1 results for BiDAF model with/without EMA are shown on right* 40 | 41 | | Model | # Epoch | # Train Steps | Batch Size | Attention Type | # Dim | EM | F1 | 42 | |:-------------------:|:-------:|:-------------:|:----------:|:--------------:|:-----:|:------:|:------:| 43 | | This implementation | 12 | ~17,500 | 60 | trilinear | 100 | 68.5 | 78.2 | 44 | | Original Paper | 12 | ~17,500 | 60 | trilinear | 100 | 67.7 | 77.3 | 45 | 46 | *Table 2: The performance results are reported on SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from train set only. Settings for this BiDAF implementation are selected to be comparable with the settings in the original paper* 47 | 48 | ### R-Net 49 | [R-Net](https://www.microsoft.com/en-us/research/publication/mcr/) is an MRC architecture proposed by Microsoft Research Asia (MSRA), which first matches the question and passage with gated attention-based recurrent networks to obtain the question-aware passage representation, then uses a self-matching attention mechanism to refine the representation by matching the passage against itself, and finally employs pointer networks to locate the positions of answers from the passages. 50 | 51 | ![rnet_arch]({{ site.url }}/reading_comprehension_tf/R-Net.architecture.png){:width="700px"} 52 | 53 | *Figure 6: An overview of the R-Net architecture* 54 | 55 | ## Reference 56 | * Adams Wei Yu, David Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V Le. [QANet: Combining local convolution with global self-attention for reading comprehension](https://arxiv.org/abs/1804.09541) [2018] 57 | * Min Joon Seo, Aniruddha Kembhavi, Ali Farhadi, and Hannaneh Hajishirzi.
[Bidirectional attention flow for machine comprehension](https://arxiv.org/abs/1611.01603) [2017] 58 | * Wenhui Wang, Nan Yang, Furu Wei, Baobao Chang, and Ming Zhou. [Gated self-matching networks for reading comprehension and question answering](https://aclanthology.info/papers/P17-1018/p17-1018) [2017] 59 | * Danqi Chen. [Neural reading comprehension and beyond](https://cs.stanford.edu/~danqi/papers/thesis.pdf) [2018] 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Reading Comprehension 2 | Machine reading comprehension (MRC), a task that asks a machine to read a given context and then answer questions based on its understanding, is considered one of the key problems in artificial intelligence and has attracted significant interest from both academia and industry. Over the past few years, great progress has been made in this field, thanks to various end-to-end trained neural models and to the high-quality datasets with large numbers of examples that have been proposed. In this repo, I'll share more details on the MRC task by re-implementing a few MRC models and testing them on standard MRC datasets. 3 |

![SQuAD example](docs/SQuAD.example.png) 4 | *Figure 1: MRC example from SQuAD 2.0 dev set*

5 | 6 | ## Setting 7 | * Python 3.6.6 8 | * Tensorflow 1.12 9 | * NumPy 1.15.4 10 | * NLTK 3.3 11 | * Spacy 2.0.12 12 | 13 | ## DataSet 14 | * [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) is a reading comprehension dataset, consisting of questions posed by crowd-workers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable. 15 | * [GloVe](https://nlp.stanford.edu/projects/glove/) is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. 16 | 17 | ## Usage 18 | * Preprocess data 19 | ```bash 20 | # preprocess train data 21 | python squad/preprocess.py --format json --input_file data/squad/train-v1.1/train-v1.1.json --output_file data/squad/train-v1.1/train-v1.1.squad.json 22 | # preprocess dev data 23 | python squad/preprocess.py --format json --input_file data/squad/dev-v1.1/dev-v1.1.json --output_file data/squad/dev-v1.1/dev-v1.1.squad.json 24 | ``` 25 | * Run experiment 26 | ```bash 27 | # run experiment in train + eval mode 28 | python reading_comprehension_run.py --mode train_eval --config config/config_mrc_template.xxx.json 29 | # run experiment in train only mode 30 | python reading_comprehension_run.py --mode train --config config/config_mrc_template.xxx.json 31 | # run experiment in eval only mode 32 | python reading_comprehension_run.py --mode eval --config config/config_mrc_template.xxx.json 33 | ``` 34 | * Search hyper-parameters 35 | ```bash 36 | # random search hyper-parameters 37 | python hparam_search.py --base-config config/config_mrc_template.xxx.json --search-config config/config_search_template.xxx.json --num-group 10 --random-seed 100 --output-dir config/search 38 | ``` 39 | * Visualize summary 40 | ```bash 41 | # visualize summary via tensorboard 42 | tensorboard --logdir=output 43 | ``` 44 | ## Experiment 45 | ### QANet 46 | [QANet](https://github.com/google-research/google-research/tree/master/qanet) is an MRC architecture proposed by Google Brain, which does not require recurrent networks: its encoder consists exclusively of convolution and self-attention, where convolution models local interactions and self-attention models global interactions. 47 | 48 |

49 |

Figure 2: An overview of the QANet architecture

50 | 51 |
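To make the distinction concrete, below is a minimal NumPy sketch of the two building blocks (single head, no projections or masking); it is only an illustration, not the TensorFlow implementation used in this repo:

```python
import numpy as np

def self_attention(x):
    """Scaled dot-product self-attention over a [length, dim] sequence:
    every position attends to every other position (global interactions)."""
    scores = x @ x.T / np.sqrt(x.shape[-1])
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)  # softmax over positions
    return weights @ x

def depthwise_conv1d(x, kernel):
    """Depthwise 1-D convolution with a [window, dim] kernel:
    each position only sees its local window (local interactions)."""
    window = kernel.shape[0]
    padded = np.pad(x, ((window // 2, window // 2), (0, 0)))
    return np.stack([(padded[i:i + window] * kernel).sum(axis=0)
                     for i in range(x.shape[0])])
```

With the QANet defaults in `config_mrc_template.qanet.json` (128-dim representations, window size 7), `x` would be a `[context_length, 128]` array and `kernel` a `[7, 128]` array.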

52 |

Figure 3: The experiment details are reported on the SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from both train & dev sets. EM results for the QANet model with/without EMA are shown on the left; F1 results for the QANet model with/without EMA are shown on the right

53 | 54 | | Model | # Epoch | # Train Steps | Batch Size | Data Size | # Head | # Dim | EM | F1 | 55 | |:-------------------:|:-------:|:-------------:|:----------:|:-------------:|:------:|:-----:|:------:|:------:| 56 | | This implementation | 13 | ~70,000 | 16 | 87k (no aug) | 8 | 128 | 70.2 | 80.0 | 57 | | Original Paper | ~13 | 35,000 | 32 | 87k (no aug) | 8 | 128 | N/A | 77.0 | 58 | | Original Paper | ~55 | 150,000 | 32 | 87k (no aug) | 8 | 128 | 73.6 | 82.7 | 59 | 60 |

Table 1: The performance results are reported on the SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from the train set only. Settings for this QANet implementation are selected to be comparable with the settings in the original paper
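The EMA variants in Figure 3 refer to an exponential moving average kept over all trainable weights, controlled by `train_ema_enable` and `train_ema_decay_rate` in the config templates below. Conceptually it maintains a shadow copy of each parameter that is read at evaluation time; a minimal sketch of the update rule (in TF 1.x this would typically be handled by `tf.train.ExponentialMovingAverage`):

```python
def ema_update(shadow, params, decay=0.9999):
    """Blend the current parameter values into their shadow (EMA) copies;
    evaluation then uses the smoother shadow values instead of raw weights."""
    return [decay * s + (1.0 - decay) * p for s, p in zip(shadow, params)]
```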

61 | 62 | ### BiDAF 63 | [BiDAF](https://allenai.github.io/bi-att-flow/) (Bi-Directional Attention Flow) is an MRC architecture proposed by the Allen Institute for Artificial Intelligence (AI2), which consists of a multi-stage hierarchical process that represents the context at different levels of granularity and uses a bidirectional attention flow mechanism to obtain a query-aware context representation without early summarization. 64 | 65 |

66 |

Figure 4: An overview of the BiDAF architecture

67 | 68 |
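The trilinear attention referenced in Table 2 below scores every context/question position pair as w · [c; q; c∘q]. A minimal NumPy sketch of that score (illustrative names, not the repo's API):

```python
import numpy as np

def trilinear_similarity(context, question, w):
    """BiDAF-style trilinear score: S[t, j] = w . [c_t ; q_j ; c_t * q_j]
    for every (context position t, question position j) pair.
    context: [T, dim], question: [J, dim], w: [3 * dim]."""
    w_c, w_q, w_cq = np.split(w, 3)
    return ((context @ w_c)[:, None] + (question @ w_q)[None, :]
            + (context * w_cq) @ question.T)
```

Normalizing `S` with a softmax over question positions gives the context-to-question attention; a softmax over the per-context-word maxima gives the question-to-context attention.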

69 |

Figure 5: The experiment details are reported on the SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from both train & dev sets. EM results for the BiDAF model with/without EMA are shown on the left; F1 results for the BiDAF model with/without EMA are shown on the right

70 | 71 | | Model | # Epoch | # Train Steps | Batch Size | Attention Type | # Dim | EM | F1 | 72 | |:-------------------:|:-------:|:-------------:|:----------:|:--------------:|:-----:|:------:|:------:| 73 | | This implementation | 12 | ~17,500 | 60 | trilinear | 100 | 68.5 | 78.2 | 74 | | Original Paper | 12 | ~17,500 | 60 | trilinear | 100 | 67.7 | 77.3 | 75 | 76 |

Table 2: The performance results are reported on the SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from the train set only. Settings for this BiDAF implementation are selected to be comparable with the settings in the original paper

77 | 78 | ### R-Net 79 | [R-Net](https://www.microsoft.com/en-us/research/publication/mcr/) is an MRC architecture proposed by Microsoft Research Asia (MSRA), which first matches the question and passage with gated attention-based recurrent networks to obtain the question-aware passage representation, then uses a self-matching attention mechanism to refine the representation by matching the passage against itself, and finally employs pointer networks to locate the positions of answers in the passage. 80 | 81 |

82 |

Figure 6: An overview of the R-Net architecture
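A minimal NumPy sketch of the gating idea behind R-Net's gated attention-based recurrent networks (illustrative names, not the repo's API): the concatenation of the passage state and its attended question summary is rescaled elementwise by a learned sigmoid gate before entering the recurrent cell, which lets the model suppress parts of the passage that are irrelevant to the question.

```python
import numpy as np

def gated_rnn_input(passage_state, attended_question, W_g):
    """Gate the RNN input [u_t ; c_t] elementwise: v' = sigmoid(W_g v) * v."""
    v = np.concatenate([passage_state, attended_question])
    gate = 1.0 / (1.0 + np.exp(-(W_g @ v)))  # sigmoid
    return gate * v
```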

83 | 84 | ## Reference 85 | * Adams Wei Yu, David Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V Le. [QANet: Combining local convolution with global self-attention for reading comprehension](https://arxiv.org/abs/1804.09541) [2018] 86 | * Min Joon Seo, Aniruddha Kembhavi, Ali Farhadi, and Hannaneh Hajishirzi. [Bidirectional attention flow for machine comprehension](https://arxiv.org/abs/1611.01603) [2017] 87 | * Wenhui Wang, Nan Yang, Furu Wei, Baobao Chang, and Ming Zhou. [Gated self-matching networks for reading comprehension and question answering](https://aclanthology.info/papers/P17-1018/p17-1018) [2017] 88 | * Danqi Chen. [Neural reading comprehension and beyond](https://cs.stanford.edu/~danqi/papers/thesis.pdf) [2018] 89 | -------------------------------------------------------------------------------- /reading_comprehension/config/config_mrc_template.rnet.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_train_mrc_file": "data/squad/train-v1.1/train-v1.1.squad.json", 3 | "data_train_mrc_file_type": "json", 4 | "data_eval_mrc_file": "data/squad/dev-v1.1/dev-v1.1.squad.json", 5 | "data_eval_mrc_file_type": "json", 6 | "data_embedding_file": "data/squad/resource/squad.all.word.embed", 7 | "data_full_embedding_file": "data/glove/glove.840B.300d.txt", 8 | "data_tfrecord_dir": "data/squad/tfrecord", 9 | "data_max_question_length": 40, 10 | "data_max_context_length": 500, 11 | "data_max_answer_length": 30, 12 | "data_max_subword_length": 16, 13 | "data_max_char_length": 16, 14 | "data_word_vocab_file": "data/squad/resource/squad.all.word.vocab", 15 | "data_word_vocab_size": 180963, 16 | "data_word_vocab_threshold": 0, 17 | "data_word_unk": "", 18 | "data_word_pad": "", 19 | "data_word_sos": "", 20 | "data_word_eos": "", 21 | "data_word_placeholder_enable": false, 22 | "data_subword_vocab_file": "data/squad/resource/squad.all.subword.vocab", 23 | "data_subword_vocab_size": 50554, 24 | "data_subword_vocab_threshold": 0, 25 | "data_subword_unk": "***", 26 | "data_subword_pad": "###", 27 | "data_subword_size": 3, 28 | "data_char_vocab_file": "data/squad/resource/squad.all.char.vocab", 29 | "data_char_vocab_size": 1610, 30 | "data_char_vocab_threshold": 0, 31 | "data_char_unk": "*", 32 | "data_char_pad": "#", 33 | "data_answer_type": "span", 34 | "data_expand_multiple_answer": false, 35 | "data_enable_validation": true, 36 | "data_pipeline_mode": "tfrecord", 37 | "data_num_parallel": 4, 38 | "data_log_output_dir": "output/rnet/log", 39 | "data_result_output_dir": "output/rnet/result", 40 | "train_random_seed": 100, 41 | "train_enable_shuffle": true, 42 | "train_shuffle_buffer_size": 30000, 43 | "train_batch_size": 64, 44 | "train_eval_batch_size": 100, 45 | "train_eval_metric": ["exact", "f1"], 46 | "train_eval_detail_type": "simplified", 47 | "train_decoding_sample_size": 3, 48 | "train_num_epoch": 3, 49 | "train_ckpt_output_dir": "output/rnet/checkpoint", 50 | "train_summary_output_dir": "output/rnet/summary", 51 | "train_step_per_stat": 10, 52 | "train_step_per_ckpt": 1000, 53 | "train_step_per_eval": 1000, 54 | "train_clip_norm": 5.0, 55 | "train_label_smoothing": 0.0, 56 | "train_enable_debugging": false, 57 | "train_ema_enable": true, 58 | "train_ema_decay_rate": 0.999, 59 | "train_ema_enable_debias": false, 60 | "train_ema_enable_dynamic_decay": false, 61 | "train_regularization_enable": false, 62 | "train_regularization_type": "l2", 63 | "train_regularization_scale": 3e-7, 64 | "train_optimizer_type": 
"adadelta", 65 | "train_optimizer_learning_rate": 1.0, 66 | "train_optimizer_warmup_enable": false, 67 | "train_optimizer_warmup_mode": "exponential_warmup", 68 | "train_optimizer_warmup_rate": 0.01, 69 | "train_optimizer_warmup_end_step": 1000, 70 | "train_optimizer_decay_enable": false, 71 | "train_optimizer_decay_mode": "exponential_decay", 72 | "train_optimizer_decay_rate": 0.95, 73 | "train_optimizer_decay_step": 1000, 74 | "train_optimizer_decay_start_step": 10000, 75 | "train_optimizer_momentum_beta": 0.9, 76 | "train_optimizer_rmsprop_beta": 0.999, 77 | "train_optimizer_rmsprop_epsilon": 1e-08, 78 | "train_optimizer_adadelta_rho": 0.95, 79 | "train_optimizer_adadelta_epsilon": 1e-06, 80 | "train_optimizer_adagrad_init_accumulator": 0.1, 81 | "train_optimizer_adam_beta_1": 0.9, 82 | "train_optimizer_adam_beta_2": 0.999, 83 | "train_optimizer_adam_epsilon": 1e-08, 84 | "model_type": "rnet", 85 | "model_scope": "mrc", 86 | "model_representation_word_embed_dim": 300, 87 | "model_representation_word_embed_pretrained": true, 88 | "model_representation_word_feat_trainable": false, 89 | "model_representation_word_feat_enable": true, 90 | "model_representation_subword_embed_dim": 8, 91 | "model_representation_subword_unit_dim": 75, 92 | "model_representation_subword_cell_type": "gru", 93 | "model_representation_subword_hidden_activation": "relu", 94 | "model_representation_subword_dropout": 0.2, 95 | "model_representation_subword_feat_trainable": true, 96 | "model_representation_subword_feat_enable": false, 97 | "model_representation_char_embed_dim": 8, 98 | "model_representation_char_unit_dim": 75, 99 | "model_representation_char_cell_type": "gru", 100 | "model_representation_char_hidden_activation": "relu", 101 | "model_representation_char_dropout": 0.2, 102 | "model_representation_char_feat_trainable": true, 103 | "model_representation_char_feat_enable": true, 104 | "model_representation_fusion_type": "highway", 105 | "model_representation_fusion_num_layer": 2, 106 | "model_representation_fusion_unit_dim": 450, 107 | "model_representation_fusion_hidden_activation": "relu", 108 | "model_representation_fusion_dropout": 0.2, 109 | "model_representation_fusion_trainable": true, 110 | "model_understanding_question_num_layer": 3, 111 | "model_understanding_question_unit_dim": 75, 112 | "model_understanding_question_cell_type": "gru", 113 | "model_understanding_question_hidden_activation": "tanh", 114 | "model_understanding_question_dropout": 0.2, 115 | "model_understanding_question_forget_bias": 1.0, 116 | "model_understanding_question_residual_connect": false, 117 | "model_understanding_question_trainable": true, 118 | "model_understanding_context_num_layer": 3, 119 | "model_understanding_context_unit_dim": 75, 120 | "model_understanding_context_cell_type": "gru", 121 | "model_understanding_context_hidden_activation": "tanh", 122 | "model_understanding_context_dropout": 0.2, 123 | "model_understanding_context_forget_bias": 1.0, 124 | "model_understanding_context_residual_connect": false, 125 | "model_understanding_context_trainable": true, 126 | "model_understanding_enable_sharing": false, 127 | "model_interaction_context2question_num_layer": 1, 128 | "model_interaction_context2question_unit_dim": 75, 129 | "model_interaction_context2question_cell_type": "gru", 130 | "model_interaction_context2question_hidden_activation": "tanh", 131 | "model_interaction_context2question_dropout": 0.2, 132 | "model_interaction_context2question_attention_dropout": 0.0, 133 | 
"model_interaction_context2question_forget_bias": 1.0, 134 | "model_interaction_context2question_residual_connect": false, 135 | "model_interaction_context2question_attention_dim": 75, 136 | "model_interaction_context2question_score_type": "linear", 137 | "model_interaction_context2question_trainable": true, 138 | "model_modeling_answer_num_layer": 1, 139 | "model_modeling_answer_unit_dim": 75, 140 | "model_modeling_answer_cell_type": "gru", 141 | "model_modeling_answer_hidden_activation": "tanh", 142 | "model_modeling_answer_dropout": 0.2, 143 | "model_modeling_answer_attention_dropout": 0.0, 144 | "model_modeling_answer_forget_bias": 1.0, 145 | "model_modeling_answer_residual_connect": false, 146 | "model_modeling_answer_attention_dim": 75, 147 | "model_modeling_answer_score_type": "linear", 148 | "model_modeling_answer_trainable": true, 149 | "model_output_answer_num_layer": 1, 150 | "model_output_answer_unit_dim": 150, 151 | "model_output_answer_cell_type": "gru", 152 | "model_output_answer_hidden_activation": "tanh", 153 | "model_output_answer_dropout": 0.2, 154 | "model_output_answer_forget_bias": 1.0, 155 | "model_output_answer_residual_connect": false, 156 | "model_output_answer_attention_dim": 150, 157 | "model_output_answer_score_type": "linear", 158 | "model_output_answer_trainable": true, 159 | "device_num_gpus": 1, 160 | "device_default_gpu_id": 0, 161 | "device_log_device_placement": false, 162 | "device_allow_soft_placement": true, 163 | "device_allow_growth": false, 164 | "device_per_process_gpu_memory_fraction": 0.8 165 | } -------------------------------------------------------------------------------- /reading_comprehension/config/config_mrc_template.qanet.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_train_mrc_file": "data/squad/train-v1.1/train-v1.1.squad.json", 3 | "data_train_mrc_file_type": "json", 4 | "data_eval_mrc_file": "data/squad/dev-v1.1/dev-v1.1.squad.json", 5 | "data_eval_mrc_file_type": "json", 6 | "data_embedding_file": "data/squad/resource/squad.all.word.embed", 7 | "data_full_embedding_file": "data/glove/glove.840B.300d.txt", 8 | "data_tfrecord_dir": "data/squad/tfrecord", 9 | "data_max_question_length": 50, 10 | "data_max_context_length": 400, 11 | "data_max_answer_length": 30, 12 | "data_max_subword_length": 16, 13 | "data_max_char_length": 16, 14 | "data_word_vocab_file": "data/squad/resource/squad.all.word.vocab", 15 | "data_word_vocab_size": 180963, 16 | "data_word_vocab_threshold": 0, 17 | "data_word_unk": "", 18 | "data_word_pad": "", 19 | "data_word_sos": "", 20 | "data_word_eos": "", 21 | "data_word_placeholder_enable": false, 22 | "data_subword_vocab_file": "data/squad/resource/squad.all.subword.vocab", 23 | "data_subword_vocab_size": 50554, 24 | "data_subword_vocab_threshold": 0, 25 | "data_subword_unk": "***", 26 | "data_subword_pad": "###", 27 | "data_subword_size": 3, 28 | "data_char_vocab_file": "data/squad/resource/squad.all.char.vocab", 29 | "data_char_vocab_size": 1610, 30 | "data_char_vocab_threshold": 0, 31 | "data_char_unk": "*", 32 | "data_char_pad": "#", 33 | "data_answer_type": "span", 34 | "data_expand_multiple_answer": false, 35 | "data_enable_validation": true, 36 | "data_pipeline_mode": "tfrecord", 37 | "data_num_parallel": 4, 38 | "data_log_output_dir": "output/qanet/log", 39 | "data_result_output_dir": "output/qanet/result", 40 | "train_random_seed": 100, 41 | "train_enable_shuffle": true, 42 | "train_shuffle_buffer_size": 30000, 43 | "train_batch_size": 16, 44 | 
"train_eval_batch_size": 100, 45 | "train_eval_metric": ["exact", "f1"], 46 | "train_eval_detail_type": "simplified", 47 | "train_decoding_sample_size": 3, 48 | "train_num_epoch": 3, 49 | "train_ckpt_output_dir": "output/qanet/checkpoint", 50 | "train_summary_output_dir": "output/qanet/summary", 51 | "train_step_per_stat": 10, 52 | "train_step_per_ckpt": 1000, 53 | "train_step_per_eval": 1000, 54 | "train_clip_norm": 5.0, 55 | "train_label_smoothing": 0.0, 56 | "train_enable_debugging": false, 57 | "train_ema_enable": true, 58 | "train_ema_decay_rate": 0.9999, 59 | "train_ema_enable_debias": false, 60 | "train_ema_enable_dynamic_decay": false, 61 | "train_regularization_enable": true, 62 | "train_regularization_type": "l2", 63 | "train_regularization_scale": 3e-7, 64 | "train_optimizer_type": "adam", 65 | "train_optimizer_learning_rate": 0.001, 66 | "train_optimizer_warmup_enable": true, 67 | "train_optimizer_warmup_mode": "exponential_warmup", 68 | "train_optimizer_warmup_rate": 0.01, 69 | "train_optimizer_warmup_end_step": 1000, 70 | "train_optimizer_decay_enable": false, 71 | "train_optimizer_decay_mode": "exponential_decay", 72 | "train_optimizer_decay_rate": 0.95, 73 | "train_optimizer_decay_step": 1000, 74 | "train_optimizer_decay_start_step": 10000, 75 | "train_optimizer_momentum_beta": 0.9, 76 | "train_optimizer_rmsprop_beta": 0.999, 77 | "train_optimizer_rmsprop_epsilon": 1e-08, 78 | "train_optimizer_adadelta_rho": 0.95, 79 | "train_optimizer_adadelta_epsilon": 1e-08, 80 | "train_optimizer_adagrad_init_accumulator": 0.1, 81 | "train_optimizer_adam_beta_1": 0.8, 82 | "train_optimizer_adam_beta_2": 0.999, 83 | "train_optimizer_adam_epsilon": 1e-07, 84 | "model_type": "qanet", 85 | "model_scope": "mrc", 86 | "model_representation_word_embed_dim": 300, 87 | "model_representation_word_dropout": 0.1, 88 | "model_representation_word_embed_pretrained": true, 89 | "model_representation_word_feat_trainable": false, 90 | "model_representation_word_feat_enable": true, 91 | "model_representation_subword_embed_dim": 64, 92 | "model_representation_subword_unit_dim": 200, 93 | "model_representation_subword_window_size": [5], 94 | "model_representation_subword_hidden_activation": "relu", 95 | "model_representation_subword_dropout": 0.05, 96 | "model_representation_subword_pooling_type": "max", 97 | "model_representation_subword_feat_trainable": true, 98 | "model_representation_subword_feat_enable": false, 99 | "model_representation_char_embed_dim": 64, 100 | "model_representation_char_unit_dim": 200, 101 | "model_representation_char_window_size": [5], 102 | "model_representation_char_hidden_activation": "relu", 103 | "model_representation_char_dropout": 0.05, 104 | "model_representation_char_pooling_type": "max", 105 | "model_representation_char_feat_trainable": true, 106 | "model_representation_char_feat_enable": true, 107 | "model_representation_fusion_type": "highway", 108 | "model_representation_fusion_num_layer": 2, 109 | "model_representation_fusion_unit_dim": 128, 110 | "model_representation_fusion_hidden_activation": "relu", 111 | "model_representation_fusion_dropout": 0.1, 112 | "model_representation_fusion_trainable": true, 113 | "model_understanding_question_num_layer": 1, 114 | "model_understanding_question_num_conv": 4, 115 | "model_understanding_question_num_head": 8, 116 | "model_understanding_question_unit_dim": 128, 117 | "model_understanding_question_window_size": [7], 118 | "model_understanding_question_hidden_activation": "relu", 119 | "model_understanding_question_dropout": 
0.1, 120 | "model_understanding_question_attention_dropout": 0.0, 121 | "model_understanding_question_layer_dropout": 0.1, 122 | "model_understanding_question_trainable": true, 123 | "model_understanding_context_num_layer": 1, 124 | "model_understanding_context_num_conv": 4, 125 | "model_understanding_context_num_head": 8, 126 | "model_understanding_context_unit_dim": 128, 127 | "model_understanding_context_window_size": [7], 128 | "model_understanding_context_hidden_activation": "relu", 129 | "model_understanding_context_dropout": 0.1, 130 | "model_understanding_context_attention_dropout": 0.0, 131 | "model_understanding_context_layer_dropout": 0.1, 132 | "model_understanding_context_trainable": true, 133 | "model_understanding_enable_sharing": true, 134 | "model_interaction_context2question_attention_dim": 128, 135 | "model_interaction_context2question_score_type": "trilinear", 136 | "model_interaction_context2question_dropout": 0.1, 137 | "model_interaction_context2question_attention_dropout": 0.0, 138 | "model_interaction_context2question_trainable": true, 139 | "model_interaction_context2question_enable": true, 140 | "model_interaction_question2context_attention_dim": 128, 141 | "model_interaction_question2context_score_type": "trilinear", 142 | "model_interaction_question2context_dropout": 0.1, 143 | "model_interaction_question2context_attention_dropout": 0.0, 144 | "model_interaction_question2context_trainable": true, 145 | "model_interaction_question2context_enable": true, 146 | "model_interaction_fusion_type": "concate", 147 | "model_interaction_fusion_num_layer": 1, 148 | "model_interaction_fusion_unit_dim": 128, 149 | "model_interaction_fusion_hidden_activation": "relu", 150 | "model_interaction_fusion_dropout": 0.1, 151 | "model_interaction_fusion_trainable": true, 152 | "model_interaction_fusion_combo_enable": true, 153 | "model_interaction_enable_sharing": true, 154 | "model_modeling_answer_num_layer": 7, 155 | "model_modeling_answer_num_conv": 2, 156 | "model_modeling_answer_num_head": 8, 157 | "model_modeling_answer_unit_dim": 128, 158 | "model_modeling_answer_window_size": [5], 159 | "model_modeling_answer_hidden_activation": "relu", 160 | "model_modeling_answer_dropout": 0.1, 161 | "model_modeling_answer_attention_dropout": 0.0, 162 | "model_modeling_answer_layer_dropout": 0.1, 163 | "model_modeling_answer_trainable": true, 164 | "model_modeling_enable_sharing": true, 165 | "model_output_answer_start_dropout": 0.1, 166 | "model_output_answer_start_trainable": true, 167 | "model_output_answer_end_dropout": 0.1, 168 | "model_output_answer_end_trainable": true, 169 | "device_num_gpus": 1, 170 | "device_default_gpu_id": 0, 171 | "device_log_device_placement": false, 172 | "device_allow_soft_placement": true, 173 | "device_allow_growth": false, 174 | "device_per_process_gpu_memory_fraction": 0.8 175 | } -------------------------------------------------------------------------------- /reading_comprehension/squad/preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import codecs 3 | import json 4 | import os.path 5 | import string 6 | import re 7 | import nltk 8 | import spacy 9 | 10 | spacy_nlp = spacy.load('en') 11 | 12 | def add_arguments(parser): 13 | parser.add_argument("--format", help="format to generate", required=True) 14 | parser.add_argument("--input_file", help="path to input file", required=True) 15 | parser.add_argument("--output_file", help="path to output file", required=True) 16 | 17 | def 
nltk_tokenize(text, lower_case=False, remove_punc=False): 18 | def process_token(tokens): 19 | special = ("-", "£", "€", "¥", "¢", "₹", "\u2212", "\u2014", "\u2013", 20 | "/", "~", '"', "'", "\ud01C", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0") 21 | pattern = "([{}])".format("".join(special)) 22 | processed_tokens = [] 23 | for token in tokens: 24 | token = token.replace("''", '" ').replace("``", '" ') 25 | processed_tokens.extend(re.split(pattern, token)) 26 | 27 | return processed_tokens 28 | 29 | def remove_punctuation(tokens): 30 | exclude = set(string.punctuation) 31 | return [token for token in tokens if token not in exclude] 32 | 33 | def fix_white_space(tokens): 34 | return [token for token in tokens if token and not token.isspace()] 35 | 36 | sents = nltk.sent_tokenize(text) 37 | norm_sents = [] 38 | for sent in sents: 39 | words = nltk.word_tokenize(sent) 40 | words = process_token(words) 41 | if remove_punc: 42 | words = remove_punctuation(words) 43 | 44 | words = fix_white_space(words) 45 | norm_sents.append(' '.join(words)) 46 | 47 | norm_text = ' '.join(norm_sents) 48 | if lower_case: 49 | norm_text = norm_text.lower() 50 | 51 | return norm_text 52 | 53 | def spacy_tokenize(text, lower_case=False, remove_punc=False): 54 | def process_token(tokens): 55 | special = ("-", "£", "€", "¥", "¢", "₹", "\u2212", "\u2014", "\u2013", 56 | "/", "~", '"', "'", "\ud01C", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0") 57 | pattern = "([{}])".format("".join(special)) 58 | processed_tokens = [] 59 | for token in tokens: 60 | token = token.replace("''", '" ').replace("``", '" ') 61 | processed_tokens.extend(re.split(pattern, token)) 62 | 63 | return processed_tokens 64 | 65 | def remove_punctuation(tokens): 66 | exclude = set(string.punctuation) 67 | return [token for token in tokens if token not in exclude] 68 | 69 | def fix_white_space(tokens): 70 | return [token for token in tokens if token and not token.isspace()] 71 | 72 | word_docs = spacy_nlp(text) 73 | words = [word.text for word in word_docs] 74 | words = process_token(words) 75 | if remove_punc: 76 | words = remove_punctuation(words) 77 | 78 | words = fix_white_space(words) 79 | 80 | norm_text = ' '.join(words) 81 | if lower_case: 82 | norm_text = norm_text.lower() 83 | 84 | return norm_text 85 | 86 | def get_char_spans(raw_text, norm_text): 87 | pattern = "\"|``|''" 88 | spans = [] 89 | idx = 0 90 | norm_tokens = norm_text.split(' ') 91 | for token in norm_tokens: 92 | if re.match(pattern, token): 93 | span = re.search(pattern, raw_text[idx:]) 94 | idx += span.start() 95 | token_len = span.end() - span.start() 96 | else: 97 | idx = raw_text.find(token, idx) 98 | token_len = len(token) 99 | 100 | if idx < 0 or token is None or token_len == 0: 101 | raise ValueError("invalid text: {0} <--> {1}".format(raw_text, norm_text)) 102 | 103 | spans.append((idx, idx + token_len)) 104 | idx += token_len 105 | 106 | return spans 107 | 108 | def get_word_span(char_spans, answer_char_start, answer_char_end): 109 | answer_word_start = None 110 | answer_word_end = None 111 | for word_idx, (char_start_idx, char_end_indx) in enumerate(char_spans): 112 | if char_start_idx <= answer_char_start <= char_end_indx: 113 | answer_word_start = word_idx 114 | if char_start_idx <= answer_char_end <= char_end_indx: 115 | answer_word_end = word_idx 116 | 117 | if answer_word_end is None and answer_word_start is not None: 118 | if answer_char_end > char_spans[-1][-1]: 119 | answer_word_end = len(char_spans) - 1 120 | 121 | if answer_word_end is None or 
answer_word_start is None or answer_word_end < answer_word_start: 122 | raise ValueError("invalid word span: ({0}, {1})".format(answer_word_start, answer_word_end)) 123 | 124 | return answer_word_start, answer_word_end 125 | 126 | def preprocess(file_name): 127 | if not os.path.exists(file_name): 128 | raise FileNotFoundError("file not found") 129 | 130 | processed_data_list = [] 131 | with open(file_name, "r") as file: 132 | json_content = json.load(file) 133 | for article in json_content["data"]: 134 | for paragraph in article["paragraphs"]: 135 | context = paragraph["context"].strip() 136 | norm_context = spacy_tokenize(context) 137 | char_spans = get_char_spans(context, norm_context) 138 | for qa in paragraph["qas"]: 139 | qa_id = qa["id"] 140 | question = qa["question"].strip() 141 | norm_question = spacy_tokenize(question) 142 | 143 | processed_data = { 144 | "id": qa_id, 145 | "question": norm_question, 146 | "context": norm_context, 147 | "answers": [] 148 | } 149 | 150 | for answer in qa["answers"]: 151 | answer_text = answer["text"].strip() 152 | answer_char_start = answer["answer_start"] 153 | answer_char_end = answer_char_start + len(answer_text) - 1 154 | 155 | answer_word_start, answer_word_end = get_word_span(char_spans, 156 | answer_char_start, answer_char_end) 157 | 158 | answer_text = " ".join(norm_context.split(' ')[answer_word_start:answer_word_end+1]) 159 | 160 | processed_data["answers"].append({ 161 | "text": answer_text, 162 | "start": answer_word_start, 163 | "end": answer_word_end 164 | }) 165 | 166 | processed_data_list.append(processed_data) 167 | 168 | return processed_data_list 169 | 170 | def output_to_json(data_list, file_name): 171 | with open(file_name, "w") as file: 172 | data_json = json.dumps(data_list, indent=4) 173 | file.write(data_json) 174 | 175 | def output_to_plain(data_list, file_name): 176 | with open(file_name, "wb") as file: 177 | for data in data_list: 178 | for answer in data["answers"]: 179 | data_plain = "{0}\t{1}\t{2}\t{3}\t{4}|{5}\r\n".format(data["id"], data["question"], 180 | data["context"].replace("\n", " "), answer["text"], answer["start"], answer["end"]) 181 | file.write(data_plain.encode("utf-8")) 182 | 183 | def output_to_split(data_list, file_prefix): 184 | with open("{0}.question".format(file_prefix), "wb") as q_file, open("{0}.context".format(file_prefix), "wb") as c_file, open("{0}.answer_text".format(file_prefix), "wb") as at_file, open("{0}.answer_span".format(file_prefix), "wb") as as_file: 185 | for data in data_list: 186 | for answer in data["answers"]: 187 | q_data_plain = "{0}\r\n".format(data["question"]) 188 | q_file.write(q_data_plain.encode("utf-8")) 189 | c_data_plain = "{0}\r\n".format(data["context"].replace("\n", " ")) 190 | c_file.write(c_data_plain.encode("utf-8")) 191 | at_data_plain = "{0}\r\n".format(answer["text"]) 192 | at_file.write(at_data_plain.encode("utf-8")) 193 | as_data_plain = "{0}|{1}\r\n".format(answer["start"], answer["end"]) 194 | as_file.write(as_data_plain.encode("utf-8")) 195 | 196 | def main(args): 197 | processed_data = preprocess(args.input_file) 198 | if (args.format == 'json'): 199 | output_to_json(processed_data, args.output_file) 200 | elif (args.format == 'plain'): 201 | output_to_plain(processed_data, args.output_file) 202 | elif (args.format == 'split'): 203 | output_to_split(processed_data, args.output_file) 204 | 205 | if __name__ == "__main__": 206 | parser = argparse.ArgumentParser() 207 | add_arguments(parser) 208 | args = parser.parse_args() 209 | main(args) 
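# Illustrative usage of the span helpers above (hypothetical example, not part
# of the original script): for raw text "The cat sat.", spacy_tokenize returns
# "The cat sat ." and get_char_spans maps each normalized token back to raw
# character offsets, so answer word spans can be recovered from char offsets:
#   spans = get_char_spans("The cat sat.", spacy_tokenize("The cat sat."))
#   # spans == [(0, 3), (4, 7), (8, 11), (11, 12)]
#   get_word_span(spans, 4, 6)  # answer "cat" at chars 4-6 -> word span (1, 1)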
-------------------------------------------------------------------------------- /reading_comprehension/config/config_mrc_template.bidaf.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_train_mrc_file": "data/squad/train-v1.1/train-v1.1.squad.json", 3 | "data_train_mrc_file_type": "json", 4 | "data_eval_mrc_file": "data/squad/dev-v1.1/dev-v1.1.squad.json", 5 | "data_eval_mrc_file_type": "json", 6 | "data_embedding_file": "data/squad/resource/squad.all.word.embed", 7 | "data_full_embedding_file": "data/glove/glove.6B.100d.txt", 8 | "data_tfrecord_dir": "data/squad/tfrecord", 9 | "data_max_question_length": 40, 10 | "data_max_context_length": 500, 11 | "data_max_answer_length": 30, 12 | "data_max_subword_length": 16, 13 | "data_max_char_length": 16, 14 | "data_word_vocab_file": "data/squad/resource/squad.all.word.vocab", 15 | "data_word_vocab_size": 72646, 16 | "data_word_vocab_threshold": 0, 17 | "data_word_unk": "", 18 | "data_word_pad": "", 19 | "data_word_sos": "", 20 | "data_word_eos": "", 21 | "data_word_placeholder_enable": false, 22 | "data_subword_vocab_file": "data/squad/resource/squad.all.subword.vocab", 23 | "data_subword_vocab_size": 50554, 24 | "data_subword_vocab_threshold": 0, 25 | "data_subword_unk": "***", 26 | "data_subword_pad": "###", 27 | "data_subword_size": 3, 28 | "data_char_vocab_file": "data/squad/resource/squad.all.char.vocab", 29 | "data_char_vocab_size": 1610, 30 | "data_char_vocab_threshold": 0, 31 | "data_char_unk": "*", 32 | "data_char_pad": "#", 33 | "data_answer_type": "span", 34 | "data_expand_multiple_answer": false, 35 | "data_enable_validation": true, 36 | "data_pipeline_mode": "tfrecord", 37 | "data_num_parallel": 4, 38 | "data_log_output_dir": "output/bidaf/log", 39 | "data_result_output_dir": "output/bidaf/result", 40 | "train_random_seed": 100, 41 | "train_enable_shuffle": true, 42 | "train_shuffle_buffer_size": 30000, 43 | "train_batch_size": 60, 44 | "train_eval_batch_size": 100, 45 | "train_eval_metric": ["exact", "f1"], 46 | "train_eval_detail_type": "simplified", 47 | "train_decoding_sample_size": 3, 48 | "train_num_epoch": 3, 49 | "train_ckpt_output_dir": "output/bidaf/checkpoint", 50 | "train_summary_output_dir": "output/bidaf/summary", 51 | "train_step_per_stat": 10, 52 | "train_step_per_ckpt": 1000, 53 | "train_step_per_eval": 1000, 54 | "train_clip_norm": 5.0, 55 | "train_label_smoothing": 0.0, 56 | "train_enable_debugging": false, 57 | "train_ema_enable": true, 58 | "train_ema_decay_rate": 0.999, 59 | "train_ema_enable_debias": false, 60 | "train_ema_enable_dynamic_decay": false, 61 | "train_regularization_enable": false, 62 | "train_regularization_type": "l2", 63 | "train_regularization_scale": 3e-7, 64 | "train_optimizer_type": "adam", 65 | "train_optimizer_learning_rate": 0.001, 66 | "train_optimizer_warmup_enable": false, 67 | "train_optimizer_warmup_mode": "exponential_warmup", 68 | "train_optimizer_warmup_rate": 0.01, 69 | "train_optimizer_warmup_end_step": 1000, 70 | "train_optimizer_decay_enable": false, 71 | "train_optimizer_decay_mode": "exponential_decay", 72 | "train_optimizer_decay_rate": 0.95, 73 | "train_optimizer_decay_step": 1000, 74 | "train_optimizer_decay_start_step": 10000, 75 | "train_optimizer_momentum_beta": 0.9, 76 | "train_optimizer_rmsprop_beta": 0.999, 77 | "train_optimizer_rmsprop_epsilon": 1e-08, 78 | "train_optimizer_adadelta_rho": 0.95, 79 | "train_optimizer_adadelta_epsilon": 1e-08, 80 | "train_optimizer_adagrad_init_accumulator": 0.1, 81 | 
"train_optimizer_adam_beta_1": 0.9, 82 | "train_optimizer_adam_beta_2": 0.999, 83 | "train_optimizer_adam_epsilon": 1e-08, 84 | "model_type": "bidaf", 85 | "model_scope": "mrc", 86 | "model_representation_word_embed_dim": 100, 87 | "model_representation_word_embed_pretrained": true, 88 | "model_representation_word_feat_trainable": false, 89 | "model_representation_word_feat_enable": true, 90 | "model_representation_subword_embed_dim": 8, 91 | "model_representation_subword_unit_dim": 100, 92 | "model_representation_subword_window_size": [5], 93 | "model_representation_subword_hidden_activation": "relu", 94 | "model_representation_subword_dropout": 0.2, 95 | "model_representation_subword_pooling_type": "max", 96 | "model_representation_subword_feat_trainable": true, 97 | "model_representation_subword_feat_enable": false, 98 | "model_representation_char_embed_dim": 8, 99 | "model_representation_char_unit_dim": 100, 100 | "model_representation_char_window_size": [5], 101 | "model_representation_char_hidden_activation": "relu", 102 | "model_representation_char_dropout": 0.2, 103 | "model_representation_char_pooling_type": "max", 104 | "model_representation_char_feat_trainable": true, 105 | "model_representation_char_feat_enable": true, 106 | "model_representation_fusion_type": "highway", 107 | "model_representation_fusion_num_layer": 2, 108 | "model_representation_fusion_unit_dim": 400, 109 | "model_representation_fusion_hidden_activation": "relu", 110 | "model_representation_fusion_dropout": 0.2, 111 | "model_representation_fusion_trainable": true, 112 | "model_understanding_question_num_layer": 1, 113 | "model_understanding_question_unit_dim": 100, 114 | "model_understanding_question_cell_type": "lstm", 115 | "model_understanding_question_hidden_activation": "tanh", 116 | "model_understanding_question_dropout": 0.2, 117 | "model_understanding_question_forget_bias": 1.0, 118 | "model_understanding_question_residual_connect": false, 119 | "model_understanding_question_trainable": true, 120 | "model_understanding_context_num_layer": 1, 121 | "model_understanding_context_unit_dim": 100, 122 | "model_understanding_context_cell_type": "lstm", 123 | "model_understanding_context_hidden_activation": "tanh", 124 | "model_understanding_context_dropout": 0.2, 125 | "model_understanding_context_forget_bias": 1.0, 126 | "model_understanding_context_residual_connect": false, 127 | "model_understanding_context_trainable": true, 128 | "model_understanding_enable_sharing": true, 129 | "model_interaction_context2question_attention_dim": 200, 130 | "model_interaction_context2question_score_type": "trilinear", 131 | "model_interaction_context2question_dropout": 0.2, 132 | "model_interaction_context2question_attention_dropout": 0.0, 133 | "model_interaction_context2question_trainable": true, 134 | "model_interaction_context2question_enable": true, 135 | "model_interaction_question2context_attention_dim": 200, 136 | "model_interaction_question2context_score_type": "trilinear", 137 | "model_interaction_question2context_dropout": 0.2, 138 | "model_interaction_question2context_attention_dropout": 0.0, 139 | "model_interaction_question2context_trainable": true, 140 | "model_interaction_question2context_enable": true, 141 | "model_interaction_fusion_type": "concate", 142 | "model_interaction_fusion_num_layer": 1, 143 | "model_interaction_fusion_unit_dim": 800, 144 | "model_interaction_fusion_hidden_activation": "relu", 145 | "model_interaction_fusion_dropout": 0.2, 146 | "model_interaction_fusion_trainable": true, 147 | 
"model_interaction_fusion_combo_enable": true, 148 | "model_interaction_enable_sharing": true, 149 | "model_modeling_answer_num_layer": 1, 150 | "model_modeling_answer_unit_dim": 100, 151 | "model_modeling_answer_cell_type": "lstm", 152 | "model_modeling_answer_hidden_activation": "tanh", 153 | "model_modeling_answer_dropout": 0.2, 154 | "model_modeling_answer_attention_dropout": 0.0, 155 | "model_modeling_answer_forget_bias": 1.0, 156 | "model_modeling_answer_residual_connect": false, 157 | "model_modeling_answer_attention_dim": 200, 158 | "model_modeling_answer_score_type": "trilinear", 159 | "model_modeling_answer_attention_enable": false, 160 | "model_modeling_answer_trainable": true, 161 | "model_modeling_fusion_type": "concate", 162 | "model_modeling_fusion_num_layer": 1, 163 | "model_modeling_fusion_unit_dim": 1000, 164 | "model_modeling_fusion_hidden_activation": "relu", 165 | "model_modeling_fusion_dropout": 0.2, 166 | "model_modeling_fusion_trainable": true, 167 | "model_output_answer_start_num_layer": 1, 168 | "model_output_answer_start_unit_dim": 100, 169 | "model_output_answer_start_cell_type": "lstm", 170 | "model_output_answer_start_hidden_activation": "tanh", 171 | "model_output_answer_start_dropout": 0.2, 172 | "model_output_answer_start_forget_bias": 1.0, 173 | "model_output_answer_start_residual_connect": false, 174 | "model_output_answer_start_trainable": true, 175 | "model_output_answer_end_num_layer": 1, 176 | "model_output_answer_end_unit_dim": 100, 177 | "model_output_answer_end_cell_type": "lstm", 178 | "model_output_answer_end_hidden_activation": "tanh", 179 | "model_output_answer_end_dropout": 0.2, 180 | "model_output_answer_end_forget_bias": 1.0, 181 | "model_output_answer_end_residual_connect": false, 182 | "model_output_answer_end_trainable": true, 183 | "device_num_gpus": 1, 184 | "device_default_gpu_id": 0, 185 | "device_log_device_placement": false, 186 | "device_allow_soft_placement": true, 187 | "device_allow_growth": false, 188 | "device_per_process_gpu_memory_fraction": 0.8 189 | } -------------------------------------------------------------------------------- /reading_comprehension/layer/highway.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from util.default_util import * 5 | from util.reading_comprehension_util import * 6 | 7 | from layer.basic import * 8 | 9 | __all__ = ["Highway", "ConvHighway", "StackedHighway", "StackedConvHighway"] 10 | 11 | class Highway(object): 12 | """highway layer""" 13 | def __init__(self, 14 | unit_dim, 15 | activation, 16 | dropout, 17 | num_gpus=1, 18 | default_gpu_id=0, 19 | regularizer=None, 20 | random_seed=0, 21 | trainable=True, 22 | scope="highway"): 23 | """initialize highway layer""" 24 | self.unit_dim = unit_dim 25 | self.activation = activation 26 | self.dropout = dropout 27 | self.regularizer = regularizer 28 | self.random_seed = random_seed 29 | self.trainable = trainable 30 | self.scope = scope 31 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 32 | 33 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 34 | weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed) 35 | bias_initializer = create_variable_initializer("zero") 36 | transform_activation = create_activation_function(self.activation) 37 | gate_activation = create_activation_function("sigmoid") 38 | self.transform_layer = tf.layers.Dense(units=self.unit_dim, 
activation=transform_activation, use_bias=True, 39 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer, 40 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable) 41 | self.gate_layer = tf.layers.Dense(units=self.unit_dim, activation=gate_activation, use_bias=True, 42 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer, 43 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable) 44 | 45 | self.dropout_layer = Dropout(rate=self.dropout, num_gpus=num_gpus, 46 | default_gpu_id=default_gpu_id, random_seed=self.random_seed) 47 | 48 | def __call__(self, 49 | input_data, 50 | input_mask): 51 | """call highway layer""" 52 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 53 | transform, _ = self.dropout_layer(self.transform_layer(input_data), input_mask) 54 | gate = self.gate_layer(input_data) 55 | output_highway = transform * gate + input_data * (1 - gate) 56 | output_mask = input_mask 57 | 58 | return output_highway, output_mask 59 | 60 | class ConvHighway(object): 61 | """convolutional highway layer""" 62 | def __init__(self, 63 | num_filter, 64 | window_size, 65 | activation, 66 | dropout, 67 | num_gpus=1, 68 | default_gpu_id=0, 69 | regularizer=None, 70 | random_seed=0, 71 | trainable=True, 72 | scope="conv_highway"): 73 | """initialize convolutional highway layer""" 74 | self.num_filter = num_filter 75 | self.window_size = window_size 76 | self.activation = activation 77 | self.dropout = dropout 78 | self.regularizer = regularizer 79 | self.random_seed = random_seed 80 | self.trainable = trainable 81 | self.scope = scope 82 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 83 | 84 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 85 | weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed) 86 | bias_initializer = create_variable_initializer("zero") 87 | transform_activation = create_activation_function(self.activation) 88 | gate_activation = create_activation_function("sigmoid") 89 | 90 | self.transform_layer = tf.layers.Conv1D(filters=self.num_filter, kernel_size=window_size, 91 | strides=1, padding="SAME", activation=transform_activation, use_bias=True, 92 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer, 93 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=trainable) 94 | self.gate_layer = tf.layers.Conv1D(filters=self.num_filter, kernel_size=window_size, 95 | strides=1, padding="SAME", activation=gate_activation, use_bias=True, 96 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer, 97 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=trainable) 98 | 99 | self.dropout_layer = Dropout(rate=self.dropout, num_gpus=num_gpus, 100 | default_gpu_id=default_gpu_id, random_seed=self.random_seed) 101 | 102 | def __call__(self, 103 | input_data, 104 | input_mask): 105 | """call convolutional highway layer""" 106 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 107 | transform, _ = self.dropout_layer(self.transform_layer(input_data), input_mask) 108 | gate = self.gate_layer(input_data) 109 | output_highway = transform * gate + input_data * (1 - gate) 110 | output_mask = input_mask 111 | 112 | return output_highway, output_mask 113 | 114 | class StackedHighway(object): 115 | """stacked highway layer""" 116
| def __init__(self, 117 | num_layer, 118 | unit_dim, 119 | activation, 120 | dropout, 121 | num_gpus=1, 122 | default_gpu_id=0, 123 | regularizer=None, 124 | random_seed=0, 125 | trainable=True, 126 | scope="stacked_highway"): 127 | """initialize stacked highway layer""" 128 | self.num_layer = num_layer 129 | self.unit_dim = unit_dim 130 | self.activation = activation 131 | self.dropout = dropout 132 | self.num_gpus = num_gpus 133 | self.default_gpu_id = default_gpu_id 134 | self.regularizer = regularizer 135 | self.random_seed = random_seed 136 | self.trainable = trainable 137 | self.scope = scope 138 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 139 | 140 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 141 | self.highway_layer_list = [] 142 | for i in range(self.num_layer): 143 | layer_scope = "layer_{0}".format(i) 144 | sublayer_dropout = self.dropout[i] if self.dropout is not None else 0.0 145 | highway_layer = Highway(unit_dim=self.unit_dim, activation=self.activation, 146 | dropout=sublayer_dropout, num_gpus=self.num_gpus, default_gpu_id=self.default_gpu_id, 147 | regularizer=self.regularizer, random_seed=self.random_seed, trainable=self.trainable, scope=layer_scope) 148 | self.highway_layer_list.append(highway_layer) 149 | 150 | def __call__(self, 151 | input_data, 152 | input_mask): 153 | """call stacked highway layer""" 154 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 155 | input_highway = input_data 156 | input_highway_mask = input_mask 157 | 158 | for highway_layer in self.highway_layer_list: 159 | input_highway, input_highway_mask = highway_layer(input_highway, input_highway_mask) 160 | 161 | output_highway = input_highway 162 | output_mask = input_highway_mask 163 | 164 | return output_highway, output_mask 165 | 166 | class StackedConvHighway(object): 167 | """stacked convolution highway layer""" 168 | def __init__(self, 169 | num_layer, 170 | num_filter, 171 | window_size, 172 | activation, 173 | dropout, 174 | num_gpus=1, 175 | default_gpu_id=0, 176 | regularizer=None, 177 | random_seed=0, 178 | trainable=True, 179 | scope="stacked_conv_highway"): 180 | """initialize stacked convolution highway layer""" 181 | self.num_layer = num_layer 182 | self.num_filter = num_filter 183 | self.activation = activation 184 | self.window_size = window_size 185 | self.dropout = dropout 186 | self.num_gpus = num_gpus 187 | self.default_gpu_id = default_gpu_id 188 | self.regularizer = regularizer 189 | self.random_seed = random_seed 190 | self.trainable = trainable 191 | self.scope = scope 192 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 193 | 194 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 195 | self.highway_layer_list = [] 196 | for i in range(self.num_layer): 197 | layer_scope = "layer_{0}".format(i) 198 | sublayer_dropout = self.dropout[i] if self.dropout is not None else 0.0 199 | highway_layer = ConvHighway(num_filter=self.num_filter, window_size=self.window_size, 200 | activation=self.activation, dropout=sublayer_dropout, num_gpus=self.num_gpus, default_gpu_id=self.default_gpu_id, 201 | regularizer=self.regularizer, random_seed=self.random_seed, trainable=self.trainable, scope=layer_scope) 202 | self.highway_layer_list.append(highway_layer) 203 | 204 | def __call__(self, 205 | input_data, 206 | input_mask): 207 | """call stacked convolution highway layer""" 208 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE),
tf.device(self.device_spec): 209 | input_highway = input_data 210 | input_highway_mask = input_mask 211 | 212 | for highway_layer in self.highway_layer_list: 213 | input_highway, input_highway_mask = highway_layer(input_highway, input_highway_mask) 214 | 215 | output_highway = input_highway 216 | output_mask = input_highway_mask 217 | 218 | return output_highway, output_mask 219 | -------------------------------------------------------------------------------- /reading_comprehension/squad/evaluate-v2.py: -------------------------------------------------------------------------------- 1 | """Official evaluation script for SQuAD version 2.0. 2 | 3 | In addition to basic functionality, we also compute additional statistics and 4 | plot precision-recall curves if an additional na_prob.json file is provided. 5 | This file is expected to map question ID's to the model's predicted probability 6 | that a question is unanswerable. 7 | """ 8 | import argparse 9 | import collections 10 | import json 11 | import numpy as np 12 | import os 13 | import re 14 | import string 15 | import sys 16 | 17 | OPTS = None 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.') 21 | parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.') 22 | parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.') 23 | parser.add_argument('--out-file', '-o', metavar='eval.json', 24 | help='Write accuracy metrics to file (default is stdout).') 25 | parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json', 26 | help='Model estimates of probability of no answer.') 27 | parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0, 28 | help='Predict "" if no-answer probability exceeds this (default = 1.0).') 29 | parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None, 30 | help='Save precision-recall curves to directory.') 31 | parser.add_argument('--verbose', '-v', action='store_true') 32 | if len(sys.argv) == 1: 33 | parser.print_help() 34 | sys.exit(1) 35 | return parser.parse_args() 36 | 37 | def make_qid_to_has_ans(dataset): 38 | qid_to_has_ans = {} 39 | for article in dataset: 40 | for p in article['paragraphs']: 41 | for qa in p['qas']: 42 | qid_to_has_ans[qa['id']] = bool(qa['answers']) 43 | return qid_to_has_ans 44 | 45 | def normalize_answer(s): 46 | """Lower text and remove punctuation, articles and extra whitespace.""" 47 | def remove_articles(text): 48 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) 49 | return re.sub(regex, ' ', text) 50 | def white_space_fix(text): 51 | return ' '.join(text.split()) 52 | def remove_punc(text): 53 | exclude = set(string.punctuation) 54 | return ''.join(ch for ch in text if ch not in exclude) 55 | def lower(text): 56 | return text.lower() 57 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 58 | 59 | def get_tokens(s): 60 | if not s: return [] 61 | return normalize_answer(s).split() 62 | 63 | def compute_exact(a_gold, a_pred): 64 | return int(normalize_answer(a_gold) == normalize_answer(a_pred)) 65 | 66 | def compute_f1(a_gold, a_pred): 67 | gold_toks = get_tokens(a_gold) 68 | pred_toks = get_tokens(a_pred) 69 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks) 70 | num_same = sum(common.values()) 71 | if len(gold_toks) == 0 or len(pred_toks) == 0: 72 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise 73 | return int(gold_toks == pred_toks) 74 | if num_same == 0: 75 | return 
0 76 | precision = 1.0 * num_same / len(pred_toks) 77 | recall = 1.0 * num_same / len(gold_toks) 78 | f1 = (2 * precision * recall) / (precision + recall) 79 | return f1 80 | 81 | def get_raw_scores(dataset, preds): 82 | exact_scores = {} 83 | f1_scores = {} 84 | for article in dataset: 85 | for p in article['paragraphs']: 86 | for qa in p['qas']: 87 | qid = qa['id'] 88 | gold_answers = [a['text'] for a in qa['answers'] 89 | if normalize_answer(a['text'])] 90 | if not gold_answers: 91 | # For unanswerable questions, only correct answer is empty string 92 | gold_answers = [''] 93 | if qid not in preds: 94 | print('Missing prediction for %s' % qid) 95 | continue 96 | a_pred = preds[qid] 97 | # Take max over all gold answers 98 | exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) 99 | f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) 100 | return exact_scores, f1_scores 101 | 102 | def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): 103 | new_scores = {} 104 | for qid, s in scores.items(): 105 | pred_na = na_probs[qid] > na_prob_thresh 106 | if pred_na: 107 | new_scores[qid] = float(not qid_to_has_ans[qid]) 108 | else: 109 | new_scores[qid] = s 110 | return new_scores 111 | 112 | def make_eval_dict(exact_scores, f1_scores, qid_list=None): 113 | if not qid_list: 114 | total = len(exact_scores) 115 | return collections.OrderedDict([ 116 | ('exact', 100.0 * sum(exact_scores.values()) / total), 117 | ('f1', 100.0 * sum(f1_scores.values()) / total), 118 | ('total', total), 119 | ]) 120 | else: 121 | total = len(qid_list) 122 | return collections.OrderedDict([ 123 | ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), 124 | ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), 125 | ('total', total), 126 | ]) 127 | 128 | def merge_eval(main_eval, new_eval, prefix): 129 | for k in new_eval: 130 | main_eval['%s_%s' % (prefix, k)] = new_eval[k] 131 | 132 | def plot_pr_curve(precisions, recalls, out_image, title): 133 | plt.step(recalls, precisions, color='b', alpha=0.2, where='post') 134 | plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b') 135 | plt.xlabel('Recall') 136 | plt.ylabel('Precision') 137 | plt.xlim([0.0, 1.05]) 138 | plt.ylim([0.0, 1.05]) 139 | plt.title(title) 140 | plt.savefig(out_image) 141 | plt.clf() 142 | 143 | def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, 144 | out_image=None, title=None): 145 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 146 | true_pos = 0.0 147 | cur_p = 1.0 148 | cur_r = 0.0 149 | precisions = [1.0] 150 | recalls = [0.0] 151 | avg_prec = 0.0 152 | for i, qid in enumerate(qid_list): 153 | if qid_to_has_ans[qid]: 154 | true_pos += scores[qid] 155 | cur_p = true_pos / float(i+1) 156 | cur_r = true_pos / float(num_true_pos) 157 | if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: 158 | # i.e., if we can put a threshold after this point 159 | avg_prec += cur_p * (cur_r - recalls[-1]) 160 | precisions.append(cur_p) 161 | recalls.append(cur_r) 162 | if out_image: 163 | plot_pr_curve(precisions, recalls, out_image, title) 164 | return {'ap': 100.0 * avg_prec} 165 | 166 | def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, 167 | qid_to_has_ans, out_image_dir): 168 | if out_image_dir and not os.path.exists(out_image_dir): 169 | os.makedirs(out_image_dir) 170 | num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) 171 | if num_true_pos == 0: 172 | return 173 | pr_exact = 
make_precision_recall_eval( 174 | exact_raw, na_probs, num_true_pos, qid_to_has_ans, 175 | out_image=os.path.join(out_image_dir, 'pr_exact.png'), 176 | title='Precision-Recall curve for Exact Match score') 177 | pr_f1 = make_precision_recall_eval( 178 | f1_raw, na_probs, num_true_pos, qid_to_has_ans, 179 | out_image=os.path.join(out_image_dir, 'pr_f1.png'), 180 | title='Precision-Recall curve for F1 score') 181 | oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} 182 | pr_oracle = make_precision_recall_eval( 183 | oracle_scores, na_probs, num_true_pos, qid_to_has_ans, 184 | out_image=os.path.join(out_image_dir, 'pr_oracle.png'), 185 | title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)') 186 | merge_eval(main_eval, pr_exact, 'pr_exact') 187 | merge_eval(main_eval, pr_f1, 'pr_f1') 188 | merge_eval(main_eval, pr_oracle, 'pr_oracle') 189 | 190 | def histogram_na_prob(na_probs, qid_list, image_dir, name): 191 | if not qid_list: 192 | return 193 | x = [na_probs[k] for k in qid_list] 194 | weights = np.ones_like(x) / float(len(x)) 195 | plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0)) 196 | plt.xlabel('Model probability of no-answer') 197 | plt.ylabel('Proportion of dataset') 198 | plt.title('Histogram of no-answer probability: %s' % name) 199 | plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name)) 200 | plt.clf() 201 | 202 | def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): 203 | num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) 204 | cur_score = num_no_ans 205 | best_score = cur_score 206 | best_thresh = 0.0 207 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 208 | for i, qid in enumerate(qid_list): 209 | if qid not in scores: continue 210 | if qid_to_has_ans[qid]: 211 | diff = scores[qid] 212 | else: 213 | if preds[qid]: 214 | diff = -1 215 | else: 216 | diff = 0 217 | cur_score += diff 218 | if cur_score > best_score: 219 | best_score = cur_score 220 | best_thresh = na_probs[qid] 221 | return 100.0 * best_score / len(scores), best_thresh 222 | 223 | def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): 224 | best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) 225 | best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) 226 | main_eval['best_exact'] = best_exact 227 | main_eval['best_exact_thresh'] = exact_thresh 228 | main_eval['best_f1'] = best_f1 229 | main_eval['best_f1_thresh'] = f1_thresh 230 | 231 | def main(): 232 | with open(OPTS.data_file) as f: 233 | dataset_json = json.load(f) 234 | dataset = dataset_json['data'] 235 | with open(OPTS.pred_file) as f: 236 | preds = json.load(f) 237 | if OPTS.na_prob_file: 238 | with open(OPTS.na_prob_file) as f: 239 | na_probs = json.load(f) 240 | else: 241 | na_probs = {k: 0.0 for k in preds} 242 | qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False 243 | has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] 244 | no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] 245 | exact_raw, f1_raw = get_raw_scores(dataset, preds) 246 | exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, 247 | OPTS.na_prob_thresh) 248 | f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, 249 | OPTS.na_prob_thresh) 250 | out_eval = make_eval_dict(exact_thresh, f1_thresh) 251 | if has_ans_qids: 252 | has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids) 253 | merge_eval(out_eval, has_ans_eval, 
'HasAns') 254 | if no_ans_qids: 255 | no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) 256 | merge_eval(out_eval, no_ans_eval, 'NoAns') 257 | if OPTS.na_prob_file: 258 | find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans) 259 | if OPTS.na_prob_file and OPTS.out_image_dir: 260 | run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, 261 | qid_to_has_ans, OPTS.out_image_dir) 262 | histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns') 263 | histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns') 264 | if OPTS.out_file: 265 | with open(OPTS.out_file, 'w') as f: 266 | json.dump(out_eval, f) 267 | else: 268 | print(json.dumps(out_eval, indent=2)) 269 | 270 | if __name__ == '__main__': 271 | OPTS = parse_args() 272 | if OPTS.out_image_dir: 273 | import matplotlib 274 | matplotlib.use('Agg') 275 | import matplotlib.pyplot as plt 276 | main() 277 | 278 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
-------------------------------------------------------------------------------- /reading_comprehension/external/rouge.py:
--------------------------------------------------------------------------------
1 | """ROUGE metric implementation.
2 |
3 | Copy from tf_seq2seq/seq2seq/metrics/rouge.py.
4 | This is a modified and slightly extended version of
5 | https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py.
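ROUGE-N scores n-gram overlap between a candidate and its reference;
ROUGE-L scores their longest common subsequence. Each metric is returned
as an (F1, precision, recall) tuple, averaged over examples by rouge().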
6 | """ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | from __future__ import unicode_literals 12 | 13 | import itertools 14 | import numpy as np 15 | 16 | #pylint: disable=C0103 17 | 18 | __all__ = ["rouge"] 19 | 20 | def _get_ngrams(n, text): 21 | """Calcualtes n-grams. 22 | 23 | Args: 24 | n: which n-grams to calculate 25 | text: An array of tokens 26 | 27 | Returns: 28 | A set of n-grams 29 | """ 30 | ngram_set = set() 31 | text_length = len(text) 32 | max_index_ngram_start = text_length - n 33 | for i in range(max_index_ngram_start + 1): 34 | ngram_set.add(tuple(text[i:i + n])) 35 | return ngram_set 36 | 37 | 38 | def _split_into_words(sentences): 39 | """Splits multiple sentences into words and flattens the result""" 40 | return list(itertools.chain(*[_.split(" ") for _ in sentences])) 41 | 42 | 43 | def _get_word_ngrams(n, sentences): 44 | """Calculates word n-grams for multiple sentences. 45 | """ 46 | assert len(sentences) > 0 47 | assert n > 0 48 | 49 | words = _split_into_words(sentences) 50 | return _get_ngrams(n, words) 51 | 52 | 53 | def _len_lcs(x, y): 54 | """ 55 | Returns the length of the Longest Common Subsequence between sequences x 56 | and y. 57 | Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence 58 | 59 | Args: 60 | x: sequence of words 61 | y: sequence of words 62 | 63 | Returns 64 | integer: Length of LCS between x and y 65 | """ 66 | table = _lcs(x, y) 67 | n, m = len(x), len(y) 68 | return table[n, m] 69 | 70 | 71 | def _lcs(x, y): 72 | """ 73 | Computes the length of the longest common subsequence (lcs) between two 74 | strings. The implementation below uses a DP programming algorithm and runs 75 | in O(nm) time where n = len(x) and m = len(y). 76 | Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence 77 | 78 | Args: 79 | x: collection of words 80 | y: collection of words 81 | 82 | Returns: 83 | Table of dictionary of coord and len lcs 84 | """ 85 | n, m = len(x), len(y) 86 | table = dict() 87 | for i in range(n + 1): 88 | for j in range(m + 1): 89 | if i == 0 or j == 0: 90 | table[i, j] = 0 91 | elif x[i - 1] == y[j - 1]: 92 | table[i, j] = table[i - 1, j - 1] + 1 93 | else: 94 | table[i, j] = max(table[i - 1, j], table[i, j - 1]) 95 | return table 96 | 97 | 98 | def _recon_lcs(x, y): 99 | """ 100 | Returns the Longest Subsequence between x and y. 101 | Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence 102 | 103 | Args: 104 | x: sequence of words 105 | y: sequence of words 106 | 107 | Returns: 108 | sequence: LCS of x and y 109 | """ 110 | i, j = len(x), len(y) 111 | table = _lcs(x, y) 112 | 113 | def _recon(i, j): 114 | """private recon calculation""" 115 | if i == 0 or j == 0: 116 | return [] 117 | elif x[i - 1] == y[j - 1]: 118 | return _recon(i - 1, j - 1) + [(x[i - 1], i)] 119 | elif table[i - 1, j] > table[i, j - 1]: 120 | return _recon(i - 1, j) 121 | else: 122 | return _recon(i, j - 1) 123 | 124 | recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) 125 | return recon_tuple 126 | 127 | 128 | def rouge_n(evaluated_sentences, reference_sentences, n=2): 129 | """ 130 | Computes ROUGE-N of two text collections of sentences. 
131 | Source: http://research.microsoft.com/en-us/um/people/cyl/download/
132 | papers/rouge-working-note-v1.3.1.pdf
133 |
134 | Args:
135 | evaluated_sentences: The sentences that have been picked by the summarizer
136 | reference_sentences: The sentences from the reference set
137 | n: Size of ngram. Defaults to 2.
138 |
139 | Returns:
140 | A tuple (f1, precision, recall) for ROUGE-N
141 |
142 | Raises:
143 | ValueError: raises exception if a param has len <= 0
144 | """
145 | if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
146 | raise ValueError("Collections must contain at least 1 sentence.")
147 |
148 | evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
149 | reference_ngrams = _get_word_ngrams(n, reference_sentences)
150 | reference_count = len(reference_ngrams)
151 | evaluated_count = len(evaluated_ngrams)
152 |
153 | # Gets the overlapping ngrams between evaluated and reference
154 | overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
155 | overlapping_count = len(overlapping_ngrams)
156 |
157 | # Handle edge case. This isn't mathematically correct, but it's good enough
158 | if evaluated_count == 0:
159 | precision = 0.0
160 | else:
161 | precision = overlapping_count / evaluated_count
162 |
163 | if reference_count == 0:
164 | recall = 0.0
165 | else:
166 | recall = overlapping_count / reference_count
167 |
168 | f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
169 |
170 | # return overlapping_count / reference_count
171 | return f1_score, precision, recall
172 |
173 |
174 | def _f_p_r_lcs(llcs, m, n):
175 | """
176 | Computes the LCS-based F-measure score
177 | Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
178 | rouge-working-note-v1.3.1.pdf
179 |
180 | Args:
181 | llcs: Length of LCS
182 | m: number of words in reference summary
183 | n: number of words in candidate summary
184 |
185 | Returns:
186 | A tuple (f_lcs, p_lcs, r_lcs) of LCS-based F-measure, precision, and recall
187 | """
188 | r_lcs = llcs / m
189 | p_lcs = llcs / n
190 | beta = p_lcs / (r_lcs + 1e-12)
191 | num = (1 + (beta**2)) * r_lcs * p_lcs
192 | denom = r_lcs + ((beta**2) * p_lcs)
193 | f_lcs = num / (denom + 1e-12)
194 | return f_lcs, p_lcs, r_lcs
195 |
196 |
197 | def rouge_l_sentence_level(evaluated_sentences, reference_sentences):
198 | """
199 | Computes ROUGE-L (sentence level) of two text collections of sentences.
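For example, the LCS of "the cat sat on the mat" and "the cat lay on the
mat" is "the cat on the mat", so LCS(X,Y) = 5 with m = n = 6.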
200 | http://research.microsoft.com/en-us/um/people/cyl/download/papers/
201 | rouge-working-note-v1.3.1.pdf
202 |
203 | Calculated according to:
204 | R_lcs = LCS(X,Y)/m
205 | P_lcs = LCS(X,Y)/n
206 | F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
207 |
208 | where:
209 | X = reference summary
210 | Y = Candidate summary
211 | m = length of reference summary
212 | n = length of candidate summary
213 |
214 | Args:
215 | evaluated_sentences: The sentences that have been picked by the summarizer
216 | reference_sentences: The sentences from the reference set
217 |
218 | Returns:
219 | A tuple (f_lcs, p_lcs, r_lcs)
220 |
221 | Raises:
222 | ValueError: raises exception if a param has len <= 0
223 | """
224 | if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
225 | raise ValueError("Collections must contain at least 1 sentence.")
226 | reference_words = _split_into_words(reference_sentences)
227 | evaluated_words = _split_into_words(evaluated_sentences)
228 | m = len(reference_words)
229 | n = len(evaluated_words)
230 | lcs = _len_lcs(evaluated_words, reference_words)
231 | return _f_p_r_lcs(lcs, m, n)
232 |
233 |
234 | def _union_lcs(evaluated_sentences, reference_sentence):
235 | """
236 | Returns LCS_u(r_i, C) which is the LCS score of the union longest common
237 | subsequence between reference sentence ri and candidate summary C. For example
238 | if r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and
239 | c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is
240 | "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". The
241 | union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and
242 | LCS_u(r_i, C) = 4/5.
243 |
244 | Args:
245 | evaluated_sentences: The sentences that have been picked by the summarizer
246 | reference_sentence: One of the sentences in the reference summaries
247 |
248 | Returns:
249 | float: LCS_u(r_i, C)
250 |
251 | Raises:
252 | ValueError: raises exception if a param has len <= 0
253 | """
254 | if len(evaluated_sentences) <= 0:
255 | raise ValueError("Collections must contain at least 1 sentence.")
256 |
257 | lcs_union = set()
258 | reference_words = _split_into_words([reference_sentence])
259 | combined_lcs_length = 0
260 | for eval_s in evaluated_sentences:
261 | evaluated_words = _split_into_words([eval_s])
262 | lcs = set(_recon_lcs(reference_words, evaluated_words))
263 | combined_lcs_length += len(lcs)
264 | lcs_union = lcs_union.union(lcs)
265 |
266 | union_lcs_count = len(lcs_union)
267 | union_lcs_value = union_lcs_count / combined_lcs_length
268 | return union_lcs_value
269 |
270 |
271 | def rouge_l_summary_level(evaluated_sentences, reference_sentences):
272 | """
273 | Computes ROUGE-L (summary level) of two text collections of sentences.
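Unlike the sentence-level variant, each reference sentence is scored
against the whole candidate using the union of its per-sentence LCS
matches (see _union_lcs), and these union scores are summed before
precision and recall are computed.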
274 | http://research.microsoft.com/en-us/um/people/cyl/download/papers/
275 | rouge-working-note-v1.3.1.pdf
276 |
277 | Calculated according to:
278 | R_lcs = SUM(1, u)[LCS(r_i,C)]/m
279 | P_lcs = SUM(1, u)[LCS(r_i,C)]/n
280 | F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
281 |
282 | where:
283 | SUM(i,u) = SUM from i through u
284 | u = number of sentences in reference summary
285 | C = Candidate summary made up of v sentences
286 | m = number of words in reference summary
287 | n = number of words in candidate summary
288 |
289 | Args:
290 | evaluated_sentences: The sentences that have been picked by the summarizer
291 | reference_sentences: The sentences from the reference set
292 |
293 | Returns:
294 | A tuple (f_lcs, p_lcs, r_lcs)
295 |
296 | Raises:
297 | ValueError: raises exception if a param has len <= 0
298 | """
299 | if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
300 | raise ValueError("Collections must contain at least 1 sentence.")
301 |
302 | # total number of words in reference sentences
303 | m = len(_split_into_words(reference_sentences))
304 |
305 | # total number of words in evaluated sentences
306 | n = len(_split_into_words(evaluated_sentences))
307 |
308 | union_lcs_sum_across_all_references = 0
309 | for ref_s in reference_sentences:
310 | union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences,
311 | ref_s)
312 | return _f_p_r_lcs(union_lcs_sum_across_all_references, m, n)
313 |
314 |
315 | def rouge(hypotheses, references):
316 | """Calculates average ROUGE scores for a list of hypotheses and
317 | references"""
318 |
319 | # Filter out hyps that are of 0 length
320 | # hyps_and_refs = zip(hypotheses, references)
321 | # hyps_and_refs = [_ for _ in hyps_and_refs if len(_[0]) > 0]
322 | # hypotheses, references = zip(*hyps_and_refs)
323 |
324 | # Calculate ROUGE-1 F1, precision, recall scores
325 | rouge_1 = [
326 | rouge_n([hyp], [ref], 1) for hyp, ref in zip(hypotheses, references)
327 | ]
328 | rouge_1_f, rouge_1_p, rouge_1_r = map(np.mean, zip(*rouge_1))
329 |
330 | # Calculate ROUGE-2 F1, precision, recall scores
331 | rouge_2 = [
332 | rouge_n([hyp], [ref], 2) for hyp, ref in zip(hypotheses, references)
333 | ]
334 | rouge_2_f, rouge_2_p, rouge_2_r = map(np.mean, zip(*rouge_2))
335 |
336 | # Calculate ROUGE-L F1, precision, recall scores
337 | rouge_l = [
338 | rouge_l_sentence_level([hyp], [ref])
339 | for hyp, ref in zip(hypotheses, references)
340 | ]
341 | rouge_l_f, rouge_l_p, rouge_l_r = map(np.mean, zip(*rouge_l))
342 |
343 | return {
344 | "rouge_1/f_score": rouge_1_f,
345 | "rouge_1/r_score": rouge_1_r,
346 | "rouge_1/p_score": rouge_1_p,
347 | "rouge_2/f_score": rouge_2_f,
348 | "rouge_2/r_score": rouge_2_r,
349 | "rouge_2/p_score": rouge_2_p,
350 | "rouge_l/f_score": rouge_l_f,
351 | "rouge_l/r_score": rouge_l_r,
352 | "rouge_l/p_score": rouge_l_p,
353 | }
354 |
-------------------------------------------------------------------------------- /reading_comprehension/layer/recurrent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from tensorflow.contrib.rnn import RNNCell
5 |
6 | from util.default_util import *
7 | from util.reading_comprehension_util import *
8 |
9 | __all__ = ["RNN", "BiRNN"]
10 |
11 | def _extract_hidden_state(state,
12 | cell_type):
13 | """extract hidden state"""
14 | return state.h if "lstm" in cell_type else state
15 |
16 | def _create_single_reccurent_cell(unit_dim,
17 | cell_type, 18
| activation, 19 | dropout, 20 | forget_bias, 21 | residual_connect, 22 | attention_mechanism, 23 | device_spec, 24 | random_seed): 25 | """create single recurrent cell""" 26 | weight_initializer = create_variable_initializer("glorot_uniform", random_seed) 27 | bias_initializer = create_variable_initializer("zero") 28 | recurrent_activation = create_activation_function(activation) 29 | 30 | if cell_type == "lstm": 31 | single_cell = tf.contrib.rnn.LSTMCell(num_units=unit_dim, use_peepholes=False, 32 | activation=recurrent_activation, forget_bias=forget_bias, initializer=weight_initializer, state_is_tuple=True) 33 | elif cell_type == "peephole_lstm": 34 | single_cell = tf.contrib.rnn.LSTMCell(num_units=unit_dim, use_peepholes=True, 35 | activation=recurrent_activation, forget_bias=forget_bias, initializer=weight_initializer) 36 | elif cell_type == "layer_norm_lstm": 37 | single_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(num_units=unit_dim, layer_norm=True, 38 | activation=recurrent_activation, forget_bias=forget_bias) 39 | elif cell_type == "block_lstm": 40 | single_cell = tf.contrib.rnn.LSTMBlockCell(num_units=unit_dim, forget_bias=forget_bias) 41 | elif cell_type == "block_fused_lstm": 42 | single_cell = tf.contrib.rnn.LSTMBlockFusedCell(num_units=unit_dim, forget_bias=forget_bias) 43 | elif cell_type == "gru": 44 | single_cell = tf.contrib.rnn.GRUCell(num_units=unit_dim, activation=recurrent_activation, 45 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer) 46 | elif cell_type == "sru": 47 | single_cell = tf.contrib.rnn.SRUCell(num_units=unit_dim, activation=recurrent_activation) 48 | else: 49 | raise ValueError("unsupported cell type {0}".format(cell_type)) 50 | 51 | if attention_mechanism != None: 52 | single_cell = AttentionCellWrapper(cell=single_cell, attention_mechanism=attention_mechanism) 53 | 54 | if dropout > 0.0: 55 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=1.0-dropout) 56 | 57 | if residual_connect == True: 58 | single_cell = tf.contrib.rnn.ResidualWrapper(cell=single_cell) 59 | 60 | if device_spec is not None: 61 | single_cell = tf.contrib.rnn.DeviceWrapper(cell=single_cell, device=device_spec) 62 | 63 | return single_cell 64 | 65 | def _create_recurrent_cell(num_layer, 66 | unit_dim, 67 | cell_type, 68 | activation, 69 | dropout, 70 | forget_bias, 71 | residual_connect, 72 | attention_mechanism, 73 | num_gpus, 74 | default_gpu_id, 75 | random_seed): 76 | """create recurrent cell""" 77 | cell_list = [] 78 | for i in range(num_layer): 79 | device_spec = get_device_spec(default_gpu_id, num_gpus) 80 | 81 | single_cell = _create_single_reccurent_cell(unit_dim, cell_type, activation, 82 | dropout, forget_bias, residual_connect, attention_mechanism, device_spec, random_seed) 83 | 84 | cell_list.append(single_cell) 85 | 86 | cell = tf.contrib.rnn.MultiRNNCell(cell_list) 87 | 88 | return cell 89 | 90 | class RNN(object): 91 | """uni-directional recurrent layer""" 92 | def __init__(self, 93 | num_layer, 94 | unit_dim, 95 | cell_type, 96 | activation, 97 | dropout, 98 | forget_bias=1.0, 99 | residual_connect=False, 100 | attention_mechanism=None, 101 | num_gpus=1, 102 | default_gpu_id=0, 103 | random_seed=0, 104 | trainable=True, 105 | scope="rnn"): 106 | """initialize uni-directional recurrent layer""" 107 | self.num_layer = num_layer 108 | self.unit_dim = unit_dim 109 | self.cell_type = cell_type 110 | self.activation = activation 111 | self.dropout = dropout 112 | self.forget_bias = forget_bias 113 | 
self.residual_connect = residual_connect 114 | self.attention_mechanism = attention_mechanism 115 | self.num_gpus = num_gpus 116 | self.default_gpu_id = default_gpu_id 117 | self.random_seed = random_seed 118 | self.trainable = trainable 119 | self.scope = scope 120 | 121 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE): 122 | self.cell = _create_recurrent_cell(self.num_layer, self.unit_dim, self.cell_type, 123 | self.activation, self.dropout, self.forget_bias, self.residual_connect, 124 | self.attention_mechanism, self.num_gpus, self.default_gpu_id, self.random_seed) 125 | 126 | def __call__(self, 127 | input_data, 128 | input_mask): 129 | """call uni-directional recurrent layer""" 130 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE): 131 | input_data_shape = tf.shape(input_data) 132 | input_mask_shape = tf.shape(input_mask) 133 | shape_size = len(input_data.get_shape().as_list()) 134 | if shape_size > 3: 135 | input_data = tf.reshape(input_data, shape=tf.concat([[-1], input_data_shape[-2:]], axis=0)) 136 | input_mask = tf.reshape(input_mask, shape=tf.concat([[-1], input_mask_shape[-2:]], axis=0)) 137 | 138 | input_length = tf.cast(tf.reduce_sum(tf.squeeze(input_mask, axis=-1), axis=-1), dtype=tf.int32) 139 | output_recurrent, final_state_recurrent = tf.nn.dynamic_rnn(cell=self.cell, 140 | inputs=input_data, sequence_length=input_length, dtype=input_data.dtype) 141 | output_mask = input_mask 142 | 143 | state_list = [_extract_hidden_state(state, self.cell_type) for state in final_state_recurrent] 144 | final_state_recurrent = tf.concat(state_list, axis=-1) 145 | final_state_mask = tf.squeeze(tf.reduce_max(input_mask, axis=1, keepdims=True), axis=1) 146 | 147 | if shape_size > 3: 148 | output_recurrent_shape = tf.shape(output_recurrent) 149 | output_mask_shape = tf.shape(output_mask) 150 | final_state_recurrent_shape = tf.shape(final_state_recurrent) 151 | final_state_mask_shape = tf.shape(final_state_mask) 152 | output_recurrent = tf.reshape(output_recurrent, 153 | shape=tf.concat([input_data_shape[:-2], output_recurrent_shape[-2:]], axis=0)) 154 | output_mask = tf.reshape(output_mask, 155 | shape=tf.concat([input_mask_shape[:-2], output_mask_shape[-2:]], axis=0)) 156 | final_state_recurrent = tf.reshape(final_state_recurrent, 157 | shape=tf.concat([input_data_shape[:-2], final_state_recurrent_shape[-1:]], axis=0)) 158 | final_state_mask = tf.reshape(final_state_mask, 159 | shape=tf.concat([input_mask_shape[:-2], final_state_mask_shape[-1:]], axis=0)) 160 | 161 | return output_recurrent, output_mask, final_state_recurrent, final_state_mask 162 | 163 | class BiRNN(object): 164 | """bi-directional recurrent layer""" 165 | def __init__(self, 166 | num_layer, 167 | unit_dim, 168 | cell_type, 169 | activation, 170 | dropout, 171 | forget_bias=1.0, 172 | residual_connect=False, 173 | attention_mechanism=None, 174 | num_gpus=1, 175 | default_gpu_id=0, 176 | random_seed=0, 177 | trainable=True, 178 | scope="bi_rnn"): 179 | """initialize bi-directional recurrent layer""" 180 | self.num_layer = num_layer 181 | self.unit_dim = unit_dim 182 | self.cell_type = cell_type 183 | self.activation = activation 184 | self.dropout = dropout 185 | self.forget_bias = forget_bias 186 | self.residual_connect = residual_connect 187 | self.attention_mechanism = attention_mechanism 188 | self.num_gpus = num_gpus 189 | self.default_gpu_id = default_gpu_id 190 | self.random_seed = random_seed 191 | self.trainable = trainable 192 | self.scope = scope 193 | 194 | with tf.variable_scope(self.scope, 
reuse=tf.AUTO_REUSE): 195 | self.fwd_cell = _create_recurrent_cell(self.num_layer, self.unit_dim, self.cell_type, 196 | self.activation, self.dropout, self.forget_bias, self.residual_connect, 197 | self.attention_mechanism, self.num_gpus, self.default_gpu_id, self.random_seed) 198 | self.bwd_cell = _create_recurrent_cell(self.num_layer, self.unit_dim, self.cell_type, 199 | self.activation, self.dropout, self.forget_bias, self.residual_connect, 200 | self.attention_mechanism, self.num_gpus, self.default_gpu_id + self.num_layer, self.random_seed) 201 | 202 | def __call__(self, 203 | input_data, 204 | input_mask): 205 | """call bi-directional recurrent layer""" 206 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE): 207 | input_data_shape = tf.shape(input_data) 208 | input_mask_shape = tf.shape(input_mask) 209 | shape_size = len(input_data.get_shape().as_list()) 210 | if shape_size > 3: 211 | input_data = tf.reshape(input_data, shape=tf.concat([[-1], input_data_shape[-2:]], axis=0)) 212 | input_mask = tf.reshape(input_mask, shape=tf.concat([[-1], input_mask_shape[-2:]], axis=0)) 213 | 214 | input_length = tf.cast(tf.reduce_sum(tf.squeeze(input_mask, axis=-1), axis=-1), dtype=tf.int32) 215 | output_recurrent, final_state_recurrent = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.fwd_cell, 216 | cell_bw=self.bwd_cell, inputs=input_data, sequence_length=input_length, dtype=input_data.dtype) 217 | 218 | output_recurrent = tf.concat(output_recurrent, axis=-1) 219 | output_mask = input_mask 220 | 221 | fwd_state = final_state_recurrent[0] 222 | bwd_state = final_state_recurrent[1] 223 | 224 | state_list = [] 225 | for i in range(self.num_layer): 226 | state_list.append(_extract_hidden_state(fwd_state[i], self.cell_type)) 227 | state_list.append(_extract_hidden_state(bwd_state[i], self.cell_type)) 228 | 229 | final_state_recurrent = tf.concat(state_list, axis=-1) 230 | final_state_mask = tf.squeeze(tf.reduce_max(input_mask, axis=1, keepdims=True), axis=1) 231 | 232 | if shape_size > 3: 233 | output_recurrent_shape = tf.shape(output_recurrent) 234 | output_mask_shape = tf.shape(output_mask) 235 | final_state_recurrent_shape = tf.shape(final_state_recurrent) 236 | final_state_mask_shape = tf.shape(final_state_mask) 237 | output_recurrent = tf.reshape(output_recurrent, 238 | shape=tf.concat([input_data_shape[:-2], output_recurrent_shape[-2:]], axis=0)) 239 | output_mask = tf.reshape(output_mask, 240 | shape=tf.concat([input_mask_shape[:-2], output_mask_shape[-2:]], axis=0)) 241 | final_state_recurrent = tf.reshape(final_state_recurrent, 242 | shape=tf.concat([input_data_shape[:-2], final_state_recurrent_shape[-1:]], axis=0)) 243 | final_state_mask = tf.reshape(final_state_mask, 244 | shape=tf.concat([input_mask_shape[:-2], final_state_mask_shape[-1:]], axis=0)) 245 | 246 | return output_recurrent, output_mask, final_state_recurrent, final_state_mask 247 | 248 | class AttentionCellWrapper(RNNCell): 249 | def __init__(self, 250 | cell, 251 | attention_mechanism): 252 | """initialize attention cell wrapper""" 253 | super(AttentionCellWrapper, self).__init__() 254 | 255 | self._cell = cell 256 | self._attention_mechanism = attention_mechanism 257 | 258 | @property 259 | def state_size(self): 260 | return self._cell.state_size 261 | 262 | @property 263 | def output_size(self): 264 | return self._cell.output_size 265 | 266 | def __call__(self, 267 | inputs, 268 | state, 269 | scope=None): 270 | """call attention cell wrapper""" 271 | query = tf.expand_dims(tf.concat([inputs, state], axis=-1), axis=1) 
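# The [inputs; state] concatenation forms the attention query for this
# single recurrent step; expand_dims adds a length-1 time axis so the
# wrapped attention_mechanism, which operates on sequence-shaped inputs,
# can be applied, and the attended result is squeezed back below.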
272 | query_mask = tf.reduce_sum(query, axis=-1, keepdims=True) 273 | query_mask = tf.cast(tf.greater(query_mask, tf.constant(0, shape=[], dtype=tf.float32)), dtype=tf.float32) 274 | attention, attention_mask = self._attention_mechanism(query, query_mask) 275 | inputs = tf.squeeze(attention, axis=1) 276 | cell_output, new_state = self._cell(inputs, state, scope) 277 | 278 | return cell_output, new_state 279 | -------------------------------------------------------------------------------- /reading_comprehension/layer/dense.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from util.default_util import * 5 | from util.reading_comprehension_util import * 6 | 7 | from layer.basic import * 8 | 9 | __all__ = ["Dense", "DoubleDense", "StackedDense", "StackedDoubleDense"] 10 | 11 | class Dense(object): 12 | """dense layer""" 13 | def __init__(self, 14 | unit_dim, 15 | activation, 16 | dropout, 17 | layer_dropout=0.0, 18 | layer_norm=False, 19 | residual_connect=False, 20 | use_bias=False, 21 | num_gpus=1, 22 | default_gpu_id=0, 23 | regularizer=None, 24 | random_seed=0, 25 | trainable=True, 26 | scope="dense"): 27 | """initialize dense layer""" 28 | self.unit_dim = unit_dim 29 | self.activation = activation 30 | self.dropout = dropout 31 | self.layer_dropout = layer_dropout 32 | self.layer_norm = layer_norm 33 | self.residual_connect = residual_connect 34 | self.use_bias = use_bias 35 | self.regularizer = regularizer 36 | self.random_seed = random_seed 37 | self.trainable = trainable 38 | self.scope = scope 39 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 40 | 41 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 42 | weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed) 43 | bias_initializer = create_variable_initializer("zero") 44 | self.dense_layer = tf.layers.Dense(units=self.unit_dim, activation=None, use_bias=self.use_bias, 45 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer, 46 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable) 47 | 48 | self.dense_activation = create_activation_function(self.activation) 49 | 50 | self.dropout_layer = Dropout(rate=self.dropout, num_gpus=num_gpus, 51 | default_gpu_id=default_gpu_id, random_seed=self.random_seed) 52 | 53 | if self.layer_norm == True: 54 | self.norm_layer = LayerNorm(layer_dim=self.unit_dim, 55 | num_gpus=num_gpus, default_gpu_id=default_gpu_id, trainable=self.trainable) 56 | 57 | def __call__(self, 58 | input_data, 59 | input_mask): 60 | """call dense layer""" 61 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 62 | input_dense = input_data 63 | input_dense_mask = input_mask 64 | 65 | if self.layer_norm == True: 66 | input_dense, input_dense_mask = self.norm_layer(input_dense, input_dense_mask) 67 | 68 | input_dense = self.dense_layer(input_dense) 69 | 70 | if self.dense_activation != None: 71 | input_dense = self.dense_activation(input_dense) 72 | 73 | input_dense, input_dense_mask = self.dropout_layer(input_dense, input_dense_mask) 74 | 75 | if self.residual_connect == True: 76 | output_dense, output_mask = tf.cond(tf.random_uniform([]) < self.layer_dropout, 77 | lambda: (input_data, input_mask), 78 | lambda: (input_dense + input_data, input_mask)) 79 | else: 80 | output_dense = input_dense 81 | output_mask = input_dense_mask 82 | 83 | return 
output_dense, output_mask 84 | 85 | class DoubleDense(object): 86 | """double-dense layer""" 87 | def __init__(self, 88 | unit_dim, 89 | inner_scale, 90 | activation, 91 | dropout, 92 | layer_dropout=0.0, 93 | layer_norm=False, 94 | residual_connect=False, 95 | use_bias=False, 96 | num_gpus=1, 97 | default_gpu_id=0, 98 | regularizer=None, 99 | random_seed=0, 100 | trainable=True, 101 | scope="double_dense"): 102 | """initialize double-dense layer""" 103 | self.unit_dim = unit_dim 104 | self.inner_scale = inner_scale 105 | self.activation = activation 106 | self.dropout = dropout 107 | self.layer_dropout = layer_dropout 108 | self.layer_norm = layer_norm 109 | self.residual_connect = residual_connect 110 | self.use_bias = use_bias 111 | self.regularizer = regularizer 112 | self.random_seed = random_seed 113 | self.trainable = trainable 114 | self.scope = scope 115 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 116 | 117 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 118 | weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed) 119 | bias_initializer = create_variable_initializer("zero") 120 | self.inner_dense_layer = tf.layers.Dense(units=self.unit_dim * self.inner_scale, activation=None, use_bias=self.use_bias, 121 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer, 122 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable) 123 | self.outer_dense_layer = tf.layers.Dense(units=self.unit_dim, activation=None, use_bias=self.use_bias, 124 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer, 125 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable) 126 | 127 | self.dense_activation = create_activation_function(self.activation) 128 | 129 | self.dropout_layer = Dropout(rate=self.dropout, num_gpus=num_gpus, 130 | default_gpu_id=default_gpu_id, random_seed=self.random_seed) 131 | 132 | if self.layer_norm == True: 133 | self.norm_layer = LayerNorm(layer_dim=self.unit_dim, 134 | num_gpus=num_gpus, default_gpu_id=default_gpu_id, trainable=self.trainable) 135 | 136 | def __call__(self, 137 | input_data, 138 | input_mask): 139 | """call double-dense layer""" 140 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 141 | input_dense = input_data 142 | input_dense_mask = input_mask 143 | 144 | if self.layer_norm == True: 145 | input_dense, input_dense_mask = self.norm_layer(input_dense, input_dense_mask) 146 | 147 | input_dense = self.inner_dense_layer(input_dense) 148 | 149 | if self.dense_activation != None: 150 | input_dense = self.dense_activation(input_dense) 151 | 152 | input_dense = self.outer_dense_layer(input_dense) 153 | 154 | input_dense, input_dense_mask = self.dropout_layer(input_dense, input_dense_mask) 155 | 156 | if self.residual_connect == True: 157 | output_dense, output_mask = tf.cond(tf.random_uniform([]) < self.layer_dropout, 158 | lambda: (input_data, input_mask), 159 | lambda: (input_dense + input_data, input_mask)) 160 | else: 161 | output_dense = input_dense 162 | output_mask = input_dense_mask 163 | 164 | return output_dense, output_mask 165 | 166 | class StackedDense(object): 167 | """stacked dense layer""" 168 | def __init__(self, 169 | layer_creator, 170 | num_layer, 171 | unit_dim, 172 | activation, 173 | dropout, 174 | layer_dropout=None, 175 | layer_norm=False, 176 | residual_connect=False, 177 | use_bias=False, 178 | 
num_gpus=1, 179 | default_gpu_id=0, 180 | regularizer=None, 181 | random_seed=0, 182 | trainable=True, 183 | scope="stacked_dense"): 184 | """initialize stacked dense layer""" 185 | self.layer_creator = layer_creator 186 | self.num_layer = num_layer 187 | self.unit_dim = unit_dim 188 | self.activation = activation 189 | self.dropout = dropout 190 | self.layer_dropout = layer_dropout 191 | self.layer_norm = layer_norm 192 | self.residual_connect = residual_connect 193 | self.use_bias = use_bias 194 | self.num_gpus = num_gpus 195 | self.default_gpu_id = default_gpu_id 196 | self.regularizer = regularizer 197 | self.random_seed = random_seed 198 | self.trainable = trainable 199 | self.scope = scope 200 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 201 | 202 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 203 | self.dense_layer_list = [] 204 | for i in range(self.num_layer): 205 | layer_scope = "layer_{0}".format(i) 206 | sublayer_dropout = self.dropout[i] if self.dropout != None else 0.0 207 | sublayer_layer_dropout = self.layer_dropout[i] if self.layer_dropout != None else 0.0 208 | dense_layer = self.layer_creator(unit_dim=self.unit_dim, activation=self.activation, 209 | dropout=sublayer_dropout, layer_dropout=sublayer_layer_dropout, layer_norm=self.layer_norm, 210 | residual_connect=self.residual_connect, use_bias=self.use_bias, num_gpus=self.num_gpus, 211 | default_gpu_id=self.default_gpu_id, regularizer=self.regularizer, random_seed=self.random_seed, 212 | trainable=self.trainable, scope=layer_scope) 213 | self.dense_layer_list.append(dense_layer) 214 | 215 | def __call__(self, 216 | input_data, 217 | input_mask): 218 | """call stacked dense layer""" 219 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 220 | input_dense = input_data 221 | input_dense_mask = input_mask 222 | 223 | for dense_layer in self.dense_layer_list: 224 | input_dense, input_dense_mask = dense_layer(input_dense, input_dense_mask) 225 | 226 | output_dense = input_dense 227 | output_mask = input_dense_mask 228 | 229 | return output_dense, output_mask 230 | 231 | class StackedDoubleDense(object): 232 | """stacked double-dense layer""" 233 | def __init__(self, 234 | layer_creator, 235 | num_layer, 236 | unit_dim, 237 | inner_scale, 238 | activation, 239 | dropout, 240 | layer_dropout=None, 241 | layer_norm=False, 242 | residual_connect=False, 243 | use_bias=False, 244 | num_gpus=1, 245 | default_gpu_id=0, 246 | regularizer=None, 247 | random_seed=0, 248 | trainable=True, 249 | scope="stacked_double_dense"): 250 | """initialize stacked double-dense layer""" 251 | self.layer_creator = layer_creator 252 | self.num_layer = num_layer 253 | self.unit_dim = unit_dim 254 | self.inner_scale = inner_scale 255 | self.activation = activation 256 | self.dropout = dropout 257 | self.layer_dropout = layer_dropout 258 | self.layer_norm = layer_norm 259 | self.residual_connect = residual_connect 260 | self.use_bias = use_bias 261 | self.num_gpus = num_gpus 262 | self.default_gpu_id = default_gpu_id 263 | self.regularizer = regularizer 264 | self.random_seed = random_seed 265 | self.trainable = trainable 266 | self.scope = scope 267 | self.device_spec = get_device_spec(default_gpu_id, num_gpus) 268 | 269 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 270 | self.dense_layer_list = [] 271 | for i in range(self.num_layer): 272 | layer_scope = "layer_{0}".format(i) 273 | sublayer_dropout = self.dropout[i] if 
self.dropout != None else 0.0 274 | sublayer_layer_dropout = self.layer_dropout[i] if self.layer_dropout != None else 0.0 275 | dense_layer = self.layer_creator(unit_dim=self.unit_dim, inner_scale=self.inner_scale, activation=self.activation, 276 | dropout=sublayer_dropout, layer_dropout=sublayer_layer_dropout, layer_norm=self.layer_norm, 277 | residual_connect=self.residual_connect, use_bias=self.use_bias, num_gpus=self.num_gpus, 278 | default_gpu_id=self.default_gpu_id, regularizer=self.regularizer, random_seed=self.random_seed, 279 | trainable=self.trainable, scope=layer_scope) 280 | self.dense_layer_list.append(dense_layer) 281 | 282 | def __call__(self, 283 | input_data, 284 | input_mask): 285 | """call stacked double-dense layer""" 286 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec): 287 | input_dense = input_data 288 | input_dense_mask = input_mask 289 | 290 | for dense_layer in self.dense_layer_list: 291 | input_dense, input_dense_mask = dense_layer(input_dense, input_dense_mask) 292 | 293 | output_dense = input_dense 294 | output_mask = input_dense_mask 295 | 296 | return output_dense, output_mask 297 | -------------------------------------------------------------------------------- /reading_comprehension/config/config_search_template.qanet.json: -------------------------------------------------------------------------------- 1 | { 2 | "hyperparams": { 3 | "data_max_question_length": { 4 | "stype": "discrete", 5 | "set": [50], 6 | "dtype": "int" 7 | }, 8 | "data_max_context_length": { 9 | "stype": "discrete", 10 | "set": [400], 11 | "dtype": "int" 12 | }, 13 | "data_max_answer_length": { 14 | "stype": "discrete", 15 | "set": [30], 16 | "dtype": "int" 17 | }, 18 | "data_max_subword_length": { 19 | "stype": "discrete", 20 | "set": [16], 21 | "dtype": "int" 22 | }, 23 | "data_max_char_length": { 24 | "stype": "discrete", 25 | "set": [16], 26 | "dtype": "int" 27 | }, 28 | "train_batch_size": { 29 | "stype": "discrete", 30 | "set": [32], 31 | "dtype": "int" 32 | }, 33 | "train_optimizer_type": { 34 | "stype": "discrete", 35 | "set": ["adam"], 36 | "dtype": "string" 37 | }, 38 | "train_optimizer_learning_rate": { 39 | "stype": "log", 40 | "range": [0.0001, 0.01], 41 | "dtype": "float" 42 | }, 43 | "model_representation_word_embed_dim": { 44 | "stype": "discrete", 45 | "set": [300], 46 | "dtype": "int" 47 | }, 48 | "model_representation_subword_embed_dim": { 49 | "stype": "lookup", 50 | "key": "embed_dim", 51 | "dtype": "int" 52 | }, 53 | "model_representation_subword_unit_dim": { 54 | "stype": "discrete", 55 | "set": [100], 56 | "dtype": "int" 57 | }, 58 | "model_representation_subword_window_size": { 59 | "stype": "lookup", 60 | "key": "window_size", 61 | "dtype": "list" 62 | }, 63 | "model_representation_subword_hidden_activation": { 64 | "stype": "discrete", 65 | "set": ["relu"], 66 | "dtype": "string" 67 | }, 68 | "model_representation_subword_dropout": { 69 | "stype": "lookup", 70 | "key": "dropout", 71 | "scale": 1.0, 72 | "shift": 0.0, 73 | "dtype": "float" 74 | }, 75 | "model_representation_subword_pooling_type": { 76 | "stype": "lookup", 77 | "key": "pooling_type", 78 | "dtype": "string" 79 | }, 80 | "model_representation_char_embed_dim": { 81 | "stype": "lookup", 82 | "key": "embed_dim", 83 | "dtype": "int" 84 | }, 85 | "model_representation_char_unit_dim": { 86 | "stype": "discrete", 87 | "set": [100], 88 | "dtype": "int" 89 | }, 90 | "model_representation_char_window_size": { 91 | "stype": "lookup", 92 | "key": "window_size", 93 | 
"dtype": "list" 94 | }, 95 | "model_representation_char_hidden_activation": { 96 | "stype": "discrete", 97 | "set": ["relu"], 98 | "dtype": "string" 99 | }, 100 | "model_representation_char_dropout": { 101 | "stype": "lookup", 102 | "key": "dropout", 103 | "scale": 1.0, 104 | "shift": 0.0, 105 | "dtype": "float" 106 | }, 107 | "model_representation_char_pooling_type": { 108 | "stype": "lookup", 109 | "key": "pooling_type", 110 | "dtype": "string" 111 | }, 112 | "model_representation_fusion_type": { 113 | "stype": "discrete", 114 | "set": ["highway"], 115 | "dtype": "string" 116 | }, 117 | "model_representation_fusion_num_layer": { 118 | "stype": "discrete", 119 | "set": [2], 120 | "dtype": "int" 121 | }, 122 | "model_representation_fusion_unit_dim": { 123 | "stype": "discrete", 124 | "set": [400], 125 | "dtype": "int" 126 | }, 127 | "model_representation_fusion_hidden_activation": { 128 | "stype": "discrete", 129 | "set": ["relu"], 130 | "dtype": "string" 131 | }, 132 | "model_representation_fusion_dropout": { 133 | "stype": "lookup", 134 | "key": "dropout", 135 | "scale": 1.0, 136 | "shift": 0.0, 137 | "dtype": "float" 138 | }, 139 | "model_understanding_question_num_layer": { 140 | "stype": "lookup", 141 | "key": "understanding_num_layer", 142 | "dtype": "int" 143 | }, 144 | "model_understanding_question_num_conv": { 145 | "stype": "lookup", 146 | "key": "understanding_num_conv", 147 | "dtype": "int" 148 | }, 149 | "model_understanding_question_num_head": { 150 | "stype": "discrete", 151 | "set": [8], 152 | "dtype": "int" 153 | }, 154 | "model_understanding_question_unit_dim": { 155 | "stype": "lookup", 156 | "key": "unit_dim", 157 | "scale": 1.0, 158 | "shift": 0.0, 159 | "dtype": "int" 160 | }, 161 | "model_understanding_question_window_size": { 162 | "stype": "lookup", 163 | "key": "understanding_window_size", 164 | "dtype": "list" 165 | }, 166 | "model_understanding_question_hidden_activation": { 167 | "stype": "lookup", 168 | "key": "hidden_activation", 169 | "dtype": "string" 170 | }, 171 | "model_understanding_question_dropout": { 172 | "stype": "lookup", 173 | "key": "dropout", 174 | "scale": 1.0, 175 | "shift": 0.0, 176 | "dtype": "float" 177 | }, 178 | "model_understanding_question_layer_dropout": { 179 | "stype": "lookup", 180 | "key": "layer_dropout", 181 | "scale": 1.0, 182 | "shift": 0.0, 183 | "dtype": "float" 184 | }, 185 | "model_understanding_context_num_layer": { 186 | "stype": "lookup", 187 | "key": "understanding_num_layer", 188 | "dtype": "int" 189 | }, 190 | "model_understanding_context_num_conv": { 191 | "stype": "lookup", 192 | "key": "understanding_num_conv", 193 | "dtype": "int" 194 | }, 195 | "model_understanding_context_num_head": { 196 | "stype": "discrete", 197 | "set": [8], 198 | "dtype": "int" 199 | }, 200 | "model_understanding_context_unit_dim": { 201 | "stype": "lookup", 202 | "key": "unit_dim", 203 | "scale": 1.0, 204 | "shift": 0.0, 205 | "dtype": "int" 206 | }, 207 | "model_understanding_context_window_size": { 208 | "stype": "lookup", 209 | "key": "understanding_window_size", 210 | "dtype": "list" 211 | }, 212 | "model_understanding_context_hidden_activation": { 213 | "stype": "lookup", 214 | "key": "hidden_activation", 215 | "dtype": "string" 216 | }, 217 | "model_understanding_context_dropout": { 218 | "stype": "lookup", 219 | "key": "dropout", 220 | "scale": 1.0, 221 | "shift": 0.0, 222 | "dtype": "float" 223 | }, 224 | "model_understanding_context_layer_dropout": { 225 | "stype": "lookup", 226 | "key": "layer_dropout", 227 | "scale": 1.0, 228 | 
"shift": 0.0, 229 | "dtype": "float" 230 | }, 231 | "model_interaction_context2question_attention_dim": { 232 | "stype": "lookup", 233 | "key": "unit_dim", 234 | "scale": 1.0, 235 | "shift": 0.0, 236 | "dtype": "int" 237 | }, 238 | "model_interaction_context2question_score_type": { 239 | "stype": "lookup", 240 | "key": "score_type", 241 | "dtype": "string" 242 | }, 243 | "model_interaction_question2context_attention_dim": { 244 | "stype": "lookup", 245 | "key": "unit_dim", 246 | "scale": 1.0, 247 | "shift": 0.0, 248 | "dtype": "int" 249 | }, 250 | "model_interaction_question2context_score_type": { 251 | "stype": "lookup", 252 | "key": "score_type", 253 | "dtype": "string" 254 | }, 255 | "model_interaction_fusion_type": { 256 | "stype": "discrete", 257 | "set": ["concate"], 258 | "dtype": "string" 259 | }, 260 | "model_interaction_fusion_num_layer": { 261 | "stype": "discrete", 262 | "set": [1], 263 | "dtype": "int" 264 | }, 265 | "model_interaction_fusion_unit_dim": { 266 | "stype": "lookup", 267 | "key": "unit_dim", 268 | "scale": 4.0, 269 | "shift": 0.0, 270 | "dtype": "int" 271 | }, 272 | "model_interaction_fusion_hidden_activation": { 273 | "stype": "discrete", 274 | "set": ["relu"], 275 | "dtype": "string" 276 | }, 277 | "model_interaction_fusion_dropout": { 278 | "stype": "lookup", 279 | "key": "dropout", 280 | "scale": 1.0, 281 | "shift": 0.0, 282 | "dtype": "float" 283 | }, 284 | "model_interaction_fusion_combo_enable": { 285 | "stype": "discrete", 286 | "set": [true], 287 | "dtype": "boolean" 288 | }, 289 | "model_modeling_answer_num_layer": { 290 | "stype": "lookup", 291 | "key": "modeling_num_layer", 292 | "dtype": "int" 293 | }, 294 | "model_modeling_answer_num_conv": { 295 | "stype": "lookup", 296 | "key": "modeling_num_conv", 297 | "dtype": "int" 298 | }, 299 | "model_modeling_answer_num_head": { 300 | "stype": "discrete", 301 | "set": [8], 302 | "dtype": "int" 303 | }, 304 | "model_modeling_answer_unit_dim": { 305 | "stype": "lookup", 306 | "key": "unit_dim", 307 | "scale": 1.0, 308 | "shift": 0.0, 309 | "dtype": "int" 310 | }, 311 | "model_modeling_answer_window_size": { 312 | "stype": "lookup", 313 | "key": "modeling_window_size", 314 | "dtype": "list" 315 | }, 316 | "model_modeling_answer_hidden_activation": { 317 | "stype": "lookup", 318 | "key": "hidden_activation", 319 | "dtype": "string" 320 | }, 321 | "model_modeling_answer_dropout": { 322 | "stype": "lookup", 323 | "key": "dropout", 324 | "scale": 1.0, 325 | "shift": 0.0, 326 | "dtype": "float" 327 | }, 328 | "model_modeling_answer_layer_dropout": { 329 | "stype": "lookup", 330 | "key": "layer_dropout", 331 | "scale": 1.0, 332 | "shift": 0.0, 333 | "dtype": "float" 334 | }, 335 | "model_output_answer_start_dropout": { 336 | "stype": "lookup", 337 | "key": "dropout", 338 | "scale": 1.0, 339 | "shift": 0.0, 340 | "dtype": "float" 341 | }, 342 | "model_output_answer_end_dropout": { 343 | "stype": "lookup", 344 | "key": "dropout", 345 | "scale": 1.0, 346 | "shift": 0.0, 347 | "dtype": "float" 348 | } 349 | }, 350 | "variables": { 351 | "embed_dim": { 352 | "stype": "discrete", 353 | "set": [8, 16, 32, 64], 354 | "dtype": "int" 355 | }, 356 | "window_size": { 357 | "stype": "discrete", 358 | "set": [[3], [5], [7]], 359 | "dtype": "list" 360 | }, 361 | "pooling_type": { 362 | "stype": "discrete", 363 | "set": ["max"], 364 | "dtype": "string" 365 | }, 366 | "unit_dim": { 367 | "stype": "uniform", 368 | "range": [50, 200], 369 | "dtype": "int" 370 | }, 371 | "hidden_activation": { 372 | "stype": "discrete", 373 | "set": 
["tanh", "relu"], 374 | "dtype": "string" 375 | }, 376 | "dropout": { 377 | "stype": "uniform", 378 | "range": [0.0, 0.5], 379 | "dtype": "float" 380 | }, 381 | "layer_dropout": { 382 | "stype": "uniform", 383 | "range": [0.0, 0.5], 384 | "dtype": "float" 385 | }, 386 | "score_type": { 387 | "stype": "discrete", 388 | "set": ["scaled_dot", "triliear"], 389 | "dtype": "string" 390 | }, 391 | "understanding_num_layer": { 392 | "stype": "discrete", 393 | "set": [1, 2, 3, 4], 394 | "dtype": "int" 395 | }, 396 | "understanding_num_conv": { 397 | "stype": "discrete", 398 | "set": [2, 4], 399 | "dtype": "int" 400 | }, 401 | "understanding_window_size": { 402 | "stype": "discrete", 403 | "set": [[3], [5], [7]], 404 | "dtype": "list" 405 | }, 406 | "modeling_num_layer": { 407 | "stype": "discrete", 408 | "set": [2, 4, 8, 12], 409 | "dtype": "int" 410 | }, 411 | "modeling_num_conv": { 412 | "stype": "discrete", 413 | "set": [2, 4], 414 | "dtype": "int" 415 | }, 416 | "modeling_window_size": { 417 | "stype": "discrete", 418 | "set": [[3], [5], [7]], 419 | "dtype": "list" 420 | } 421 | } 422 | } -------------------------------------------------------------------------------- /reading_comprehension/config/config_search_template.bidaf.json: -------------------------------------------------------------------------------- 1 | { 2 | "hyperparams": { 3 | "data_max_question_length": { 4 | "stype": "discrete", 5 | "set": [40], 6 | "dtype": "int" 7 | }, 8 | "data_max_context_length": { 9 | "stype": "discrete", 10 | "set": [500], 11 | "dtype": "int" 12 | }, 13 | "data_max_answer_length": { 14 | "stype": "discrete", 15 | "set": [30], 16 | "dtype": "int" 17 | }, 18 | "data_max_subword_length": { 19 | "stype": "discrete", 20 | "set": [16], 21 | "dtype": "int" 22 | }, 23 | "data_max_char_length": { 24 | "stype": "discrete", 25 | "set": [16], 26 | "dtype": "int" 27 | }, 28 | "train_batch_size": { 29 | "stype": "discrete", 30 | "set": [60], 31 | "dtype": "int" 32 | }, 33 | "train_optimizer_type": { 34 | "stype": "discrete", 35 | "set": ["adam"], 36 | "dtype": "string" 37 | }, 38 | "train_optimizer_learning_rate": { 39 | "stype": "log", 40 | "range": [0.0001, 0.001], 41 | "dtype": "float" 42 | }, 43 | "model_representation_word_embed_dim": { 44 | "stype": "discrete", 45 | "set": [100], 46 | "dtype": "int" 47 | }, 48 | "model_representation_subword_embed_dim": { 49 | "stype": "lookup", 50 | "key": "embed_dim", 51 | "dtype": "int" 52 | }, 53 | "model_representation_subword_unit_dim": { 54 | "stype": "lookup", 55 | "key": "unit_dim", 56 | "scale": 1.0, 57 | "shift": 0.0, 58 | "dtype": "int" 59 | }, 60 | "model_representation_subword_window_size": { 61 | "stype": "lookup", 62 | "key": "window_size", 63 | "dtype": "list" 64 | }, 65 | "model_representation_subword_hidden_activation": { 66 | "stype": "discrete", 67 | "set": ["relu"], 68 | "dtype": "string" 69 | }, 70 | "model_representation_subword_dropout": { 71 | "stype": "lookup", 72 | "key": "dropout", 73 | "scale": 1.0, 74 | "shift": 0.0, 75 | "dtype": "float" 76 | }, 77 | "model_representation_subword_pooling_type": { 78 | "stype": "lookup", 79 | "key": "pooling_type", 80 | "dtype": "string" 81 | }, 82 | "model_representation_char_embed_dim": { 83 | "stype": "lookup", 84 | "key": "embed_dim", 85 | "dtype": "int" 86 | }, 87 | "model_representation_char_unit_dim": { 88 | "stype": "lookup", 89 | "key": "unit_dim", 90 | "scale": 1.0, 91 | "shift": 0.0, 92 | "dtype": "int" 93 | }, 94 | "model_representation_char_window_size": { 95 | "stype": "lookup", 96 | "key": 
"window_size", 97 | "dtype": "list" 98 | }, 99 | "model_representation_char_hidden_activation": { 100 | "stype": "discrete", 101 | "set": ["relu"], 102 | "dtype": "string" 103 | }, 104 | "model_representation_char_dropout": { 105 | "stype": "lookup", 106 | "key": "dropout", 107 | "scale": 1.0, 108 | "shift": 0.0, 109 | "dtype": "float" 110 | }, 111 | "model_representation_char_pooling_type": { 112 | "stype": "lookup", 113 | "key": "pooling_type", 114 | "dtype": "string" 115 | }, 116 | "model_representation_fusion_type": { 117 | "stype": "discrete", 118 | "set": ["highway"], 119 | "dtype": "string" 120 | }, 121 | "model_representation_fusion_num_layer": { 122 | "stype": "discrete", 123 | "set": [2], 124 | "dtype": "int" 125 | }, 126 | "model_representation_fusion_unit_dim": { 127 | "stype": "lookup", 128 | "key": "unit_dim", 129 | "scale": 1.0, 130 | "shift": 0.0, 131 | "dtype": "int" 132 | }, 133 | "model_representation_fusion_hidden_activation": { 134 | "stype": "discrete", 135 | "set": ["relu"], 136 | "dtype": "string" 137 | }, 138 | "model_representation_fusion_dropout": { 139 | "stype": "lookup", 140 | "key": "dropout", 141 | "scale": 1.0, 142 | "shift": 0.0, 143 | "dtype": "float" 144 | }, 145 | "model_understanding_question_num_layer": { 146 | "stype": "lookup", 147 | "key": "num_layer", 148 | "dtype": "int" 149 | }, 150 | "model_understanding_question_unit_dim": { 151 | "stype": "lookup", 152 | "key": "unit_dim", 153 | "scale": 1.0, 154 | "shift": 0.0, 155 | "dtype": "int" 156 | }, 157 | "model_understanding_question_cell_type": { 158 | "stype": "lookup", 159 | "key": "cell_type", 160 | "dtype": "string" 161 | }, 162 | "model_understanding_question_hidden_activation": { 163 | "stype": "lookup", 164 | "key": "hidden_activation", 165 | "dtype": "string" 166 | }, 167 | "model_understanding_question_dropout": { 168 | "stype": "lookup", 169 | "key": "dropout", 170 | "scale": 1.0, 171 | "shift": 0.0, 172 | "dtype": "float" 173 | }, 174 | "model_understanding_context_num_layer": { 175 | "stype": "lookup", 176 | "key": "num_layer", 177 | "dtype": "int" 178 | }, 179 | "model_understanding_context_unit_dim": { 180 | "stype": "lookup", 181 | "key": "unit_dim", 182 | "scale": 1.0, 183 | "shift": 0.0, 184 | "dtype": "int" 185 | }, 186 | "model_understanding_context_cell_type": { 187 | "stype": "lookup", 188 | "key": "cell_type", 189 | "dtype": "string" 190 | }, 191 | "model_understanding_context_hidden_activation": { 192 | "stype": "lookup", 193 | "key": "hidden_activation", 194 | "dtype": "string" 195 | }, 196 | "model_understanding_context_dropout": { 197 | "stype": "lookup", 198 | "key": "dropout", 199 | "scale": 1.0, 200 | "shift": 0.0, 201 | "dtype": "float" 202 | }, 203 | "model_interaction_context2question_attention_dim": { 204 | "stype": "lookup", 205 | "key": "unit_dim", 206 | "scale": 2.0, 207 | "shift": 0.0, 208 | "dtype": "int" 209 | }, 210 | "model_interaction_context2question_score_type": { 211 | "stype": "lookup", 212 | "key": "score_type", 213 | "dtype": "string" 214 | }, 215 | "model_interaction_question2context_attention_dim": { 216 | "stype": "lookup", 217 | "key": "unit_dim", 218 | "scale": 2.0, 219 | "shift": 0.0, 220 | "dtype": "int" 221 | }, 222 | "model_interaction_question2context_score_type": { 223 | "stype": "lookup", 224 | "key": "score_type", 225 | "dtype": "string" 226 | }, 227 | "model_interaction_fusion_type": { 228 | "stype": "discrete", 229 | "set": ["concate"], 230 | "dtype": "string" 231 | }, 232 | "model_interaction_fusion_num_layer": { 233 | "stype": 
"discrete", 234 | "set": [1], 235 | "dtype": "int" 236 | }, 237 | "model_interaction_fusion_unit_dim": { 238 | "stype": "lookup", 239 | "key": "unit_dim", 240 | "scale": 4.0, 241 | "shift": 0.0, 242 | "dtype": "int" 243 | }, 244 | "model_interaction_fusion_hidden_activation": { 245 | "stype": "discrete", 246 | "set": ["relu"], 247 | "dtype": "string" 248 | }, 249 | "model_interaction_fusion_dropout": { 250 | "stype": "lookup", 251 | "key": "dropout", 252 | "scale": 1.0, 253 | "shift": 0.0, 254 | "dtype": "float" 255 | }, 256 | "model_interaction_fusion_combo_enable": { 257 | "stype": "discrete", 258 | "set": [true], 259 | "dtype": "boolean" 260 | }, 261 | "model_modeling_answer_num_layer": { 262 | "stype": "lookup", 263 | "key": "num_layer", 264 | "dtype": "int" 265 | }, 266 | "model_modeling_answer_unit_dim": { 267 | "stype": "lookup", 268 | "key": "unit_dim", 269 | "scale": 1.0, 270 | "shift": 0.0, 271 | "dtype": "int" 272 | }, 273 | "model_modeling_answer_cell_type": { 274 | "stype": "lookup", 275 | "key": "cell_type", 276 | "dtype": "string" 277 | }, 278 | "model_modeling_answer_hidden_activation": { 279 | "stype": "lookup", 280 | "key": "hidden_activation", 281 | "dtype": "string" 282 | }, 283 | "model_modeling_answer_dropout": { 284 | "stype": "lookup", 285 | "key": "dropout", 286 | "scale": 1.0, 287 | "shift": 0.0, 288 | "dtype": "float" 289 | }, 290 | "model_modeling_answer_attention_dim": { 291 | "stype": "lookup", 292 | "key": "unit_dim", 293 | "scale": 2.0, 294 | "shift": 0.0, 295 | "dtype": "int" 296 | }, 297 | "model_modeling_answer_score_type": { 298 | "stype": "lookup", 299 | "key": "score_type", 300 | "dtype": "string" 301 | }, 302 | "model_modeling_answer_attention_enable": { 303 | "stype": "discrete", 304 | "set": [false], 305 | "dtype": "boolean" 306 | }, 307 | "model_modeling_fusion_type": { 308 | "stype": "discrete", 309 | "set": ["concate"], 310 | "dtype": "string" 311 | }, 312 | "model_modeling_fusion_num_layer": { 313 | "stype": "discrete", 314 | "set": [1], 315 | "dtype": "int" 316 | }, 317 | "model_modeling_fusion_unit_dim": { 318 | "stype": "lookup", 319 | "key": "unit_dim", 320 | "scale": 2.0, 321 | "shift": 0.0, 322 | "dtype": "int" 323 | }, 324 | "model_modeling_fusion_hidden_activation": { 325 | "stype": "discrete", 326 | "set": ["relu"], 327 | "dtype": "string" 328 | }, 329 | "model_modeling_fusion_dropout": { 330 | "stype": "lookup", 331 | "key": "dropout", 332 | "scale": 1.0, 333 | "shift": 0.0, 334 | "dtype": "float" 335 | }, 336 | "model_output_answer_start_num_layer": { 337 | "stype": "lookup", 338 | "key": "num_layer", 339 | "dtype": "int" 340 | }, 341 | "model_output_answer_start_unit_dim": { 342 | "stype": "lookup", 343 | "key": "unit_dim", 344 | "scale": 1.0, 345 | "shift": 0.0, 346 | "dtype": "int" 347 | }, 348 | "model_output_answer_start_cell_type": { 349 | "stype": "lookup", 350 | "key": "cell_type", 351 | "dtype": "string" 352 | }, 353 | "model_output_answer_start_hidden_activation": { 354 | "stype": "lookup", 355 | "key": "hidden_activation", 356 | "dtype": "string" 357 | }, 358 | "model_output_answer_start_dropout": { 359 | "stype": "lookup", 360 | "key": "dropout", 361 | "scale": 1.0, 362 | "shift": 0.0, 363 | "dtype": "float" 364 | }, 365 | "model_output_answer_end_num_layer": { 366 | "stype": "lookup", 367 | "key": "num_layer", 368 | "dtype": "int" 369 | }, 370 | "model_output_answer_end_unit_dim": { 371 | "stype": "lookup", 372 | "key": "unit_dim", 373 | "scale": 1.0, 374 | "shift": 0.0, 375 | "dtype": "int" 376 | }, 377 | 
"model_output_answer_end_cell_type": { 378 | "stype": "lookup", 379 | "key": "cell_type", 380 | "dtype": "string" 381 | }, 382 | "model_output_answer_end_hidden_activation": { 383 | "stype": "lookup", 384 | "key": "hidden_activation", 385 | "dtype": "string" 386 | }, 387 | "model_output_answer_end_dropout": { 388 | "stype": "lookup", 389 | "key": "dropout", 390 | "scale": 1.0, 391 | "shift": 0.0, 392 | "dtype": "float" 393 | } 394 | }, 395 | "variables": { 396 | "embed_dim": { 397 | "stype": "discrete", 398 | "set": [8, 16, 32, 64], 399 | "dtype": "int" 400 | }, 401 | "window_size": { 402 | "stype": "discrete", 403 | "set": [[3], [5], [7]], 404 | "dtype": "list" 405 | }, 406 | "pooling_type": { 407 | "stype": "discrete", 408 | "set": ["max"], 409 | "dtype": "string" 410 | }, 411 | "num_layer": { 412 | "stype": "discrete", 413 | "set": [1, 2, 3, 4], 414 | "dtype": "int" 415 | }, 416 | "unit_dim": { 417 | "stype": "uniform", 418 | "range": [50, 200], 419 | "dtype": "int" 420 | }, 421 | "hidden_activation": { 422 | "stype": "discrete", 423 | "set": ["tanh", "relu"], 424 | "dtype": "string" 425 | }, 426 | "cell_type": { 427 | "stype": "discrete", 428 | "set": ["lstm", "gru"], 429 | "dtype": "string" 430 | }, 431 | "dropout": { 432 | "stype": "uniform", 433 | "range": [0.0, 0.5], 434 | "dtype": "float" 435 | }, 436 | "score_type": { 437 | "stype": "discrete", 438 | "set": ["scaled_dot", "triliear"], 439 | "dtype": "string" 440 | } 441 | } 442 | } -------------------------------------------------------------------------------- /reading_comprehension/model/base_model.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import os.path 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from util.default_util import * 8 | from util.reading_comprehension_util import * 9 | from util.layer_util import * 10 | 11 | __all__ = ["TrainResult", "InferResult", "BaseModel"] 12 | 13 | class TrainResult(collections.namedtuple("TrainResult", 14 | ("loss", "learning_rate", "global_step", "batch_size", "summary"))): 15 | pass 16 | 17 | class InferResult(collections.namedtuple("InferResult", 18 | ("predict", "predict_detail", "batch_size", "summary"))): 19 | pass 20 | 21 | class BaseModel(object): 22 | """reading comprehension base model""" 23 | def __init__(self, 24 | logger, 25 | hyperparams, 26 | data_pipeline, 27 | external_data, 28 | mode="train", 29 | scope="base"): 30 | """initialize mrc base model""" 31 | self.logger = logger 32 | self.hyperparams = hyperparams 33 | self.data_pipeline = data_pipeline 34 | self.mode = mode 35 | self.scope = scope 36 | 37 | self.update_op = None 38 | self.train_loss = None 39 | self.learning_rate = None 40 | self.global_step = None 41 | self.train_summary = None 42 | self.infer_answer_start = None 43 | self.infer_answer_start_mask = None 44 | self.infer_answer_end = None 45 | self.infer_answer_end_mask = None 46 | self.infer_summary = None 47 | 48 | self.word_embedding = external_data["word_embedding"] if external_data is not None and "word_embedding" in external_data else None 49 | self.batch_size = tf.size(tf.reduce_max(self.data_pipeline.input_answer_mask, axis=-2)) 50 | 51 | self.num_gpus = self.hyperparams.device_num_gpus 52 | self.default_gpu_id = self.hyperparams.device_default_gpu_id 53 | self.logger.log_print("# {0} gpus are used with default gpu id set as {1}" 54 | .format(self.num_gpus, self.default_gpu_id)) 55 | 56 | if self.hyperparams.train_regularization_enable == True: 57 | self.regularizer 
/reading_comprehension/model/base_model.py:
--------------------------------------------------------------------------------
import collections
import os.path

import numpy as np
import tensorflow as tf

from util.default_util import *
from util.reading_comprehension_util import *
from util.layer_util import *

__all__ = ["TrainResult", "InferResult", "BaseModel"]

class TrainResult(collections.namedtuple("TrainResult",
    ("loss", "learning_rate", "global_step", "batch_size", "summary"))):
    pass

class InferResult(collections.namedtuple("InferResult",
    ("predict", "predict_detail", "batch_size", "summary"))):
    pass

class BaseModel(object):
    """reading comprehension base model"""
    def __init__(self,
                 logger,
                 hyperparams,
                 data_pipeline,
                 external_data,
                 mode="train",
                 scope="base"):
        """initialize mrc base model"""
        self.logger = logger
        self.hyperparams = hyperparams
        self.data_pipeline = data_pipeline
        self.mode = mode
        self.scope = scope

        self.update_op = None
        self.train_loss = None
        self.learning_rate = None
        self.global_step = None
        self.train_summary = None
        self.infer_answer_start = None
        self.infer_answer_start_mask = None
        self.infer_answer_end = None
        self.infer_answer_end_mask = None
        self.infer_summary = None

        self.word_embedding = external_data["word_embedding"] if external_data is not None and "word_embedding" in external_data else None
        self.batch_size = tf.size(tf.reduce_max(self.data_pipeline.input_answer_mask, axis=-2))

        self.num_gpus = self.hyperparams.device_num_gpus
        self.default_gpu_id = self.hyperparams.device_default_gpu_id
        self.logger.log_print("# {0} gpus are used with default gpu id set as {1}"
            .format(self.num_gpus, self.default_gpu_id))

        if self.hyperparams.train_regularization_enable == True:
            self.regularizer = create_weight_regularizer(self.hyperparams.train_regularization_type,
                self.hyperparams.train_regularization_scale)
        else:
            self.regularizer = None

        self.random_seed = self.hyperparams.train_random_seed if self.hyperparams.train_enable_debugging else None

    def _create_fusion_layer(self,
                             input_unit_dim,
                             output_unit_dim,
                             fusion_type,
                             num_layer,
                             hidden_activation,
                             dropout,
                             num_gpus,
                             default_gpu_id,
                             regularizer,
                             random_seed,
                             trainable):
        """create fusion layer for mrc base model"""
        with tf.variable_scope("fusion", reuse=tf.AUTO_REUSE):
            if fusion_type == "concate":
                fusion_layer_list = []
                if input_unit_dim != output_unit_dim:
                    # project the concatenated input to the output dimension with a 1x1 convolution
                    convert_layer = create_convolution_layer("1d", 1, input_unit_dim,
                        output_unit_dim, 1, 1, 1, "SAME", None, [0.0], None, False, False, False,
                        num_gpus, default_gpu_id, regularizer, random_seed, trainable)
                    fusion_layer_list.append(convert_layer)
            elif fusion_type == "dense":
                fusion_layer = create_dense_layer("single", num_layer, output_unit_dim, 1, hidden_activation,
                    [dropout] * num_layer, None, False, False, False, num_gpus, default_gpu_id, regularizer, random_seed, trainable)
                fusion_layer_list = [fusion_layer]
            elif fusion_type == "highway":
                fusion_layer_list = []
                if input_unit_dim != output_unit_dim:
                    convert_layer = create_convolution_layer("1d", 1, input_unit_dim,
                        output_unit_dim, 1, 1, 1, "SAME", None, [0.0], None, False, False, False,
                        num_gpus, default_gpu_id, regularizer, random_seed, trainable)
                    fusion_layer_list.append(convert_layer)

                fusion_layer = create_highway_layer(num_layer, output_unit_dim, hidden_activation,
                    [dropout] * num_layer, num_gpus, default_gpu_id, regularizer, random_seed, trainable)
                fusion_layer_list.append(fusion_layer)
            elif fusion_type == "conv":
                fusion_layer = create_convolution_layer("1d", num_layer, input_unit_dim,
                    output_unit_dim, 1, 1, 1, "SAME", hidden_activation, [dropout] * num_layer,
                    None, False, False, False, num_gpus, default_gpu_id, regularizer, random_seed, trainable)
                fusion_layer_list = [fusion_layer]
            else:
                raise ValueError("unsupported fusion type {0}".format(fusion_type))

            return fusion_layer_list

    def _build_fusion_result(self,
                             input_data_list,
                             input_mask_list,
                             fusion_layer_list):
        """build fusion result for mrc base model"""
        input_fusion = tf.concat(input_data_list, axis=-1)
        input_fusion_mask = tf.reduce_max(tf.concat(input_mask_list, axis=-1), axis=-1, keepdims=True)

        if fusion_layer_list is not None:
            for fusion_layer in fusion_layer_list:
                input_fusion, input_fusion_mask = fusion_layer(input_fusion, input_fusion_mask)

        return input_fusion, input_fusion_mask

    def _get_exponential_moving_average(self,
                                        num_steps):
        decay_rate = self.hyperparams.train_ema_decay_rate
        enable_debias = self.hyperparams.train_ema_enable_debias
        enable_dynamic_decay = self.hyperparams.train_ema_enable_dynamic_decay

        if enable_dynamic_decay == True:
            # dynamic decay ramps the effective decay up with the number of updates
            ema = tf.train.ExponentialMovingAverage(decay=decay_rate, num_updates=num_steps, zero_debias=enable_debias)
        else:
            ema = tf.train.ExponentialMovingAverage(decay=decay_rate, zero_debias=enable_debias)

        return ema
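    # Usage sketch (comments only, not repository code): a typical TF1 pattern is to
    # apply the returned EMA object after the parameter update, so that shadow
    # variables track the trained weights, e.g.
    #   ema = self._get_exponential_moving_average(num_steps)
    #   with tf.control_dependencies([update_model]):
    #       train_op = ema.apply(tf.trainable_variables())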
warmup""" 140 | warmup_mode = self.hyperparams.train_optimizer_warmup_mode 141 | warmup_rate = self.hyperparams.train_optimizer_warmup_rate 142 | warmup_end_step = self.hyperparams.train_optimizer_warmup_end_step 143 | 144 | if warmup_mode == "exponential_warmup": 145 | warmup_factor = warmup_rate ** (1 - tf.to_float(self.global_step) / tf.to_float(warmup_end_step)) 146 | warmup_learning_rate = warmup_factor * learning_rate 147 | elif warmup_mode == "inverse_exponential_warmup": 148 | warmup_factor = tf.log(tf.to_float(self.global_step + 1)) / tf.log(tf.to_float(warmup_end_step)) 149 | warmup_learning_rate = warmup_factor * learning_rate 150 | else: 151 | raise ValueError("unsupported warm-up mode {0}".format(warmup_mode)) 152 | 153 | warmup_learning_rate = tf.cond(tf.less(self.global_step, warmup_end_step), 154 | lambda: warmup_learning_rate, lambda: learning_rate) 155 | 156 | return warmup_learning_rate 157 | 158 | def _apply_learning_rate_decay(self, 159 | learning_rate): 160 | """apply learning rate decay""" 161 | decay_mode = self.hyperparams.train_optimizer_decay_mode 162 | decay_rate = self.hyperparams.train_optimizer_decay_rate 163 | decay_step = self.hyperparams.train_optimizer_decay_step 164 | decay_start_step = self.hyperparams.train_optimizer_decay_start_step 165 | 166 | if decay_mode == "exponential_decay": 167 | decayed_learning_rate = tf.train.exponential_decay(learning_rate=learning_rate, 168 | global_step=(self.global_step - decay_start_step), 169 | decay_steps=decay_step, decay_rate=decay_rate, staircase=True) 170 | elif decay_mode == "inverse_time_decay": 171 | decayed_learning_rate = tf.train.inverse_time_decay(learning_rate=learning_rate, 172 | global_step=(self.global_step - decay_start_step), 173 | decay_steps=decay_step, decay_rate=decay_rate, staircase=True) 174 | else: 175 | raise ValueError("unsupported decay mode {0}".format(decay_mode)) 176 | 177 | decayed_learning_rate = tf.cond(tf.less(self.global_step, decay_start_step), 178 | lambda: learning_rate, lambda: decayed_learning_rate) 179 | 180 | return decayed_learning_rate 181 | 182 | def _initialize_optimizer(self, 183 | learning_rate): 184 | """initialize optimizer""" 185 | optimizer_type = self.hyperparams.train_optimizer_type 186 | if optimizer_type == "sgd": 187 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) 188 | elif optimizer_type == "momentum": 189 | optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, 190 | momentum=self.hyperparams.train_optimizer_momentum_beta) 191 | elif optimizer_type == "rmsprop": 192 | optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, 193 | decay=self.hyperparams.train_optimizer_rmsprop_beta, 194 | epsilon=self.hyperparams.train_optimizer_rmsprop_epsilon) 195 | elif optimizer_type == "adadelta": 196 | optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate, 197 | rho=self.hyperparams.train_optimizer_adadelta_rho, 198 | epsilon=self.hyperparams.train_optimizer_adadelta_epsilon) 199 | elif optimizer_type == "adagrad": 200 | optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, 201 | initial_accumulator_value=self.hyperparams.train_optimizer_adagrad_init_accumulator) 202 | elif optimizer_type == "adam": 203 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, 204 | beta1=self.hyperparams.train_optimizer_adam_beta_1, beta2=self.hyperparams.train_optimizer_adam_beta_2, 205 | epsilon=self.hyperparams.train_optimizer_adam_epsilon) 206 | else: 207 | raise ValueError("unsupported optimizer 
type {0}".format(optimizer_type)) 208 | 209 | return optimizer 210 | 211 | def _minimize_loss(self, 212 | loss): 213 | """minimize optimization loss""" 214 | """compute gradients""" 215 | if self.num_gpus > 1: 216 | grads_and_vars = self.optimizer.compute_gradients(loss, colocate_gradients_with_ops=True) 217 | else: 218 | grads_and_vars = self.optimizer.compute_gradients(loss, colocate_gradients_with_ops=False) 219 | 220 | """clip gradients""" 221 | gradients = [x[0] for x in grads_and_vars] 222 | variables = [x[1] for x in grads_and_vars] 223 | clipped_gradients, gradient_norm = tf.clip_by_global_norm(gradients, self.hyperparams.train_clip_norm) 224 | grads_and_vars = zip(clipped_gradients, variables) 225 | 226 | """update model based on gradients""" 227 | update_model = self.optimizer.apply_gradients(grads_and_vars, global_step=self.global_step) 228 | 229 | return update_model, clipped_gradients, gradient_norm 230 | 231 | def train(self, 232 | sess): 233 | """train model""" 234 | _, loss, learning_rate, global_step, batch_size, summary = sess.run([self.update_op, 235 | self.train_loss, self.decayed_learning_rate, self.global_step, self.batch_size, self.train_summary]) 236 | 237 | return TrainResult(loss=loss, learning_rate=learning_rate, 238 | global_step=global_step, batch_size=batch_size, summary=summary) 239 | 240 | def infer(self, 241 | sess): 242 | """infer model""" 243 | (answer_start, answer_end, answer_start_mask, answer_end_mask, 244 | batch_size, summary) = sess.run([self.infer_answer_start, self.infer_answer_end, 245 | self.infer_answer_start_mask, self.infer_answer_end_mask, self.batch_size, self.infer_summary]) 246 | 247 | max_context_length = self.hyperparams.data_max_context_length 248 | max_answer_length = self.hyperparams.data_max_answer_length 249 | 250 | predict_start = np.expand_dims(answer_start[:max_context_length], axis=-1) 251 | predict_start_mask = np.expand_dims(answer_start_mask[:max_context_length], axis=-1) 252 | predict_start_start = predict_start * predict_start_mask 253 | predict_end = np.expand_dims(answer_end[:max_context_length], axis=-1) 254 | predict_end_mask = np.expand_dims(answer_end_mask[:max_context_length], axis=-1) 255 | predict_end = predict_end * predict_end_mask 256 | 257 | predict_span = np.matmul(predict_start, predict_end.transpose((0,2,1))) 258 | predict_span_mask = np.matmul(predict_start_mask, predict_end_mask.transpose((0,2,1))) 259 | predict_span = predict_span * predict_span_mask 260 | 261 | predict = np.full((batch_size, 2), -1) 262 | for k in range(batch_size): 263 | max_prob = float('-inf') 264 | max_prob_start = -1 265 | max_prob_end = -1 266 | for i in range(max_context_length): 267 | for j in range(i, min(max_context_length, i+max_answer_length)): 268 | if predict_span[k, i, j] > max_prob: 269 | max_prob = predict_span[k, i, j] 270 | max_prob_start = i 271 | max_prob_end = j 272 | 273 | predict[k, 0] = max_prob_start 274 | predict[k, 1] = max_prob_end 275 | 276 | predict_detail = np.concatenate((predict_start, predict_end), axis=-1) 277 | 278 | return InferResult(predict=predict, predict_detail=predict_detail, batch_size=batch_size, summary=summary) 279 | 280 | def _get_train_summary(self): 281 | """get train summary""" 282 | return tf.summary.merge([tf.summary.scalar("learning_rate", self.learning_rate), 283 | tf.summary.scalar("train_loss", self.train_loss), tf.summary.scalar("gradient_norm", self.gradient_norm)]) 284 | 285 | def _get_infer_summary(self): 286 | """get infer summary""" 287 | return tf.no_op() 288 | 
--------------------------------------------------------------------------------
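The nested loop in BaseModel.infer scores every admissible (start, end) pair as the product of the two probabilities and keeps the best pair per example. Below is a minimal vectorized sketch of the same selection step, for reference only; it assumes the (batch, length, length) span matrix already built in infer and is not part of the repository.

import numpy as np

def decode_spans(predict_span, max_answer_length):
    # keep only pairs with start <= end < start + max_answer_length
    batch_size, length, _ = predict_span.shape
    band = (np.triu(np.ones((length, length)), k=0)
            - np.triu(np.ones((length, length)), k=max_answer_length))
    scores = np.where(band[np.newaxis, :, :] > 0, predict_span, -np.inf)
    # argmax over the flattened (start, end) grid; the first maximum wins,
    # matching the strict ">" comparison in the original loop
    flat = scores.reshape(batch_size, -1).argmax(axis=-1)
    return np.stack([flat // length, flat % length], axis=-1)  # (batch, 2)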