├── reading_comprehension
│   ├── external
│   │   ├── __init__.py
│   │   ├── squad.py
│   │   ├── bleu.py
│   │   └── rouge.py
│   ├── model
│   │   ├── __init__.py
│   │   └── base_model.py
│   ├── document
│   │   ├── BiDAF.metric.png
│   │   ├── QANet.metric.png
│   │   ├── SQuAD.example.png
│   │   ├── BiDAF.architecture.png
│   │   ├── QANet.architecture.png
│   │   └── R-Net.architecture.png
│   ├── layer
│   │   ├── __init__.py
│   │   ├── pooling.py
│   │   ├── basic.py
│   │   ├── embedding.py
│   │   ├── position.py
│   │   ├── highway.py
│   │   ├── recurrent.py
│   │   └── dense.py
│   ├── util
│   │   ├── __init__.py
│   │   ├── debug_logger.py
│   │   ├── result_writer.py
│   │   ├── summary_writer.py
│   │   ├── default_util.py
│   │   ├── eval_util.py
│   │   ├── train_logger.py
│   │   ├── reading_comprehension_util.py
│   │   └── eval_logger.py
│   ├── hparam_search.py
│   ├── squad
│   │   ├── evaluate-v1.py
│   │   ├── preprocess.py
│   │   └── evaluate-v2.py
│   └── config
│       ├── config_mrc_template.rnet.json
│       ├── config_mrc_template.qanet.json
│       ├── config_mrc_template.bidaf.json
│       ├── config_search_template.qanet.json
│       └── config_search_template.bidaf.json
├── docs
│   ├── BiDAF.metric.png
│   ├── QANet.metric.png
│   ├── SQuAD.example.png
│   ├── BiDAF.architecture.png
│   ├── QANet.architecture.png
│   ├── R-Net.architecture.png
│   ├── _config.yml
│   └── index.md
├── .gitignore
├── README.md
└── LICENSE
/reading_comprehension/external/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["bleu", "rouge", "squad"]
--------------------------------------------------------------------------------
/reading_comprehension/model/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["base_model", "bidaf", "qanet", "rnet"]
--------------------------------------------------------------------------------
/docs/BiDAF.metric.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/BiDAF.metric.png
--------------------------------------------------------------------------------
/docs/QANet.metric.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/QANet.metric.png
--------------------------------------------------------------------------------
/docs/SQuAD.example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/SQuAD.example.png
--------------------------------------------------------------------------------
/docs/BiDAF.architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/BiDAF.architecture.png
--------------------------------------------------------------------------------
/docs/QANet.architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/QANet.architecture.png
--------------------------------------------------------------------------------
/docs/R-Net.architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/docs/R-Net.architecture.png
--------------------------------------------------------------------------------
/reading_comprehension/document/BiDAF.metric.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/BiDAF.metric.png
--------------------------------------------------------------------------------
/reading_comprehension/document/QANet.metric.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/QANet.metric.png
--------------------------------------------------------------------------------
/reading_comprehension/document/SQuAD.example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/SQuAD.example.png
--------------------------------------------------------------------------------
/reading_comprehension/layer/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["basic", "embedding", "position", "convolution", "pooling",
2 | "dense", "highway", "recurrent", "attention"]
--------------------------------------------------------------------------------
/reading_comprehension/document/BiDAF.architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/BiDAF.architecture.png
--------------------------------------------------------------------------------
/reading_comprehension/document/QANet.architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/QANet.architecture.png
--------------------------------------------------------------------------------
/reading_comprehension/document/R-Net.architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevezheng23/reading_comprehension_tf/HEAD/reading_comprehension/document/R-Net.architecture.png
--------------------------------------------------------------------------------
/reading_comprehension/util/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["debug_logger", "train_logger", "eval_logger.py", "summary_writer", "result_writer",
2 | "default_util", "param_util", "data_util", "model_util", "eval_util", "layer_util", "reading_comprehension_util"]
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
2 | title: Machine Reading Comprehension
3 | description: This project is a Machine Reading Comprehension (MRC) framework in TensorFlow. It also contains re-implementations of several classic models (e.g. QANet, BiDAF, R-Net) and their benchmarks on the SQuAD dataset.
4 |
--------------------------------------------------------------------------------
/reading_comprehension/util/debug_logger.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import os.path
3 | import time
4 |
5 | import numpy as np
6 | import tensorflow as tf
7 |
8 | __all__ = ["DebugLogger"]
9 |
10 | class DebugLogger(object):
11 | """debug logger"""
12 | def __init__(self,
13 | output_dir):
14 | """initialize debug logger"""
15 | if not tf.gfile.Exists(output_dir):
16 | tf.gfile.MakeDirs(output_dir)
17 | self.log_file = os.path.join(output_dir, "debug_{0}.log".format(time.time()))
18 | self.log_writer = codecs.getwriter("utf-8")(tf.gfile.GFile(self.log_file, mode="a"))
19 |
20 | def log_print(self,
21 | message):
22 | """log and print debugging message"""
23 | time_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
24 |         log_line = "{0}: {1}".format(time_stamp, message)
25 | self.log_writer.write("{0}\r\n".format(log_line))
26 | print(log_line)
27 |
--------------------------------------------------------------------------------
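For reference, a minimal usage sketch for `DebugLogger` (not part of the repo; the output directory is illustrative):

```python
# illustrative usage; "output/debug" is a hypothetical directory
from util.debug_logger import DebugLogger

logger = DebugLogger("output/debug")
logger.log_print("loading train data ...")  # prints and appends to debug_<timestamp>.log
```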
/reading_comprehension/util/result_writer.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import os.path
3 | import json
4 | import time
5 |
6 | import numpy as np
7 | import tensorflow as tf
8 |
9 | __all__ = ["ResultWriter"]
10 |
11 | class ResultWriter(object):
12 | """result writer"""
13 | def __init__(self,
14 | output_dir):
15 | """initialize result writer"""
16 | self.output_dir = output_dir
17 | if not tf.gfile.Exists(self.output_dir):
18 | tf.gfile.MakeDirs(self.output_dir)
19 |
20 | def write_result(self,
21 | results,
22 | result_tag,
23 | result_id):
24 | """write result to file"""
25 | result_file = os.path.join(self.output_dir, "{0}_{1}_{2}.result".format(result_tag, result_id, time.time()))
26 | with codecs.getwriter("utf-8")(tf.gfile.GFile(result_file, mode="w")) as result_writer:
27 | for result in results:
28 | result_writer.write("{0}\r\n".format(json.dumps(result)))
29 |
--------------------------------------------------------------------------------
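A usage sketch for `ResultWriter`; the tag, id, and result values below are made up, and each result is serialized as one JSON object per line:

```python
# illustrative usage; results, tag and id are hypothetical values
from util.result_writer import ResultWriter

result_writer = ResultWriter("output/result")
results = [{"id": "q1", "answer": "Denver Broncos"}, {"id": "q2", "answer": "gold"}]
result_writer.write_result(results, result_tag="predict", result_id=0)
```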
/reading_comprehension/hparam_search.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 |
6 | from util.param_util import *
7 |
8 | def add_arguments(parser):
9 | parser.add_argument("--base-config", help="path to base config", required=True)
10 | parser.add_argument("--search-config", help="path to search config", required=True)
11 | parser.add_argument("--num-group", help="num of hyperparam group", type=int, required=True)
12 | parser.add_argument("--random-seed", help="random seed", type=int, required=True)
13 | parser.add_argument("--output-dir", help="path to output dir", required=True)
14 |
15 | def main(args):
16 | hyperparams = load_hyperparams(args.base_config)
17 | hyperparams_group = search_hyperparams(hyperparams,
18 | args.search_config, args.num_group, args.random_seed)
19 | create_hyperparams_file(hyperparams_group, args.output_dir)
20 |
21 | if __name__ == "__main__":
22 | parser = argparse.ArgumentParser()
23 | add_arguments(parser)
24 | args = parser.parse_args()
25 | main(args)
26 |
--------------------------------------------------------------------------------
/reading_comprehension/util/summary_writer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | __all__ = ["SummaryWriter"]
5 |
6 | class SummaryWriter(object):
7 | """summary writer"""
8 | def __init__(self,
9 | graph,
10 | output_dir):
11 | """initialize summary writer"""
12 | if not tf.gfile.Exists(output_dir):
13 | tf.gfile.MakeDirs(output_dir)
14 | self.summary_writer = tf.summary.FileWriter(output_dir, graph)
15 |
16 | def add_summary(self,
17 | summary,
18 | global_step):
19 | """add new summary"""
20 | self.summary_writer.add_summary(summary, global_step)
21 |
22 | def add_value_summary(self,
23 | summary_tag,
24 | summary_value,
25 | global_step):
26 | """add new value summary"""
27 | summary = tf.Summary(value=[tf.Summary.Value(tag=summary_tag, simple_value=summary_value)])
28 | self.summary_writer.add_summary(summary, global_step)
29 |
30 | def close_writer(self):
31 | """close summary writer"""
32 | self.summary_writer.close()
33 |
34 | def reopen_writer(self):
35 | """re-open summary writer"""
36 | self.summary_writer.reopen()
37 |
--------------------------------------------------------------------------------
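A minimal sketch of writing a scalar to TensorBoard with `SummaryWriter` (TF 1.x; the tag and step values are illustrative):

```python
import tensorflow as tf
from util.summary_writer import SummaryWriter

graph = tf.Graph()
summary_writer = SummaryWriter(graph, "output/summary")
summary_writer.add_value_summary("eval/f1", 78.2, 1000)  # scalar visible in TensorBoard
summary_writer.close_writer()
```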
/reading_comprehension/util/default_util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | __all__ = ["EPSILON", "MAX_INT", "MIN_FLOAT", "check_tensorflow_version", "safe_exp", "get_config_proto", "get_device_spec"]
5 |
6 | EPSILON = 1e-30
7 | MAX_INT = 2147483647
8 | MIN_FLOAT = -1e30
9 |
10 | def check_tensorflow_version():
11 | """check tensorflow version in current environment"""
12 | min_tf_version = "1.12.0"
13 | curr_tf_version = tf.__version__
14 |     if tuple(int(v) for v in curr_tf_version.split(".")[:2]) < tuple(int(v) for v in min_tf_version.split(".")[:2]):
15 | raise EnvironmentError("tensorflow version must be >= {0}".format(min_tf_version))
16 | return curr_tf_version
17 |
18 | def safe_exp(value):
19 | """handle overflow exception for exp"""
20 | try:
21 | res = np.exp(value)
22 | except OverflowError:
23 | res = float("inf")
24 | return res
25 |
26 | def get_config_proto(log_device_placement,
27 | allow_soft_placement,
28 | allow_growth,
29 | per_process_gpu_memory_fraction):
30 | """get config proto for device setting"""
31 | config_proto = tf.ConfigProto(log_device_placement=log_device_placement,
32 | allow_soft_placement=allow_soft_placement)
33 | config_proto.gpu_options.allow_growth = allow_growth
34 | config_proto.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction
35 |
36 | return config_proto
37 |
38 | def get_device_spec(device_id, num_gpus):
39 | """get device specification"""
40 | if num_gpus == 0:
41 | device_spec = "/device:CPU:0"
42 | else:
43 | device_spec = "/device:GPU:{0}".format(device_id % num_gpus)
44 |
45 | return device_spec
46 |
--------------------------------------------------------------------------------
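A sketch of how these helpers could be wired into a TF 1.x session; the memory fraction and device ids are illustrative:

```python
import tensorflow as tf
from util.default_util import get_config_proto, get_device_spec

config_proto = get_config_proto(log_device_placement=False, allow_soft_placement=True,
    allow_growth=True, per_process_gpu_memory_fraction=0.9)
device_spec = get_device_spec(device_id=3, num_gpus=2)  # wraps around to "/device:GPU:1"
with tf.device(device_spec):
    pass  # build ops here
sess = tf.Session(config=config_proto)
```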
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/reading_comprehension/layer/pooling.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from util.default_util import *
5 | from util.reading_comprehension_util import *
6 |
7 | __all__ = ["MaxPooling", "AveragePooling"]
8 |
9 | class MaxPooling(object):
10 | """max pooling layer"""
11 | def __init__(self,
12 | num_gpus=1,
13 | default_gpu_id=0,
14 | scope="max_pool"):
15 | """initialize max pooling layer"""
16 | self.scope = scope
17 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
18 |
19 | def __call__(self,
20 | input_data,
21 | input_mask):
22 | """call max pooling layer"""
23 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
24 | output_mask = tf.squeeze(tf.reduce_max(input_mask, axis=-2, keepdims=True), axis=-2)
25 | output_pool = tf.reduce_max(input_data * input_mask + MIN_FLOAT * (1 - input_mask), axis=-2) * output_mask
26 | output_pool = output_pool + tf.reduce_max(input_data, axis=-2) * (1 - output_mask)
27 |
28 | return output_pool, output_mask
29 |
30 | class AveragePooling(object):
31 | """average pooling layer"""
32 | def __init__(self,
33 | num_gpus=1,
34 | default_gpu_id=0,
35 | scope="avg_pool"):
36 | """initialize average pooling layer"""
37 | self.scope = scope
38 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
39 |
40 | def __call__(self,
41 | input_data,
42 | input_mask):
43 | """call average pooling layer"""
44 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
45 | input_sum = tf.reduce_sum(input_data * input_mask, axis=-2)
46 | input_count = tf.count_nonzero(input_mask, axis=-2, dtype=tf.float32)
47 | output_mask = tf.squeeze(tf.reduce_max(input_mask, axis=-2, keepdims=True), axis=-2)
48 | output_pool = 1.0 * input_sum / (input_count - output_mask + 1.0)
49 |
50 | return output_pool, output_mask
51 |
--------------------------------------------------------------------------------
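A minimal sketch of masked max pooling over `[batch, length, dim]` inputs (TF 1.x; the shapes are illustrative):

```python
import tensorflow as tf
from layer.pooling import MaxPooling

input_data = tf.placeholder(tf.float32, shape=[None, 10, 8])
input_mask = tf.placeholder(tf.float32, shape=[None, 10, 1])    # 1.0 for real tokens, 0.0 for padding
max_pooling = MaxPooling(num_gpus=0)                            # num_gpus=0 pins the op to CPU
output_pool, output_mask = max_pooling(input_data, input_mask)  # -> [batch, 8], [batch, 1]
```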
/reading_comprehension/external/squad.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import string
3 | import re
4 | import sys
5 |
6 | __all__ = ["eval_exact_match_score", "eval_f1_score"]
7 |
8 | def normalize_answer(s):
9 | """Lower text and remove punctuation, articles and extra whitespace."""
10 | def remove_articles(text):
11 | return re.sub(r'\b(a|an|the)\b', ' ', text)
12 |
13 | def white_space_fix(text):
14 | return ' '.join(text.split())
15 |
16 | def remove_punc(text):
17 | exclude = set(string.punctuation)
18 | return ''.join(ch for ch in text if ch not in exclude)
19 |
20 | def lower(text):
21 | return text.lower()
22 |
23 | return white_space_fix(remove_articles(remove_punc(lower(s))))
24 |
25 | def f1_score(prediction, ground_truth):
26 | prediction_tokens = normalize_answer(prediction).split()
27 | ground_truth_tokens = normalize_answer(ground_truth).split()
28 | common = collections.Counter(prediction_tokens) & collections.Counter(ground_truth_tokens)
29 | num_same = sum(common.values())
30 | if num_same == 0:
31 | return 0
32 | precision = 1.0 * num_same / len(prediction_tokens)
33 | recall = 1.0 * num_same / len(ground_truth_tokens)
34 | f1 = (2 * precision * recall) / (precision + recall)
35 | return f1
36 |
37 | def exact_match_score(prediction, ground_truth):
38 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
39 |
40 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
41 | scores_for_ground_truths = []
42 | for ground_truth in ground_truths:
43 | score = metric_fn(prediction, ground_truth)
44 | scores_for_ground_truths.append(score)
45 | return max(scores_for_ground_truths)
46 |
47 | def eval_exact_match_score(predicts, labels):
48 | exact_match = total = 0
49 | for (predict, label_list) in zip(predicts, labels):
50 | total += 1
51 | exact_match += metric_max_over_ground_truths(exact_match_score, predict, label_list)
52 |
53 | exact_match = 100.0 * exact_match / total
54 |
55 | return exact_match
56 |
57 | def eval_f1_score(predicts, labels):
58 | f1 = total = 0
59 | for (predict, label_list) in zip(predicts, labels):
60 | total += 1
61 | f1 += metric_max_over_ground_truths(f1_score, predict, label_list)
62 |
63 | f1 = 100.0 * f1 / total
64 |
65 | return f1
66 |
--------------------------------------------------------------------------------
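An illustrative call of the two SQuAD metrics; each prediction is scored against a list of reference answers, and the second pair differs after normalization:

```python
from external.squad import eval_exact_match_score, eval_f1_score

predicts = ["Denver Broncos", "in the 10th century"]
labels = [["Denver Broncos", "Broncos"], ["10th century"]]
print(eval_exact_match_score(predicts, labels))  # 50.0 (only the first pair matches exactly)
print(eval_f1_score(predicts, labels))           # 90.0 (token-level overlap on the second pair)
```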
/reading_comprehension/util/eval_util.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import numpy as np
3 | import tensorflow as tf
4 | from external.bleu import *
5 | from external.rouge import *
6 | from external.squad import *
7 |
8 | __all__ = ["evaluate_from_data", "evaluate_from_file"]
9 |
10 | def _bleu(pred_data, ref_data):
11 | """BLEU score for translation task"""
12 | max_order = 4
13 | smooth = False
14 | score, _, _, _, _, _ = compute_bleu(ref_data, pred_data, max_order, smooth)
15 | bleu_score = 100 * score
16 | return bleu_score
17 |
18 | def _rouge(pred_data, ref_data):
19 | """ROUGE score for summarization task"""
20 | score_map = rouge(pred_data, ref_data)
21 | rouge_score = 100 * score_map["rouge_l/f_score"]
22 | return rouge_score
23 |
24 | def _squad_em(pred_data, ref_data):
25 | """EM score for reading comprehension task"""
26 | em_score = eval_exact_match_score(pred_data, ref_data)
27 | return em_score
28 |
29 | def _squad_f1(pred_data, ref_data):
30 | """F1 score for reading comprehension task"""
31 | f1_score = eval_f1_score(pred_data, ref_data)
32 | return f1_score
33 |
34 | def evaluate_from_data(pred_data, ref_data, metric):
35 | """compute evaluation score based on selected metric"""
36 | pred_and_ref = [(pred, ref_list) for pred, ref_list in zip(pred_data, ref_data) if pred and ref_list]
37 | pred_data = [pred for (pred, _) in pred_and_ref]
38 | ref_data = [ref_list for (_, ref_list) in pred_and_ref]
39 |
40 | if len(pred_data) == 0 or len(ref_data) == 0:
41 | return 0.0
42 |
43 | if metric == "bleu":
44 | eval_score = _bleu(pred_data, ref_data)
45 | elif metric == "rouge":
46 | eval_score = _rouge(pred_data, ref_data)
47 | elif metric == "exact":
48 | eval_score = _squad_em(pred_data, ref_data)
49 | elif metric == "f1":
50 | eval_score = _squad_f1(pred_data, ref_data)
51 | else:
52 | raise ValueError("unsupported metric {0}".format(metric))
53 |
54 | return eval_score
55 |
56 | def evaluate_from_file(pred_file, ref_file, metric):
57 | predict = []
58 | with codecs.getreader("utf-8")(tf.gfile.GFile(pred_file, "rb")) as file_p:
59 | for line in file_p:
60 | predict.append(line.strip())
61 | reference = []
62 | with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as file_r:
63 | for line in file_r:
64 | reference.append(line.strip())
65 |
66 |     eval_score = evaluate_from_data(predict, reference, metric)
67 | return eval_score
68 |
--------------------------------------------------------------------------------
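A minimal sketch of the metric dispatch in `evaluate_from_data`; the data values are illustrative:

```python
from util.eval_util import evaluate_from_data

pred_data = ["in the 10th century"]
ref_data = [["10th century", "the 10th century"]]
print(evaluate_from_data(pred_data, ref_data, "f1"))     # 80.0, SQuAD-style token F1
print(evaluate_from_data(pred_data, ref_data, "exact"))  # 0.0, SQuAD-style exact match
```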
/reading_comprehension/util/train_logger.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import os.path
3 | import time
4 |
5 | import numpy as np
6 | import tensorflow as tf
7 |
8 | __all__ = ["TrainLogger"]
9 |
10 | class TrainLogger(object):
11 | """train logger"""
12 | def __init__(self,
13 | output_dir):
14 | """initialize train logger"""
15 | self.loss = 0.0
16 | self.learning_rate = 0.0
17 | self.global_step = 0
18 | self.epoch = 0
19 | self.step_in_epoch = 0
20 | self.train_time = 0.0
21 | self.sample_size = 0
22 | self.prev_check_loss = 0.0
23 | self.prev_check_train_time = 0.0
24 | self.prev_check_sample_size = 0
25 |
26 | if not tf.gfile.Exists(output_dir):
27 | tf.gfile.MakeDirs(output_dir)
28 | self.log_file = os.path.join(output_dir, "train_{0}.log".format(time.time()))
29 | self.log_writer = codecs.getwriter("utf-8")(tf.gfile.GFile(self.log_file, mode="a"))
30 |
31 | def update(self,
32 | train_result,
33 | epoch,
34 | step_in_epoch,
35 | time_per_step):
36 | """update train logger based on train result"""
37 | self.loss += train_result.loss * train_result.batch_size
38 | self.learning_rate = train_result.learning_rate
39 | self.global_step = train_result.global_step
40 | self.epoch = epoch
41 | self.step_in_epoch = step_in_epoch
42 | self.train_time += time_per_step
43 | self.sample_size += train_result.batch_size
44 |
45 | def check(self):
46 | """check train statistic"""
47 | loss_delta = self.loss - self.prev_check_loss
48 | train_time_delta = self.train_time - self.prev_check_train_time
49 | sample_size_delta = self.sample_size - self.prev_check_sample_size
50 |
51 | if self.sample_size <= 0:
52 | raise ValueError("current sample size is less than or equal to 0")
53 |
54 | if sample_size_delta <= 0:
55 | return
56 |
57 | avg_loss = loss_delta / sample_size_delta
58 | curr_loss = self.loss / self.sample_size
59 |
60 | log_line = "epoch={0}, step={1}, global step={2}, train time={3} avg. loss={4}, curr loss={5}".format(
61 | self.epoch, self.step_in_epoch, self.global_step, train_time_delta, avg_loss, curr_loss).encode('utf-8')
62 | self.log_writer.write("{0}\r\n".format(log_line))
63 | print(log_line)
64 |
65 | self.prev_check_loss = self.loss
66 | self.prev_check_train_time = self.train_time
67 | self.prev_check_sample_size = self.sample_size
68 |
--------------------------------------------------------------------------------
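`TrainLogger.update` expects a train result object exposing `loss`, `learning_rate`, `global_step` and `batch_size`; the namedtuple below is a hypothetical stand-in for whatever the training loop produces:

```python
import collections
from util.train_logger import TrainLogger

# hypothetical stand-in for the train result produced by the model's train step
TrainResult = collections.namedtuple("TrainResult", ["loss", "learning_rate", "global_step", "batch_size"])

train_logger = TrainLogger("output/train")
train_logger.update(TrainResult(loss=1.53, learning_rate=0.001, global_step=100, batch_size=16),
    epoch=0, step_in_epoch=100, time_per_step=0.25)
train_logger.check()  # logs avg./current loss since the last check
```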
/reading_comprehension/layer/basic.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from util.default_util import *
5 | from util.reading_comprehension_util import *
6 |
7 | __all__ = ["Dropout", "LayerNorm"]
8 |
9 | class Dropout(object):
10 | """dropout layer"""
11 | def __init__(self,
12 | rate,
13 | num_gpus=1,
14 | default_gpu_id=0,
15 | random_seed=0,
16 | scope="dropout"):
17 | """initialize dropout layer"""
18 | self.rate = rate
19 | self.random_seed = random_seed
20 | self.scope = scope
21 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
22 |
23 | def __call__(self,
24 | input_data,
25 | input_mask):
26 | """call dropout layer"""
27 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
28 | if self.rate > 0.0:
29 | output_dropout = tf.nn.dropout(input_data, 1.0 - self.rate, seed=self.random_seed)
30 | else:
31 | output_dropout = input_data
32 |
33 | output_mask = input_mask
34 |
35 | return output_dropout, output_mask
36 |
37 | class LayerNorm(object):
38 | """layer norm layer"""
39 | def __init__(self,
40 | layer_dim,
41 | num_gpus=1,
42 | default_gpu_id=0,
43 | regularizer=None,
44 | random_seed=0,
45 | trainable=True,
46 | scope="layer_norm"):
47 | """initialize layer norm layer"""
48 | self.layer_dim = layer_dim
49 | self.regularizer = regularizer
50 | self.random_seed = random_seed
51 | self.trainable = trainable
52 | self.scope = scope
53 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
54 |
55 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
56 | gamma_initializer = create_variable_initializer("one")
57 | beta_initializer = create_variable_initializer("zero")
58 | self.gamma = tf.get_variable("gamma", shape=[self.layer_dim], initializer=gamma_initializer,
59 | regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32)
60 | self.beta = tf.get_variable("beta", shape=[self.layer_dim], initializer=beta_initializer,
61 | regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32)
62 |
63 | def __call__(self,
64 | input_data,
65 | input_mask):
66 | """call layer norm layer"""
67 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
68 | input_mean, input_variance = tf.nn.moments(input_data, axes=[-1], keep_dims=True)
69 | output_norm = (input_data - input_mean) / tf.sqrt(input_variance + EPSILON)
70 | output_norm = output_norm * self.gamma + self.beta
71 | output_mask = input_mask
72 |
73 | return output_norm, output_mask
74 |
--------------------------------------------------------------------------------
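A minimal sketch chaining `Dropout` and `LayerNorm` (TF 1.x; shapes and rate are illustrative):

```python
import tensorflow as tf
from layer.basic import Dropout, LayerNorm

input_data = tf.placeholder(tf.float32, shape=[None, 10, 8])
input_mask = tf.placeholder(tf.float32, shape=[None, 10, 1])
dropout = Dropout(rate=0.1, num_gpus=0)
layer_norm = LayerNorm(layer_dim=8, num_gpus=0)
output_data, output_mask = dropout(input_data, input_mask)
output_data, output_mask = layer_norm(output_data, output_mask)
```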
/reading_comprehension/layer/embedding.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from util.default_util import *
5 | from util.reading_comprehension_util import *
6 |
7 | __all__ = ["Embedding", "PretrainedEmbedding"]
8 |
9 | class Embedding(object):
10 | """Embedding layer"""
11 | def __init__(self,
12 | vocab_size,
13 | embed_dim,
14 | num_gpus=1,
15 | default_gpu_id=0,
16 | regularizer=None,
17 | random_seed=0,
18 | trainable=True,
19 | scope="embedding"):
20 | """initialize embedding layer"""
21 | self.vocab_size = vocab_size
22 | self.embed_dim = embed_dim
23 | self.regularizer = regularizer if trainable == True else None
24 | self.random_seed = random_seed
25 | self.trainable = trainable
26 | self.scope = scope
27 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
28 |
29 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
30 | initializer = create_variable_initializer("glorot_uniform", self.random_seed)
31 | self.embedding = tf.get_variable("embedding", shape=[self.vocab_size, self.embed_dim],
32 | initializer=initializer, regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32)
33 |
34 | def __call__(self,
35 | input_data):
36 | """call embedding layer"""
37 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
38 | output_embedding = tf.nn.embedding_lookup(self.embedding, input_data)
39 |
40 | return output_embedding
41 |
42 | class PretrainedEmbedding(object):
43 | """Pretrained Embedding layer"""
44 | def __init__(self,
45 | vocab_size,
46 | embed_dim,
47 | embed_data,
48 | num_gpus=1,
49 | default_gpu_id=0,
50 | regularizer=None,
51 | trainable=True,
52 | scope="pretrained_embedding"):
53 | """initialize pretrained embedding layer"""
54 | self.vocab_size = vocab_size
55 | self.embed_dim = embed_dim
56 | self.embed_data = embed_data
57 | self.regularizer = regularizer if trainable == True else None
58 | self.trainable = trainable
59 | self.scope = scope
60 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
61 |
62 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
63 | initializer = tf.constant_initializer(self.embed_data)
64 | self.embedding = tf.get_variable("pretrained_embedding", shape=[self.vocab_size, self.embed_dim],
65 | initializer=initializer, regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32)
66 |
67 | def __call__(self,
68 | input_data):
69 | """call pretrained embedding layer"""
70 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
71 | output_embedding = tf.nn.embedding_lookup(self.embedding, input_data)
72 |
73 | return output_embedding
74 |
--------------------------------------------------------------------------------
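A minimal sketch of a frozen pretrained embedding (TF 1.x); the random matrix is a stand-in for loaded GloVe vectors:

```python
import numpy as np
import tensorflow as tf
from layer.embedding import PretrainedEmbedding

vocab_size, embed_dim = 10000, 300
embed_data = np.random.rand(vocab_size, embed_dim).astype(np.float32)  # replace with loaded GloVe data
embedding = PretrainedEmbedding(vocab_size, embed_dim, embed_data, num_gpus=0, trainable=False)
word_ids = tf.placeholder(tf.int32, shape=[None, None])
word_vectors = embedding(word_ids)  # -> [batch, length, 300]
```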
/reading_comprehension/squad/evaluate-v1.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | for article in dataset:
57 | for paragraph in article['paragraphs']:
58 | for qa in paragraph['qas']:
59 | total += 1
60 | if qa['id'] not in predictions:
61 | message = 'Unanswered question ' + qa['id'] + \
62 | ' will receive score 0.'
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 | prediction = predictions[qa['id']]
67 | exact_match += metric_max_over_ground_truths(
68 | exact_match_score, prediction, ground_truths)
69 | f1 += metric_max_over_ground_truths(
70 | f1_score, prediction, ground_truths)
71 |
72 | exact_match = 100.0 * exact_match / total
73 | f1 = 100.0 * f1 / total
74 |
75 | return {'exact_match': exact_match, 'f1': f1}
76 |
77 |
78 | if __name__ == '__main__':
79 | expected_version = '1.1'
80 | parser = argparse.ArgumentParser(
81 | description='Evaluation for SQuAD ' + expected_version)
82 | parser.add_argument('dataset_file', help='Dataset file')
83 | parser.add_argument('prediction_file', help='Prediction File')
84 | args = parser.parse_args()
85 | with open(args.dataset_file) as dataset_file:
86 | dataset_json = json.load(dataset_file)
87 | if (dataset_json['version'] != expected_version):
88 | print('Evaluation expects v-' + expected_version +
89 | ', but got dataset with v-' + dataset_json['version'],
90 | file=sys.stderr)
91 | dataset = dataset_json['data']
92 | with open(args.prediction_file) as prediction_file:
93 | predictions = json.load(prediction_file)
94 | print(json.dumps(evaluate(dataset, predictions)))
95 |
--------------------------------------------------------------------------------
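The script is normally invoked from the command line with a dataset file and a prediction file; for illustration, a hypothetical in-memory call of `evaluate` (assuming the function above is in scope), with structures mirroring the SQuAD v1.1 JSON:

```python
# hypothetical in-memory usage; ids and answer texts are illustrative
dataset = [{"paragraphs": [{"qas": [
    {"id": "q1", "answers": [{"text": "Denver Broncos"}, {"text": "Broncos"}]}
]}]}]
predictions = {"q1": "Denver Broncos"}
print(evaluate(dataset, predictions))  # {'exact_match': 100.0, 'f1': 100.0}
```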
/reading_comprehension/layer/position.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from util.default_util import *
5 | from util.reading_comprehension_util import *
6 |
7 | __all__ = ["SinusoidPosition", "AbsolutePosition"]
8 |
9 | class SinusoidPosition(object):
10 | """sinusoid position layer"""
11 | def __init__(self,
12 | min_time_scale,
13 | max_time_scale,
14 | num_gpus=1,
15 | default_gpu_id=0,
16 | scope="sin_pos"):
17 | """initialize sinusoid position layer"""
18 | self.min_time_scale = min_time_scale
19 | self.max_time_scale = max_time_scale
20 | self.scope = scope
21 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
22 |
23 | def __call__(self,
24 | input_data,
25 | input_mask):
26 | """call sinusoid position layer"""
27 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
28 | input_shape = tf.shape(input_data)
29 | length = input_shape[-2]
30 | channel = input_shape[-1]
31 | num_time_scale = channel // 2
32 | position = tf.to_float(tf.range(length))
33 | log_time_scale = tf.log(float(self.max_time_scale) / float(self.min_time_scale)) / (tf.to_float(num_time_scale) - 1)
34 | inv_time_scale = float(self.min_time_scale) * tf.exp(-1.0 * log_time_scale * tf.to_float(tf.range(num_time_scale)))
35 | scaled_time = tf.expand_dims(position, axis=1) * tf.expand_dims(inv_time_scale, axis=0)
36 | signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
37 | signal = tf.pad(signal, paddings=[[0, 0], [0, tf.mod(channel, 2)]])
38 | signal = tf.reshape(signal, shape=[1, length, channel])
39 |
40 | output_signal = input_data + signal
41 | output_mask = input_mask
42 |
43 | return output_signal, output_mask
44 |
45 | class AbsolutePosition(object):
46 | """absolute position layer"""
47 | def __init__(self,
48 | unit_dim,
49 | max_length,
50 | num_gpus=1,
51 | default_gpu_id=0,
52 | regularizer=None,
53 | random_seed=0,
54 | trainable=True,
55 | scope="abs_pos"):
56 | """initialize absolute position layer"""
57 | self.unit_dim = unit_dim
58 | self.max_length = max_length
59 | self.random_seed = random_seed
60 | self.regularizer = regularizer
61 | self.trainable = trainable
62 | self.scope = scope
63 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
64 |
65 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
66 | weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed)
67 | self.position_embedding = tf.get_variable("position_embedding", shape=[1, self.max_length, self.unit_dim],
68 | initializer=weight_initializer, regularizer=self.regularizer, trainable=self.trainable, dtype=tf.float32)
69 |
70 | def __call__(self,
71 | input_data,
72 | input_mask):
73 | """call absolute position layer"""
74 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
75 | input_shape = tf.shape(input_data)
76 | max_length = input_shape[-2]
77 | position_embedding = self.position_embedding[:,:max_length,:]
78 | output_signal = input_data + position_embedding
79 | output_mask = input_mask
80 |
81 | return output_signal, output_mask
82 |
--------------------------------------------------------------------------------
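A minimal sketch of the sinusoid position encoding (TF 1.x); the time scales below follow the usual Transformer defaults:

```python
import tensorflow as tf
from layer.position import SinusoidPosition

input_data = tf.placeholder(tf.float32, shape=[None, 20, 16])
input_mask = tf.placeholder(tf.float32, shape=[None, 20, 1])
sin_pos = SinusoidPosition(min_time_scale=1.0, max_time_scale=1.0e4, num_gpus=0)
output_signal, output_mask = sin_pos(input_data, input_mask)  # adds the position signal to the input
```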
/reading_comprehension/util/reading_comprehension_util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from util.default_util import *
5 |
6 | __all__ = ["create_variable_initializer", "create_weight_regularizer", "create_activation_function",
7 | "softmax_with_mask", "generate_masked_data", "generate_onehot_label"]
8 |
9 | def create_variable_initializer(initializer_type,
10 | random_seed=None,
11 | data_type=tf.float32):
12 | """create variable initializer"""
13 |     if initializer_type == "zero":
14 |         initializer = tf.zeros_initializer()
15 |     elif initializer_type == "one":
16 |         initializer = tf.ones_initializer()
17 | elif initializer_type == "orthogonal":
18 | initializer = tf.orthogonal_initializer(seed=random_seed, dtype=data_type)
19 | elif initializer_type == "random_uniform":
20 | initializer = tf.random_uniform_initializer(seed=random_seed, dtype=data_type)
21 | elif initializer_type == "glorot_uniform":
22 | initializer = tf.glorot_uniform_initializer(seed=random_seed, dtype=data_type)
23 | elif initializer_type == "xavier_uniform":
24 | initializer = tf.contrib.layers.xavier_initializer(uniform=True, seed=random_seed, dtype=tf.float32)
25 | elif initializer_type == "random_normal":
26 | initializer = tf.random_normal_initializer(seed=random_seed, dtype=data_type)
27 | elif initializer_type == "truncated_normal":
28 | initializer = tf.truncated_normal_initializer(seed=random_seed, dtype=data_type)
29 | elif initializer_type == "glorot_normal":
30 | initializer = tf.glorot_normal_initializer(seed=random_seed, dtype=data_type)
31 | elif initializer_type == "xavier_normal":
32 | initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=random_seed, dtype=tf.float32)
33 | elif initializer_type == "variance_scaling":
34 | initializer = tf.contrib.layers.variance_scaling_initializer(factor=2.0,
35 | mode='FAN_IN', uniform=False, seed=random_seed, dtype=tf.float32)
36 | else:
37 | initializer = None
38 |
39 | return initializer
40 |
41 | def create_weight_regularizer(regularizer_type,
42 | scale):
43 | """create weight regularizer"""
44 | if regularizer_type == "l1":
45 | regularizer = tf.contrib.layers.l1_regularizer(scale)
46 | elif regularizer_type == "l2":
47 | regularizer = tf.contrib.layers.l2_regularizer(scale)
48 | else:
49 | regularizer = None
50 |
51 | return regularizer
52 |
53 | def create_activation_function(activation):
54 | """create activation function"""
55 | if activation == "relu":
56 | activation_function = tf.nn.relu
57 | elif activation == "relu6":
58 | activation_function = tf.nn.relu6
59 | elif activation == "leaky_relu":
60 | activation_function = tf.nn.leaky_relu
61 | elif activation == "elu":
62 | activation_function = tf.nn.elu
63 | elif activation == "crelu":
64 | activation_function = tf.nn.crelu
65 | elif activation == "selu":
66 | activation_function = tf.nn.selu
67 | elif activation == "gelu":
68 | activation_function = gelu
69 | elif activation == "tanh":
70 | activation_function = tf.nn.tanh
71 | elif activation == "sigmoid":
72 | activation_function = tf.nn.sigmoid
73 | elif activation == "softplus":
74 | activation_function = tf.nn.softplus
75 | else:
76 | activation_function = None
77 |
78 | return activation_function
79 |
80 | def softmax_with_mask(input_data,
81 | input_mask,
82 | axis=-1):
83 | """compute softmax with masking"""
84 | return tf.nn.softmax(input_data * input_mask + MIN_FLOAT * (1 - input_mask), axis=axis)
85 |
86 | def generate_masked_data(input_data,
87 | input_mask):
88 | """generate masked data"""
89 | return input_data * input_mask + MIN_FLOAT * (1 - input_mask)
90 |
91 | def generate_onehot_label(input_data,
92 | input_depth):
93 | """generate one-hot label"""
94 | return tf.one_hot(input_data, depth=input_depth, on_value=1.0, off_value=0.0, dtype=tf.float32)
95 |
96 | def gelu(input_tensor):
97 | """Gaussian Error Linear Unit"""
98 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
99 | return input_tensor * cdf
100 |
--------------------------------------------------------------------------------
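An illustrative masked softmax: padded positions are pushed to `MIN_FLOAT` before the softmax, so they receive (near-)zero probability:

```python
import tensorflow as tf
from util.reading_comprehension_util import softmax_with_mask

logits = tf.constant([[2.0, 1.0, 0.5, 0.0]])
mask = tf.constant([[1.0, 1.0, 0.0, 0.0]])
probs = softmax_with_mask(logits, mask)
with tf.Session() as sess:
    print(sess.run(probs))  # ~[[0.73, 0.27, 0.0, 0.0]]
```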
/reading_comprehension/external/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Python implementation of BLEU and smooth-BLEU.
17 |
18 | This module provides a Python implementation of BLEU and smooth-BLEU.
19 | Smooth BLEU is computed following the method outlined in the paper:
20 | Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21 | evaluation metrics for machine translation. COLING 2004.
22 | """
23 |
24 | import collections
25 | import math
26 |
27 | __all__ = ["compute_bleu"]
28 |
29 | def _get_ngrams(segment, max_order):
30 | """Extracts all n-grams upto a given maximum order from an input segment.
31 |
32 | Args:
33 | segment: text segment from which n-grams will be extracted.
34 |     max_order: maximum length in tokens of the n-grams returned by this
35 |       method.
36 |
37 | Returns:
38 |     The Counter containing all n-grams up to max_order in segment
39 | with a count of how many times each n-gram occurred.
40 | """
41 | ngram_counts = collections.Counter()
42 | for order in range(1, max_order + 1):
43 | for i in range(0, len(segment) - order + 1):
44 | ngram = tuple(segment[i:i+order])
45 | ngram_counts[ngram] += 1
46 | return ngram_counts
47 |
48 |
49 | def compute_bleu(reference_corpus, translation_corpus, max_order=4,
50 | smooth=False):
51 | """Computes BLEU score of translated segments against one or more references.
52 |
53 | Args:
54 | reference_corpus: list of lists of references for each translation. Each
55 | reference should be tokenized into a list of tokens.
56 | translation_corpus: list of translations to score. Each translation
57 | should be tokenized into a list of tokens.
58 | max_order: Maximum n-gram order to use when computing BLEU score.
59 | smooth: Whether or not to apply Lin et al. 2004 smoothing.
60 |
61 | Returns:
62 |     6-tuple with the BLEU score, n-gram precisions, brevity penalty,
63 |     length ratio, translation length and reference length.
64 | """
65 | matches_by_order = [0] * max_order
66 | possible_matches_by_order = [0] * max_order
67 | reference_length = 0
68 | translation_length = 0
69 | for (references, translation) in zip(reference_corpus,
70 | translation_corpus):
71 | reference_length += min(len(r) for r in references)
72 | translation_length += len(translation)
73 |
74 | merged_ref_ngram_counts = collections.Counter()
75 | for reference in references:
76 | merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
77 | translation_ngram_counts = _get_ngrams(translation, max_order)
78 | overlap = translation_ngram_counts & merged_ref_ngram_counts
79 | for ngram in overlap:
80 | matches_by_order[len(ngram)-1] += overlap[ngram]
81 | for order in range(1, max_order+1):
82 | possible_matches = len(translation) - order + 1
83 | if possible_matches > 0:
84 | possible_matches_by_order[order-1] += possible_matches
85 |
86 | precisions = [0] * max_order
87 | for i in range(0, max_order):
88 | if smooth:
89 | precisions[i] = ((matches_by_order[i] + 1.) /
90 | (possible_matches_by_order[i] + 1.))
91 | else:
92 | if possible_matches_by_order[i] > 0:
93 | precisions[i] = (float(matches_by_order[i]) /
94 | possible_matches_by_order[i])
95 | else:
96 | precisions[i] = 0.0
97 |
98 | if min(precisions) > 0:
99 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
100 | geo_mean = math.exp(p_log_sum)
101 | else:
102 | geo_mean = 0
103 |
104 | ratio = float(translation_length) / reference_length
105 |
106 | if ratio > 1.0:
107 | bp = 1.
108 | else:
109 | bp = math.exp(1 - 1. / ratio)
110 |
111 | bleu = geo_mean * bp
112 |
113 | return (bleu, precisions, bp, ratio, translation_length, reference_length)
114 |
--------------------------------------------------------------------------------
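An illustrative corpus-level BLEU call; inputs are tokenized, with one list of references per translation:

```python
from external.bleu import compute_bleu

reference_corpus = [[["the", "cat", "sat", "on", "the", "mat"]]]
translation_corpus = [["the", "cat", "sat", "on", "mat"]]
bleu, precisions, bp, ratio, trans_len, ref_len = compute_bleu(reference_corpus, translation_corpus)
print(100 * bleu)  # scaled to 0-100, as in eval_util._bleu
```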
/reading_comprehension/util/eval_logger.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import collections
3 | import os.path
4 | import time
5 | import json
6 |
7 | import numpy as np
8 | import tensorflow as tf
9 |
10 | __all__ = ["BasicInfoEvalLog", "ExtrinsicEvalLog", "DecodingEvalLog", "EvalLogger"]
11 |
12 | class BasicInfoEvalLog(collections.namedtuple("BasicInfoEvalLog", ("epoch", "global_step"))):
13 | pass
14 |
15 | class ExtrinsicEvalLog(collections.namedtuple("ExtrinsicEvalLog", ("metric", "score", "sample_output", "sample_size"))):
16 | pass
17 |
18 | class DecodingEvalLog(collections.namedtuple("DecodingEvalLog", ("sample_input", "sample_output", "sample_reference"))):
19 | pass
20 |
21 | class EvalLogger(object):
22 | """evaluation logger"""
23 | def __init__(self,
24 | output_dir):
25 | """extrinsic evaluation result"""
26 | self.extrinsic_eval = None
27 | self.extrinsic_eval_info = None
28 | self.extrinsic_eval_detail = None
29 | self.extrinsic_eval_detail_info = None
30 |
31 | """extrinsic evaluation result"""
32 | self.decoding_eval = None
33 | self.decoding_eval_info = None
34 |
35 | """initialize evaluation logger"""
36 | self.output_dir = output_dir
37 | if not tf.gfile.Exists(self.output_dir):
38 | tf.gfile.MakeDirs(self.output_dir)
39 | self.log_file = os.path.join(self.output_dir, "eval_{0}.log".format(time.time()))
40 | self.log_writer = codecs.getwriter("utf-8")(tf.gfile.GFile(self.log_file, mode="a"))
41 |
42 | def update_extrinsic_eval(self,
43 | eval_result_list,
44 | basic_info):
45 | """update evaluation logger with extrinsic evaluation result"""
46 | self.extrinsic_eval = eval_result_list
47 | self.extrinsic_eval_info = basic_info
48 |
49 | def update_extrinsic_eval_detail(self,
50 | eval_result_detail,
51 | basic_info):
52 | """update evaluation logger with extrinsic evaluation result detail"""
53 | self.extrinsic_eval_detail = eval_result_detail
54 | self.extrinsic_eval_detail_info = basic_info
55 |
56 | def check_extrinsic_eval(self):
57 | """check extrinsic evaluation result"""
58 | for eval_result in self.extrinsic_eval:
59 | log_line = "epoch={0}, global step={1}, {2}={3}, sample size={4}".format(self.extrinsic_eval_info.epoch,
60 |                 self.extrinsic_eval_info.global_step, eval_result.metric, eval_result.score, eval_result.sample_size)
61 | self.log_writer.write("{0}\r\n".format(log_line))
62 | print(log_line)
63 |
64 | def check_extrinsic_eval_detail(self):
65 | """check extrinsic evaluation detail result"""
66 |         if self.extrinsic_eval_detail is None:
67 |             return
68 |         eval_detail_file = os.path.join(self.output_dir, "eval_{0}_{1}_{2}.detail".format(self.extrinsic_eval_detail_info.epoch,
69 |             self.extrinsic_eval_detail_info.global_step, time.time()))
70 |         with codecs.getwriter("utf-8")(tf.gfile.GFile(eval_detail_file, mode="w")) as eval_detail_writer:
71 |             sample_output = json.dumps(self.extrinsic_eval_detail.sample_output, indent=4)
72 |             eval_detail_writer.write(sample_output)
73 |
74 | def update_decoding_eval(self,
75 | eval_result_list,
76 | basic_info):
77 | """update evaluation logger with decoding evaluation result"""
78 | self.decoding_eval = eval_result_list
79 | self.decoding_eval_info = basic_info
80 |
81 | def check_decoding_eval(self):
82 | """check decoding evaluation result"""
83 | sample_size = len(self.decoding_eval)
84 | log_line = "epoch={0}, global step={1}, sample size={2}".format(self.decoding_eval_info.epoch,
85 |             self.decoding_eval_info.global_step, sample_size)
86 | self.log_writer.write("{0}\r\n".format(log_line))
87 | print(log_line)
88 |
89 | for i in range(sample_size):
90 | eval_result = self.decoding_eval[i]
91 | log_line = "====================================="
92 | self.log_writer.write("{0}\r\n".format(log_line))
93 | print(log_line)
94 | log_line = "sample {0} - input: {1}".format(i+1, eval_result.sample_input).encode('utf-8')
95 | self.log_writer.write("{0}\r\n".format(log_line))
96 | print(log_line)
97 | log_line = "sample {0} - output: {1}".format(i+1, eval_result.sample_output).encode('utf-8')
98 | self.log_writer.write("{0}\r\n".format(log_line))
99 | print(log_line)
100 | log_line = "sample {0} - reference: {1}".format(i+1, eval_result.sample_reference).encode('utf-8')
101 | self.log_writer.write("{0}\r\n".format(log_line))
102 | print(log_line)
103 |
--------------------------------------------------------------------------------
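A minimal sketch of the logger; the namedtuple fields mirror the definitions above, and the values are illustrative:

```python
from util.eval_logger import BasicInfoEvalLog, ExtrinsicEvalLog, EvalLogger

eval_logger = EvalLogger("output/eval")
basic_info = BasicInfoEvalLog(epoch=1, global_step=1000)
eval_results = [ExtrinsicEvalLog(metric="f1", score=78.2, sample_output=None, sample_size=10570)]
eval_logger.update_extrinsic_eval(eval_results, basic_info)
eval_logger.check_extrinsic_eval()  # logs "epoch=1, global step=1000, f1=78.2, sample size=10570"
```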
/docs/index.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | Machine reading comprehension (MRC), a task that asks a machine to read a given context and then answer questions based on its understanding, is considered one of the key problems in artificial intelligence and attracts significant interest from both academia and industry. Over the past few years, great progress has been made in this field, thanks to various end-to-end trained neural models and the release of high-quality, large-scale datasets.
3 | ![SQuAD example](SQuAD.example.png){:width="800px"}
4 | *Figure 1: MRC example from SQuAD 2.0 dev set*
5 |
6 | ## DataSet
7 | * [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) is a reading comprehension dataset, consisting of questions posed by crowd-workers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.
8 | * [GloVe](https://nlp.stanford.edu/projects/glove/) is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.
9 |
10 | ## Experiment
11 | ### QANet
12 | [QANet](https://github.com/google-research/google-research/tree/master/qanet) is an MRC architecture proposed by Google Brain, which does not require recurrent networks: its encoder consists exclusively of convolution and self-attention, where convolution models local interactions and self-attention models global interactions.
13 |
14 | ![QANet architecture](QANet.architecture.png){:width="700px"}
15 |
16 | *Figure 2: An overview of the QANet architecture*
17 |
18 | ![QANet metrics](QANet.metric.png){:width="1000px"}
19 |
20 | *Figure 3: The experiment details are reported on the SQuAD v1 dataset. Both train & dev sets are processed using spaCy. Invalid samples are removed from both train & dev sets. EM results for the QANet model with/without EMA are shown on the left; F1 results for the QANet model with/without EMA are shown on the right*
21 |
22 | | Model | # Epoch | # Train Steps | Batch Size | Data Size | # Head | # Dim | EM | F1 |
23 | |:-------------------:|:-------:|:-------------:|:----------:|:-------------:|:------:|:-----:|:------:|:------:|
24 | | This implementation | 13 | ~70,000 | 16 | 87k (no aug) | 8 | 128 | 70.2 | 80.0 |
25 | | Original Paper | ~13 | 35,000 | 32 | 87k (no aug) | 8 | 128 | N/A | 77.0 |
26 | | Original Paper | ~55 | 150,000 | 32 | 87k (no aug) | 8 | 128 | 73.6 | 82.7 |
27 |
28 | *Table 1: The performance results are reported on the SQuAD v1 dataset. Both train & dev sets are processed using spaCy. Invalid samples are removed from the train set only. Settings for this QANet implementation are selected to be comparable with the settings in the original paper*
29 |
30 | ### BiDAF
31 | [BiDAF](https://allenai.github.io/bi-att-flow/) (Bi-Directional Attention Flow) is an MRC architecture proposed by the Allen Institute for Artificial Intelligence (AI2), which consists of a multi-stage hierarchical process that represents the context at different levels of granularity and uses a bidirectional attention flow mechanism to obtain a query-aware context representation without early summarization.
32 |
33 | ![BiDAF architecture](BiDAF.architecture.png){:width="700px"}
34 |
35 | *Figure 4: An overview of the BiDAF architecture*
36 |
37 | ![BiDAF metrics](BiDAF.metric.png){:width="1000px"}
38 |
39 | *Figure 5: The experiment details are reported on the SQuAD v1 dataset. Both train & dev sets are processed using spaCy. Invalid samples are removed from both train & dev sets. EM results for the BiDAF model with/without EMA are shown on the left; F1 results for the BiDAF model with/without EMA are shown on the right*
40 |
41 | | Model | # Epoch | # Train Steps | Batch Size | Attention Type | # Dim | EM | F1 |
42 | |:-------------------:|:-------:|:-------------:|:----------:|:--------------:|:-----:|:------:|:------:|
43 | | This implementation | 12 | ~17,500 | 60 | trilinear | 100 | 68.5 | 78.2 |
44 | | Original Paper | 12 | ~17,500 | 60 | trilinear | 100 | 67.7 | 77.3 |
45 |
46 | *Table 2: The performance results are reported on the SQuAD v1 dataset. Both train & dev sets are processed using spaCy. Invalid samples are removed from the train set only. Settings for this BiDAF implementation are selected to be comparable with the settings in the original paper*
47 |
48 | ### R-Net
49 | [R-Net](https://www.microsoft.com/en-us/research/publication/mcr/) is an MRC architecture proposed by Microsoft Research Asia (MSRA), which first matches the question and passage with gated attention-based recurrent networks to obtain the question-aware passage representation, then uses a self-matching attention mechanism to refine the representation by matching the passage against itself, and finally employs pointer networks to locate the answer positions in the passage.
50 |
51 | ![R-Net architecture](R-Net.architecture.png){:width="700px"}
52 |
53 | *Figure 6: An overview of the R-Net architecture*
54 |
55 | ## Reference
56 | * Adams Wei Yu, David Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V Le. [QANet: Combining local convolution with global self-attention for reading comprehension](https://arxiv.org/abs/1804.09541) [2018]
57 | * Min Joon Seo, Aniruddha Kembhavi, Ali Farhadi, and Hannaneh Hajishirzi. [Bidirectional attention flow for machine comprehension](https://arxiv.org/abs/1611.01603) [2017]
58 | * Wenhui Wang, Nan Yang, Furu Wei, Baobao Chang, and Ming Zhou. [Gated self-matching networks for reading comprehension and question answering](https://aclanthology.info/papers/P17-1018/p17-1018) [2017]
59 | * Danqi Chen. [Neural reading comprehension and beyond](https://cs.stanford.edu/~danqi/papers/thesis.pdf) [2018]
60 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine Reading Comprehension
2 | Machine reading comprehension (MRC), a task that asks a machine to read a given context and then answer questions based on its understanding, is considered one of the key problems in artificial intelligence and attracts significant interest from both academia and industry. Over the past few years, great progress has been made in this field, thanks to various end-to-end trained neural models and the release of high-quality, large-scale datasets. In this repo, I'll share more details on the MRC task by re-implementing a few MRC models and testing them on standard MRC datasets.
3 | ![SQuAD example](SQuAD.example.png)
4 | Figure 1: MRC example from SQuAD 2.0 dev set
5 |
6 | ## Setting
7 | * Python 3.6.6
8 | * TensorFlow 1.12
9 | * NumPy 1.15.4
10 | * NLTK 3.3
11 | * spaCy 2.0.12
12 |
13 | ## DataSet
14 | * [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) is a reading comprehension dataset, consisting of questions posed by crowd-workers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.
15 | * [GloVe](https://nlp.stanford.edu/projects/glove/) is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.
16 |
17 | ## Usage
18 | * Preprocess data
19 | ```bash
20 | # preprocess train data
21 | python squad/preprocess.py --format json --input_file data/squad/train-v1.1/train-v1.1.json --output_file data/squad/train-v1.1/train-v1.1.squad.json
22 | # preprocess dev data
23 | python squad/preprocess.py --format json --input_file data/squad/dev-v1.1/dev-v1.1.json --output_file data/squad/dev-v1.1/dev-v1.1.squad.json
24 | ```
25 | * Run experiment
26 | ```bash
27 | # run experiment in train + eval mode
28 | python reading_comprehension_run.py --mode train_eval --config config/config_mrc_template.xxx.json
29 | # run experiment in train only mode
30 | python reading_comprehension_run.py --mode train --config config/config_mrc_template.xxx.json
31 | # run experiment in eval only mode
32 | python reading_comprehension_run.py --mode eval --config config/config_mrc_template.xxx.json
33 | ```
34 | * Search hyper-parameters
35 | ```bash
36 | # random search hyper-parameters
37 | python hparam_search.py --base-config config/config_mrc_template.xxx.json --search-config config/config_search_template.xxx.json --num-group 10 --random-seed 100 --output-dir config/search
38 | ```
39 | * Visualize summary
40 | ```bash
41 | # visualize summary via tensorboard
42 | tensorboard --logdir=output
43 | ```
44 | ## Experiment
45 | ### QANet
46 | [QANet](https://github.com/google-research/google-research/tree/master/qanet) is an MRC architecture proposed by Google Brain that does not require recurrent networks: its encoder consists exclusively of convolution and self-attention, where convolution models local interactions and self-attention models global interactions.
47 |
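To make the global-interaction part concrete, here is a minimal NumPy sketch of single-head scaled dot-product self-attention (illustrative only; the encoder blocks in this repo use multi-head attention with masking, which this sketch omits):

```python
import numpy as np

def self_attention(x, w_q, w_k, w_v):
    """Single-head scaled dot-product self-attention over a sequence.

    x: [seq_len, dim] input representations
    w_q, w_k, w_v: [dim, dim] projection matrices
    """
    q, k, v = x @ w_q, x @ w_k, x @ w_v
    scores = q @ k.T / np.sqrt(x.shape[-1])          # all pairwise interactions
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)   # softmax over positions
    return weights @ v                               # weighted sum of values

seq_len, dim = 5, 8
rng = np.random.RandomState(0)
x = rng.randn(seq_len, dim)
w = [rng.randn(dim, dim) / np.sqrt(dim) for _ in range(3)]
print(self_attention(x, *w).shape)  # (5, 8)
```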
48 | 
49 | Figure 2: An overview of the QANet architecture
50 |
51 | 
52 | Figure 3: The experiment details are reported on the SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from both train & dev sets. EM results for the QANet model with/without EMA are shown on the left; F1 results for the QANet model with/without EMA are shown on the right
53 |
54 | | Model | # Epoch | # Train Steps | Batch Size | Data Size | # Head | # Dim | EM | F1 |
55 | |:-------------------:|:-------:|:-------------:|:----------:|:-------------:|:------:|:-----:|:------:|:------:|
56 | | This implementation | 13 | ~70,000 | 16 | 87k (no aug) | 8 | 128 | 70.2 | 80.0 |
57 | | Original Paper | ~13 | 35,000 | 32 | 87k (no aug) | 8 | 128 | N/A | 77.0 |
58 | | Original Paper | ~55 | 150,000 | 32 | 87k (no aug) | 8 | 128 | 73.6 | 82.7 |
59 |
60 | Table 1: The performance results are reported on the SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from the train set only. Settings for this QANet implementation are selected to be comparable with the settings in the original paper
61 |
62 | ### BiDAF
63 | [BiDAF](https://allenai.github.io/bi-att-flow/) (Bi-Directional Attention Flow) is an MRC architecture proposed by the Allen Institute for Artificial Intelligence (AI2), which consists of a multi-stage hierarchical process that represents the context at different levels of granularity and uses a bidirectional attention flow mechanism to obtain a query-aware context representation without early summarization.
64 |
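The "trilinear" attention type in Table 2 scores a context position c_i against a question position q_j as w · [c_i; q_j; c_i ∘ q_j]. A minimal NumPy sketch of the similarity matrix and the resulting context-to-question attention (illustrative only; parameter names are hypothetical):

```python
import numpy as np

def trilinear_similarity(context, question, w_c, w_q, w_cq):
    """Trilinear score s[i, j] = w . [c_i; q_j; c_i * q_j], computed without
    materializing the concatenated features.

    context: [c_len, dim], question: [q_len, dim]
    w_c, w_q, w_cq: [dim] weight slices for the three feature groups
    """
    s_c = context @ w_c                    # [c_len], contribution of c_i
    s_q = question @ w_q                   # [q_len], contribution of q_j
    s_cq = (context * w_cq) @ question.T   # [c_len, q_len], c_i * q_j term
    return s_c[:, None] + s_q[None, :] + s_cq

c_len, q_len, dim = 6, 4, 8
rng = np.random.RandomState(0)
c, q = rng.randn(c_len, dim), rng.randn(q_len, dim)
w1, w2, w3 = rng.randn(3, dim)
sim = trilinear_similarity(c, q, w1, w2, w3)   # [6, 4] similarity matrix
# context-to-question attention: softmax over question positions
a = np.exp(sim - sim.max(axis=1, keepdims=True))
a /= a.sum(axis=1, keepdims=True)
query_aware_context = a @ q                    # [6, 8]
print(query_aware_context.shape)
```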
65 | 
66 | Figure 4: An overview of the BiDAF architecture
67 |
68 | 
69 | Figure 5: The experiment details are reported on the SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from both train & dev sets. EM results for the BiDAF model with/without EMA are shown on the left; F1 results for the BiDAF model with/without EMA are shown on the right
70 |
71 | | Model | # Epoch | # Train Steps | Batch Size | Attention Type | # Dim | EM | F1 |
72 | |:-------------------:|:-------:|:-------------:|:----------:|:--------------:|:-----:|:------:|:------:|
73 | | This implementation | 12 | ~17,500 | 60 | trilinear | 100 | 68.5 | 78.2 |
74 | | Original Paper | 12 | ~17,500 | 60 | trilinear | 100 | 67.7 | 77.3 |
75 |
76 | Table 2: The performance results are reported on the SQuAD v1 dataset. Both train & dev sets are processed using Spacy. Invalid samples are removed from the train set only. Settings for this BiDAF implementation are selected to be comparable with the settings in the original paper
77 |
78 | ### R-Net
79 | [R-Net](https://www.microsoft.com/en-us/research/publication/mcr/) is an MRC architecture proposed by Microsoft Research Asia (MSRA), which first matches the question and passage with gated attention-based recurrent networks to obtain the question-aware passage representation, then uses a self-matching attention mechanism to refine the representation by matching the passage against itself, and finally employs pointer networks to locate answer positions within the passage.
80 |
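The gating in the attention-based recurrent networks scales the RNN input [u_t; c_t] element-wise with a learned sigmoid gate, letting the model suppress passage words that are irrelevant to the question. A minimal NumPy sketch of the gate (illustrative only; names are hypothetical):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gated_attention_input(passage_t, attended_question_t, w_g):
    """Gate the RNN input [u_t; c_t] element-wise, as in gated
    attention-based recurrent networks: x_t = g_t * [u_t; c_t].

    passage_t, attended_question_t: [dim] vectors at time step t
    w_g: [2 * dim, 2 * dim] gate weight matrix
    """
    x = np.concatenate([passage_t, attended_question_t])  # [2 * dim]
    gate = sigmoid(w_g @ x)                               # element-wise gate
    return gate * x                                       # gated RNN input

dim = 8
rng = np.random.RandomState(0)
u_t, c_t = rng.randn(dim), rng.randn(dim)
w_g = rng.randn(2 * dim, 2 * dim) / np.sqrt(2 * dim)
print(gated_attention_input(u_t, c_t, w_g).shape)  # (16,)
```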
81 | 
82 | Figure 6: An overview of the R-Net architecture
83 |
84 | ## Reference
85 | * Adams Wei Yu, David Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V Le. [QANet: Combining local convolution with global self-attention for reading comprehension](https://arxiv.org/abs/1804.09541) [2018]
86 | * Min Joon Seo, Aniruddha Kembhavi, Ali Farhadi, and Hannaneh Hajishirzi. [Bidirectional attention flow for machine comprehension](https://arxiv.org/abs/1611.01603) [2017]
87 | * Wenhui Wang, Nan Yang, Furu Wei, Baobao Chang, and Ming Zhou. [Gated self-matching networks for reading comprehension and question answering](https://aclanthology.info/papers/P17-1018/p17-1018) [2017]
88 | * Danqi Chen. [Neural reading comprehension and beyond](https://cs.stanford.edu/~danqi/papers/thesis.pdf) [2018]
89 |
--------------------------------------------------------------------------------
/reading_comprehension/config/config_mrc_template.rnet.json:
--------------------------------------------------------------------------------
1 | {
2 | "data_train_mrc_file": "data/squad/train-v1.1/train-v1.1.squad.json",
3 | "data_train_mrc_file_type": "json",
4 | "data_eval_mrc_file": "data/squad/dev-v1.1/dev-v1.1.squad.json",
5 | "data_eval_mrc_file_type": "json",
6 | "data_embedding_file": "data/squad/resource/squad.all.word.embed",
7 | "data_full_embedding_file": "data/glove/glove.840B.300d.txt",
8 | "data_tfrecord_dir": "data/squad/tfrecord",
9 | "data_max_question_length": 40,
10 | "data_max_context_length": 500,
11 | "data_max_answer_length": 30,
12 | "data_max_subword_length": 16,
13 | "data_max_char_length": 16,
14 | "data_word_vocab_file": "data/squad/resource/squad.all.word.vocab",
15 | "data_word_vocab_size": 180963,
16 | "data_word_vocab_threshold": 0,
17 |     "data_word_unk": "<unk>",
18 |     "data_word_pad": "<pad>",
19 |     "data_word_sos": "<s>",
20 |     "data_word_eos": "</s>",
21 | "data_word_placeholder_enable": false,
22 | "data_subword_vocab_file": "data/squad/resource/squad.all.subword.vocab",
23 | "data_subword_vocab_size": 50554,
24 | "data_subword_vocab_threshold": 0,
25 | "data_subword_unk": "***",
26 | "data_subword_pad": "###",
27 | "data_subword_size": 3,
28 | "data_char_vocab_file": "data/squad/resource/squad.all.char.vocab",
29 | "data_char_vocab_size": 1610,
30 | "data_char_vocab_threshold": 0,
31 | "data_char_unk": "*",
32 | "data_char_pad": "#",
33 | "data_answer_type": "span",
34 | "data_expand_multiple_answer": false,
35 | "data_enable_validation": true,
36 | "data_pipeline_mode": "tfrecord",
37 | "data_num_parallel": 4,
38 | "data_log_output_dir": "output/rnet/log",
39 | "data_result_output_dir": "output/rnet/result",
40 | "train_random_seed": 100,
41 | "train_enable_shuffle": true,
42 | "train_shuffle_buffer_size": 30000,
43 | "train_batch_size": 64,
44 | "train_eval_batch_size": 100,
45 | "train_eval_metric": ["exact", "f1"],
46 | "train_eval_detail_type": "simplified",
47 | "train_decoding_sample_size": 3,
48 | "train_num_epoch": 3,
49 | "train_ckpt_output_dir": "output/rnet/checkpoint",
50 | "train_summary_output_dir": "output/rnet/summary",
51 | "train_step_per_stat": 10,
52 | "train_step_per_ckpt": 1000,
53 | "train_step_per_eval": 1000,
54 | "train_clip_norm": 5.0,
55 | "train_label_smoothing": 0.0,
56 | "train_enable_debugging": false,
57 | "train_ema_enable": true,
58 | "train_ema_decay_rate": 0.999,
59 | "train_ema_enable_debias": false,
60 | "train_ema_enable_dynamic_decay": false,
61 | "train_regularization_enable": false,
62 | "train_regularization_type": "l2",
63 | "train_regularization_scale": 3e-7,
64 | "train_optimizer_type": "adadelta",
65 | "train_optimizer_learning_rate": 1.0,
66 | "train_optimizer_warmup_enable": false,
67 | "train_optimizer_warmup_mode": "exponential_warmup",
68 | "train_optimizer_warmup_rate": 0.01,
69 | "train_optimizer_warmup_end_step": 1000,
70 | "train_optimizer_decay_enable": false,
71 | "train_optimizer_decay_mode": "exponential_decay",
72 | "train_optimizer_decay_rate": 0.95,
73 | "train_optimizer_decay_step": 1000,
74 | "train_optimizer_decay_start_step": 10000,
75 | "train_optimizer_momentum_beta": 0.9,
76 | "train_optimizer_rmsprop_beta": 0.999,
77 | "train_optimizer_rmsprop_epsilon": 1e-08,
78 | "train_optimizer_adadelta_rho": 0.95,
79 | "train_optimizer_adadelta_epsilon": 1e-06,
80 | "train_optimizer_adagrad_init_accumulator": 0.1,
81 | "train_optimizer_adam_beta_1": 0.9,
82 | "train_optimizer_adam_beta_2": 0.999,
83 | "train_optimizer_adam_epsilon": 1e-08,
84 | "model_type": "rnet",
85 | "model_scope": "mrc",
86 | "model_representation_word_embed_dim": 300,
87 | "model_representation_word_embed_pretrained": true,
88 | "model_representation_word_feat_trainable": false,
89 | "model_representation_word_feat_enable": true,
90 | "model_representation_subword_embed_dim": 8,
91 | "model_representation_subword_unit_dim": 75,
92 | "model_representation_subword_cell_type": "gru",
93 | "model_representation_subword_hidden_activation": "relu",
94 | "model_representation_subword_dropout": 0.2,
95 | "model_representation_subword_feat_trainable": true,
96 | "model_representation_subword_feat_enable": false,
97 | "model_representation_char_embed_dim": 8,
98 | "model_representation_char_unit_dim": 75,
99 | "model_representation_char_cell_type": "gru",
100 | "model_representation_char_hidden_activation": "relu",
101 | "model_representation_char_dropout": 0.2,
102 | "model_representation_char_feat_trainable": true,
103 | "model_representation_char_feat_enable": true,
104 | "model_representation_fusion_type": "highway",
105 | "model_representation_fusion_num_layer": 2,
106 | "model_representation_fusion_unit_dim": 450,
107 | "model_representation_fusion_hidden_activation": "relu",
108 | "model_representation_fusion_dropout": 0.2,
109 | "model_representation_fusion_trainable": true,
110 | "model_understanding_question_num_layer": 3,
111 | "model_understanding_question_unit_dim": 75,
112 | "model_understanding_question_cell_type": "gru",
113 | "model_understanding_question_hidden_activation": "tanh",
114 | "model_understanding_question_dropout": 0.2,
115 | "model_understanding_question_forget_bias": 1.0,
116 | "model_understanding_question_residual_connect": false,
117 | "model_understanding_question_trainable": true,
118 | "model_understanding_context_num_layer": 3,
119 | "model_understanding_context_unit_dim": 75,
120 | "model_understanding_context_cell_type": "gru",
121 | "model_understanding_context_hidden_activation": "tanh",
122 | "model_understanding_context_dropout": 0.2,
123 | "model_understanding_context_forget_bias": 1.0,
124 | "model_understanding_context_residual_connect": false,
125 | "model_understanding_context_trainable": true,
126 | "model_understanding_enable_sharing": false,
127 | "model_interaction_context2question_num_layer": 1,
128 | "model_interaction_context2question_unit_dim": 75,
129 | "model_interaction_context2question_cell_type": "gru",
130 | "model_interaction_context2question_hidden_activation": "tanh",
131 | "model_interaction_context2question_dropout": 0.2,
132 | "model_interaction_context2question_attention_dropout": 0.0,
133 | "model_interaction_context2question_forget_bias": 1.0,
134 | "model_interaction_context2question_residual_connect": false,
135 | "model_interaction_context2question_attention_dim": 75,
136 | "model_interaction_context2question_score_type": "linear",
137 | "model_interaction_context2question_trainable": true,
138 | "model_modeling_answer_num_layer": 1,
139 | "model_modeling_answer_unit_dim": 75,
140 | "model_modeling_answer_cell_type": "gru",
141 | "model_modeling_answer_hidden_activation": "tanh",
142 | "model_modeling_answer_dropout": 0.2,
143 | "model_modeling_answer_attention_dropout": 0.0,
144 | "model_modeling_answer_forget_bias": 1.0,
145 | "model_modeling_answer_residual_connect": false,
146 | "model_modeling_answer_attention_dim": 75,
147 | "model_modeling_answer_score_type": "linear",
148 | "model_modeling_answer_trainable": true,
149 | "model_output_answer_num_layer": 1,
150 | "model_output_answer_unit_dim": 150,
151 | "model_output_answer_cell_type": "gru",
152 | "model_output_answer_hidden_activation": "tanh",
153 | "model_output_answer_dropout": 0.2,
154 | "model_output_answer_forget_bias": 1.0,
155 | "model_output_answer_residual_connect": false,
156 | "model_output_answer_attention_dim": 150,
157 | "model_output_answer_score_type": "linear",
158 | "model_output_answer_trainable": true,
159 | "device_num_gpus": 1,
160 | "device_default_gpu_id": 0,
161 | "device_log_device_placement": false,
162 | "device_allow_soft_placement": true,
163 | "device_allow_growth": false,
164 | "device_per_process_gpu_memory_fraction": 0.8
165 | }
--------------------------------------------------------------------------------
/reading_comprehension/config/config_mrc_template.qanet.json:
--------------------------------------------------------------------------------
1 | {
2 | "data_train_mrc_file": "data/squad/train-v1.1/train-v1.1.squad.json",
3 | "data_train_mrc_file_type": "json",
4 | "data_eval_mrc_file": "data/squad/dev-v1.1/dev-v1.1.squad.json",
5 | "data_eval_mrc_file_type": "json",
6 | "data_embedding_file": "data/squad/resource/squad.all.word.embed",
7 | "data_full_embedding_file": "data/glove/glove.840B.300d.txt",
8 | "data_tfrecord_dir": "data/squad/tfrecord",
9 | "data_max_question_length": 50,
10 | "data_max_context_length": 400,
11 | "data_max_answer_length": 30,
12 | "data_max_subword_length": 16,
13 | "data_max_char_length": 16,
14 | "data_word_vocab_file": "data/squad/resource/squad.all.word.vocab",
15 | "data_word_vocab_size": 180963,
16 | "data_word_vocab_threshold": 0,
17 |     "data_word_unk": "<unk>",
18 |     "data_word_pad": "<pad>",
19 |     "data_word_sos": "<s>",
20 |     "data_word_eos": "</s>",
21 | "data_word_placeholder_enable": false,
22 | "data_subword_vocab_file": "data/squad/resource/squad.all.subword.vocab",
23 | "data_subword_vocab_size": 50554,
24 | "data_subword_vocab_threshold": 0,
25 | "data_subword_unk": "***",
26 | "data_subword_pad": "###",
27 | "data_subword_size": 3,
28 | "data_char_vocab_file": "data/squad/resource/squad.all.char.vocab",
29 | "data_char_vocab_size": 1610,
30 | "data_char_vocab_threshold": 0,
31 | "data_char_unk": "*",
32 | "data_char_pad": "#",
33 | "data_answer_type": "span",
34 | "data_expand_multiple_answer": false,
35 | "data_enable_validation": true,
36 | "data_pipeline_mode": "tfrecord",
37 | "data_num_parallel": 4,
38 | "data_log_output_dir": "output/qanet/log",
39 | "data_result_output_dir": "output/qanet/result",
40 | "train_random_seed": 100,
41 | "train_enable_shuffle": true,
42 | "train_shuffle_buffer_size": 30000,
43 | "train_batch_size": 16,
44 | "train_eval_batch_size": 100,
45 | "train_eval_metric": ["exact", "f1"],
46 | "train_eval_detail_type": "simplified",
47 | "train_decoding_sample_size": 3,
48 | "train_num_epoch": 3,
49 | "train_ckpt_output_dir": "output/qanet/checkpoint",
50 | "train_summary_output_dir": "output/qanet/summary",
51 | "train_step_per_stat": 10,
52 | "train_step_per_ckpt": 1000,
53 | "train_step_per_eval": 1000,
54 | "train_clip_norm": 5.0,
55 | "train_label_smoothing": 0.0,
56 | "train_enable_debugging": false,
57 | "train_ema_enable": true,
58 | "train_ema_decay_rate": 0.9999,
59 | "train_ema_enable_debias": false,
60 | "train_ema_enable_dynamic_decay": false,
61 | "train_regularization_enable": true,
62 | "train_regularization_type": "l2",
63 | "train_regularization_scale": 3e-7,
64 | "train_optimizer_type": "adam",
65 | "train_optimizer_learning_rate": 0.001,
66 | "train_optimizer_warmup_enable": true,
67 | "train_optimizer_warmup_mode": "exponential_warmup",
68 | "train_optimizer_warmup_rate": 0.01,
69 | "train_optimizer_warmup_end_step": 1000,
70 | "train_optimizer_decay_enable": false,
71 | "train_optimizer_decay_mode": "exponential_decay",
72 | "train_optimizer_decay_rate": 0.95,
73 | "train_optimizer_decay_step": 1000,
74 | "train_optimizer_decay_start_step": 10000,
75 | "train_optimizer_momentum_beta": 0.9,
76 | "train_optimizer_rmsprop_beta": 0.999,
77 | "train_optimizer_rmsprop_epsilon": 1e-08,
78 | "train_optimizer_adadelta_rho": 0.95,
79 | "train_optimizer_adadelta_epsilon": 1e-08,
80 | "train_optimizer_adagrad_init_accumulator": 0.1,
81 | "train_optimizer_adam_beta_1": 0.8,
82 | "train_optimizer_adam_beta_2": 0.999,
83 | "train_optimizer_adam_epsilon": 1e-07,
84 | "model_type": "qanet",
85 | "model_scope": "mrc",
86 | "model_representation_word_embed_dim": 300,
87 | "model_representation_word_dropout": 0.1,
88 | "model_representation_word_embed_pretrained": true,
89 | "model_representation_word_feat_trainable": false,
90 | "model_representation_word_feat_enable": true,
91 | "model_representation_subword_embed_dim": 64,
92 | "model_representation_subword_unit_dim": 200,
93 | "model_representation_subword_window_size": [5],
94 | "model_representation_subword_hidden_activation": "relu",
95 | "model_representation_subword_dropout": 0.05,
96 | "model_representation_subword_pooling_type": "max",
97 | "model_representation_subword_feat_trainable": true,
98 | "model_representation_subword_feat_enable": false,
99 | "model_representation_char_embed_dim": 64,
100 | "model_representation_char_unit_dim": 200,
101 | "model_representation_char_window_size": [5],
102 | "model_representation_char_hidden_activation": "relu",
103 | "model_representation_char_dropout": 0.05,
104 | "model_representation_char_pooling_type": "max",
105 | "model_representation_char_feat_trainable": true,
106 | "model_representation_char_feat_enable": true,
107 | "model_representation_fusion_type": "highway",
108 | "model_representation_fusion_num_layer": 2,
109 | "model_representation_fusion_unit_dim": 128,
110 | "model_representation_fusion_hidden_activation": "relu",
111 | "model_representation_fusion_dropout": 0.1,
112 | "model_representation_fusion_trainable": true,
113 | "model_understanding_question_num_layer": 1,
114 | "model_understanding_question_num_conv": 4,
115 | "model_understanding_question_num_head": 8,
116 | "model_understanding_question_unit_dim": 128,
117 | "model_understanding_question_window_size": [7],
118 | "model_understanding_question_hidden_activation": "relu",
119 | "model_understanding_question_dropout": 0.1,
120 | "model_understanding_question_attention_dropout": 0.0,
121 | "model_understanding_question_layer_dropout": 0.1,
122 | "model_understanding_question_trainable": true,
123 | "model_understanding_context_num_layer": 1,
124 | "model_understanding_context_num_conv": 4,
125 | "model_understanding_context_num_head": 8,
126 | "model_understanding_context_unit_dim": 128,
127 | "model_understanding_context_window_size": [7],
128 | "model_understanding_context_hidden_activation": "relu",
129 | "model_understanding_context_dropout": 0.1,
130 | "model_understanding_context_attention_dropout": 0.0,
131 | "model_understanding_context_layer_dropout": 0.1,
132 | "model_understanding_context_trainable": true,
133 | "model_understanding_enable_sharing": true,
134 | "model_interaction_context2question_attention_dim": 128,
135 | "model_interaction_context2question_score_type": "trilinear",
136 | "model_interaction_context2question_dropout": 0.1,
137 | "model_interaction_context2question_attention_dropout": 0.0,
138 | "model_interaction_context2question_trainable": true,
139 | "model_interaction_context2question_enable": true,
140 | "model_interaction_question2context_attention_dim": 128,
141 | "model_interaction_question2context_score_type": "trilinear",
142 | "model_interaction_question2context_dropout": 0.1,
143 | "model_interaction_question2context_attention_dropout": 0.0,
144 | "model_interaction_question2context_trainable": true,
145 | "model_interaction_question2context_enable": true,
146 | "model_interaction_fusion_type": "concate",
147 | "model_interaction_fusion_num_layer": 1,
148 | "model_interaction_fusion_unit_dim": 128,
149 | "model_interaction_fusion_hidden_activation": "relu",
150 | "model_interaction_fusion_dropout": 0.1,
151 | "model_interaction_fusion_trainable": true,
152 | "model_interaction_fusion_combo_enable": true,
153 | "model_interaction_enable_sharing": true,
154 | "model_modeling_answer_num_layer": 7,
155 | "model_modeling_answer_num_conv": 2,
156 | "model_modeling_answer_num_head": 8,
157 | "model_modeling_answer_unit_dim": 128,
158 | "model_modeling_answer_window_size": [5],
159 | "model_modeling_answer_hidden_activation": "relu",
160 | "model_modeling_answer_dropout": 0.1,
161 | "model_modeling_answer_attention_dropout": 0.0,
162 | "model_modeling_answer_layer_dropout": 0.1,
163 | "model_modeling_answer_trainable": true,
164 | "model_modeling_enable_sharing": true,
165 | "model_output_answer_start_dropout": 0.1,
166 | "model_output_answer_start_trainable": true,
167 | "model_output_answer_end_dropout": 0.1,
168 | "model_output_answer_end_trainable": true,
169 | "device_num_gpus": 1,
170 | "device_default_gpu_id": 0,
171 | "device_log_device_placement": false,
172 | "device_allow_soft_placement": true,
173 | "device_allow_growth": false,
174 | "device_per_process_gpu_memory_fraction": 0.8
175 | }
--------------------------------------------------------------------------------
/reading_comprehension/squad/preprocess.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import codecs
3 | import json
4 | import os.path
5 | import string
6 | import re
7 | import nltk
8 | import spacy
9 |
10 | spacy_nlp = spacy.load('en')
11 |
12 | def add_arguments(parser):
13 | parser.add_argument("--format", help="format to generate", required=True)
14 | parser.add_argument("--input_file", help="path to input file", required=True)
15 | parser.add_argument("--output_file", help="path to output file", required=True)
16 |
17 | def nltk_tokenize(text, lower_case=False, remove_punc=False):
18 | def process_token(tokens):
19 | special = ("-", "£", "€", "¥", "¢", "₹", "\u2212", "\u2014", "\u2013",
20 | "/", "~", '"', "'", "\ud01C", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
21 | pattern = "([{}])".format("".join(special))
22 | processed_tokens = []
23 | for token in tokens:
24 | token = token.replace("''", '" ').replace("``", '" ')
25 | processed_tokens.extend(re.split(pattern, token))
26 |
27 | return processed_tokens
28 |
29 | def remove_punctuation(tokens):
30 | exclude = set(string.punctuation)
31 | return [token for token in tokens if token not in exclude]
32 |
33 | def fix_white_space(tokens):
34 | return [token for token in tokens if token and not token.isspace()]
35 |
36 | sents = nltk.sent_tokenize(text)
37 | norm_sents = []
38 | for sent in sents:
39 | words = nltk.word_tokenize(sent)
40 | words = process_token(words)
41 | if remove_punc:
42 | words = remove_punctuation(words)
43 |
44 | words = fix_white_space(words)
45 | norm_sents.append(' '.join(words))
46 |
47 | norm_text = ' '.join(norm_sents)
48 | if lower_case:
49 | norm_text = norm_text.lower()
50 |
51 | return norm_text
52 |
53 | def spacy_tokenize(text, lower_case=False, remove_punc=False):
54 | def process_token(tokens):
55 | special = ("-", "£", "€", "¥", "¢", "₹", "\u2212", "\u2014", "\u2013",
56 | "/", "~", '"', "'", "\ud01C", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
57 | pattern = "([{}])".format("".join(special))
58 | processed_tokens = []
59 | for token in tokens:
60 | token = token.replace("''", '" ').replace("``", '" ')
61 | processed_tokens.extend(re.split(pattern, token))
62 |
63 | return processed_tokens
64 |
65 | def remove_punctuation(tokens):
66 | exclude = set(string.punctuation)
67 | return [token for token in tokens if token not in exclude]
68 |
69 | def fix_white_space(tokens):
70 | return [token for token in tokens if token and not token.isspace()]
71 |
72 | word_docs = spacy_nlp(text)
73 | words = [word.text for word in word_docs]
74 | words = process_token(words)
75 | if remove_punc:
76 | words = remove_punctuation(words)
77 |
78 | words = fix_white_space(words)
79 |
80 | norm_text = ' '.join(words)
81 | if lower_case:
82 | norm_text = norm_text.lower()
83 |
84 | return norm_text
85 |
86 | def get_char_spans(raw_text, norm_text):
87 | pattern = "\"|``|''"
88 | spans = []
89 | idx = 0
90 | norm_tokens = norm_text.split(' ')
91 | for token in norm_tokens:
92 | if re.match(pattern, token):
93 | span = re.search(pattern, raw_text[idx:])
94 | idx += span.start()
95 | token_len = span.end() - span.start()
96 | else:
97 | idx = raw_text.find(token, idx)
98 | token_len = len(token)
99 |
100 | if idx < 0 or token is None or token_len == 0:
101 | raise ValueError("invalid text: {0} <--> {1}".format(raw_text, norm_text))
102 |
103 | spans.append((idx, idx + token_len))
104 | idx += token_len
105 |
106 | return spans
107 |
108 | def get_word_span(char_spans, answer_char_start, answer_char_end):
109 | answer_word_start = None
110 | answer_word_end = None
111 |     for word_idx, (char_start_idx, char_end_idx) in enumerate(char_spans):
112 |         if char_start_idx <= answer_char_start <= char_end_idx:
113 |             answer_word_start = word_idx
114 |         if char_start_idx <= answer_char_end <= char_end_idx:
115 |             answer_word_end = word_idx
116 |
117 | if answer_word_end is None and answer_word_start is not None:
118 | if answer_char_end > char_spans[-1][-1]:
119 | answer_word_end = len(char_spans) - 1
120 |
121 | if answer_word_end is None or answer_word_start is None or answer_word_end < answer_word_start:
122 | raise ValueError("invalid word span: ({0}, {1})".format(answer_word_start, answer_word_end))
123 |
124 | return answer_word_start, answer_word_end
125 |
126 | def preprocess(file_name):
127 | if not os.path.exists(file_name):
128 | raise FileNotFoundError("file not found")
129 |
130 | processed_data_list = []
131 | with open(file_name, "r") as file:
132 | json_content = json.load(file)
133 | for article in json_content["data"]:
134 | for paragraph in article["paragraphs"]:
135 | context = paragraph["context"].strip()
136 | norm_context = spacy_tokenize(context)
137 | char_spans = get_char_spans(context, norm_context)
138 | for qa in paragraph["qas"]:
139 | qa_id = qa["id"]
140 | question = qa["question"].strip()
141 | norm_question = spacy_tokenize(question)
142 |
143 | processed_data = {
144 | "id": qa_id,
145 | "question": norm_question,
146 | "context": norm_context,
147 | "answers": []
148 | }
149 |
150 | for answer in qa["answers"]:
151 | answer_text = answer["text"].strip()
152 | answer_char_start = answer["answer_start"]
153 | answer_char_end = answer_char_start + len(answer_text) - 1
154 |
155 | answer_word_start, answer_word_end = get_word_span(char_spans,
156 | answer_char_start, answer_char_end)
157 |
158 | answer_text = " ".join(norm_context.split(' ')[answer_word_start:answer_word_end+1])
159 |
160 | processed_data["answers"].append({
161 | "text": answer_text,
162 | "start": answer_word_start,
163 | "end": answer_word_end
164 | })
165 |
166 | processed_data_list.append(processed_data)
167 |
168 | return processed_data_list
169 |
170 | def output_to_json(data_list, file_name):
171 | with open(file_name, "w") as file:
172 | data_json = json.dumps(data_list, indent=4)
173 | file.write(data_json)
174 |
175 | def output_to_plain(data_list, file_name):
176 | with open(file_name, "wb") as file:
177 | for data in data_list:
178 | for answer in data["answers"]:
179 | data_plain = "{0}\t{1}\t{2}\t{3}\t{4}|{5}\r\n".format(data["id"], data["question"],
180 | data["context"].replace("\n", " "), answer["text"], answer["start"], answer["end"])
181 | file.write(data_plain.encode("utf-8"))
182 |
183 | def output_to_split(data_list, file_prefix):
184 | with open("{0}.question".format(file_prefix), "wb") as q_file, open("{0}.context".format(file_prefix), "wb") as c_file, open("{0}.answer_text".format(file_prefix), "wb") as at_file, open("{0}.answer_span".format(file_prefix), "wb") as as_file:
185 | for data in data_list:
186 | for answer in data["answers"]:
187 | q_data_plain = "{0}\r\n".format(data["question"])
188 | q_file.write(q_data_plain.encode("utf-8"))
189 | c_data_plain = "{0}\r\n".format(data["context"].replace("\n", " "))
190 | c_file.write(c_data_plain.encode("utf-8"))
191 | at_data_plain = "{0}\r\n".format(answer["text"])
192 | at_file.write(at_data_plain.encode("utf-8"))
193 | as_data_plain = "{0}|{1}\r\n".format(answer["start"], answer["end"])
194 | as_file.write(as_data_plain.encode("utf-8"))
195 |
196 | def main(args):
197 | processed_data = preprocess(args.input_file)
198 |     if args.format == 'json':
199 |         output_to_json(processed_data, args.output_file)
200 |     elif args.format == 'plain':
201 |         output_to_plain(processed_data, args.output_file)
202 |     elif args.format == 'split':
203 |         output_to_split(processed_data, args.output_file)
204 |
205 | if __name__ == "__main__":
206 | parser = argparse.ArgumentParser()
207 | add_arguments(parser)
208 | args = parser.parse_args()
209 | main(args)
--------------------------------------------------------------------------------
/reading_comprehension/config/config_mrc_template.bidaf.json:
--------------------------------------------------------------------------------
1 | {
2 | "data_train_mrc_file": "data/squad/train-v1.1/train-v1.1.squad.json",
3 | "data_train_mrc_file_type": "json",
4 | "data_eval_mrc_file": "data/squad/dev-v1.1/dev-v1.1.squad.json",
5 | "data_eval_mrc_file_type": "json",
6 | "data_embedding_file": "data/squad/resource/squad.all.word.embed",
7 | "data_full_embedding_file": "data/glove/glove.6B.100d.txt",
8 | "data_tfrecord_dir": "data/squad/tfrecord",
9 | "data_max_question_length": 40,
10 | "data_max_context_length": 500,
11 | "data_max_answer_length": 30,
12 | "data_max_subword_length": 16,
13 | "data_max_char_length": 16,
14 | "data_word_vocab_file": "data/squad/resource/squad.all.word.vocab",
15 | "data_word_vocab_size": 72646,
16 | "data_word_vocab_threshold": 0,
17 |     "data_word_unk": "<unk>",
18 |     "data_word_pad": "<pad>",
19 |     "data_word_sos": "<s>",
20 |     "data_word_eos": "</s>",
21 | "data_word_placeholder_enable": false,
22 | "data_subword_vocab_file": "data/squad/resource/squad.all.subword.vocab",
23 | "data_subword_vocab_size": 50554,
24 | "data_subword_vocab_threshold": 0,
25 | "data_subword_unk": "***",
26 | "data_subword_pad": "###",
27 | "data_subword_size": 3,
28 | "data_char_vocab_file": "data/squad/resource/squad.all.char.vocab",
29 | "data_char_vocab_size": 1610,
30 | "data_char_vocab_threshold": 0,
31 | "data_char_unk": "*",
32 | "data_char_pad": "#",
33 | "data_answer_type": "span",
34 | "data_expand_multiple_answer": false,
35 | "data_enable_validation": true,
36 | "data_pipeline_mode": "tfrecord",
37 | "data_num_parallel": 4,
38 | "data_log_output_dir": "output/bidaf/log",
39 | "data_result_output_dir": "output/bidaf/result",
40 | "train_random_seed": 100,
41 | "train_enable_shuffle": true,
42 | "train_shuffle_buffer_size": 30000,
43 | "train_batch_size": 60,
44 | "train_eval_batch_size": 100,
45 | "train_eval_metric": ["exact", "f1"],
46 | "train_eval_detail_type": "simplified",
47 | "train_decoding_sample_size": 3,
48 | "train_num_epoch": 3,
49 | "train_ckpt_output_dir": "output/bidaf/checkpoint",
50 | "train_summary_output_dir": "output/bidaf/summary",
51 | "train_step_per_stat": 10,
52 | "train_step_per_ckpt": 1000,
53 | "train_step_per_eval": 1000,
54 | "train_clip_norm": 5.0,
55 | "train_label_smoothing": 0.0,
56 | "train_enable_debugging": false,
57 | "train_ema_enable": true,
58 | "train_ema_decay_rate": 0.999,
59 | "train_ema_enable_debias": false,
60 | "train_ema_enable_dynamic_decay": false,
61 | "train_regularization_enable": false,
62 | "train_regularization_type": "l2",
63 | "train_regularization_scale": 3e-7,
64 | "train_optimizer_type": "adam",
65 | "train_optimizer_learning_rate": 0.001,
66 | "train_optimizer_warmup_enable": false,
67 | "train_optimizer_warmup_mode": "exponential_warmup",
68 | "train_optimizer_warmup_rate": 0.01,
69 | "train_optimizer_warmup_end_step": 1000,
70 | "train_optimizer_decay_enable": false,
71 | "train_optimizer_decay_mode": "exponential_decay",
72 | "train_optimizer_decay_rate": 0.95,
73 | "train_optimizer_decay_step": 1000,
74 | "train_optimizer_decay_start_step": 10000,
75 | "train_optimizer_momentum_beta": 0.9,
76 | "train_optimizer_rmsprop_beta": 0.999,
77 | "train_optimizer_rmsprop_epsilon": 1e-08,
78 | "train_optimizer_adadelta_rho": 0.95,
79 | "train_optimizer_adadelta_epsilon": 1e-08,
80 | "train_optimizer_adagrad_init_accumulator": 0.1,
81 | "train_optimizer_adam_beta_1": 0.9,
82 | "train_optimizer_adam_beta_2": 0.999,
83 | "train_optimizer_adam_epsilon": 1e-08,
84 | "model_type": "bidaf",
85 | "model_scope": "mrc",
86 | "model_representation_word_embed_dim": 100,
87 | "model_representation_word_embed_pretrained": true,
88 | "model_representation_word_feat_trainable": false,
89 | "model_representation_word_feat_enable": true,
90 | "model_representation_subword_embed_dim": 8,
91 | "model_representation_subword_unit_dim": 100,
92 | "model_representation_subword_window_size": [5],
93 | "model_representation_subword_hidden_activation": "relu",
94 | "model_representation_subword_dropout": 0.2,
95 | "model_representation_subword_pooling_type": "max",
96 | "model_representation_subword_feat_trainable": true,
97 | "model_representation_subword_feat_enable": false,
98 | "model_representation_char_embed_dim": 8,
99 | "model_representation_char_unit_dim": 100,
100 | "model_representation_char_window_size": [5],
101 | "model_representation_char_hidden_activation": "relu",
102 | "model_representation_char_dropout": 0.2,
103 | "model_representation_char_pooling_type": "max",
104 | "model_representation_char_feat_trainable": true,
105 | "model_representation_char_feat_enable": true,
106 | "model_representation_fusion_type": "highway",
107 | "model_representation_fusion_num_layer": 2,
108 | "model_representation_fusion_unit_dim": 400,
109 | "model_representation_fusion_hidden_activation": "relu",
110 | "model_representation_fusion_dropout": 0.2,
111 | "model_representation_fusion_trainable": true,
112 | "model_understanding_question_num_layer": 1,
113 | "model_understanding_question_unit_dim": 100,
114 | "model_understanding_question_cell_type": "lstm",
115 | "model_understanding_question_hidden_activation": "tanh",
116 | "model_understanding_question_dropout": 0.2,
117 | "model_understanding_question_forget_bias": 1.0,
118 | "model_understanding_question_residual_connect": false,
119 | "model_understanding_question_trainable": true,
120 | "model_understanding_context_num_layer": 1,
121 | "model_understanding_context_unit_dim": 100,
122 | "model_understanding_context_cell_type": "lstm",
123 | "model_understanding_context_hidden_activation": "tanh",
124 | "model_understanding_context_dropout": 0.2,
125 | "model_understanding_context_forget_bias": 1.0,
126 | "model_understanding_context_residual_connect": false,
127 | "model_understanding_context_trainable": true,
128 | "model_understanding_enable_sharing": true,
129 | "model_interaction_context2question_attention_dim": 200,
130 | "model_interaction_context2question_score_type": "trilinear",
131 | "model_interaction_context2question_dropout": 0.2,
132 | "model_interaction_context2question_attention_dropout": 0.0,
133 | "model_interaction_context2question_trainable": true,
134 | "model_interaction_context2question_enable": true,
135 | "model_interaction_question2context_attention_dim": 200,
136 | "model_interaction_question2context_score_type": "trilinear",
137 | "model_interaction_question2context_dropout": 0.2,
138 | "model_interaction_question2context_attention_dropout": 0.0,
139 | "model_interaction_question2context_trainable": true,
140 | "model_interaction_question2context_enable": true,
141 | "model_interaction_fusion_type": "concate",
142 | "model_interaction_fusion_num_layer": 1,
143 | "model_interaction_fusion_unit_dim": 800,
144 | "model_interaction_fusion_hidden_activation": "relu",
145 | "model_interaction_fusion_dropout": 0.2,
146 | "model_interaction_fusion_trainable": true,
147 | "model_interaction_fusion_combo_enable": true,
148 | "model_interaction_enable_sharing": true,
149 | "model_modeling_answer_num_layer": 1,
150 | "model_modeling_answer_unit_dim": 100,
151 | "model_modeling_answer_cell_type": "lstm",
152 | "model_modeling_answer_hidden_activation": "tanh",
153 | "model_modeling_answer_dropout": 0.2,
154 | "model_modeling_answer_attention_dropout": 0.0,
155 | "model_modeling_answer_forget_bias": 1.0,
156 | "model_modeling_answer_residual_connect": false,
157 | "model_modeling_answer_attention_dim": 200,
158 | "model_modeling_answer_score_type": "trilinear",
159 | "model_modeling_answer_attention_enable": false,
160 | "model_modeling_answer_trainable": true,
161 | "model_modeling_fusion_type": "concate",
162 | "model_modeling_fusion_num_layer": 1,
163 | "model_modeling_fusion_unit_dim": 1000,
164 | "model_modeling_fusion_hidden_activation": "relu",
165 | "model_modeling_fusion_dropout": 0.2,
166 | "model_modeling_fusion_trainable": true,
167 | "model_output_answer_start_num_layer": 1,
168 | "model_output_answer_start_unit_dim": 100,
169 | "model_output_answer_start_cell_type": "lstm",
170 | "model_output_answer_start_hidden_activation": "tanh",
171 | "model_output_answer_start_dropout": 0.2,
172 | "model_output_answer_start_forget_bias": 1.0,
173 | "model_output_answer_start_residual_connect": false,
174 | "model_output_answer_start_trainable": true,
175 | "model_output_answer_end_num_layer": 1,
176 | "model_output_answer_end_unit_dim": 100,
177 | "model_output_answer_end_cell_type": "lstm",
178 | "model_output_answer_end_hidden_activation": "tanh",
179 | "model_output_answer_end_dropout": 0.2,
180 | "model_output_answer_end_forget_bias": 1.0,
181 | "model_output_answer_end_residual_connect": false,
182 | "model_output_answer_end_trainable": true,
183 | "device_num_gpus": 1,
184 | "device_default_gpu_id": 0,
185 | "device_log_device_placement": false,
186 | "device_allow_soft_placement": true,
187 | "device_allow_growth": false,
188 | "device_per_process_gpu_memory_fraction": 0.8
189 | }
--------------------------------------------------------------------------------
/reading_comprehension/layer/highway.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from util.default_util import *
5 | from util.reading_comprehension_util import *
6 |
7 | from layer.basic import *
8 |
9 | __all__ = ["Highway", "ConvHighway", "StackedHighway", "StackedConvHighway"]
10 |
11 | class Highway(object):
12 | """highway layer"""
13 | def __init__(self,
14 | unit_dim,
15 | activation,
16 | dropout,
17 | num_gpus=1,
18 | default_gpu_id=0,
19 | regularizer=None,
20 | random_seed=0,
21 | trainable=True,
22 | scope="highway"):
23 | """initialize highway layer"""
24 | self.unit_dim = unit_dim
25 | self.activation = activation
26 | self.dropout = dropout
27 | self.regularizer = regularizer
28 | self.random_seed = random_seed
29 | self.trainable = trainable
30 | self.scope = scope
31 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
32 |
33 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
34 | weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed)
35 | bias_initializer = create_variable_initializer("zero")
36 | transform_activation = create_activation_function(self.activation)
37 | gate_activation = create_activation_function("sigmoid")
38 | self.transform_layer = tf.layers.Dense(units=self.unit_dim, activation=transform_activation, use_bias=True,
39 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer,
40 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable)
41 | self.gate_layer = tf.layers.Dense(units=self.unit_dim, activation=gate_activation, use_bias=True,
42 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer,
43 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable)
44 |
45 | self.dropout_layer = Dropout(rate=self.dropout, num_gpus=num_gpus,
46 | default_gpu_id=default_gpu_id, random_seed=self.random_seed)
47 |
48 | def __call__(self,
49 | input_data,
50 | input_mask):
51 | """call highway layer"""
52 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
53 | transform, _ = self.dropout_layer(self.transform_layer(input_data), input_mask)
54 | gate = self.gate_layer(input_data)
55 | output_highway = transform * gate + input_data * (1 - gate)
56 | output_mask = input_mask
57 |
58 | return output_highway, output_mask
59 |
60 | class ConvHighway(object):
61 | """convolutional highway layer"""
62 | def __init__(self,
63 | num_filter,
64 | window_size,
65 | activation,
66 | dropout,
67 | num_gpus=1,
68 | default_gpu_id=0,
69 | regularizer=None,
70 | random_seed=0,
71 | trainable=True,
72 | scope="conv_highway"):
73 | """initialize convolutional highway layer"""
74 | self.num_filter = num_filter
75 | self.window_size = window_size
76 | self.activation = activation
77 | self.dropout = dropout
78 | self.regularizer = regularizer
79 | self.random_seed = random_seed
80 | self.trainable = trainable
81 | self.scope = scope
82 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
83 |
84 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
85 |             weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed)
86 | bias_initializer = create_variable_initializer("zero")
87 | transform_activation = create_activation_function(self.activation)
88 | gate_activation = create_activation_function("sigmoid")
89 |
90 | self.transform_layer = tf.layers.Conv1D(filters=self.num_filter, kernel_size=window_size,
91 | strides=1, padding="SAME", activation=transform_activation, use_bias=True,
92 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer,
93 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=trainable)
94 | self.gate_layer = tf.layers.Conv1D(filters=self.num_filter, kernel_size=window_size,
95 | strides=1, padding="SAME", activation=gate_activation, use_bias=True,
96 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer,
97 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=trainable)
98 |
99 | self.dropout_layer = Dropout(rate=self.dropout, num_gpus=num_gpus,
100 | default_gpu_id=default_gpu_id, random_seed=self.random_seed)
101 |
102 | def __call__(self,
103 | input_data,
104 | input_mask):
105 | """call convolutional highway layer"""
106 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
107 | transform, _ = self.dropout_layer(self.transform_layer(input_data), input_mask)
108 | gate = self.gate_layer(input_data)
109 | output_highway = transform * gate + input_data * (1 - gate)
110 | output_mask = input_mask
111 |
112 | return output_highway, output_mask
113 |
114 | class StackedHighway(object):
115 | """stacked highway layer"""
116 | def __init__(self,
117 | num_layer,
118 | unit_dim,
119 | activation,
120 | dropout,
121 | num_gpus=1,
122 | default_gpu_id=0,
123 | regularizer=None,
124 | random_seed=0,
125 | trainable=True,
126 | scope="stacked_highway"):
127 | """initialize stacked highway layer"""
128 | self.num_layer = num_layer
129 | self.unit_dim = unit_dim
130 | self.activation = activation
131 | self.dropout = dropout
132 | self.num_gpus = num_gpus
133 | self.default_gpu_id = default_gpu_id
134 | self.regularizer = regularizer
135 | self.random_seed = random_seed
136 | self.trainable = trainable
137 | self.scope = scope
138 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
139 |
140 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
141 | self.highway_layer_list = []
142 | for i in range(self.num_layer):
143 | layer_scope = "layer_{0}".format(i)
144 |                 sublayer_dropout = self.dropout[i] if self.dropout is not None else 0.0
145 | highway_layer = Highway(unit_dim=self.unit_dim, activation=self.activation,
146 | dropout=sublayer_dropout, num_gpus=self.num_gpus, default_gpu_id=self.default_gpu_id,
147 | regularizer=self.regularizer, random_seed=self.random_seed, trainable=self.trainable, scope=layer_scope)
148 | self.highway_layer_list.append(highway_layer)
149 |
150 | def __call__(self,
151 | input_data,
152 | input_mask):
153 | """call stacked highway layer"""
154 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
155 | input_highway = input_data
156 | input_highway_mask = input_mask
157 |
158 | for highway_layer in self.highway_layer_list:
159 | input_highway, input_highway_mask = highway_layer(input_highway, input_highway_mask)
160 |
161 | output_highway = input_highway
162 | output_mask = input_highway_mask
163 |
164 | return output_highway, output_mask
165 |
166 | class StackedConvHighway(object):
167 | """stacked convolution highway layer"""
168 | def __init__(self,
169 | num_layer,
170 | num_filter,
171 | window_size,
172 | activation,
173 | dropout,
174 | num_gpus=1,
175 | default_gpu_id=0,
176 | regularizer=None,
177 | random_seed=0,
178 | trainable=True,
179 | scope="stacked_conv_highway"):
180 | """initialize stacked convolution highway layer"""
181 | self.num_layer = num_layer
182 | self.num_filter = num_filter
183 |         self.activation = activation
184 | self.window_size = window_size
185 | self.dropout = dropout
186 | self.num_gpus = num_gpus
187 | self.default_gpu_id = default_gpu_id
188 | self.regularizer = regularizer
189 | self.random_seed = random_seed
190 | self.trainable = trainable
191 | self.scope = scope
192 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
193 |
194 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
195 | self.highway_layer_list = []
196 | for i in range(self.num_layer):
197 | layer_scope = "layer_{0}".format(i)
198 |                 sublayer_dropout = self.dropout[i] if self.dropout is not None else 0.0
199 | highway_layer = ConvHighway(num_filter=self.num_filter, window_size=self.window_size,
200 | activation=self.activation, dropout=sublayer_dropout, num_gpus=self.num_gpus, default_gpu_id=self.default_gpu_id,
201 | regularizer=self.regularizer, random_seed=self.random_seed, trainable=self.trainable, scope=layer_scope)
202 | self.highway_layer_list.append(highway_layer)
203 |
204 | def __call__(self,
205 | input_data,
206 | input_mask):
207 | """call stacked convolution highway layer"""
208 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
209 | input_highway = input_data
210 | input_highway_mask = input_mask
211 |
212 | for highway_layer in self.highway_layer_list:
213 | input_highway, input_highway_mask = highway_layer(input_highway, input_highway_mask)
214 |
215 | output_highway = input_highway
216 | output_mask = input_highway_mask
217 |
218 | return output_highway, output_mask
219 |
--------------------------------------------------------------------------------
/reading_comprehension/squad/evaluate-v2.py:
--------------------------------------------------------------------------------
1 | """Official evaluation script for SQuAD version 2.0.
2 |
3 | In addition to basic functionality, we also compute additional statistics and
4 | plot precision-recall curves if an additional na_prob.json file is provided.
5 | This file is expected to map question ID's to the model's predicted probability
6 | that a question is unanswerable.
7 | """
8 | import argparse
9 | import collections
10 | import json
11 | import numpy as np
12 | import os
13 | import re
14 | import string
15 | import sys
16 |
17 | OPTS = None
18 |
19 | def parse_args():
20 | parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
21 | parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
22 | parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
23 | parser.add_argument('--out-file', '-o', metavar='eval.json',
24 | help='Write accuracy metrics to file (default is stdout).')
25 | parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
26 | help='Model estimates of probability of no answer.')
27 | parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
28 | help='Predict "" if no-answer probability exceeds this (default = 1.0).')
29 | parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
30 | help='Save precision-recall curves to directory.')
31 | parser.add_argument('--verbose', '-v', action='store_true')
32 | if len(sys.argv) == 1:
33 | parser.print_help()
34 | sys.exit(1)
35 | return parser.parse_args()
36 |
37 | def make_qid_to_has_ans(dataset):
38 | qid_to_has_ans = {}
39 | for article in dataset:
40 | for p in article['paragraphs']:
41 | for qa in p['qas']:
42 | qid_to_has_ans[qa['id']] = bool(qa['answers'])
43 | return qid_to_has_ans
44 |
45 | def normalize_answer(s):
46 | """Lower text and remove punctuation, articles and extra whitespace."""
47 | def remove_articles(text):
48 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
49 | return re.sub(regex, ' ', text)
50 | def white_space_fix(text):
51 | return ' '.join(text.split())
52 | def remove_punc(text):
53 | exclude = set(string.punctuation)
54 | return ''.join(ch for ch in text if ch not in exclude)
55 | def lower(text):
56 | return text.lower()
57 | return white_space_fix(remove_articles(remove_punc(lower(s))))
58 |
59 | def get_tokens(s):
60 | if not s: return []
61 | return normalize_answer(s).split()
62 |
63 | def compute_exact(a_gold, a_pred):
64 | return int(normalize_answer(a_gold) == normalize_answer(a_pred))
65 |
66 | def compute_f1(a_gold, a_pred):
67 | gold_toks = get_tokens(a_gold)
68 | pred_toks = get_tokens(a_pred)
69 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
70 | num_same = sum(common.values())
71 | if len(gold_toks) == 0 or len(pred_toks) == 0:
72 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
73 | return int(gold_toks == pred_toks)
74 | if num_same == 0:
75 | return 0
76 | precision = 1.0 * num_same / len(pred_toks)
77 | recall = 1.0 * num_same / len(gold_toks)
78 | f1 = (2 * precision * recall) / (precision + recall)
79 | return f1
80 |
81 | def get_raw_scores(dataset, preds):
82 | exact_scores = {}
83 | f1_scores = {}
84 | for article in dataset:
85 | for p in article['paragraphs']:
86 | for qa in p['qas']:
87 | qid = qa['id']
88 | gold_answers = [a['text'] for a in qa['answers']
89 | if normalize_answer(a['text'])]
90 | if not gold_answers:
91 | # For unanswerable questions, only correct answer is empty string
92 | gold_answers = ['']
93 | if qid not in preds:
94 | print('Missing prediction for %s' % qid)
95 | continue
96 | a_pred = preds[qid]
97 | # Take max over all gold answers
98 | exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
99 | f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
100 | return exact_scores, f1_scores
101 |
102 | def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
103 | new_scores = {}
104 | for qid, s in scores.items():
105 | pred_na = na_probs[qid] > na_prob_thresh
106 | if pred_na:
107 | new_scores[qid] = float(not qid_to_has_ans[qid])
108 | else:
109 | new_scores[qid] = s
110 | return new_scores
111 |
112 | def make_eval_dict(exact_scores, f1_scores, qid_list=None):
113 | if not qid_list:
114 | total = len(exact_scores)
115 | return collections.OrderedDict([
116 | ('exact', 100.0 * sum(exact_scores.values()) / total),
117 | ('f1', 100.0 * sum(f1_scores.values()) / total),
118 | ('total', total),
119 | ])
120 | else:
121 | total = len(qid_list)
122 | return collections.OrderedDict([
123 | ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
124 | ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
125 | ('total', total),
126 | ])
127 |
128 | def merge_eval(main_eval, new_eval, prefix):
129 | for k in new_eval:
130 | main_eval['%s_%s' % (prefix, k)] = new_eval[k]
131 |
132 | def plot_pr_curve(precisions, recalls, out_image, title):
133 | plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
134 | plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
135 | plt.xlabel('Recall')
136 | plt.ylabel('Precision')
137 | plt.xlim([0.0, 1.05])
138 | plt.ylim([0.0, 1.05])
139 | plt.title(title)
140 | plt.savefig(out_image)
141 | plt.clf()
142 |
143 | def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
144 | out_image=None, title=None):
145 | qid_list = sorted(na_probs, key=lambda k: na_probs[k])
146 | true_pos = 0.0
147 | cur_p = 1.0
148 | cur_r = 0.0
149 | precisions = [1.0]
150 | recalls = [0.0]
151 | avg_prec = 0.0
152 | for i, qid in enumerate(qid_list):
153 | if qid_to_has_ans[qid]:
154 | true_pos += scores[qid]
155 | cur_p = true_pos / float(i+1)
156 | cur_r = true_pos / float(num_true_pos)
157 | if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
158 | # i.e., if we can put a threshold after this point
159 | avg_prec += cur_p * (cur_r - recalls[-1])
160 | precisions.append(cur_p)
161 | recalls.append(cur_r)
162 | if out_image:
163 | plot_pr_curve(precisions, recalls, out_image, title)
164 | return {'ap': 100.0 * avg_prec}
165 |
166 | def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
167 | qid_to_has_ans, out_image_dir):
168 | if out_image_dir and not os.path.exists(out_image_dir):
169 | os.makedirs(out_image_dir)
170 | num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
171 | if num_true_pos == 0:
172 | return
173 | pr_exact = make_precision_recall_eval(
174 | exact_raw, na_probs, num_true_pos, qid_to_has_ans,
175 | out_image=os.path.join(out_image_dir, 'pr_exact.png'),
176 | title='Precision-Recall curve for Exact Match score')
177 | pr_f1 = make_precision_recall_eval(
178 | f1_raw, na_probs, num_true_pos, qid_to_has_ans,
179 | out_image=os.path.join(out_image_dir, 'pr_f1.png'),
180 | title='Precision-Recall curve for F1 score')
181 | oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
182 | pr_oracle = make_precision_recall_eval(
183 | oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
184 | out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
185 | title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
186 | merge_eval(main_eval, pr_exact, 'pr_exact')
187 | merge_eval(main_eval, pr_f1, 'pr_f1')
188 | merge_eval(main_eval, pr_oracle, 'pr_oracle')
189 |
190 | def histogram_na_prob(na_probs, qid_list, image_dir, name):
191 | if not qid_list:
192 | return
193 | x = [na_probs[k] for k in qid_list]
194 | weights = np.ones_like(x) / float(len(x))
195 | plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
196 | plt.xlabel('Model probability of no-answer')
197 | plt.ylabel('Proportion of dataset')
198 | plt.title('Histogram of no-answer probability: %s' % name)
199 | plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
200 | plt.clf()
201 |
202 | def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
203 | num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
204 | cur_score = num_no_ans
205 | best_score = cur_score
206 | best_thresh = 0.0
207 | qid_list = sorted(na_probs, key=lambda k: na_probs[k])
208 | for i, qid in enumerate(qid_list):
209 | if qid not in scores: continue
210 | if qid_to_has_ans[qid]:
211 | diff = scores[qid]
212 | else:
213 | if preds[qid]:
214 | diff = -1
215 | else:
216 | diff = 0
217 | cur_score += diff
218 | if cur_score > best_score:
219 | best_score = cur_score
220 | best_thresh = na_probs[qid]
221 | return 100.0 * best_score / len(scores), best_thresh
222 |
223 | def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
224 | best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
225 | best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
226 | main_eval['best_exact'] = best_exact
227 | main_eval['best_exact_thresh'] = exact_thresh
228 | main_eval['best_f1'] = best_f1
229 | main_eval['best_f1_thresh'] = f1_thresh
230 |
231 | def main():
232 | with open(OPTS.data_file) as f:
233 | dataset_json = json.load(f)
234 | dataset = dataset_json['data']
235 | with open(OPTS.pred_file) as f:
236 | preds = json.load(f)
237 | if OPTS.na_prob_file:
238 | with open(OPTS.na_prob_file) as f:
239 | na_probs = json.load(f)
240 | else:
241 | na_probs = {k: 0.0 for k in preds}
242 | qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False
243 | has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
244 | no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
245 | exact_raw, f1_raw = get_raw_scores(dataset, preds)
246 | exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
247 | OPTS.na_prob_thresh)
248 | f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
249 | OPTS.na_prob_thresh)
250 | out_eval = make_eval_dict(exact_thresh, f1_thresh)
251 | if has_ans_qids:
252 | has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
253 | merge_eval(out_eval, has_ans_eval, 'HasAns')
254 | if no_ans_qids:
255 | no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
256 | merge_eval(out_eval, no_ans_eval, 'NoAns')
257 | if OPTS.na_prob_file:
258 | find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
259 | if OPTS.na_prob_file and OPTS.out_image_dir:
260 | run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
261 | qid_to_has_ans, OPTS.out_image_dir)
262 | histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
263 | histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
264 | if OPTS.out_file:
265 | with open(OPTS.out_file, 'w') as f:
266 | json.dump(out_eval, f)
267 | else:
268 | print(json.dumps(out_eval, indent=2))
269 |
270 | if __name__ == '__main__':
271 | OPTS = parse_args()
272 | if OPTS.out_image_dir:
273 | import matplotlib
274 | matplotlib.use('Agg')
275 | import matplotlib.pyplot as plt
276 | main()
277 |
278 |
--------------------------------------------------------------------------------
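A minimal, self-contained sketch of the no-answer threshold sweep that find_best_thresh in evaluate-v2.py performs; all qids, scores, and probabilities below are invented for illustration. Sorting by no-answer probability and sweeping left to right tries every distinct threshold: each step converts one more question from "predict no-answer" to "trust the model's span", gaining that question's score if it has an answer and losing a point if the model answered an unanswerable one.

# Toy re-implementation of the sweep in find_best_thresh (illustrative data).
na_probs = {'q1': 0.1, 'q2': 0.4, 'q3': 0.7, 'q4': 0.9}
qid_to_has_ans = {'q1': True, 'q2': True, 'q3': False, 'q4': False}
scores = {'q1': 1.0, 'q2': 0.0, 'q3': 1.0, 'q4': 1.0}  # per-qid exact match
preds = {'q1': 'a', 'q2': 'b', 'q3': '', 'q4': ''}     # '' = predicted no-answer

# Start by predicting no-answer everywhere: score = number of no-answer qids.
cur_score = best_score = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
best_thresh = 0.0
for qid in sorted(na_probs, key=lambda k: na_probs[k]):
    if qid_to_has_ans[qid]:
        diff = scores[qid]              # gain this question's answer score
    else:
        diff = -1 if preds[qid] else 0  # lose credit if we answered anyway
    cur_score += diff
    if cur_score > best_score:
        best_score, best_thresh = cur_score, na_probs[qid]

print(100.0 * best_score / len(scores), best_thresh)  # 75.0 0.1

--------------------------------------------------------------------------------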
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/reading_comprehension/external/rouge.py:
--------------------------------------------------------------------------------
1 | """ROUGE metric implementation.
2 |
3 | Copied from tf_seq2seq/seq2seq/metrics/rouge.py.
4 | This is a modified and slightly extended version of
5 | https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py.
6 | """
7 |
8 | from __future__ import absolute_import
9 | from __future__ import division
10 | from __future__ import print_function
11 | from __future__ import unicode_literals
12 |
13 | import itertools
14 | import numpy as np
15 |
16 | #pylint: disable=C0103
17 |
18 | __all__ = ["rouge"]
19 |
20 | def _get_ngrams(n, text):
21 | """Calcualtes n-grams.
22 |
23 | Args:
24 | n: which n-grams to calculate
25 | text: An array of tokens
26 |
27 | Returns:
28 | A set of n-grams
29 | """
30 | ngram_set = set()
31 | text_length = len(text)
32 | max_index_ngram_start = text_length - n
33 | for i in range(max_index_ngram_start + 1):
34 | ngram_set.add(tuple(text[i:i + n]))
35 | return ngram_set
36 |
37 |
38 | def _split_into_words(sentences):
39 | """Splits multiple sentences into words and flattens the result"""
40 | return list(itertools.chain(*[_.split(" ") for _ in sentences]))
41 |
42 |
43 | def _get_word_ngrams(n, sentences):
44 | """Calculates word n-grams for multiple sentences.
45 | """
46 | assert len(sentences) > 0
47 | assert n > 0
48 |
49 | words = _split_into_words(sentences)
50 | return _get_ngrams(n, words)
51 |
52 |
53 | def _len_lcs(x, y):
54 | """
55 | Returns the length of the Longest Common Subsequence between sequences x
56 | and y.
57 | Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
58 |
59 | Args:
60 | x: sequence of words
61 | y: sequence of words
62 |
63 | Returns
64 | integer: Length of LCS between x and y
65 | """
66 | table = _lcs(x, y)
67 | n, m = len(x), len(y)
68 | return table[n, m]
69 |
70 |
71 | def _lcs(x, y):
72 | """
73 |   Computes the longest common subsequence (LCS) length table for two
74 |   sequences. The implementation below uses dynamic programming and runs
75 |   in O(nm) time where n = len(x) and m = len(y).
76 | Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
77 |
78 | Args:
79 | x: collection of words
80 | y: collection of words
81 |
82 | Returns:
83 | Table of dictionary of coord and len lcs
84 | """
85 | n, m = len(x), len(y)
86 | table = dict()
87 | for i in range(n + 1):
88 | for j in range(m + 1):
89 | if i == 0 or j == 0:
90 | table[i, j] = 0
91 | elif x[i - 1] == y[j - 1]:
92 | table[i, j] = table[i - 1, j - 1] + 1
93 | else:
94 | table[i, j] = max(table[i - 1, j], table[i, j - 1])
95 | return table
96 |
97 |
98 | def _recon_lcs(x, y):
99 | """
100 |   Returns the Longest Common Subsequence between x and y.
101 | Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
102 |
103 | Args:
104 | x: sequence of words
105 | y: sequence of words
106 |
107 | Returns:
108 | sequence: LCS of x and y
109 | """
110 | i, j = len(x), len(y)
111 | table = _lcs(x, y)
112 |
113 | def _recon(i, j):
114 | """private recon calculation"""
115 | if i == 0 or j == 0:
116 | return []
117 | elif x[i - 1] == y[j - 1]:
118 | return _recon(i - 1, j - 1) + [(x[i - 1], i)]
119 | elif table[i - 1, j] > table[i, j - 1]:
120 | return _recon(i - 1, j)
121 | else:
122 | return _recon(i, j - 1)
123 |
124 | recon_tuple = tuple(map(lambda x: x[0], _recon(i, j)))
125 | return recon_tuple
126 |
127 |
128 | def rouge_n(evaluated_sentences, reference_sentences, n=2):
129 | """
130 | Computes ROUGE-N of two text collections of sentences.
131 |   Source: http://research.microsoft.com/en-us/um/people/cyl/download/
132 | papers/rouge-working-note-v1.3.1.pdf
133 |
134 | Args:
135 | evaluated_sentences: The sentences that have been picked by the summarizer
136 |     reference_sentences: The sentences from the reference set
137 | n: Size of ngram. Defaults to 2.
138 |
139 | Returns:
140 | A tuple (f1, precision, recall) for ROUGE-N
141 |
142 | Raises:
143 | ValueError: raises exception if a param has len <= 0
144 | """
145 | if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
146 | raise ValueError("Collections must contain at least 1 sentence.")
147 |
148 | evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
149 | reference_ngrams = _get_word_ngrams(n, reference_sentences)
150 | reference_count = len(reference_ngrams)
151 | evaluated_count = len(evaluated_ngrams)
152 |
153 | # Gets the overlapping ngrams between evaluated and reference
154 | overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
155 | overlapping_count = len(overlapping_ngrams)
156 |
157 | # Handle edge case. This isn't mathematically correct, but it's good enough
158 | if evaluated_count == 0:
159 | precision = 0.0
160 | else:
161 | precision = overlapping_count / evaluated_count
162 |
163 | if reference_count == 0:
164 | recall = 0.0
165 | else:
166 | recall = overlapping_count / reference_count
167 |
168 | f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
169 |
170 | # return overlapping_count / reference_count
171 | return f1_score, precision, recall
172 |
173 |
174 | def _f_p_r_lcs(llcs, m, n):
175 | """
176 | Computes the LCS-based F-measure score
177 | Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
178 | rouge-working-note-v1.3.1.pdf
179 |
180 | Args:
181 | llcs: Length of LCS
182 | m: number of words in reference summary
183 | n: number of words in candidate summary
184 |
185 | Returns:
186 | Float. LCS-based F-measure score
187 | """
188 | r_lcs = llcs / m
189 | p_lcs = llcs / n
190 | beta = p_lcs / (r_lcs + 1e-12)
191 | num = (1 + (beta**2)) * r_lcs * p_lcs
192 | denom = r_lcs + ((beta**2) * p_lcs)
193 | f_lcs = num / (denom + 1e-12)
194 | return f_lcs, p_lcs, r_lcs
195 |
196 |
197 | def rouge_l_sentence_level(evaluated_sentences, reference_sentences):
198 | """
199 | Computes ROUGE-L (sentence level) of two text collections of sentences.
200 | http://research.microsoft.com/en-us/um/people/cyl/download/papers/
201 | rouge-working-note-v1.3.1.pdf
202 |
203 | Calculated according to:
204 | R_lcs = LCS(X,Y)/m
205 | P_lcs = LCS(X,Y)/n
206 | F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
207 |
208 | where:
209 | X = reference summary
210 | Y = Candidate summary
211 | m = length of reference summary
212 | n = length of candidate summary
213 |
214 | Args:
215 | evaluated_sentences: The sentences that have been picked by the summarizer
216 |     reference_sentences: The sentences from the reference set
217 |
218 | Returns:
219 | A float: F_lcs
220 |
221 | Raises:
222 | ValueError: raises exception if a param has len <= 0
223 | """
224 | if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
225 | raise ValueError("Collections must contain at least 1 sentence.")
226 | reference_words = _split_into_words(reference_sentences)
227 | evaluated_words = _split_into_words(evaluated_sentences)
228 | m = len(reference_words)
229 | n = len(evaluated_words)
230 | lcs = _len_lcs(evaluated_words, reference_words)
231 | return _f_p_r_lcs(lcs, m, n)
232 |
233 |
234 | def _union_lcs(evaluated_sentences, reference_sentence):
235 | """
236 | Returns LCS_u(r_i, C) which is the LCS score of the union longest common
237 | subsequence between reference sentence ri and candidate summary C. For example
238 | if r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and
239 | c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is
240 | "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". The
241 | union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and
242 | LCS_u(r_i, C) = 4/5.
243 |
244 | Args:
245 | evaluated_sentences: The sentences that have been picked by the summarizer
246 | reference_sentence: One of the sentences in the reference summaries
247 |
248 | Returns:
249 | float: LCS_u(r_i, C)
250 |
251 | ValueError:
252 | Raises exception if a param has len <= 0
253 | """
254 | if len(evaluated_sentences) <= 0:
255 | raise ValueError("Collections must contain at least 1 sentence.")
256 |
257 | lcs_union = set()
258 | reference_words = _split_into_words([reference_sentence])
259 | combined_lcs_length = 0
260 | for eval_s in evaluated_sentences:
261 | evaluated_words = _split_into_words([eval_s])
262 | lcs = set(_recon_lcs(reference_words, evaluated_words))
263 | combined_lcs_length += len(lcs)
264 | lcs_union = lcs_union.union(lcs)
265 |
266 | union_lcs_count = len(lcs_union)
267 | union_lcs_value = union_lcs_count / combined_lcs_length
268 | return union_lcs_value
269 |
270 |
271 | def rouge_l_summary_level(evaluated_sentences, reference_sentences):
272 | """
273 | Computes ROUGE-L (summary level) of two text collections of sentences.
274 | http://research.microsoft.com/en-us/um/people/cyl/download/papers/
275 | rouge-working-note-v1.3.1.pdf
276 |
277 | Calculated according to:
278 | R_lcs = SUM(1, u)[LCS(r_i,C)]/m
279 | P_lcs = SUM(1, u)[LCS(r_i,C)]/n
280 | F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
281 |
282 | where:
283 | SUM(i,u) = SUM from i through u
284 | u = number of sentences in reference summary
285 | C = Candidate summary made up of v sentences
286 | m = number of words in reference summary
287 | n = number of words in candidate summary
288 |
289 | Args:
290 | evaluated_sentences: The sentences that have been picked by the summarizer
291 |     reference_sentences: The sentences from the reference set
292 |
293 | Returns:
294 | A float: F_lcs
295 |
296 | Raises:
297 | ValueError: raises exception if a param has len <= 0
298 | """
299 | if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
300 | raise ValueError("Collections must contain at least 1 sentence.")
301 |
302 | # total number of words in reference sentences
303 | m = len(_split_into_words(reference_sentences))
304 |
305 | # total number of words in evaluated sentences
306 | n = len(_split_into_words(evaluated_sentences))
307 |
308 | union_lcs_sum_across_all_references = 0
309 | for ref_s in reference_sentences:
310 | union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences,
311 | ref_s)
312 | return _f_p_r_lcs(union_lcs_sum_across_all_references, m, n)
313 |
314 |
315 | def rouge(hypotheses, references):
316 | """Calculates average rouge scores for a list of hypotheses and
317 | references"""
318 |
319 | # Filter out hyps that are of 0 length
320 | # hyps_and_refs = zip(hypotheses, references)
321 | # hyps_and_refs = [_ for _ in hyps_and_refs if len(_[0]) > 0]
322 | # hypotheses, references = zip(*hyps_and_refs)
323 |
324 | # Calculate ROUGE-1 F1, precision, recall scores
325 | rouge_1 = [
326 | rouge_n([hyp], [ref], 1) for hyp, ref in zip(hypotheses, references)
327 | ]
328 | rouge_1_f, rouge_1_p, rouge_1_r = map(np.mean, zip(*rouge_1))
329 |
330 | # Calculate ROUGE-2 F1, precision, recall scores
331 | rouge_2 = [
332 | rouge_n([hyp], [ref], 2) for hyp, ref in zip(hypotheses, references)
333 | ]
334 | rouge_2_f, rouge_2_p, rouge_2_r = map(np.mean, zip(*rouge_2))
335 |
336 | # Calculate ROUGE-L F1, precision, recall scores
337 | rouge_l = [
338 | rouge_l_sentence_level([hyp], [ref])
339 | for hyp, ref in zip(hypotheses, references)
340 | ]
341 | rouge_l_f, rouge_l_p, rouge_l_r = map(np.mean, zip(*rouge_l))
342 |
343 | return {
344 | "rouge_1/f_score": rouge_1_f,
345 | "rouge_1/r_score": rouge_1_r,
346 | "rouge_1/p_score": rouge_1_p,
347 | "rouge_2/f_score": rouge_2_f,
348 | "rouge_2/r_score": rouge_2_r,
349 | "rouge_2/p_score": rouge_2_p,
350 | "rouge_l/f_score": rouge_l_f,
351 | "rouge_l/r_score": rouge_l_r,
352 | "rouge_l/p_score": rouge_l_p,
353 | }
354 |
--------------------------------------------------------------------------------
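A worked example of the arithmetic behind rouge_n and rouge_l_sentence_level above, using two invented sentences. Note that _get_ngrams is set-based, so a repeated n-gram counts only once toward the overlap.

# ROUGE-1: distinct unigram overlap between hypothesis and reference.
hyp = "the cat sat on the mat".split()
ref = "the cat lay on the mat".split()

hyp_1grams = {tuple(hyp[i:i + 1]) for i in range(len(hyp))}  # 5 distinct
ref_1grams = {tuple(ref[i:i + 1]) for i in range(len(ref))}  # 5 distinct
overlap = len(hyp_1grams & ref_1grams)                       # the/cat/on/mat -> 4
precision = overlap / len(hyp_1grams)                        # 4/5 = 0.8
recall = overlap / len(ref_1grams)                           # 4/5 = 0.8
f1 = 2.0 * precision * recall / (precision + recall + 1e-8)  # ~0.8

# ROUGE-L (sentence level): the same DP table as _lcs, then the F-measure from
# _f_p_r_lcs. The LCS here is "the cat on the mat" (length 5), so with
# m = n = 6 we get R_lcs = P_lcs = 5/6 and F_lcs ~ 0.833.
def lcs_len(x, y):
    table = {}
    for i in range(len(x) + 1):
        for j in range(len(y) + 1):
            if i == 0 or j == 0:
                table[i, j] = 0
            elif x[i - 1] == y[j - 1]:
                table[i, j] = table[i - 1, j - 1] + 1
            else:
                table[i, j] = max(table[i - 1, j], table[i, j - 1])
    return table[len(x), len(y)]

llcs, m, n = lcs_len(hyp, ref), len(ref), len(hyp)
r_lcs, p_lcs = llcs / m, llcs / n
beta = p_lcs / (r_lcs + 1e-12)
f_lcs = (1 + beta ** 2) * r_lcs * p_lcs / (r_lcs + beta ** 2 * p_lcs + 1e-12)
print(f1, f_lcs)  # ~0.8 ~0.833

--------------------------------------------------------------------------------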
/reading_comprehension/layer/recurrent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from tensorflow.contrib.rnn import RNNCell
5 |
6 | from util.default_util import *
7 | from util.reading_comprehension_util import *
8 |
9 | __all__ = ["RNN", "BiRNN"]
10 |
11 | def _extract_hidden_state(state,
12 | cell_type):
13 | """extract hidden state"""
14 | return state.h if "lstm" in cell_type else state
15 |
16 | def _create_single_recurrent_cell(unit_dim,
17 | cell_type,
18 | activation,
19 | dropout,
20 | forget_bias,
21 | residual_connect,
22 | attention_mechanism,
23 | device_spec,
24 | random_seed):
25 | """create single recurrent cell"""
26 | weight_initializer = create_variable_initializer("glorot_uniform", random_seed)
27 | bias_initializer = create_variable_initializer("zero")
28 | recurrent_activation = create_activation_function(activation)
29 |
30 | if cell_type == "lstm":
31 | single_cell = tf.contrib.rnn.LSTMCell(num_units=unit_dim, use_peepholes=False,
32 | activation=recurrent_activation, forget_bias=forget_bias, initializer=weight_initializer, state_is_tuple=True)
33 | elif cell_type == "peephole_lstm":
34 | single_cell = tf.contrib.rnn.LSTMCell(num_units=unit_dim, use_peepholes=True,
35 | activation=recurrent_activation, forget_bias=forget_bias, initializer=weight_initializer)
36 | elif cell_type == "layer_norm_lstm":
37 | single_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(num_units=unit_dim, layer_norm=True,
38 | activation=recurrent_activation, forget_bias=forget_bias)
39 | elif cell_type == "block_lstm":
40 | single_cell = tf.contrib.rnn.LSTMBlockCell(num_units=unit_dim, forget_bias=forget_bias)
41 | elif cell_type == "block_fused_lstm":
42 | single_cell = tf.contrib.rnn.LSTMBlockFusedCell(num_units=unit_dim, forget_bias=forget_bias)
43 | elif cell_type == "gru":
44 | single_cell = tf.contrib.rnn.GRUCell(num_units=unit_dim, activation=recurrent_activation,
45 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer)
46 | elif cell_type == "sru":
47 | single_cell = tf.contrib.rnn.SRUCell(num_units=unit_dim, activation=recurrent_activation)
48 | else:
49 | raise ValueError("unsupported cell type {0}".format(cell_type))
50 |
51 | if attention_mechanism != None:
52 | single_cell = AttentionCellWrapper(cell=single_cell, attention_mechanism=attention_mechanism)
53 |
54 | if dropout > 0.0:
55 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=1.0-dropout)
56 |
57 | if residual_connect == True:
58 | single_cell = tf.contrib.rnn.ResidualWrapper(cell=single_cell)
59 |
60 | if device_spec is not None:
61 | single_cell = tf.contrib.rnn.DeviceWrapper(cell=single_cell, device=device_spec)
62 |
63 | return single_cell
64 |
65 | def _create_recurrent_cell(num_layer,
66 | unit_dim,
67 | cell_type,
68 | activation,
69 | dropout,
70 | forget_bias,
71 | residual_connect,
72 | attention_mechanism,
73 | num_gpus,
74 | default_gpu_id,
75 | random_seed):
76 | """create recurrent cell"""
77 | cell_list = []
78 | for i in range(num_layer):
79 | device_spec = get_device_spec(default_gpu_id, num_gpus)
80 |
81 |         single_cell = _create_single_recurrent_cell(unit_dim, cell_type, activation,
82 | dropout, forget_bias, residual_connect, attention_mechanism, device_spec, random_seed)
83 |
84 | cell_list.append(single_cell)
85 |
86 | cell = tf.contrib.rnn.MultiRNNCell(cell_list)
87 |
88 | return cell
89 |
90 | class RNN(object):
91 | """uni-directional recurrent layer"""
92 | def __init__(self,
93 | num_layer,
94 | unit_dim,
95 | cell_type,
96 | activation,
97 | dropout,
98 | forget_bias=1.0,
99 | residual_connect=False,
100 | attention_mechanism=None,
101 | num_gpus=1,
102 | default_gpu_id=0,
103 | random_seed=0,
104 | trainable=True,
105 | scope="rnn"):
106 | """initialize uni-directional recurrent layer"""
107 | self.num_layer = num_layer
108 | self.unit_dim = unit_dim
109 | self.cell_type = cell_type
110 | self.activation = activation
111 | self.dropout = dropout
112 | self.forget_bias = forget_bias
113 | self.residual_connect = residual_connect
114 | self.attention_mechanism = attention_mechanism
115 | self.num_gpus = num_gpus
116 | self.default_gpu_id = default_gpu_id
117 | self.random_seed = random_seed
118 | self.trainable = trainable
119 | self.scope = scope
120 |
121 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
122 | self.cell = _create_recurrent_cell(self.num_layer, self.unit_dim, self.cell_type,
123 | self.activation, self.dropout, self.forget_bias, self.residual_connect,
124 | self.attention_mechanism, self.num_gpus, self.default_gpu_id, self.random_seed)
125 |
126 | def __call__(self,
127 | input_data,
128 | input_mask):
129 | """call uni-directional recurrent layer"""
130 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
131 | input_data_shape = tf.shape(input_data)
132 | input_mask_shape = tf.shape(input_mask)
133 | shape_size = len(input_data.get_shape().as_list())
134 | if shape_size > 3:
135 | input_data = tf.reshape(input_data, shape=tf.concat([[-1], input_data_shape[-2:]], axis=0))
136 | input_mask = tf.reshape(input_mask, shape=tf.concat([[-1], input_mask_shape[-2:]], axis=0))
137 |
138 | input_length = tf.cast(tf.reduce_sum(tf.squeeze(input_mask, axis=-1), axis=-1), dtype=tf.int32)
139 | output_recurrent, final_state_recurrent = tf.nn.dynamic_rnn(cell=self.cell,
140 | inputs=input_data, sequence_length=input_length, dtype=input_data.dtype)
141 | output_mask = input_mask
142 |
143 | state_list = [_extract_hidden_state(state, self.cell_type) for state in final_state_recurrent]
144 | final_state_recurrent = tf.concat(state_list, axis=-1)
145 | final_state_mask = tf.squeeze(tf.reduce_max(input_mask, axis=1, keepdims=True), axis=1)
146 |
147 | if shape_size > 3:
148 | output_recurrent_shape = tf.shape(output_recurrent)
149 | output_mask_shape = tf.shape(output_mask)
150 | final_state_recurrent_shape = tf.shape(final_state_recurrent)
151 | final_state_mask_shape = tf.shape(final_state_mask)
152 | output_recurrent = tf.reshape(output_recurrent,
153 | shape=tf.concat([input_data_shape[:-2], output_recurrent_shape[-2:]], axis=0))
154 | output_mask = tf.reshape(output_mask,
155 | shape=tf.concat([input_mask_shape[:-2], output_mask_shape[-2:]], axis=0))
156 | final_state_recurrent = tf.reshape(final_state_recurrent,
157 | shape=tf.concat([input_data_shape[:-2], final_state_recurrent_shape[-1:]], axis=0))
158 | final_state_mask = tf.reshape(final_state_mask,
159 | shape=tf.concat([input_mask_shape[:-2], final_state_mask_shape[-1:]], axis=0))
160 |
161 | return output_recurrent, output_mask, final_state_recurrent, final_state_mask
162 |
163 | class BiRNN(object):
164 | """bi-directional recurrent layer"""
165 | def __init__(self,
166 | num_layer,
167 | unit_dim,
168 | cell_type,
169 | activation,
170 | dropout,
171 | forget_bias=1.0,
172 | residual_connect=False,
173 | attention_mechanism=None,
174 | num_gpus=1,
175 | default_gpu_id=0,
176 | random_seed=0,
177 | trainable=True,
178 | scope="bi_rnn"):
179 | """initialize bi-directional recurrent layer"""
180 | self.num_layer = num_layer
181 | self.unit_dim = unit_dim
182 | self.cell_type = cell_type
183 | self.activation = activation
184 | self.dropout = dropout
185 | self.forget_bias = forget_bias
186 | self.residual_connect = residual_connect
187 | self.attention_mechanism = attention_mechanism
188 | self.num_gpus = num_gpus
189 | self.default_gpu_id = default_gpu_id
190 | self.random_seed = random_seed
191 | self.trainable = trainable
192 | self.scope = scope
193 |
194 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
195 | self.fwd_cell = _create_recurrent_cell(self.num_layer, self.unit_dim, self.cell_type,
196 | self.activation, self.dropout, self.forget_bias, self.residual_connect,
197 | self.attention_mechanism, self.num_gpus, self.default_gpu_id, self.random_seed)
198 | self.bwd_cell = _create_recurrent_cell(self.num_layer, self.unit_dim, self.cell_type,
199 | self.activation, self.dropout, self.forget_bias, self.residual_connect,
200 | self.attention_mechanism, self.num_gpus, self.default_gpu_id + self.num_layer, self.random_seed)
201 |
202 | def __call__(self,
203 | input_data,
204 | input_mask):
205 | """call bi-directional recurrent layer"""
206 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
207 | input_data_shape = tf.shape(input_data)
208 | input_mask_shape = tf.shape(input_mask)
209 | shape_size = len(input_data.get_shape().as_list())
210 | if shape_size > 3:
211 | input_data = tf.reshape(input_data, shape=tf.concat([[-1], input_data_shape[-2:]], axis=0))
212 | input_mask = tf.reshape(input_mask, shape=tf.concat([[-1], input_mask_shape[-2:]], axis=0))
213 |
214 | input_length = tf.cast(tf.reduce_sum(tf.squeeze(input_mask, axis=-1), axis=-1), dtype=tf.int32)
215 | output_recurrent, final_state_recurrent = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.fwd_cell,
216 | cell_bw=self.bwd_cell, inputs=input_data, sequence_length=input_length, dtype=input_data.dtype)
217 |
218 | output_recurrent = tf.concat(output_recurrent, axis=-1)
219 | output_mask = input_mask
220 |
221 | fwd_state = final_state_recurrent[0]
222 | bwd_state = final_state_recurrent[1]
223 |
224 | state_list = []
225 | for i in range(self.num_layer):
226 | state_list.append(_extract_hidden_state(fwd_state[i], self.cell_type))
227 | state_list.append(_extract_hidden_state(bwd_state[i], self.cell_type))
228 |
229 | final_state_recurrent = tf.concat(state_list, axis=-1)
230 | final_state_mask = tf.squeeze(tf.reduce_max(input_mask, axis=1, keepdims=True), axis=1)
231 |
232 | if shape_size > 3:
233 | output_recurrent_shape = tf.shape(output_recurrent)
234 | output_mask_shape = tf.shape(output_mask)
235 | final_state_recurrent_shape = tf.shape(final_state_recurrent)
236 | final_state_mask_shape = tf.shape(final_state_mask)
237 | output_recurrent = tf.reshape(output_recurrent,
238 | shape=tf.concat([input_data_shape[:-2], output_recurrent_shape[-2:]], axis=0))
239 | output_mask = tf.reshape(output_mask,
240 | shape=tf.concat([input_mask_shape[:-2], output_mask_shape[-2:]], axis=0))
241 | final_state_recurrent = tf.reshape(final_state_recurrent,
242 | shape=tf.concat([input_data_shape[:-2], final_state_recurrent_shape[-1:]], axis=0))
243 | final_state_mask = tf.reshape(final_state_mask,
244 | shape=tf.concat([input_mask_shape[:-2], final_state_mask_shape[-1:]], axis=0))
245 |
246 | return output_recurrent, output_mask, final_state_recurrent, final_state_mask
247 |
248 | class AttentionCellWrapper(RNNCell):
249 | def __init__(self,
250 | cell,
251 | attention_mechanism):
252 | """initialize attention cell wrapper"""
253 | super(AttentionCellWrapper, self).__init__()
254 |
255 | self._cell = cell
256 | self._attention_mechanism = attention_mechanism
257 |
258 | @property
259 | def state_size(self):
260 | return self._cell.state_size
261 |
262 | @property
263 | def output_size(self):
264 | return self._cell.output_size
265 |
266 | def __call__(self,
267 | inputs,
268 | state,
269 | scope=None):
270 | """call attention cell wrapper"""
271 | query = tf.expand_dims(tf.concat([inputs, state], axis=-1), axis=1)
272 | query_mask = tf.reduce_sum(query, axis=-1, keepdims=True)
273 | query_mask = tf.cast(tf.greater(query_mask, tf.constant(0, shape=[], dtype=tf.float32)), dtype=tf.float32)
274 | attention, attention_mask = self._attention_mechanism(query, query_mask)
275 | inputs = tf.squeeze(attention, axis=1)
276 | cell_output, new_state = self._cell(inputs, state, scope)
277 |
278 | return cell_output, new_state
279 |
--------------------------------------------------------------------------------
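The RNN/BiRNN layers above recover sequence lengths from a float mask rather than taking lengths directly: input_mask has shape [batch, max_time, 1] with 1.0 on real steps and 0.0 on padding. A minimal numpy sketch of that convention (shapes and values invented for illustration):

import numpy as np

input_mask = np.array([[[1.0], [1.0], [1.0], [0.0]],
                       [[1.0], [1.0], [0.0], [0.0]]])  # [batch=2, time=4, 1]

# Mirrors tf.cast(tf.reduce_sum(tf.squeeze(input_mask, axis=-1), axis=-1),
# tf.int32), which feeds sequence_length to tf.nn.dynamic_rnn.
input_length = input_mask.squeeze(-1).sum(axis=-1).astype(np.int32)
print(input_length)  # [3 2]

# Mirrors tf.squeeze(tf.reduce_max(input_mask, axis=1, keepdims=True), axis=1):
# a [batch, 1] mask that is 1.0 whenever the example has any valid step.
final_state_mask = input_mask.max(axis=1)
print(final_state_mask.ravel())  # [1. 1.]

--------------------------------------------------------------------------------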
/reading_comprehension/layer/dense.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from util.default_util import *
5 | from util.reading_comprehension_util import *
6 |
7 | from layer.basic import *
8 |
9 | __all__ = ["Dense", "DoubleDense", "StackedDense", "StackedDoubleDense"]
10 |
11 | class Dense(object):
12 | """dense layer"""
13 | def __init__(self,
14 | unit_dim,
15 | activation,
16 | dropout,
17 | layer_dropout=0.0,
18 | layer_norm=False,
19 | residual_connect=False,
20 | use_bias=False,
21 | num_gpus=1,
22 | default_gpu_id=0,
23 | regularizer=None,
24 | random_seed=0,
25 | trainable=True,
26 | scope="dense"):
27 | """initialize dense layer"""
28 | self.unit_dim = unit_dim
29 | self.activation = activation
30 | self.dropout = dropout
31 | self.layer_dropout = layer_dropout
32 | self.layer_norm = layer_norm
33 | self.residual_connect = residual_connect
34 | self.use_bias = use_bias
35 | self.regularizer = regularizer
36 | self.random_seed = random_seed
37 | self.trainable = trainable
38 | self.scope = scope
39 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
40 |
41 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
42 | weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed)
43 | bias_initializer = create_variable_initializer("zero")
44 | self.dense_layer = tf.layers.Dense(units=self.unit_dim, activation=None, use_bias=self.use_bias,
45 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer,
46 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable)
47 |
48 | self.dense_activation = create_activation_function(self.activation)
49 |
50 | self.dropout_layer = Dropout(rate=self.dropout, num_gpus=num_gpus,
51 | default_gpu_id=default_gpu_id, random_seed=self.random_seed)
52 |
53 | if self.layer_norm == True:
54 | self.norm_layer = LayerNorm(layer_dim=self.unit_dim,
55 | num_gpus=num_gpus, default_gpu_id=default_gpu_id, trainable=self.trainable)
56 |
57 | def __call__(self,
58 | input_data,
59 | input_mask):
60 | """call dense layer"""
61 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
62 | input_dense = input_data
63 | input_dense_mask = input_mask
64 |
65 | if self.layer_norm == True:
66 | input_dense, input_dense_mask = self.norm_layer(input_dense, input_dense_mask)
67 |
68 | input_dense = self.dense_layer(input_dense)
69 |
70 | if self.dense_activation != None:
71 | input_dense = self.dense_activation(input_dense)
72 |
73 | input_dense, input_dense_mask = self.dropout_layer(input_dense, input_dense_mask)
74 |
75 | if self.residual_connect == True:
76 | output_dense, output_mask = tf.cond(tf.random_uniform([]) < self.layer_dropout,
77 | lambda: (input_data, input_mask),
78 | lambda: (input_dense + input_data, input_mask))
79 | else:
80 | output_dense = input_dense
81 | output_mask = input_dense_mask
82 |
83 | return output_dense, output_mask
84 |
85 | class DoubleDense(object):
86 | """double-dense layer"""
87 | def __init__(self,
88 | unit_dim,
89 | inner_scale,
90 | activation,
91 | dropout,
92 | layer_dropout=0.0,
93 | layer_norm=False,
94 | residual_connect=False,
95 | use_bias=False,
96 | num_gpus=1,
97 | default_gpu_id=0,
98 | regularizer=None,
99 | random_seed=0,
100 | trainable=True,
101 | scope="double_dense"):
102 | """initialize double-dense layer"""
103 | self.unit_dim = unit_dim
104 | self.inner_scale = inner_scale
105 | self.activation = activation
106 | self.dropout = dropout
107 | self.layer_dropout = layer_dropout
108 | self.layer_norm = layer_norm
109 | self.residual_connect = residual_connect
110 | self.use_bias = use_bias
111 | self.regularizer = regularizer
112 | self.random_seed = random_seed
113 | self.trainable = trainable
114 | self.scope = scope
115 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
116 |
117 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
118 | weight_initializer = create_variable_initializer("glorot_uniform", self.random_seed)
119 | bias_initializer = create_variable_initializer("zero")
120 | self.inner_dense_layer = tf.layers.Dense(units=self.unit_dim * self.inner_scale, activation=None, use_bias=self.use_bias,
121 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer,
122 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable)
123 | self.outer_dense_layer = tf.layers.Dense(units=self.unit_dim, activation=None, use_bias=self.use_bias,
124 | kernel_initializer=weight_initializer, bias_initializer=bias_initializer,
125 | kernel_regularizer=self.regularizer, bias_regularizer=self.regularizer, trainable=self.trainable)
126 |
127 | self.dense_activation = create_activation_function(self.activation)
128 |
129 | self.dropout_layer = Dropout(rate=self.dropout, num_gpus=num_gpus,
130 | default_gpu_id=default_gpu_id, random_seed=self.random_seed)
131 |
132 | if self.layer_norm == True:
133 | self.norm_layer = LayerNorm(layer_dim=self.unit_dim,
134 | num_gpus=num_gpus, default_gpu_id=default_gpu_id, trainable=self.trainable)
135 |
136 | def __call__(self,
137 | input_data,
138 | input_mask):
139 | """call double-dense layer"""
140 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
141 | input_dense = input_data
142 | input_dense_mask = input_mask
143 |
144 | if self.layer_norm == True:
145 | input_dense, input_dense_mask = self.norm_layer(input_dense, input_dense_mask)
146 |
147 | input_dense = self.inner_dense_layer(input_dense)
148 |
149 | if self.dense_activation != None:
150 | input_dense = self.dense_activation(input_dense)
151 |
152 | input_dense = self.outer_dense_layer(input_dense)
153 |
154 | input_dense, input_dense_mask = self.dropout_layer(input_dense, input_dense_mask)
155 |
156 | if self.residual_connect == True:
157 | output_dense, output_mask = tf.cond(tf.random_uniform([]) < self.layer_dropout,
158 | lambda: (input_data, input_mask),
159 | lambda: (input_dense + input_data, input_mask))
160 | else:
161 | output_dense = input_dense
162 | output_mask = input_dense_mask
163 |
164 | return output_dense, output_mask
165 |
166 | class StackedDense(object):
167 | """stacked dense layer"""
168 | def __init__(self,
169 | layer_creator,
170 | num_layer,
171 | unit_dim,
172 | activation,
173 | dropout,
174 | layer_dropout=None,
175 | layer_norm=False,
176 | residual_connect=False,
177 | use_bias=False,
178 | num_gpus=1,
179 | default_gpu_id=0,
180 | regularizer=None,
181 | random_seed=0,
182 | trainable=True,
183 | scope="stacked_dense"):
184 | """initialize stacked dense layer"""
185 | self.layer_creator = layer_creator
186 | self.num_layer = num_layer
187 | self.unit_dim = unit_dim
188 | self.activation = activation
189 | self.dropout = dropout
190 | self.layer_dropout = layer_dropout
191 | self.layer_norm = layer_norm
192 | self.residual_connect = residual_connect
193 | self.use_bias = use_bias
194 | self.num_gpus = num_gpus
195 | self.default_gpu_id = default_gpu_id
196 | self.regularizer = regularizer
197 | self.random_seed = random_seed
198 | self.trainable = trainable
199 | self.scope = scope
200 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
201 |
202 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
203 | self.dense_layer_list = []
204 | for i in range(self.num_layer):
205 | layer_scope = "layer_{0}".format(i)
206 | sublayer_dropout = self.dropout[i] if self.dropout != None else 0.0
207 | sublayer_layer_dropout = self.layer_dropout[i] if self.layer_dropout != None else 0.0
208 | dense_layer = self.layer_creator(unit_dim=self.unit_dim, activation=self.activation,
209 | dropout=sublayer_dropout, layer_dropout=sublayer_layer_dropout, layer_norm=self.layer_norm,
210 | residual_connect=self.residual_connect, use_bias=self.use_bias, num_gpus=self.num_gpus,
211 | default_gpu_id=self.default_gpu_id, regularizer=self.regularizer, random_seed=self.random_seed,
212 | trainable=self.trainable, scope=layer_scope)
213 | self.dense_layer_list.append(dense_layer)
214 |
215 | def __call__(self,
216 | input_data,
217 | input_mask):
218 | """call stacked dense layer"""
219 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
220 | input_dense = input_data
221 | input_dense_mask = input_mask
222 |
223 | for dense_layer in self.dense_layer_list:
224 | input_dense, input_dense_mask = dense_layer(input_dense, input_dense_mask)
225 |
226 | output_dense = input_dense
227 | output_mask = input_dense_mask
228 |
229 | return output_dense, output_mask
230 |
231 | class StackedDoubleDense(object):
232 | """stacked double-dense layer"""
233 | def __init__(self,
234 | layer_creator,
235 | num_layer,
236 | unit_dim,
237 | inner_scale,
238 | activation,
239 | dropout,
240 | layer_dropout=None,
241 | layer_norm=False,
242 | residual_connect=False,
243 | use_bias=False,
244 | num_gpus=1,
245 | default_gpu_id=0,
246 | regularizer=None,
247 | random_seed=0,
248 | trainable=True,
249 | scope="stacked_double_dense"):
250 | """initialize stacked double-dense layer"""
251 | self.layer_creator = layer_creator
252 | self.num_layer = num_layer
253 | self.unit_dim = unit_dim
254 | self.inner_scale = inner_scale
255 | self.activation = activation
256 | self.dropout = dropout
257 | self.layer_dropout = layer_dropout
258 | self.layer_norm = layer_norm
259 | self.residual_connect = residual_connect
260 | self.use_bias = use_bias
261 | self.num_gpus = num_gpus
262 | self.default_gpu_id = default_gpu_id
263 | self.regularizer = regularizer
264 | self.random_seed = random_seed
265 | self.trainable = trainable
266 | self.scope = scope
267 | self.device_spec = get_device_spec(default_gpu_id, num_gpus)
268 |
269 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
270 | self.dense_layer_list = []
271 | for i in range(self.num_layer):
272 | layer_scope = "layer_{0}".format(i)
273 | sublayer_dropout = self.dropout[i] if self.dropout != None else 0.0
274 | sublayer_layer_dropout = self.layer_dropout[i] if self.layer_dropout != None else 0.0
275 | dense_layer = self.layer_creator(unit_dim=self.unit_dim, inner_scale=self.inner_scale, activation=self.activation,
276 | dropout=sublayer_dropout, layer_dropout=sublayer_layer_dropout, layer_norm=self.layer_norm,
277 | residual_connect=self.residual_connect, use_bias=self.use_bias, num_gpus=self.num_gpus,
278 | default_gpu_id=self.default_gpu_id, regularizer=self.regularizer, random_seed=self.random_seed,
279 | trainable=self.trainable, scope=layer_scope)
280 | self.dense_layer_list.append(dense_layer)
281 |
282 | def __call__(self,
283 | input_data,
284 | input_mask):
285 | """call stacked double-dense layer"""
286 | with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE), tf.device(self.device_spec):
287 | input_dense = input_data
288 | input_dense_mask = input_mask
289 |
290 | for dense_layer in self.dense_layer_list:
291 | input_dense, input_dense_mask = dense_layer(input_dense, input_dense_mask)
292 |
293 | output_dense = input_dense
294 | output_mask = input_dense_mask
295 |
296 | return output_dense, output_mask
297 |
--------------------------------------------------------------------------------
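When residual_connect is enabled, Dense and DoubleDense above apply stochastic layer dropout: tf.cond(tf.random_uniform([]) < layer_dropout, ...) skips the whole sublayer with probability layer_dropout and otherwise adds the residual. A minimal pure-numpy sketch of that decision (the sublayer below is a stand-in, not the repo's dense transform):

import numpy as np

def layer_dropout_residual(x, sublayer, layer_dropout, rng=np.random):
    # With probability layer_dropout, bypass the sublayer entirely.
    if rng.uniform() < layer_dropout:
        return x
    # Otherwise apply the sublayer with the usual residual connection.
    return sublayer(x) + x

x = np.ones(4)
out = layer_dropout_residual(x, lambda v: 0.1 * v, layer_dropout=0.5)
print(out)  # either [1. 1. 1. 1.] (skipped) or [1.1 1.1 1.1 1.1] (residual)

--------------------------------------------------------------------------------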
/reading_comprehension/config/config_search_template.qanet.json:
--------------------------------------------------------------------------------
1 | {
2 | "hyperparams": {
3 | "data_max_question_length": {
4 | "stype": "discrete",
5 | "set": [50],
6 | "dtype": "int"
7 | },
8 | "data_max_context_length": {
9 | "stype": "discrete",
10 | "set": [400],
11 | "dtype": "int"
12 | },
13 | "data_max_answer_length": {
14 | "stype": "discrete",
15 | "set": [30],
16 | "dtype": "int"
17 | },
18 | "data_max_subword_length": {
19 | "stype": "discrete",
20 | "set": [16],
21 | "dtype": "int"
22 | },
23 | "data_max_char_length": {
24 | "stype": "discrete",
25 | "set": [16],
26 | "dtype": "int"
27 | },
28 | "train_batch_size": {
29 | "stype": "discrete",
30 | "set": [32],
31 | "dtype": "int"
32 | },
33 | "train_optimizer_type": {
34 | "stype": "discrete",
35 | "set": ["adam"],
36 | "dtype": "string"
37 | },
38 | "train_optimizer_learning_rate": {
39 | "stype": "log",
40 | "range": [0.0001, 0.01],
41 | "dtype": "float"
42 | },
43 | "model_representation_word_embed_dim": {
44 | "stype": "discrete",
45 | "set": [300],
46 | "dtype": "int"
47 | },
48 | "model_representation_subword_embed_dim": {
49 | "stype": "lookup",
50 | "key": "embed_dim",
51 | "dtype": "int"
52 | },
53 | "model_representation_subword_unit_dim": {
54 | "stype": "discrete",
55 | "set": [100],
56 | "dtype": "int"
57 | },
58 | "model_representation_subword_window_size": {
59 | "stype": "lookup",
60 | "key": "window_size",
61 | "dtype": "list"
62 | },
63 | "model_representation_subword_hidden_activation": {
64 | "stype": "discrete",
65 | "set": ["relu"],
66 | "dtype": "string"
67 | },
68 | "model_representation_subword_dropout": {
69 | "stype": "lookup",
70 | "key": "dropout",
71 | "scale": 1.0,
72 | "shift": 0.0,
73 | "dtype": "float"
74 | },
75 | "model_representation_subword_pooling_type": {
76 | "stype": "lookup",
77 | "key": "pooling_type",
78 | "dtype": "string"
79 | },
80 | "model_representation_char_embed_dim": {
81 | "stype": "lookup",
82 | "key": "embed_dim",
83 | "dtype": "int"
84 | },
85 | "model_representation_char_unit_dim": {
86 | "stype": "discrete",
87 | "set": [100],
88 | "dtype": "int"
89 | },
90 | "model_representation_char_window_size": {
91 | "stype": "lookup",
92 | "key": "window_size",
93 | "dtype": "list"
94 | },
95 | "model_representation_char_hidden_activation": {
96 | "stype": "discrete",
97 | "set": ["relu"],
98 | "dtype": "string"
99 | },
100 | "model_representation_char_dropout": {
101 | "stype": "lookup",
102 | "key": "dropout",
103 | "scale": 1.0,
104 | "shift": 0.0,
105 | "dtype": "float"
106 | },
107 | "model_representation_char_pooling_type": {
108 | "stype": "lookup",
109 | "key": "pooling_type",
110 | "dtype": "string"
111 | },
112 | "model_representation_fusion_type": {
113 | "stype": "discrete",
114 | "set": ["highway"],
115 | "dtype": "string"
116 | },
117 | "model_representation_fusion_num_layer": {
118 | "stype": "discrete",
119 | "set": [2],
120 | "dtype": "int"
121 | },
122 | "model_representation_fusion_unit_dim": {
123 | "stype": "discrete",
124 | "set": [400],
125 | "dtype": "int"
126 | },
127 | "model_representation_fusion_hidden_activation": {
128 | "stype": "discrete",
129 | "set": ["relu"],
130 | "dtype": "string"
131 | },
132 | "model_representation_fusion_dropout": {
133 | "stype": "lookup",
134 | "key": "dropout",
135 | "scale": 1.0,
136 | "shift": 0.0,
137 | "dtype": "float"
138 | },
139 | "model_understanding_question_num_layer": {
140 | "stype": "lookup",
141 | "key": "understanding_num_layer",
142 | "dtype": "int"
143 | },
144 | "model_understanding_question_num_conv": {
145 | "stype": "lookup",
146 | "key": "understanding_num_conv",
147 | "dtype": "int"
148 | },
149 | "model_understanding_question_num_head": {
150 | "stype": "discrete",
151 | "set": [8],
152 | "dtype": "int"
153 | },
154 | "model_understanding_question_unit_dim": {
155 | "stype": "lookup",
156 | "key": "unit_dim",
157 | "scale": 1.0,
158 | "shift": 0.0,
159 | "dtype": "int"
160 | },
161 | "model_understanding_question_window_size": {
162 | "stype": "lookup",
163 | "key": "understanding_window_size",
164 | "dtype": "list"
165 | },
166 | "model_understanding_question_hidden_activation": {
167 | "stype": "lookup",
168 | "key": "hidden_activation",
169 | "dtype": "string"
170 | },
171 | "model_understanding_question_dropout": {
172 | "stype": "lookup",
173 | "key": "dropout",
174 | "scale": 1.0,
175 | "shift": 0.0,
176 | "dtype": "float"
177 | },
178 | "model_understanding_question_layer_dropout": {
179 | "stype": "lookup",
180 | "key": "layer_dropout",
181 | "scale": 1.0,
182 | "shift": 0.0,
183 | "dtype": "float"
184 | },
185 | "model_understanding_context_num_layer": {
186 | "stype": "lookup",
187 | "key": "understanding_num_layer",
188 | "dtype": "int"
189 | },
190 | "model_understanding_context_num_conv": {
191 | "stype": "lookup",
192 | "key": "understanding_num_conv",
193 | "dtype": "int"
194 | },
195 | "model_understanding_context_num_head": {
196 | "stype": "discrete",
197 | "set": [8],
198 | "dtype": "int"
199 | },
200 | "model_understanding_context_unit_dim": {
201 | "stype": "lookup",
202 | "key": "unit_dim",
203 | "scale": 1.0,
204 | "shift": 0.0,
205 | "dtype": "int"
206 | },
207 | "model_understanding_context_window_size": {
208 | "stype": "lookup",
209 | "key": "understanding_window_size",
210 | "dtype": "list"
211 | },
212 | "model_understanding_context_hidden_activation": {
213 | "stype": "lookup",
214 | "key": "hidden_activation",
215 | "dtype": "string"
216 | },
217 | "model_understanding_context_dropout": {
218 | "stype": "lookup",
219 | "key": "dropout",
220 | "scale": 1.0,
221 | "shift": 0.0,
222 | "dtype": "float"
223 | },
224 | "model_understanding_context_layer_dropout": {
225 | "stype": "lookup",
226 | "key": "layer_dropout",
227 | "scale": 1.0,
228 | "shift": 0.0,
229 | "dtype": "float"
230 | },
231 | "model_interaction_context2question_attention_dim": {
232 | "stype": "lookup",
233 | "key": "unit_dim",
234 | "scale": 1.0,
235 | "shift": 0.0,
236 | "dtype": "int"
237 | },
238 | "model_interaction_context2question_score_type": {
239 | "stype": "lookup",
240 | "key": "score_type",
241 | "dtype": "string"
242 | },
243 | "model_interaction_question2context_attention_dim": {
244 | "stype": "lookup",
245 | "key": "unit_dim",
246 | "scale": 1.0,
247 | "shift": 0.0,
248 | "dtype": "int"
249 | },
250 | "model_interaction_question2context_score_type": {
251 | "stype": "lookup",
252 | "key": "score_type",
253 | "dtype": "string"
254 | },
255 | "model_interaction_fusion_type": {
256 | "stype": "discrete",
257 | "set": ["concate"],
258 | "dtype": "string"
259 | },
260 | "model_interaction_fusion_num_layer": {
261 | "stype": "discrete",
262 | "set": [1],
263 | "dtype": "int"
264 | },
265 | "model_interaction_fusion_unit_dim": {
266 | "stype": "lookup",
267 | "key": "unit_dim",
268 | "scale": 4.0,
269 | "shift": 0.0,
270 | "dtype": "int"
271 | },
272 | "model_interaction_fusion_hidden_activation": {
273 | "stype": "discrete",
274 | "set": ["relu"],
275 | "dtype": "string"
276 | },
277 | "model_interaction_fusion_dropout": {
278 | "stype": "lookup",
279 | "key": "dropout",
280 | "scale": 1.0,
281 | "shift": 0.0,
282 | "dtype": "float"
283 | },
284 | "model_interaction_fusion_combo_enable": {
285 | "stype": "discrete",
286 | "set": [true],
287 | "dtype": "boolean"
288 | },
289 | "model_modeling_answer_num_layer": {
290 | "stype": "lookup",
291 | "key": "modeling_num_layer",
292 | "dtype": "int"
293 | },
294 | "model_modeling_answer_num_conv": {
295 | "stype": "lookup",
296 | "key": "modeling_num_conv",
297 | "dtype": "int"
298 | },
299 | "model_modeling_answer_num_head": {
300 | "stype": "discrete",
301 | "set": [8],
302 | "dtype": "int"
303 | },
304 | "model_modeling_answer_unit_dim": {
305 | "stype": "lookup",
306 | "key": "unit_dim",
307 | "scale": 1.0,
308 | "shift": 0.0,
309 | "dtype": "int"
310 | },
311 | "model_modeling_answer_window_size": {
312 | "stype": "lookup",
313 | "key": "modeling_window_size",
314 | "dtype": "list"
315 | },
316 | "model_modeling_answer_hidden_activation": {
317 | "stype": "lookup",
318 | "key": "hidden_activation",
319 | "dtype": "string"
320 | },
321 | "model_modeling_answer_dropout": {
322 | "stype": "lookup",
323 | "key": "dropout",
324 | "scale": 1.0,
325 | "shift": 0.0,
326 | "dtype": "float"
327 | },
328 | "model_modeling_answer_layer_dropout": {
329 | "stype": "lookup",
330 | "key": "layer_dropout",
331 | "scale": 1.0,
332 | "shift": 0.0,
333 | "dtype": "float"
334 | },
335 | "model_output_answer_start_dropout": {
336 | "stype": "lookup",
337 | "key": "dropout",
338 | "scale": 1.0,
339 | "shift": 0.0,
340 | "dtype": "float"
341 | },
342 | "model_output_answer_end_dropout": {
343 | "stype": "lookup",
344 | "key": "dropout",
345 | "scale": 1.0,
346 | "shift": 0.0,
347 | "dtype": "float"
348 | }
349 | },
350 | "variables": {
351 | "embed_dim": {
352 | "stype": "discrete",
353 | "set": [8, 16, 32, 64],
354 | "dtype": "int"
355 | },
356 | "window_size": {
357 | "stype": "discrete",
358 | "set": [[3], [5], [7]],
359 | "dtype": "list"
360 | },
361 | "pooling_type": {
362 | "stype": "discrete",
363 | "set": ["max"],
364 | "dtype": "string"
365 | },
366 | "unit_dim": {
367 | "stype": "uniform",
368 | "range": [50, 200],
369 | "dtype": "int"
370 | },
371 | "hidden_activation": {
372 | "stype": "discrete",
373 | "set": ["tanh", "relu"],
374 | "dtype": "string"
375 | },
376 | "dropout": {
377 | "stype": "uniform",
378 | "range": [0.0, 0.5],
379 | "dtype": "float"
380 | },
381 | "layer_dropout": {
382 | "stype": "uniform",
383 | "range": [0.0, 0.5],
384 | "dtype": "float"
385 | },
386 | "score_type": {
387 | "stype": "discrete",
388 | "set": ["scaled_dot", "triliear"],
389 | "dtype": "string"
390 | },
391 | "understanding_num_layer": {
392 | "stype": "discrete",
393 | "set": [1, 2, 3, 4],
394 | "dtype": "int"
395 | },
396 | "understanding_num_conv": {
397 | "stype": "discrete",
398 | "set": [2, 4],
399 | "dtype": "int"
400 | },
401 | "understanding_window_size": {
402 | "stype": "discrete",
403 | "set": [[3], [5], [7]],
404 | "dtype": "list"
405 | },
406 | "modeling_num_layer": {
407 | "stype": "discrete",
408 | "set": [2, 4, 8, 12],
409 | "dtype": "int"
410 | },
411 | "modeling_num_conv": {
412 | "stype": "discrete",
413 | "set": [2, 4],
414 | "dtype": "int"
415 | },
416 | "modeling_window_size": {
417 | "stype": "discrete",
418 | "set": [[3], [5], [7]],
419 | "dtype": "list"
420 | }
421 | }
422 | }
--------------------------------------------------------------------------------
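The search template above describes every hyperparameter with an "stype": "discrete" draws from "set", "uniform" and "log" draw from "range", and "lookup" resolves a shared entry under "variables", applying "scale" and "shift" to numeric values so that related dimensions move together within a trial. A minimal sampling sketch under those assumptions (a hypothetical helper for illustration, not the actual hparam_search.py API; the file path is assumed):

import json
import math
import random

def sample_spec(spec, variables):
    """draw one value according to the template's stype/dtype schema"""
    stype = spec["stype"]
    if stype == "discrete":
        value = random.choice(spec["set"])
    elif stype == "uniform":
        value = random.uniform(*spec["range"])
    elif stype == "log":
        low, high = spec["range"]
        value = math.exp(random.uniform(math.log(low), math.log(high)))
    elif stype == "lookup":
        value = variables[spec["key"]]
        if spec["dtype"] in ("int", "float"):
            value = value * spec.get("scale", 1.0) + spec.get("shift", 0.0)
    else:
        raise ValueError("unsupported stype {0}".format(stype))
    return int(value) if spec["dtype"] == "int" else value

def sample_trial(template):
    # resolve shared variables once so every "lookup" sees the same draw
    variables = {k: sample_spec(v, {}) for k, v in template["variables"].items()}
    return {k: sample_spec(v, variables) for k, v in template["hyperparams"].items()}

with open("config_search_template.qanet.json") as f:
    print(sample_trial(json.load(f)))
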
/reading_comprehension/config/config_search_template.bidaf.json:
--------------------------------------------------------------------------------
1 | {
2 | "hyperparams": {
3 | "data_max_question_length": {
4 | "stype": "discrete",
5 | "set": [40],
6 | "dtype": "int"
7 | },
8 | "data_max_context_length": {
9 | "stype": "discrete",
10 | "set": [500],
11 | "dtype": "int"
12 | },
13 | "data_max_answer_length": {
14 | "stype": "discrete",
15 | "set": [30],
16 | "dtype": "int"
17 | },
18 | "data_max_subword_length": {
19 | "stype": "discrete",
20 | "set": [16],
21 | "dtype": "int"
22 | },
23 | "data_max_char_length": {
24 | "stype": "discrete",
25 | "set": [16],
26 | "dtype": "int"
27 | },
28 | "train_batch_size": {
29 | "stype": "discrete",
30 | "set": [60],
31 | "dtype": "int"
32 | },
33 | "train_optimizer_type": {
34 | "stype": "discrete",
35 | "set": ["adam"],
36 | "dtype": "string"
37 | },
38 | "train_optimizer_learning_rate": {
39 | "stype": "log",
40 | "range": [0.0001, 0.001],
41 | "dtype": "float"
42 | },
43 | "model_representation_word_embed_dim": {
44 | "stype": "discrete",
45 | "set": [100],
46 | "dtype": "int"
47 | },
48 | "model_representation_subword_embed_dim": {
49 | "stype": "lookup",
50 | "key": "embed_dim",
51 | "dtype": "int"
52 | },
53 | "model_representation_subword_unit_dim": {
54 | "stype": "lookup",
55 | "key": "unit_dim",
56 | "scale": 1.0,
57 | "shift": 0.0,
58 | "dtype": "int"
59 | },
60 | "model_representation_subword_window_size": {
61 | "stype": "lookup",
62 | "key": "window_size",
63 | "dtype": "list"
64 | },
65 | "model_representation_subword_hidden_activation": {
66 | "stype": "discrete",
67 | "set": ["relu"],
68 | "dtype": "string"
69 | },
70 | "model_representation_subword_dropout": {
71 | "stype": "lookup",
72 | "key": "dropout",
73 | "scale": 1.0,
74 | "shift": 0.0,
75 | "dtype": "float"
76 | },
77 | "model_representation_subword_pooling_type": {
78 | "stype": "lookup",
79 | "key": "pooling_type",
80 | "dtype": "string"
81 | },
82 | "model_representation_char_embed_dim": {
83 | "stype": "lookup",
84 | "key": "embed_dim",
85 | "dtype": "int"
86 | },
87 | "model_representation_char_unit_dim": {
88 | "stype": "lookup",
89 | "key": "unit_dim",
90 | "scale": 1.0,
91 | "shift": 0.0,
92 | "dtype": "int"
93 | },
94 | "model_representation_char_window_size": {
95 | "stype": "lookup",
96 | "key": "window_size",
97 | "dtype": "list"
98 | },
99 | "model_representation_char_hidden_activation": {
100 | "stype": "discrete",
101 | "set": ["relu"],
102 | "dtype": "string"
103 | },
104 | "model_representation_char_dropout": {
105 | "stype": "lookup",
106 | "key": "dropout",
107 | "scale": 1.0,
108 | "shift": 0.0,
109 | "dtype": "float"
110 | },
111 | "model_representation_char_pooling_type": {
112 | "stype": "lookup",
113 | "key": "pooling_type",
114 | "dtype": "string"
115 | },
116 | "model_representation_fusion_type": {
117 | "stype": "discrete",
118 | "set": ["highway"],
119 | "dtype": "string"
120 | },
121 | "model_representation_fusion_num_layer": {
122 | "stype": "discrete",
123 | "set": [2],
124 | "dtype": "int"
125 | },
126 | "model_representation_fusion_unit_dim": {
127 | "stype": "lookup",
128 | "key": "unit_dim",
129 | "scale": 1.0,
130 | "shift": 0.0,
131 | "dtype": "int"
132 | },
133 | "model_representation_fusion_hidden_activation": {
134 | "stype": "discrete",
135 | "set": ["relu"],
136 | "dtype": "string"
137 | },
138 | "model_representation_fusion_dropout": {
139 | "stype": "lookup",
140 | "key": "dropout",
141 | "scale": 1.0,
142 | "shift": 0.0,
143 | "dtype": "float"
144 | },
145 | "model_understanding_question_num_layer": {
146 | "stype": "lookup",
147 | "key": "num_layer",
148 | "dtype": "int"
149 | },
150 | "model_understanding_question_unit_dim": {
151 | "stype": "lookup",
152 | "key": "unit_dim",
153 | "scale": 1.0,
154 | "shift": 0.0,
155 | "dtype": "int"
156 | },
157 | "model_understanding_question_cell_type": {
158 | "stype": "lookup",
159 | "key": "cell_type",
160 | "dtype": "string"
161 | },
162 | "model_understanding_question_hidden_activation": {
163 | "stype": "lookup",
164 | "key": "hidden_activation",
165 | "dtype": "string"
166 | },
167 | "model_understanding_question_dropout": {
168 | "stype": "lookup",
169 | "key": "dropout",
170 | "scale": 1.0,
171 | "shift": 0.0,
172 | "dtype": "float"
173 | },
174 | "model_understanding_context_num_layer": {
175 | "stype": "lookup",
176 | "key": "num_layer",
177 | "dtype": "int"
178 | },
179 | "model_understanding_context_unit_dim": {
180 | "stype": "lookup",
181 | "key": "unit_dim",
182 | "scale": 1.0,
183 | "shift": 0.0,
184 | "dtype": "int"
185 | },
186 | "model_understanding_context_cell_type": {
187 | "stype": "lookup",
188 | "key": "cell_type",
189 | "dtype": "string"
190 | },
191 | "model_understanding_context_hidden_activation": {
192 | "stype": "lookup",
193 | "key": "hidden_activation",
194 | "dtype": "string"
195 | },
196 | "model_understanding_context_dropout": {
197 | "stype": "lookup",
198 | "key": "dropout",
199 | "scale": 1.0,
200 | "shift": 0.0,
201 | "dtype": "float"
202 | },
203 | "model_interaction_context2question_attention_dim": {
204 | "stype": "lookup",
205 | "key": "unit_dim",
206 | "scale": 2.0,
207 | "shift": 0.0,
208 | "dtype": "int"
209 | },
210 | "model_interaction_context2question_score_type": {
211 | "stype": "lookup",
212 | "key": "score_type",
213 | "dtype": "string"
214 | },
215 | "model_interaction_question2context_attention_dim": {
216 | "stype": "lookup",
217 | "key": "unit_dim",
218 | "scale": 2.0,
219 | "shift": 0.0,
220 | "dtype": "int"
221 | },
222 | "model_interaction_question2context_score_type": {
223 | "stype": "lookup",
224 | "key": "score_type",
225 | "dtype": "string"
226 | },
227 | "model_interaction_fusion_type": {
228 | "stype": "discrete",
229 | "set": ["concate"],
230 | "dtype": "string"
231 | },
232 | "model_interaction_fusion_num_layer": {
233 | "stype": "discrete",
234 | "set": [1],
235 | "dtype": "int"
236 | },
237 | "model_interaction_fusion_unit_dim": {
238 | "stype": "lookup",
239 | "key": "unit_dim",
240 | "scale": 4.0,
241 | "shift": 0.0,
242 | "dtype": "int"
243 | },
244 | "model_interaction_fusion_hidden_activation": {
245 | "stype": "discrete",
246 | "set": ["relu"],
247 | "dtype": "string"
248 | },
249 | "model_interaction_fusion_dropout": {
250 | "stype": "lookup",
251 | "key": "dropout",
252 | "scale": 1.0,
253 | "shift": 0.0,
254 | "dtype": "float"
255 | },
256 | "model_interaction_fusion_combo_enable": {
257 | "stype": "discrete",
258 | "set": [true],
259 | "dtype": "boolean"
260 | },
261 | "model_modeling_answer_num_layer": {
262 | "stype": "lookup",
263 | "key": "num_layer",
264 | "dtype": "int"
265 | },
266 | "model_modeling_answer_unit_dim": {
267 | "stype": "lookup",
268 | "key": "unit_dim",
269 | "scale": 1.0,
270 | "shift": 0.0,
271 | "dtype": "int"
272 | },
273 | "model_modeling_answer_cell_type": {
274 | "stype": "lookup",
275 | "key": "cell_type",
276 | "dtype": "string"
277 | },
278 | "model_modeling_answer_hidden_activation": {
279 | "stype": "lookup",
280 | "key": "hidden_activation",
281 | "dtype": "string"
282 | },
283 | "model_modeling_answer_dropout": {
284 | "stype": "lookup",
285 | "key": "dropout",
286 | "scale": 1.0,
287 | "shift": 0.0,
288 | "dtype": "float"
289 | },
290 | "model_modeling_answer_attention_dim": {
291 | "stype": "lookup",
292 | "key": "unit_dim",
293 | "scale": 2.0,
294 | "shift": 0.0,
295 | "dtype": "int"
296 | },
297 | "model_modeling_answer_score_type": {
298 | "stype": "lookup",
299 | "key": "score_type",
300 | "dtype": "string"
301 | },
302 | "model_modeling_answer_attention_enable": {
303 | "stype": "discrete",
304 | "set": [false],
305 | "dtype": "boolean"
306 | },
307 | "model_modeling_fusion_type": {
308 | "stype": "discrete",
309 | "set": ["concate"],
310 | "dtype": "string"
311 | },
312 | "model_modeling_fusion_num_layer": {
313 | "stype": "discrete",
314 | "set": [1],
315 | "dtype": "int"
316 | },
317 | "model_modeling_fusion_unit_dim": {
318 | "stype": "lookup",
319 | "key": "unit_dim",
320 | "scale": 2.0,
321 | "shift": 0.0,
322 | "dtype": "int"
323 | },
324 | "model_modeling_fusion_hidden_activation": {
325 | "stype": "discrete",
326 | "set": ["relu"],
327 | "dtype": "string"
328 | },
329 | "model_modeling_fusion_dropout": {
330 | "stype": "lookup",
331 | "key": "dropout",
332 | "scale": 1.0,
333 | "shift": 0.0,
334 | "dtype": "float"
335 | },
336 | "model_output_answer_start_num_layer": {
337 | "stype": "lookup",
338 | "key": "num_layer",
339 | "dtype": "int"
340 | },
341 | "model_output_answer_start_unit_dim": {
342 | "stype": "lookup",
343 | "key": "unit_dim",
344 | "scale": 1.0,
345 | "shift": 0.0,
346 | "dtype": "int"
347 | },
348 | "model_output_answer_start_cell_type": {
349 | "stype": "lookup",
350 | "key": "cell_type",
351 | "dtype": "string"
352 | },
353 | "model_output_answer_start_hidden_activation": {
354 | "stype": "lookup",
355 | "key": "hidden_activation",
356 | "dtype": "string"
357 | },
358 | "model_output_answer_start_dropout": {
359 | "stype": "lookup",
360 | "key": "dropout",
361 | "scale": 1.0,
362 | "shift": 0.0,
363 | "dtype": "float"
364 | },
365 | "model_output_answer_end_num_layer": {
366 | "stype": "lookup",
367 | "key": "num_layer",
368 | "dtype": "int"
369 | },
370 | "model_output_answer_end_unit_dim": {
371 | "stype": "lookup",
372 | "key": "unit_dim",
373 | "scale": 1.0,
374 | "shift": 0.0,
375 | "dtype": "int"
376 | },
377 | "model_output_answer_end_cell_type": {
378 | "stype": "lookup",
379 | "key": "cell_type",
380 | "dtype": "string"
381 | },
382 | "model_output_answer_end_hidden_activation": {
383 | "stype": "lookup",
384 | "key": "hidden_activation",
385 | "dtype": "string"
386 | },
387 | "model_output_answer_end_dropout": {
388 | "stype": "lookup",
389 | "key": "dropout",
390 | "scale": 1.0,
391 | "shift": 0.0,
392 | "dtype": "float"
393 | }
394 | },
395 | "variables": {
396 | "embed_dim": {
397 | "stype": "discrete",
398 | "set": [8, 16, 32, 64],
399 | "dtype": "int"
400 | },
401 | "window_size": {
402 | "stype": "discrete",
403 | "set": [[3], [5], [7]],
404 | "dtype": "list"
405 | },
406 | "pooling_type": {
407 | "stype": "discrete",
408 | "set": ["max"],
409 | "dtype": "string"
410 | },
411 | "num_layer": {
412 | "stype": "discrete",
413 | "set": [1, 2, 3, 4],
414 | "dtype": "int"
415 | },
416 | "unit_dim": {
417 | "stype": "uniform",
418 | "range": [50, 200],
419 | "dtype": "int"
420 | },
421 | "hidden_activation": {
422 | "stype": "discrete",
423 | "set": ["tanh", "relu"],
424 | "dtype": "string"
425 | },
426 | "cell_type": {
427 | "stype": "discrete",
428 | "set": ["lstm", "gru"],
429 | "dtype": "string"
430 | },
431 | "dropout": {
432 | "stype": "uniform",
433 | "range": [0.0, 0.5],
434 | "dtype": "float"
435 | },
436 | "score_type": {
437 | "stype": "discrete",
438 | "set": ["scaled_dot", "triliear"],
439 | "dtype": "string"
440 | }
441 | }
442 | }
--------------------------------------------------------------------------------
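One detail worth noting in the BiDAF template: several dimensions look up the same shared unit_dim variable with different "scale" factors, e.g. the attention dimensions use "scale": 2.0 (presumably matching the doubled output width of the bidirectional recurrent encoders) and the interaction fusion width uses "scale": 4.0, so a single draw of unit_dim keeps every dependent layer width consistent. A toy illustration of the resolution, assuming unit_dim has been drawn once for the trial:

import random

unit_dim = random.randint(50, 200)           # one shared draw of "unit_dim"
attention_dim = int(unit_dim * 2.0 + 0.0)    # "lookup" with scale 2.0, shift 0.0
fusion_unit_dim = int(unit_dim * 4.0 + 0.0)  # "lookup" with scale 4.0, shift 0.0
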
/reading_comprehension/model/base_model.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import os.path
3 |
4 | import numpy as np
5 | import tensorflow as tf
6 |
7 | from util.default_util import *
8 | from util.reading_comprehension_util import *
9 | from util.layer_util import *
10 |
11 | __all__ = ["TrainResult", "InferResult", "BaseModel"]
12 |
13 | class TrainResult(collections.namedtuple("TrainResult",
14 | ("loss", "learning_rate", "global_step", "batch_size", "summary"))):
15 | pass
16 |
17 | class InferResult(collections.namedtuple("InferResult",
18 | ("predict", "predict_detail", "batch_size", "summary"))):
19 | pass
20 |
21 | class BaseModel(object):
22 | """reading comprehension base model"""
23 | def __init__(self,
24 | logger,
25 | hyperparams,
26 | data_pipeline,
27 | external_data,
28 | mode="train",
29 | scope="base"):
30 | """initialize mrc base model"""
31 | self.logger = logger
32 | self.hyperparams = hyperparams
33 | self.data_pipeline = data_pipeline
34 | self.mode = mode
35 | self.scope = scope
36 |
37 | self.update_op = None
38 | self.train_loss = None
39 | self.learning_rate = None
40 | self.global_step = None
41 | self.train_summary = None
42 | self.infer_answer_start = None
43 | self.infer_answer_start_mask = None
44 | self.infer_answer_end = None
45 | self.infer_answer_end_mask = None
46 | self.infer_summary = None
47 |
48 | self.word_embedding = external_data["word_embedding"] if external_data is not None and "word_embedding" in external_data else None
49 | self.batch_size = tf.size(tf.reduce_max(self.data_pipeline.input_answer_mask, axis=-2))
50 |
51 | self.num_gpus = self.hyperparams.device_num_gpus
52 | self.default_gpu_id = self.hyperparams.device_default_gpu_id
53 | self.logger.log_print("# {0} gpus are used with default gpu id set as {1}"
54 | .format(self.num_gpus, self.default_gpu_id))
55 |
56 | if self.hyperparams.train_regularization_enable == True:
57 | self.regularizer = create_weight_regularizer(self.hyperparams.train_regularization_type,
58 | self.hyperparams.train_regularization_scale)
59 | else:
60 | self.regularizer = None
61 |
62 | self.random_seed = self.hyperparams.train_random_seed if self.hyperparams.train_enable_debugging else None
63 |
64 | def _create_fusion_layer(self,
65 | input_unit_dim,
66 | output_unit_dim,
67 | fusion_type,
68 | num_layer,
69 | hidden_activation,
70 | dropout,
71 | num_gpus,
72 | default_gpu_id,
73 | regularizer,
74 | random_seed,
75 | trainable):
76 | """create fusion layer for mrc base model"""
77 | with tf.variable_scope("fusion", reuse=tf.AUTO_REUSE):
78 | if fusion_type == "concate":
79 | fusion_layer_list = []
80 | if input_unit_dim != output_unit_dim:
81 | convert_layer = create_convolution_layer("1d", 1, input_unit_dim,
82 | output_unit_dim, 1, 1, 1, "SAME", None, [0.0], None, False, False, False,
83 | num_gpus, default_gpu_id, regularizer, random_seed, trainable)
84 | fusion_layer_list.append(convert_layer)
85 | elif fusion_type == "dense":
86 | fusion_layer = create_dense_layer("single", num_layer, output_unit_dim, 1, hidden_activation,
87 | [dropout] * num_layer, None, False, False, False, num_gpus, default_gpu_id, regularizer, random_seed, trainable)
88 | fusion_layer_list = [fusion_layer]
89 | elif fusion_type == "highway":
90 | fusion_layer_list = []
91 | if input_unit_dim != output_unit_dim:
92 | convert_layer = create_convolution_layer("1d", 1, input_unit_dim,
93 | output_unit_dim, 1, 1, 1, "SAME", None, [0.0], None, False, False, False,
94 | num_gpus, default_gpu_id, regularizer, random_seed, trainable)
95 | fusion_layer_list.append(convert_layer)
96 |
97 | fusion_layer = create_highway_layer(num_layer, output_unit_dim, hidden_activation,
98 | [dropout] * num_layer, num_gpus, default_gpu_id, regularizer, random_seed, trainable)
99 | fusion_layer_list.append(fusion_layer)
100 | elif fusion_type == "conv":
101 | fusion_layer = create_convolution_layer("1d", num_layer, input_unit_dim,
102 | output_unit_dim, 1, 1, 1, "SAME", hidden_activation, [dropout] * num_layer,
103 | None, False, False, False, num_gpus, default_gpu_id, regularizer, random_seed, trainable)
104 | fusion_layer_list = [fusion_layer]
105 | else:
106 | raise ValueError("unsupported fusion type {0}".format(fusion_type))
107 |
108 | return fusion_layer_list
109 |
110 | def _build_fusion_result(self,
111 | input_data_list,
112 | input_mask_list,
113 | fusion_layer_list):
114 | """build fusion result for mrc base model"""
115 | input_fusion = tf.concat(input_data_list, axis=-1)
116 | input_fusion_mask = tf.reduce_max(tf.concat(input_mask_list, axis=-1), axis=-1, keepdims=True)
117 |
118 | if fusion_layer_list != None:
119 | for fusion_layer in fusion_layer_list:
120 | input_fusion, input_fusion_mask = fusion_layer(input_fusion, input_fusion_mask)
121 |
122 | return input_fusion, input_fusion_mask
123 |
124 | def _get_exponential_moving_average(self,
125 | num_steps):
126 | decay_rate = self.hyperparams.train_ema_decay_rate
127 | enable_debias = self.hyperparams.train_ema_enable_debias
128 | enable_dynamic_decay = self.hyperparams.train_ema_enable_dynamic_decay
129 |
130 | if enable_dynamic_decay == True:
131 | ema = tf.train.ExponentialMovingAverage(decay=decay_rate, num_updates=num_steps, zero_debias=enable_debias)
132 | else:
133 | ema = tf.train.ExponentialMovingAverage(decay=decay_rate, zero_debias=enable_debias)
134 |
135 | return ema
136 |
137 | def _apply_learning_rate_warmup(self,
138 | learning_rate):
139 | """apply learning rate warmup"""
140 | warmup_mode = self.hyperparams.train_optimizer_warmup_mode
141 | warmup_rate = self.hyperparams.train_optimizer_warmup_rate
142 | warmup_end_step = self.hyperparams.train_optimizer_warmup_end_step
143 |
144 | if warmup_mode == "exponential_warmup":
145 | warmup_factor = warmup_rate ** (1 - tf.to_float(self.global_step) / tf.to_float(warmup_end_step))
146 | warmup_learning_rate = warmup_factor * learning_rate
147 | elif warmup_mode == "inverse_exponential_warmup":
148 | warmup_factor = tf.log(tf.to_float(self.global_step + 1)) / tf.log(tf.to_float(warmup_end_step))
149 | warmup_learning_rate = warmup_factor * learning_rate
150 | else:
151 | raise ValueError("unsupported warm-up mode {0}".format(warmup_mode))
152 |
153 | warmup_learning_rate = tf.cond(tf.less(self.global_step, warmup_end_step),
154 | lambda: warmup_learning_rate, lambda: learning_rate)
155 |
156 | return warmup_learning_rate
157 |
158 | def _apply_learning_rate_decay(self,
159 | learning_rate):
160 | """apply learning rate decay"""
161 | decay_mode = self.hyperparams.train_optimizer_decay_mode
162 | decay_rate = self.hyperparams.train_optimizer_decay_rate
163 | decay_step = self.hyperparams.train_optimizer_decay_step
164 | decay_start_step = self.hyperparams.train_optimizer_decay_start_step
165 |
166 | if decay_mode == "exponential_decay":
167 | decayed_learning_rate = tf.train.exponential_decay(learning_rate=learning_rate,
168 | global_step=(self.global_step - decay_start_step),
169 | decay_steps=decay_step, decay_rate=decay_rate, staircase=True)
170 | elif decay_mode == "inverse_time_decay":
171 | decayed_learning_rate = tf.train.inverse_time_decay(learning_rate=learning_rate,
172 | global_step=(self.global_step - decay_start_step),
173 | decay_steps=decay_step, decay_rate=decay_rate, staircase=True)
174 | else:
175 | raise ValueError("unsupported decay mode {0}".format(decay_mode))
176 |
177 | decayed_learning_rate = tf.cond(tf.less(self.global_step, decay_start_step),
178 | lambda: learning_rate, lambda: decayed_learning_rate)
179 |
180 | return decayed_learning_rate
181 |
182 | def _initialize_optimizer(self,
183 | learning_rate):
184 | """initialize optimizer"""
185 | optimizer_type = self.hyperparams.train_optimizer_type
186 | if optimizer_type == "sgd":
187 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
188 | elif optimizer_type == "momentum":
189 | optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
190 | momentum=self.hyperparams.train_optimizer_momentum_beta)
191 | elif optimizer_type == "rmsprop":
192 | optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
193 | decay=self.hyperparams.train_optimizer_rmsprop_beta,
194 | epsilon=self.hyperparams.train_optimizer_rmsprop_epsilon)
195 | elif optimizer_type == "adadelta":
196 | optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate,
197 | rho=self.hyperparams.train_optimizer_adadelta_rho,
198 | epsilon=self.hyperparams.train_optimizer_adadelta_epsilon)
199 | elif optimizer_type == "adagrad":
200 | optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate,
201 | initial_accumulator_value=self.hyperparams.train_optimizer_adagrad_init_accumulator)
202 | elif optimizer_type == "adam":
203 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
204 | beta1=self.hyperparams.train_optimizer_adam_beta_1, beta2=self.hyperparams.train_optimizer_adam_beta_2,
205 | epsilon=self.hyperparams.train_optimizer_adam_epsilon)
206 | else:
207 | raise ValueError("unsupported optimizer type {0}".format(optimizer_type))
208 |
209 | return optimizer
210 |
211 | def _minimize_loss(self,
212 | loss):
213 | """minimize optimization loss"""
214 | """compute gradients"""
215 | if self.num_gpus > 1:
216 | grads_and_vars = self.optimizer.compute_gradients(loss, colocate_gradients_with_ops=True)
217 | else:
218 | grads_and_vars = self.optimizer.compute_gradients(loss, colocate_gradients_with_ops=False)
219 |
220 | """clip gradients"""
221 | gradients = [x[0] for x in grads_and_vars]
222 | variables = [x[1] for x in grads_and_vars]
223 | clipped_gradients, gradient_norm = tf.clip_by_global_norm(gradients, self.hyperparams.train_clip_norm)
224 | grads_and_vars = zip(clipped_gradients, variables)
225 |
226 | """update model based on gradients"""
227 | update_model = self.optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
228 |
229 | return update_model, clipped_gradients, gradient_norm
230 |
231 | def train(self,
232 | sess):
233 | """train model"""
234 | _, loss, learning_rate, global_step, batch_size, summary = sess.run([self.update_op,
235 | self.train_loss, self.decayed_learning_rate, self.global_step, self.batch_size, self.train_summary])
236 |
237 | return TrainResult(loss=loss, learning_rate=learning_rate,
238 | global_step=global_step, batch_size=batch_size, summary=summary)
239 |
240 | def infer(self,
241 | sess):
242 | """infer model"""
243 | (answer_start, answer_end, answer_start_mask, answer_end_mask,
244 | batch_size, summary) = sess.run([self.infer_answer_start, self.infer_answer_end,
245 | self.infer_answer_start_mask, self.infer_answer_end_mask, self.batch_size, self.infer_summary])
246 |
247 | max_context_length = self.hyperparams.data_max_context_length
248 | max_answer_length = self.hyperparams.data_max_answer_length
249 |
250 | predict_start = np.expand_dims(answer_start[:, :max_context_length], axis=-1)
251 | predict_start_mask = np.expand_dims(answer_start_mask[:, :max_context_length], axis=-1)
252 | predict_start = predict_start * predict_start_mask
253 | predict_end = np.expand_dims(answer_end[:, :max_context_length], axis=-1)
254 | predict_end_mask = np.expand_dims(answer_end_mask[:, :max_context_length], axis=-1)
255 | predict_end = predict_end * predict_end_mask
256 |
257 | predict_span = np.matmul(predict_start, predict_end.transpose((0,2,1)))
258 | predict_span_mask = np.matmul(predict_start_mask, predict_end_mask.transpose((0,2,1)))
259 | predict_span = predict_span * predict_span_mask
260 |
261 | predict = np.full((batch_size, 2), -1)
262 | for k in range(batch_size):
263 | max_prob = float('-inf')
264 | max_prob_start = -1
265 | max_prob_end = -1
266 | for i in range(max_context_length):
267 | for j in range(i, min(max_context_length, i+max_answer_length)):
268 | if predict_span[k, i, j] > max_prob:
269 | max_prob = predict_span[k, i, j]
270 | max_prob_start = i
271 | max_prob_end = j
272 |
273 | predict[k, 0] = max_prob_start
274 | predict[k, 1] = max_prob_end
275 |
276 | predict_detail = np.concatenate((predict_start, predict_end), axis=-1)
277 |
278 | return InferResult(predict=predict, predict_detail=predict_detail, batch_size=batch_size, summary=summary)
279 |
280 | def _get_train_summary(self):
281 | """get train summary"""
282 | return tf.summary.merge([tf.summary.scalar("learning_rate", self.learning_rate),
283 | tf.summary.scalar("train_loss", self.train_loss), tf.summary.scalar("gradient_norm", self.gradient_norm)])
284 |
285 | def _get_infer_summary(self):
286 | """get infer summary"""
287 | return tf.no_op()
288 |
--------------------------------------------------------------------------------
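The nested loop in BaseModel.infer scores every admissible (start, end) pair with start <= end < start + max_answer_length and keeps the argmax per example, which is O(batch × length × max_answer_length) in pure Python. A vectorized numpy sketch of the same search, assuming the (batch, length, length) predict_span matrix built above:

import numpy as np

def decode_spans(predict_span, max_answer_length):
    """find the best (start, end) pair per example within the allowed band"""
    batch, length, _ = predict_span.shape
    # admissible spans: start <= end < start + max_answer_length
    band = np.triu(np.ones((length, length), dtype=bool))
    band &= ~np.triu(np.ones((length, length), dtype=bool), k=max_answer_length)
    scores = np.where(band, predict_span, -np.inf)
    flat = scores.reshape(batch, -1).argmax(axis=-1)
    return np.stack([flat // length, flat % length], axis=-1)  # shape (batch, 2)

Unlike the loop, this returns (0, 0) rather than the (-1, -1) sentinel when every admissible score is -inf, so a caller relying on that sentinel would need an extra check.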