├── .DS_Store ├── Experiment 1 ├── .DS_Store ├── basic │ ├── .DS_Store │ ├── __init__.py │ ├── cli.py │ ├── ensemble.py │ ├── ensemble_fast.py │ ├── evaluator.py │ ├── graph_handler.py │ ├── main.py │ ├── model.py │ ├── read_data.py │ ├── run_ensemble.sh │ ├── run_single.sh │ ├── templates │ │ └── visualizer.html │ ├── trainer.py │ └── visualizer.py ├── basic_cnn │ ├── __init__.py │ ├── cli.py │ ├── evaluator.py │ ├── graph_handler.py │ ├── main.py │ ├── model.py │ ├── read_data.py │ ├── superhighway.py │ ├── templates │ │ └── visualizer.html │ ├── trainer.py │ └── visualizer.py ├── cnn_dm │ ├── __init__.py │ ├── eda.ipynb │ ├── evaluate.py │ └── prepro.py ├── download.sh ├── my │ ├── __init__.py │ ├── corenlp_interface.py │ ├── nltk_utils.py │ ├── tensorflow │ │ ├── __init__.py │ │ ├── general.py │ │ ├── nn.py │ │ ├── rnn.py │ │ └── rnn_cell.py │ ├── utils.py │ └── zip_save.py ├── requirements.txt ├── squad │ ├── __init__.py │ ├── aug_squad.py │ ├── eda_aug_dev.ipynb │ ├── eda_aug_train.ipynb │ ├── evaluate-v1.1.py │ ├── evaluate.py │ ├── prepro.py │ ├── prepro_aug.py │ └── utils.py └── tree │ ├── __init__.py │ ├── cli.py │ ├── evaluator.py │ ├── graph_handler.py │ ├── main.py │ ├── model.py │ ├── read_data.py │ ├── templates │ └── visualizer.html │ ├── test.ipynb │ ├── trainer.py │ └── visualizer.py ├── Experiment 2 ├── .DS_Store ├── code │ ├── __init__.py │ ├── docker │ │ └── Dockerfile │ ├── evaluate.py │ ├── evaluate.pyc │ ├── get_started.sh │ ├── plotHistograms.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── dwr.py │ │ ├── squad_preprocess.py │ │ └── squad_preprocess.pyc │ ├── qa_answer.py │ ├── qa_bidaf_model.py │ ├── qa_bidaf_model.pyc │ ├── qa_data.py │ ├── qa_model.py │ ├── qa_model.pyc │ ├── requirements.txt │ ├── train.py │ ├── util.py │ └── util.pyc └── train │ ├── .DS_Store │ └── BiDAF_3 │ ├── .DS_Store │ ├── checkpoint │ ├── training_scores.txt │ └── validation_scores.txt └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/.DS_Store -------------------------------------------------------------------------------- /Experiment 1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 1/.DS_Store -------------------------------------------------------------------------------- /Experiment 1/basic/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 1/basic/.DS_Store -------------------------------------------------------------------------------- /Experiment 1/basic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 1/basic/__init__.py -------------------------------------------------------------------------------- /Experiment 1/basic/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tensorflow as tf 4 | 5 | from basic.main import main as m 6 | 7 | flags = tf.app.flags 8 | 9 | # Names and directories 10 | flags.DEFINE_string("model_name", "basic", "Model name [basic]") 11 | 
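# (Editor's note -- not part of the original file.) basic/cli.py configures everything
# through the tf.app.flags API of TensorFlow 0.x/1.x. A minimal, hedged sketch of the
# pattern (standalone illustration, not code from this repo):
#
#     import tensorflow as tf
#     flags = tf.app.flags
#     flags.DEFINE_string("mode", "test", "train | test | forward [test]")
#     def main(_):
#         config = flags.FLAGS      # parsed flag values, e.g. config.mode
#         print(config.mode)
#     if __name__ == "__main__":
#         tf.app.run()              # parses sys.argv into FLAGS, then calls main
#
# Every DEFINE_* default below can therefore be overridden on the command line, e.g.
# `python3 -m basic.cli --mode train --num_gpus 2`; the bracketed value in each help
# string documents the default.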
flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]") 12 | flags.DEFINE_string("run_id", "0", "Run ID [0]") 13 | flags.DEFINE_string("out_base_dir", "out", "out base dir [out]") 14 | flags.DEFINE_string("forward_name", "single", "Forward name [single]") 15 | flags.DEFINE_string("answer_path", "", "Answer path []") 16 | flags.DEFINE_string("eval_path", "", "Eval path []") 17 | flags.DEFINE_string("load_path", "", "Load path []") 18 | flags.DEFINE_string("shared_path", "", "Shared path []") 19 | 20 | # Device placement 21 | flags.DEFINE_string("device", "/cpu:0", "default device for summing gradients. [/cpu:0]") 22 | flags.DEFINE_string("device_type", "gpu", "device for computing gradients (parallelization). cpu | gpu [gpu]") 23 | flags.DEFINE_integer("num_gpus", 1, "num of gpus or cpus for computing gradients [1]") 24 | 25 | # Essential training and test options 26 | flags.DEFINE_string("mode", "test", "trains | test | forward [test]") 27 | flags.DEFINE_boolean("load", True, "load saved data? [True]") 28 | flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]") 29 | flags.DEFINE_boolean("debug", False, "Debugging mode? [False]") 30 | flags.DEFINE_bool('load_ema', True, "load exponential average of variables when testing? [True]") 31 | flags.DEFINE_bool("eval", True, "eval? [True]") 32 | 33 | # Training / test parameters 34 | flags.DEFINE_integer("batch_size", 60, "Batch size [60]") 35 | flags.DEFINE_integer("val_num_batches", 100, "validation num batches [100]") 36 | flags.DEFINE_integer("test_num_batches", 0, "test num batches [0]") 37 | flags.DEFINE_integer("num_epochs", 12, "Total number of epochs for training [12]") 38 | flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]") 39 | flags.DEFINE_integer("load_step", 0, "load step [0]") 40 | flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]") 41 | flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob for the dropout of LSTM weights [0.8]") 42 | flags.DEFINE_float("keep_prob", 0.8, "Keep prob for the dropout of Char-CNN weights [0.8]") 43 | flags.DEFINE_float("wd", 0.0, "L2 weight decay for regularization [0.0]") 44 | flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]") 45 | flags.DEFINE_integer("char_out_size", 100, "char-level word embedding size [100]") 46 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]") 47 | flags.DEFINE_string("out_channel_dims", "100", "Out channel dims of Char-CNN, separated by commas [100]") 48 | flags.DEFINE_string("filter_heights", "5", "Filter heights of Char-CNN, separated by commas [5]") 49 | flags.DEFINE_bool("finetune", False, "Finetune word embeddings? [False]") 50 | flags.DEFINE_bool("highway", True, "Use highway? [True]") 51 | flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]") 52 | flags.DEFINE_bool("share_cnn_weights", True, "Share Char-CNN weights [True]") 53 | flags.DEFINE_bool("share_lstm_weights", True, "Share pre-processing (phrase-level) LSTM weights [True]") 54 | flags.DEFINE_float("var_decay", 0.999, "Exponential moving average decay for variables [0.999]") 55 | 56 | # Optimizations 57 | flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]") 58 | flags.DEFINE_bool("len_opt", False, "Length optimization? [False]") 59 | flags.DEFINE_bool("cpu_opt", False, "CPU optimization? GPU computation can be slower [False]") 60 | 61 | # Logging and saving options 62 | flags.DEFINE_boolean("progress", True, "Show progress? 
[True]") 63 | flags.DEFINE_integer("log_period", 100, "Log period [100]") 64 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]") 65 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]") 66 | flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]") 67 | flags.DEFINE_bool("dump_eval", True, "dump eval? [True]") 68 | flags.DEFINE_bool("dump_answer", True, "dump answer? [True]") 69 | flags.DEFINE_bool("vis", False, "output visualization numbers? [False]") 70 | flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]") 71 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay for logging values [0.9]") 72 | 73 | # Thresholds for speed and less memory usage 74 | flags.DEFINE_integer("word_count_th", 10, "word count th [100]") 75 | flags.DEFINE_integer("char_count_th", 50, "char count th [500]") 76 | flags.DEFINE_integer("sent_size_th", 400, "sent size th [64]") 77 | flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]") 78 | flags.DEFINE_integer("ques_size_th", 30, "ques size th [32]") 79 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]") 80 | flags.DEFINE_integer("para_size_th", 256, "para size th [256]") 81 | 82 | # Advanced training options 83 | flags.DEFINE_bool("lower_word", True, "lower word [True]") 84 | flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]") 85 | flags.DEFINE_bool("swap_memory", True, "swap memory? [True]") 86 | flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]") 87 | flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [False]") 88 | flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [False]") 89 | flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]") 90 | flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]") 91 | flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]") 92 | 93 | # Ablation options 94 | flags.DEFINE_bool("use_char_emb", True, "use char emb? [True]") 95 | flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]") 96 | flags.DEFINE_bool("q2c_att", True, "question-to-context attention? [True]") 97 | flags.DEFINE_bool("c2q_att", True, "context-to-question attention? 
[True]") 98 | flags.DEFINE_bool("dynamic_att", False, "Dynamic attention [False]") 99 | 100 | 101 | def main(_): 102 | config = flags.FLAGS 103 | 104 | config.out_dir = os.path.join(config.out_base_dir, config.model_name, str(config.run_id).zfill(2)) 105 | 106 | m(config) 107 | 108 | if __name__ == "__main__": 109 | tf.app.run() 110 | -------------------------------------------------------------------------------- /Experiment 1/basic/ensemble.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import gzip 4 | import json 5 | import pickle 6 | from collections import defaultdict 7 | from operator import mul 8 | 9 | from tqdm import tqdm 10 | from squad.utils import get_phrase, get_best_span 11 | 12 | 13 | def get_args(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('paths', nargs='+') 16 | parser.add_argument('-o', '--out', default='ensemble.json') 17 | parser.add_argument("--data_path", default="data/squad/data_test.json") 18 | parser.add_argument("--shared_path", default="data/squad/shared_test.json") 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | def ensemble(args): 24 | e_list = [] 25 | for path in tqdm(args.paths): 26 | with gzip.open(path, 'r') as fh: 27 | e = pickle.load(fh) 28 | e_list.append(e) 29 | 30 | with open(args.data_path, 'r') as fh: 31 | data = json.load(fh) 32 | 33 | with open(args.shared_path, 'r') as fh: 34 | shared = json.load(fh) 35 | 36 | out = {} 37 | for idx, (id_, rx) in tqdm(enumerate(zip(data['ids'], data['*x'])), total=len(e['yp'])): 38 | if idx >= len(e['yp']): 39 | # for debugging purpose 40 | break 41 | context = shared['p'][rx[0]][rx[1]] 42 | wordss = shared['x'][rx[0]][rx[1]] 43 | yp_list = [e['yp'][idx] for e in e_list] 44 | yp2_list = [e['yp2'][idx] for e in e_list] 45 | answer = ensemble3(context, wordss, yp_list, yp2_list) 46 | out[id_] = answer 47 | 48 | with open(args.out, 'w') as fh: 49 | json.dump(out, fh) 50 | 51 | 52 | def ensemble1(context, wordss, y1_list, y2_list): 53 | """ 54 | 55 | :param context: Original context 56 | :param wordss: tokenized words (nested 2D list) 57 | :param y1_list: list of start index probs (each element corresponds to probs form single model) 58 | :param y2_list: list of stop index probs 59 | :return: 60 | """ 61 | sum_y1 = combine_y_list(y1_list) 62 | sum_y2 = combine_y_list(y2_list) 63 | span, score = get_best_span(sum_y1, sum_y2) 64 | return get_phrase(context, wordss, span) 65 | 66 | 67 | def ensemble2(context, wordss, y1_list, y2_list): 68 | start_dict = defaultdict(float) 69 | stop_dict = defaultdict(float) 70 | for y1, y2 in zip(y1_list, y2_list): 71 | span, score = get_best_span(y1, y2) 72 | start_dict[span[0]] += y1[span[0][0]][span[0][1]] 73 | stop_dict[span[1]] += y2[span[1][0]][span[1][1]] 74 | start = max(start_dict.items(), key=lambda pair: pair[1])[0] 75 | stop = max(stop_dict.items(), key=lambda pair: pair[1])[0] 76 | best_span = (start, stop) 77 | return get_phrase(context, wordss, best_span) 78 | 79 | 80 | def ensemble3(context, wordss, y1_list, y2_list): 81 | d = defaultdict(float) 82 | for y1, y2 in zip(y1_list, y2_list): 83 | span, score = get_best_span(y1, y2) 84 | phrase = get_phrase(context, wordss, span) 85 | d[phrase] += score 86 | return max(d.items(), key=lambda pair: pair[1])[0] 87 | 88 | 89 | def combine_y_list(y_list, op='*'): 90 | if op == '+': 91 | func = sum 92 | elif op == '*': 93 | def func(l): return functools.reduce(mul, l) 94 | else: 95 | func = op 96 | return 
[[func(yij_list) for yij_list in zip(*yi_list)] for yi_list in zip(*y_list)] 97 | 98 | 99 | def main(): 100 | args = get_args() 101 | ensemble(args) 102 | 103 | if __name__ == "__main__": 104 | main() 105 | 106 | 107 | -------------------------------------------------------------------------------- /Experiment 1/basic/ensemble_fast.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from collections import Counter, defaultdict 4 | import re 5 | 6 | def key_func(pair): 7 | return pair[1] 8 | 9 | 10 | def get_func(vals, probs): 11 | counter = Counter(vals) 12 | # return max(zip(vals, probs), key=lambda pair: pair[1])[0] 13 | # return max(zip(vals, probs), key=lambda pair: pair[1] * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0] 14 | # return max(zip(vals, probs), key=lambda pair: pair[1] + 0.7 * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0] 15 | d = defaultdict(float) 16 | for val, prob in zip(vals, probs): 17 | d[val] += prob 18 | d[''] = 0 19 | return max(d.items(), key=lambda pair: pair[1])[0] 20 | 21 | third_path = sys.argv[1] 22 | other_paths = sys.argv[2:] 23 | 24 | others = [json.load(open(path, 'r')) for path in other_paths] 25 | 26 | 27 | c = {} 28 | 29 | assert min(map(len, others)) == max(map(len, others)), list(map(len, others)) 30 | 31 | for key in others[0].keys(): 32 | if key == 'scores': 33 | continue 34 | probs = [other['scores'][key] for other in others] 35 | vals = [other[key] for other in others] 36 | largest_val = get_func(vals, probs) 37 | c[key] = largest_val 38 | 39 | json.dump(c, open(third_path, 'w')) -------------------------------------------------------------------------------- /Experiment 1/basic/graph_handler.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from json import encoder 4 | import os 5 | 6 | import tensorflow as tf 7 | 8 | from basic.evaluator import Evaluation, F1Evaluation 9 | from my.utils import short_floats 10 | 11 | import pickle 12 | 13 | 14 | class GraphHandler(object): 15 | def __init__(self, config, model): 16 | self.config = config 17 | self.model = model 18 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep) 19 | self.writer = None 20 | self.save_path = os.path.join(config.save_dir, config.model_name) 21 | 22 | def initialize(self, sess): 23 | sess.run(tf.initialize_all_variables()) 24 | if self.config.load: 25 | self._load(sess) 26 | 27 | if self.config.mode == 'train': 28 | self.writer = tf.train.SummaryWriter(self.config.log_dir, graph=tf.get_default_graph()) 29 | 30 | def save(self, sess, global_step=None): 31 | saver = tf.train.Saver(max_to_keep=self.config.max_to_keep) 32 | saver.save(sess, self.save_path, global_step=global_step) 33 | 34 | def _load(self, sess): 35 | config = self.config 36 | vars_ = {var.name.split(":")[0]: var for var in tf.all_variables()} 37 | if config.load_ema: 38 | ema = self.model.var_ema 39 | for var in tf.trainable_variables(): 40 | del vars_[var.name.split(":")[0]] 41 | vars_[ema.average_name(var)] = var 42 | saver = tf.train.Saver(vars_, max_to_keep=config.max_to_keep) 43 | 44 | if config.load_path: 45 | save_path = config.load_path 46 | elif config.load_step > 0: 47 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step)) 48 | else: 49 | save_dir = config.save_dir 50 | checkpoint = tf.train.get_checkpoint_state(save_dir) 51 | assert checkpoint is not None, "cannot load 
checkpoint at {}".format(save_dir) 52 | save_path = checkpoint.model_checkpoint_path 53 | print("Loading saved model from {}".format(save_path)) 54 | saver.restore(sess, save_path) 55 | 56 | def add_summary(self, summary, global_step): 57 | self.writer.add_summary(summary, global_step) 58 | 59 | def add_summaries(self, summaries, global_step): 60 | for summary in summaries: 61 | self.add_summary(summary, global_step) 62 | 63 | def dump_eval(self, e, precision=2, path=None): 64 | assert isinstance(e, Evaluation) 65 | if self.config.dump_pickle: 66 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6))) 67 | with gzip.open(path, 'wb', compresslevel=3) as fh: 68 | pickle.dump(e.dict, fh) 69 | else: 70 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))) 71 | with open(path, 'w') as fh: 72 | json.dump(short_floats(e.dict, precision), fh) 73 | 74 | def dump_answer(self, e, path=None): 75 | assert isinstance(e, Evaluation) 76 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))) 77 | with open(path, 'w') as fh: 78 | json.dump(e.id2answer_dict, fh) 79 | 80 | -------------------------------------------------------------------------------- /Experiment 1/basic/run_ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source_path=$1 3 | target_path=$2 4 | inter_dir="inter_ensemble" 5 | root_dir="save" 6 | 7 | parg="" 8 | marg="" 9 | if [ "$3" = "debug" ] 10 | then 11 | parg="-d" 12 | marg="--debug" 13 | fi 14 | 15 | # Preprocess data 16 | python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir . 17 | 18 | eargs="" 19 | for num in 31 33 34 35 36 37 40 41 43 44 45 46; do 20 | load_path="$root_dir/$num/save" 21 | shared_path="$root_dir/$num/shared.json" 22 | eval_path="$inter_dir/eval-$num.json" 23 | eargs="$eargs $eval_path" 24 | python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema & 25 | done 26 | wait 27 | 28 | # Ensemble 29 | python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eargs 30 | -------------------------------------------------------------------------------- /Experiment 1/basic/run_single.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source_path=$1 3 | target_path=$2 4 | inter_dir="inter_single" 5 | root_dir="save" 6 | 7 | parg="" 8 | marg="" 9 | if [ "$3" = "debug" ] 10 | then 11 | parg="-d" 12 | marg="--debug" 13 | fi 14 | 15 | # Preprocess data 16 | python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir . 
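# (Editor's note -- not part of the original script; a hedged summary of the pipeline.)
# The squad.prepro step above writes data_single.json / shared_single.json into $inter_dir.
# The basic.cli step below loads the trained model under $root_dir/$num in forward mode and
# dumps per-question span probabilities to $eval_path; basic.ensemble then converts them
# into an id -> answer-string JSON written to $target_path.
# Usage, as implied by the positional args: ./run_single.sh <input_squad.json> <output_answers.json> [debug]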
17 | 18 | num=37 19 | load_path="$root_dir/$num/save" 20 | shared_path="$root_dir/$num/shared.json" 21 | eval_path="$inter_dir/eval.json" 22 | python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema 23 | 24 | # Ensemble (for single run, just one input) 25 | python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eval_path 26 | 27 | 28 | -------------------------------------------------------------------------------- /Experiment 1/basic/templates/visualizer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 7 | 8 | 19 | 20 | 23 | 24 |

{{ title }}
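<!-- Editor's note (not part of the original template): this page renders one table row per
     question, with columns ID, Question, Answers, Predicted, Score and Paragraph; the
     Paragraph cell nests a per-token table whose paired rows visualize the predicted start
     (yp) and stop (yp2) probabilities loaded from the model's eval dump by basic/visualizer.py. -->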

25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | {% for row in rows %} 35 | 36 | 37 | 42 | 47 | 48 | 49 | 72 | 73 | {% endfor %} 74 |
ID Question Answers Predicted Score Paragraph
{{ row.id }} 38 | {% for qj in row.ques %} 39 | {{ qj }} 40 | {% endfor %} 41 | 43 | {% for aa in row.a %} 44 |
  • {{ aa }}
  • 45 | {% endfor %} 46 |
    {{ row.ap }}{{ row.score }} 50 | 51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %} 52 | 53 | {% set rowloop = loop %} 54 | {% for xjk, ypjk in zip(xj, ypj) %} 55 | 62 | {% endfor %} 63 | 64 | 65 | {% for xjk, yp2jk in zip(xj, yp2j) %} 66 | 67 | {% endfor %} 68 | 69 | {% endfor %} 70 |
    56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %} 57 | {{ xjk }} 58 | {% else %} 59 | {{ xjk }} 60 | {% endif %} 61 |
    -
    71 |
    75 | 76 | -------------------------------------------------------------------------------- /Experiment 1/basic/trainer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from basic.model import Model 4 | from my.tensorflow import average_gradients 5 | 6 | 7 | class Trainer(object): 8 | def __init__(self, config, model): 9 | assert isinstance(model, Model) 10 | self.config = config 11 | self.model = model 12 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr) 13 | self.loss = model.get_loss() 14 | self.var_list = model.get_var_list() 15 | self.global_step = model.get_global_step() 16 | self.summary = model.summary 17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list) 18 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step) 19 | 20 | def get_train_op(self): 21 | return self.train_op 22 | 23 | def step(self, sess, batch, get_summary=False): 24 | assert isinstance(sess, tf.Session) 25 | _, ds = batch 26 | feed_dict = self.model.get_feed_dict(ds, True) 27 | if get_summary: 28 | loss, summary, train_op = \ 29 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict) 30 | else: 31 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict) 32 | summary = None 33 | return loss, summary, train_op 34 | 35 | 36 | class MultiGPUTrainer(object): 37 | def __init__(self, config, models): 38 | model = models[0] 39 | assert isinstance(model, Model) 40 | self.config = config 41 | self.model = model 42 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr) 43 | self.var_list = model.get_var_list() 44 | self.global_step = model.get_global_step() 45 | self.summary = model.summary 46 | self.models = models 47 | losses = [] 48 | grads_list = [] 49 | for gpu_idx, model in enumerate(models): 50 | with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/{}:{}".format(config.device_type, gpu_idx)): 51 | loss = model.get_loss() 52 | grads = self.opt.compute_gradients(loss, var_list=self.var_list) 53 | losses.append(loss) 54 | grads_list.append(grads) 55 | 56 | self.loss = tf.add_n(losses)/len(losses) 57 | self.grads = average_gradients(grads_list) 58 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step) 59 | 60 | def step(self, sess, batches, get_summary=False): 61 | assert isinstance(sess, tf.Session) 62 | feed_dict = {} 63 | for batch, model in zip(batches, self.models): 64 | _, ds = batch 65 | feed_dict.update(model.get_feed_dict(ds, True)) 66 | 67 | if get_summary: 68 | loss, summary, train_op = \ 69 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict) 70 | else: 71 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict) 72 | summary = None 73 | return loss, summary, train_op 74 | -------------------------------------------------------------------------------- /Experiment 1/basic/visualizer.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from collections import OrderedDict 3 | import http.server 4 | import socketserver 5 | import argparse 6 | import json 7 | import os 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from jinja2 import Environment, FileSystemLoader 12 | 13 | from basic.evaluator import get_span_score_pairs 14 | from squad.utils import get_best_span, get_span_score_pairs 15 | 16 | 17 | def bool_(string): 18 | if string == 'True': 19 | return True 20 | elif string == 'False': 21 | return 
False 22 | else: 23 | raise Exception() 24 | 25 | def get_args(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("--model_name", type=str, default='basic') 28 | parser.add_argument("--data_type", type=str, default='dev') 29 | parser.add_argument("--step", type=int, default=5000) 30 | parser.add_argument("--template_name", type=str, default="visualizer.html") 31 | parser.add_argument("--num_per_page", type=int, default=100) 32 | parser.add_argument("--data_dir", type=str, default="data/squad") 33 | parser.add_argument("--port", type=int, default=8000) 34 | parser.add_argument("--host", type=str, default="0.0.0.0") 35 | parser.add_argument("--open", type=str, default='False') 36 | parser.add_argument("--run_id", type=str, default="0") 37 | 38 | args = parser.parse_args() 39 | return args 40 | 41 | 42 | def _decode(decoder, sent): 43 | return " ".join(decoder[idx] for idx in sent) 44 | 45 | 46 | def accuracy2_visualizer(args): 47 | model_name = args.model_name 48 | data_type = args.data_type 49 | num_per_page = args.num_per_page 50 | data_dir = args.data_dir 51 | run_id = args.run_id.zfill(2) 52 | step = args.step 53 | 54 | eval_path =os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6))) 55 | print("loading {}".format(eval_path)) 56 | eval_ = json.load(open(eval_path, 'r')) 57 | 58 | _id = 0 59 | html_dir = "/tmp/list_results%d" % _id 60 | while os.path.exists(html_dir): 61 | _id += 1 62 | html_dir = "/tmp/list_results%d" % _id 63 | 64 | if os.path.exists(html_dir): 65 | shutil.rmtree(html_dir) 66 | os.mkdir(html_dir) 67 | 68 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 69 | templates_dir = os.path.join(cur_dir, 'templates') 70 | env = Environment(loader=FileSystemLoader(templates_dir)) 71 | env.globals.update(zip=zip, reversed=reversed) 72 | template = env.get_template(args.template_name) 73 | 74 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type)) 75 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type)) 76 | print("loading {}".format(data_path)) 77 | data = json.load(open(data_path, 'r')) 78 | print("loading {}".format(shared_path)) 79 | shared = json.load(open(shared_path, 'r')) 80 | 81 | rows = [] 82 | for i, (idx, yi, ypi, yp2i) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2')])), total=len(eval_['idxs'])): 83 | id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss')) 84 | x = shared['x'][rx[0]][rx[1]] 85 | ques = [" ".join(q)] 86 | para = [[word for word in sent] for sent in x] 87 | span = get_best_span(ypi, yp2i) 88 | ap = get_segment(para, span) 89 | score = "{:.3f}".format(ypi[span[0][0]][span[0][1]] * yp2i[span[1][0]][span[1][1]-1]) 90 | 91 | row = { 92 | 'id': id_, 93 | 'title': "Hello world!", 94 | 'ques': ques, 95 | 'para': para, 96 | 'y': yi[0][0], 97 | 'y2': yi[0][1], 98 | 'yp': ypi, 99 | 'yp2': yp2i, 100 | 'a': answers, 101 | 'ap': ap, 102 | 'score': score 103 | } 104 | rows.append(row) 105 | 106 | if i % num_per_page == 0: 107 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8)) 108 | 109 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']): 110 | var_dict = {'title': "Accuracy Visualization", 111 | 'rows': rows 112 | } 113 | with open(html_path, "wb") as f: 114 | f.write(template.render(**var_dict).encode('UTF-8')) 115 | rows = [] 116 | 117 | os.chdir(html_dir) 118 | port = args.port 119 | host = args.host 120 | # Overriding to suppress log message 121 | class 
MyHandler(http.server.SimpleHTTPRequestHandler): 122 | def log_message(self, format, *args): 123 | pass 124 | handler = MyHandler 125 | httpd = socketserver.TCPServer((host, port), handler) 126 | if args.open == 'True': 127 | os.system("open http://%s:%d" % (args.host, args.port)) 128 | print("serving at %s:%d" % (host, port)) 129 | httpd.serve_forever() 130 | 131 | 132 | def get_segment(para, span): 133 | return " ".join(para[span[0][0]][span[0][1]:span[1][1]]) 134 | 135 | 136 | if __name__ == "__main__": 137 | ARGS = get_args() 138 | accuracy2_visualizer(ARGS) -------------------------------------------------------------------------------- /Experiment 1/basic_cnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 1/basic_cnn/__init__.py -------------------------------------------------------------------------------- /Experiment 1/basic_cnn/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tensorflow as tf 4 | 5 | from basic_cnn.main import main as m 6 | 7 | flags = tf.app.flags 8 | 9 | flags.DEFINE_string("model_name", "basic_cnn", "Model name [basic]") 10 | flags.DEFINE_string("data_dir", "data/cnn", "Data dir [data/cnn]") 11 | flags.DEFINE_string("root_dir", "/Users/minjoons/data/cnn/questions", "root dir [~/data/cnn/questions]") 12 | flags.DEFINE_string("run_id", "0", "Run ID [0]") 13 | flags.DEFINE_string("out_base_dir", "out", "out base dir [out]") 14 | 15 | flags.DEFINE_integer("batch_size", 60, "Batch size [60]") 16 | flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]") 17 | flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]") 18 | flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]") 19 | flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]") 20 | flags.DEFINE_integer("load_step", 0, "load step [0]") 21 | flags.DEFINE_integer("early_stop", 4, "early stop [4]") 22 | 23 | flags.DEFINE_string("mode", "test", "train | dev | test | forward [test]") 24 | flags.DEFINE_boolean("load", True, "load saved data? [True]") 25 | flags.DEFINE_boolean("progress", True, "Show progress? [True]") 26 | flags.DEFINE_integer("log_period", 100, "Log period [100]") 27 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]") 28 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]") 29 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]") 30 | 31 | flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]") 32 | 33 | flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]") 34 | flags.DEFINE_integer("char_out_size", 100, "Char out size [100]") 35 | flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob [0.8]") 36 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]") 37 | flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]") 38 | flags.DEFINE_float("wd", 0.0, "Weight decay [0.0]") 39 | flags.DEFINE_bool("lower_word", True, "lower word [True]") 40 | flags.DEFINE_bool("dump_eval", False, "dump eval? [True]") 41 | flags.DEFINE_bool("dump_answer", True, "dump answer? [True]") 42 | flags.DEFINE_string("model", "2", "config 1 |2 [2]") 43 | flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]") 44 | flags.DEFINE_bool("single", False, "supervise only the answer sentence? 
[False]") 45 | 46 | flags.DEFINE_integer("word_count_th", 10, "word count th [100]") 47 | flags.DEFINE_integer("char_count_th", 50, "char count th [500]") 48 | flags.DEFINE_integer("sent_size_th", 60, "sent size th [64]") 49 | flags.DEFINE_integer("num_sents_th", 200, "num sents th [8]") 50 | flags.DEFINE_integer("ques_size_th", 30, "ques size th [32]") 51 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]") 52 | flags.DEFINE_integer("para_size_th", 256, "para size th [256]") 53 | 54 | flags.DEFINE_bool("swap_memory", True, "swap memory? [True]") 55 | flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]") 56 | flags.DEFINE_bool("finetune", False, "finetune? [False]") 57 | flags.DEFINE_bool("feed_gt", False, "feed gt prev token during training [False]") 58 | flags.DEFINE_bool("feed_hard", False, "feed hard argmax prev token during testing [False]") 59 | flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [False]") 60 | flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [False]") 61 | flags.DEFINE_bool("eval", True, "eval? [True]") 62 | flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]") 63 | flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]") 64 | 65 | flags.DEFINE_string("forward_name", "single", "Forward name [single]") 66 | flags.DEFINE_string("answer_path", "", "Answer path []") 67 | flags.DEFINE_string("load_path", "", "Load path []") 68 | flags.DEFINE_string("shared_path", "", "Shared path []") 69 | flags.DEFINE_string("device", "/cpu:0", "default device [/cpu:0]") 70 | flags.DEFINE_integer("num_gpus", 1, "num of gpus [1]") 71 | 72 | flags.DEFINE_string("out_channel_dims", "100", "Out channel dims, separated by commas [100]") 73 | flags.DEFINE_string("filter_heights", "5", "Filter heights, separated by commas [5]") 74 | 75 | flags.DEFINE_bool("share_cnn_weights", True, "Share CNN weights [False]") 76 | flags.DEFINE_bool("share_lstm_weights", True, "Share LSTM weights [True]") 77 | flags.DEFINE_bool("two_prepro_layers", False, "Use two layers for preprocessing? [False]") 78 | flags.DEFINE_bool("aug_att", False, "Augment attention layers with more features? [False]") 79 | flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]") 80 | flags.DEFINE_bool("vis", False, "output visualization numbers? [False]") 81 | flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]") 82 | flags.DEFINE_float("keep_prob", 1.0, "keep prob [1.0]") 83 | flags.DEFINE_string("prev_mode", "a", "prev mode gy | y | a [a]") 84 | flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]") 85 | flags.DEFINE_bool("sh", False, "use superhighway [False]") 86 | flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]") 87 | flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]") 88 | flags.DEFINE_bool("len_opt", False, "Length optimization? [False]") 89 | flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]") 90 | flags.DEFINE_float("filter_ratio", 1.0, "filter ratio [1.0]") 91 | flags.DEFINE_bool("bi", False, "bi-directional attention? 
[False]") 92 | flags.DEFINE_integer("width", 5, "width around entity [5]") 93 | 94 | 95 | def main(_): 96 | config = flags.FLAGS 97 | 98 | config.out_dir = os.path.join(config.out_base_dir, config.model_name, str(config.run_id).zfill(2)) 99 | 100 | m(config) 101 | 102 | if __name__ == "__main__": 103 | tf.app.run() 104 | -------------------------------------------------------------------------------- /Experiment 1/basic_cnn/graph_handler.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from json import encoder 4 | import os 5 | 6 | import tensorflow as tf 7 | 8 | from basic_cnn.evaluator import Evaluation, F1Evaluation 9 | from my.utils import short_floats 10 | 11 | import pickle 12 | 13 | 14 | class GraphHandler(object): 15 | def __init__(self, config): 16 | self.config = config 17 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep) 18 | self.writer = None 19 | self.save_path = os.path.join(config.save_dir, config.model_name) 20 | 21 | def initialize(self, sess): 22 | if self.config.load: 23 | self._load(sess) 24 | else: 25 | sess.run(tf.initialize_all_variables()) 26 | 27 | if self.config.mode == 'train': 28 | self.writer = tf.train.SummaryWriter(self.config.log_dir, graph=tf.get_default_graph()) 29 | 30 | def save(self, sess, global_step=None): 31 | self.saver.save(sess, self.save_path, global_step=global_step) 32 | 33 | def _load(self, sess): 34 | config = self.config 35 | if config.load_path: 36 | save_path = config.load_path 37 | elif config.load_step > 0: 38 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step)) 39 | else: 40 | save_dir = config.save_dir 41 | checkpoint = tf.train.get_checkpoint_state(save_dir) 42 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir) 43 | save_path = checkpoint.model_checkpoint_path 44 | print("Loading saved model from {}".format(save_path)) 45 | self.saver.restore(sess, save_path) 46 | 47 | def add_summary(self, summary, global_step): 48 | self.writer.add_summary(summary, global_step) 49 | 50 | def add_summaries(self, summaries, global_step): 51 | for summary in summaries: 52 | self.add_summary(summary, global_step) 53 | 54 | def dump_eval(self, e, precision=2, path=None): 55 | assert isinstance(e, Evaluation) 56 | if self.config.dump_pickle: 57 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6))) 58 | with gzip.open(path, 'wb', compresslevel=3) as fh: 59 | pickle.dump(e.dict, fh) 60 | else: 61 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))) 62 | with open(path, 'w') as fh: 63 | json.dump(short_floats(e.dict, precision), fh) 64 | 65 | def dump_answer(self, e, path=None): 66 | assert isinstance(e, Evaluation) 67 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))) 68 | with open(path, 'w') as fh: 69 | json.dump(e.id2answer_dict, fh) 70 | 71 | -------------------------------------------------------------------------------- /Experiment 1/basic_cnn/superhighway.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops.rnn_cell import RNNCell 3 | 4 | from my.tensorflow.nn import linear 5 | 6 | 7 | class SHCell(RNNCell): 8 | """ 9 | Super-Highway Cell 10 | """ 11 | def __init__(self, input_size, logit_func='tri_linear', 
scalar=False): 12 | self._state_size = input_size 13 | self._output_size = input_size 14 | self._logit_func = logit_func 15 | self._scalar = scalar 16 | 17 | @property 18 | def state_size(self): 19 | return self._state_size 20 | 21 | @property 22 | def output_size(self): 23 | return self._output_size 24 | 25 | def __call__(self, inputs, state, scope=None): 26 | with tf.variable_scope(scope or "SHCell"): 27 | a_size = 1 if self._scalar else self._state_size 28 | h, u = tf.split(1, 2, inputs) 29 | if self._logit_func == 'mul_linear': 30 | args = [h * u, state * u] 31 | a = tf.nn.sigmoid(linear(args, a_size, True)) 32 | elif self._logit_func == 'linear': 33 | args = [h, u, state] 34 | a = tf.nn.sigmoid(linear(args, a_size, True)) 35 | elif self._logit_func == 'tri_linear': 36 | args = [h, u, state, h * u, state * u] 37 | a = tf.nn.sigmoid(linear(args, a_size, True)) 38 | elif self._logit_func == 'double': 39 | args = [h, u, state] 40 | a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True)) 41 | 42 | else: 43 | raise Exception() 44 | new_state = a * state + (1 - a) * h 45 | outputs = state 46 | return outputs, new_state 47 | 48 | -------------------------------------------------------------------------------- /Experiment 1/basic_cnn/templates/visualizer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 7 | 8 | 19 | 20 | 23 | 24 |

    {{ title }}

    25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | {% for row in rows %} 35 | 36 | 37 | 42 | 47 | 48 | 49 | 72 | 73 | {% endfor %} 74 |
    ID Question Answers Predicted Score Paragraph
    {{ row.id }} 38 | {% for qj in row.ques %} 39 | {{ qj }} 40 | {% endfor %} 41 | 43 | {% for aa in row.a %} 44 |
  • {{ aa }}
  • 45 | {% endfor %} 46 |
    {{ row.ap }}{{ row.score }} 50 | 51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %} 52 | 53 | {% set rowloop = loop %} 54 | {% for xjk, ypjk in zip(xj, ypj) %} 55 | 62 | {% endfor %} 63 | 64 | 65 | {% for xjk, yp2jk in zip(xj, yp2j) %} 66 | 67 | {% endfor %} 68 | 69 | {% endfor %} 70 |
    56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %} 57 | {{ xjk }} 58 | {% else %} 59 | {{ xjk }} 60 | {% endif %} 61 |
    -
    71 |
    75 | 76 | -------------------------------------------------------------------------------- /Experiment 1/basic_cnn/trainer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from basic_cnn.model import Model 4 | from my.tensorflow import average_gradients 5 | 6 | 7 | class Trainer(object): 8 | def __init__(self, config, model): 9 | assert isinstance(model, Model) 10 | self.config = config 11 | self.model = model 12 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr) 13 | self.loss = model.get_loss() 14 | self.var_list = model.get_var_list() 15 | self.global_step = model.get_global_step() 16 | self.summary = model.summary 17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list) 18 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step) 19 | 20 | def get_train_op(self): 21 | return self.train_op 22 | 23 | def step(self, sess, batch, get_summary=False): 24 | assert isinstance(sess, tf.Session) 25 | _, ds = batch 26 | feed_dict = self.model.get_feed_dict(ds, True) 27 | if get_summary: 28 | loss, summary, train_op = \ 29 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict) 30 | else: 31 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict) 32 | summary = None 33 | return loss, summary, train_op 34 | 35 | 36 | class MultiGPUTrainer(object): 37 | def __init__(self, config, models): 38 | model = models[0] 39 | assert isinstance(model, Model) 40 | self.config = config 41 | self.model = model 42 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr) 43 | self.var_list = model.get_var_list() 44 | self.global_step = model.get_global_step() 45 | self.summary = model.summary 46 | self.models = models 47 | losses = [] 48 | grads_list = [] 49 | for gpu_idx, model in enumerate(models): 50 | with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/gpu:{}".format(gpu_idx)): 51 | loss = model.get_loss() 52 | grads = self.opt.compute_gradients(loss, var_list=self.var_list) 53 | losses.append(loss) 54 | grads_list.append(grads) 55 | 56 | self.loss = tf.add_n(losses)/len(losses) 57 | self.grads = average_gradients(grads_list) 58 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step) 59 | 60 | def step(self, sess, batches, get_summary=False): 61 | assert isinstance(sess, tf.Session) 62 | feed_dict = {} 63 | for batch, model in zip(batches, self.models): 64 | _, ds = batch 65 | feed_dict.update(model.get_feed_dict(ds, True)) 66 | 67 | if get_summary: 68 | loss, summary, train_op = \ 69 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict) 70 | else: 71 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict) 72 | summary = None 73 | return loss, summary, train_op 74 | -------------------------------------------------------------------------------- /Experiment 1/basic_cnn/visualizer.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from collections import OrderedDict 3 | import http.server 4 | import socketserver 5 | import argparse 6 | import json 7 | import os 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from jinja2 import Environment, FileSystemLoader 12 | 13 | from basic_cnn.evaluator import get_span_score_pairs, get_best_span 14 | 15 | 16 | def bool_(string): 17 | if string == 'True': 18 | return True 19 | elif string == 'False': 20 | return False 21 | else: 22 | raise Exception() 23 | 24 | def 
get_args(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--model_name", type=str, default='basic') 27 | parser.add_argument("--data_type", type=str, default='dev') 28 | parser.add_argument("--step", type=int, default=5000) 29 | parser.add_argument("--template_name", type=str, default="visualizer.html") 30 | parser.add_argument("--num_per_page", type=int, default=100) 31 | parser.add_argument("--data_dir", type=str, default="data/squad") 32 | parser.add_argument("--port", type=int, default=8000) 33 | parser.add_argument("--host", type=str, default="0.0.0.0") 34 | parser.add_argument("--open", type=str, default='False') 35 | parser.add_argument("--run_id", type=str, default="0") 36 | 37 | args = parser.parse_args() 38 | return args 39 | 40 | 41 | def _decode(decoder, sent): 42 | return " ".join(decoder[idx] for idx in sent) 43 | 44 | 45 | def accuracy2_visualizer(args): 46 | model_name = args.model_name 47 | data_type = args.data_type 48 | num_per_page = args.num_per_page 49 | data_dir = args.data_dir 50 | run_id = args.run_id.zfill(2) 51 | step = args.step 52 | 53 | eval_path =os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6))) 54 | print("loading {}".format(eval_path)) 55 | eval_ = json.load(open(eval_path, 'r')) 56 | 57 | _id = 0 58 | html_dir = "/tmp/list_results%d" % _id 59 | while os.path.exists(html_dir): 60 | _id += 1 61 | html_dir = "/tmp/list_results%d" % _id 62 | 63 | if os.path.exists(html_dir): 64 | shutil.rmtree(html_dir) 65 | os.mkdir(html_dir) 66 | 67 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 68 | templates_dir = os.path.join(cur_dir, 'templates') 69 | env = Environment(loader=FileSystemLoader(templates_dir)) 70 | env.globals.update(zip=zip, reversed=reversed) 71 | template = env.get_template(args.template_name) 72 | 73 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type)) 74 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type)) 75 | print("loading {}".format(data_path)) 76 | data = json.load(open(data_path, 'r')) 77 | print("loading {}".format(shared_path)) 78 | shared = json.load(open(shared_path, 'r')) 79 | 80 | rows = [] 81 | for i, (idx, yi, ypi, yp2i) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2')])), total=len(eval_['idxs'])): 82 | id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss')) 83 | x = shared['x'][rx[0]][rx[1]] 84 | ques = [" ".join(q)] 85 | para = [[word for word in sent] for sent in x] 86 | span = get_best_span(ypi, yp2i) 87 | ap = get_segment(para, span) 88 | score = "{:.3f}".format(ypi[span[0][0]][span[0][1]] * yp2i[span[1][0]][span[1][1]-1]) 89 | 90 | row = { 91 | 'id': id_, 92 | 'title': "Hello world!", 93 | 'ques': ques, 94 | 'para': para, 95 | 'y': yi[0][0], 96 | 'y2': yi[0][1], 97 | 'yp': ypi, 98 | 'yp2': yp2i, 99 | 'a': answers, 100 | 'ap': ap, 101 | 'score': score 102 | } 103 | rows.append(row) 104 | 105 | if i % num_per_page == 0: 106 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8)) 107 | 108 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']): 109 | var_dict = {'title': "Accuracy Visualization", 110 | 'rows': rows 111 | } 112 | with open(html_path, "wb") as f: 113 | f.write(template.render(**var_dict).encode('UTF-8')) 114 | rows = [] 115 | 116 | os.chdir(html_dir) 117 | port = args.port 118 | host = args.host 119 | # Overriding to suppress log message 120 | class MyHandler(http.server.SimpleHTTPRequestHandler): 121 | def log_message(self, format, 
*args): 122 | pass 123 | handler = MyHandler 124 | httpd = socketserver.TCPServer((host, port), handler) 125 | if args.open == 'True': 126 | os.system("open http://%s:%d" % (args.host, args.port)) 127 | print("serving at %s:%d" % (host, port)) 128 | httpd.serve_forever() 129 | 130 | 131 | def get_segment(para, span): 132 | return " ".join(para[span[0][0]][span[0][1]:span[1][1]]) 133 | 134 | 135 | if __name__ == "__main__": 136 | ARGS = get_args() 137 | accuracy2_visualizer(ARGS) -------------------------------------------------------------------------------- /Experiment 1/cnn_dm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 1/cnn_dm/__init__.py -------------------------------------------------------------------------------- /Experiment 1/cnn_dm/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | root_dir = sys.argv[1] 6 | answer_path = sys.argv[2] 7 | file_names = os.listdir(root_dir) 8 | 9 | num_correct = 0 10 | num_wrong = 0 11 | 12 | with open(answer_path, 'r') as fh: 13 | id2answer_dict = json.load(fh) 14 | 15 | for file_name in file_names: 16 | if not file_name.endswith(".question"): 17 | continue 18 | with open(os.path.join(root_dir, file_name), 'r') as fh: 19 | url = fh.readline().strip() 20 | _ = fh.readline() 21 | para = fh.readline().strip() 22 | _ = fh.readline() 23 | ques = fh.readline().strip() 24 | _ = fh.readline() 25 | answer = fh.readline().strip() 26 | _ = fh.readline() 27 | if file_name in id2answer_dict: 28 | pred = id2answer_dict[file_name] 29 | if pred == answer: 30 | num_correct += 1 31 | else: 32 | num_wrong += 1 33 | else: 34 | num_wrong += 1 35 | 36 | total = num_correct + num_wrong 37 | acc = float(num_correct) / total 38 | print("{} = {} / {}".format(acc, num_correct, total)) -------------------------------------------------------------------------------- /Experiment 1/cnn_dm/prepro.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | # data: q, cq, (dq), (pq), y, *x, *cx 5 | # shared: x, cx, (dx), (px), word_counter, char_counter, word2vec 6 | # no metadata 7 | from collections import Counter 8 | 9 | from tqdm import tqdm 10 | 11 | from my.utils import process_tokens 12 | from squad.utils import get_word_span, process_tokens 13 | 14 | 15 | def bool_(arg): 16 | if arg == 'True': 17 | return True 18 | elif arg == 'False': 19 | return False 20 | raise Exception(arg) 21 | 22 | 23 | def main(): 24 | args = get_args() 25 | prepro(args) 26 | 27 | 28 | def get_args(): 29 | parser = argparse.ArgumentParser() 30 | home = os.path.expanduser("~") 31 | source_dir = os.path.join(home, "data", "cnn", 'questions') 32 | target_dir = "data/cnn" 33 | glove_dir = os.path.join(home, "data", "glove") 34 | parser.add_argument("--source_dir", default=source_dir) 35 | parser.add_argument("--target_dir", default=target_dir) 36 | parser.add_argument("--glove_dir", default=glove_dir) 37 | parser.add_argument("--glove_corpus", default='6B') 38 | parser.add_argument("--glove_vec_size", default=100, type=int) 39 | parser.add_argument("--debug", default=False, type=bool_) 40 | parser.add_argument("--num_sents_th", default=200, type=int) 41 | parser.add_argument("--ques_size_th", default=30, type=int) 42 | parser.add_argument("--width", default=5, 
type=int) 43 | # TODO : put more args here 44 | return parser.parse_args() 45 | 46 | 47 | def prepro(args): 48 | prepro_each(args, 'train') 49 | prepro_each(args, 'dev') 50 | prepro_each(args, 'test') 51 | 52 | 53 | def para2sents(para, width): 54 | """ 55 | Turn para into double array of words (wordss) 56 | Where each sentence is up to 5 word neighbors of each entity 57 | :param para: 58 | :return: 59 | """ 60 | words = para.split(" ") 61 | sents = [] 62 | for i, word in enumerate(words): 63 | if word.startswith("@"): 64 | start = max(i - width, 0) 65 | stop = min(i + width + 1, len(words)) 66 | sent = words[start:stop] 67 | sents.append(sent) 68 | return sents 69 | 70 | 71 | def get_word2vec(args, word_counter): 72 | glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size)) 73 | sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)} 74 | total = sizes[args.glove_corpus] 75 | word2vec_dict = {} 76 | with open(glove_path, 'r', encoding='utf-8') as fh: 77 | for line in tqdm(fh, total=total): 78 | array = line.lstrip().rstrip().split(" ") 79 | word = array[0] 80 | vector = list(map(float, array[1:])) 81 | if word in word_counter: 82 | word2vec_dict[word] = vector 83 | elif word.capitalize() in word_counter: 84 | word2vec_dict[word.capitalize()] = vector 85 | elif word.lower() in word_counter: 86 | word2vec_dict[word.lower()] = vector 87 | elif word.upper() in word_counter: 88 | word2vec_dict[word.upper()] = vector 89 | 90 | print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path)) 91 | return word2vec_dict 92 | 93 | 94 | def prepro_each(args, mode): 95 | source_dir = os.path.join(args.source_dir, mode) 96 | word_counter = Counter() 97 | lower_word_counter = Counter() 98 | ent_counter = Counter() 99 | char_counter = Counter() 100 | max_sent_size = 0 101 | max_word_size = 0 102 | max_ques_size = 0 103 | max_num_sents = 0 104 | 105 | file_names = list(os.listdir(source_dir)) 106 | if args.debug: 107 | file_names = file_names[:1000] 108 | lens = [] 109 | 110 | out_file_names = [] 111 | for file_name in tqdm(file_names, total=len(file_names)): 112 | if file_name.endswith(".question"): 113 | with open(os.path.join(source_dir, file_name), 'r') as fh: 114 | url = fh.readline().strip() 115 | _ = fh.readline() 116 | para = fh.readline().strip() 117 | _ = fh.readline() 118 | ques = fh.readline().strip() 119 | _ = fh.readline() 120 | answer = fh.readline().strip() 121 | _ = fh.readline() 122 | cands = list(line.strip() for line in fh) 123 | cand_ents = list(cand.split(":")[0] for cand in cands) 124 | sents = para2sents(para, args.width) 125 | ques_words = ques.split(" ") 126 | 127 | # Filtering 128 | if len(sents) > args.num_sents_th or len(ques_words) > args.ques_size_th: 129 | continue 130 | 131 | max_sent_size = max(max(map(len, sents)), max_sent_size) 132 | max_ques_size = max(len(ques_words), max_ques_size) 133 | max_word_size = max(max(len(word) for sent in sents for word in sent), max_word_size) 134 | max_num_sents = max(len(sents), max_num_sents) 135 | 136 | for word in ques_words: 137 | if word.startswith("@"): 138 | ent_counter[word] += 1 139 | word_counter[word] += 1 140 | else: 141 | word_counter[word] += 1 142 | lower_word_counter[word.lower()] += 1 143 | for c in word: 144 | char_counter[c] += 1 145 | for sent in sents: 146 | for word in sent: 147 | if word.startswith("@"): 148 | ent_counter[word] += 1 149 | word_counter[word] += 1 150 | else: 151 | 
word_counter[word] += 1 152 | lower_word_counter[word.lower()] += 1 153 | for c in word: 154 | char_counter[c] += 1 155 | 156 | out_file_names.append(file_name) 157 | lens.append(len(sents)) 158 | num_examples = len(out_file_names) 159 | 160 | assert len(out_file_names) == len(lens) 161 | sorted_file_names, lens = zip(*sorted(zip(out_file_names, lens), key=lambda each: each[1])) 162 | assert lens[-1] == max_num_sents 163 | 164 | word2vec_dict = get_word2vec(args, word_counter) 165 | lower_word2vec_dit = get_word2vec(args, lower_word_counter) 166 | 167 | shared = {'word_counter': word_counter, 'ent_counter': ent_counter, 'char_counter': char_counter, 168 | 'lower_word_counter': lower_word_counter, 169 | 'max_num_sents': max_num_sents, 'max_sent_size': max_sent_size, 'max_word_size': max_word_size, 170 | 'max_ques_size': max_ques_size, 171 | 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dit, 'sorted': sorted_file_names, 172 | 'num_examples': num_examples} 173 | 174 | print("max num sents: {}".format(max_num_sents)) 175 | print("max ques size: {}".format(max_ques_size)) 176 | 177 | if not os.path.exists(args.target_dir): 178 | os.makedirs(args.target_dir) 179 | shared_path = os.path.join(args.target_dir, "shared_{}.json".format(mode)) 180 | with open(shared_path, 'w') as fh: 181 | json.dump(shared, fh) 182 | 183 | 184 | if __name__ == "__main__": 185 | main() 186 | -------------------------------------------------------------------------------- /Experiment 1/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_DIR=$HOME/data 4 | mkdir $DATA_DIR 5 | 6 | # Download SQuAD 7 | SQUAD_DIR=$DATA_DIR/squad 8 | mkdir $SQUAD_DIR 9 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $SQUAD_DIR/train-v1.1.json 10 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $SQUAD_DIR/dev-v1.1.json 11 | 12 | 13 | # Download CNN and DailyMail 14 | # Download at: http://cs.nyu.edu/~kcho/DMQA/ 15 | 16 | 17 | # Download GloVe 18 | GLOVE_DIR=$DATA_DIR/glove 19 | mkdir $GLOVE_DIR 20 | wget http://nlp.stanford.edu/data/glove.6B.zip -O $GLOVE_DIR/glove.6B.zip 21 | unzip $GLOVE_DIR/glove.6B.zip -d $GLOVE_DIR 22 | 23 | # Download NLTK (for tokenizer) 24 | # Make sure that nltk is installed! 
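# (Editor's note -- not part of the original script.) The command below fetches NLTK's
# "punkt" sentence tokenizer into $HOME/nltk_data; this is the tokenizer referred to in
# the comment above and is used during data preprocessing.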
25 | python3 -m nltk.downloader -d $HOME/nltk_data punkt 26 | -------------------------------------------------------------------------------- /Experiment 1/my/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 1/my/__init__.py -------------------------------------------------------------------------------- /Experiment 1/my/corenlp_interface.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import requests 4 | import nltk 5 | import json 6 | import networkx as nx 7 | import time 8 | 9 | 10 | class CoreNLPInterface(object): 11 | def __init__(self, url, port): 12 | self._url = url 13 | self._port = port 14 | 15 | def get(self, type_, in_, num_max_requests=100): 16 | in_ = in_.encode("utf-8") 17 | url = "http://{}:{}/{}".format(self._url, self._port, type_) 18 | out = None 19 | for _ in range(num_max_requests): 20 | try: 21 | r = requests.post(url, data=in_) 22 | out = r.content.decode('utf-8') 23 | if out == 'error': 24 | out = None 25 | break 26 | except: 27 | time.sleep(1) 28 | return out 29 | 30 | def split_doc(self, doc): 31 | out = self.get("doc", doc) 32 | return out if out is None else json.loads(out) 33 | 34 | def split_sent(self, sent): 35 | out = self.get("sent", sent) 36 | return out if out is None else json.loads(out) 37 | 38 | def get_dep(self, sent): 39 | out = self.get("dep", sent) 40 | return out if out is None else json.loads(out) 41 | 42 | def get_const(self, sent): 43 | out = self.get("const", sent) 44 | return out 45 | 46 | def get_const_tree(self, sent): 47 | out = self.get_const(sent) 48 | return out if out is None else nltk.tree.Tree.fromstring(out) 49 | 50 | @staticmethod 51 | def dep2tree(dep): 52 | tree = nx.DiGraph() 53 | for dep, i, gov, j, label in dep: 54 | tree.add_edge(gov, dep, label=label) 55 | return tree 56 | -------------------------------------------------------------------------------- /Experiment 1/my/nltk_utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy as np 3 | 4 | 5 | def _set_span(t, i): 6 | if isinstance(t[0], str): 7 | t.span = (i, i+len(t)) 8 | else: 9 | first = True 10 | for c in t: 11 | cur_span = _set_span(c, i) 12 | i = cur_span[1] 13 | if first: 14 | min_ = cur_span[0] 15 | first = False 16 | max_ = cur_span[1] 17 | t.span = (min_, max_) 18 | return t.span 19 | 20 | 21 | def set_span(t): 22 | assert isinstance(t, nltk.tree.Tree) 23 | try: 24 | return _set_span(t, 0) 25 | except: 26 | print(t) 27 | exit() 28 | 29 | 30 | def tree_contains_span(tree, span): 31 | """ 32 | Assumes that tree span has been set with set_span 33 | Returns true if any subtree of t has exact span as the given span 34 | :param t: 35 | :param span: 36 | :return bool: 37 | """ 38 | return span in set(t.span for t in tree.subtrees()) 39 | 40 | 41 | def span_len(span): 42 | return span[1] - span[0] 43 | 44 | 45 | def span_overlap(s1, s2): 46 | start = max(s1[0], s2[0]) 47 | stop = min(s1[1], s2[1]) 48 | if stop > start: 49 | return start, stop 50 | return None 51 | 52 | 53 | def span_prec(true_span, pred_span): 54 | overlap = span_overlap(true_span, pred_span) 55 | if overlap is None: 56 | return 0 57 | return span_len(overlap) / span_len(pred_span) 58 | 59 | 60 | def span_recall(true_span, pred_span): 61 | overlap = span_overlap(true_span, pred_span) 62 | if overlap is None: 
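        # Spans are half-open (start, stop) token intervals: span_overlap returns
        # their intersection (or None when disjoint), span_prec divides the overlap
        # length by the predicted span's length, and span_recall (here) divides it
        # by the true span's length; span_f1 below is their harmonic mean.
        # Worked example: true span (2, 6) vs. predicted (4, 9) overlap in (4, 6),
        # so precision = 2/5, recall = 2/4, and F1 = 2*0.4*0.5/0.9 ~= 0.444.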
63 | return 0 64 | return span_len(overlap) / span_len(true_span) 65 | 66 | 67 | def span_f1(true_span, pred_span): 68 | p = span_prec(true_span, pred_span) 69 | r = span_recall(true_span, pred_span) 70 | if p == 0 or r == 0: 71 | return 0.0 72 | return 2 * p * r / (p + r) 73 | 74 | 75 | def find_max_f1_span(tree, span): 76 | return find_max_f1_subtree(tree, span).span 77 | 78 | 79 | def find_max_f1_subtree(tree, span): 80 | return max(((t, span_f1(span, t.span)) for t in tree.subtrees()), key=lambda p: p[1])[0] 81 | 82 | 83 | def tree2matrix(tree, node2num, row_size=None, col_size=None, dtype='int32'): 84 | set_span(tree) 85 | D = tree.height() - 1 86 | B = len(tree.leaves()) 87 | row_size = row_size or D 88 | col_size = col_size or B 89 | matrix = np.zeros([row_size, col_size], dtype=dtype) 90 | mask = np.zeros([row_size, col_size, col_size], dtype='bool') 91 | 92 | for subtree in tree.subtrees(): 93 | row = subtree.height() - 2 94 | col = subtree.span[0] 95 | matrix[row, col] = node2num(subtree) 96 | for subsub in subtree.subtrees(): 97 | if isinstance(subsub, nltk.tree.Tree): 98 | mask[row, col, subsub.span[0]] = True 99 | if not isinstance(subsub[0], nltk.tree.Tree): 100 | c = subsub.span[0] 101 | for r in range(row): 102 | mask[r, c, c] = True 103 | else: 104 | mask[row, col, col] = True 105 | 106 | return matrix, mask 107 | 108 | 109 | def load_compressed_tree(s): 110 | 111 | def compress_tree(tree): 112 | assert not isinstance(tree, str) 113 | if len(tree) == 1: 114 | if isinstance(tree[0], nltk.tree.Tree): 115 | return compress_tree(tree[0]) 116 | else: 117 | return tree 118 | else: 119 | for i, t in enumerate(tree): 120 | if isinstance(t, nltk.tree.Tree): 121 | tree[i] = compress_tree(t) 122 | else: 123 | tree[i] = t 124 | return tree 125 | 126 | return compress_tree(nltk.tree.Tree.fromstring(s)) 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /Experiment 1/my/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | from my.tensorflow.general import * -------------------------------------------------------------------------------- /Experiment 1/my/tensorflow/general.py: -------------------------------------------------------------------------------- 1 | from itertools import zip_longest 2 | 3 | import tensorflow as tf 4 | from functools import reduce 5 | from operator import mul 6 | import numpy as np 7 | 8 | VERY_BIG_NUMBER = 1e30 9 | VERY_SMALL_NUMBER = 1e-30 10 | VERY_POSITIVE_NUMBER = VERY_BIG_NUMBER 11 | VERY_NEGATIVE_NUMBER = -VERY_BIG_NUMBER 12 | 13 | 14 | def get_initializer(matrix): 15 | def _initializer(shape, dtype=None, partition_info=None, **kwargs): return matrix 16 | return _initializer 17 | 18 | 19 | def variable_on_cpu(name, shape, initializer): 20 | """Helper to create a Variable stored on CPU memory. 21 | 22 | Args: 23 | name: name of the variable 24 | shape: list of ints 25 | initializer: initializer for Variable 26 | 27 | Returns: 28 | Variable Tensor 29 | """ 30 | with tf.device('/cpu:0'): 31 | var = tf.get_variable(name, shape, initializer=initializer) 32 | return var 33 | 34 | 35 | def variable_with_weight_decay(name, shape, stddev, wd): 36 | """Helper to create an initialized Variable with weight decay. 37 | 38 | Note that the Variable is initialized with a truncated normal distribution. 39 | A weight decay is added only if one is specified. 
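    The decay term is not returned to the caller; it is appended to the 'losses'
    graph collection, so training code is expected to fold it into the total
    objective, typically along the lines of
    `total_loss = task_loss + tf.add_n(tf.get_collection('losses'))`
    (illustrative usage, not taken from this repository).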
40 | 41 | Args: 42 | name: name of the variable 43 | shape: list of ints 44 | stddev: standard deviation of a truncated Gaussian 45 | wd: add L2Loss weight decay multiplied by this float. If None, weight 46 | decay is not added for this Variable. 47 | 48 | Returns: 49 | Variable Tensor 50 | """ 51 | var = variable_on_cpu(name, shape, 52 | tf.truncated_normal_initializer(stddev=stddev)) 53 | if wd: 54 | weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss') 55 | tf.add_to_collection('losses', weight_decay) 56 | return var 57 | 58 | 59 | def average_gradients(tower_grads): 60 | """Calculate the average gradient for each shared variable across all towers. 61 | 62 | Note that this function provides a synchronization point across all towers. 63 | 64 | Args: 65 | tower_grads: List of lists of (gradient, variable) tuples. The outer list 66 | is over individual gradients. The inner list is over the gradient 67 | calculation for each tower. 68 | Returns: 69 | List of pairs of (gradient, variable) where the gradient has been averaged 70 | across all towers. 71 | """ 72 | average_grads = [] 73 | for grad_and_vars in zip(*tower_grads): 74 | # Note that each grad_and_vars looks like the following: 75 | # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) 76 | grads = [] 77 | for g, var in grad_and_vars: 78 | # Add 0 dimension to the gradients to represent the tower. 79 | assert g is not None, var.name 80 | expanded_g = tf.expand_dims(g, 0) 81 | 82 | # Append on a 'tower' dimension which we will average over below. 83 | grads.append(expanded_g) 84 | 85 | # Average over the 'tower' dimension. 86 | grad = tf.concat(0, grads) 87 | grad = tf.reduce_mean(grad, 0) 88 | 89 | # Keep in mind that the Variables are redundant because they are shared 90 | # across towers. So .. we will just return the first tower's pointer to 91 | # the Variable. 92 | v = grad_and_vars[0][1] 93 | grad_and_var = (grad, v) 94 | average_grads.append(grad_and_var) 95 | return average_grads 96 | 97 | 98 | def mask(val, mask, name=None): 99 | if name is None: 100 | name = 'mask' 101 | return tf.mul(val, tf.cast(mask, 'float'), name=name) 102 | 103 | 104 | def exp_mask(val, mask, name=None): 105 | """Give very negative number to unmasked elements in val. 106 | For example, [-3, -2, 10], [True, True, False] -> [-3, -2, -1e9]. 107 | Typically, this effectively masks in exponential space (e.g. 
softmax) 108 | Args: 109 | val: values to be masked 110 | mask: masking boolean tensor, same shape as tensor 111 | name: name for output tensor 112 | 113 | Returns: 114 | Same shape as val, where some elements are very small (exponentially zero) 115 | """ 116 | if name is None: 117 | name = "exp_mask" 118 | return tf.add(val, (1 - tf.cast(mask, 'float')) * VERY_NEGATIVE_NUMBER, name=name) 119 | 120 | 121 | def flatten(tensor, keep): 122 | fixed_shape = tensor.get_shape().as_list() 123 | start = len(fixed_shape) - keep 124 | left = reduce(mul, [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start)]) 125 | out_shape = [left] + [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start, len(fixed_shape))] 126 | flat = tf.reshape(tensor, out_shape) 127 | return flat 128 | 129 | 130 | def reconstruct(tensor, ref, keep): 131 | ref_shape = ref.get_shape().as_list() 132 | tensor_shape = tensor.get_shape().as_list() 133 | ref_stop = len(ref_shape) - keep 134 | tensor_start = len(tensor_shape) - keep 135 | pre_shape = [ref_shape[i] or tf.shape(ref)[i] for i in range(ref_stop)] 136 | keep_shape = [tensor_shape[i] or tf.shape(tensor)[i] for i in range(tensor_start, len(tensor_shape))] 137 | # pre_shape = [tf.shape(ref)[i] for i in range(len(ref.get_shape().as_list()[:-keep]))] 138 | # keep_shape = tensor.get_shape().as_list()[-keep:] 139 | target_shape = pre_shape + keep_shape 140 | out = tf.reshape(tensor, target_shape) 141 | return out 142 | 143 | 144 | def add_wd(wd, scope=None): 145 | scope = scope or tf.get_variable_scope().name 146 | variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) 147 | with tf.name_scope("weight_decay"): 148 | for var in variables: 149 | weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name="{}/wd".format(var.op.name)) 150 | tf.add_to_collection('losses', weight_decay) 151 | 152 | 153 | def grouper(iterable, n, fillvalue=None, shorten=False, num_groups=None): 154 | args = [iter(iterable)] * n 155 | out = zip_longest(*args, fillvalue=fillvalue) 156 | out = list(out) 157 | if num_groups is not None: 158 | default = (fillvalue, ) * n 159 | assert isinstance(num_groups, int) 160 | out = list(each for each, _ in zip_longest(out, range(num_groups), fillvalue=default)) 161 | if shorten: 162 | assert fillvalue is None 163 | out = (tuple(e for e in each if e is not None) for each in out) 164 | return out 165 | 166 | def padded_reshape(tensor, shape, mode='CONSTANT', name=None): 167 | paddings = [[0, shape[i] - tf.shape(tensor)[i]] for i in range(len(shape))] 168 | return tf.pad(tensor, paddings, mode=mode, name=name) -------------------------------------------------------------------------------- /Experiment 1/my/tensorflow/nn.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops.rnn_cell import _linear 2 | from tensorflow.python.util import nest 3 | import tensorflow as tf 4 | 5 | from my.tensorflow import flatten, reconstruct, add_wd, exp_mask 6 | 7 | 8 | def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0, input_keep_prob=1.0, 9 | is_train=None): 10 | if args is None or (nest.is_sequence(args) and not args): 11 | raise ValueError("`args` must be specified") 12 | if not nest.is_sequence(args): 13 | args = [args] 14 | 15 | flat_args = [flatten(arg, 1) for arg in args] 16 | if input_keep_prob < 1.0: 17 | assert is_train is not None 18 | flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg) 19 | for arg in flat_args] 20 | 
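    # The wrapped `_linear` op only accepts rank-2 inputs, so `flatten(arg, 1)`
    # above collapsed every leading dimension into one (e.g. [N, M, JX, d] ->
    # [N*M*JX, d]); `reconstruct` below restores the original leading shape
    # around the new output size, which is what lets this `linear` wrapper be
    # applied to tensors of arbitrary rank.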
flat_out = _linear(flat_args, output_size, bias, bias_start=bias_start, scope=scope) 21 | out = reconstruct(flat_out, args[0], 1) 22 | if squeeze: 23 | out = tf.squeeze(out, [len(args[0].get_shape().as_list())-1]) 24 | if wd: 25 | add_wd(wd) 26 | 27 | return out 28 | 29 | 30 | def dropout(x, keep_prob, is_train, noise_shape=None, seed=None, name=None): 31 | with tf.name_scope(name or "dropout"): 32 | if keep_prob < 1.0: 33 | d = tf.nn.dropout(x, keep_prob, noise_shape=noise_shape, seed=seed) 34 | out = tf.cond(is_train, lambda: d, lambda: x) 35 | return out 36 | return x 37 | 38 | 39 | def softmax(logits, mask=None, scope=None): 40 | with tf.name_scope(scope or "Softmax"): 41 | if mask is not None: 42 | logits = exp_mask(logits, mask) 43 | flat_logits = flatten(logits, 1) 44 | flat_out = tf.nn.softmax(flat_logits) 45 | out = reconstruct(flat_out, logits, 1) 46 | 47 | return out 48 | 49 | 50 | def softsel(target, logits, mask=None, scope=None): 51 | """ 52 | 53 | :param target: [ ..., J, d] dtype=float 54 | :param logits: [ ..., J], dtype=float 55 | :param mask: [ ..., J], dtype=bool 56 | :param scope: 57 | :return: [..., d], dtype=float 58 | """ 59 | with tf.name_scope(scope or "Softsel"): 60 | a = softmax(logits, mask=mask) 61 | target_rank = len(target.get_shape().as_list()) 62 | out = tf.reduce_sum(tf.expand_dims(a, -1) * target, target_rank - 2) 63 | return out 64 | 65 | 66 | def double_linear_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None): 67 | with tf.variable_scope(scope or "Double_Linear_Logits"): 68 | first = tf.tanh(linear(args, size, bias, bias_start=bias_start, scope='first', 69 | wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)) 70 | second = linear(first, 1, bias, bias_start=bias_start, squeeze=True, scope='second', 71 | wd=wd, input_keep_prob=input_keep_prob, is_train=is_train) 72 | if mask is not None: 73 | second = exp_mask(second, mask) 74 | return second 75 | 76 | 77 | def linear_logits(args, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None): 78 | with tf.variable_scope(scope or "Linear_Logits"): 79 | logits = linear(args, 1, bias, bias_start=bias_start, squeeze=True, scope='first', 80 | wd=wd, input_keep_prob=input_keep_prob, is_train=is_train) 81 | if mask is not None: 82 | logits = exp_mask(logits, mask) 83 | return logits 84 | 85 | 86 | def sum_logits(args, mask=None, name=None): 87 | with tf.name_scope(name or "sum_logits"): 88 | if args is None or (nest.is_sequence(args) and not args): 89 | raise ValueError("`args` must be specified") 90 | if not nest.is_sequence(args): 91 | args = [args] 92 | rank = len(args[0].get_shape()) 93 | logits = sum(tf.reduce_sum(arg, rank-1) for arg in args) 94 | if mask is not None: 95 | logits = exp_mask(logits, mask) 96 | return logits 97 | 98 | 99 | def get_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None, func=None): 100 | if func is None: 101 | func = "sum" 102 | if func == 'sum': 103 | return sum_logits(args, mask=mask, name=scope) 104 | elif func == 'linear': 105 | return linear_logits(args, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob, 106 | is_train=is_train) 107 | elif func == 'double': 108 | return double_linear_logits(args, size, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob, 109 | is_train=is_train) 110 | elif func == 'dot': 111 | assert len(args) == 2 112 | arg = 
args[0] * args[1] 113 | return sum_logits([arg], mask=mask, name=scope) 114 | elif func == 'mul_linear': 115 | assert len(args) == 2 116 | arg = args[0] * args[1] 117 | return linear_logits([arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob, 118 | is_train=is_train) 119 | elif func == 'proj': 120 | assert len(args) == 2 121 | d = args[1].get_shape()[-1] 122 | proj = linear([args[0]], d, False, bias_start=bias_start, scope=scope, wd=wd, input_keep_prob=input_keep_prob, 123 | is_train=is_train) 124 | return sum_logits([proj * args[1]], mask=mask) 125 | elif func == 'tri_linear': 126 | assert len(args) == 2 127 | new_arg = args[0] * args[1] 128 | return linear_logits([args[0], args[1], new_arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob, 129 | is_train=is_train) 130 | else: 131 | raise Exception() 132 | 133 | 134 | def highway_layer(arg, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None): 135 | with tf.variable_scope(scope or "highway_layer"): 136 | d = arg.get_shape()[-1] 137 | trans = linear([arg], d, bias, bias_start=bias_start, scope='trans', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train) 138 | trans = tf.nn.relu(trans) 139 | gate = linear([arg], d, bias, bias_start=bias_start, scope='gate', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train) 140 | gate = tf.nn.sigmoid(gate) 141 | out = gate * trans + (1 - gate) * arg 142 | return out 143 | 144 | 145 | def highway_network(arg, num_layers, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None): 146 | with tf.variable_scope(scope or "highway_network"): 147 | prev = arg 148 | cur = None 149 | for layer_idx in range(num_layers): 150 | cur = highway_layer(prev, bias, bias_start=bias_start, scope="layer_{}".format(layer_idx), wd=wd, 151 | input_keep_prob=input_keep_prob, is_train=is_train) 152 | prev = cur 153 | return cur 154 | 155 | 156 | def conv1d(in_, filter_size, height, padding, is_train=None, keep_prob=1.0, scope=None): 157 | with tf.variable_scope(scope or "conv1d"): 158 | num_channels = in_.get_shape()[-1] 159 | filter_ = tf.get_variable("filter", shape=[1, height, num_channels, filter_size], dtype='float') 160 | bias = tf.get_variable("bias", shape=[filter_size], dtype='float') 161 | strides = [1, 1, 1, 1] 162 | if is_train is not None and keep_prob < 1.0: 163 | in_ = dropout(in_, keep_prob, is_train) 164 | xxc = tf.nn.conv2d(in_, filter_, strides, padding) + bias # [N*M, JX, W/filter_stride, d] 165 | out = tf.reduce_max(tf.nn.relu(xxc), 2) # [-1, JX, d] 166 | return out 167 | 168 | 169 | def multi_conv1d(in_, filter_sizes, heights, padding, is_train=None, keep_prob=1.0, scope=None): 170 | with tf.variable_scope(scope or "multi_conv1d"): 171 | assert len(filter_sizes) == len(heights) 172 | outs = [] 173 | for filter_size, height in zip(filter_sizes, heights): 174 | if filter_size == 0: 175 | continue 176 | out = conv1d(in_, filter_size, height, padding, is_train=is_train, keep_prob=keep_prob, scope="conv1d_{}".format(height)) 177 | outs.append(out) 178 | concat_out = tf.concat(2, outs) 179 | return concat_out 180 | -------------------------------------------------------------------------------- /Experiment 1/my/tensorflow/rnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops.rnn import dynamic_rnn as _dynamic_rnn, \ 3 | bidirectional_dynamic_rnn as _bidirectional_dynamic_rnn 4 
| from tensorflow.python.ops.rnn import bidirectional_rnn as _bidirectional_rnn 5 | 6 | from my.tensorflow import flatten, reconstruct 7 | 8 | 9 | def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None, 10 | dtype=None, parallel_iterations=None, swap_memory=False, 11 | time_major=False, scope=None): 12 | assert not time_major # TODO : to be implemented later! 13 | flat_inputs = flatten(inputs, 2) # [-1, J, d] 14 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64') 15 | 16 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len, 17 | initial_state=initial_state, dtype=dtype, 18 | parallel_iterations=parallel_iterations, swap_memory=swap_memory, 19 | time_major=time_major, scope=scope) 20 | 21 | outputs = reconstruct(flat_outputs, inputs, 2) 22 | return outputs, final_state 23 | 24 | 25 | def bw_dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None, 26 | dtype=None, parallel_iterations=None, swap_memory=False, 27 | time_major=False, scope=None): 28 | assert not time_major # TODO : to be implemented later! 29 | 30 | flat_inputs = flatten(inputs, 2) # [-1, J, d] 31 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64') 32 | 33 | flat_inputs = tf.reverse(flat_inputs, 1) if sequence_length is None \ 34 | else tf.reverse_sequence(flat_inputs, sequence_length, 1) 35 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len, 36 | initial_state=initial_state, dtype=dtype, 37 | parallel_iterations=parallel_iterations, swap_memory=swap_memory, 38 | time_major=time_major, scope=scope) 39 | flat_outputs = tf.reverse(flat_outputs, 1) if sequence_length is None \ 40 | else tf.reverse_sequence(flat_outputs, sequence_length, 1) 41 | 42 | outputs = reconstruct(flat_outputs, inputs, 2) 43 | return outputs, final_state 44 | 45 | 46 | def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, 47 | initial_state_fw=None, initial_state_bw=None, 48 | dtype=None, parallel_iterations=None, 49 | swap_memory=False, time_major=False, scope=None): 50 | assert not time_major 51 | 52 | flat_inputs = flatten(inputs, 2) # [-1, J, d] 53 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64') 54 | 55 | (flat_fw_outputs, flat_bw_outputs), final_state = \ 56 | _bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len, 57 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw, 58 | dtype=dtype, parallel_iterations=parallel_iterations, swap_memory=swap_memory, 59 | time_major=time_major, scope=scope) 60 | 61 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2) 62 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2) 63 | # FIXME : final state is not reshaped! 
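    # i.e. the returned `final_state` still has the flattened batch shape
    # (e.g. [N*M, d], or an LSTMStateTuple of such tensors, when `inputs` is
    # [N, M, J, d]); only the per-timestep outputs above are passed through
    # `reconstruct`, so callers needing a per-example state must reshape it
    # themselves.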
64 | return (fw_outputs, bw_outputs), final_state 65 | 66 | 67 | def bidirectional_rnn(cell_fw, cell_bw, inputs, 68 | initial_state_fw=None, initial_state_bw=None, 69 | dtype=None, sequence_length=None, scope=None): 70 | 71 | flat_inputs = flatten(inputs, 2) # [-1, J, d] 72 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64') 73 | 74 | (flat_fw_outputs, flat_bw_outputs), final_state = \ 75 | _bidirectional_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len, 76 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw, 77 | dtype=dtype, scope=scope) 78 | 79 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2) 80 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2) 81 | # FIXME : final state is not reshaped! 82 | return (fw_outputs, bw_outputs), final_state 83 | -------------------------------------------------------------------------------- /Experiment 1/my/tensorflow/rnn_cell.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops.rnn_cell import DropoutWrapper, RNNCell, LSTMStateTuple 3 | 4 | from my.tensorflow import exp_mask, flatten 5 | from my.tensorflow.nn import linear, softsel, double_linear_logits 6 | 7 | 8 | class SwitchableDropoutWrapper(DropoutWrapper): 9 | def __init__(self, cell, is_train, input_keep_prob=1.0, output_keep_prob=1.0, 10 | seed=None): 11 | super(SwitchableDropoutWrapper, self).__init__(cell, input_keep_prob=input_keep_prob, output_keep_prob=output_keep_prob, 12 | seed=seed) 13 | self.is_train = is_train 14 | 15 | def __call__(self, inputs, state, scope=None): 16 | outputs_do, new_state_do = super(SwitchableDropoutWrapper, self).__call__(inputs, state, scope=scope) 17 | tf.get_variable_scope().reuse_variables() 18 | outputs, new_state = self._cell(inputs, state, scope) 19 | outputs = tf.cond(self.is_train, lambda: outputs_do, lambda: outputs) 20 | if isinstance(state, tuple): 21 | new_state = state.__class__(*[tf.cond(self.is_train, lambda: new_state_do_i, lambda: new_state_i) 22 | for new_state_do_i, new_state_i in zip(new_state_do, new_state)]) 23 | else: 24 | new_state = tf.cond(self.is_train, lambda: new_state_do, lambda: new_state) 25 | return outputs, new_state 26 | 27 | 28 | class TreeRNNCell(RNNCell): 29 | def __init__(self, cell, input_size, reduce_func): 30 | self._cell = cell 31 | self._input_size = input_size 32 | self._reduce_func = reduce_func 33 | 34 | def __call__(self, inputs, state, scope=None): 35 | """ 36 | :param inputs: [N*B, I + B] 37 | :param state: [N*B, d] 38 | :param scope: 39 | :return: [N*B, d] 40 | """ 41 | with tf.variable_scope(scope or self.__class__.__name__): 42 | d = self.state_size 43 | x = tf.slice(inputs, [0, 0], [-1, self._input_size]) # [N*B, I] 44 | mask = tf.slice(inputs, [0, self._input_size], [-1, -1]) # [N*B, B] 45 | B = tf.shape(mask)[1] 46 | prev_state = tf.expand_dims(tf.reshape(state, [-1, B, d]), 1) # [N, B, d] -> [N, 1, B, d] 47 | mask = tf.tile(tf.expand_dims(tf.reshape(mask, [-1, B, B]), -1), [1, 1, 1, d]) # [N, B, B, d] 48 | # prev_state = self._reduce_func(tf.tile(prev_state, [1, B, 1, 1]), 2) 49 | prev_state = self._reduce_func(exp_mask(prev_state, mask), 2) # [N, B, d] 50 | prev_state = tf.reshape(prev_state, [-1, d]) # [N*B, d] 51 | return self._cell(x, prev_state) 52 | 53 | @property 54 | def state_size(self): 55 | return self._cell.state_size 56 | 57 | @property 58 | def output_size(self): 59 | return self._cell.output_size 60 | 61 | 62 | class 
NoOpCell(RNNCell): 63 | def __init__(self, num_units): 64 | self._num_units = num_units 65 | 66 | def __call__(self, inputs, state, scope=None): 67 | return state, state 68 | 69 | @property 70 | def state_size(self): 71 | return self._num_units 72 | 73 | @property 74 | def output_size(self): 75 | return self._num_units 76 | 77 | 78 | class MatchCell(RNNCell): 79 | def __init__(self, cell, input_size, q_len): 80 | self._cell = cell 81 | self._input_size = input_size 82 | # FIXME : This won't be needed with good shape guessing 83 | self._q_len = q_len 84 | 85 | @property 86 | def state_size(self): 87 | return self._cell.state_size 88 | 89 | @property 90 | def output_size(self): 91 | return self._cell.output_size 92 | 93 | def __call__(self, inputs, state, scope=None): 94 | """ 95 | 96 | :param inputs: [N, d + JQ + JQ * d] 97 | :param state: [N, d] 98 | :param scope: 99 | :return: 100 | """ 101 | with tf.variable_scope(scope or self.__class__.__name__): 102 | c_prev, h_prev = state 103 | x = tf.slice(inputs, [0, 0], [-1, self._input_size]) 104 | q_mask = tf.slice(inputs, [0, self._input_size], [-1, self._q_len]) # [N, JQ] 105 | qs = tf.slice(inputs, [0, self._input_size + self._q_len], [-1, -1]) 106 | qs = tf.reshape(qs, [-1, self._q_len, self._input_size]) # [N, JQ, d] 107 | x_tiled = tf.tile(tf.expand_dims(x, 1), [1, self._q_len, 1]) # [N, JQ, d] 108 | h_prev_tiled = tf.tile(tf.expand_dims(h_prev, 1), [1, self._q_len, 1]) # [N, JQ, d] 109 | f = tf.tanh(linear([qs, x_tiled, h_prev_tiled], self._input_size, True, scope='f')) # [N, JQ, d] 110 | a = tf.nn.softmax(exp_mask(linear(f, 1, True, squeeze=True, scope='a'), q_mask)) # [N, JQ] 111 | q = tf.reduce_sum(qs * tf.expand_dims(a, -1), 1) 112 | z = tf.concat(1, [x, q]) # [N, 2d] 113 | return self._cell(z, state) 114 | 115 | 116 | class AttentionCell(RNNCell): 117 | def __init__(self, cell, memory, mask=None, controller=None, mapper=None, input_keep_prob=1.0, is_train=None): 118 | """ 119 | Early fusion attention cell: uses the (inputs, state) to control the current attention. 
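        "Early fusion" here means the attended memory vector is merged into the
        cell *input* (via the mapper: plain concatenation, or the similarity
        features built by `get_sim_mapper`) before the wrapped cell is stepped,
        rather than being combined with the cell's output afterwards.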
120 | 121 | :param cell: 122 | :param memory: [N, M, m] 123 | :param mask: 124 | :param controller: (inputs, prev_state, memory) -> memory_logits 125 | """ 126 | self._cell = cell 127 | self._memory = memory 128 | self._mask = mask 129 | self._flat_memory = flatten(memory, 2) 130 | self._flat_mask = flatten(mask, 1) 131 | if controller is None: 132 | controller = AttentionCell.get_linear_controller(True, is_train=is_train) 133 | self._controller = controller 134 | if mapper is None: 135 | mapper = AttentionCell.get_concat_mapper() 136 | elif mapper == 'sim': 137 | mapper = AttentionCell.get_sim_mapper() 138 | self._mapper = mapper 139 | 140 | @property 141 | def state_size(self): 142 | return self._cell.state_size 143 | 144 | @property 145 | def output_size(self): 146 | return self._cell.output_size 147 | 148 | def __call__(self, inputs, state, scope=None): 149 | with tf.variable_scope(scope or "AttentionCell"): 150 | memory_logits = self._controller(inputs, state, self._flat_memory) 151 | sel_mem = softsel(self._flat_memory, memory_logits, mask=self._flat_mask) # [N, m] 152 | new_inputs, new_state = self._mapper(inputs, state, sel_mem) 153 | return self._cell(new_inputs, state) 154 | 155 | @staticmethod 156 | def get_double_linear_controller(size, bias, input_keep_prob=1.0, is_train=None): 157 | def double_linear_controller(inputs, state, memory): 158 | """ 159 | 160 | :param inputs: [N, i] 161 | :param state: [N, d] 162 | :param memory: [N, M, m] 163 | :return: [N, M] 164 | """ 165 | rank = len(memory.get_shape()) 166 | _memory_size = tf.shape(memory)[rank-2] 167 | tiled_inputs = tf.tile(tf.expand_dims(inputs, 1), [1, _memory_size, 1]) 168 | if isinstance(state, tuple): 169 | tiled_states = [tf.tile(tf.expand_dims(each, 1), [1, _memory_size, 1]) 170 | for each in state] 171 | else: 172 | tiled_states = [tf.tile(tf.expand_dims(state, 1), [1, _memory_size, 1])] 173 | 174 | # [N, M, d] 175 | in_ = tf.concat(2, [tiled_inputs] + tiled_states + [memory]) 176 | out = double_linear_logits(in_, size, bias, input_keep_prob=input_keep_prob, 177 | is_train=is_train) 178 | return out 179 | return double_linear_controller 180 | 181 | @staticmethod 182 | def get_linear_controller(bias, input_keep_prob=1.0, is_train=None): 183 | def linear_controller(inputs, state, memory): 184 | rank = len(memory.get_shape()) 185 | _memory_size = tf.shape(memory)[rank-2] 186 | tiled_inputs = tf.tile(tf.expand_dims(inputs, 1), [1, _memory_size, 1]) 187 | if isinstance(state, tuple): 188 | tiled_states = [tf.tile(tf.expand_dims(each, 1), [1, _memory_size, 1]) 189 | for each in state] 190 | else: 191 | tiled_states = [tf.tile(tf.expand_dims(state, 1), [1, _memory_size, 1])] 192 | 193 | # [N, M, d] 194 | in_ = tf.concat(2, [tiled_inputs] + tiled_states + [memory]) 195 | out = linear(in_, 1, bias, squeeze=True, input_keep_prob=input_keep_prob, is_train=is_train) 196 | return out 197 | return linear_controller 198 | 199 | @staticmethod 200 | def get_concat_mapper(): 201 | def concat_mapper(inputs, state, sel_mem): 202 | """ 203 | 204 | :param inputs: [N, i] 205 | :param state: [N, d] 206 | :param sel_mem: [N, m] 207 | :return: (new_inputs, new_state) tuple 208 | """ 209 | return tf.concat(1, [inputs, sel_mem]), state 210 | return concat_mapper 211 | 212 | @staticmethod 213 | def get_sim_mapper(): 214 | def sim_mapper(inputs, state, sel_mem): 215 | """ 216 | Assume that inputs and sel_mem are the same size 217 | :param inputs: [N, i] 218 | :param state: [N, d] 219 | :param sel_mem: [N, i] 220 | :return: (new_inputs, 
new_state) tuple 221 | """ 222 | return tf.concat(1, [inputs, sel_mem, inputs * sel_mem, tf.abs(inputs - sel_mem)]), state 223 | return sim_mapper 224 | -------------------------------------------------------------------------------- /Experiment 1/my/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import deque 3 | 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | 8 | def mytqdm(list_, desc="", show=True): 9 | if show: 10 | pbar = tqdm(list_) 11 | pbar.set_description(desc) 12 | return pbar 13 | return list_ 14 | 15 | 16 | def json_pretty_dump(obj, fh): 17 | return json.dump(obj, fh, sort_keys=True, indent=2, separators=(',', ': ')) 18 | 19 | 20 | def index(l, i): 21 | return index(l[i[0]], i[1:]) if len(i) > 1 else l[i[0]] 22 | 23 | 24 | def fill(l, shape, dtype=None): 25 | out = np.zeros(shape, dtype=dtype) 26 | stack = deque() 27 | stack.appendleft(((), l)) 28 | while len(stack) > 0: 29 | indices, cur = stack.pop() 30 | if len(indices) < shape: 31 | for i, sub in enumerate(cur): 32 | stack.appendleft([indices + (i,), sub]) 33 | else: 34 | out[indices] = cur 35 | return out 36 | 37 | 38 | def short_floats(o, precision): 39 | class ShortFloat(float): 40 | def __repr__(self): 41 | return '%.{}g'.format(precision) % self 42 | 43 | def _short_floats(obj): 44 | if isinstance(obj, float): 45 | return ShortFloat(obj) 46 | elif isinstance(obj, dict): 47 | return dict((k, _short_floats(v)) for k, v in obj.items()) 48 | elif isinstance(obj, (list, tuple)): 49 | return tuple(map(_short_floats, obj)) 50 | return obj 51 | 52 | return _short_floats(o) 53 | 54 | 55 | def argmax(x): 56 | return np.unravel_index(x.argmax(), x.shape) 57 | 58 | 59 | -------------------------------------------------------------------------------- /Experiment 1/my/zip_save.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import shutil 5 | from zipfile import ZipFile 6 | 7 | from tqdm import tqdm 8 | 9 | 10 | def get_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('paths', nargs='+') 13 | parser.add_argument('-o', '--out', default='save.zip') 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def zip_save(args): 19 | temp_dir = "." 
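    # For each checkpoint path given on the command line (expected to look like
    # "out/basic/30/save/basic-18000"), the loop below copies the checkpoint and
    # its run's shared.json into ./save/<model_name>/ together with a readme.txt
    # recording the source path, then zips the whole ./save folder into args.out.
    # Hypothetical invocation (paths illustrative, not taken from this repo):
    #   python3 zip_save.py out/basic/30/save/basic-18000 -o save.zip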
20 | save_dir = os.path.join(temp_dir, "save") 21 | if not os.path.exists(save_dir): 22 | os.makedirs(save_dir) 23 | for save_source_path in tqdm(args.paths): 24 | # path = "out/basic/30/save/basic-18000" 25 | # target_path = "save_dir/30/save" 26 | # also output full path name to "save_dir/30/readme.txt 27 | # need to also extract "out/basic/30/shared.json" 28 | temp, _ = os.path.split(save_source_path) # "out/basic/30/save", _ 29 | model_dir, _ = os.path.split(temp) # "out/basic/30, _ 30 | _, model_name = os.path.split(model_dir) 31 | cur_dir = os.path.join(save_dir, model_name) 32 | if not os.path.exists(cur_dir): 33 | os.makedirs(cur_dir) 34 | save_target_path = os.path.join(cur_dir, "save") 35 | shared_target_path = os.path.join(cur_dir, "shared.json") 36 | readme_path = os.path.join(cur_dir, "readme.txt") 37 | shared_source_path = os.path.join(model_dir, "shared.json") 38 | shutil.copy(save_source_path, save_target_path) 39 | shutil.copy(shared_source_path, shared_target_path) 40 | with open(readme_path, 'w') as fh: 41 | fh.write(save_source_path) 42 | 43 | os.system("zip {} -r {}".format(args.out, save_dir)) 44 | 45 | def main(): 46 | args = get_args() 47 | zip_save(args) 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /Experiment 1/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=0.11 2 | nltk 3 | tqdm 4 | jinja2 -------------------------------------------------------------------------------- /Experiment 1/squad/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 1/squad/__init__.py -------------------------------------------------------------------------------- /Experiment 1/squad/aug_squad.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from tqdm import tqdm 5 | 6 | from my.corenlp_interface import CoreNLPInterface 7 | 8 | in_path = sys.argv[1] 9 | out_path = sys.argv[2] 10 | url = sys.argv[3] 11 | port = int(sys.argv[4]) 12 | data = json.load(open(in_path, 'r')) 13 | 14 | h = CoreNLPInterface(url, port) 15 | 16 | 17 | def find_all(a_str, sub): 18 | start = 0 19 | while True: 20 | start = a_str.find(sub, start) 21 | if start == -1: return 22 | yield start 23 | start += len(sub) # use start += 1 to find overlapping matches 24 | 25 | 26 | def to_hex(s): 27 | return " ".join(map(hex, map(ord, s))) 28 | 29 | 30 | def handle_nobreak(cand, text): 31 | if cand == text: 32 | return cand 33 | if cand.replace(u'\u00A0', ' ') == text: 34 | return cand 35 | elif cand == text.replace(u'\u00A0', ' '): 36 | return text 37 | raise Exception("{} '{}' {} '{}'".format(cand, to_hex(cand), text, to_hex(text))) 38 | 39 | 40 | # resolving unicode complication 41 | 42 | wrong_loc_count = 0 43 | loc_diffs = [] 44 | 45 | for article in data['data']: 46 | for para in article['paragraphs']: 47 | para['context'] = para['context'].replace(u'\u000A', '') 48 | para['context'] = para['context'].replace(u'\u00A0', ' ') 49 | context = para['context'] 50 | for qa in para['qas']: 51 | for answer in qa['answers']: 52 | answer['text'] = answer['text'].replace(u'\u00A0', ' ') 53 | text = answer['text'] 54 | answer_start = answer['answer_start'] 55 | if context[answer_start:answer_start + len(text)] == text: 56 | if text.lstrip() == 
text: 57 | pass 58 | else: 59 | answer_start += len(text) - len(text.lstrip()) 60 | answer['answer_start'] = answer_start 61 | text = text.lstrip() 62 | answer['text'] = text 63 | else: 64 | wrong_loc_count += 1 65 | text = text.lstrip() 66 | answer['text'] = text 67 | starts = list(find_all(context, text)) 68 | if len(starts) == 1: 69 | answer_start = starts[0] 70 | elif len(starts) > 1: 71 | new_answer_start = min(starts, key=lambda s: abs(s - answer_start)) 72 | loc_diffs.append(abs(new_answer_start - answer_start)) 73 | answer_start = new_answer_start 74 | else: 75 | raise Exception() 76 | answer['answer_start'] = answer_start 77 | 78 | answer_stop = answer_start + len(text) 79 | answer['answer_stop'] = answer_stop 80 | assert para['context'][answer_start:answer_stop] == answer['text'], "{} {}".format( 81 | para['context'][answer_start:answer_stop], answer['text']) 82 | 83 | print(wrong_loc_count, loc_diffs) 84 | 85 | mismatch_count = 0 86 | dep_fail_count = 0 87 | no_answer_count = 0 88 | 89 | size = sum(len(article['paragraphs']) for article in data['data']) 90 | pbar = tqdm(range(size)) 91 | 92 | for ai, article in enumerate(data['data']): 93 | for pi, para in enumerate(article['paragraphs']): 94 | context = para['context'] 95 | sents = h.split_doc(context) 96 | words = h.split_sent(context) 97 | sent_starts = [] 98 | ref_idx = 0 99 | for sent in sents: 100 | new_idx = context.find(sent, ref_idx) 101 | sent_starts.append(new_idx) 102 | ref_idx = new_idx + len(sent) 103 | para['sents'] = sents 104 | para['words'] = words 105 | para['sent_starts'] = sent_starts 106 | 107 | consts = list(map(h.get_const, sents)) 108 | para['consts'] = consts 109 | deps = list(map(h.get_dep, sents)) 110 | para['deps'] = deps 111 | 112 | for qa in para['qas']: 113 | question = qa['question'] 114 | question_const = h.get_const(question) 115 | qa['const'] = question_const 116 | question_dep = h.get_dep(question) 117 | qa['dep'] = question_dep 118 | qa['words'] = h.split_sent(question) 119 | 120 | for answer in qa['answers']: 121 | answer_start = answer['answer_start'] 122 | text = answer['text'] 123 | answer_stop = answer_start + len(text) 124 | # answer_words = h.split_sent(text) 125 | word_idxs = [] 126 | answer_words = [] 127 | for sent_idx, (sent, sent_start, dep) in enumerate(zip(sents, sent_starts, deps)): 128 | if dep is None: 129 | print("dep parse failed at {} {} {}".format(ai, pi, sent_idx)) 130 | dep_fail_count += 1 131 | continue 132 | nodes, edges = dep 133 | words = [node[0] for node in nodes] 134 | 135 | for word_idx, (word, _, _, start, _) in enumerate(nodes): 136 | global_start = sent_start + start 137 | global_stop = global_start + len(word) 138 | if answer_start <= global_start < answer_stop or answer_start < global_stop <= answer_stop: 139 | word_idxs.append((sent_idx, word_idx)) 140 | answer_words.append(word) 141 | if len(word_idxs) > 0: 142 | answer['answer_word_start'] = word_idxs[0] 143 | answer['answer_word_stop'] = word_idxs[-1][0], word_idxs[-1][1] + 1 144 | if not text.startswith(answer_words[0]): 145 | print("'{}' '{}'".format(text, ' '.join(answer_words))) 146 | mismatch_count += 1 147 | else: 148 | answer['answer_word_start'] = None 149 | answer['answer_word_stop'] = None 150 | no_answer_count += 1 151 | pbar.update(1) 152 | pbar.close() 153 | 154 | print(mismatch_count, dep_fail_count, no_answer_count) 155 | 156 | print("saving...") 157 | json.dump(data, open(out_path, 'w')) -------------------------------------------------------------------------------- /Experiment 
1/squad/eda_aug_dev.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import json\n", 12 | "\n", 13 | "aug_data_path = \"/Users/minjoons/data/squad/dev-v1.0-aug.json\"\n", 14 | "aug_data = json.load(open(aug_data_path, 'r'))" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 17, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "(['Denver', 'Broncos'], 'Denver Broncos')\n", 29 | "(['Denver', 'Broncos'], 'Denver Broncos')\n", 30 | "(['Denver', 'Broncos'], 'Denver Broncos ')\n", 31 | "(['Carolina', 'Panthers'], 'Carolina Panthers')\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "def compare_answers():\n", 37 | " for article in aug_data['data']:\n", 38 | " for para in article['paragraphs']:\n", 39 | " deps = para['deps']\n", 40 | " nodess = []\n", 41 | " for dep in deps:\n", 42 | " nodes, edges = dep\n", 43 | " if dep is not None:\n", 44 | " nodess.append(nodes)\n", 45 | " else:\n", 46 | " nodess.append([])\n", 47 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n", 48 | " for qa in para['qas']:\n", 49 | " for answer in qa['answers']:\n", 50 | " text = answer['text']\n", 51 | " word_start = answer['answer_word_start']\n", 52 | " word_stop = answer['answer_word_stop']\n", 53 | " answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n", 54 | " yield answer_words, text\n", 55 | "\n", 56 | "ca = compare_answers()\n", 57 | "print(next(ca))\n", 58 | "print(next(ca))\n", 59 | "print(next(ca))\n", 60 | "print(next(ca))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 18, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "8\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "def counter():\n", 80 | " count = 0\n", 81 | " for article in aug_data['data']:\n", 82 | " for para in article['paragraphs']:\n", 83 | " deps = para['deps']\n", 84 | " nodess = []\n", 85 | " for dep in deps:\n", 86 | " if dep is None:\n", 87 | " count += 1\n", 88 | " print(count)\n", 89 | "counter()\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 19, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "0\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "def bad_node_counter():\n", 109 | " count = 0\n", 110 | " for article in aug_data['data']:\n", 111 | " for para in article['paragraphs']:\n", 112 | " sents = para['sents']\n", 113 | " deps = para['deps']\n", 114 | " nodess = []\n", 115 | " for dep in deps:\n", 116 | " if dep is not None:\n", 117 | " nodes, edges = dep\n", 118 | " for node in nodes:\n", 119 | " if len(node) != 5:\n", 120 | " count += 1\n", 121 | " print(count)\n", 122 | "bad_node_counter() " 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 20, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "7\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "def noanswer_counter():\n", 142 | " count = 0\n", 143 | " for article in aug_data['data']:\n", 144 | " for para in article['paragraphs']:\n", 145 | 
" deps = para['deps']\n", 146 | " nodess = []\n", 147 | " for dep in deps:\n", 148 | " if dep is not None:\n", 149 | " nodes, edges = dep\n", 150 | " nodess.append(nodes)\n", 151 | " else:\n", 152 | " nodess.append([])\n", 153 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n", 154 | " for qa in para['qas']:\n", 155 | " for answer in qa['answers']:\n", 156 | " text = answer['text']\n", 157 | " word_start = answer['answer_word_start']\n", 158 | " word_stop = answer['answer_word_stop']\n", 159 | " if word_start is None:\n", 160 | " count += 1\n", 161 | " print(count)\n", 162 | "noanswer_counter()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 22, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "10600\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "print(sum(len(para['qas']) for a in aug_data['data'] for para in a['paragraphs']))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 5, 187 | "metadata": { 188 | "collapsed": false 189 | }, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "10348\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "import nltk\n", 201 | "\n", 202 | "def _set_span(t, i):\n", 203 | " if isinstance(t[0], str):\n", 204 | " t.span = (i, i+len(t))\n", 205 | " else:\n", 206 | " first = True\n", 207 | " for c in t:\n", 208 | " cur_span = _set_span(c, i)\n", 209 | " i = cur_span[1]\n", 210 | " if first:\n", 211 | " min_ = cur_span[0]\n", 212 | " first = False\n", 213 | " max_ = cur_span[1]\n", 214 | " t.span = (min_, max_)\n", 215 | " return t.span\n", 216 | "\n", 217 | "\n", 218 | "def set_span(t):\n", 219 | " assert isinstance(t, nltk.tree.Tree)\n", 220 | " try:\n", 221 | " return _set_span(t, 0)\n", 222 | " except:\n", 223 | " print(t)\n", 224 | " exit()\n", 225 | "\n", 226 | "def same_span_counter():\n", 227 | " count = 0\n", 228 | " for article in aug_data['data']:\n", 229 | " for para in article['paragraphs']:\n", 230 | " consts = para['consts']\n", 231 | " for const in consts:\n", 232 | " tree = nltk.tree.Tree.fromstring(const)\n", 233 | " set_span(tree)\n", 234 | " if len(list(tree.subtrees())) > len(set(t.span for t in tree.subtrees())):\n", 235 | " count += 1\n", 236 | " print(count)\n", 237 | "same_span_counter()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": true 245 | }, 246 | "outputs": [], 247 | "source": [] 248 | } 249 | ], 250 | "metadata": { 251 | "kernelspec": { 252 | "display_name": "Python 3", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.5.1" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 0 271 | } 272 | -------------------------------------------------------------------------------- /Experiment 1/squad/eda_aug_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import json\n", 12 | "\n", 13 | "aug_data_path = 
\"/Users/minjoons/data/squad/train-v1.0-aug.json\"\n", 14 | "aug_data = json.load(open(aug_data_path, 'r'))" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')\n", 29 | "(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')\n", 30 | "(['the', 'Main', 'Building'], 'the Main Building')\n", 31 | "(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "def compare_answers():\n", 37 | " for article in aug_data['data']:\n", 38 | " for para in article['paragraphs']:\n", 39 | " deps = para['deps']\n", 40 | " nodess = []\n", 41 | " for dep in deps:\n", 42 | " nodes, edges = dep\n", 43 | " if dep is not None:\n", 44 | " nodess.append(nodes)\n", 45 | " else:\n", 46 | " nodess.append([])\n", 47 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n", 48 | " for qa in para['qas']:\n", 49 | " for answer in qa['answers']:\n", 50 | " text = answer['text']\n", 51 | " word_start = answer['answer_word_start']\n", 52 | " word_stop = answer['answer_word_stop']\n", 53 | " answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n", 54 | " yield answer_words, text\n", 55 | "\n", 56 | "ca = compare_answers()\n", 57 | "print(next(ca))\n", 58 | "print(next(ca))\n", 59 | "print(next(ca))\n", 60 | "print(next(ca))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 11, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "x: .\n", 75 | "x: .\n", 76 | "x: .\n", 77 | "x: .\n", 78 | "x: .\n", 79 | "x: .\n", 80 | "x: .\n", 81 | "x: .\n", 82 | "q: k\n", 83 | "q: j\n", 84 | "q: n\n", 85 | "q: b\n", 86 | "q: v\n", 87 | "x: .\n", 88 | "x: :208\n", 89 | "x: .\n", 90 | "x: .\n", 91 | "x: .\n", 92 | "x: .\n", 93 | "x: .\n", 94 | "x: .\n", 95 | "x: .\n", 96 | "x: .\n", 97 | "x: .\n", 98 | "x: .\n", 99 | "x: .\n", 100 | "q: dd\n", 101 | "q: dd\n", 102 | "q: dd\n", 103 | "q: dd\n", 104 | "q: d\n", 105 | "x: .\n", 106 | "x: .\n", 107 | "x: .\n", 108 | "x: .\n", 109 | "x: .\n", 110 | "x: .\n", 111 | "x: .\n", 112 | "x: .\n", 113 | "x: :411\n", 114 | "x: .\n", 115 | "x: .\n", 116 | "x: .\n", 117 | "x: .\n", 118 | "x: .\n", 119 | "x: .\n", 120 | "x: :40\n", 121 | "x: .\n", 122 | "x: *\n", 123 | "x: :14\n", 124 | "x: .\n", 125 | "x: .\n", 126 | "x: .\n", 127 | "x: :131\n", 128 | "x: .\n", 129 | "x: .\n", 130 | "x: .\n", 131 | "x: .\n", 132 | "x: .\n", 133 | "x: .\n", 134 | "x: .\n", 135 | "x: .\n", 136 | "x: .\n", 137 | "53 10\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "def nodep_counter():\n", 143 | " x_count = 0\n", 144 | " q_count = 0\n", 145 | " for article in aug_data['data']:\n", 146 | " for para in article['paragraphs']:\n", 147 | " deps = para['deps']\n", 148 | " nodess = []\n", 149 | " for sent, dep in zip(para['sents'], deps):\n", 150 | " if dep is None:\n", 151 | " print(\"x:\", sent)\n", 152 | " x_count += 1\n", 153 | " for qa in para['qas']:\n", 154 | " if qa['dep'] is None:\n", 155 | " print(\"q:\", qa['question'])\n", 156 | " q_count += 1\n", 157 | " print(x_count, q_count)\n", 158 | "nodep_counter()\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 4, 164 | "metadata": { 165 | "collapsed": false 
166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "0\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "def bad_node_counter():\n", 178 | " count = 0\n", 179 | " for article in aug_data['data']:\n", 180 | " for para in article['paragraphs']:\n", 181 | " sents = para['sents']\n", 182 | " deps = para['deps']\n", 183 | " nodess = []\n", 184 | " for dep in deps:\n", 185 | " if dep is not None:\n", 186 | " nodes, edges = dep\n", 187 | " for node in nodes:\n", 188 | " if len(node) != 5:\n", 189 | " count += 1\n", 190 | " print(count)\n", 191 | "bad_node_counter() " 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 5, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "36\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "def noanswer_counter():\n", 211 | " count = 0\n", 212 | " for article in aug_data['data']:\n", 213 | " for para in article['paragraphs']:\n", 214 | " deps = para['deps']\n", 215 | " nodess = []\n", 216 | " for dep in deps:\n", 217 | " if dep is not None:\n", 218 | " nodes, edges = dep\n", 219 | " nodess.append(nodes)\n", 220 | " else:\n", 221 | " nodess.append([])\n", 222 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n", 223 | " for qa in para['qas']:\n", 224 | " for answer in qa['answers']:\n", 225 | " text = answer['text']\n", 226 | " word_start = answer['answer_word_start']\n", 227 | " word_stop = answer['answer_word_stop']\n", 228 | " if word_start is None:\n", 229 | " count += 1\n", 230 | " print(count)\n", 231 | "noanswer_counter()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 14, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "106\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "def mult_sent_answer_counter():\n", 251 | " count = 0\n", 252 | " for article in aug_data['data']:\n", 253 | " for para in article['paragraphs']:\n", 254 | " for qa in para['qas']:\n", 255 | " for answer in qa['answers']:\n", 256 | " text = answer['text']\n", 257 | " word_start = answer['answer_word_start']\n", 258 | " word_stop = answer['answer_word_stop']\n", 259 | " if word_start is not None and word_start[0] != word_stop[0]:\n", 260 | " count += 1\n", 261 | " print(count)\n", 262 | "mult_sent_answer_counter()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": true 270 | }, 271 | "outputs": [], 272 | "source": [] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "outputs": [], 281 | "source": [] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | "outputs": [], 290 | "source": [] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "Python 3", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.5.1" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 0 314 | } 315 | 
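Both EDA notebooks above run the same answer-span sanity check inline. Below is a
minimal standalone sketch of that check, assuming an augmented JSON file produced
by aug_squad.py; the mismatch criterion mirrors aug_squad.py's
`text.startswith(answer_words[0])` test, and the command-line file name is illustrative.

import json
import sys


def check_answer_spans(aug_path):
    """Re-derive each answer's tokens from its (sent_idx, word_idx) span and
    compare them with the raw answer text, as the notebooks do."""
    data = json.load(open(aug_path, 'r'))
    mismatches, total = 0, 0
    for article in data['data']:
        for para in article['paragraphs']:
            # One token list per sentence; failed dependency parses give [].
            wordss = [[node[0] for node in dep[0]] if dep is not None else []
                      for dep in para['deps']]
            for qa in para['qas']:
                for answer in qa['answers']:
                    start, stop = answer['answer_word_start'], answer['answer_word_stop']
                    if start is None:
                        continue  # counted separately by noanswer_counter()
                    total += 1
                    tokens = wordss[start[0]][start[1]:stop[1]]
                    if not tokens or not answer['text'].startswith(tokens[0]):
                        mismatches += 1
    print("{}/{} answers whose first token does not match the text".format(mismatches, total))


if __name__ == '__main__':
    check_answer_spans(sys.argv[1])  # e.g. dev-v1.0-aug.json (illustrative path)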
-------------------------------------------------------------------------------- /Experiment 1/squad/evaluate-v1.1.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """ 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 
63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 66 | prediction = predictions[qa['id']] 67 | exact_match += metric_max_over_ground_truths( 68 | exact_match_score, prediction, ground_truths) 69 | f1 += metric_max_over_ground_truths( 70 | f1_score, prediction, ground_truths) 71 | 72 | exact_match = 100.0 * exact_match / total 73 | f1 = 100.0 * f1 / total 74 | 75 | return {'exact_match': exact_match, 'f1': f1} 76 | 77 | 78 | if __name__ == '__main__': 79 | expected_version = '1.1' 80 | parser = argparse.ArgumentParser( 81 | description='Evaluation for SQuAD ' + expected_version) 82 | parser.add_argument('dataset_file', help='Dataset file') 83 | parser.add_argument('prediction_file', help='Prediction File') 84 | args = parser.parse_args() 85 | with open(args.dataset_file) as dataset_file: 86 | dataset_json = json.load(dataset_file) 87 | if (dataset_json['version'] != expected_version): 88 | print('Evaluation expects v-' + expected_version + 89 | ', but got dataset with v-' + dataset_json['version'], 90 | file=sys.stderr) 91 | dataset = dataset_json['data'] 92 | with open(args.prediction_file) as prediction_file: 93 | predictions = json.load(prediction_file) 94 | print(json.dumps(evaluate(dataset, predictions))) 95 | -------------------------------------------------------------------------------- /Experiment 1/squad/evaluate.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. [Changed name for external importing]""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will 
receive score 0.' 63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 66 | prediction = predictions[qa['id']] 67 | exact_match += metric_max_over_ground_truths( 68 | exact_match_score, prediction, ground_truths) 69 | f1 += metric_max_over_ground_truths( 70 | f1_score, prediction, ground_truths) 71 | 72 | exact_match = 100.0 * exact_match / total 73 | f1 = 100.0 * f1 / total 74 | 75 | return {'exact_match': exact_match, 'f1': f1} 76 | 77 | 78 | if __name__ == '__main__': 79 | expected_version = '1.1' 80 | parser = argparse.ArgumentParser( 81 | description='Evaluation for SQuAD ' + expected_version) 82 | parser.add_argument('dataset_file', help='Dataset file') 83 | parser.add_argument('prediction_file', help='Prediction File') 84 | args = parser.parse_args() 85 | with open(args.dataset_file) as dataset_file: 86 | dataset_json = json.load(dataset_file) 87 | if (dataset_json['version'] != expected_version): 88 | print('Evaluation expects v-' + expected_version + 89 | ', but got dataset with v-' + dataset_json['version'], 90 | file=sys.stderr) 91 | dataset = dataset_json['data'] 92 | with open(args.prediction_file) as prediction_file: 93 | predictions = json.load(prediction_file) 94 | print(json.dumps(evaluate(dataset, predictions))) 95 | -------------------------------------------------------------------------------- /Experiment 1/squad/prepro_aug.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | # data: q, cq, (dq), (pq), y, *x, *cx 5 | # shared: x, cx, (dx), (px), word_counter, char_counter, word2vec 6 | # no metadata 7 | from collections import Counter 8 | 9 | import nltk 10 | from tqdm import tqdm 11 | 12 | from my.nltk_utils import load_compressed_tree 13 | 14 | 15 | def bool_(arg): 16 | if arg == 'True': 17 | return True 18 | elif arg == 'False': 19 | return False 20 | raise Exception() 21 | 22 | 23 | def main(): 24 | args = get_args() 25 | prepro(args) 26 | 27 | 28 | def get_args(): 29 | parser = argparse.ArgumentParser() 30 | home = os.path.expanduser("~") 31 | source_dir = os.path.join(home, "data", "squad") 32 | target_dir = "data/squad" 33 | glove_dir = os.path.join(home, "data", "glove") 34 | parser.add_argument("--source_dir", default=source_dir) 35 | parser.add_argument("--target_dir", default=target_dir) 36 | parser.add_argument("--debug", default=False, type=bool_) 37 | parser.add_argument("--train_ratio", default=0.9, type=float) 38 | parser.add_argument("--glove_corpus", default="6B") 39 | parser.add_argument("--glove_dir", default=glove_dir) 40 | parser.add_argument("--glove_vec_size", default=100, type=int) 41 | parser.add_argument("--full_train", default=False, type=bool_) 42 | # TODO : put more args here 43 | return parser.parse_args() 44 | 45 | 46 | def prepro(args): 47 | if not os.path.exists(args.target_dir): 48 | os.makedirs(args.target_dir) 49 | 50 | if args.full_train: 51 | data_train, shared_train = prepro_each(args, 'train') 52 | data_dev, shared_dev = prepro_each(args, 'dev') 53 | else: 54 | data_train, shared_train = prepro_each(args, 'train', 0.0, args.train_ratio) 55 | data_dev, shared_dev = prepro_each(args, 'train', args.train_ratio, 1.0) 56 | data_test, shared_test = prepro_each(args, 'dev') 57 | 58 | print("saving ...") 59 | save(args, data_train, shared_train, 'train') 60 | save(args, data_dev, shared_dev, 'dev') 61 | save(args, data_test, shared_test, 'test') 62 | 63 | 64 | def save(args, data, shared,
data_type): 65 | data_path = os.path.join(args.target_dir, "data_{}.json".format(data_type)) 66 | shared_path = os.path.join(args.target_dir, "shared_{}.json".format(data_type)) 67 | json.dump(data, open(data_path, 'w')) 68 | json.dump(shared, open(shared_path, 'w')) 69 | 70 | 71 | def get_word2vec(args, word_counter): 72 | glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size)) 73 | sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)} 74 | total = sizes[args.glove_corpus] 75 | word2vec_dict = {} 76 | with open(glove_path, 'r') as fh: 77 | for line in tqdm(fh, total=total): 78 | array = line.lstrip().rstrip().split(" ") 79 | word = array[0] 80 | vector = list(map(float, array[1:])) 81 | if word in word_counter: 82 | word2vec_dict[word] = vector 83 | elif word.capitalize() in word_counter: 84 | word2vec_dict[word.capitalize()] = vector 85 | elif word.lower() in word_counter: 86 | word2vec_dict[word.lower()] = vector 87 | elif word.upper() in word_counter: 88 | word2vec_dict[word.upper()] = vector 89 | 90 | print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path)) 91 | return word2vec_dict 92 | 93 | 94 | def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0): 95 | source_path = os.path.join(args.source_dir, "{}-v1.0-aug.json".format(data_type)) 96 | source_data = json.load(open(source_path, 'r')) 97 | 98 | q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] 99 | x, cx, tx, stx = [], [], [], [] 100 | answerss = [] 101 | word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() 102 | pos_counter = Counter() 103 | start_ai = int(round(len(source_data['data']) * start_ratio)) 104 | stop_ai = int(round(len(source_data['data']) * stop_ratio)) 105 | for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): 106 | xp, cxp, txp, stxp = [], [], [], [] 107 | x.append(xp) 108 | cx.append(cxp) 109 | tx.append(txp) 110 | stx.append(stxp) 111 | for pi, para in enumerate(article['paragraphs']): 112 | xi = [] 113 | for dep in para['deps']: 114 | if dep is None: 115 | xi.append([]) 116 | else: 117 | xi.append([node[0] for node in dep[0]]) 118 | cxi = [[list(xijk) for xijk in xij] for xij in xi] 119 | xp.append(xi) 120 | cxp.append(cxi) 121 | txp.append(para['consts']) 122 | stxp.append([str(load_compressed_tree(s)) for s in para['consts']]) 123 | trees = map(nltk.tree.Tree.fromstring, para['consts']) 124 | for tree in trees: 125 | for subtree in tree.subtrees(): 126 | pos_counter[subtree.label()] += 1 127 | 128 | for xij in xi: 129 | for xijk in xij: 130 | word_counter[xijk] += len(para['qas']) 131 | lower_word_counter[xijk.lower()] += len(para['qas']) 132 | for xijkl in xijk: 133 | char_counter[xijkl] += len(para['qas']) 134 | 135 | rxi = [ai, pi] 136 | assert len(x) - 1 == ai 137 | assert len(x[ai]) - 1 == pi 138 | for qa in para['qas']: 139 | dep = qa['dep'] 140 | qi = [] if dep is None else [node[0] for node in dep[0]] 141 | cqi = [list(qij) for qij in qi] 142 | yi = [] 143 | answers = [] 144 | for answer in qa['answers']: 145 | answers.append(answer['text']) 146 | yi0 = answer['answer_word_start'] or [0, 0] 147 | yi1 = answer['answer_word_stop'] or [0, 1] 148 | assert len(xi[yi0[0]]) > yi0[1] 149 | assert len(xi[yi1[0]]) >= yi1[1] 150 | yi.append([yi0, yi1]) 151 | 152 | for qij in qi: 153 | word_counter[qij] += 1 154 | lower_word_counter[qij.lower()] += 1 155 | for qijk in qij: 156 | char_counter[qijk] += 1 
157 | 158 | q.append(qi) 159 | cq.append(cqi) 160 | y.append(yi) 161 | rx.append(rxi) 162 | rcx.append(rxi) 163 | ids.append(qa['id']) 164 | idxs.append(len(idxs)) 165 | answerss.append(answers) 166 | 167 | if args.debug: 168 | break 169 | 170 | word2vec_dict = get_word2vec(args, word_counter) 171 | lower_word2vec_dict = get_word2vec(args, lower_word_counter) 172 | 173 | data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, '*tx': rx, '*stx': rx, 174 | 'idxs': idxs, 'ids': ids, 'answerss': answerss} 175 | shared = {'x': x, 'cx': cx, 'tx': tx, 'stx': stx, 176 | 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 177 | 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'pos_counter': pos_counter} 178 | 179 | return data, shared 180 | 181 | 182 | if __name__ == "__main__": 183 | main() -------------------------------------------------------------------------------- /Experiment 1/squad/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def get_2d_spans(text, tokenss): 5 | spanss = [] 6 | cur_idx = 0 7 | for tokens in tokenss: 8 | spans = [] 9 | for token in tokens: 10 | if text.find(token, cur_idx) < 0: 11 | print(tokens) 12 | print("{} {} {}".format(token, cur_idx, text)) 13 | raise Exception() 14 | cur_idx = text.find(token, cur_idx) 15 | spans.append((cur_idx, cur_idx + len(token))) 16 | cur_idx += len(token) 17 | spanss.append(spans) 18 | return spanss 19 | 20 | 21 | def get_word_span(context, wordss, start, stop): 22 | spanss = get_2d_spans(context, wordss) 23 | idxs = [] 24 | for sent_idx, spans in enumerate(spanss): 25 | for word_idx, span in enumerate(spans): 26 | if not (stop <= span[0] or start >= span[1]): 27 | idxs.append((sent_idx, word_idx)) 28 | 29 | assert len(idxs) > 0, "{} {} {} {}".format(context, spanss, start, stop) 30 | return idxs[0], (idxs[-1][0], idxs[-1][1] + 1) 31 | 32 | 33 | def get_phrase(context, wordss, span): 34 | """ 35 | Obtain phrase as substring of context given start and stop indices in word level 36 | :param context: 37 | :param wordss: 38 | :param start: [sent_idx, word_idx] 39 | :param stop: [sent_idx, word_idx] 40 | :return: 41 | """ 42 | start, stop = span 43 | flat_start = get_flat_idx(wordss, start) 44 | flat_stop = get_flat_idx(wordss, stop) 45 | words = sum(wordss, []) 46 | char_idx = 0 47 | char_start, char_stop = None, None 48 | for word_idx, word in enumerate(words): 49 | char_idx = context.find(word, char_idx) 50 | assert char_idx >= 0 51 | if word_idx == flat_start: 52 | char_start = char_idx 53 | char_idx += len(word) 54 | if word_idx == flat_stop - 1: 55 | char_stop = char_idx 56 | assert char_start is not None 57 | assert char_stop is not None 58 | return context[char_start:char_stop] 59 | 60 | 61 | def get_flat_idx(wordss, idx): 62 | return sum(len(words) for words in wordss[:idx[0]]) + idx[1] 63 | 64 | 65 | def get_word_idx(context, wordss, idx): 66 | spanss = get_2d_spans(context, wordss) 67 | return spanss[idx[0]][idx[1]][0] 68 | 69 | 70 | def process_tokens(temp_tokens): 71 | tokens = [] 72 | for token in temp_tokens: 73 | flag = False 74 | l = ("-", "\u2212", "\u2014", "\u2013", "/", "~", '"', "'", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0") 75 | # \u2013 is en-dash. 
Used for number to number 76 | # l = ("-", "\u2212", "\u2014", "\u2013") 77 | # l = ("\u2013",) 78 | tokens.extend(re.split("([{}])".format("".join(l)), token)) 79 | return tokens 80 | 81 | 82 | def get_best_span(ypi, yp2i): 83 | max_val = 0 84 | best_word_span = (0, 1) 85 | best_sent_idx = 0 86 | for f, (ypif, yp2if) in enumerate(zip(ypi, yp2i)): 87 | argmax_j1 = 0 88 | for j in range(len(ypif)): 89 | val1 = ypif[argmax_j1] 90 | if val1 < ypif[j]: 91 | val1 = ypif[j] 92 | argmax_j1 = j 93 | 94 | val2 = yp2if[j] 95 | if val1 * val2 > max_val: 96 | best_word_span = (argmax_j1, j) 97 | best_sent_idx = f 98 | max_val = val1 * val2 99 | return ((best_sent_idx, best_word_span[0]), (best_sent_idx, best_word_span[1] + 1)), float(max_val) 100 | 101 | 102 | def get_span_score_pairs(ypi, yp2i): 103 | span_score_pairs = [] 104 | for f, (ypif, yp2if) in enumerate(zip(ypi, yp2i)): 105 | for j in range(len(ypif)): 106 | for k in range(j, len(yp2if)): 107 | span = ((f, j), (f, k+1)) 108 | score = ypif[j] * yp2if[k] 109 | span_score_pairs.append((span, score)) 110 | return span_score_pairs 111 | 112 | 113 | -------------------------------------------------------------------------------- /Experiment 1/tree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 1/tree/__init__.py -------------------------------------------------------------------------------- /Experiment 1/tree/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pprint import pprint 3 | 4 | import tensorflow as tf 5 | 6 | from tree.main import main as m 7 | 8 | flags = tf.app.flags 9 | 10 | flags.DEFINE_string("model_name", "tree", "Model name [tree]") 11 | flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]") 12 | flags.DEFINE_integer("run_id", 0, "Run ID [0]") 13 | 14 | flags.DEFINE_integer("batch_size", 128, "Batch size [128]") 15 | flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]") 16 | flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]") 17 | flags.DEFINE_integer("num_steps", 0, "Number of steps [0]") 18 | flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]") 19 | flags.DEFINE_integer("load_step", 0, "load step [0]") 20 | flags.DEFINE_integer("early_stop", 4, "early stop [4]") 21 | 22 | flags.DEFINE_string("mode", "test", "train | test | forward [test]") 23 | flags.DEFINE_boolean("load", True, "load saved data? [True]") 24 | flags.DEFINE_boolean("progress", True, "Show progress? [True]") 25 | flags.DEFINE_integer("log_period", 100, "Log period [100]") 26 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]") 27 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]") 28 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]") 29 | 30 | flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]") 31 | 32 | flags.DEFINE_integer("hidden_size", 32, "Hidden size [32]") 33 | flags.DEFINE_float("input_keep_prob", 0.5, "Input keep prob [0.5]") 34 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]") 35 | flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]") 36 | flags.DEFINE_float("wd", 0.0001, "Weight decay [0.0001]") 37 | flags.DEFINE_bool("lower_word", True, "lower word [True]") 38 | flags.DEFINE_bool("dump_eval", True, "dump eval?
[True]") 39 | 40 | flags.DEFINE_integer("word_count_th", 100, "word count th [100]") 41 | flags.DEFINE_integer("char_count_th", 500, "char count th [500]") 42 | flags.DEFINE_integer("sent_size_th", 64, "sent size th [64]") 43 | flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]") 44 | flags.DEFINE_integer("ques_size_th", 64, "ques size th [64]") 45 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]") 46 | flags.DEFINE_integer("tree_height_th", 16, "tree height th [16]") 47 | 48 | 49 | def main(_): 50 | config = flags.FLAGS 51 | 52 | config.out_dir = os.path.join("out", config.model_name, str(config.run_id).zfill(2)) 53 | 54 | m(config) 55 | 56 | if __name__ == "__main__": 57 | tf.app.run() 58 | -------------------------------------------------------------------------------- /Experiment 1/tree/evaluator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from tree.read_data import DataSet 5 | from my.nltk_utils import span_f1 6 | 7 | 8 | class Evaluation(object): 9 | def __init__(self, data_type, global_step, idxs, yp): 10 | self.data_type = data_type 11 | self.global_step = global_step 12 | self.idxs = idxs 13 | self.yp = yp 14 | self.num_examples = len(yp) 15 | self.dict = {'data_type': data_type, 16 | 'global_step': global_step, 17 | 'yp': yp, 18 | 'idxs': idxs, 19 | 'num_examples': self.num_examples} 20 | self.summaries = None 21 | 22 | def __repr__(self): 23 | return "{} step {}".format(self.data_type, self.global_step) 24 | 25 | def __add__(self, other): 26 | if other == 0: 27 | return self 28 | assert self.data_type == other.data_type 29 | assert self.global_step == other.global_step 30 | new_yp = self.yp + other.yp 31 | new_idxs = self.idxs + other.idxs 32 | return Evaluation(self.data_type, self.global_step, new_idxs, new_yp) 33 | 34 | def __radd__(self, other): 35 | return self.__add__(other) 36 | 37 | 38 | class LabeledEvaluation(Evaluation): 39 | def __init__(self, data_type, global_step, idxs, yp, y): 40 | super(LabeledEvaluation, self).__init__(data_type, global_step, idxs, yp) 41 | self.y = y 42 | self.dict['y'] = y 43 | 44 | def __add__(self, other): 45 | if other == 0: 46 | return self 47 | assert self.data_type == other.data_type 48 | assert self.global_step == other.global_step 49 | new_yp = self.yp + other.yp 50 | new_y = self.y + other.y 51 | new_idxs = self.idxs + other.idxs 52 | return LabeledEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y) 53 | 54 | 55 | class AccuracyEvaluation(LabeledEvaluation): 56 | def __init__(self, data_type, global_step, idxs, yp, y, correct, loss): 57 | super(AccuracyEvaluation, self).__init__(data_type, global_step, idxs, yp, y) 58 | self.loss = loss 59 | self.correct = correct 60 | self.acc = sum(correct) / len(correct) 61 | self.dict['loss'] = loss 62 | self.dict['correct'] = correct 63 | self.dict['acc'] = self.acc 64 | loss_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/loss', simple_value=self.loss)]) 65 | acc_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/acc', simple_value=self.acc)]) 66 | self.summaries = [loss_summary, acc_summary] 67 | 68 | def __repr__(self): 69 | return "{} step {}: accuracy={}, loss={}".format(self.data_type, self.global_step, self.acc, self.loss) 70 | 71 | def __add__(self, other): 72 | if other == 0: 73 | return self 74 | assert self.data_type == other.data_type 75 | assert self.global_step == other.global_step 76 | new_idxs = self.idxs + other.idxs 77 | new_yp = 
self.yp + other.yp 78 | new_y = self.y + other.y 79 | new_correct = self.correct + other.correct 80 | new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct) 81 | return AccuracyEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, new_correct, new_loss) 82 | 83 | 84 | class Evaluator(object): 85 | def __init__(self, config, model): 86 | self.config = config 87 | self.model = model 88 | 89 | def get_evaluation(self, sess, batch): 90 | idxs, data_set = batch 91 | feed_dict = self.model.get_feed_dict(data_set, False, supervised=False) 92 | global_step, yp = sess.run([self.model.global_step, self.model.yp], feed_dict=feed_dict) 93 | yp = yp[:data_set.num_examples] 94 | e = Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist()) 95 | return e 96 | 97 | def get_evaluation_from_batches(self, sess, batches): 98 | e = sum(self.get_evaluation(sess, batch) for batch in batches) 99 | return e 100 | 101 | 102 | class LabeledEvaluator(Evaluator): 103 | def get_evaluation(self, sess, batch): 104 | idxs, data_set = batch 105 | feed_dict = self.model.get_feed_dict(data_set, False, supervised=False) 106 | global_step, yp = sess.run([self.model.global_step, self.model.yp], feed_dict=feed_dict) 107 | yp = yp[:data_set.num_examples] 108 | y = feed_dict[self.model.y] 109 | e = LabeledEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist()) 110 | return e 111 | 112 | 113 | class AccuracyEvaluator(LabeledEvaluator): 114 | def get_evaluation(self, sess, batch): 115 | idxs, data_set = batch 116 | assert isinstance(data_set, DataSet) 117 | feed_dict = self.model.get_feed_dict(data_set, False) 118 | global_step, yp, loss = sess.run([self.model.global_step, self.model.yp, self.model.loss], feed_dict=feed_dict) 119 | y = feed_dict[self.model.y] 120 | yp = yp[:data_set.num_examples] 121 | correct = [self.__class__.compare(yi, ypi) for yi, ypi in zip(y, yp)] 122 | e = AccuracyEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist(), correct, float(loss)) 123 | return e 124 | 125 | @staticmethod 126 | def compare(yi, ypi): 127 | return int(np.argmax(yi)) == int(np.argmax(ypi)) 128 | 129 | 130 | class AccuracyEvaluator2(AccuracyEvaluator): 131 | @staticmethod 132 | def compare(yi, ypi): 133 | i = int(np.argmax(yi.flatten())) 134 | j = int(np.argmax(ypi.flatten())) 135 | # print(i, j, i == j) 136 | return i == j 137 | 138 | 139 | class TempEvaluation(AccuracyEvaluation): 140 | def __init__(self, data_type, global_step, idxs, yp, yp2, y, y2, correct, loss, f1s): 141 | super(TempEvaluation, self).__init__(data_type, global_step, idxs, yp, y, correct, loss) 142 | self.y2 = y2 143 | self.yp2 = yp2 144 | self.f1s = f1s 145 | self.f1 = float(np.mean(f1s)) 146 | self.dict['y2'] = y2 147 | self.dict['yp2'] = yp2 148 | self.dict['f1s'] = f1s 149 | self.dict['f1'] = self.f1 150 | f1_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/f1', simple_value=self.f1)]) 151 | self.summaries.append(f1_summary) 152 | 153 | def __add__(self, other): 154 | if other == 0: 155 | return self 156 | assert self.data_type == other.data_type 157 | assert self.global_step == other.global_step 158 | new_idxs = self.idxs + other.idxs 159 | new_yp = self.yp + other.yp 160 | new_yp2 = self.yp2 + other.yp2 161 | new_y = self.y + other.y 162 | new_y2 = self.y2 + other.y2 163 | new_correct = self.correct + other.correct 164 | new_f1s = self.f1s + other.f1s 165 | new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / 
len(new_correct) 166 | return TempEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_y, new_y2, new_correct, new_loss, new_f1s) 167 | 168 | 169 | class TempEvaluator(LabeledEvaluator): 170 | def get_evaluation(self, sess, batch): 171 | idxs, data_set = batch 172 | assert isinstance(data_set, DataSet) 173 | feed_dict = self.model.get_feed_dict(data_set, False) 174 | global_step, yp, yp2, loss = sess.run([self.model.global_step, self.model.yp, self.model.yp2, self.model.loss], feed_dict=feed_dict) 175 | y, y2 = feed_dict[self.model.y], feed_dict[self.model.y2] 176 | yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples] 177 | correct = [self.__class__.compare(yi, y2i, ypi, yp2i) for yi, y2i, ypi, yp2i in zip(y, y2, yp, yp2)] 178 | f1s = [self.__class__.span_f1(yi, y2i, ypi, yp2i) for yi, y2i, ypi, yp2i in zip(y, y2, yp, yp2)] 179 | e = TempEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), y.tolist(), y2.tolist(), correct, float(loss), f1s) 180 | return e 181 | 182 | @staticmethod 183 | def compare(yi, y2i, ypi, yp2i): 184 | i = int(np.argmax(yi.flatten())) 185 | j = int(np.argmax(ypi.flatten())) 186 | k = int(np.argmax(y2i.flatten())) 187 | l = int(np.argmax(yp2i.flatten())) 188 | # print(i, j, i == j) 189 | return i == j and k == l 190 | 191 | @staticmethod 192 | def span_f1(yi, y2i, ypi, yp2i): 193 | true_span = (np.argmax(yi.flatten()), np.argmax(y2i.flatten())+1) 194 | pred_span = (np.argmax(ypi.flatten()), np.argmax(yp2i.flatten())+1) 195 | f1 = span_f1(true_span, pred_span) 196 | return f1 197 | 198 | -------------------------------------------------------------------------------- /Experiment 1/tree/graph_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | from json import encoder 3 | import os 4 | 5 | import tensorflow as tf 6 | 7 | from tree.evaluator import Evaluation 8 | from my.utils import short_floats 9 | 10 | 11 | class GraphHandler(object): 12 | def __init__(self, config): 13 | self.config = config 14 | self.saver = tf.train.Saver() 15 | self.writer = None 16 | self.save_path = os.path.join(config.save_dir, config.model_name) 17 | 18 | def initialize(self, sess): 19 | if self.config.load: 20 | self._load(sess) 21 | else: 22 | sess.run(tf.initialize_all_variables()) 23 | 24 | if self.config.mode == 'train': 25 | self.writer = tf.train.SummaryWriter(self.config.log_dir, graph=tf.get_default_graph()) 26 | 27 | def save(self, sess, global_step=None): 28 | self.saver.save(sess, self.save_path, global_step=global_step) 29 | 30 | def _load(self, sess): 31 | config = self.config 32 | if config.load_step > 0: 33 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step)) 34 | else: 35 | save_dir = config.save_dir 36 | checkpoint = tf.train.get_checkpoint_state(save_dir) 37 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir) 38 | save_path = checkpoint.model_checkpoint_path 39 | print("Loading saved model from {}".format(save_path)) 40 | self.saver.restore(sess, save_path) 41 | 42 | def add_summary(self, summary, global_step): 43 | self.writer.add_summary(summary, global_step) 44 | 45 | def add_summaries(self, summaries, global_step): 46 | for summary in summaries: 47 | self.add_summary(summary, global_step) 48 | 49 | def dump_eval(self, e, precision=2): 50 | assert isinstance(e, Evaluation) 51 | path = os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, 
str(e.global_step).zfill(6))) 52 | with open(path, 'w') as fh: 53 | json.dump(short_floats(e.dict, precision), fh) 54 | 55 | -------------------------------------------------------------------------------- /Experiment 1/tree/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | import shutil 6 | from pprint import pprint 7 | 8 | import tensorflow as tf 9 | from tqdm import tqdm 10 | import numpy as np 11 | 12 | from tree.evaluator import AccuracyEvaluator2, Evaluator 13 | from tree.graph_handler import GraphHandler 14 | from tree.model import Model 15 | from tree.trainer import Trainer 16 | 17 | from tree.read_data import load_metadata, read_data, get_squad_data_filter, update_config 18 | 19 | 20 | def main(config): 21 | set_dirs(config) 22 | if config.mode == 'train': 23 | _train(config) 24 | elif config.mode == 'test': 25 | _test(config) 26 | elif config.mode == 'forward': 27 | _forward(config) 28 | else: 29 | raise ValueError("invalid value for 'mode': {}".format(config.mode)) 30 | 31 | 32 | def _config_draft(config): 33 | if config.draft: 34 | config.num_steps = 10 35 | config.eval_period = 10 36 | config.log_period = 1 37 | config.save_period = 10 38 | config.eval_num_batches = 1 39 | 40 | 41 | def _train(config): 42 | # load_metadata(config, 'train') # this updates the config file according to metadata file 43 | 44 | data_filter = get_squad_data_filter(config) 45 | train_data = read_data(config, 'train', config.load, data_filter=data_filter) 46 | dev_data = read_data(config, 'dev', True, data_filter=data_filter) 47 | update_config(config, [train_data, dev_data]) 48 | 49 | _config_draft(config) 50 | 51 | word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec'] 52 | word2idx_dict = train_data.shared['word2idx'] 53 | idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict} 54 | print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict))) 55 | emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict 56 | else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size)) 57 | for idx in range(config.word_vocab_size)]) 58 | config.emb_mat = emb_mat 59 | 60 | # construct model graph and variables (using default graph) 61 | pprint(config.__flags, indent=2) 62 | model = Model(config) 63 | trainer = Trainer(config, model) 64 | evaluator = AccuracyEvaluator2(config, model) 65 | graph_handler = GraphHandler(config) # controls all tensors and variables in the graph, including loading /saving 66 | 67 | # Variables 68 | sess = tf.Session() 69 | graph_handler.initialize(sess) 70 | 71 | # begin training 72 | num_steps = config.num_steps or int(config.num_epochs * train_data.num_examples / config.batch_size) 73 | max_acc = 0 74 | noupdate_count = 0 75 | global_step = 0 76 | for _, batch in tqdm(train_data.get_batches(config.batch_size, num_batches=num_steps, shuffle=True), total=num_steps): 77 | global_step = sess.run(model.global_step) + 1 # +1 because all calculations are done after step 78 | get_summary = global_step % config.log_period == 0 79 | loss, summary, train_op = trainer.step(sess, batch, get_summary=get_summary) 80 | if get_summary: 81 | graph_handler.add_summary(summary, global_step) 82 | 83 | # Occasional evaluation and saving 84 | if global_step % config.save_period == 0: 85 | graph_handler.save(sess, 
global_step=global_step) 86 | if global_step % config.eval_period == 0: 87 | num_batches = math.ceil(dev_data.num_examples / config.batch_size) 88 | if 0 < config.eval_num_batches < num_batches: 89 | num_batches = config.eval_num_batches 90 | e = evaluator.get_evaluation_from_batches( 91 | sess, tqdm(dev_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches)) 92 | graph_handler.add_summaries(e.summaries, global_step) 93 | if e.acc > max_acc: 94 | max_acc = e.acc 95 | noupdate_count = 0 96 | else: 97 | noupdate_count += 1 98 | if noupdate_count == config.early_stop: 99 | break 100 | if config.dump_eval: 101 | graph_handler.dump_eval(e) 102 | if global_step % config.save_period != 0: 103 | graph_handler.save(sess, global_step=global_step) 104 | 105 | 106 | def _test(config): 107 | test_data = read_data(config, 'test', True) 108 | update_config(config, [test_data]) 109 | 110 | _config_draft(config) 111 | 112 | pprint(config.__flags, indent=2) 113 | model = Model(config) 114 | evaluator = AccuracyEvaluator2(config, model) 115 | graph_handler = GraphHandler(config) # controls all tensors and variables in the graph, including loading /saving 116 | 117 | sess = tf.Session() 118 | graph_handler.initialize(sess) 119 | 120 | num_batches = math.ceil(test_data.num_examples / config.batch_size) 121 | if 0 < config.eval_num_batches < num_batches: 122 | num_batches = config.eval_num_batches 123 | e = evaluator.get_evaluation_from_batches(sess, tqdm(test_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches)) 124 | print(e) 125 | if config.dump_eval: 126 | graph_handler.dump_eval(e) 127 | 128 | 129 | def _forward(config): 130 | 131 | forward_data = read_data(config, 'forward', True) 132 | 133 | _config_draft(config) 134 | 135 | pprint(config.__flags, indent=2) 136 | model = Model(config) 137 | evaluator = Evaluator(config, model) 138 | graph_handler = GraphHandler(config) # controls all tensors and variables in the graph, including loading /saving 139 | 140 | sess = tf.Session() 141 | graph_handler.initialize(sess) 142 | 143 | num_batches = math.ceil(forward_data.num_examples / config.batch_size) 144 | if 0 < config.eval_num_batches < num_batches: 145 | num_batches = config.eval_num_batches 146 | e = evaluator.get_evaluation_from_batches(sess, tqdm(forward_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches)) 147 | print(e) 148 | if config.dump_eval: 149 | graph_handler.dump_eval(e) 150 | 151 | 152 | def set_dirs(config): 153 | # create directories 154 | if not config.load and os.path.exists(config.out_dir): 155 | shutil.rmtree(config.out_dir) 156 | 157 | config.save_dir = os.path.join(config.out_dir, "save") 158 | config.log_dir = os.path.join(config.out_dir, "log") 159 | config.eval_dir = os.path.join(config.out_dir, "eval") 160 | if not os.path.exists(config.out_dir): 161 | os.makedirs(config.out_dir) 162 | if not os.path.exists(config.save_dir): 163 | os.mkdir(config.save_dir) 164 | if not os.path.exists(config.log_dir): os.mkdir(config.log_dir) 165 | if not os.path.exists(config.eval_dir): os.mkdir(config.eval_dir) 166 | 167 | 168 | def _get_args(): 169 | parser = argparse.ArgumentParser() 170 | parser.add_argument("config_path") 171 | return parser.parse_args() 172 | 173 | 174 | class Config(object): 175 | def __init__(self, **entries): 176 | self.__dict__.update(entries) 177 | 178 | 179 | def _run(): 180 | args = _get_args() 181 | with open(args.config_path, 'r') as fh: 182 | config = Config(**json.load(fh)) 183 | main(config) 184 | 185 | 186 | if __name__ == "__main__": 187 | _run()
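
Note: GraphHandler.dump_eval (tree/graph_handler.py above) writes each evaluation to <out_dir>/eval/<data_type>-<global_step>.json, where out_dir is out/<model_name>/<run_id> (see tree/cli.py), and the stored fields are those collected in the evaluator's dict ('idxs', 'yp', 'y', 'correct', 'acc', 'loss', 'num_examples', ...). A minimal sketch of re-reading such a dump outside TensorFlow follows; the run id and step are illustrative, and stored floats are rounded to the precision used by dump_eval.

import json
import os

# Illustrative location: model "tree", run "00", dev evaluation dumped at step 5000.
eval_path = os.path.join("out", "tree", "00", "eval", "dev-005000.json")
with open(eval_path) as fh:
    e = json.load(fh)

# 'correct' holds one entry per example; 'acc' and 'loss' were stored at dump time.
print("examples:", e["num_examples"])
print("accuracy:", sum(e["correct"]) / len(e["correct"]), "(stored:", e["acc"], ")")
print("loss:", e["loss"])
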
188 | -------------------------------------------------------------------------------- /Experiment 1/tree/read_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import itertools 5 | import math 6 | 7 | import nltk 8 | 9 | from my.nltk_utils import load_compressed_tree 10 | from my.utils import index 11 | 12 | 13 | class DataSet(object): 14 | def __init__(self, data, data_type, shared=None, valid_idxs=None): 15 | total_num_examples = len(next(iter(data.values()))) 16 | self.data = data # e.g. {'X': [0, 1, 2], 'Y': [2, 3, 4]} 17 | self.data_type = data_type 18 | self.shared = shared 19 | self.valid_idxs = range(total_num_examples) if valid_idxs is None else valid_idxs 20 | self.num_examples = len(self.valid_idxs) 21 | 22 | def get_batches(self, batch_size, num_batches=None, shuffle=False): 23 | num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size)) 24 | if num_batches is None: 25 | num_batches = num_batches_per_epoch 26 | num_epochs = int(math.ceil(num_batches / num_batches_per_epoch)) 27 | 28 | idxs = itertools.chain.from_iterable(random.sample(self.valid_idxs, len(self.valid_idxs)) 29 | if shuffle else self.valid_idxs 30 | for _ in range(num_epochs)) 31 | for _ in range(num_batches): 32 | batch_idxs = tuple(itertools.islice(idxs, batch_size)) 33 | batch_data = {} 34 | for key, val in self.data.items(): 35 | if key.startswith('*'): 36 | assert self.shared is not None 37 | shared_key = key[1:] 38 | batch_data[shared_key] = [index(self.shared[shared_key], val[idx]) for idx in batch_idxs] 39 | else: 40 | batch_data[key] = list(map(val.__getitem__, batch_idxs)) 41 | 42 | batch_ds = DataSet(batch_data, self.data_type, shared=self.shared) 43 | yield batch_idxs, batch_ds 44 | 45 | 46 | class SquadDataSet(DataSet): 47 | def __init__(self, data, data_type, shared=None, valid_idxs=None): 48 | super(SquadDataSet, self).__init__(data, data_type, shared=shared, valid_idxs=valid_idxs) 49 | 50 | 51 | def load_metadata(config, data_type): 52 | metadata_path = os.path.join(config.data_dir, "metadata_{}.json".format(data_type)) 53 | with open(metadata_path, 'r') as fh: 54 | metadata = json.load(fh) 55 | for key, val in metadata.items(): 56 | config.__setattr__(key, val) 57 | return metadata 58 | 59 | 60 | def read_data(config, data_type, ref, data_filter=None): 61 | data_path = os.path.join(config.data_dir, "data_{}.json".format(data_type)) 62 | shared_path = os.path.join(config.data_dir, "shared_{}.json".format(data_type)) 63 | with open(data_path, 'r') as fh: 64 | data = json.load(fh) 65 | with open(shared_path, 'r') as fh: 66 | shared = json.load(fh) 67 | 68 | num_examples = len(next(iter(data.values()))) 69 | if data_filter is None: 70 | valid_idxs = range(num_examples) 71 | else: 72 | mask = [] 73 | keys = data.keys() 74 | values = data.values() 75 | for vals in zip(*values): 76 | each = {key: val for key, val in zip(keys, vals)} 77 | mask.append(data_filter(each, shared)) 78 | valid_idxs = [idx for idx in range(len(mask)) if mask[idx]] 79 | 80 | print("Loaded {}/{} examples from {}".format(len(valid_idxs), num_examples, data_type)) 81 | 82 | shared_path = os.path.join(config.out_dir, "shared.json") 83 | if not ref: 84 | word_counter = shared['lower_word_counter'] if config.lower_word else shared['word_counter'] 85 | char_counter = shared['char_counter'] 86 | pos_counter = shared['pos_counter'] 87 | shared['word2idx'] = {word: idx + 2 for idx, word in 88 | enumerate(word for word, count in 
word_counter.items() 89 | if count > config.word_count_th)} 90 | shared['char2idx'] = {char: idx + 2 for idx, char in 91 | enumerate(char for char, count in char_counter.items() 92 | if count > config.char_count_th)} 93 | shared['pos2idx'] = {pos: idx + 2 for idx, pos in enumerate(pos_counter.keys())} 94 | NULL = "-NULL-" 95 | UNK = "-UNK-" 96 | shared['word2idx'][NULL] = 0 97 | shared['word2idx'][UNK] = 1 98 | shared['char2idx'][NULL] = 0 99 | shared['char2idx'][UNK] = 1 100 | shared['pos2idx'][NULL] = 0 101 | shared['pos2idx'][UNK] = 1 102 | json.dump({'word2idx': shared['word2idx'], 'char2idx': shared['char2idx'], 103 | 'pos2idx': shared['pos2idx']}, open(shared_path, 'w')) 104 | else: 105 | new_shared = json.load(open(shared_path, 'r')) 106 | for key, val in new_shared.items(): 107 | shared[key] = val 108 | 109 | data_set = DataSet(data, data_type, shared=shared, valid_idxs=valid_idxs) 110 | return data_set 111 | 112 | 113 | def get_squad_data_filter(config): 114 | def data_filter(data_point, shared): 115 | assert shared is not None 116 | rx, rcx, q, cq, y = (data_point[key] for key in ('*x', '*cx', 'q', 'cq', 'y')) 117 | x, cx, stx = shared['x'], shared['cx'], shared['stx'] 118 | if len(q) > config.ques_size_th: 119 | return False 120 | xi = x[rx[0]][rx[1]] 121 | if len(xi) > config.num_sents_th: 122 | return False 123 | if any(len(xij) > config.sent_size_th for xij in xi): 124 | return False 125 | stxi = stx[rx[0]][rx[1]] 126 | if any(nltk.tree.Tree.fromstring(s).height() > config.tree_height_th for s in stxi): 127 | return False 128 | return True 129 | return data_filter 130 | 131 | 132 | def update_config(config, data_sets): 133 | config.max_num_sents = 0 134 | config.max_sent_size = 0 135 | config.max_ques_size = 0 136 | config.max_word_size = 0 137 | config.max_tree_height = 0 138 | for data_set in data_sets: 139 | data = data_set.data 140 | shared = data_set.shared 141 | for idx in data_set.valid_idxs: 142 | rx = data['*x'][idx] 143 | q = data['q'][idx] 144 | sents = shared['x'][rx[0]][rx[1]] 145 | trees = map(nltk.tree.Tree.fromstring, shared['stx'][rx[0]][rx[1]]) 146 | config.max_tree_height = max(config.max_tree_height, max(tree.height() for tree in trees)) 147 | config.max_num_sents = max(config.max_num_sents, len(sents)) 148 | config.max_sent_size = max(config.max_sent_size, max(map(len, sents))) 149 | config.max_word_size = max(config.max_word_size, max(len(word) for sent in sents for word in sent)) 150 | if len(q) > 0: 151 | config.max_ques_size = max(config.max_ques_size, len(q)) 152 | config.max_word_size = max(config.max_word_size, max(len(word) for word in q)) 153 | 154 | config.max_word_size = min(config.max_word_size, config.word_size_th) 155 | 156 | config.char_vocab_size = len(data_sets[0].shared['char2idx']) 157 | config.word_emb_size = len(next(iter(data_sets[0].shared['word2vec'].values()))) 158 | config.word_vocab_size = len(data_sets[0].shared['word2idx']) 159 | config.pos_vocab_size = len(data_sets[0].shared['pos2idx']) 160 | -------------------------------------------------------------------------------- /Experiment 1/tree/templates/visualizer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 7 | 8 | 19 | 20 | 23 | 24 |

    {{ title }}

    25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | {% for row in rows %} 33 | 34 | 35 | 40 | 41 | 63 | 64 | {% endfor %} 65 |
    IDQuestionAnswerParagraph
    {{ row.id }} 36 | {% for qj in row.ques %} 37 | {{ qj }} 38 | {% endfor %} 39 | {{ row.a }} 42 | 43 | {% for xj, yj, y2j, ypj, yp2j in zip(row.para, row.y, row.y2, row.yp, row.yp2) %} 44 | 45 | {% for xjk, yjk, y2jk, ypjk in zip(xj, yj, y2j, ypj) %} 46 | 53 | {% endfor %} 54 | 55 | 56 | {% for xjk, yp2jk in zip(xj, yp2j) %} 57 | 58 | {% endfor %} 59 | 60 | {% endfor %} 61 |
    47 | {% if yjk or y2jk %} 48 | {{ xjk }} 49 | {% else %} 50 | {{ xjk }} 51 | {% endif %} 52 |
    -
    62 |
    66 | 67 | -------------------------------------------------------------------------------- /Experiment 1/tree/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import nltk\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "%matplotlib inline" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 10, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n", 28 | "(PRP I)\n", 29 | "(VP (VBP am) (NNP Sam))\n", 30 | "(VBP am)\n", 31 | "(NNP Sam)\n", 32 | "(. .)\n", 33 | "(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. .)))\"\n", 39 | "tree = nltk.tree.Tree.fromstring(string)\n", 40 | "\n", 41 | "def load_compressed_tree(s):\n", 42 | "\n", 43 | " def compress_tree(tree):\n", 44 | " if len(tree) == 1:\n", 45 | " if isinstance(tree[0], nltk.tree.Tree):\n", 46 | " return compress_tree(tree[0])\n", 47 | " else:\n", 48 | " return tree\n", 49 | " else:\n", 50 | " for i, t in enumerate(tree):\n", 51 | " tree[i] = compress_tree(t)\n", 52 | " return tree\n", 53 | "\n", 54 | " return compress_tree(nltk.tree.Tree.fromstring(s))\n", 55 | "tree = load_compressed_tree(string)\n", 56 | "for t in tree.subtrees():\n", 57 | " print(t)\n", 58 | " \n", 59 | "print(str(tree))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "(ROOT I am Sam .)\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "print(tree.flatten())" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 10, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "['ROOT', 'S', 'NP', 'PRP', 'VP', 'VBP', 'NP', 'NNP', '.']\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "print(list(t.label() for t in tree.subtrees()))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 11, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "import json\n", 109 | "d = json.load(open(\"data/squad/shared_dev.json\", 'r'))" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 12, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "73" 123 | ] 124 | }, 125 | "execution_count": 12, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "len(d['pos_counter'])" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 13, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "{'#': 6,\n", 145 | " '$': 80,\n", 146 | " \"''\": 1291,\n", 147 | " ',': 14136,\n", 148 | " '-LRB-': 1926,\n", 149 | " '-RRB-': 1925,\n", 150 | " '.': 9505,\n", 151 | " ':': 1455,\n", 152 | " 'ADJP': 3426,\n", 153 | " 'ADVP': 4936,\n", 154 | " 'CC': 9300,\n", 155 | " 'CD': 6216,\n", 156 | " 'CONJP': 191,\n", 157 | " 'DT': 26286,\n", 158 | " 'EX': 288,\n", 159 | " 
'FRAG': 107,\n", 160 | " 'FW': 96,\n", 161 | " 'IN': 32564,\n", 162 | " 'INTJ': 12,\n", 163 | " 'JJ': 21452,\n", 164 | " 'JJR': 563,\n", 165 | " 'JJS': 569,\n", 166 | " 'LS': 7,\n", 167 | " 'LST': 1,\n", 168 | " 'MD': 1051,\n", 169 | " 'NAC': 19,\n", 170 | " 'NN': 34750,\n", 171 | " 'NNP': 28392,\n", 172 | " 'NNPS': 1400,\n", 173 | " 'NNS': 16716,\n", 174 | " 'NP': 91636,\n", 175 | " 'NP-TMP': 236,\n", 176 | " 'NX': 108,\n", 177 | " 'PDT': 89,\n", 178 | " 'POS': 1451,\n", 179 | " 'PP': 33278,\n", 180 | " 'PRN': 2085,\n", 181 | " 'PRP': 2320,\n", 182 | " 'PRP$': 1959,\n", 183 | " 'PRT': 450,\n", 184 | " 'QP': 838,\n", 185 | " 'RB': 7611,\n", 186 | " 'RBR': 301,\n", 187 | " 'RBS': 252,\n", 188 | " 'ROOT': 9587,\n", 189 | " 'RP': 454,\n", 190 | " 'RRC': 19,\n", 191 | " 'S': 21557,\n", 192 | " 'SBAR': 5009,\n", 193 | " 'SBARQ': 6,\n", 194 | " 'SINV': 135,\n", 195 | " 'SQ': 5,\n", 196 | " 'SYM': 17,\n", 197 | " 'TO': 5167,\n", 198 | " 'UCP': 143,\n", 199 | " 'UH': 15,\n", 200 | " 'VB': 4197,\n", 201 | " 'VBD': 8377,\n", 202 | " 'VBG': 3570,\n", 203 | " 'VBN': 7218,\n", 204 | " 'VBP': 2897,\n", 205 | " 'VBZ': 4146,\n", 206 | " 'VP': 33696,\n", 207 | " 'WDT': 1368,\n", 208 | " 'WHADJP': 5,\n", 209 | " 'WHADVP': 439,\n", 210 | " 'WHNP': 1927,\n", 211 | " 'WHPP': 153,\n", 212 | " 'WP': 482,\n", 213 | " 'WP$': 50,\n", 214 | " 'WRB': 442,\n", 215 | " 'X': 23,\n", 216 | " '``': 1269}" 217 | ] 218 | }, 219 | "execution_count": 13, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "d['pos_counter']" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 3, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "[[False False False False]\n", 240 | " [False True False False]\n", 241 | " [False False False False]]\n", 242 | "[[0 2 2 0]\n", 243 | " [2 2 0 2]\n", 244 | " [2 0 0 0]]\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "from my.nltk_utils import tree2matrix, load_compressed_tree, find_max_f1_subtree, set_span\n", 250 | "string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. 
.)))\"\n", 251 | "tree = load_compressed_tree(string)\n", 252 | "span = (1, 3)\n", 253 | "set_span(tree)\n", 254 | "subtree = find_max_f1_subtree(tree, span)\n", 255 | "f = lambda t: t == subtree\n", 256 | "g = lambda t: 1 if isinstance(t, str) else 2\n", 257 | "a, b = tree2matrix(tree, f, dtype='bool')\n", 258 | "c, d = tree2matrix(tree, g, dtype='int32')\n", 259 | "print(a)\n", 260 | "print(c)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [] 271 | } 272 | ], 273 | "metadata": { 274 | "kernelspec": { 275 | "display_name": "Python 3", 276 | "language": "python", 277 | "name": "python3" 278 | }, 279 | "language_info": { 280 | "codemirror_mode": { 281 | "name": "ipython", 282 | "version": 3 283 | }, 284 | "file_extension": ".py", 285 | "mimetype": "text/x-python", 286 | "name": "python", 287 | "nbconvert_exporter": "python", 288 | "pygments_lexer": "ipython3", 289 | "version": "3.5.1" 290 | } 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 0 294 | } 295 | -------------------------------------------------------------------------------- /Experiment 1/tree/trainer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tree.model import Model 4 | 5 | 6 | class Trainer(object): 7 | def __init__(self, config, model): 8 | assert isinstance(model, Model) 9 | self.config = config 10 | self.model = model 11 | self.opt = tf.train.AdagradOptimizer(config.init_lr) 12 | self.loss = model.get_loss() 13 | self.var_list = model.get_var_list() 14 | self.global_step = model.get_global_step() 15 | self.ema_op = model.ema_op 16 | self.summary = model.summary 17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list) 18 | opt_op = self.opt.apply_gradients(self.grads, global_step=self.global_step) 19 | 20 | # Define train op 21 | with tf.control_dependencies([opt_op]): 22 | self.train_op = tf.group(self.ema_op) 23 | 24 | def get_train_op(self): 25 | return self.train_op 26 | 27 | def step(self, sess, batch, get_summary=False): 28 | assert isinstance(sess, tf.Session) 29 | feed_dict = self.model.get_feed_dict(batch, True) 30 | if get_summary: 31 | loss, summary, train_op = \ 32 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict) 33 | else: 34 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict) 35 | summary = None 36 | return loss, summary, train_op 37 | -------------------------------------------------------------------------------- /Experiment 1/tree/visualizer.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from collections import OrderedDict 3 | import http.server 4 | import socketserver 5 | import argparse 6 | import json 7 | import os 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from jinja2 import Environment, FileSystemLoader 12 | 13 | 14 | def bool_(string): 15 | if string == 'True': 16 | return True 17 | elif string == 'False': 18 | return False 19 | else: 20 | raise Exception() 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--model_name", type=str, default='basic') 25 | parser.add_argument("--data_type", type=str, default='dev') 26 | parser.add_argument("--step", type=int, default=5000) 27 | parser.add_argument("--template_name", type=str, default="visualizer.html") 28 | parser.add_argument("--num_per_page", type=int, 
default=100) 29 | parser.add_argument("--data_dir", type=str, default="data/squad") 30 | parser.add_argument("--port", type=int, default=8000) 31 | parser.add_argument("--host", type=str, default="0.0.0.0") 32 | parser.add_argument("--open", type=str, default='False') 33 | parser.add_argument("--run_id", type=str, default="0") 34 | 35 | args = parser.parse_args() 36 | return args 37 | 38 | 39 | def _decode(decoder, sent): 40 | return " ".join(decoder[idx] for idx in sent) 41 | 42 | 43 | def accuracy2_visualizer(args): 44 | model_name = args.model_name 45 | data_type = args.data_type 46 | num_per_page = args.num_per_page 47 | data_dir = args.data_dir 48 | run_id = args.run_id.zfill(2) 49 | step = args.step 50 | 51 | eval_path =os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6))) 52 | eval_ = json.load(open(eval_path, 'r')) 53 | 54 | _id = 0 55 | html_dir = "/tmp/list_results%d" % _id 56 | while os.path.exists(html_dir): 57 | _id += 1 58 | html_dir = "/tmp/list_results%d" % _id 59 | 60 | if os.path.exists(html_dir): 61 | shutil.rmtree(html_dir) 62 | os.mkdir(html_dir) 63 | 64 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 65 | templates_dir = os.path.join(cur_dir, 'templates') 66 | env = Environment(loader=FileSystemLoader(templates_dir)) 67 | env.globals.update(zip=zip, reversed=reversed) 68 | template = env.get_template(args.template_name) 69 | 70 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type)) 71 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type)) 72 | data = json.load(open(data_path, 'r')) 73 | shared = json.load(open(shared_path, 'r')) 74 | 75 | rows = [] 76 | for i, (idx, yi, ypi) in enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp')])): 77 | id_, q, rx = (data[key][idx] for key in ('ids', 'q', '*x')) 78 | x = shared['x'][rx[0]][rx[1]] 79 | ques = [" ".join(q)] 80 | para = [[word for word in sent] for sent in x] 81 | row = { 82 | 'id': id_, 83 | 'title': "Hello world!", 84 | 'ques': ques, 85 | 'para': para, 86 | 'y': yi, 87 | 'y2': yi, 88 | 'yp': ypi, 89 | 'yp2': ypi, 90 | 'a': "" 91 | } 92 | rows.append(row) 93 | 94 | if i % num_per_page == 0: 95 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8)) 96 | 97 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']): 98 | var_dict = {'title': "Accuracy Visualization", 99 | 'rows': rows 100 | } 101 | with open(html_path, "wb") as f: 102 | f.write(template.render(**var_dict).encode('UTF-8')) 103 | rows = [] 104 | 105 | os.chdir(html_dir) 106 | port = args.port 107 | host = args.host 108 | # Overriding to suppress log message 109 | class MyHandler(http.server.SimpleHTTPRequestHandler): 110 | def log_message(self, format, *args): 111 | pass 112 | handler = MyHandler 113 | httpd = socketserver.TCPServer((host, port), handler) 114 | if args.open == 'True': 115 | os.system("open http://%s:%d" % (args.host, args.port)) 116 | print("serving at %s:%d" % (host, port)) 117 | httpd.serve_forever() 118 | 119 | 120 | if __name__ == "__main__": 121 | ARGS = get_args() 122 | accuracy2_visualizer(ARGS) -------------------------------------------------------------------------------- /Experiment 2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/.DS_Store -------------------------------------------------------------------------------- /Experiment 2/code/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/code/__init__.py -------------------------------------------------------------------------------- /Experiment 2/code/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:0.12.1 2 | RUN pip install joblib 3 | RUN pip install nltk 4 | RUN pip install tqdm 5 | RUN pip install pyprind 6 | RUN python -m nltk.downloader --dir=/usr/local/share/nltk_data perluniprops punkt 7 | -------------------------------------------------------------------------------- /Experiment 2/code/evaluate.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """ 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | def white_space_fix(text): 16 | return ' '.join(text.split()) 17 | def remove_punc(text): 18 | exclude = set(string.punctuation) 19 | return ''.join(ch for ch in text if ch not in exclude) 20 | def lower(text): 21 | return text.lower() 22 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 23 | 24 | 25 | def f1_score(prediction, ground_truth): 26 | prediction_tokens = normalize_answer(prediction).split() 27 | ground_truth_tokens = normalize_answer(ground_truth).split() 28 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 29 | num_same = sum(common.values()) 30 | if num_same == 0: 31 | return 0 32 | precision = 1.0 * num_same / len(prediction_tokens) 33 | recall = 1.0 * num_same / len(ground_truth_tokens) 34 | f1 = (2 * precision * recall) / (precision + recall) 35 | return f1 36 | 37 | def f1_score_ours(prediction, ground_truth): 38 | if len(prediction) <= 0 or len(ground_truth) <= 0: 39 | return 0 40 | if prediction[0] > prediction[1]: 41 | return 0 42 | ours = range(prediction[0], prediction[1]+1) 43 | truth = range(ground_truth[0], ground_truth[1]+1) 44 | common = Counter(ours) & Counter(truth) 45 | num_same = sum(common.values()) 46 | if num_same == 0: 47 | return 0 48 | precision = 1.0 * num_same / len(ours) 49 | recall = 1.0 * num_same / len(truth) 50 | f1 = (2 * precision * recall) / (precision + recall) 51 | return f1 52 | 53 | def exact_match_score(prediction, ground_truth): 54 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 55 | 56 | 57 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 58 | scores_for_ground_truths = [] 59 | for ground_truth in ground_truths: 60 | score = metric_fn(prediction, ground_truth) 61 | scores_for_ground_truths.append(score) 62 | return max(scores_for_ground_truths) 63 | 64 | 65 | def evaluate(dataset, predictions): 66 | f1 = exact_match = total = 0 67 | for article in dataset: 68 | for paragraph in article['paragraphs']: 69 | for qa in paragraph['qas']: 70 | total += 1 71 | if qa['id'] not in predictions: 72 | message = 'Unanswered question ' + qa['id'] + \ 73 | ' will receive score 0.' 
74 | print(message, file=sys.stderr) 75 | continue 76 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 77 | prediction = predictions[qa['id']] 78 | exact_match += metric_max_over_ground_truths( 79 | exact_match_score, prediction, ground_truths) 80 | f1 += metric_max_over_ground_truths( 81 | f1_score, prediction, ground_truths) 82 | 83 | exact_match = 100.0 * exact_match / total 84 | f1 = 100.0 * f1 / total 85 | 86 | return {'exact_match': exact_match, 'f1': f1} 87 | 88 | 89 | if __name__ == '__main__': 90 | expected_version = '1.1' 91 | parser = argparse.ArgumentParser( 92 | description='Evaluation for SQuAD ' + expected_version) 93 | parser.add_argument('dataset_file', help='Dataset file') 94 | parser.add_argument('prediction_file', help='Prediction File') 95 | args = parser.parse_args() 96 | with open(args.dataset_file) as dataset_file: 97 | dataset_json = json.load(dataset_file) 98 | if (dataset_json['version'] != expected_version): 99 | print('Evaluation expects v-' + expected_version + 100 | ', but got dataset with v-' + dataset_json['version'], 101 | file=sys.stderr) 102 | dataset = dataset_json['data'] 103 | with open(args.prediction_file) as prediction_file: 104 | predictions = json.load(prediction_file) 105 | print(json.dumps(evaluate(dataset, predictions))) 106 | -------------------------------------------------------------------------------- /Experiment 2/code/evaluate.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/code/evaluate.pyc -------------------------------------------------------------------------------- /Experiment 2/code/get_started.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Downloads raw data into ./download 3 | # and saves preprocessed data into ./data 4 | # Get directory containing this script 5 | 6 | CODE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 7 | 8 | export PYTHONPATH=$PYTHONPATH:$CODE_DIR 9 | 10 | pip install -r $CODE_DIR/requirements.txt 11 | 12 | # download punkt, perluniprops 13 | if [ ! 
-d "/usr/local/share/nltk_data/tokenizers/punkt" ]; then 14 | python2 -m nltk.downloader punkt 15 | fi 16 | 17 | 18 | # SQuAD preprocess is in charge of downloading 19 | # and formatting the data to be consumed later 20 | DATA_DIR=data 21 | DOWNLOAD_DIR=download 22 | mkdir -p $DATA_DIR 23 | rm -rf $DATA_DIR 24 | python2 $CODE_DIR/preprocessing/squad_preprocess.py 25 | 26 | # Download distributed word representations 27 | python2 $CODE_DIR/preprocessing/dwr.py 28 | 29 | # Data processing for TensorFlow 30 | python2 $CODE_DIR/qa_data.py --glove_dim 100 31 | -------------------------------------------------------------------------------- /Experiment 2/code/plotHistograms.py: -------------------------------------------------------------------------------- 1 | import train 2 | import matplotlib.pyplot as plt 3 | 4 | if __name__ == "__main__": 5 | training_set = train.load_train_dataset("data/squad") 6 | questions, contexts, answers = zip(*training_set) 7 | question_lengths = map(len, questions) 8 | context_lengths = map(len, contexts) 9 | answer_lengths = [a[1] - a[0] + 1 for a in answers] 10 | plt.figure(1) 11 | plt.xlabel('Question Lengths') 12 | plt.ylabel('Counts') 13 | plt.hist(question_lengths, 60) 14 | plt.figure(2) 15 | plt.xlabel('Context Paragraph Lengths') 16 | plt.ylabel('Counts') 17 | plt.hist(context_lengths, 200) 18 | plt.figure(3) 19 | plt.xlabel('Answer Lengths') 20 | plt.ylabel('Counts') 21 | plt.hist(answer_lengths, 60) 22 | plt.show() 23 | -------------------------------------------------------------------------------- /Experiment 2/code/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/code/preprocessing/__init__.py -------------------------------------------------------------------------------- /Experiment 2/code/preprocessing/dwr.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | from squad_preprocess import * 3 | 4 | 5 | if __name__ == '__main__': 6 | glove_base_url = "http://nlp.stanford.edu/data/" 7 | glove_filename = "glove.6B.zip" 8 | prefix = os.path.join("download", "dwr") 9 | 10 | print("Storing datasets in {}".format(prefix)) 11 | 12 | if not os.path.exists(prefix): 13 | os.makedirs(prefix) 14 | 15 | glove_zip = maybe_download(glove_base_url, glove_filename, prefix, 862182613L) 16 | glove_zip_ref = zipfile.ZipFile(os.path.join(prefix, glove_filename), 'r') 17 | 18 | glove_zip_ref.extractall(prefix) 19 | glove_zip_ref.close() 20 | -------------------------------------------------------------------------------- /Experiment 2/code/preprocessing/squad_preprocess.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/code/preprocessing/squad_preprocess.pyc -------------------------------------------------------------------------------- /Experiment 2/code/qa_bidaf_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/code/qa_bidaf_model.pyc -------------------------------------------------------------------------------- /Experiment 2/code/qa_data.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import gzip 6 | import os 7 | import re 8 | import tarfile 9 | import argparse 10 | 11 | from six.moves import urllib 12 | 13 | from tensorflow.python.platform import gfile 14 | from tqdm import * 15 | import numpy as np 16 | from os.path import join as pjoin 17 | 18 | _PAD = b"<pad>" 19 | _SOS = b"<sos>" 20 | _UNK = b"<unk>" 21 | _START_VOCAB = [_PAD, _SOS, _UNK] 22 | 23 | PAD_ID = 0 24 | SOS_ID = 1 25 | UNK_ID = 2 26 | 27 | def setup_args(): 28 | parser = argparse.ArgumentParser() 29 | code_dir = os.path.join(os.path.dirname(os.path.realpath(__file__))) 30 | vocab_dir = os.path.join("data", "squad") 31 | glove_dir = os.path.join("download", "dwr") 32 | source_dir = os.path.join("data", "squad") 33 | parser.add_argument("--source_dir", default=source_dir) 34 | parser.add_argument("--glove_dir", default=glove_dir) 35 | parser.add_argument("--vocab_dir", default=vocab_dir) 36 | parser.add_argument("--glove_dim", default=100, type=int) 37 | parser.add_argument("--random_init", default=True, type=bool) 38 | return parser.parse_args() 39 | 40 | 41 | def basic_tokenizer(sentence): 42 | words = [] 43 | for space_separated_fragment in sentence.strip().split(): 44 | words.extend(re.split(" ", space_separated_fragment)) 45 | return [w for w in words if w] 46 | 47 | 48 | def initialize_vocabulary(vocabulary_path): 49 | # map vocab to word embeddings 50 | if gfile.Exists(vocabulary_path): 51 | rev_vocab = [] 52 | with gfile.GFile(vocabulary_path, mode="r") as f: 53 | rev_vocab.extend(f.readlines()) 54 | rev_vocab = [line.strip('\n') for line in rev_vocab] 55 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 56 | return vocab, rev_vocab 57 | else: 58 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 59 | 60 | 61 | def process_glove(args, vocab_list, save_path, size=4e5, random_init=True): 62 | """ 63 | :param vocab_list: [vocab] 64 | :return: 65 | """ 66 | if not gfile.Exists(save_path + ".npz"): 67 | glove_path = os.path.join(args.glove_dir, "glove.6B.{}d.txt".format(args.glove_dim)) 68 | if random_init: 69 | glove = np.random.randn(len(vocab_list), args.glove_dim) 70 | else: 71 | glove = np.zeros((len(vocab_list), args.glove_dim)) 72 | found = 0 73 | with open(glove_path, 'r') as fh: 74 | for line in tqdm(fh, total=size): 75 | array = line.lstrip().rstrip().split(" ") 76 | word = array[0] 77 | vector = list(map(float, array[1:])) 78 | if word in vocab_list: 79 | idx = vocab_list.index(word) 80 | glove[idx, :] = vector 81 | found += 1 82 | if word.capitalize() in vocab_list: 83 | idx = vocab_list.index(word.capitalize()) 84 | glove[idx, :] = vector 85 | found += 1 86 | if word.upper() in vocab_list: 87 | idx = vocab_list.index(word.upper()) 88 | glove[idx, :] = vector 89 | found += 1 90 | 91 | print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path)) 92 | np.savez_compressed(save_path, glove=glove) 93 | print("saved trimmed glove matrix at: {}".format(save_path)) 94 | 95 | 96 | def create_vocabulary(vocabulary_path, data_paths, tokenizer=None): 97 | if not gfile.Exists(vocabulary_path): 98 | print("Creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths))) 99 | vocab = {} 100 | for path in data_paths: 101 | with open(path, mode="rb") as f: 102 | counter = 0 103 | for line in f: 104 | counter += 1 105 | if counter %
100000 == 0: 106 | print("processing line %d" % counter) 107 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) 108 | for w in tokens: 109 | if w in vocab: 110 | vocab[w] += 1 111 | else: 112 | vocab[w] = 1 113 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) 114 | print("Vocabulary size: %d" % len(vocab_list)) 115 | with gfile.GFile(vocabulary_path, mode="wb") as vocab_file: 116 | for w in vocab_list: 117 | vocab_file.write(w + b"\n") 118 | 119 | 120 | def sentence_to_token_ids(sentence, vocabulary, tokenizer=None): 121 | if tokenizer: 122 | words = tokenizer(sentence) 123 | else: 124 | words = basic_tokenizer(sentence) 125 | return [vocabulary.get(w, UNK_ID) for w in words] 126 | 127 | 128 | def data_to_token_ids(data_path, target_path, vocabulary_path, 129 | tokenizer=None): 130 | if not gfile.Exists(target_path): 131 | print("Tokenizing data in %s" % data_path) 132 | vocab, _ = initialize_vocabulary(vocabulary_path) 133 | with gfile.GFile(data_path, mode="rb") as data_file: 134 | with gfile.GFile(target_path, mode="w") as tokens_file: 135 | counter = 0 136 | for line in data_file: 137 | counter += 1 138 | if counter % 5000 == 0: 139 | print("tokenizing line %d" % counter) 140 | token_ids = sentence_to_token_ids(line, vocab, tokenizer) 141 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") 142 | 143 | 144 | if __name__ == '__main__': 145 | args = setup_args() 146 | vocab_path = pjoin(args.vocab_dir, "vocab.dat") 147 | 148 | train_path = pjoin(args.source_dir, "train") 149 | valid_path = pjoin(args.source_dir, "val") 150 | dev_path = pjoin(args.source_dir, "dev") 151 | 152 | create_vocabulary(vocab_path, 153 | [pjoin(args.source_dir, "train.context"), 154 | pjoin(args.source_dir, "train.question"), 155 | pjoin(args.source_dir, "val.context"), 156 | pjoin(args.source_dir, "val.question")]) 157 | vocab, rev_vocab = initialize_vocabulary(pjoin(args.vocab_dir, "vocab.dat")) 158 | 159 | # ======== Trim Distributed Word Representation ======= 160 | # If you use other word representations, you should change the code below 161 | 162 | process_glove(args, rev_vocab, args.source_dir + "/glove.trimmed.{}".format(args.glove_dim), 163 | random_init=args.random_init) 164 | 165 | # ======== Creating Dataset ========= 166 | # We created our data files seperately 167 | # If your model loads data differently (like in bulk) 168 | # You should change the below code 169 | 170 | x_train_dis_path = train_path + ".ids.context" 171 | y_train_ids_path = train_path + ".ids.question" 172 | data_to_token_ids(train_path + ".context", x_train_dis_path, vocab_path) 173 | data_to_token_ids(train_path + ".question", y_train_ids_path, vocab_path) 174 | 175 | x_dis_path = valid_path + ".ids.context" 176 | y_ids_path = valid_path + ".ids.question" 177 | data_to_token_ids(valid_path + ".context", x_dis_path, vocab_path) 178 | data_to_token_ids(valid_path + ".question", y_ids_path, vocab_path) -------------------------------------------------------------------------------- /Experiment 2/code/qa_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/code/qa_model.pyc -------------------------------------------------------------------------------- /Experiment 2/code/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | joblib 3 | nltk >= 3.2.2 4 | tqdm 5 
| pyprind 6 | -------------------------------------------------------------------------------- /Experiment 2/code/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import json 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | from qa_model import Encoder, QASystem, Decoder 12 | from qa_bidaf_model import BidafEncoder, BidafQASystem, BidafDecoder 13 | from os.path import join as pjoin 14 | 15 | import logging 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | tf.app.flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 20 | tf.app.flags.DEFINE_float("max_gradient_norm", 20.0, "Clip gradients to this norm.") 21 | tf.app.flags.DEFINE_float("dropout", 0.15, "Fraction of units randomly dropped on non-recurrent connections.") 22 | tf.app.flags.DEFINE_integer("batch_size", 20, "Batch size to use during training.") 23 | tf.app.flags.DEFINE_integer("epochs", 10, "Number of epochs to train.") 24 | tf.app.flags.DEFINE_integer("state_size", 100, "Size of each model layer.") 25 | tf.app.flags.DEFINE_integer("output_size", 300, "The output size of your model.") 26 | tf.app.flags.DEFINE_integer("embedding_size", 100, "Size of the pretrained vocabulary.") 27 | tf.app.flags.DEFINE_integer("question_max_length", 60, "Maximum length of an input question.") 28 | tf.app.flags.DEFINE_integer("context_paragraph_max_length", 300, "Maximum length of a context paragraph.") 29 | tf.app.flags.DEFINE_string("data_dir", "data/squad", "SQuAD directory (default ./data/squad)") 30 | tf.app.flags.DEFINE_string("train_dir", "train", "Training directory to save the model parameters (default: ./train), it will use the model name as a second directory.") 31 | tf.app.flags.DEFINE_string("load_train_dir", "", "Training directory to load model parameters from to resume training (default: {train_dir}).") 32 | tf.app.flags.DEFINE_string("log_dir", "log", "Path to store log and flag files (default: ./log)") 33 | tf.app.flags.DEFINE_string("optimizer", "adam", "adam / sgd") 34 | tf.app.flags.DEFINE_integer("print_every", 100, "How many iterations to do per print.") # want to print every 1000 examples (so approx 100 times per epoch) 35 | tf.app.flags.DEFINE_integer("keep", 0, "How many checkpoints to keep, 0 indicates keep all.") 36 | tf.app.flags.DEFINE_string("vocab_path", "data/squad/vocab.dat", "Path to vocab file (default: ./data/squad/vocab.dat)") 37 | tf.app.flags.DEFINE_string("embed_path", "", "Path to the trimmed GLoVe embedding (default: ./data/squad/glove.trimmed.{embedding_size}.npz)") 38 | tf.app.flags.DEFINE_boolean("debug", False, "Are we debugging?") 39 | tf.app.flags.DEFINE_integer("debug_training_size", 100, "A smaller training size for debugging, so that epochs are quick, and we can test logging etc") 40 | tf.app.flags.DEFINE_boolean("log_score", True, "If we want to log f1 and em score in a txt file, alongside the model params in the pa4/train/ directory") 41 | tf.app.flags.DEFINE_string("model_name", "BiDAF", "The model to use, pick from: 'baseline', 'embedding_backprop', 'deep_encoder_2layer', 'deep_encoder_3layer', 'deep_decoder_2layer', 'deep_decoder_3layer', 'QRNNs', 'BiDAF'") 42 | tf.app.flags.DEFINE_string("model_version", "_3", "Make this '' for initial model, if we ever want to retrain a model, then we can use this (with '_i') to not overwrite the original data") 43 | 
tf.app.flags.DEFINE_boolean("clip_norms", True, "Do we wish to clip norms?") 44 | tf.app.flags.DEFINE_string("train_prefix", "train", "Prefix of all the training data files") 45 | tf.app.flags.DEFINE_string("val_prefix", "val", "Prefix of all the validation data files") 46 | tf.app.flags.DEFINE_integer("epoch_base", 10, "The first epoch, so that we are saving the correct model and outputting the correct numbers if we restarted") 47 | 48 | 49 | FLAGS = tf.app.flags.FLAGS 50 | 51 | 52 | def initialize_model(session, model, train_dir): 53 | ckpt = tf.train.get_checkpoint_state(train_dir) 54 | v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else "" 55 | if ckpt and (tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)): 56 | logging.info("Reading model parameters from %s" % ckpt.model_checkpoint_path) 57 | model.saver.restore(session, ckpt.model_checkpoint_path) 58 | else: 59 | logging.info("Created model with fresh parameters.") 60 | session.run(tf.global_variables_initializer()) 61 | logging.info('Num params: %d' % sum(v.get_shape().num_elements() for v in tf.trainable_variables())) 62 | return model 63 | 64 | 65 | def initialize_vocab(vocab_path): 66 | if tf.gfile.Exists(vocab_path): 67 | rev_vocab = [] 68 | with tf.gfile.GFile(vocab_path, mode="rb") as f: 69 | rev_vocab.extend(f.readlines()) 70 | rev_vocab = [line.strip('\n') for line in rev_vocab] 71 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 72 | return vocab, rev_vocab 73 | else: 74 | raise ValueError("Vocabulary file %s not found.", vocab_path) 75 | 76 | 77 | def get_normalized_train_dir(train_dir): 78 | """ 79 | Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the 80 | file paths saved in the checkpoint. This allows the model to be reloaded even 81 | if the location of the checkpoint files has moved, allowing usage with CodaLab. 82 | This must be done on both train.py and qa_answer.py in order to work. 
83 | """ 84 | global_train_dir = '/tmp/cs224n-squad-train' 85 | if os.path.exists(global_train_dir): 86 | os.unlink(global_train_dir) 87 | if not os.path.exists(train_dir): 88 | os.makedirs(train_dir) 89 | os.symlink(os.path.abspath(train_dir), global_train_dir) 90 | return global_train_dir 91 | 92 | 93 | def main(_): 94 | 95 | # Work out flags that decide the architecture we're going to use (defaults for the baseline) 96 | backprop_word_embeddings = False 97 | encoder_layers = 1 98 | decoder_layers = 1 99 | 100 | if FLAGS.model_name == "embedding_backprop": 101 | backprop_word_embeddings = True 102 | elif FLAGS.model_name == "deep_encoder_2layer": 103 | backprop_word_embeddings = True # false if that did better 104 | encoder_layers = 2 105 | elif FLAGS.model_name == "deep_encoder_3layer": 106 | backprop_word_embeddings = True # false if that did better 107 | encoder_layers = 3 108 | elif FLAGS.model_name == "deep_decoder_2layer": 109 | backprop_word_embeddings = True # false if that did better 110 | encoder_layers = 2 # 1, 2 if one of them did better 111 | decoder_layers = 2 112 | elif FLAGS.model_name == "deep_decoder_3layer": 113 | backprop_word_embeddings = True # false if that did better 114 | encoder_layers = 3 # 1, 2 if one of them did better 115 | decoder_layers = 3 116 | elif FLAGS.model_name == "BiDAF": 117 | # do nothing 118 | pass 119 | elif not (FLAGS.model_name == "baseline"): 120 | raise Exception("Invalid model name selected") 121 | 122 | # Do what you need to load datasets from FLAGS.data_dir 123 | #train_dataset = load_dataset(FLAGS.data_dir, "train") 124 | #val_dataset = load_dataset(FLAGS.data_dir, "val") 125 | train_dataset_address = FLAGS.data_dir + "/" + FLAGS.train_prefix 126 | val_dataset_address = FLAGS.data_dir + "/" + FLAGS.val_prefix 127 | 128 | embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size)) 129 | vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat") 130 | vocab, rev_vocab = initialize_vocab(vocab_path) # vocab = {words : indices}, rev_vocab = [words] (where each word is at index as dictated by vocab) 131 | 132 | # load in the embeddings 133 | embeddings = np.load(embed_path)['glove'] 134 | 135 | if FLAGS.model_name == "BiDAF": 136 | encoder = BidafEncoder(FLAGS.state_size, FLAGS.embedding_size) 137 | decoder = BidafDecoder(FLAGS.state_size) 138 | qa = BidafQASystem(encoder, decoder, embeddings, backprop_word_embeddings) 139 | else: 140 | encoder = Encoder(FLAGS.state_size, FLAGS.embedding_size, FLAGS.dropout, encoder_layers, FLAGS.model_name) 141 | decoder = Decoder(FLAGS.state_size, FLAGS.dropout, decoder_layers) 142 | qa = QASystem(encoder, decoder, embeddings, backprop_word_embeddings) 143 | 144 | if not os.path.exists(FLAGS.log_dir): 145 | os.makedirs(FLAGS.log_dir) 146 | file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt")) 147 | logging.getLogger().addHandler(file_handler) 148 | 149 | print(vars(FLAGS)) 150 | with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout: 151 | json.dump(FLAGS.__flags, fout) 152 | 153 | with tf.Session() as sess: 154 | load_train_dir = get_normalized_train_dir(FLAGS.load_train_dir or FLAGS.train_dir) 155 | load_train_dir += "/" + FLAGS.model_name + FLAGS.model_version # each model gets its own subdirectory 156 | initialize_model(sess, qa, load_train_dir) 157 | 158 | save_train_dir = get_normalized_train_dir(FLAGS.train_dir) 159 | save_train_dir += "/" + FLAGS.model_name + FLAGS.model_version # each model gets its own subdirectory
160 | if not os.path.exists(save_train_dir): 161 | os.makedirs(save_train_dir) 162 | create_train_dir = (save_train_dir) 163 | qa.train(sess, train_dataset_address, val_dataset_address, save_train_dir) 164 | 165 | 166 | if __name__ == "__main__": 167 | tf.app.run() 168 | -------------------------------------------------------------------------------- /Experiment 2/code/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import random 4 | from functools import reduce 5 | from operator import mul 6 | 7 | """ 8 | def minibatch(data, minibatch_idx): 9 | return data[minibatch_idx] if type(data) is np.ndarray else [data[i] for i in minibatch_idx] 10 | 11 | def get_minibatches(data, minibatch_size, shuffle=True): 12 | list_data = type(data) is list and (type(data[0]) is list or type(data[0]) is np.ndarray) 13 | data_size = len(data[0]) if list_data else len(data) 14 | indices = np.arange(data_size) 15 | if shuffle: 16 | np.random.shuffle(indices) 17 | for minibatch_start in np.arange(0, data_size, minibatch_size): 18 | minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size] 19 | yield [minibatch(d, minibatch_indices) for d in data] if list_data \ 20 | else minibatch(data, minibatch_indices) 21 | """ 22 | 23 | 24 | # flattens tensor of shape [..., d] into [x, d] where x is the product of ... 25 | def flatten(tensor): 26 | fixed_shape = tensor.get_shape().as_list() 27 | tensor_rank = len(fixed_shape) 28 | flatten_dim = reduce(mul, [fixed_shape[i] or tf.shape(tensor)[i] for i in range(1,tensor_rank)]) 29 | minibatch_size = fixed_shape[0] or tf.shape(tensor)[0] 30 | out_shape = [minibatch_size, flatten_dim] 31 | #out_shape = [left] + [fixed_shape[i] or tf.shape(tenor)[i] for i in range(start, len(fixed_shape))] 32 | flat = tf.reshape(tensor, out_shape) 33 | return flat 34 | #def flatten(tensor): 35 | # return tf.reshape(tensor, [-1, tensor.get_shape().as_list()[-1]]) 36 | 37 | 38 | # reshapes tensor into ref's shape 39 | def reconstruct(tensor, ref): 40 | shape = ref.get_shape().as_list() # will be none in the minibatch dim 41 | shape[0] = -1 # want -1 rather than none, so it fills correctly 42 | reshaped = tf.reshape(tensor, shape) 43 | return reshaped 44 | """ 45 | ref_shape = ref.get_shape().as_list() 46 | tensor_shape = tensor.get_shape().as_list() 47 | ref_stop = len(ref_shape) - 1 48 | tensor_start = len(tensor_shape) - 1 49 | pre_shape = [ref_shape[i] or tf.shape(ref)[i] for i in range(ref_stop)] 50 | keep_shape = [tensor_shape[i] or tf.shape(tensor)[i] for i in range(tensor_start, len(tensor_shape))] 51 | # pre_shape = [tf.shape(ref)[i] for i in range(len(ref.get_shape().as_list()[:-keep]))] 52 | # keep_shape = tensor.get_shape().as_list()[-keep:] 53 | target_shape = pre_shape + keep_shape 54 | out = tf.reshape(tensor, target_shape) 55 | if tensor.get_shape().as_list()[-1] == 1: 56 | out = tf.squeeze(out, [len(ref.get_shape().as_list())-1]) 57 | return out 58 | #def reconstruct(tensor, ref): 59 | # return tf.reshape(tensor, ref.get_shape().as_list()) 60 | """ 61 | 62 | 63 | # first flattens logits, takes softmax, and reshapes it back 64 | def softmax(logits): 65 | with tf.name_scope("Softmax"): 66 | flat_logits = flatten(logits) 67 | #logits_shape = logits.get_shape().as_list() 68 | #flat_dim = np.prod(logits_shape[1:]) 69 | #flat_logits = tf.reshape(logits, [-1, flat_dim]) 70 | flat_out = tf.nn.softmax(flat_logits) 71 | out = reconstruct(flat_out, logits) 72 | #logits_shape[0] = 
-1 # would be None when read shape, BUT, need as -1 when inputting into reshape 73 | #out = tf.reshape(flat_logits, logits_shape) 74 | return out 75 | 76 | 77 | 78 | # creates U^tilde and h^tilde in c2q and q2c attention (and later in decode) 79 | def softsel(target, logits): 80 | with tf.name_scope("Softsel"): 81 | a = softmax(logits) 82 | target_rank = len(target.get_shape().as_list()) 83 | out = tf.reduce_sum(tf.expand_dims(a, -1) * target, target_rank - 2) 84 | return out 85 | 86 | 87 | def unlistify(llist): 88 | return [item for sublist in llist for item in sublist] 89 | 90 | 91 | 92 | def get_minibatches(dataset_address, max_context_length, minibatch_size = 100): 93 | data_size = sum(1 for line in open(dataset_address + ".span")) 94 | question_id_file = open(dataset_address + ".ids.question", 'r') 95 | context_id_file = open(dataset_address + ".ids.context", 'r') 96 | answer_file = open(dataset_address + ".span", 'r') 97 | file_eof = False 98 | 99 | while not file_eof: 100 | 101 | questions = [] 102 | contexts = [] 103 | start_answers = [] 104 | end_answers = [] 105 | 106 | minibatch_counter = 0 107 | while minibatch_counter < minibatch_size: 108 | question_line = question_id_file.readline() 109 | context_line = context_id_file.readline() 110 | answer_line = answer_file.readline() 111 | 112 | if (question_line == ""): # EOF 113 | file_eof = True 114 | break 115 | 116 | context = map(int, context_line.split(" ")) 117 | question = map(int, question_line.split(" ")) 118 | answer = map(int, answer_line.split(" ")) 119 | 120 | if answer[0] >= max_context_length or answer[1] >= max_context_length: 121 | continue; 122 | 123 | contexts.append(context) 124 | questions.append(question) 125 | start_answers.append([answer[0]]) 126 | end_answers.append([answer[1]]) 127 | minibatch_counter += 1 128 | 129 | minibatch = [questions, contexts, start_answers, end_answers] 130 | yield minibatch # yield, return as a generator 131 | 132 | question_id_file.close() 133 | context_id_file.close() 134 | answer_file.close() 135 | 136 | 137 | 138 | def get_sample(dataset_address, max_context_length, sample_size): 139 | 140 | data_size = sum(1 for line in open(dataset_address + ".span")) 141 | 142 | #question_id_file = open(dataset_address + ".ids.question", 'r') 143 | context_id_file = open(dataset_address + ".ids.context", 'r') 144 | answer_file = open(dataset_address + ".span", 'r') 145 | 146 | indices = set(random.sample(range(0, data_size), sample_size)) 147 | max_index = max(indices) 148 | 149 | contexts = [] 150 | questions = [] 151 | start_answers = [] 152 | end_answers = [] 153 | num_samples = sample_size 154 | line_number = 0 155 | for question_line in open(dataset_address + ".ids.question", 'r'): 156 | if line_number > max_index: 157 | break 158 | context_line = context_id_file.readline() 159 | answer_line = answer_file.readline() 160 | if line_number in indices: 161 | # This is one of the selected rows, add it to the minibatch 162 | question = map(int, question_line.split(" ")) 163 | context = map(int, context_line.split(" ")) 164 | answer = map(int, answer_line.split(" ")) 165 | if answer[0] >= max_context_length or answer[1] >= max_context_length: 166 | num_samples -= 1 167 | line_number += 1 168 | continue 169 | questions.append(question) 170 | contexts.append(context) 171 | start_answers.append([answer[0]]) 172 | end_answers.append([answer[1]]) 173 | line_number += 1 174 | 175 | context_id_file.close() 176 | answer_file.close() 177 | return [[questions, contexts, start_answers, end_answers], 
num_samples] 178 | -------------------------------------------------------------------------------- /Experiment 2/code/util.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/code/util.pyc -------------------------------------------------------------------------------- /Experiment 2/train/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/train/.DS_Store -------------------------------------------------------------------------------- /Experiment 2/train/BiDAF_3/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imraviagrawal/ReadingComprehension/b1eb0f4a20a6a73bf07cabd48a1aeae01639830a/Experiment 2/train/BiDAF_3/.DS_Store -------------------------------------------------------------------------------- /Experiment 2/train/BiDAF_3/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "/tmp/cs224n-squad-train/BiDAF_3/model_params-19" 2 | all_model_checkpoint_paths: "/tmp/cs224n-squad-train/BiDAF_3/model_params-15" 3 | all_model_checkpoint_paths: "/tmp/cs224n-squad-train/BiDAF_3/model_params-16" 4 | all_model_checkpoint_paths: "/tmp/cs224n-squad-train/BiDAF_3/model_params-17" 5 | all_model_checkpoint_paths: "/tmp/cs224n-squad-train/BiDAF_3/model_params-18" 6 | all_model_checkpoint_paths: "/tmp/cs224n-squad-train/BiDAF_3/model_params-19" 7 | -------------------------------------------------------------------------------- /Experiment 2/train/BiDAF_3/training_scores.txt: -------------------------------------------------------------------------------- 1 | Epoch F1_score EM_score Epoch_time 2 | 10 0.000966578448812 0.0 4.32213401794 3 | 11 0.00455225513561 0.0 1.94104599953 4 | 12 0.00616307383967 0.0 1.99941396713 5 | 13 0.00469032113609 0.0 1.93077802658 6 | 14 0.0140761704937 0.01 2.00956082344 7 | 15 0.00899079448681 0.0 1.95876812935 8 | 16 0.0437649334719 0.01 1.99772787094 9 | 17 0.0241929613664 0.0 1.87301015854 10 | 18 0.0462353292338 0.0 1.96066689491 11 | 19 0.0383702134189 0.01 1.92262411118 12 | Epoch F1_score EM_score Epoch_time 13 | Epoch F1_score EM_score Epoch_time 14 | 10 0.237215744825 0.16 9659.51843214 15 | 11 0.274352162172 0.19 9587.09242487 16 | 12 0.354770651145 0.22 9817.67866206 17 | 13 0.471919641034 0.33 9797.47593784 18 | 14 0.558787656547 0.34 9843.74051499 19 | 15 0.605434664746 0.44 9844.51916909 20 | 16 0.701083487537 0.57 9838.91687298 21 | 17 0.671957740261 0.48 9204.16600084 22 | 18 0.689730527836 0.55 9076.31184506 23 | 19 0.697153066437 0.5 8950.21927881 24 | -------------------------------------------------------------------------------- /Experiment 2/train/BiDAF_3/validation_scores.txt: -------------------------------------------------------------------------------- 1 | Epoch F1_score EM_score Epoch_time 2 | 10 0.0033690960661 0.0 4.32213401794 3 | 11 0.00714507864857 0.0 1.94104599953 4 | 12 0.00169245402663 0.0 1.99941396713 5 | 13 0.00375934817091 0.0 1.93077802658 6 | 14 0.000933670555601 0.0 2.00956082344 7 | 15 0.00237128465413 0.0 1.95876812935 8 | 16 0.0261467874739 0.0 1.99772787094 9 | 17 0.0196757443476 0.0 1.87301015854 10 | 18 0.0319630511871 0.01 1.96066689491 11 | 19 0.00469146436639 0.0 
1.92262411118 12 | Epoch F1_score EM_score Epoch_time 13 | Epoch F1_score EM_score Epoch_time 14 | 10 0.161244987661 0.1 9659.51843214 15 | 11 0.318810912807 0.19 9587.09242487 16 | 12 0.312387225905 0.18 9817.67866206 17 | 13 0.37985440533 0.22 9797.47593784 18 | 14 0.444408922936 0.29 9843.74051499 19 | 15 0.618744752492 0.47 9844.51916909 20 | 16 0.583562298233 0.42 9838.91687298 21 | 17 0.544788214554 0.38 9204.16600084 22 | 18 0.650020883977 0.4 9076.31184506 23 | 19 0.612790585822 0.44 8950.21927881 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reading Comprehension 2 | 3 | Context paragraph: On 24 March 1879, Tesla was returned to Gospic under police guard for 4 | not having a residence permit. On 17 April 1879, Milutin Tesla died at the age of 60 after 5 | contracting an unspecified illness (although some sources say that he died of a stroke). During 6 | that year, Tesla taught a large class of students in his old school, Higher Real Gymnasium, in 7 | Gospic. 8 | 9 | Question: Why was Tesla returned to Gospic? 10 | 11 | Answer: not having a residence permit 12 | 13 | 14 | The project has several dependencies that have to be satisfied before running the code. You can install them using your preferred method -- the package names for `pip` are listed in `requirements.txt`. 15 | 16 | # Requirements 17 | 18 | The starter code provided presupposes a working installation of Python 2.7, as well as TensorFlow 0.12.1. 19 | 20 | All remaining dependencies can be installed through 21 | `pip install -r requirements.txt` (the `get_started.sh` script below runs this automatically). 22 | 23 | # Running 24 | 25 | The word embedding and preprocessing scripts are taken from the Stanford CS224N class, though we will be modelling and training different neural network models. 26 | 27 | You can get started by downloading the datasets and doing some basic preprocessing: 28 | 29 | $ code/get_started.sh 30 | 31 | Note that you will always want to run your code from this assignment directory, not the code directory, like so: 32 | 33 | $ python code/train.py 34 | 35 | This ensures that any files created in the process don't pollute the code directory. 36 | 37 | # Dataset 38 | After the download, the SQuAD dataset is placed in the data/squad folder. The downloaded 39 | files include train and dev sets in JSON format: 40 | • train-v1.1.json: a training dataset with around 87k triplets. 41 | • dev-v1.1.json: a dev dataset with around 10k triplets. 42 | 43 | Note that there is no test dataset publicly available: it is kept by the authors of SQuAD to ensure fairness in model evaluations. While developing the model, we will treat the dev set as our test set for all purposes, i.e., we won’t be using the dev set until after initial model development. Instead, we split the supplied train dataset into two parts: a 95% slice for training, and the remaining 5% for validation purposes, including hyperparameter search. We refer to these as train.* and val.* in filenames. 44 | 45 | --------------------------------------------------------------------------------
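# Evaluation

The official SQuAD evaluation script ships with the project at `Experiment 2/code/evaluate.py`; it takes the dataset JSON and a prediction file (a JSON object mapping question ids to answer strings) and prints exact-match and F1 scores. The file names below are only illustrative -- they assume the dev set has been downloaded and that predictions have been written to a `predictions.json` file (for example by `code/qa_answer.py`):

    $ python code/evaluate.py dev-v1.1.json predictions.json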