├── License.txt ├── README.md ├── bidaf ├── README.md ├── basic │ ├── __init__.py │ ├── cli.py │ ├── combiner.py │ ├── compare.py │ ├── ensemble.py │ ├── ensemble_fast.py │ ├── evaluator.py │ ├── graph_handler.py │ ├── main.py │ ├── model.py │ ├── read_data.py │ ├── run_ensemble.sh │ ├── run_single.sh │ ├── superhighway.py │ ├── templates │ │ └── visualizer.html │ ├── trainer.py │ └── visualizer.py ├── basic_cnn │ ├── __init__.py │ ├── cli.py │ ├── evaluator.py │ ├── graph_handler.py │ ├── main.py │ ├── model.py │ ├── read_data.py │ ├── superhighway.py │ ├── templates │ │ └── visualizer.html │ ├── trainer.py │ └── visualizer.py ├── cnn_dm │ ├── __init__.py │ ├── eda.ipynb │ ├── evaluate.py │ └── prepro.py ├── data │ └── squad │ │ ├── data_dev.json │ │ ├── data_test.json │ │ ├── data_train.json │ │ ├── shared_dev.json │ │ ├── shared_test.json │ │ └── shared_train.json ├── download.sh ├── helpers │ ├── __init__.py │ ├── constants.py │ ├── file_logger.py │ ├── math_utils.py │ ├── spacy_tokenizer.py │ └── utils.py ├── install_tensorflow.sh ├── my │ ├── __init__.py │ ├── corenlp_interface.py │ ├── nltk_utils.py │ ├── tensorflow │ │ ├── __init__.py │ │ ├── general.py │ │ ├── nn.py │ │ ├── rnn.py │ │ └── rnn_cell.py │ ├── utils.py │ └── zip_save.py ├── newsqa │ ├── __init__.py │ ├── evaluate.py │ └── prepro.py ├── newsqa_unsupervised_old │ └── data_train.json ├── newsqa_unsupervised_old_verb_filtered │ └── data_train.json ├── out │ └── basic │ │ └── 06 │ │ ├── save │ │ ├── basic-40000.data-00000-of-00001 │ │ ├── basic-40000.index │ │ ├── basic-40000.meta │ │ └── checkpoint │ │ └── shared.json ├── requirements.txt ├── run.sh ├── scripts.sh ├── scripts │ ├── compare_models.sh │ ├── evaluate_baseline_models.sh │ ├── evaluate_run.sh │ ├── finetune_newsqa.sh │ ├── finetune_squad.sh │ ├── install_tensorflow.sh │ ├── run.sh │ ├── run_ensemble_unsupervised.sh │ ├── run_evaluation.sh │ ├── run_huge_evaluation.sh │ ├── run_intra_evaluation.sh │ ├── run_intra_helper.sh │ └── run_new.sh ├── squad │ ├── __init__.py │ ├── aug_squad.py │ ├── eda_aug_dev.ipynb │ ├── eda_aug_train.ipynb │ ├── evaluate-v1.1.py │ ├── evaluate.py │ ├── prepro.py │ ├── prepro_aug.py │ └── utils.py ├── tests │ ├── __init__.py │ ├── check_results.py │ ├── create_bidaf_dataset.py │ ├── create_bidaf_old_dataset.py │ └── create_generation_dataset_unsupervised.py ├── tree │ ├── __init__.py │ ├── cli.py │ ├── evaluator.py │ ├── graph_handler.py │ ├── main.py │ ├── model.py │ ├── read_data.py │ ├── templates │ │ └── visualizer.html │ ├── test.ipynb │ ├── trainer.py │ └── visualizer.py └── visualization │ ├── compare_models.py │ └── compare_models_newsqa.py ├── data_loaders ├── __init__.py ├── iob_loader.py ├── language_model_loader.py └── language_model_loader_truncate.py ├── datasets ├── iob_test │ ├── label_vocab.txt │ ├── test │ │ ├── inputs.txt │ │ └── labels.txt │ ├── train │ │ ├── inputs.txt │ │ └── labels.txt │ ├── validation │ │ ├── inputs.txt │ │ └── labels.txt │ └── vocab.txt ├── newsqa_unsupervised │ ├── test │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ └── outputs.txt │ ├── train │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ └── outputs.txt │ ├── validation │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ └── outputs.txt │ ├── vocab.txt │ └── word_embeddings.npy ├── newsqa_unsupervised_large │ ├── test │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ └── 
outputs.txt │ ├── train │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ └── outputs.txt │ ├── validation │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ └── outputs.txt │ └── vocab.txt ├── newsqa_unsupervised_old │ ├── test │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ └── outputs.txt │ ├── train │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ ├── outputs.txt │ │ └── predictions.txt │ ├── validation │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ └── outputs.txt │ └── vocab.txt ├── question_generator │ ├── test │ │ ├── indices.txt │ │ ├── inputs.txt │ │ └── outputs.txt │ ├── train │ │ ├── indices.txt │ │ ├── inputs.txt │ │ └── outputs.txt │ ├── validation │ │ ├── indices.txt │ │ ├── inputs.txt │ │ └── outputs.txt │ ├── vocab.txt │ └── word_embeddings.npy ├── squad │ ├── test │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ ├── inputs.txt │ │ └── outputs.txt │ ├── train │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ ├── inputs.txt │ │ └── outputs.txt │ ├── validation │ │ ├── answer_ends.txt │ │ ├── answer_starts.txt │ │ ├── ids.txt │ │ ├── indices.txt │ │ ├── inputs.txt │ │ └── outputs.txt │ ├── vocab.txt │ └── word_embeddings.npy └── squad_iob │ ├── label_vocab.txt │ ├── test │ ├── inputs.txt │ ├── label_vocab.txt │ ├── labels.txt │ └── vocab.txt │ ├── train │ ├── inputs.txt │ └── labels.txt │ ├── validation │ ├── inputs.txt │ ├── label_vocab.txt │ ├── labels.txt │ └── vocab.txt │ └── vocab.txt ├── dnn_units ├── __init__.py └── lstm_attention.py ├── helpers ├── __init__.py ├── constants.py ├── io_utils.py ├── logger.py ├── proc_wrapper.py ├── tokenizer.py ├── torch_utils.py ├── twitter_tokenizer.py ├── utils.py └── vocab.py ├── install.sh ├── iob └── logs │ └── README.md ├── logs └── results │ ├── answer_out_of_domain_baseline.json │ ├── context_aoracle.json │ ├── double_model.json │ ├── newsqa │ ├── data_test.json │ └── evaluate.py │ ├── script.sh │ ├── single_model.json │ ├── single_model_result_run_42_with_baseline.json │ ├── single_model_result_run_43_with_baseline.json │ ├── single_model_result_run_44_with_baseline.json │ ├── single_model_result_run_45_with_baseline.json │ ├── single_model_result_run_46_with_baseline.json │ ├── single_model_result_run_47_with_baseline.json │ ├── single_model_result_run_48_with_baseline.json │ ├── single_model_result_run_49_with_baseline.json │ ├── single_model_results_42.json │ ├── single_model_results_43.json │ ├── single_model_results_44.json │ ├── single_model_results_45.json │ ├── single_model_results_46.json │ ├── single_model_results_47.json │ ├── single_model_results_48.json │ └── single_model_results_49.json ├── models ├── __init__.py ├── iob │ ├── __init__.py │ └── iob_model.py ├── language_model.py ├── language_trainer.py └── language_wrapper.py ├── pretrained_models ├── nltk │ └── tokenizers │ │ ├── punkt.zip │ │ └── punkt │ │ ├── PY3 │ │ ├── README │ │ ├── czech.pickle │ │ ├── danish.pickle │ │ ├── dutch.pickle │ │ ├── english.pickle │ │ ├── estonian.pickle │ │ ├── finnish.pickle │ │ ├── french.pickle │ │ ├── german.pickle │ │ ├── greek.pickle │ │ ├── italian.pickle │ │ ├── norwegian.pickle │ │ ├── polish.pickle │ │ ├── portuguese.pickle │ │ ├── slovene.pickle │ │ ├── spanish.pickle │ │ ├── swedish.pickle │ │ └── turkish.pickle │ │ ├── README │ │ 
├── czech.pickle │ │ ├── danish.pickle │ │ ├── dutch.pickle │ │ ├── english.pickle │ │ ├── estonian.pickle │ │ ├── finnish.pickle │ │ ├── french.pickle │ │ ├── german.pickle │ │ ├── greek.pickle │ │ ├── italian.pickle │ │ ├── norwegian.pickle │ │ ├── polish.pickle │ │ ├── portuguese.pickle │ │ ├── slovene.pickle │ │ ├── spanish.pickle │ │ ├── swedish.pickle │ │ └── turkish.pickle └── scripts │ ├── __init__.py │ ├── create_glove_embeddings.sh │ ├── download_glove_embeddings.py │ └── transfer_glove_embeddings.py ├── requirements.txt ├── scripts.sh ├── tests ├── __init__.py ├── gather_test.py ├── iob_loader_test.py ├── iob_test.py ├── iob_trainer_test.py ├── language_model_memory_test.py ├── language_model_predict_test.py ├── language_model_test.py ├── language_model_trainer_test.py ├── load_questions.py ├── newsqa_predictor_test_unsup.py ├── newsqa_predictor_test_unsup_large.py ├── newsqa_predictor_test_unsup_truncated.py ├── newsqa_predictor_test_verb.py ├── newsqa_trainer_test.py ├── pointer_network_test.py ├── question_discriminator_test.py ├── squad_discriminator_test.py ├── squad_loader_test_v2.py ├── squad_predictor_test.py ├── squad_predictor_truncated_test.py ├── squad_trainer_test.py ├── squad_trainer_truncated_expanded_test.py ├── squad_trainer_truncated_test.py ├── test_expand_dims.py ├── test_load_dataset.py ├── test_lstm_attention.py ├── test_lstm_attention_dot.py ├── test_model.py ├── test_model_saving.py └── test_padded_sequence.py └── trainers ├── __init__.py ├── iob_predictor.py └── iob_trainer.py /License.txt: -------------------------------------------------------------------------------- 1 | NewsQA Code 2 | Copyright (c) Microsoft Corporation 3 | All rights reserved. 4 | MIT License 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /bidaf/README.md: -------------------------------------------------------------------------------- 1 | # Finetuning BiDAF with SynthNets: 2 | 3 | - This repository implements finetuning a [Bi-directional Attention Flow for Machine Comprehension] (Seo et al., 2016) model trained on a source collection of documents to answer questions on a target set of documents using [Two-stage SynthNets]. It assumes a SynthNet already generated question, answer tuples over the desired set. 4 | 5 | ## 0. Requirements 6 | #### General 7 | - Python (verified on 3.5.2. Issues have been reported with Python 2!) 
8 | - unzip, wget (for running `download.sh` only) 9 | 10 | #### Python Packages 11 | - tensorflow (deep learning library, verified on r0.11) 12 | - nltk (NLP tools, verified on 3.2.1) 13 | - tqdm (progress bar, verified on 4.7.4) 14 | - jinja2 (for visualization; not needed if you only train and test) 15 | 16 | ## 1. Downloading Data 17 | Run: 18 | ``` 19 | git lfs pull 20 | ``` 21 | 22 | ## Scripts 23 | All commands used to train and test models are stored under [scripts]. 24 | 25 | Each command inside each script file should be run from the root directory of the repository. 26 | 27 | ## 2. Training 28 | To finetune a pretrained [SQUAD] BIDAF model on [NewsQA], see the scripts at [scripts/finetune_newsqa]. 29 | 30 | To finetune a pretrained [NewsQA] model on [SQUAD], see the scripts at [scripts/finetune_squad]. 31 | 32 | ## 3. Test 33 | To evaluate single models, see the scripts at scripts/evaluate_*.sh. 34 | 35 | To evaluate intra-run averaged models, ensembles, etc., see the scripts at scripts/*_evaluation.sh. 36 | 37 | [Two-stage SynthNets]: https://arxiv.org/TODO 38 | [Bi-directional Attention Flow for Machine Comprehension]: https://github.com/allenai/bi-att-flow 39 | [scripts]: https://github.com/davidgolub/ReadingComprehension/tree/master/scripts 40 | [scripts/finetune_newsqa]: https://github.com/davidgolub/ReadingComprehension/tree/master/scripts/finetune_newsqa.sh 41 | [scripts/finetune_squad]: https://github.com/davidgolub/ReadingComprehension/tree/master/scripts/finetune_squad.sh 42 | [code]: https://github.com/allenai/bi-att-flow 43 | [multi-gpu]: https://www.tensorflow.org/versions/r0.11/tutorials/deep_cnn/index.html#training-a-model-using-multiple-gpu-cards 44 | [SQUAD]: http://stanford-qa.com 45 | [NEWSQA]: https://datasets.maluuba.com/NewsQA 46 | [paper]: https://arxiv.org/abs/1611.01603 47 | [davidgolub]: https://davidgolub.github.io 48 | [davidgolub-github]: https://github.com/davidgolub 49 | -------------------------------------------------------------------------------- /bidaf/basic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/basic/__init__.py -------------------------------------------------------------------------------- /bidaf/basic/combiner.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | third_path = sys.argv[1] 5 | other_paths = sys.argv[2:] 6 | 7 | others = [json.load(open(path, 'r')) for path in other_paths] 8 | 9 | 10 | c = {} 11 | 12 | assert min(map(len, others)) == max(map(len, others)), list(map(len, others)) 13 | 14 | for key in others[0].keys(): 15 | if key == 'scores': 16 | continue 17 | probs = [other['scores'][key] for other in others] 18 | vals = [other[key] for other in others] 19 | largest_val = max(zip(vals, probs), key=lambda pair: pair[1])[0] 20 | c[key] = largest_val 21 | 22 | json.dump(c, open(third_path, 'w')) -------------------------------------------------------------------------------- /bidaf/basic/compare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | first_path = sys.argv[1] 5 | second_path = sys.argv[2] 6 | 7 | a = json.load(open(first_path, 'r')) 8 | b = json.load(open(second_path, 'r')) 9 | 10 | assert len(a) == len(b) 11 | 12 | diff_count = 0 13 | 14 | for key, val in a.items(): 15 | b_val = b[key] 16 | if val != b_val: 17
| print(val, "|||", b_val) 18 | diff_count += 1 19 | 20 | print("{}/{} = {}".format(diff_count, len(a), diff_count/len(a))) -------------------------------------------------------------------------------- /bidaf/basic/ensemble.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import gzip 4 | import json 5 | import pickle 6 | from collections import defaultdict 7 | from operator import mul 8 | 9 | from tqdm import tqdm 10 | from squad.utils import get_phrase, get_best_span 11 | 12 | 13 | def get_args(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('paths', nargs='+') 16 | parser.add_argument('-o', '--out', default='ensemble.json') 17 | parser.add_argument("--data_path", default="data/squad/data_test.json") 18 | parser.add_argument("--shared_path", default="data/squad/shared_test.json") 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | def ensemble(args): 24 | e_list = [] 25 | for path in tqdm(args.paths): 26 | with gzip.open(path, 'r') as fh: 27 | e = pickle.load(fh) 28 | e_list.append(e) 29 | 30 | with open(args.data_path, 'r') as fh: 31 | data = json.load(fh) 32 | 33 | with open(args.shared_path, 'r') as fh: 34 | shared = json.load(fh) 35 | 36 | out = {} 37 | for idx, (id_, rx) in tqdm(enumerate(zip(data['ids'], data['*x'])), total=len(e['yp'])): 38 | if idx >= len(e['yp']): 39 | # for debugging purpose 40 | break 41 | context = shared['p'][rx[0]][rx[1]] 42 | wordss = shared['x'][rx[0]][rx[1]] 43 | yp_list = [e['yp'][idx] for e in e_list] 44 | yp2_list = [e['yp2'][idx] for e in e_list] 45 | answer = ensemble3(context, wordss, yp_list, yp2_list) 46 | out[id_] = answer 47 | 48 | with open(args.out, 'w') as fh: 49 | json.dump(out, fh) 50 | 51 | 52 | def ensemble1(context, wordss, y1_list, y2_list): 53 | """ 54 | 55 | :param context: Original context 56 | :param wordss: tokenized words (nested 2D list) 57 | :param y1_list: list of start index probs (each element corresponds to probs form single model) 58 | :param y2_list: list of stop index probs 59 | :return: 60 | """ 61 | sum_y1 = combine_y_list(y1_list) 62 | sum_y2 = combine_y_list(y2_list) 63 | span, score = get_best_span(sum_y1, sum_y2) 64 | return get_phrase(context, wordss, span) 65 | 66 | 67 | def ensemble2(context, wordss, y1_list, y2_list): 68 | start_dict = defaultdict(float) 69 | stop_dict = defaultdict(float) 70 | for y1, y2 in zip(y1_list, y2_list): 71 | span, score = get_best_span(y1, y2) 72 | start_dict[span[0]] += y1[span[0][0]][span[0][1]] 73 | stop_dict[span[1]] += y2[span[1][0]][span[1][1]] 74 | start = max(start_dict.items(), key=lambda pair: pair[1])[0] 75 | stop = max(stop_dict.items(), key=lambda pair: pair[1])[0] 76 | best_span = (start, stop) 77 | return get_phrase(context, wordss, best_span) 78 | 79 | 80 | def ensemble3(context, wordss, y1_list, y2_list): 81 | d = defaultdict(float) 82 | for y1, y2 in zip(y1_list, y2_list): 83 | span, score = get_best_span(y1, y2) 84 | phrase = get_phrase(context, wordss, span) 85 | d[phrase] += score 86 | return max(d.items(), key=lambda pair: pair[1])[0] 87 | 88 | 89 | def combine_y_list(y_list, op='*'): 90 | if op == '+': 91 | func = sum 92 | elif op == '*': 93 | def func(l): return functools.reduce(mul, l) 94 | else: 95 | func = op 96 | return [[func(yij_list) for yij_list in zip(*yi_list)] for yi_list in zip(*y_list)] 97 | 98 | 99 | def main(): 100 | args = get_args() 101 | ensemble(args) 102 | 103 | if __name__ == "__main__": 104 | main() 105 | 106 | 107 | 
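The ensembling above works by pooling each model's start-index and end-index probability grids before a span is picked. Below is a minimal sketch of what `combine_y_list` produces, using hypothetical toy numbers; it assumes the interpreter runs from the `bidaf/` directory so that `basic.ensemble` (and its `squad.utils` dependency) imports cleanly, with `tqdm` installed.

```
# Toy illustration of how ensemble.py pools per-model probabilities.
# Hypothetical numbers; assumes bidaf/ is the working directory so the import resolves.
from basic.ensemble import combine_y_list

# y[model][sentence][token]: two models, one "sentence" of four tokens each.
y_model_a = [[0.1, 0.6, 0.2, 0.1]]
y_model_b = [[0.2, 0.5, 0.2, 0.1]]

product = combine_y_list([y_model_a, y_model_b])        # default op='*'
summed = combine_y_list([y_model_a, y_model_b], op='+')

# Up to floating-point rounding:
print(product)  # [[0.02, 0.3, 0.04, 0.01]]  elementwise product across models
print(summed)   # [[0.3, 1.1, 0.4, 0.2]]     elementwise sum across models
```

`ensemble1` feeds such combined grids to `get_best_span`, while `ensemble3` (the variant `ensemble()` actually calls) instead takes each model's best phrase, sums the scores per phrase string, and returns the highest-scoring phrase.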
-------------------------------------------------------------------------------- /bidaf/basic/ensemble_fast.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from collections import Counter, defaultdict 4 | import re 5 | 6 | def key_func(pair): 7 | return pair[1] 8 | 9 | 10 | def get_func(vals, probs): 11 | counter = Counter(vals) 12 | # return max(zip(vals, probs), key=lambda pair: pair[1])[0] 13 | # return max(zip(vals, probs), key=lambda pair: pair[1] * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0] 14 | # return max(zip(vals, probs), key=lambda pair: pair[1] + 0.7 * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0] 15 | d = defaultdict(float) 16 | for val, prob in zip(vals, probs): 17 | d[val] += prob 18 | d[''] = 0 19 | return max(d.items(), key=lambda pair: pair[1])[0] 20 | 21 | third_path = sys.argv[1] 22 | other_paths = sys.argv[2:] 23 | 24 | others = [json.load(open(path, 'r')) for path in other_paths] 25 | 26 | 27 | c = {} 28 | 29 | assert min(map(len, others)) == max(map(len, others)), list(map(len, others)) 30 | 31 | for key in others[0].keys(): 32 | if key == 'scores': 33 | continue 34 | probs = [other['scores'][key] for other in others] 35 | vals = [other[key] for other in others] 36 | largest_val = get_func(vals, probs) 37 | c[key] = largest_val 38 | 39 | json.dump(c, open(third_path, 'w')) -------------------------------------------------------------------------------- /bidaf/basic/graph_handler.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from json import encoder 4 | import os 5 | 6 | import tensorflow as tf 7 | 8 | from basic.evaluator import Evaluation, F1Evaluation 9 | from my.utils import short_floats 10 | 11 | import pickle 12 | 13 | 14 | class GraphHandler(object): 15 | def __init__(self, config): 16 | self.config = config 17 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep) 18 | self.writer = None 19 | self.save_path = os.path.join(config.save_dir, config.model_name) 20 | 21 | def initialize(self, sess): 22 | if self.config.load: 23 | self._load(sess) 24 | else: 25 | sess.run(tf.initialize_all_variables()) 26 | 27 | if self.config.mode == 'train': 28 | self.writer = tf.train.SummaryWriter(self.config.log_dir, graph=tf.get_default_graph()) 29 | 30 | def save(self, sess, global_step=None): 31 | self.saver.save(sess, self.save_path, global_step=global_step) 32 | 33 | def _load(self, sess): 34 | config = self.config 35 | if config.load_path: 36 | save_path = config.load_path 37 | elif config.load_step > 0: 38 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step)) 39 | else: 40 | save_dir = config.save_dir 41 | checkpoint = tf.train.get_checkpoint_state(save_dir) 42 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir) 43 | save_path = checkpoint.model_checkpoint_path 44 | print("Loading saved model from {}".format(save_path)) 45 | self.saver.restore(sess, save_path) 46 | 47 | def add_summary(self, summary, global_step): 48 | self.writer.add_summary(summary, global_step) 49 | 50 | def add_summaries(self, summaries, global_step): 51 | for summary in summaries: 52 | self.add_summary(summary, global_step) 53 | 54 | def dump_eval(self, e, precision=2, path=None): 55 | assert isinstance(e, Evaluation) 56 | if self.config.dump_pickle: 57 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, 
str(e.global_step).zfill(6))) 58 | with gzip.open(path, 'wb', compresslevel=3) as fh: 59 | pickle.dump(e.dict, fh) 60 | else: 61 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))) 62 | with open(path, 'w') as fh: 63 | json.dump(short_floats(e.dict, precision), fh) 64 | 65 | def dump_answer(self, e, path=None): 66 | assert isinstance(e, Evaluation) 67 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))) 68 | with open(path, 'w') as fh: 69 | json.dump(e.id2answer_dict, fh) 70 | 71 | -------------------------------------------------------------------------------- /bidaf/basic/run_ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source_path=$1 3 | target_path=$2 4 | inter_dir="inter_ensemble" 5 | root_dir="save" 6 | 7 | parg="" 8 | marg="" 9 | if [ "$3" = "debug" ] 10 | then 11 | parg="-d" 12 | marg="--debug" 13 | fi 14 | 15 | # Preprocess data 16 | python3.5 -m newsqa.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir . 17 | 18 | eargs="" 19 | for num in 31 33 34 35 36 37 40; do 20 | load_path="$root_dir/$num/save" 21 | shared_path="$root_dir/$num/shared.json" 22 | eval_path="$inter_dir/eval-$num.json" 23 | eargs="$eargs $eval_path" 24 | python3.5 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema & 25 | done 26 | wait 27 | 28 | # Ensemble 29 | python3.5 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eargs 30 | -------------------------------------------------------------------------------- /bidaf/basic/run_single.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source_path=$1 3 | target_path=$2 4 | inter_dir="inter_single" 5 | root_dir="save" 6 | 7 | parg="" 8 | marg="" 9 | if [ "$3" = "debug" ] 10 | then 11 | parg="-d" 12 | marg="--debug" 13 | fi 14 | 15 | # Preprocess data 16 | python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir . 
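# Note: the remaining steps run pretrained model $num in forward mode over the preprocessed data to
# produce $eval_path, then reuse basic.ensemble on that single eval file to write the answers to $target_path.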
17 | 18 | num=37 19 | load_path="$root_dir/$num/save" 20 | shared_path="$root_dir/$num/shared.json" 21 | eval_path="$inter_dir/eval.json" 22 | python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema 23 | 24 | # Ensemble (for single run, just one input) 25 | python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eval_path 26 | 27 | 28 | -------------------------------------------------------------------------------- /bidaf/basic/superhighway.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops.rnn_cell import RNNCell 3 | 4 | from my.tensorflow.nn import linear 5 | 6 | 7 | class SHCell(RNNCell): 8 | """ 9 | Super-Highway Cell 10 | """ 11 | def __init__(self, input_size, logit_func='tri_linear', scalar=True, bias=3.0): 12 | self._state_size = input_size 13 | self._output_size = input_size 14 | self._logit_func = logit_func 15 | self._scalar = scalar 16 | self._bias = bias 17 | 18 | @property 19 | def state_size(self): 20 | return self._state_size 21 | 22 | @property 23 | def output_size(self): 24 | return self._output_size 25 | 26 | def __call__(self, inputs, state, scope=None): 27 | with tf.variable_scope(scope or "SHCell"): 28 | a_size = 1 if self._scalar else self._state_size 29 | h, u = tf.split(1, 2, inputs) 30 | if self._logit_func == 'mul_linear': 31 | args = [h * u] 32 | a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a')) 33 | r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r')) 34 | elif self._logit_func == 'linear': 35 | args = [h, u] 36 | a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a')) 37 | r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r')) 38 | elif self._logit_func == 'tri_linear': 39 | args = [h, u, h * u] 40 | a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a')) 41 | r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r')) 42 | elif self._logit_func == 'double': 43 | args = [h, u] 44 | a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True, bias_start=self._bias)) 45 | r = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True, bias_start=self._bias)) 46 | 47 | else: 48 | raise Exception() 49 | new_state = a * state + r * (1 - a) * h 50 | outputs = state 51 | return outputs, new_state 52 | 53 | -------------------------------------------------------------------------------- /bidaf/basic/templates/visualizer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 7 | 8 | 19 | 20 | 23 | 24 |

{{ title }}

25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | {% for row in rows %} 35 | 36 | 37 | 42 | 47 | 48 | 49 | 72 | 73 | {% endfor %} 74 |
IDQuestionAnswersPredictedScoreParagraph
{{ row.id }} 38 | {% for qj in row.ques %} 39 | {{ qj }} 40 | {% endfor %} 41 | 43 | {% for aa in row.a %} 44 |
  • {{ aa }}
  • 45 | {% endfor %} 46 |
    {{ row.ap }}{{ row.score }} 50 | 51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %} 52 | 53 | {% set rowloop = loop %} 54 | {% for xjk, ypjk in zip(xj, ypj) %} 55 | 62 | {% endfor %} 63 | 64 | 65 | {% for xjk, yp2jk in zip(xj, yp2j) %} 66 | 67 | {% endfor %} 68 | 69 | {% endfor %} 70 |
    56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %} 57 | {{ xjk }} 58 | {% else %} 59 | {{ xjk }} 60 | {% endif %} 61 |
    -
    71 |
    75 | 76 | -------------------------------------------------------------------------------- /bidaf/basic_cnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/basic_cnn/__init__.py -------------------------------------------------------------------------------- /bidaf/basic_cnn/graph_handler.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from json import encoder 4 | import os 5 | 6 | import tensorflow as tf 7 | 8 | from basic_cnn.evaluator import Evaluation, F1Evaluation 9 | from my.utils import short_floats 10 | 11 | import pickle 12 | 13 | 14 | class GraphHandler(object): 15 | def __init__(self, config): 16 | self.config = config 17 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep) 18 | self.writer = None 19 | self.save_path = os.path.join(config.save_dir, config.model_name) 20 | 21 | def initialize(self, sess): 22 | if self.config.load: 23 | self._load(sess) 24 | else: 25 | sess.run(tf.initialize_all_variables()) 26 | 27 | if self.config.mode == 'train': 28 | self.writer = tf.train.SummaryWriter(self.config.log_dir, graph=tf.get_default_graph()) 29 | 30 | def save(self, sess, global_step=None): 31 | self.saver.save(sess, self.save_path, global_step=global_step) 32 | 33 | def _load(self, sess): 34 | config = self.config 35 | if config.load_path: 36 | save_path = config.load_path 37 | elif config.load_step > 0: 38 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step)) 39 | else: 40 | save_dir = config.save_dir 41 | checkpoint = tf.train.get_checkpoint_state(save_dir) 42 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir) 43 | save_path = checkpoint.model_checkpoint_path 44 | print("Loading saved model from {}".format(save_path)) 45 | self.saver.restore(sess, save_path) 46 | 47 | def add_summary(self, summary, global_step): 48 | self.writer.add_summary(summary, global_step) 49 | 50 | def add_summaries(self, summaries, global_step): 51 | for summary in summaries: 52 | self.add_summary(summary, global_step) 53 | 54 | def dump_eval(self, e, precision=2, path=None): 55 | assert isinstance(e, Evaluation) 56 | if self.config.dump_pickle: 57 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6))) 58 | with gzip.open(path, 'wb', compresslevel=3) as fh: 59 | pickle.dump(e.dict, fh) 60 | else: 61 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))) 62 | with open(path, 'w') as fh: 63 | json.dump(short_floats(e.dict, precision), fh) 64 | 65 | def dump_answer(self, e, path=None): 66 | assert isinstance(e, Evaluation) 67 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))) 68 | with open(path, 'w') as fh: 69 | json.dump(e.id2answer_dict, fh) 70 | 71 | -------------------------------------------------------------------------------- /bidaf/basic_cnn/superhighway.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops.rnn_cell import RNNCell 3 | 4 | from my.tensorflow.nn import linear 5 | 6 | 7 | class SHCell(RNNCell): 8 | """ 9 | Super-Highway Cell 10 | """ 11 | def __init__(self, input_size, logit_func='tri_linear', 
scalar=False): 12 | self._state_size = input_size 13 | self._output_size = input_size 14 | self._logit_func = logit_func 15 | self._scalar = scalar 16 | 17 | @property 18 | def state_size(self): 19 | return self._state_size 20 | 21 | @property 22 | def output_size(self): 23 | return self._output_size 24 | 25 | def __call__(self, inputs, state, scope=None): 26 | with tf.variable_scope(scope or "SHCell"): 27 | a_size = 1 if self._scalar else self._state_size 28 | h, u = tf.split(1, 2, inputs) 29 | if self._logit_func == 'mul_linear': 30 | args = [h * u, state * u] 31 | a = tf.nn.sigmoid(linear(args, a_size, True)) 32 | elif self._logit_func == 'linear': 33 | args = [h, u, state] 34 | a = tf.nn.sigmoid(linear(args, a_size, True)) 35 | elif self._logit_func == 'tri_linear': 36 | args = [h, u, state, h * u, state * u] 37 | a = tf.nn.sigmoid(linear(args, a_size, True)) 38 | elif self._logit_func == 'double': 39 | args = [h, u, state] 40 | a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True)) 41 | 42 | else: 43 | raise Exception() 44 | new_state = a * state + (1 - a) * h 45 | outputs = state 46 | return outputs, new_state 47 | 48 | -------------------------------------------------------------------------------- /bidaf/basic_cnn/templates/visualizer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 7 | 8 | 19 | 20 | 23 | 24 |

    {{ title }}

    25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | {% for row in rows %} 35 | 36 | 37 | 42 | 47 | 48 | 49 | 72 | 73 | {% endfor %} 74 |
    IDQuestionAnswersPredictedScoreParagraph
    {{ row.id }} 38 | {% for qj in row.ques %} 39 | {{ qj }} 40 | {% endfor %} 41 | 43 | {% for aa in row.a %} 44 |
  • {{ aa }}
  • 45 | {% endfor %} 46 |
    {{ row.ap }}{{ row.score }} 50 | 51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %} 52 | 53 | {% set rowloop = loop %} 54 | {% for xjk, ypjk in zip(xj, ypj) %} 55 | 62 | {% endfor %} 63 | 64 | 65 | {% for xjk, yp2jk in zip(xj, yp2j) %} 66 | 67 | {% endfor %} 68 | 69 | {% endfor %} 70 |
    56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %} 57 | {{ xjk }} 58 | {% else %} 59 | {{ xjk }} 60 | {% endif %} 61 |
    -
    71 |
    75 | 76 | -------------------------------------------------------------------------------- /bidaf/basic_cnn/trainer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from basic_cnn.model import Model 4 | from my.tensorflow import average_gradients 5 | 6 | 7 | class Trainer(object): 8 | def __init__(self, config, model): 9 | assert isinstance(model, Model) 10 | self.config = config 11 | self.model = model 12 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr) 13 | self.loss = model.get_loss() 14 | self.var_list = model.get_var_list() 15 | self.global_step = model.get_global_step() 16 | self.summary = model.summary 17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list) 18 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step) 19 | 20 | def get_train_op(self): 21 | return self.train_op 22 | 23 | def step(self, sess, batch, get_summary=False): 24 | assert isinstance(sess, tf.Session) 25 | _, ds = batch 26 | feed_dict = self.model.get_feed_dict(ds, True) 27 | if get_summary: 28 | loss, summary, train_op = \ 29 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict) 30 | else: 31 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict) 32 | summary = None 33 | return loss, summary, train_op 34 | 35 | 36 | class MultiGPUTrainer(object): 37 | def __init__(self, config, models): 38 | model = models[0] 39 | assert isinstance(model, Model) 40 | self.config = config 41 | self.model = model 42 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr) 43 | self.var_list = model.get_var_list() 44 | self.global_step = model.get_global_step() 45 | self.summary = model.summary 46 | self.models = models 47 | losses = [] 48 | grads_list = [] 49 | for gpu_idx, model in enumerate(models): 50 | with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/gpu:{}".format(gpu_idx)): 51 | loss = model.get_loss() 52 | grads = self.opt.compute_gradients(loss, var_list=self.var_list) 53 | losses.append(loss) 54 | grads_list.append(grads) 55 | 56 | self.loss = tf.add_n(losses)/len(losses) 57 | self.grads = average_gradients(grads_list) 58 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step) 59 | 60 | def step(self, sess, batches, get_summary=False): 61 | assert isinstance(sess, tf.Session) 62 | feed_dict = {} 63 | for batch, model in zip(batches, self.models): 64 | _, ds = batch 65 | feed_dict.update(model.get_feed_dict(ds, True)) 66 | 67 | if get_summary: 68 | loss, summary, train_op = \ 69 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict) 70 | else: 71 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict) 72 | summary = None 73 | return loss, summary, train_op 74 | -------------------------------------------------------------------------------- /bidaf/cnn_dm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/cnn_dm/__init__.py -------------------------------------------------------------------------------- /bidaf/cnn_dm/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | root_dir = sys.argv[1] 6 | answer_path = sys.argv[2] 7 | file_names = os.listdir(root_dir) 8 | 9 | num_correct = 0 10 | num_wrong = 0 11 | 12 | with 
open(answer_path, 'r') as fh: 13 | id2answer_dict = json.load(fh) 14 | 15 | for file_name in file_names: 16 | if not file_name.endswith(".question"): 17 | continue 18 | with open(os.path.join(root_dir, file_name), 'r') as fh: 19 | url = fh.readline().strip() 20 | _ = fh.readline() 21 | para = fh.readline().strip() 22 | _ = fh.readline() 23 | ques = fh.readline().strip() 24 | _ = fh.readline() 25 | answer = fh.readline().strip() 26 | _ = fh.readline() 27 | if file_name in id2answer_dict: 28 | pred = id2answer_dict[file_name] 29 | if pred == answer: 30 | num_correct += 1 31 | else: 32 | num_wrong += 1 33 | else: 34 | num_wrong += 1 35 | 36 | total = num_correct + num_wrong 37 | acc = float(num_correct) / total 38 | print("{} = {} / {}".format(acc, num_correct, total)) -------------------------------------------------------------------------------- /bidaf/data/squad/data_dev.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:43899b7e7a5098aab61718e162399c26d7d2927a0323ae7e96ccd836ec71689a 3 | size 6486869 4 | -------------------------------------------------------------------------------- /bidaf/data/squad/data_test.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:43899b7e7a5098aab61718e162399c26d7d2927a0323ae7e96ccd836ec71689a 3 | size 6486869 4 | -------------------------------------------------------------------------------- /bidaf/data/squad/data_train.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:72faa687d3686a6a07b289c4ab13f53a846ac2034d0b1b75ab025b55ca5c9ca4 3 | size 43602519 4 | -------------------------------------------------------------------------------- /bidaf/data/squad/shared_dev.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3c3c2245183b060a88184d4e0144c56ceb581a9d85de9555a534cf86d32f1bf1 3 | size 57027001 4 | -------------------------------------------------------------------------------- /bidaf/data/squad/shared_test.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3c3c2245183b060a88184d4e0144c56ceb581a9d85de9555a534cf86d32f1bf1 3 | size 57027001 4 | -------------------------------------------------------------------------------- /bidaf/data/squad/shared_train.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:aca764b536502380700ce7dc0a9c5f8609a1d3a0f2c400afb659eddb7eb43c5d 3 | size 244088547 4 | -------------------------------------------------------------------------------- /bidaf/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_DIR=$HOME/data 4 | mkdir $DATA_DIR 5 | 6 | # Download SQuAD 7 | SQUAD_DIR=$DATA_DIR/squad 8 | mkdir $SQUAD_DIR 9 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $SQUAD_DIR/train-v1.1.json 10 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $SQUAD_DIR/dev-v1.1.json 11 | 12 | 13 | # Download CNN and DailyMail 14 | # Download at: http://cs.nyu.edu/~kcho/DMQA/ 15 | 16 | 17 | # Download GloVe 18 | GLOVE_DIR=$DATA_DIR/glove 19 | 
mkdir $GLOVE_DIR 20 | wget http://nlp.stanford.edu/data/glove.6B.zip -O $GLOVE_DIR/glove.6B.zip 21 | unzip $GLOVE_DIR/glove.6B.zip -d $GLOVE_DIR 22 | 23 | # Download NLTK (for tokenizer) 24 | # Make sure that nltk is installed! 25 | python3 -m nltk.downloader -d $HOME/nltk_data punkt 26 | -------------------------------------------------------------------------------- /bidaf/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /bidaf/helpers/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants for easier reference 3 | """ 4 | 5 | TOKENIZER_TEXTBLOB = 'TOKENIZER_TEXTBLOB' 6 | TOKENIZER_NLTK = 'TOKENIZER_NLTK' 7 | TOKENIZER_REGEX = 'TOKENIZER_REGEX' 8 | TOKENIZER_TWITTER = 'TOKENIZER_TWITTER' 9 | TOKENIZER_STANFORD_NLP = 'TOKENIZER_STANFORD_NLP' 10 | 11 | TRAIN_INDEX = 0 12 | VAL_INDEX = 1 13 | TEST_INDEX = 2 14 | JOB_ENDPOINT = 'http://ec2-52-33-179-156.us-west-2.compute.amazonaws.com:8000/api/v1' #'https://104.155.188.251:8080/api/v1' 15 | JOB_ENDPOINT = 'http://104.155.132.60:8080/api/v1' 16 | 17 | ## 18 | ACCESS_TOKEN = '49553f53ef5178db88e7cf4e192a1db1a77cfdbb' 19 | 20 | TRAIN_MODE = 0 21 | TEST_MODE = 1 22 | 23 | WORD_LEVEL = 'WORD_LEVEL' 24 | CHAR_LEVEL = 'CHAR_LEVEL' 25 | WORD_CHAR_LEVEL = 'WORD_CHAR_LEVEL' #Word embeddings with char. lvl lstms 26 | WORD_HASHING_LEVEL = 'WORD_HASHING_LEVEL' 27 | WORD_HASHING_CONSTANT = '%' 28 | 29 | DATASET_TRAIN = 'DATASET_TRAIN' 30 | DATASET_TEST = 'DATASET_TEST' 31 | DATASET_VALIDATION = 'DATASET_VALIDATION' 32 | 33 | GPU_MODE = 'GPU_MODE' 34 | CPU_MODE = 'CPU_MODE' 35 | 36 | CLOUD_MODEL_DIR = 'softmax_models' 37 | CLOUD_MODEL_ENDPOINT = 's3.amazonaws.com' 38 | 39 | AWS_KEY = 'AKIAJ3OQL4ACVRTLQSJA' 40 | AWS_SECRET = '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' 41 | 42 | LOCAL_MODEL_DIR = 'softmax_models' 43 | 44 | PREPROCESS_TYPE_INCEPTION = 'PREPROCESS_TYPE_INCEPTION' 45 | PREPROCESS_TYPE_GOOGLENET = 'PREPROCESS_TYPE_GOOGLENET' 46 | PREPROCESS_TYPE_RESNET = 'PREPROCESS_TYPE_RESNET' 47 | 48 | PREPROCESS_TYPE_RESNET_50 = 'PREPROCESS_TYPE_RESNET_50' 49 | PREPROCESS_TYPE_RESNET_101 = 'PREPROCESS_TYPE_RESNET_101' 50 | PREPROCESS_TYPE_RESNET_152 = 'PREPROCESS_TYPE_RESNET_152' 51 | 52 | NETWORK_TYPE_INCEPTION = 'NETWORK_TYPE_INCEPTION' 53 | NETWORK_TYPE_GOOGLENET = 'NETWORK_TYPE_GOOGLENET' 54 | NETWORK_TYPE_RESNET = 'NETWORK_TYPE_RESNET' 55 | 56 | NETWORK_TYPE_RESNET_30 = 'NETWORK_TYPE_RESNET_30' 57 | NETWORK_TYPE_RESNET_50 = 'NETWORK_TYPE_RESNET_50' 58 | NETWORK_TYPE_RESNET_101 = 'NETWORK_TYPE_RESNET_101' 59 | NETWORK_TYPE_RESNET_152 = 'NETWORK_TYPE_RESNET_152' 60 | 61 | OPTIMIZER_RMSPROP = 'OPTIMIZER_RMSPROP' 62 | OPTIMIZER_ADAM = 'OPTIMIZER_ADAM' 63 | OPTIMIZER_SGD = 'OPTIMIZER_SGD' 64 | 65 | NLTK_DATA_PATH = '../../pretrained_models/nltk' 66 | 67 | # Dependency embeddings path 68 | PRETRAINED_EMBEDDINGS_PATH = '../../pretrained_models/word_embeddings/dependency_embeddings/embeddings.npy' 69 | PRETRAINED_VOCAB_PATH = '../../pretrained_models/word_embeddings/dependency_embeddings/vocab.txt' 70 | 71 | # Part of speech vocab path 72 | POS_VOCAB_PATH = '../../pretrained_models/word_embeddings/pos_tags/vocab.txt' 73 | STANFORD_CORENLP_PATH = '../../pretrained_models/stanford_corenlp/2015-12-09/*' 74 | 75 | MACHINE_READING_MODEL_JOINT_HARD_NEGATIVES_OP = "MACHINE_READING_MODEL_JOINT_HARD_NEGATIVES_OP" 76 | MACHINE_READING_MODEL_JOINT_OP = 
"MACHINE_READING_MODEL_JOINT_OP" 77 | MACHINE_READING_MODEL_SENTENCE_OP = "MACHINE_READING_MODEL_SENTENCE_OP" 78 | MACHINE_READING_MODEL_ANSWER_OP = "MACHINE_READING_MODEL_ANSWER_OP" 79 | 80 | # Initializer for weights (zero, uniform and random) 81 | INITIALIZER_ZERO = 'INITIALIZER_ZERO' 82 | INITIALIZER_UNIFORM_RANDOM = 'INITIALIZER_UNIFORM_RANDOM' 83 | 84 | # To load vocab things 85 | PATH_NPY_ARRAY = 'PATH_NPY_ARRAY' 86 | PATH_TEXT_ARRAY = 'PATH_TEXT_ARRAY' -------------------------------------------------------------------------------- /bidaf/helpers/file_logger.py: -------------------------------------------------------------------------------- 1 | class FileLogger(object): 2 | """ Simple logger to insert stuff into a file """ 3 | def __init__(self, path): 4 | self.file = open(path, 'w') 5 | 6 | def write(self, text, print_text=True): 7 | if print_text: 8 | print("FILE LOGGER: %s" % text) 9 | self.file.write(str(text) + "\n") 10 | self.file.flush() -------------------------------------------------------------------------------- /bidaf/helpers/math_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def top_k_spans(start_probs, end_probs, n, k): 4 | """ 5 | Returns top k non overlapping spans for a passage 6 | sorted by start/end probabilities 7 | """ 8 | probs = [] 9 | argmax_spans = [] 10 | for i in range(k + 1): 11 | probs.append([]) 12 | argmax_spans.append([]) 13 | for j in range(n + 1): 14 | probs[i].append(0) 15 | argmax_spans[i].append([-1, -1]) 16 | 17 | for i in range(k + 1): 18 | probs[i][0] = 0; 19 | 20 | for j in range(n + 1): 21 | probs[0][j] = 0 22 | 23 | # fill the table in bottom-up fashion 24 | for i in range(1, k + 1): 25 | prev_diff = -10000 26 | prev_idx = -1 27 | for j in range(1, n): 28 | if prev_diff < probs[i-1][j-1] - start_probs[j-1]: 29 | prev_diff = probs[i-1][j-1] - start_probs[j-1] 30 | prev_idx = j-1 31 | if probs[i][j-1] > end_probs[j] + prev_diff: 32 | probs[i][j] = probs[i][j-1] 33 | argmax_spans[i][j] = argmax_spans[i][j-1] 34 | else: 35 | probs[i][j] = end_probs[j] + prev_diff 36 | argmax_spans[i][j] = (prev_idx, j) 37 | 38 | max_probs = probs[k][n-1] 39 | cur_probs = max_probs 40 | cur_spans = argmax_spans[k][n-1] 41 | start_end_idxs = [] 42 | start_end_probs = [] 43 | 44 | while cur_probs > 0: 45 | cur_indices = cur_spans 46 | cur_prob = end_probs[cur_indices[1]] - start_probs[cur_indices[0]] 47 | start_end_probs.append(cur_prob) 48 | cur_probs = cur_probs - cur_prob 49 | start_end_idxs.append(cur_indices) 50 | cur_spans = argmax_spans[k][cur_indices[0]] 51 | 52 | return max_probs, start_end_idxs, start_end_probs -------------------------------------------------------------------------------- /bidaf/install_tensorflow.sh: -------------------------------------------------------------------------------- 1 | sudo pip uninstall tensorflow 2 | export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.1-cp35-cp35m-linux_x86_64.whl 3 | 4 | sudo pip install --upgrade pip 5 | sudo pip install --upgrade $TF_BINARY_URL 6 | 7 | sudo pip install requests 8 | sudo pip install tqdm 9 | sudo pip install pandas 10 | sudo pip install nltk 11 | 12 | sudo apt-get update 13 | sudo apt-get install python-software-properties 14 | sudo add-apt-repository ppa:git-core/ppa 15 | curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash 16 | sudo apt-get install git-lfs 17 | git lfs install 
-------------------------------------------------------------------------------- /bidaf/my/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/my/__init__.py -------------------------------------------------------------------------------- /bidaf/my/corenlp_interface.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import requests 4 | import nltk 5 | import json 6 | import networkx as nx 7 | import time 8 | 9 | 10 | class CoreNLPInterface(object): 11 | def __init__(self, url, port): 12 | self._url = url 13 | self._port = port 14 | 15 | def get(self, type_, in_, num_max_requests=100): 16 | in_ = in_.encode("utf-8") 17 | url = "http://{}:{}/{}".format(self._url, self._port, type_) 18 | out = None 19 | for _ in range(num_max_requests): 20 | try: 21 | r = requests.post(url, data=in_) 22 | out = r.content.decode('utf-8') 23 | if out == 'error': 24 | out = None 25 | break 26 | except: 27 | time.sleep(1) 28 | return out 29 | 30 | def split_doc(self, doc): 31 | out = self.get("doc", doc) 32 | return out if out is None else json.loads(out) 33 | 34 | def split_sent(self, sent): 35 | out = self.get("sent", sent) 36 | return out if out is None else json.loads(out) 37 | 38 | def get_dep(self, sent): 39 | out = self.get("dep", sent) 40 | return out if out is None else json.loads(out) 41 | 42 | def get_const(self, sent): 43 | out = self.get("const", sent) 44 | return out 45 | 46 | def get_const_tree(self, sent): 47 | out = self.get_const(sent) 48 | return out if out is None else nltk.tree.Tree.fromstring(out) 49 | 50 | @staticmethod 51 | def dep2tree(dep): 52 | tree = nx.DiGraph() 53 | for dep, i, gov, j, label in dep: 54 | tree.add_edge(gov, dep, label=label) 55 | return tree 56 | -------------------------------------------------------------------------------- /bidaf/my/nltk_utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy as np 3 | 4 | 5 | def _set_span(t, i): 6 | if isinstance(t[0], str): 7 | t.span = (i, i+len(t)) 8 | else: 9 | first = True 10 | for c in t: 11 | cur_span = _set_span(c, i) 12 | i = cur_span[1] 13 | if first: 14 | min_ = cur_span[0] 15 | first = False 16 | max_ = cur_span[1] 17 | t.span = (min_, max_) 18 | return t.span 19 | 20 | 21 | def set_span(t): 22 | assert isinstance(t, nltk.tree.Tree) 23 | try: 24 | return _set_span(t, 0) 25 | except: 26 | print(t) 27 | exit() 28 | 29 | 30 | def tree_contains_span(tree, span): 31 | """ 32 | Assumes that tree span has been set with set_span 33 | Returns true if any subtree of t has exact span as the given span 34 | :param t: 35 | :param span: 36 | :return bool: 37 | """ 38 | return span in set(t.span for t in tree.subtrees()) 39 | 40 | 41 | def span_len(span): 42 | return span[1] - span[0] 43 | 44 | 45 | def span_overlap(s1, s2): 46 | start = max(s1[0], s2[0]) 47 | stop = min(s1[1], s2[1]) 48 | if stop > start: 49 | return start, stop 50 | return None 51 | 52 | 53 | def span_prec(true_span, pred_span): 54 | overlap = span_overlap(true_span, pred_span) 55 | if overlap is None: 56 | return 0 57 | return span_len(overlap) / span_len(pred_span) 58 | 59 | 60 | def span_recall(true_span, pred_span): 61 | overlap = span_overlap(true_span, pred_span) 62 | if overlap is None: 63 | return 0 64 | return span_len(overlap) / span_len(true_span) 65 | 66 | 67 | def 
span_f1(true_span, pred_span): 68 | p = span_prec(true_span, pred_span) 69 | r = span_recall(true_span, pred_span) 70 | if p == 0 or r == 0: 71 | return 0.0 72 | return 2 * p * r / (p + r) 73 | 74 | 75 | def find_max_f1_span(tree, span): 76 | return find_max_f1_subtree(tree, span).span 77 | 78 | 79 | def find_max_f1_subtree(tree, span): 80 | return max(((t, span_f1(span, t.span)) for t in tree.subtrees()), key=lambda p: p[1])[0] 81 | 82 | 83 | def tree2matrix(tree, node2num, row_size=None, col_size=None, dtype='int32'): 84 | set_span(tree) 85 | D = tree.height() - 1 86 | B = len(tree.leaves()) 87 | row_size = row_size or D 88 | col_size = col_size or B 89 | matrix = np.zeros([row_size, col_size], dtype=dtype) 90 | mask = np.zeros([row_size, col_size, col_size], dtype='bool') 91 | 92 | for subtree in tree.subtrees(): 93 | row = subtree.height() - 2 94 | col = subtree.span[0] 95 | matrix[row, col] = node2num(subtree) 96 | for subsub in subtree.subtrees(): 97 | if isinstance(subsub, nltk.tree.Tree): 98 | mask[row, col, subsub.span[0]] = True 99 | if not isinstance(subsub[0], nltk.tree.Tree): 100 | c = subsub.span[0] 101 | for r in range(row): 102 | mask[r, c, c] = True 103 | else: 104 | mask[row, col, col] = True 105 | 106 | return matrix, mask 107 | 108 | 109 | def load_compressed_tree(s): 110 | 111 | def compress_tree(tree): 112 | assert not isinstance(tree, str) 113 | if len(tree) == 1: 114 | if isinstance(tree[0], nltk.tree.Tree): 115 | return compress_tree(tree[0]) 116 | else: 117 | return tree 118 | else: 119 | for i, t in enumerate(tree): 120 | if isinstance(t, nltk.tree.Tree): 121 | tree[i] = compress_tree(t) 122 | else: 123 | tree[i] = t 124 | return tree 125 | 126 | return compress_tree(nltk.tree.Tree.fromstring(s)) 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /bidaf/my/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | from my.tensorflow.general import * -------------------------------------------------------------------------------- /bidaf/my/tensorflow/rnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops.rnn import dynamic_rnn as _dynamic_rnn, \ 3 | bidirectional_dynamic_rnn as _bidirectional_dynamic_rnn 4 | from tensorflow.python.ops.rnn import bidirectional_rnn as _bidirectional_rnn 5 | 6 | from my.tensorflow import flatten, reconstruct 7 | 8 | 9 | def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None, 10 | dtype=None, parallel_iterations=None, swap_memory=False, 11 | time_major=False, scope=None): 12 | assert not time_major # TODO : to be implemented later! 13 | flat_inputs = flatten(inputs, 2) # [-1, J, d] 14 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64') 15 | 16 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len, 17 | initial_state=initial_state, dtype=dtype, 18 | parallel_iterations=parallel_iterations, swap_memory=swap_memory, 19 | time_major=time_major, scope=scope) 20 | 21 | outputs = reconstruct(flat_outputs, inputs, 2) 22 | return outputs, final_state 23 | 24 | 25 | def bw_dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None, 26 | dtype=None, parallel_iterations=None, swap_memory=False, 27 | time_major=False, scope=None): 28 | assert not time_major # TODO : to be implemented later! 
29 | 30 | flat_inputs = flatten(inputs, 2) # [-1, J, d] 31 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64') 32 | 33 | flat_inputs = tf.reverse(flat_inputs, 1) if sequence_length is None \ 34 | else tf.reverse_sequence(flat_inputs, sequence_length, 1) 35 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len, 36 | initial_state=initial_state, dtype=dtype, 37 | parallel_iterations=parallel_iterations, swap_memory=swap_memory, 38 | time_major=time_major, scope=scope) 39 | flat_outputs = tf.reverse(flat_outputs, 1) if sequence_length is None \ 40 | else tf.reverse_sequence(flat_outputs, sequence_length, 1) 41 | 42 | outputs = reconstruct(flat_outputs, inputs, 2) 43 | return outputs, final_state 44 | 45 | 46 | def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, 47 | initial_state_fw=None, initial_state_bw=None, 48 | dtype=None, parallel_iterations=None, 49 | swap_memory=False, time_major=False, scope=None): 50 | assert not time_major 51 | 52 | flat_inputs = flatten(inputs, 2) # [-1, J, d] 53 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64') 54 | 55 | (flat_fw_outputs, flat_bw_outputs), final_state = \ 56 | _bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len, 57 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw, 58 | dtype=dtype, parallel_iterations=parallel_iterations, swap_memory=swap_memory, 59 | time_major=time_major, scope=scope) 60 | 61 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2) 62 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2) 63 | # FIXME : final state is not reshaped! 64 | return (fw_outputs, bw_outputs), final_state 65 | 66 | 67 | def bidirectional_rnn(cell_fw, cell_bw, inputs, 68 | initial_state_fw=None, initial_state_bw=None, 69 | dtype=None, sequence_length=None, scope=None): 70 | 71 | flat_inputs = flatten(inputs, 2) # [-1, J, d] 72 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64') 73 | 74 | (flat_fw_outputs, flat_bw_outputs), final_state = \ 75 | _bidirectional_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len, 76 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw, 77 | dtype=dtype, scope=scope) 78 | 79 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2) 80 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2) 81 | # FIXME : final state is not reshaped! 
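# Note on the pattern used throughout this module: flatten() collapses any extra leading batch
# dimensions of `inputs` into a single batch axis ([-1, J, d]) so TensorFlow's RNN ops can run on it,
# and reconstruct() restores the original leading shape on the outputs; sequence_length is flattened
# the same way. Final states are returned un-reshaped, as the FIXME comments point out.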
82 | return (fw_outputs, bw_outputs), final_state 83 | -------------------------------------------------------------------------------- /bidaf/my/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import deque 3 | 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | 8 | def mytqdm(list_, desc="", show=True): 9 | if show: 10 | pbar = tqdm(list_) 11 | pbar.set_description(desc) 12 | return pbar 13 | return list_ 14 | 15 | 16 | def json_pretty_dump(obj, fh): 17 | return json.dump(obj, fh, sort_keys=True, indent=2, separators=(',', ': ')) 18 | 19 | 20 | def index(l, i): 21 | return index(l[i[0]], i[1:]) if len(i) > 1 else l[i[0]] 22 | 23 | 24 | def fill(l, shape, dtype=None): 25 | out = np.zeros(shape, dtype=dtype) 26 | stack = deque() 27 | stack.appendleft(((), l)) 28 | while len(stack) > 0: 29 | indices, cur = stack.pop() 30 | if len(indices) < shape: 31 | for i, sub in enumerate(cur): 32 | stack.appendleft([indices + (i,), sub]) 33 | else: 34 | out[indices] = cur 35 | return out 36 | 37 | 38 | def short_floats(o, precision): 39 | class ShortFloat(float): 40 | def __repr__(self): 41 | return '%.{}g'.format(precision) % self 42 | 43 | def _short_floats(obj): 44 | if isinstance(obj, float): 45 | return ShortFloat(obj) 46 | elif isinstance(obj, dict): 47 | return dict((k, _short_floats(v)) for k, v in obj.items()) 48 | elif isinstance(obj, (list, tuple)): 49 | return tuple(map(_short_floats, obj)) 50 | return obj 51 | 52 | return _short_floats(o) 53 | 54 | 55 | def argmax(x): 56 | return np.unravel_index(x.argmax(), x.shape) 57 | 58 | 59 | -------------------------------------------------------------------------------- /bidaf/my/zip_save.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import shutil 5 | from zipfile import ZipFile 6 | 7 | from tqdm import tqdm 8 | 9 | 10 | def get_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('paths', nargs='+') 13 | parser.add_argument('-o', '--out', default='save.zip') 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def zip_save(args): 19 | temp_dir = "." 
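# Descriptive note: for each checkpoint path given on the command line, zip_save copies the checkpoint
# file and its run's shared.json into save/<run_id>/, records the original path in a readme.txt,
# and finally shells out to `zip` to archive the whole save/ directory into --out.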
20 | save_dir = os.path.join(temp_dir, "save") 21 | if not os.path.exists(save_dir): 22 | os.makedirs(save_dir) 23 | for save_source_path in tqdm(args.paths): 24 | # path = "out/basic/30/save/basic-18000" 25 | # target_path = "save_dir/30/save" 26 | # also output full path name to "save_dir/30/readme.txt 27 | # need to also extract "out/basic/30/shared.json" 28 | temp, _ = os.path.split(save_source_path) # "out/basic/30/save", _ 29 | model_dir, _ = os.path.split(temp) # "out/basic/30, _ 30 | _, model_name = os.path.split(model_dir) 31 | cur_dir = os.path.join(save_dir, model_name) 32 | if not os.path.exists(cur_dir): 33 | os.makedirs(cur_dir) 34 | save_target_path = os.path.join(cur_dir, "save") 35 | shared_target_path = os.path.join(cur_dir, "shared.json") 36 | readme_path = os.path.join(cur_dir, "readme.txt") 37 | shared_source_path = os.path.join(model_dir, "shared.json") 38 | shutil.copy(save_source_path, save_target_path) 39 | shutil.copy(shared_source_path, shared_target_path) 40 | with open(readme_path, 'w') as fh: 41 | fh.write(save_source_path) 42 | 43 | os.system("zip {} -r {}".format(args.out, save_dir)) 44 | 45 | def main(): 46 | args = get_args() 47 | zip_save(args) 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /bidaf/newsqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/newsqa/__init__.py -------------------------------------------------------------------------------- /bidaf/newsqa/evaluate.py: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:64f332e08e64422da089d47013f59a9596f94cadd6b1c5a142ba8aee47421ee5 3 | size 3226 4 | -------------------------------------------------------------------------------- /bidaf/newsqa/prepro.py: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9685e6d5f587e0d34c27a5f8e521810a0f9dd62968286d86825140a78fcfbcde 3 | size 11421 4 | -------------------------------------------------------------------------------- /bidaf/newsqa_unsupervised_old/data_train.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:23ec75cb6b82dc2a6c5cbb704f146a5e0529e6ba25e344d5063a8c5a0e2af6c7 3 | size 332564109 4 | -------------------------------------------------------------------------------- /bidaf/newsqa_unsupervised_old_verb_filtered/data_train.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a 3 | size 2 4 | -------------------------------------------------------------------------------- /bidaf/out/basic/06/save/basic-40000.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c74b188ef72f2c71c487b5cea41802d532d0ab7ac78c9012d840615fbaf91b61 3 | size 33047428 4 | -------------------------------------------------------------------------------- /bidaf/out/basic/06/save/basic-40000.index: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:99cf4a33e7a4e28497ba709d20880721fff350f605442b541067cc7fda38c180 3 | size 4232 4 | -------------------------------------------------------------------------------- /bidaf/out/basic/06/save/basic-40000.meta: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3f6867f121ca4743b851303041e54976501f58a177e8d4cd4844091037f37b98 3 | size 8040647 4 | -------------------------------------------------------------------------------- /bidaf/out/basic/06/save/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5aa608ba2b2392385c457bc79abdd062bc9772de47dc6def4b7ecb6729fba431 3 | size 79 4 | -------------------------------------------------------------------------------- /bidaf/out/basic/06/shared.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b48087abac725c6079e62d86253b294db5d331d0de98697de24d8d182634948a 3 | size 38884 4 | -------------------------------------------------------------------------------- /bidaf/requirements.txt: -------------------------------------------------------------------------------- 1 | nltk 2 | tqdm 3 | jinja2 -------------------------------------------------------------------------------- /bidaf/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Once that's finished run evalations on the saved models 3 | # Creates pklz files that can be used for final eval. 4 | for i in 41 42 43 44 45 46 47 48 49 51 52 53 54 55 56 57 58 59; 5 | do 6 | for j in 17 18 19; 7 | do 8 | python3 -m basic.cli \ 9 | --run_id $j \ 10 | --shared_path out/basic/06/shared.json \ 11 | --load_path "out/basic/$j/save/basic-"$i"000" \ 12 | --k 10 \ 13 | --use_special_token False \ 14 | --load_ema False --gpu_idx 3 \ 15 | --mode test --data_dir newsqa \ 16 | --len_opt --batch_size 10 --num_steps 40000 \ 17 | --eval_period 1000 --save_period 1000 \ 18 | --sent_size_th 2100 --para_size_th 2100 19 | done 20 | done 21 | for num in 40 41 42 43 44 45; do 22 | eval_path="out/basic/14/eval/test-0${num}000.pklz" 23 | eargs="$eargs $eval_path" 24 | done 25 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results_30.json $eargs 26 | python3 newsqa/evaluate.py newsqa/data_test.json new_results_30.json 27 | -------------------------------------------------------------------------------- /bidaf/scripts.sh: -------------------------------------------------------------------------------- 1 | # Now do evaluations on the pklz files with predictions 2 | model_id=14 3 | eargs="" 4 | 5 | for num in 40; do 6 | eval_path="out/basic/${model_id}/eval/test-0${num}000.pklz" 7 | eargs="$eargs $eval_path" 8 | done 9 | #for num in 41 42 43 46; do 10 | # eval_path="out/basic/${model_id_2}/eval/test-0${num}000.pklz" 11 | # eargs="$eargs $eval_path" 12 | #done 13 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results_30.json $eargs 14 | python3 newsqa/evaluate.py newsqa/data_test.json new_results_30.json 15 | -------------------------------------------------------------------------------- /bidaf/scripts/compare_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python3 
-m visualization.compare_models_newsqa \ 3 | -dataset newsqa/data_test.json \ 4 | -model1 out/basic/06/answer/test-040000.json \ 5 | -model2 out/basic/00/answer/test-040000.json \ 6 | -name1 "BIDAF out-domain" \ 7 | -name2 "BIDAF in-domain" \ 8 | -output "BIDAF_results/outdomain_vs_indomain" 9 | 10 | python3 -m visualization.compare_models_newsqa \ 11 | -dataset newsqa/data_test.json \ 12 | -model1 out/basic/30/answer/test-044000.json \ 13 | -model2 out/basic/00/answer/test-040000.json \ 14 | -name1 "BIDAF Synthetic, k=5, fake a, fake q" \ 15 | -name2 "BIDAF on NewsQA" \ 16 | -output "BIDAF_results/k_5_single_vs_indomain" 17 | 18 | python3 -m visualization.compare_models_newsqa \ 19 | -dataset newsqa/data_test.json \ 20 | -model1 29_all.json \ 21 | -model2 out/basic/06/answer/test-040000.json \ 22 | -name1 "BIDAF Synthetic, k=3, intra ensemble" \ 23 | -name2 "BIDAF on SQUAD -> Newsqa" \ 24 | -output "BIDAF_results/synthetic_k_3_intra_out_domain" 25 | 26 | python3 -m visualization.compare_models_newsqa \ 27 | -dataset newsqa/data_test.json \ 28 | -model1 30_all.json \ 29 | -model2 out/basic/06/answer/test-040000.json \ 30 | -name1 "BIDAF Synthetic, k=5, intra ensemble" \ 31 | -name2 "BIDAF on SQUAD -> Newsqa" \ 32 | -output "BIDAF_results/synthetic_k_5_intra_out_domain" 33 | 34 | python3 -m visualization.compare_models_newsqa \ 35 | -dataset newsqa/data_test.json \ 36 | -model1 30_all.json \ 37 | -model2 26_all.json \ 38 | -name1 "BIDAF Synthetic, k=5, intra ensemble" \ 39 | -name2 "BIDAF Synthetic, k=0, intra ensemble" \ 40 | -output "BIDAF_results/synthetic_k_5_intra_k_0_intra" 41 | 42 | 43 | python3 -m visualization.compare_models_newsqa \ 44 | -dataset newsqa/data_test.json \ 45 | -model1 30_all.json \ 46 | -model2 29_all.json \ 47 | -name1 "BIDAF Synthetic, k=5, intra ensemble" \ 48 | -name2 "BIDAF Synthetic, k=3, intra ensemble" \ 49 | -output "BIDAF_results/synthetic_k_5_intra_k_3_intra" 50 | 51 | python3 -m visualization.compare_models_newsqa \ 52 | -dataset newsqa/data_test.json \ 53 | -model1 30_all.json \ 54 | -model2 out/basic/00/answer/test-040000.json \ 55 | -name1 "BIDAF Synthetic, k=5, fake a, fake q" \ 56 | -name2 "BIDAF on SQUAD -> Newsqa" \ 57 | -output "BIDAF_results/synthetic_k_5_intra_vs_indomain" 58 | 59 | python3 -m visualization.compare_models_newsqa \ 60 | -dataset newsqa/data_test.json \ 61 | -model2 30_all.json \ 62 | -model1 12_all.json \ 63 | -name1 "BIDAF Synthetic, k=5, fake a, fake q" \ 64 | -name2 "BIDAF on SQUAD -> Newsqa" \ 65 | -output "BIDAF_results/synthetic_k_5_intra_vs_k_0_real_ans" 66 | 67 | -------------------------------------------------------------------------------- /bidaf/scripts/evaluate_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for i in 42 43 44 45 46 47 48 49; 5 | do 6 | python3 -m basic_old.cli \ 7 | --run_id 18 \ 8 | --shared_path out/basic/06/shared.json \ 9 | --load_path "out/basic/18/save/basic-"$i"000" \ 10 | --k 10 \ 11 | --use_special_token False \ 12 | --load_ema False --gpu_idx 1 \ 13 | --mode test --data_dir newsqa \ 14 | --len_opt --batch_size 15 --num_steps 40000 \ 15 | --eval_period 1000 --save_period 1000 \ 16 | --sent_size_th 2100 --para_size_th 2100 17 | done 18 | 19 | #for i in 42 43 44 45 46 47 48 49; 20 | #do 21 | # python3 -m basic_old.cli \ 22 | # --run_id 14 \ 23 | # --shared_path out/basic/06/shared.json \ 24 | # --load_path "out/basic/14/save/basic-"$i"000" \ 25 | # --k 10 \ 26 | # --use_special_token False \ 27 | # --load_ema False --gpu_idx 
3 \ 28 | # --mode test --data_dir newsqa \ 29 | # --len_opt --batch_size 15 --num_steps 40000 \ 30 | # --eval_period 1000 --save_period 1000 \ 31 | # --sent_size_th 2100 --para_size_th 2100 32 | #done -------------------------------------------------------------------------------- /bidaf/scripts/finetune_squad.sh: -------------------------------------------------------------------------------- 1 | python3 -m basic_old.cli \ 2 | --run_id 29 \ 3 | --use_special_token False \ 4 | --sup_unsup_ratio 5 \ 5 | --shared_path out/basic/06/shared.json \ 6 | --load_path out/basic/06/save/basic-40000 \ 7 | --k 10 \ 8 | --baseline_dir newsqa \ 9 | --load_ema False --gpu_idx 0 \ 10 | --num_gpus 0 \ 11 | --mode train \ 12 | --data_dir squad_train_unsupervised_verb_filter \ 13 | --len_opt --batch_size 30 \ 14 | --num_steps 40000 \ 15 | --eval_period 1000 --save_period 1000 \ 16 | --sent_size_th 300 --para_size_th 300 17 | 18 | python3 -m basic_old.cli \ 19 | --run_id 30 \ 20 | --use_special_token False \ 21 | --sup_unsup_ratio 3 \ 22 | --shared_path out/basic/06/shared.json \ 23 | --load_path out/basic/06/save/basic-40000 \ 24 | --k 10 \ 25 | --baseline_dir newsqa \ 26 | --load_ema False --gpu_idx 0 \ 27 | --num_gpus 0 \ 28 | --mode train \ 29 | --data_dir squad_train_unsupervised_verb_filter \ 30 | --len_opt --batch_size 30 \ 31 | --num_steps 40000 \ 32 | --eval_period 1000 --save_period 1000 \ 33 | --sent_size_th 300 --para_size_th 300 34 | 35 | python3 -m basic_old.cli \ 36 | --run_id 31 \ 37 | --use_special_token False \ 38 | --sup_unsup_ratio 5 \ 39 | --shared_path out/basic/06/shared.json \ 40 | --load_path out/basic/06/save/basic-40000 \ 41 | --k 10 \ 42 | --baseline_dir newsqa \ 43 | --load_ema False --gpu_idx 0 \ 44 | --num_gpus 0 \ 45 | --mode train \ 46 | --data_dir squad_train_unsupervised_verb_filter_iob \ 47 | --len_opt --batch_size 30 \ 48 | --num_steps 40000 \ 49 | --eval_period 1000 --save_period 1000 \ 50 | --sent_size_th 300 --para_size_th 300 51 | -------------------------------------------------------------------------------- /bidaf/scripts/install_tensorflow.sh: -------------------------------------------------------------------------------- 1 | sudo pip uninstall tensorflow 2 | export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.1-cp34-cp34m-linux_x86_64.whl 3 | 4 | sudo pip install --upgrade pip 5 | sudo pip install --upgrade $TF_BINARY_URL 6 | 7 | sudo pip install requests 8 | sudo pip install tqdm 9 | sudo pip install pandas 10 | sudo pip install nltk 11 | 12 | sudo apt-get update 13 | sudo apt-get install python-software-properties 14 | sudo add-apt-repository ppa:git-core/ppa 15 | curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash 16 | sudo apt-get install git-lfs 17 | git lfs install -------------------------------------------------------------------------------- /bidaf/scripts/run_ensemble_unsupervised.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_id=06 4 | eargs="" 5 | for num in 40 41 42 43 44; do 6 | eval_path="out/basic/${model_id}/eval/test-0${num}000.pklz" 7 | eargs="$eargs $eval_path" 8 | done 9 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results.json $eargs 10 | 11 | -------------------------------------------------------------------------------- /bidaf/scripts/run_evaluation.sh: 
-------------------------------------------------------------------------------- 1 | model_id=$1 2 | model_id_1=$2 3 | model_id_2=26 4 | eargs="" 5 | 6 | 7 | #for num in 41 42 43 44 45 46 48 50 51 52 53 54 55; do 8 | # eval_path="out/basic/${model_id}/eval/test-0${num}000.pklz" 9 | # eargs="$eargs $eval_path" 10 | #done 11 | #""" 12 | for num in 41 42 43 44 45 46 48 50 51; do 13 | eval_path="out/basic/${model_id_1}/eval/test-0${num}000.pklz" 14 | eargs="$eargs $eval_path" 15 | done 16 | #for num in 41 42 43 46; do 17 | # eval_path="out/basic/${model_id_2}/eval/test-0${num}000.pklz" 18 | # eargs="$eargs $eval_path" 19 | #done 20 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results_30.json $eargs 21 | python3 newsqa/evaluate.py newsqa/data_test.json new_results_30.json 22 | 23 | -------------------------------------------------------------------------------- /bidaf/scripts/run_huge_evaluation.sh: -------------------------------------------------------------------------------- 1 | model_id=30 2 | model_id_1=29 3 | model_id_2=34 4 | model_id_3=36 5 | model_id_4=32 6 | model_id_5=37 7 | eargs="" 8 | 9 | for num in 40 41 42 43 44 45 46 48 50 51 52 53 54 55; do 10 | eval_path="out/basic/${model_id}/eval/test-0${num}000.pklz" 11 | eargs="$eargs $eval_path" 12 | done 13 | 14 | for num in 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59; do 15 | eval_path="out/basic/${model_id_1}/eval/test-0${num}000.pklz" 16 | eargs="$eargs $eval_path" 17 | done 18 | 19 | for num in 40 41 42 43 44 45 46 47 48 49 50 51 52 53; do 20 | eval_path="out/basic/${model_id_2}/eval/test-0${num}000.pklz" 21 | eargs="$eargs $eval_path" 22 | done 23 | 24 | for num in 40 41 42 43 44 45 46 47 48 49; do 25 | eval_path="out/basic/${model_id_3}/eval/test-0${num}000.pklz" 26 | eargs="$eargs $eval_path" 27 | done 28 | 29 | for num in 43 44 45 46 47; do 30 | eval_path="out/basic/${model_id_4}/eval/test-0${num}000.pklz" 31 | eargs="$eargs $eval_path" 32 | done 33 | 34 | for num in 40 43 44 45 46 47 48; do 35 | eval_path="out/basic/${model_id_5}/eval/test-0${num}000.pklz" 36 | eargs="$eargs $eval_path" 37 | done 38 | #for num in 41 42 43 46; do 39 | # eval_path="out/basic/${model_id_2}/eval/test-0${num}000.pklz" 40 | # eargs="$eargs $eval_path" 41 | #done 42 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results_1.json $eargs 43 | python3 newsqa/evaluate.py newsqa/data_test.json new_results_1.json 44 | 45 | -------------------------------------------------------------------------------- /bidaf/scripts/run_intra_evaluation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | model_id=$1 3 | save_path="${model_id}_all.json" 4 | eval_paths="out/basic/${model_id}/eval/test-*" 5 | eargs="" 6 | for eval_path in $eval_paths; 7 | do 8 | eargs="$eargs $eval_path" 9 | done 10 | 11 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o $save_path $eargs 12 | python3 newsqa/evaluate.py newsqa/data_test.json $save_path -------------------------------------------------------------------------------- /bidaf/scripts/run_intra_helper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to evaluate single model performance 3 | 4 | for id in 14 16 17 18 29 30 37; 5 | do 6 | echo "on run $id" 7 | ./scripts/run_new.sh $id 8 | done 
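The evaluation scripts in bidaf/scripts (run_evaluation.sh, run_huge_evaluation.sh, run_intra_evaluation.sh above, and run_new.sh below) all follow the same pattern: collect per-checkpoint eval dumps from out/basic/<run_id>/eval/test-*.pklz, pass the list to basic.ensemble together with the NewsQA test data, and score the merged predictions with newsqa/evaluate.py. A minimal Python sketch of that flow follows; the run id, the five-checkpoint cap, and the output filename are illustrative assumptions rather than values fixed by the repo.

# Sketch of the ensemble-then-evaluate flow used by the scripts in bidaf/scripts/.
# run_id, the five-checkpoint cap, and save_path are assumptions for illustration.
import glob
import subprocess

run_id = "30"
save_path = "{}_all.json".format(run_id)
eval_paths = sorted(glob.glob("out/basic/{}/eval/test-*".format(run_id)))[:5]

# Merge the per-checkpoint predictions into a single answer file.
subprocess.check_call(
    ["python3", "-m", "basic.ensemble",
     "--data_path", "newsqa/data_test.json",
     "--shared_path", "newsqa/shared_test.json",
     "-o", save_path] + eval_paths)

# Score the ensembled answers against the NewsQA test set.
subprocess.check_call(["python3", "newsqa/evaluate.py", "newsqa/data_test.json", save_path])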
-------------------------------------------------------------------------------- /bidaf/scripts/run_new.sh: -------------------------------------------------------------------------------- 1 | model_id=$1 2 | save_path="${model_id}_all.json" 3 | eargs="" 4 | 5 | eval_paths="out/basic/${model_id}/eval/test-*" 6 | count=0 7 | max=5 8 | for eval_path in $eval_paths; 9 | do 10 | ((count++)) 11 | if (("$count" < "$max")) 12 | then 13 | eargs="$eargs $eval_path" 14 | fi 15 | done 16 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o $save_path $eargs 17 | python3 newsqa/evaluate.py newsqa/data_test.json $save_path -------------------------------------------------------------------------------- /bidaf/squad/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/squad/__init__.py -------------------------------------------------------------------------------- /bidaf/squad/aug_squad.py: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c3fa27b0b86b8b9142b9f83461e1cf968abceac19b7acb9228852aa56cdf34f9 3 | size 5725 4 | -------------------------------------------------------------------------------- /bidaf/squad/eda_aug_dev.ipynb: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9d5d2057a12994d20542a10918edc301f8a5eba9812d593b649d3f8a3c88ee1a 3 | size 7050 4 | -------------------------------------------------------------------------------- /bidaf/squad/eda_aug_train.ipynb: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2ba686b66274ea73d50d8c8e47f1327197214453740d7d90972a9d87473a3295 3 | size 7812 4 | -------------------------------------------------------------------------------- /bidaf/squad/evaluate-v1.1.py: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f5a673dbbd173e29e9ea38f1b2091d883583b77b3a4c17144b223fb0f2f9bd09 3 | size 3419 4 | -------------------------------------------------------------------------------- /bidaf/squad/evaluate.py: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ff1d15e1c750cb63c0bd95a87a8fe943934ab11a3e14b19e689f2bc5ffb48a95 3 | size 3456 4 | -------------------------------------------------------------------------------- /bidaf/squad/prepro.py: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:022af2fd83bd1b72cddf31dddbbf53b8ee32d860d0827d9e2ca57d33de1dc59c 3 | size 9271 4 | -------------------------------------------------------------------------------- /bidaf/squad/prepro_aug.py: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:14f328397ec2d1710e875ea0c4a0e3914d52008e1a0829aa8fc83a49bf4c094b 3 | size 6858 4 | -------------------------------------------------------------------------------- /bidaf/squad/utils.py: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:6a39505c6fd5861912ff3d7bc959e2e01c26926f214e6aa6c2c2344a400c7adb 3 | size 3492 4 | -------------------------------------------------------------------------------- /bidaf/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/tests/__init__.py -------------------------------------------------------------------------------- /bidaf/tests/check_results.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import gzip 3 | 4 | for path in [42, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54]: 5 | save_path = 'out/basic/19/eval/test-0%s000.pklz' % path#'out/basic/06/eval/dev-040000.pklz'#'out/basic/12/eval/dev-047000.pklz'#out/basic/10/eval/dev-053000.pklz'#'out/basic/09/eval/dev-042000.pklz' #'out/basic/06/eval/dev-040000.pklz' 6 | f = gzip.open(save_path,'rb') 7 | res= pickle.load(f) 8 | f.close() 9 | 10 | print(save_path) 11 | print(res['f1']) 12 | print(res['acc']) 13 | 14 | 15 | #restore the object 16 | #out/basic/19/eval 17 | for path in ['041000']: 18 | save_path = 'out/basic/17/eval/test-%s.pklz' % path#'out/basic/06/eval/dev-040000.pklz'#'out/basic/12/eval/dev-047000.pklz'#out/basic/10/eval/dev-053000.pklz'#'out/basic/09/eval/dev-042000.pklz' #'out/basic/06/eval/dev-040000.pklz' 19 | f = gzip.open(save_path,'rb') 20 | res= pickle.load(f) 21 | f.close() 22 | 23 | print(save_path) 24 | print(res['f1']) 25 | print(res['acc']) 26 | 27 | for path in ['041000', '042000', '043000', '044000', '045000']: 28 | save_path = 'out/basic/14/eval/test-%s.pklz' % path#'out/basic/06/eval/dev-040000.pklz'#'out/basic/12/eval/dev-047000.pklz'#out/basic/10/eval/dev-053000.pklz'#'out/basic/09/eval/dev-042000.pklz' #'out/basic/06/eval/dev-040000.pklz' 29 | f = gzip.open(save_path,'rb') 30 | res= pickle.load(f) 31 | f.close() 32 | 33 | print(save_path) 34 | print(res['f1']) 35 | print(res['acc']) 36 | 37 | # out/basic/25/eval 38 | for path in ['044000', '045000', '046000', '047000', '048000', '049000', '050000', '051000', '052000']: 39 | save_path = 'out/basic/14/eval/dev-%s.pklz' % path#'out/basic/06/eval/dev-040000.pklz'#'out/basic/12/eval/dev-047000.pklz'#out/basic/10/eval/dev-053000.pklz'#'out/basic/09/eval/dev-042000.pklz' #'out/basic/06/eval/dev-040000.pklz' 40 | f = gzip.open(save_path,'rb') 41 | res= pickle.load(f) 42 | f.close() 43 | 44 | print(save_path) 45 | print(res['f1']) 46 | print(res['acc']) 47 | 48 | for path in ['041000', '042000']: 49 | save_path = 'out/basic/18/eval/dev-%s.pklz' % path#'out/basic/06/eval/dev-040000.pklz'#'out/basic/12/eval/dev-047000.pklz'#out/basic/10/eval/dev-053000.pklz'#'out/basic/09/eval/dev-042000.pklz' #'out/basic/06/eval/dev-040000.pklz' 50 | f = gzip.open(save_path,'rb') 51 | res= pickle.load(f) 52 | f.close() 53 | 54 | print(save_path) 55 | print(res['f1']) 56 | print(res['acc']) 57 | 58 | for path in ['041000', '042000', '043000', '044000', '045000', '046000', '047000', '048000', '049000']: 59 | save_path = 'out/basic/17/eval/dev-%s.pklz' % path#'out/basic/06/eval/dev-040000.pklz'#'out/basic/12/eval/dev-047000.pklz'#out/basic/10/eval/dev-053000.pklz'#'out/basic/09/eval/dev-042000.pklz' #'out/basic/06/eval/dev-040000.pklz' 60 | f = gzip.open(save_path,'rb') 61 | res= pickle.load(f) 62 | f.close() 63 | 64 | print(save_path) 65 | print(res['f1']) 66 | print(res['acc']) 
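tests/check_results.py above repeats the same gzip-open / pickle-load / print block for every checkpoint it inspects. A consolidated sketch of that pattern follows; the helper name print_scores and the specific (run, step) pairs are illustrative assumptions, while the path layout, the gzip/pickle calls, and the 'f1' / 'acc' keys come from the script itself.

# Sketch: the repeated inspection block from tests/check_results.py, factored into a helper.
# print_scores and the run/step pairs below are assumptions for illustration.
import gzip
import pickle

def print_scores(save_path):
    # Each eval dump is a gzipped pickle that includes 'f1' and 'acc' entries.
    with gzip.open(save_path, "rb") as f:
        res = pickle.load(f)
    print(save_path)
    print(res["f1"])
    print(res["acc"])

for run_id, steps in [("19", ["042000", "043000"]), ("17", ["041000"])]:
    for step in steps:
        print_scores("out/basic/{}/eval/test-{}.pklz".format(run_id, step))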
-------------------------------------------------------------------------------- /bidaf/tests/create_bidaf_old_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import os 4 | import re 5 | import sys 6 | import time 7 | import urllib 8 | from urllib.parse import quote 9 | from bs4 import BeautifulSoup 10 | from urllib.request import urlopen 11 | from helpers import utils 12 | from collections import defaultdict 13 | from itertools import groupby 14 | 15 | def dedup(q): 16 | grouped_L = [[k, sum(1 for i in g)] for k,g in groupby(q)] 17 | deduped_q = list(map(lambda l: l[0], grouped_L)) 18 | #if "?" not in deduped_q: 19 | #  print("Adding new question") 20 | #  deduped_q.append("?") 21 | return deduped_q 22 | 23 | def invalid_question(q): 24 | string_q = ' '.join(q) 25 | cnn_test = "CNN" in string_q 26 | unk_test = "" in string_q 27 | q_test = "?" not in string_q 28 | small_q_test = len(dedup(q)) < 5 29 | is_invalid = cnn_test or small_q_test #or q_test 30 | 31 | return is_invalid 32 | 33 | def save_results(dev_path, 34 | shared_path, 35 | gen_questions_path, 36 | gen_answers_start_path, 37 | gen_answers_end_path, 38 | gen_idxs_path, 39 | gen_ids_path, 40 | save_path): 41 | print("Loading dev json: %s and shared: %s" % (dev_path, shared_path)) 42 | dev_json = json.load(open(dev_path)) 43 | shared_json = json.load(open(shared_path)) 44 | print("Done loading dev json and shared") 45 | questions = utils.read_lines(gen_questions_path) 46 | answer_starts = utils.read_lines(gen_answers_start_path) 47 | answer_ends = utils.read_lines(gen_answers_end_path) 48 | idxs = utils.read_lines(gen_idxs_path) 49 | ids = utils.read_lines(gen_ids_path) 50 | 51 | keys = dev_json.keys() 52 | dataset = defaultdict(list) 53 | 54 | idx = 54 55 | 56 | for i in range(0, len(questions)): 57 | cur_q = questions[i].split(" ") 58 | if invalid_question(cur_q): 59 | continue 60 | cur_q = dedup(cur_q) 61 | cur_ans_start = int(answer_starts[i]) 62 | cur_ans_end = int(answer_ends[i]) 63 | idx = int(idxs[i]) 64 | id = int(ids[i]) 65 | cur_par = shared_json['x'][idx][0][0] 66 | cy_0 = 0 67 | cy_1 = len(cur_par[cur_ans_end - 1]) 68 | cy = [[cy_0, cy_1]] 69 | 70 | answerss = [cur_par[cur_ans_start:cur_ans_end]] 71 | cur_q_char = list(map(lambda token: token.split(), cur_q)) 72 | 73 | dataset['idxs'].append(idx) 74 | dataset['ids'].append(len(dataset['ids'])) 75 | dataset['cy'].append(cy) 76 | dataset['answerss'].append(answerss) 77 | dataset['span_answerss'].append(answerss) 78 | dataset['*x'].append([idx, 0]) 79 | dataset['*cx'].append([idx, 0]) 80 | dataset['*p'].append([idx, 0]) 81 | 82 | shared_json['x'][idx] 83 | dataset['y'].append([[[0, cur_ans_start], [0, cur_ans_end]]]) 84 | dataset['q'].append(cur_q) 85 | dataset['cq'].append(cur_q_char) 86 | 87 | print("Saving to path %s" % save_path) 88 | utils.save_json(dataset, save_path) 89 | 90 | save_directory = 'newsqa_unsupervised_old' 91 | utils.check_dir(save_directory) 92 | 93 | shared_path = 'newsqa/shared_train.json' 94 | dev_path = 'newsqa/data_train.json' 95 | base_path = '../datasets/newsqa_unsupervised_old/train' 96 | gen_questions_path = '%s/predictions.txt' % base_path#'%s/outputs.txt' % base_path#, 'newsqa/', 'newsqa/'] 97 | gen_answers_start_path = '%s/answer_starts.txt' % base_path 98 | gen_answers_end_path = '%s/answer_ends.txt' % base_path 99 | gen_ids_path = '%s/ids.txt' % base_path 100 | gen_idxs_path = '%s/indices.txt' % base_path 101 | save_path = '%s/data_train.json' %
save_directory 102 | 103 | save_results(dev_path=dev_path, 104 | shared_path=shared_path, 105 | gen_questions_path=gen_questions_path, 106 | gen_answers_start_path=gen_answers_start_path, 107 | gen_answers_end_path=gen_answers_end_path, 108 | gen_ids_path=gen_ids_path, 109 | gen_idxs_path=gen_idxs_path, 110 | save_path=save_path) 111 | 112 | 113 | """ 114 | dev_paths = []#'newsqa/data_train.json', 'newsqa/data_dev.json', 'newsqa/data_test.json'] #'data/squad/data_train.json' 115 | save_paths = ['newsqa_gen_filtered_v2/data_train.json']#'newsqa_gen/data_train.json', 'newsqa_gen/data_dev.json', 'newsqa_gen/data_test.json'] #'data/squad/web_data_train.json' 116 | shared_paths = ['newsqa/shared_train.json'] 117 | 118 | #json_data = json.load(open(save_paths[0])) 119 | #shared_data = json.load(open(shared_paths[0])) 120 | #original_data = json.load(open(dev_paths[0])) 121 | 122 | print(shared_data.keys()) 123 | for idx in range(100, 101): 124 |   print(json_data['q'][idx]) 125 |   print(original_data['q'][idx]) 126 |   print(original_data['answerss'][idx]) 127 |   print(json_data['answerss'][idx]) 128 | 129 | 130 | for dev_path, gen_questions_path, save_path in zip(dev_paths, gen_questions_paths, save_paths): 131 |   save_results(dev_path, gen_questions_path, save_path) 132 | 133 | 134 | """ -------------------------------------------------------------------------------- /bidaf/tests/create_generation_dataset_unsupervised.py: -------------------------------------------------------------------------------- 1 | import json 2 | from squad.utils import get_2d_spans 3 | from helpers import utils 4 | from helpers import spacy_tokenizer 5 | 6 | def create_dataset(save_dir, data_path, shared_path): 7 | print("Loading data from path %s" % data_path) 8 | data = json.load(open(data_path)) 9 | print("Done loading data") 10 | shared_data = json.load(open(shared_path)) 11 | print("Done loading shared data from path %s" % shared_path) 12 | 13 | def count_sums(up_to_idx): 14 | total_len = 0 15 | for i in range(0, up_to_idx): 16 | total_len += len(shared_data['x'][i]) 17 | return total_len 18 | 19 | idxs = [] 20 | xs = [] 21 | answer_starts = [] 22 | answer_ends = [] 23 | indices = [] 24 | questions = [] 25 | 26 | for i in range(len(shared_data['x'])): 27 | print("On %s of %s" % (i, len(shared_data['x']))) 28 | for j in range(len(shared_data['x'][i])): 29 | cur_tokens = shared_data['x'][i][j][0] 30 | cur_text = " ".join(cur_tokens) 31 | cur_ans_starts, cur_ans_ends = spacy_tokenizer.extract_phrases(cur_text, 2) 32 | answer_starts.extend([str(ans) for ans in cur_ans_starts]) 33 | answer_ends.extend([str(ans) for ans in cur_ans_ends]) 34 | idxs.extend(range(len(idxs), len(idxs) + len(cur_ans_starts))) 35 | questions.extend([""] * len(cur_ans_starts)) 36 | indices.extend([str(len(xs))] * len(cur_ans_starts)) 37 | xs.append('\t'.join(cur_tokens)) 38 | 39 | idxs = list(map(lambda idx: str(idx), idxs)) 40 | utils.save_lines(idxs, '%s/ids.txt' % save_dir) 41 | utils.save_lines(questions, '%s/outputs.txt' % save_dir) 42 | utils.save_lines(answer_starts, '%s/answer_starts.txt' % save_dir) 43 | utils.save_lines(answer_ends, '%s/answer_ends.txt' % save_dir) 44 | utils.save_lines(xs, '%s/inputs.txt' % save_dir) 45 | utils.save_lines(indices, '%s/indices.txt' % save_dir) 46 | 47 | 48 | # Create squad dataset 49 | create_dataset(save_dir='../datasets/newsqa_unsupervised/', 50 | data_path='newsqa/data_train.json', 51 | shared_path='newsqa/shared_train.json') 52 | 53 | 
create_dataset(save_dir='../datasets/newsqa_unsupervised/train', 54 | data_path='newsqa/data_train.json', 55 | shared_path='newsqa/shared_train.json') 56 | 57 | create_dataset(save_dir='../datasets/newsqa_unsupervised/validation', 58 | data_path='newsqa/data_validation.json', 59 | shared_path='newsqa/shared_validation.json') 60 | 61 | create_dataset(save_dir='../datasets/newsqa_unsupervised/test', 62 | data_path='newsqa/data_test.json', 63 | shared_path='newsqa/shared_test.json') 64 | 65 | -------------------------------------------------------------------------------- /bidaf/tree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/tree/__init__.py -------------------------------------------------------------------------------- /bidaf/tree/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pprint import pprint 3 | 4 | import tensorflow as tf 5 | 6 | from tree.main import main as m 7 | 8 | flags = tf.app.flags 9 | 10 | flags.DEFINE_string("model_name", "tree", "Model name [tree]") 11 | flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]") 12 | flags.DEFINE_integer("run_id", 0, "Run ID [0]") 13 | 14 | flags.DEFINE_integer("batch_size", 128, "Batch size [128]") 15 | flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]") 16 | flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]") 17 | flags.DEFINE_integer("num_steps", 0, "Number of steps [0]") 18 | flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]") 19 | flags.DEFINE_integer("load_step", 0, "load step [0]") 20 | flags.DEFINE_integer("early_stop", 4, "early stop [4]") 21 | 22 | flags.DEFINE_string("mode", "test", "train | test | forward [test]") 23 | flags.DEFINE_boolean("load", True, "load saved data? [True]") 24 | flags.DEFINE_boolean("progress", True, "Show progress? [True]") 25 | flags.DEFINE_integer("log_period", 100, "Log period [100]") 26 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]") 27 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]") 28 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]") 29 | 30 | flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]") 31 | 32 | flags.DEFINE_integer("hidden_size", 32, "Hidden size [32]") 33 | flags.DEFINE_float("input_keep_prob", 0.5, "Input keep prob [0.5]") 34 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]") 35 | flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]") 36 | flags.DEFINE_float("wd", 0.0001, "Weight decay [0.001]") 37 | flags.DEFINE_bool("lower_word", True, "lower word [True]") 38 | flags.DEFINE_bool("dump_eval", True, "dump eval? 
[True]") 39 | 40 | flags.DEFINE_integer("word_count_th", 100, "word count th [100]") 41 | flags.DEFINE_integer("char_count_th", 500, "char count th [500]") 42 | flags.DEFINE_integer("sent_size_th", 64, "sent size th [64]") 43 | flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]") 44 | flags.DEFINE_integer("ques_size_th", 64, "ques size th [64]") 45 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]") 46 | flags.DEFINE_integer("tree_height_th", 16, "tree height th [16]") 47 | 48 | 49 | def main(_): 50 | config = flags.FLAGS 51 | 52 | config.out_dir = os.path.join("out", config.model_name, str(config.run_id).zfill(2)) 53 | 54 | m(config) 55 | 56 | if __name__ == "__main__": 57 | tf.app.run() 58 | -------------------------------------------------------------------------------- /bidaf/tree/graph_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | from json import encoder 3 | import os 4 | 5 | import tensorflow as tf 6 | 7 | from tree.evaluator import Evaluation 8 | from my.utils import short_floats 9 | 10 | 11 | class GraphHandler(object): 12 | def __init__(self, config): 13 | self.config = config 14 | self.saver = tf.train.Saver() 15 | self.writer = None 16 | self.save_path = os.path.join(config.save_dir, config.model_name) 17 | 18 | def initialize(self, sess): 19 | if self.config.load: 20 | self._load(sess) 21 | else: 22 | sess.run(tf.initialize_all_variables()) 23 | 24 | if self.config.mode == 'train': 25 | self.writer = tf.train.SummaryWriter(self.config.log_dir, graph=tf.get_default_graph()) 26 | 27 | def save(self, sess, global_step=None): 28 | self.saver.save(sess, self.save_path, global_step=global_step) 29 | 30 | def _load(self, sess): 31 | config = self.config 32 | if config.load_step > 0: 33 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step)) 34 | else: 35 | save_dir = config.save_dir 36 | checkpoint = tf.train.get_checkpoint_state(save_dir) 37 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir) 38 | save_path = checkpoint.model_checkpoint_path 39 | print("Loading saved model from {}".format(save_path)) 40 | self.saver.restore(sess, save_path) 41 | 42 | def add_summary(self, summary, global_step): 43 | self.writer.add_summary(summary, global_step) 44 | 45 | def add_summaries(self, summaries, global_step): 46 | for summary in summaries: 47 | self.add_summary(summary, global_step) 48 | 49 | def dump_eval(self, e, precision=2): 50 | assert isinstance(e, Evaluation) 51 | path = os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6))) 52 | with open(path, 'w') as fh: 53 | json.dump(short_floats(e.dict, precision), fh) 54 | 55 | -------------------------------------------------------------------------------- /bidaf/tree/templates/visualizer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 7 | 8 | 19 | 20 | 23 | 24 |

[visualizer.html continues here, but its HTML markup was lost in extraction; the surviving Jinja2 code renders a results table with ID / Question / Answer / Paragraph columns, where each row shows {{ row.id }}, the question tokens {{ qj }}, the answer {{ row.a }}, and the paragraph tokens {{ xjk }}, highlighted when {% if yjk or y2jk %} holds, followed by a second per-token row driven by yp2j.]
    66 | 67 | -------------------------------------------------------------------------------- /bidaf/tree/trainer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tree.model import Model 4 | 5 | 6 | class Trainer(object): 7 | def __init__(self, config, model): 8 | assert isinstance(model, Model) 9 | self.config = config 10 | self.model = model 11 | self.opt = tf.train.AdagradOptimizer(config.init_lr) 12 | self.loss = model.get_loss() 13 | self.var_list = model.get_var_list() 14 | self.global_step = model.get_global_step() 15 | self.ema_op = model.ema_op 16 | self.summary = model.summary 17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list) 18 | opt_op = self.opt.apply_gradients(self.grads, global_step=self.global_step) 19 | 20 | # Define train op 21 | with tf.control_dependencies([opt_op]): 22 | self.train_op = tf.group(self.ema_op) 23 | 24 | def get_train_op(self): 25 | return self.train_op 26 | 27 | def step(self, sess, batch, get_summary=False): 28 | assert isinstance(sess, tf.Session) 29 | feed_dict = self.model.get_feed_dict(batch, True) 30 | if get_summary: 31 | loss, summary, train_op = \ 32 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict) 33 | else: 34 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict) 35 | summary = None 36 | return loss, summary, train_op 37 | -------------------------------------------------------------------------------- /bidaf/tree/visualizer.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from collections import OrderedDict 3 | import http.server 4 | import socketserver 5 | import argparse 6 | import json 7 | import os 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from jinja2 import Environment, FileSystemLoader 12 | 13 | 14 | def bool_(string): 15 | if string == 'True': 16 | return True 17 | elif string == 'False': 18 | return False 19 | else: 20 | raise Exception() 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--model_name", type=str, default='basic') 25 | parser.add_argument("--data_type", type=str, default='dev') 26 | parser.add_argument("--step", type=int, default=5000) 27 | parser.add_argument("--template_name", type=str, default="visualizer.html") 28 | parser.add_argument("--num_per_page", type=int, default=100) 29 | parser.add_argument("--data_dir", type=str, default="data/squad") 30 | parser.add_argument("--port", type=int, default=8000) 31 | parser.add_argument("--host", type=str, default="0.0.0.0") 32 | parser.add_argument("--open", type=str, default='False') 33 | parser.add_argument("--run_id", type=str, default="0") 34 | 35 | args = parser.parse_args() 36 | return args 37 | 38 | 39 | def _decode(decoder, sent): 40 | return " ".join(decoder[idx] for idx in sent) 41 | 42 | 43 | def accuracy2_visualizer(args): 44 | model_name = args.model_name 45 | data_type = args.data_type 46 | num_per_page = args.num_per_page 47 | data_dir = args.data_dir 48 | run_id = args.run_id.zfill(2) 49 | step = args.step 50 | 51 | eval_path =os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6))) 52 | eval_ = json.load(open(eval_path, 'r')) 53 | 54 | _id = 0 55 | html_dir = "/tmp/list_results%d" % _id 56 | while os.path.exists(html_dir): 57 | _id += 1 58 | html_dir = "/tmp/list_results%d" % _id 59 | 60 | if os.path.exists(html_dir): 61 | shutil.rmtree(html_dir) 
62 | os.mkdir(html_dir) 63 | 64 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 65 | templates_dir = os.path.join(cur_dir, 'templates') 66 | env = Environment(loader=FileSystemLoader(templates_dir)) 67 | env.globals.update(zip=zip, reversed=reversed) 68 | template = env.get_template(args.template_name) 69 | 70 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type)) 71 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type)) 72 | data = json.load(open(data_path, 'r')) 73 | shared = json.load(open(shared_path, 'r')) 74 | 75 | rows = [] 76 | for i, (idx, yi, ypi) in enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp')])): 77 | id_, q, rx = (data[key][idx] for key in ('ids', 'q', '*x')) 78 | x = shared['x'][rx[0]][rx[1]] 79 | ques = [" ".join(q)] 80 | para = [[word for word in sent] for sent in x] 81 | row = { 82 | 'id': id_, 83 | 'title': "Hello world!", 84 | 'ques': ques, 85 | 'para': para, 86 | 'y': yi, 87 | 'y2': yi, 88 | 'yp': ypi, 89 | 'yp2': ypi, 90 | 'a': "" 91 | } 92 | rows.append(row) 93 | 94 | if i % num_per_page == 0: 95 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8)) 96 | 97 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']): 98 | var_dict = {'title': "Accuracy Visualization", 99 | 'rows': rows 100 | } 101 | with open(html_path, "wb") as f: 102 | f.write(template.render(**var_dict).encode('UTF-8')) 103 | rows = [] 104 | 105 | os.chdir(html_dir) 106 | port = args.port 107 | host = args.host 108 | # Overriding to suppress log message 109 | class MyHandler(http.server.SimpleHTTPRequestHandler): 110 | def log_message(self, format, *args): 111 | pass 112 | handler = MyHandler 113 | httpd = socketserver.TCPServer((host, port), handler) 114 | if args.open == 'True': 115 | os.system("open http://%s:%d" % (args.host, args.port)) 116 | print("serving at %s:%d" % (host, port)) 117 | httpd.serve_forever() 118 | 119 | 120 | if __name__ == "__main__": 121 | ARGS = get_args() 122 | accuracy2_visualizer(ARGS) -------------------------------------------------------------------------------- /data_loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/data_loaders/__init__.py -------------------------------------------------------------------------------- /datasets/iob_test/label_vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9e6f95acefd53c7d31c37565bb19a57c8a5804bd8cbc903735fcc921eff7323d 3 | size 14 4 | -------------------------------------------------------------------------------- /datasets/iob_test/test/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ed0525e09813b3b5c00dcf14d5ad24a046687f803862acd21443b3941bc2ff1a 3 | size 339 4 | -------------------------------------------------------------------------------- /datasets/iob_test/test/labels.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:21c784fbc151c08fc6f6face1461c7f8c1d27b7d722cdec8a0925e44758ab3dc 3 | size 374 4 | -------------------------------------------------------------------------------- /datasets/iob_test/train/inputs.txt: -------------------------------------------------------------------------------- 
1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ed0525e09813b3b5c00dcf14d5ad24a046687f803862acd21443b3941bc2ff1a 3 | size 339 4 | -------------------------------------------------------------------------------- /datasets/iob_test/train/labels.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:21c784fbc151c08fc6f6face1461c7f8c1d27b7d722cdec8a0925e44758ab3dc 3 | size 374 4 | -------------------------------------------------------------------------------- /datasets/iob_test/validation/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ed0525e09813b3b5c00dcf14d5ad24a046687f803862acd21443b3941bc2ff1a 3 | size 339 4 | -------------------------------------------------------------------------------- /datasets/iob_test/validation/labels.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:21c784fbc151c08fc6f6face1461c7f8c1d27b7d722cdec8a0925e44758ab3dc 3 | size 374 4 | -------------------------------------------------------------------------------- /datasets/iob_test/vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8d364496ba649830b4e63d911f2e81b75aa3f21c5788a4e91be8fb2a3428a4c5 3 | size 40 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/test/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560 3 | size 876366 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/test/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc 3 | size 871721 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/test/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566 3 | size 1634129 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/test/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de 3 | size 1254429 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/test/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3 3 | size 1745239 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/train/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560 3 | size 876366 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/train/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc 3 | size 871721 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/train/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566 3 | size 1634129 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/train/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de 3 | size 1254429 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/train/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3 3 | size 1745239 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/validation/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560 3 | size 876366 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/validation/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc 3 | size 871721 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/validation/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566 3 | size 1634129 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/validation/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de 3 | size 1254429 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/validation/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3 3 | size 1745239 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/vocab.txt: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3e7b26c130a8ba8be7bf34416bfcf472f01ab9ef641dea97f99fb7a1b2fe553c 3 | size 1006234 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised/word_embeddings.npy: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:07007189652bf43a0e3344fcab1080cb5ad6c96bb46bab61e6c5848f53213ffb 3 | size 264444080 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/test/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cd9afca6022b499bde4845e4238bda9e558c38b2f0fb531cc2b1701b2ccc3acf 3 | size 1825956 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/test/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:406c3ff9997978e5ebb39d17a93df6740bee12185bc7fe7caa4d9e79947c2ea0 3 | size 1819658 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/test/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:49aae5695d087c3e522b19198141dae59d33e9b86e8b317a32f039fa5fda79c5 3 | size 3370675 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/test/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:242a4e7181d6dd5b2adc5f7c33485ebae7895cb59de84f4f6214f3b60b6ddc74 3 | size 2502865 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/test/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3c27d065ee9912ac78f85b4fabe70ff79e6a8f43cc706c5a63881cc6261fa8d6 3 | size 3481785 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/train/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cd9afca6022b499bde4845e4238bda9e558c38b2f0fb531cc2b1701b2ccc3acf 3 | size 1825956 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/train/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:406c3ff9997978e5ebb39d17a93df6740bee12185bc7fe7caa4d9e79947c2ea0 3 | size 1819658 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/train/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:49aae5695d087c3e522b19198141dae59d33e9b86e8b317a32f039fa5fda79c5 3 | size 3370675 4 | 
-------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/train/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:242a4e7181d6dd5b2adc5f7c33485ebae7895cb59de84f4f6214f3b60b6ddc74 3 | size 2502865 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/train/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3c27d065ee9912ac78f85b4fabe70ff79e6a8f43cc706c5a63881cc6261fa8d6 3 | size 3481785 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/validation/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cd9afca6022b499bde4845e4238bda9e558c38b2f0fb531cc2b1701b2ccc3acf 3 | size 1825956 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/validation/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:406c3ff9997978e5ebb39d17a93df6740bee12185bc7fe7caa4d9e79947c2ea0 3 | size 1819658 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/validation/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:49aae5695d087c3e522b19198141dae59d33e9b86e8b317a32f039fa5fda79c5 3 | size 3370675 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/validation/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:242a4e7181d6dd5b2adc5f7c33485ebae7895cb59de84f4f6214f3b60b6ddc74 3 | size 2502865 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/validation/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3c27d065ee9912ac78f85b4fabe70ff79e6a8f43cc706c5a63881cc6261fa8d6 3 | size 3481785 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_large/vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3e7b26c130a8ba8be7bf34416bfcf472f01ab9ef641dea97f99fb7a1b2fe553c 3 | size 1006234 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/test/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560 3 | size 876366 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/test/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc 3 | size 871721 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/test/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566 3 | size 1634129 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/test/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de 3 | size 1254429 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/test/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3 3 | size 1745239 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/train/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560 3 | size 876366 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/train/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc 3 | size 871721 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/train/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566 3 | size 1634129 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/train/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de 3 | size 1254429 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/train/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3 3 | size 1745239 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/train/predictions.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8b9c0f018ef8ffde83f03cd0bd0fc64699dcf2bc311d2e6f6c4a85a04c825f80 3 | size 11889229 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/validation/answer_ends.txt: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560 3 | size 876366 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/validation/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc 3 | size 871721 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/validation/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566 3 | size 1634129 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/validation/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de 3 | size 1254429 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/validation/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3 3 | size 1745239 4 | -------------------------------------------------------------------------------- /datasets/newsqa_unsupervised_old/vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3e7b26c130a8ba8be7bf34416bfcf472f01ab9ef641dea97f99fb7a1b2fe553c 3 | size 1006234 4 | -------------------------------------------------------------------------------- /datasets/question_generator/test/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6392c376c4f872a61dcb12d065140b89519bb4832e3dff5dff814dfed667a14b 3 | size 34 4 | -------------------------------------------------------------------------------- /datasets/question_generator/test/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:87d6699635e7201069b652f774311aea81adf07963fbfcda3f0ff6dc948841e0 3 | size 399 4 | -------------------------------------------------------------------------------- /datasets/question_generator/test/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:aa65a846ec104385828b4f4a230ee3ebe0671c553369d264a680044e824e8da3 3 | size 464 4 | -------------------------------------------------------------------------------- /datasets/question_generator/train/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6392c376c4f872a61dcb12d065140b89519bb4832e3dff5dff814dfed667a14b 3 | size 34 4 | -------------------------------------------------------------------------------- 
/datasets/question_generator/train/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:87d6699635e7201069b652f774311aea81adf07963fbfcda3f0ff6dc948841e0 3 | size 399 4 | -------------------------------------------------------------------------------- /datasets/question_generator/train/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:aa65a846ec104385828b4f4a230ee3ebe0671c553369d264a680044e824e8da3 3 | size 464 4 | -------------------------------------------------------------------------------- /datasets/question_generator/validation/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6392c376c4f872a61dcb12d065140b89519bb4832e3dff5dff814dfed667a14b 3 | size 34 4 | -------------------------------------------------------------------------------- /datasets/question_generator/validation/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:87d6699635e7201069b652f774311aea81adf07963fbfcda3f0ff6dc948841e0 3 | size 399 4 | -------------------------------------------------------------------------------- /datasets/question_generator/validation/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:aa65a846ec104385828b4f4a230ee3ebe0671c553369d264a680044e824e8da3 3 | size 464 4 | -------------------------------------------------------------------------------- /datasets/question_generator/vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b7ab460035a08214f7706daa086faa0147b84bdcf41084d76696f39d0c8a17e4 3 | size 47 4 | -------------------------------------------------------------------------------- /datasets/question_generator/word_embeddings.npy: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:836c6845d79fe9afbb6ce4f04519490147b01a0804d2b6c12736b6de88eb795c 3 | size 40880 4 | -------------------------------------------------------------------------------- /datasets/squad/test/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:76794bda0b4ce21b2d7d296979a3440e693f10b2d203ca9ce7cd6ddb44ab63cf 3 | size 32729 4 | -------------------------------------------------------------------------------- /datasets/squad/test/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:79497c26f0078664229f9c4950535ac6349c2d55e5bf32b6832d2d386e8bbfb4 3 | size 32223 4 | -------------------------------------------------------------------------------- /datasets/squad/test/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cc732368ef367f6f04f8efa42a79c6e130f34a88d0270c990f87ed9bcba0ad85 3 | size 264249 4 | -------------------------------------------------------------------------------- /datasets/squad/test/indices.txt: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b58d8062546a48ab079ad3af87be637ed279192189e15baf1bd7800e6d1c92e1 3 | size 46137 4 | -------------------------------------------------------------------------------- /datasets/squad/test/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e58b81a330876cc0b7f7fda075f99054514fc24aa8360ebd0ec55786c7a42dc9 3 | size 5104041 4 | -------------------------------------------------------------------------------- /datasets/squad/test/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:31dfa0b3d1a8aa10e1b663d7abc1cf9db35b0fb0fd4f5a393f7aa8f1f6174ad8 3 | size 657565 4 | -------------------------------------------------------------------------------- /datasets/squad/train/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:aa3fc91e476ee541944dc71f849cf974181663af5a6e7b48611f876067790644 3 | size 270983 4 | -------------------------------------------------------------------------------- /datasets/squad/train/answer_starts.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:04f6da5479a495f05d07abec5902bfc1c4f6a10f155114823059141db3134ab0 3 | size 266277 4 | -------------------------------------------------------------------------------- /datasets/squad/train/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f04ce115e64ab652c743fcf7b8bfcd2666a70ae58b9acb6148cc57c10ad1db45 3 | size 2189974 4 | -------------------------------------------------------------------------------- /datasets/squad/train/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:33f7fb47d5657b5ae647dddb91cf064a5ab74cd16f392aad8d7a0e08146c8058 3 | size 472305 4 | -------------------------------------------------------------------------------- /datasets/squad/train/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c09deb8e15b21d4f7b81afd1f07e52dbe3476ff0eec7809d1dd00a5e702e790e 3 | size 44472596 4 | -------------------------------------------------------------------------------- /datasets/squad/train/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a349f3ab5037f45f997d178064165c6d8d8a60f672203540bd3fc804a7e05cc5 3 | size 5384930 4 | -------------------------------------------------------------------------------- /datasets/squad/validation/answer_ends.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:aa3fc91e476ee541944dc71f849cf974181663af5a6e7b48611f876067790644 3 | size 270983 4 | -------------------------------------------------------------------------------- /datasets/squad/validation/answer_starts.txt: -------------------------------------------------------------------------------- 1 | 
version https://git-lfs.github.com/spec/v1 2 | oid sha256:04f6da5479a495f05d07abec5902bfc1c4f6a10f155114823059141db3134ab0 3 | size 266277 4 | -------------------------------------------------------------------------------- /datasets/squad/validation/ids.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f04ce115e64ab652c743fcf7b8bfcd2666a70ae58b9acb6148cc57c10ad1db45 3 | size 2189974 4 | -------------------------------------------------------------------------------- /datasets/squad/validation/indices.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:33f7fb47d5657b5ae647dddb91cf064a5ab74cd16f392aad8d7a0e08146c8058 3 | size 472305 4 | -------------------------------------------------------------------------------- /datasets/squad/validation/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7c67cb58a444013cc2ef584a46f1738a793c16cf24e226c93c490436b3ed57c9 3 | size 14281184 4 | -------------------------------------------------------------------------------- /datasets/squad/validation/outputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a349f3ab5037f45f997d178064165c6d8d8a60f672203540bd3fc804a7e05cc5 3 | size 5384930 4 | -------------------------------------------------------------------------------- /datasets/squad/vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8efc25e3ba7d0a4137fd27d0824166c0d80c8bd5b652c118541d1e4633bb1bd4 3 | size 1006231 4 | -------------------------------------------------------------------------------- /datasets/squad/word_embeddings.npy: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:07007189652bf43a0e3344fcab1080cb5ad6c96bb46bab61e6c5848f53213ffb 3 | size 264444080 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/label_vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:606f0ee17c5d6a8218cf46208a6608f610fe3a84beb18ba4c24e94fd59bf5809 3 | size 38 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/test/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6d02ca31b2dd2be6b3d7f4208fc24cfa347743b673294090db085396d5314b99 3 | size 1642029 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/test/label_vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:606f0ee17c5d6a8218cf46208a6608f610fe3a84beb18ba4c24e94fd59bf5809 3 | size 38 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/test/labels.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:7b79a2270314f3a85db52a5a09db919bb534bebe0eda7164f28f7dbae2bf0a60 3 | size 2431581 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/test/vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9e07f5f2c65709ae66fc2452cddbd5e73d9376faa100502aca0de37f0e643ad8 3 | size 245190 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/train/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cf9bc9665878d7202115103d702f2c6667329bc62657b4f7746759032e065ab8 3 | size 14278820 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/train/labels.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:85732a286a29d505151910dbf27ec689a734d374eaaf0cce36aa33b17b497240 3 | size 21134049 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/validation/inputs.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6d02ca31b2dd2be6b3d7f4208fc24cfa347743b673294090db085396d5314b99 3 | size 1642029 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/validation/label_vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:606f0ee17c5d6a8218cf46208a6608f610fe3a84beb18ba4c24e94fd59bf5809 3 | size 38 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/validation/labels.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1e2d073ab6528d38d6d2ca74b48b0604090ce81b0bfb9da3515cdcdd3dacaa32 3 | size 2429470 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/validation/vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9e07f5f2c65709ae66fc2452cddbd5e73d9376faa100502aca0de37f0e643ad8 3 | size 245190 4 | -------------------------------------------------------------------------------- /datasets/squad_iob/vocab.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:af38a3df050779170fb1ca1d2c42e7b9cbf02a09a01da5b13420984fb3e9e429 3 | size 1005904 4 | -------------------------------------------------------------------------------- /dnn_units/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/dnn_units/__init__.py -------------------------------------------------------------------------------- /helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/helpers/__init__.py 
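The dataset directories listed above (datasets/squad, datasets/newsqa_unsupervised*, datasets/question_generator, datasets/squad_iob) are checked in as Git LFS pointers, so only their hashes and sizes appear in this dump. Judging from the file names, each split directory holds line-aligned text files (inputs.txt, outputs.txt, answer_starts.txt, answer_ends.txt, ids.txt, indices.txt), and the project's real readers live under data_loaders/. A minimal sketch of loading one such split, assuming one example per line and one integer offset per line in the answer_* files:

```python
import os

def read_lines(path):
    # One record per line, matching the line-aligned layout assumed above.
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]

def load_split(split_dir):
    """Hypothetical reader for a datasets/squad-style split directory.

    Assumes inputs.txt, outputs.txt, answer_starts.txt and answer_ends.txt
    are aligned line by line; the project's own loaders are in data_loaders/.
    """
    inputs = read_lines(os.path.join(split_dir, "inputs.txt"))
    outputs = read_lines(os.path.join(split_dir, "outputs.txt"))
    starts = [int(x) for x in read_lines(os.path.join(split_dir, "answer_starts.txt"))]
    ends = [int(x) for x in read_lines(os.path.join(split_dir, "answer_ends.txt"))]
    assert len(inputs) == len(outputs) == len(starts) == len(ends)
    return list(zip(inputs, outputs, starts, ends))

# Example (paths taken from the listing above):
# examples = load_split("datasets/squad/validation")
# print(len(examples), examples[0][2:])
```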
-------------------------------------------------------------------------------- /helpers/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants for easier reference 3 | """ 4 | 5 | NLTK_DATA_PATH = 'pretrained_models/nltk' 6 | TOKENIZER_TEXTBLOB = 'TOKENIZER_TEXTBLOB' 7 | TOKENIZER_NLTK = 'TOKENIZER_NLTK' 8 | TOKENIZER_REGEX = 'TOKENIZER_REGEX' 9 | TOKENIZER_TWITTER = 'TOKENIZER_TWITTER' 10 | TOKENIZER_STANFORD_NLP = 'TOKENIZER_STANFORD_NLP' 11 | TOKENIZER_TAB = 'TOKENIZER_TAB' 12 | TOKENIZER_SPECIAL_DELIMITER = 'TOKENIZER_SPECIAL_DELIMITER' 13 | TOKENIZER_SPACE = ' ' 14 | 15 | TRAIN_INDEX = 0 16 | VAL_INDEX = 1 17 | TEST_INDEX = 2 18 | 19 | TRAIN_MODE = 0 20 | TEST_MODE = 1 21 | 22 | MODEL_TYPE_LANGUAGE_MODEL = 'MODEL_TYPE_LANGUAGE_MODEL' 23 | 24 | WORD_LEVEL = 'WORD_LEVEL' 25 | CHAR_LEVEL = 'CHAR_LEVEL' 26 | WORD_CHAR_LEVEL = 'WORD_CHAR_LEVEL' #Word embeddings with char. lvl lstms 27 | WORD_HASHING_LEVEL = 'WORD_HASHING_LEVEL' 28 | WORD_HASHING_CONSTANT = '%' 29 | 30 | DATASET_TRAIN = 'DATASET_TRAIN' 31 | DATASET_TEST = 'DATASET_TEST' 32 | DATASET_VALIDATION = 'DATASET_VALIDATION' 33 | 34 | GPU_MODE = 'GPU_MODE' 35 | CPU_MODE = 'CPU_MODE' 36 | 37 | PREPROCESS_TYPE_INCEPTION = 'PREPROCESS_TYPE_INCEPTION' 38 | PREPROCESS_TYPE_GOOGLENET = 'PREPROCESS_TYPE_GOOGLENET' 39 | PREPROCESS_TYPE_RESNET = 'PREPROCESS_TYPE_RESNET' 40 | 41 | PREPROCESS_TYPE_RESNET_50 = 'PREPROCESS_TYPE_RESNET_50' 42 | PREPROCESS_TYPE_RESNET_101 = 'PREPROCESS_TYPE_RESNET_101' 43 | PREPROCESS_TYPE_RESNET_152 = 'PREPROCESS_TYPE_RESNET_152' 44 | 45 | NETWORK_TYPE_INCEPTION = 'NETWORK_TYPE_INCEPTION' 46 | NETWORK_TYPE_GOOGLENET = 'NETWORK_TYPE_GOOGLENET' 47 | NETWORK_TYPE_RESNET = 'NETWORK_TYPE_RESNET' 48 | 49 | NETWORK_TYPE_RESNET_30 = 'NETWORK_TYPE_RESNET_30' 50 | NETWORK_TYPE_RESNET_50 = 'NETWORK_TYPE_RESNET_50' 51 | NETWORK_TYPE_RESNET_101 = 'NETWORK_TYPE_RESNET_101' 52 | NETWORK_TYPE_RESNET_152 = 'NETWORK_TYPE_RESNET_152' 53 | 54 | OPTIMIZER_RMSPROP = 'OPTIMIZER_RMSPROP' 55 | OPTIMIZER_ADAM = 'OPTIMIZER_ADAM' 56 | OPTIMIZER_SGD = 'OPTIMIZER_SGD' 57 | 58 | # Initializer for weights (zero, uniform and random) 59 | INITIALIZER_ZERO = 'INITIALIZER_ZERO' 60 | INITIALIZER_UNIFORM_RANDOM = 'INITIALIZER_UNIFORM_RANDOM' 61 | 62 | # To load vocab things 63 | PATH_NPY_ARRAY = 'PATH_NPY_ARRAY' 64 | PATH_TEXT_ARRAY = 'PATH_TEXT_ARRAY' 65 | 66 | # For card types 67 | PREDICTOR_TEXT_FIELD = 'TEXT_FIELD_PREDICTOR' 68 | PREDICTOR_SINGULAR_FIELD = 'SINGULAR_FIELD_PREDICTOR' 69 | PREDICTOR_CHARACTER = 'CHARACTER_PREDICTOR' 70 | -------------------------------------------------------------------------------- /helpers/io_utils.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import glob 3 | import datetime 4 | import os 5 | import urllib 6 | import shutil 7 | import os 8 | import pickle 9 | 10 | def pickle_save(data, save_path): 11 | print("Saving data to path %s" % save_path) 12 | save_file = open(save_path, 'w') 13 | pickle.dump(data, save_file) 14 | save_file.close() 15 | 16 | def pickle_load(load_path): 17 | print("Loading data from path %s" % load_path) 18 | 19 | load_file = open(load_path, 'r') 20 | data = pickle.load(load_file) 21 | load_file.close() 22 | 23 | print("Done loading data from path %s" % load_path) 24 | return data 25 | 26 | def get_subdirs(src_dir): 27 | return [os.path.join(src_dir, name) for name in os.listdir(src_dir) \ 28 | if os.path.isdir(os.path.join(src_dir, name))] 29 | 30 | def 
copy_files(src_dir, dest_dir): 31 | for filename in glob.glob(os.path.join(src_dir, '*.*')): 32 | shutil.copy(filename, dest_dir) 33 | 34 | def copy_file(src_name, dest_name): 35 | shutil.copyfile(src_name, dest_name) 36 | 37 | def get_files(src_dir): 38 | """ 39 | Gets all files from source directory 40 | """ 41 | files = glob.glob(os.path.join(src_dir, '*.*')) 42 | return files 43 | 44 | def download_file(url, save_path): 45 | """ Downloads url to save_path """ 46 | url_opener = urllib.URLopener() 47 | url_opener.retrieve(url, save_path) 48 | 49 | def check_dir(save_dir): 50 | """ Creates dir if not exists""" 51 | if not os.path.exists(save_dir): 52 | print("Directory %s does not exist, making it now" % save_dir) 53 | os.makedirs(save_dir) 54 | return False 55 | else: 56 | print("Directory %s exists, all good" % save_dir) 57 | return True 58 | 59 | def get_matching_files(regex): 60 | files = glob.glob(regex) 61 | return files 62 | 63 | def zip_files(file_list, save_path): 64 | print('creating archive into path %s' % save_path) 65 | zf = zipfile.ZipFile(save_path, mode='w') 66 | 67 | for f in file_list: 68 | print(f) 69 | zf.write(f) 70 | zf.close() 71 | print_info(save_path) 72 | 73 | def unzip_files(zip_path, directory_to_extract_to): 74 | print("Unzipping files from path %s to dir %s" \ 75 | % (zip_path, directory_to_extract_to)) 76 | zip_ref = zipfile.ZipFile(zip_path, 'r') 77 | zip_ref.extractall(directory_to_extract_to) 78 | zip_ref.close() 79 | 80 | def print_info(archive_name): 81 | zf = zipfile.ZipFile(archive_name) 82 | for info in zf.infolist(): 83 | print(info.filename) 84 | print('\tComment:\t', info.comment) 85 | print('\tModified:\t', datetime.datetime(*info.date_time)) 86 | print('\tSystem:\t\t', info.create_system, '(0 = Windows, 3 = Unix)') 87 | print('\tZIP version:\t', info.create_version) 88 | print('\tCompressed:\t', info.compress_size, 'bytes') 89 | print('\tUncompressed:\t', info.file_size, 'bytes') 90 | print 91 | 92 | -------------------------------------------------------------------------------- /helpers/logger.py: -------------------------------------------------------------------------------- 1 | class FileLogger(object): 2 | """ Simple logger to insert stuff into a file """ 3 | def __init__(self, path): 4 | self.file = open(path, 'w') 5 | 6 | def write(self, text, print_text=False): 7 | if print_text: 8 | print("FILE LOGGER: %s" % text) 9 | self.file.write(text + "\n") 10 | self.file.flush() -------------------------------------------------------------------------------- /helpers/proc_wrapper.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import re 4 | from subprocess import Popen 5 | import urllib2, urllib 6 | import json 7 | import httplib 8 | import os 9 | import urlparse 10 | from optparse import OptionParser 11 | 12 | # This wraps a process within a logger and logs the information in the cloud 13 | def spawner(cmd_list): 14 | print("Spawning process") 15 | p = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 16 | 17 | print("Done spawning") 18 | # Parameters needed for logging 19 | job_id = -1 20 | job_id_got = False 21 | 22 | job_endpoint = 'NONE' 23 | job_endpoint_got = False 24 | 25 | error_log_endpoint = 'NONE' 26 | error_log_endpoint_got = False 27 | 28 | weird_constant = '\x1b[0m' # For some reason adds this weird constant to every line 29 | 30 | while True: 31 | print("Reading line out") 32 | line = p.stdout.readline() 33 | print(line) 34 | 
if line != '': 35 | #the real code does filtering here 36 | stripped_line = line.rstrip() 37 | stripped_line = str.replace(stripped_line, weird_constant, '', ) 38 | 39 | print('STDOUT ' + stripped_line) 40 | if not job_id_got: 41 | print("Got job id") 42 | job_id_got, job_id = get_job_id(stripped_line) 43 | 44 | if not job_endpoint_got: 45 | print("Got job endpoint") 46 | job_endpoint_got, job_endpoint = get_job_endpoint(stripped_line) 47 | 48 | if not error_log_endpoint_got: 49 | print("Getting error log endpoint") 50 | error_log_endpoint_got, error_log_endpoint = get_error_log_endpoint(stripped_line) 51 | else: 52 | break 53 | 54 | stdout, stderr = p.communicate() 55 | 56 | if stderr is not None and stderr is not ' ': 57 | # Job url: http://api_endpoint/api/job/id/ 58 | # job_endpoint: http://api_endpoint/api/job/ 59 | 60 | print('STDERR ' + stderr) 61 | job_url = "{0}{1}{2}".format(job_endpoint, int(job_id), '/') 62 | 63 | print("JOB URL IS " + job_url) 64 | data = {} 65 | data['job'] = job_url 66 | data['text'] = stderr 67 | 68 | send_post_message(error_log_endpoint.rstrip(), data) 69 | else: 70 | print("Job finished successfully.") 71 | 72 | def send_post_message(url, data): 73 | # split url into base and relative path 74 | result = urlparse.urlparse(url) 75 | base_path = result.netloc 76 | relative_path = result.path 77 | encoded_data = urllib.urlencode(data) 78 | 79 | h = httplib.HTTPConnection(base_path) 80 | 81 | headers = {"Content-type": "application/x-www-form-urlencoded"} 82 | 83 | h.request('POST', '/api/v1/error_log/', encoded_data, headers) 84 | 85 | r = h.getresponse() 86 | print r.read() 87 | 88 | 89 | def get_job_id(message): 90 | m = re.search('(?<=job with id )[0-9]*', message) 91 | if m is not None: 92 | job_id = m.group(0) 93 | print("Got job id %s" % job_id) 94 | return True, job_id 95 | else: 96 | return False, -1 97 | 98 | def get_job_endpoint(message): 99 | m = re.search('(?<=Job endpoint ).*', message) 100 | if m is not None: 101 | print("Got job endpoint " + message) 102 | job_endpoint = m.group(0) 103 | print("Job endpoint %s" % job_endpoint) 104 | return True, job_endpoint 105 | else: 106 | return False, '' 107 | 108 | def get_error_log_endpoint(message): 109 | m = re.search('(?<=Error log endpoint ).*', message) 110 | if m is not None: 111 | print("Got error log endpoint") 112 | error_endpoint = m.group(0) 113 | print("Endpoint is %s" % error_endpoint) 114 | return True, error_endpoint 115 | else: 116 | return False, '' 117 | 118 | if __name__ == '__main__': 119 | print("Starting stuff") 120 | parser = OptionParser() 121 | parser.add_option("--command", "--command", dest="command", 122 | help="Command to execute") 123 | 124 | parser.add_option("--args", "--args", dest="args", 125 | help="Args for command") 126 | 127 | (options, args) = parser.parse_args() 128 | 129 | split_args = options.args.split(' ') 130 | 131 | concatenated_args = [options.command] 132 | concatenated_args.extend(split_args) 133 | spawner(concatenated_args) -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pip install tqdm 3 | pip install unidecode 4 | pip install textblob 5 | pip3 install tqdm 6 | pip3 install unidecode 7 | pip3 install textblob 8 | pip3 install http://download.pytorch.org/whl/cu80/torch-0.1.12.post2-cp35-cp35m-linux_x86_64.whl 9 | export 
TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.1-cp35-cp35m-linux_x86_64.whl 10 | sudo pip3 install --upgrade pip 11 | sudo pip3 install --upgrade $TF_BINARY_URL 12 | sudo pip install tensorflow-gpu 13 | pip install spacy && python -m spacy download en 14 | pip3 install spacy && python3 -m spacy download en 15 | cd bidaf 16 | ./download.sh 17 | cd ../ 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /iob/logs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/iob/logs/README.md -------------------------------------------------------------------------------- /logs/results/newsqa/evaluate.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """ 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | unanswered = 0 57 | for i in range(0, len(dataset['answerss'])): 58 | question_id = str(dataset['ids'][i]) 59 | question = dataset['q'][i] 60 | ground_truths = dataset['answerss'][i] 61 | if len(ground_truths) == 0: 62 | ground_truths = [""] 63 | #total += 1 64 | 65 | 66 | if question_id not in predictions: 67 | unanswered += 1 68 | message = 'Unanswered question ' + str(question_id) + \ 69 | ' will receive score 0.' 
70 | print(message, file=sys.stderr) 71 | continue 72 | total += 1 73 | prediction = predictions[question_id] 74 | exact_match += metric_max_over_ground_truths( 75 | exact_match_score, prediction, ground_truths) 76 | f1 += metric_max_over_ground_truths( 77 | f1_score, prediction, ground_truths) 78 | 79 | exact_match = 100.0 * exact_match / total 80 | f1 = 100.0 * f1 / total 81 | print("Number unanswered %s" % unanswered) 82 | 83 | return {'exact_match': exact_match, 'f1': f1} 84 | 85 | 86 | if __name__ == '__main__': 87 | expected_version = '1.1' 88 | parser = argparse.ArgumentParser( 89 | description='Evaluation for SQuAD ' + expected_version) 90 | parser.add_argument('dataset_file', help='Dataset file') 91 | parser.add_argument('prediction_file', help='Prediction File') 92 | args = parser.parse_args() 93 | with open(args.dataset_file) as dataset_file: 94 | dataset = json.load(dataset_file) 95 | with open(args.prediction_file) as prediction_file: 96 | predictions = json.load(prediction_file) 97 | print(json.dumps(evaluate(dataset, predictions))) 98 | -------------------------------------------------------------------------------- /logs/results/script.sh: -------------------------------------------------------------------------------- 1 | # Evaluate out of domain baseline 2 | echo "out of domain baseline" 3 | python3 newsqa/evaluate.py newsqa/data_test.json answer_out_of_domain_baseline.json 4 | 5 | # Evaluate single model results (for steps 2k-10k) 6 | 7 | # Evaluate single model results + baseline (for steps 2k-10k) 8 | 9 | # Evaluate single-model result (44.5 F1) 10 | echo "A_(gen + Ner)" 11 | python3 newsqa/evaluate.py newsqa/data_test.json single_model.json 12 | 13 | # Evaluate two-model result (45.6 F1) 14 | echo "Double model result-2 A_(gen + ner) + A_ner" 15 | python3 newsqa/evaluate.py newsqa/data_test.json double_model.json 16 | 17 | # Evaluate AOracle + Context 18 | echo "Answer oracle with context for question generation, single model" 19 | python3 newsqa/evaluate.py newsqa/data_test.json context_aoracle.json 20 | 21 | echo "Single BiDAF model finetuned on NewsQA 4k steps" 22 | python3 newsqa/evaluate.py newsqa/data_test.json "single_model_results_44.json" 23 | 24 | echo "Single BiDaf finetuned on NewsQA 4k steps ensembled w. baseline results" 25 | python3 newsqa/evaluate.py newsqa/data_test.json "single_model_result_run_44_with_baseline.json" 26 | 27 | # Evaluate single model result of BiDAF finetuned on NewsQA 28 | echo "Single BiDAF model finetuned on NewsQA results" 29 | for num in 42 43 44 45 46 47 48 49; do 30 | python3 newsqa/evaluate.py newsqa/data_test.json "single_model_results_${num}.json" 31 | done 32 | 33 | echo "Single BiDAF model finetuned on NewsQA ensembled w. 
baseline results" 34 | # Evaluate single model ensembled with baseline result of BiDAF finetuned on NewsQA 35 | for num in 42 43 44 45 46 47 48 49; do 36 | python3 newsqa/evaluate.py newsqa/data_test.json "single_model_result_run_${num}_with_baseline.json" 37 | done 38 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/models/__init__.py -------------------------------------------------------------------------------- /models/iob/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/models/iob/__init__.py -------------------------------------------------------------------------------- /models/language_wrapper.py: -------------------------------------------------------------------------------- 1 | from models.language_model import LanguageModel, LanguageDiscriminator 2 | from helpers import vocab 3 | import numpy as np 4 | import torch 5 | from torch.autograd import variable 6 | 7 | class LanguageWrapper(object): 8 | def __init__(self, model, vocab): 9 | self.model = model 10 | self.vocab = vocab 11 | self.create_discriminator() 12 | 13 | def create_discriminator(self): 14 | # Hack to create discriminator: note 15 | # Violates interface design pattern 16 | embeddings = self.model.embedder 17 | text_field_predictor = self.model.text_field_predictor 18 | base_lstm = self.model.base_lstm 19 | 20 | self.discriminator = LanguageDiscriminator(self.model.config, 21 | embeddings, text_field_predictor, base_lstm).cuda() 22 | 23 | def get_discriminator(self): 24 | return self.discriminator 25 | 26 | def get_model(self): 27 | return self.model 28 | 29 | def predict(self, context_tokens, answer_features, max_length, pad=False): 30 | input_token = variable.Variable(torch.LongTensor([[self.vocab.start_index]])).cuda() 31 | end_token = torch.LongTensor([[self.vocab.end_index]]).cuda() 32 | context_tokens = variable.Variable(torch.LongTensor(context_tokens)).cuda() 33 | answer_features = variable.Variable(torch.from_numpy(answer_features)).cuda() 34 | 35 | predictions = self.model.predict(input_token=input_token, 36 | context_tokens=context_tokens, 37 | end_token=end_token, 38 | answer_features=answer_features, 39 | max_length=max_length) 40 | 41 | if pad: 42 | pad_token = variable.Variable(torch.LongTensor([self.vocab.pad_index]).cuda()) 43 | while len(predictions) < max_length: 44 | predictions.append(pad_token) 45 | 46 | stacked_predictions = torch.stack(predictions, 0) 47 | tokens = self.get_tokens_single(stacked_predictions.cpu()) 48 | sentence = " ".join(tokens) 49 | return sentence, stacked_predictions 50 | 51 | def get_tokens_single(self, predictions): 52 | numpy_predictions = torch.squeeze(predictions).data.numpy() 53 | tokens = self.vocab.tokens(numpy_predictions) 54 | return tokens 55 | 56 | def get_tokens(self, predictions): 57 | numpy_predictions = torch.squeeze(predictions).data.numpy() 58 | tokens = self.vocab.tokens_list(numpy_predictions) 59 | return tokens -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt.zip -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/czech.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/czech.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/danish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/danish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/dutch.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/dutch.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/english.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/english.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/estonian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/estonian.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/finnish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/finnish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/french.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/french.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/german.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/german.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/greek.pickle: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/greek.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/italian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/italian.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/norwegian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/norwegian.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/polish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/polish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/portuguese.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/portuguese.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/slovene.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/slovene.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/spanish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/spanish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/swedish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/swedish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/PY3/turkish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/turkish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/czech.pickle: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/czech.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/danish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/danish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/dutch.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/dutch.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/estonian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/estonian.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/finnish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/finnish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/french.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/french.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/german.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/german.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/italian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/italian.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/norwegian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/norwegian.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/polish.pickle: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/polish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/portuguese.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/portuguese.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/slovene.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/slovene.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/spanish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/spanish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/swedish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/swedish.pickle -------------------------------------------------------------------------------- /pretrained_models/nltk/tokenizers/punkt/turkish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/turkish.pickle -------------------------------------------------------------------------------- /pretrained_models/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/scripts/__init__.py -------------------------------------------------------------------------------- /pretrained_models/scripts/create_glove_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m pretrained_models.scripts.download_glove_embeddings 3 | python -m pretrained_models.scripts.transfer_glove_embeddings --path 'pretrained_models/word_embeddings/glove/glove.840B.300d.txt' \ 4 | --save_word_path 'datasets/squad/vocab.txt' \ 5 | --save_embeddings_path 'datasets/squad/word_embeddings.npy' 6 | 7 | python -m pretrained_models.scripts.transfer_glove_embeddings --path 'pretrained_models/word_embeddings/glove/glove.840B.300d.txt' \ 8 | --save_word_path 'datasets/newsqa_unsupervised/vocab.txt' \ 9 | --save_embeddings_path 'datasets/newsqa_unsupervised/word_embeddings.npy' 10 | 11 | python -m pretrained_models.scripts.transfer_glove_embeddings --path 'pretrained_models/word_embeddings/glove/glove.840B.300d.txt' \ 12 | --save_word_path 'datasets/newsqa_unsupervised_large/vocab.txt' \ 13 | --save_embeddings_path 'datasets/newsqa_unsupervised_large/word_embeddings.npy' 
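create_glove_embeddings.sh above ties together the two scripts that follow: download_glove_embeddings.py fetches glove.840B.300d, and transfer_glove_embeddings.py writes a word_embeddings.npy for each dataset whose i-th row is the GloVe vector of the i-th entry of that dataset's vocab.txt, leaving rows at zero for tokens GloVe does not cover. A condensed, standalone sketch of that transfer step, without the project's Vocab/utils helpers; the function name and the dim argument are illustrative:

```python
import numpy as np

def build_embedding_matrix(vocab_path, glove_path, save_path, dim=300):
    """Sketch of the vocab -> embedding-matrix transfer performed by
    pretrained_models/scripts/transfer_glove_embeddings.py (shown below).
    Assumes one token per line in vocab_path and space-separated GloVe rows."""
    with open(vocab_path, encoding="utf-8") as f:
        tokens = [line.rstrip("\n") for line in f]
    index = {tok: i for i, tok in enumerate(tokens)}

    # Out-of-vocabulary tokens keep an all-zero row, as in the original script.
    matrix = np.zeros((len(tokens), dim), dtype=np.float32)
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word, values = parts[0], parts[1:]
            if word in index and len(values) == dim:
                matrix[index[word]] = np.asarray(values, dtype=np.float32)

    np.save(save_path, matrix)
    return matrix

# Mirrors the first invocation in the shell script above:
# build_embedding_matrix("datasets/squad/vocab.txt",
#                        "pretrained_models/word_embeddings/glove/glove.840B.300d.txt",
#                        "datasets/squad/word_embeddings.npy")
```

The project version goes through helpers.vocab.Vocab(add_start_end=True), which adds special start/end tokens, so its row indexing can differ from this plain vocab.txt ordering.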
-------------------------------------------------------------------------------- /pretrained_models/scripts/download_glove_embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads the following: 3 | - Stanford parser 4 | - Stanford POS tagger 5 | - Glove vectors 6 | - SICK dataset (semantic relatedness task) 7 | - Stanford Sentiment Treebank (sentiment classification task) 8 | """ 9 | 10 | from __future__ import print_function 11 | import urllib2 12 | import sys 13 | import os 14 | import shutil 15 | import zipfile 16 | import gzip 17 | from helpers import io_utils 18 | 19 | def download(url, dirpath): 20 | io_utils.check_dir(dirpath) 21 | filename = url.split('/')[-1] 22 | filepath = os.path.join(dirpath, filename) 23 | try: 24 | u = urllib2.urlopen(url) 25 | except: 26 | print("URL %s failed to open" %url) 27 | raise Exception 28 | try: 29 | f = open(filepath, 'wb') 30 | except: 31 | print("Cannot write %s" %filepath) 32 | raise Exception 33 | try: 34 | filesize = int(u.info().getheaders("Content-Length")[0]) 35 | except: 36 | print("URL %s failed to report length" %url) 37 | raise Exception 38 | print("Downloading: %s Bytes: %s" % (filename, filesize)) 39 | 40 | downloaded = 0 41 | block_sz = 8192 42 | status_width = 70 43 | while True: 44 | buf = u.read(block_sz) 45 | if not buf: 46 | print('') 47 | break 48 | else: 49 | print('', end='\r') 50 | downloaded += len(buf) 51 | f.write(buf) 52 | status = (("[%-" + str(status_width + 1) + "s] %3.2f%%") % 53 | ('=' * int(float(downloaded) / filesize * status_width) + '>', downloaded * 100. / filesize)) 54 | print(status, end='') 55 | sys.stdout.flush() 56 | f.close() 57 | return filepath 58 | 59 | def unzip(filepath): 60 | print("Extracting: " + filepath) 61 | dirpath = os.path.dirname(filepath) 62 | with zipfile.ZipFile(filepath) as zf: 63 | zf.extractall(dirpath) 64 | os.remove(filepath) 65 | 66 | def download_wordvecs(dirpath): 67 | url = 'http://www-nlp.stanford.edu/data/glove.840B.300d.zip' 68 | unzip(download(url, dirpath)) 69 | 70 | def create_glove_vocab(dirpath): 71 | glove_path = os.path.join(dirpath, 'glove.840B.300d.txt') 72 | with open(glove_path) as f: 73 | line = f.readline().split(' ') 74 | word = line[0] 75 | vecs = map(lambda item: float(item), line[1:]) 76 | print(word) 77 | print(vecs) 78 | 79 | if __name__ == '__main__': 80 | base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 81 | 82 | # data 83 | data_dir = os.path.join(base_dir, 'word_embeddings') 84 | wordvec_dir = os.path.join(data_dir, 'glove') 85 | 86 | # download dependencies 87 | download_wordvecs(wordvec_dir) 88 | 89 | # create the vocabulary file from the word embeddings 90 | create_glove_vocab(wordvec_dir) -------------------------------------------------------------------------------- /pretrained_models/scripts/transfer_glove_embeddings.py: -------------------------------------------------------------------------------- 1 | from helpers import utils 2 | from helpers import constants 3 | from helpers.vocab import Vocab 4 | 5 | import numpy as np 6 | 7 | from optparse import OptionParser 8 | 9 | parser = OptionParser() 10 | parser.add_option("--path", "--path", dest="path", 11 | help="Path to save words from") 12 | parser.add_option("--save_word_path", "--save_word_path", dest="save_word_path", 13 | help="Where to save vocab") 14 | parser.add_option("--save_embeddings_path", "--save_embeddings_path", dest="save_embeddings_path", 15 | default = 'save_embeddings_path', help="Where to 
save embeddings to") 16 | (options, args) = parser.parse_args() 17 | 18 | path = options.path 19 | save_word_path = options.save_word_path 20 | save_embeddings_path = options.save_embeddings_path 21 | save_embeddings_file = open(save_embeddings_path, 'w') 22 | 23 | # Test loading it into vocab 24 | vocab = Vocab(vocab_type=constants.WORD_LEVEL, add_start_end=True) 25 | vocab.init_from_path(path=save_word_path) 26 | 27 | token_to_embedding = dict() 28 | num_items = [] 29 | original_item = [] 30 | original_embedding = [] 31 | cur_index = 0 32 | def read_line(line): 33 | items = line.strip().split(' ') 34 | embed_size = len(items) - 1 # First item is word 35 | cur_index = len(num_items) 36 | 37 | word = items[0] 38 | embedding_vector = items[1:] 39 | if cur_index % 100 == 0: 40 | print(cur_index) 41 | if len(original_item) == 0: 42 | original_item.append(word) 43 | original_embedding.append(map(lambda vec: float(vec), embedding_vector)) 44 | num_items.append(embed_size) 45 | 46 | if word in vocab.token_to_idx: 47 | token_to_embedding[word] = map(lambda vec: float(vec), embedding_vector) 48 | 49 | print("Reading embeddings") 50 | # Read in raw embeddings 51 | utils.read_lines_with_func(func=read_line, path=path) 52 | 53 | print("Done reading embeddings, now creating save matrix") 54 | original_embedding_size = num_items[0] 55 | word_embedding_matrix = [] 56 | num_items_saved = 0 57 | for i in range(0, vocab.size()): 58 | if i % 100 == 0: 59 | print("On index %s from %s" % (i, vocab.size())) 60 | cur_token = vocab.token(i) 61 | embedding_vector = np.zeros(original_embedding_size) 62 | if cur_token in token_to_embedding: 63 | embedding_vector = token_to_embedding[cur_token] 64 | num_items_saved = num_items_saved + 1 65 | word_embedding_matrix.append(embedding_vector) 66 | 67 | utils.save_matrix(matrix=word_embedding_matrix, path=save_embeddings_path) 68 | vocab.init_embeddings(embeddings_path=save_embeddings_path, path_type=constants.PATH_NPY_ARRAY) 69 | 70 | print("Saved %s of %s tokens" % (num_items_saved, vocab.size())) 71 | print("Testing embeddings for original token %s" % original_item) 72 | index = vocab.index(original_item[0]) 73 | embedding = vocab.embeddings[index] 74 | 75 | print(index) 76 | print(embedding) 77 | 78 | tmp = np.array(original_embedding[0]) 79 | diff = embedding - tmp 80 | print(diff) 81 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requirements.txt -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/tests/__init__.py -------------------------------------------------------------------------------- /tests/gather_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | input = torch.LongTensor([[1, 2], [3, 4], [5,6]]) 3 | dim = 0 4 | index = torch.LongTensor([1, 2]) 5 | res = torch.gather(input, dim, index) 6 | print(res) -------------------------------------------------------------------------------- /tests/iob_loader_test.py: -------------------------------------------------------------------------------- 1 | from data_loaders.iob_loader import IOBLoader 2 | from helpers import constants 3 | 4 | base_directory = 'datasets/iob_test' 5 | 6 | tmp = 
IOBLoader(base_directory, tokenizer_type=constants.TOKENIZER_NLTK) 7 | tmp.mix_indices() 8 | 9 | batch = tmp.get_batch(constants.DATASET_TRAIN, 2) 10 | 11 | print(batch) -------------------------------------------------------------------------------- /tests/iob_test.py: -------------------------------------------------------------------------------- 1 | from models.iob.iob_model import IOBModel 2 | import numpy as np 3 | 4 | config = { 5 | 'input_max_length': 20, 6 | 'vocab_size': 10, 7 | 'embeddings_size': 25, 8 | 'hidden_size': 30, 9 | 'out_size': 5, 10 | 'num_classes': 3, 11 | 'batch_size': 5, 12 | 'learning_rate': 1e-2 13 | } 14 | 15 | model = IOBModel(config) 16 | inputs = np.random.random_integers(0, config['vocab_size'] - 1, 17 | size=[config['batch_size'], config['input_max_length']]) 18 | input_lengths = np.ones((config['batch_size']), dtype=np.int32) * 1 19 | input_masks = np.ones((config['batch_size'], config['input_max_length']), dtype=np.int32) 20 | 21 | for i in range(0, config['batch_size']): 22 | cur_input_length = input_lengths[i] 23 | input_masks[i][cur_input_length:] = 0 24 | 25 | labels = np.random.random_integers(0, config['num_classes'] - 1, 26 | size=[config['batch_size'], config['input_max_length']]) 27 | 28 | batch = { 'inputs': inputs, 29 | 'input_lengths': input_lengths, 30 | 'input_masks': input_masks, 31 | 'labels': labels 32 | } 33 | 34 | for i in range(0, 100): 35 | loss, predictions = model.forward(batch) 36 | print(loss) 37 | print(predictions) 38 | print(labels) 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /tests/iob_trainer_test.py: -------------------------------------------------------------------------------- 1 | from data_loaders.iob_loader import IOBLoader 2 | from models.iob.iob_model import IOBModel 3 | from helpers import constants, utils 4 | 5 | base_directory = 'datasets/iob_test' 6 | 7 | data_loader = IOBLoader(base_directory, tokenizer_type=constants.TOKENIZER_SPACE) 8 | data_loader.mix_indices() 9 | 10 | 11 | config = { 12 | 'input_max_length': data_loader.input_max_length, 13 | 'vocab_size': data_loader.vocab.size(), 14 | 'embeddings_size': 25, 15 | 'hidden_size': 30, 16 | 'out_size': 5, 17 | 'num_classes': data_loader.label_vocab.size(), 18 | 'batch_size': 3, 19 | 'learning_rate': 1e-2, 20 | 'save_path': 'iob/logs'} 21 | 22 | config_path = 'iob/logs/config.json' 23 | params_path = 'iob/logs/model_params.ckpt' 24 | 25 | model = IOBModel(config, embeddings=None) 26 | model.save(config_path, params_path) 27 | model.restore(params_path) 28 | 29 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size']) 30 | 31 | for i in range(0, 100): 32 | while batch is not None: 33 | loss, predictions = model.forward(batch) 34 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size']) 35 | print(predictions) 36 | print(loss) 37 | 38 | if i % 3 == 0: 39 | data_loader.reset_indices() 40 | total_predictions = [] 41 | while True: 42 | batch = data_loader.get_batch(constants.DATASET_TEST, config['batch_size']) 43 | if batch is None: 44 | break 45 | predictions = model.predict(batch) 46 | texts = data_loader.label_vocab.tokens_list(predictions) 47 | for i in range(0, len(texts)): 48 | cur_input_length = batch['input_lengths'][i] 49 | cur_text = texts[i] 50 | text_str = " ".join(cur_text[0:cur_input_length]) 51 | total_predictions.append(text_str) 52 | utils.save_lines(total_predictions, \ 53 | '%s/predictions_test_%s.txt' % (config['save_path'], i)) 54 | 55 | 
data_loader.mix_indices() 56 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size']) 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /tests/language_model_memory_test.py: -------------------------------------------------------------------------------- 1 | from models.language_model import LanguageModel 2 | import torch 3 | from torch import nn 4 | from torch import optim 5 | from torch.autograd import variable 6 | from helpers import torch_utils 7 | 8 | config = {} 9 | config['vocab_size'] = 110000 10 | config['hidden_size'] = 150 11 | config['embedding_size'] = 100 12 | config['num_layers'] = 1 13 | config['dropout'] = 0.0 14 | config['batch_first'] = False 15 | config['use_pretrained_embeddings'] = False 16 | config['finetune_embeddings'] = True 17 | 18 | language_model = LanguageModel(config).cuda() 19 | 20 | # contexts: context_length x batch_size 21 | # inputs: input_length x batch_size 22 | # desired_inputs: input_length x batch_size 23 | 24 | 25 | optimizer = optim.Adam(language_model.parameters(), lr=3e-2) 26 | criterion = nn.NLLLoss() 27 | 28 | for i in range(0, 1000): 29 | optimizer.zero_grad() 30 | inputs = variable.Variable(torch.LongTensor([[1, 2, 3, 4, 5, 6, 7]] * 100)).cuda() 31 | contexts = variable.Variable(torch.LongTensor([[4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10]])).cuda() 32 | context_masks = variable.Variable(torch.FloatTensor([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])).cuda() 33 | desired_inputs = variable.Variable(torch.LongTensor([[1, 2, 3, 4, 5, 6, 7]] * 100)).cuda() 34 | input_masks = variable.Variable(torch.FloatTensor([[1, 1, 1, 1, 1, 1, 1]] * 100)).cuda() 35 | answer_features = variable.Variable(torch.LongTensor([[4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10]])).cuda() 36 | print("On index %s" % i) 37 | 38 | optimizer.zero_grad() 39 | language_probs = language_model.forward(inputs, contexts, context_masks, answer_features=None) 40 | reshaped_inputs = desired_inputs.view(-1) 41 | reshaped_language_probs = language_probs.view(-1, config['vocab_size']) 42 | loss = criterion(reshaped_language_probs, reshaped_inputs) 43 | loss.backward() 44 | optimizer.step() 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /tests/language_model_predict_test.py: -------------------------------------------------------------------------------- 1 | from models.language_model import LanguageModel 2 | import torch 3 | from torch import nn 4 | from torch import optim 5 | from torch.autograd import variable 6 | from helpers import torch_utils 7 | 8 | config = {} 9 | config['vocab_size'] = 25 10 | config['hidden_size'] = 50 11 | config['embedding_size'] = 10 12 | config['num_layers'] = 1 13 | config['dropout'] = 0.0 14 | config['batch_first'] = False 15 | 16 | language_model = LanguageModel(config) 17 | language_model.cuda() 18 | # contexts: context_length x batch_size 19 | # inputs: input_length x batch_size 20 | # desired_inputs: input_length x batch_size 21 | 22 | input_token = variable.Variable(torch.LongTensor([[1]])) 23 | context_tokens = variable.Variable(torch.LongTensor([[2], [3], [4], [5], [6], [7], [8]])) 24 | language_model.predict(input_token, context_tokens, torch.LongTensor([[1]])) 
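25 | 26 | # Note: per the layout comments above, these tensors are (sequence_length x 27 | # batch_size): a one-token input and a seven-token context for a batch of one. 28 | # The trailing LongTensor is presumably a mask/answer-feature argument; see 29 | # LanguageModel.predict for the exact signature. The model is freshly 30 | # initialized, so this is only a smoke test of the predict() code path (note 31 | # that the inputs, unlike the model, are not explicitly moved to the GPU here).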
-------------------------------------------------------------------------------- /tests/language_model_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch import optim 4 | from torch.autograd import variable 5 | from helpers import torch_utils 6 | from models.language_model import LanguageModel 7 | 8 | config = {} 9 | config['vocab_size'] = 25 10 | config['hidden_size'] = 50 11 | config['embedding_size'] = 10 12 | config['num_layers'] = 1 13 | config['dropout'] = 0.0 14 | config['batch_first'] = False 15 | config['use_pretrained_embeddings'] = False 16 | config['gpu_mode'] = True 17 | 18 | language_model = LanguageModel(config) 19 | 20 | # contexts: context_length x batch_size 21 | # inputs: input_length x batch_size 22 | # desired_inputs: input_length x batch_size 23 | 24 | inputs = variable.Variable(torch.LongTensor([[1, 2, 3], [4,5,6]])).cuda() 25 | contexts = variable.Variable(torch.LongTensor([[4, 5, 6], [7, 8, 9], [4, 5, 6], [7, 8, 9]])).cuda() 26 | desired_inputs = variable.Variable(torch.LongTensor([[2, 3, 4], [5, 6, 7]])).cuda() 27 | 28 | optimizer = optim.Adam(language_model.parameters(), lr=3e-2) 29 | criterion = nn.NLLLoss() 30 | language_model.cuda() 31 | 32 | for i in range(0, 100): 33 | optimizer.zero_grad() 34 | language_probs = language_model.forward(inputs, contexts, context_masks=None, answer_features=contexts.float()) 35 | reshaped_inputs = desired_inputs.view(-1) 36 | reshaped_language_probs = language_probs.view(-1, config['vocab_size']) 37 | 38 | max_likelihoods, best_indices = torch.max(language_probs, 2) 39 | diff = torch.eq(torch.squeeze(best_indices).data,desired_inputs.data) 40 | accuracy = (diff.sum()) / torch_utils.num_elements(diff) 41 | 42 | loss = criterion(reshaped_language_probs, reshaped_inputs) 43 | loss.backward() 44 | optimizer.step() 45 | 46 | print(loss) 47 | print(accuracy) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /tests/language_model_trainer_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import variable 3 | from data_loaders.language_model_loader import LanguageModelLoader 4 | from models.language_model import LanguageModel 5 | from models.language_trainer import LanguageTrainer 6 | from models.language_wrapper import LanguageWrapper 7 | from helpers import constants, torch_utils 8 | 9 | 10 | base_path = 'datasets/question_generator' 11 | 12 | 13 | language_model_loader = LanguageModelLoader(base_path) 14 | 15 | config = {} 16 | config['vocab_size'] = language_model_loader.get_vocab().size() 17 | config['hidden_size'] = 100 18 | config['embedding_size'] = 300 19 | config['num_layers'] = 1 20 | config['dropout'] = 0.0 21 | config['batch_first'] = False 22 | config['batch_size'] = 3 23 | config['learning_rate'] = 1e-3 24 | config['log_path'] = 'logs.txt' 25 | config['save_directory'] = 'logs/saved_data' 26 | config['use_pretrained_embeddings'] = True 27 | config['pretrained_embeddings_path'] = 'datasets/question_generator/word_embeddings.npy' 28 | config['finetune_embeddings'] = True 29 | config['gpu_mode'] = True 30 | 31 | language_model = LanguageModel(config).cuda() 32 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 33 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader) 34 | 35 | for i in range(0, 100): 36 | loss, accuracy, predictions = 
language_trainer.train(epoch_num=i) 37 | 38 | if i % 10 == 0: 39 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,epoch_num=10, max_length=20) 40 | language_trainer.save_predictions(i, predictions) 41 | language_trainer.save(i) 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /tests/load_questions.py: -------------------------------------------------------------------------------- 1 | from helpers import utils 2 | from helpers import spacy_tokenizer 3 | import numpy as np 4 | 5 | delimiter='*@#$*($#@*@#$' 6 | def func(l): 7 | items = l.strip().split(delimiter) 8 | return items 9 | 10 | def gen_func(l): 11 | items = l.strip().split(" ") 12 | return items 13 | 14 | answer_starts_path = 'datasets/newsqa/train/answer_starts.txt' 15 | answer_ends_path = 'datasets/newsqa/train/answer_ends.txt' 16 | input_path = 'datasets/newsqa/train/inputs.txt' 17 | output_path = 'datasets/newsqa/train/outputs.txt' 18 | generated_path = 'logs/newsqa_saved_data/dummy5_train_predictions_epoch_6.txt' 19 | indices_path = 'datasets/newsqa/train/indices.txt' 20 | 21 | 22 | inputs = utils.read_lines_with_func(func, input_path) 23 | outputs = utils.read_tabbed_lines(output_path) 24 | generated = utils.read_lines_with_func(gen_func, generated_path) 25 | answer_starts = list(map(lambda l: int(l), utils.read_lines(answer_starts_path))) 26 | answer_ends = list(map(lambda l: int(l), utils.read_lines(answer_ends_path))) 27 | indices = list(map(lambda l: int(l), utils.read_lines(indices_path))) 28 | 29 | answers = [] 30 | truncated_contexts = [] 31 | questions = [] 32 | generated_questions = [] 33 | 34 | num_overlap = [] 35 | num_items = len(generated) 36 | 37 | question_counter = 0 38 | generated_question_counter = 0 39 | filtered_words = ["a", "the", "who", "what", "when", "where", "why", "it"] 40 | for i in range(num_items): 41 | start_idx = answer_starts[i] 42 | end_idx = answer_ends[i] 43 | idx = indices[i] 44 | padded_start_idx = np.max([0, start_idx-10]) 45 | padded_end_idx = np.min([end_idx + 10, len(inputs[idx])]) 46 | truncated_context = inputs[idx][padded_start_idx:padded_end_idx] 47 | 48 | answers.append(inputs[idx][start_idx:end_idx]) 49 | truncated_contexts.append(truncated_context) 50 | 51 | question = outputs[i] 52 | generated_question = generated[i] 53 | 54 | questions.append(question) 55 | generated_questions.append(generated_question) 56 | 57 | for t in question: 58 | if t not in filtered_words: 59 | if t in truncated_context: 60 | question_counter += 1 61 | 62 | for t in generated_question: 63 | if t not in filtered_words: 64 | if t in truncated_context: 65 | generated_question_counter += 1 66 | 67 | 68 | #ner_tokens = spacy_tokenizer.extract_NER(' '.join(truncated_context)) 69 | #assert(False) 70 | 71 | utils.save_tabbed_lines(questions, "analysis/questions.txt") 72 | utils.save_tabbed_lines(generated_questions, "analysis/generated_questions.txt") 73 | utils.save_tabbed_lines(answers, "analysis/answers.txt") 74 | utils.save_tabbed_lines(truncated_contexts, "analysis/truncated_contexts.txt") 75 | 76 | num_tokens_q = question_counter / float(num_items) 77 | num_tokens_generated_q = generated_question_counter / float(num_items) 78 | 79 | 80 | print(num_tokens_q) 81 | print(num_tokens_generated_q) 82 | 83 | 1.8647080433936984 84 | 2.5078769935600986 85 | 86 | 2.958032588494619 87 | 4.5196546656869945 88 | 89 | 2.469334831654925 90 | 4.094059298958379 91 | #utils.save_lines() 92 | #utils.save_lines(): 93 | 94 | #If you look at the 
fraction of questions that have: overlapping words with the context 95 | #Of question words that overlap with the context from synthetically generated questions 96 | #Of question words that overlap with the context from human-generated words. 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /tests/newsqa_predictor_test_unsup.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from data_loaders.language_model_loader import LanguageModelLoader 3 | from models.language_model import LanguageModel 4 | from models.language_trainer import LanguageTrainer 5 | from models.language_wrapper import LanguageWrapper 6 | from helpers import constants 7 | from helpers import torch_utils, utils 8 | from torch.autograd import variable 9 | 10 | dataset_path = 'datasets/newsqa_unsupervised_large' 11 | load_path = 'logs/squad_saved_data/model_14.pyt7' 12 | 13 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB, 14 | context_tokenizer_type=constants.TOKENIZER_TAB) 15 | language_model = torch_utils.load_model(load_path).cuda() 16 | language_model.config['save_directory'] = 'logs/newsqa_unsupervised_saved_data' 17 | language_model.config['gpu_mode'] = True 18 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 19 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader) 20 | 21 | #test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 22 | # epoch_num=10, max_length=20) 23 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION, 24 | # epoch_num=10, max_length=10) 25 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN, 26 | epoch_num=10, max_length=25, beam_size=5) 27 | 28 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/train_predictions_epoch_8.txt') 29 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/validation_predictions_epoch_6.txt') 30 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/test_predictions_epoch_6.txt') 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /tests/newsqa_predictor_test_unsup_large.py: -------------------------------------------------------------------------------- 1 | from data_loaders.language_model_loader import LanguageModelLoader 2 | from models.language_model import LanguageModel 3 | from models.language_trainer import LanguageTrainer 4 | from models.language_wrapper import LanguageWrapper 5 | from helpers import constants 6 | import torch 7 | from helpers import torch_utils, utils 8 | from torch.autograd import variable 9 | 10 | dataset_path = 'datasets/newsqa_unsupervised_large_verb_filtered' 11 | load_path = 'logs/squad_saved_data/model_8.pyt7' # CHANGE THIS TO WHATEVER YOU WANT 12 | 13 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB) 14 | language_model = torch_utils.load_model(load_path).cuda() 15 | language_model.config['save_directory'] = 'logs/newsqa_unsupervised_saved_data' 16 | 17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 18 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader) 19 | 20 | ##test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 21 | # epoch_num=10, 
max_length=20, beam_size=5) 22 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION, 23 | # epoch_num=10, max_length=10, beam_size=5) 24 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN, 25 | epoch_num=10, max_length=10) 26 | 27 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/train_predictions_epoch_6.txt') 28 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/validation_predictions_epoch_6.txt') 29 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/test_predictions_epoch_6.txt') 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /tests/newsqa_predictor_test_unsup_truncated.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from data_loaders.language_model_loader_truncate import LanguageModelLoaderTruncate 3 | from models.language_model import LanguageModel 4 | from models.language_trainer import LanguageTrainer 5 | from models.language_wrapper import LanguageWrapper 6 | from helpers import constants 7 | from helpers import torch_utils, utils 8 | from torch.autograd import variable 9 | 10 | dataset_path = 'datasets/newsqa_unsupervised_verb_filtered' 11 | load_path = 'logs/squad_saved_data_truncated/model_14.pyt7' # CHANGE THIS TO WHATEVER YOU WANT 12 | 13 | language_model_loader = LanguageModelLoaderTruncate(dataset_path, tokenizer_type=constants.TOKENIZER_TAB) 14 | language_model = torch_utils.load_model(load_path).cuda() 15 | language_model.config['save_directory'] = 'logs/newsqa_saved_data' 16 | 17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 18 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader) 19 | 20 | #test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 21 | # epoch_num=10, max_length=20) 22 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION, 23 | # epoch_num=10, max_length=10) 24 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN, 25 | epoch_num=10, max_length=10) 26 | 27 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/dummy5_unsup_train_predictions_epoch_6.txt') 28 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/dummy5_unsup_validation_predictions_epoch_6.txt') # dev_predictions is commented out above 29 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/dummy5_unsup_test_predictions_epoch_6.txt') # test_predictions is commented out above 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /tests/newsqa_predictor_test_verb.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from helpers import torch_utils, utils 3 | from torch.autograd import variable 4 | from data_loaders.language_model_loader import LanguageModelLoader 5 | from models.language_model import LanguageModel 6 | from models.language_trainer import LanguageTrainer 7 | from models.language_wrapper import LanguageWrapper 8 | from helpers import constants 9 | 10 | 11 | dataset_path = 'datasets/newsqa_unsupervised_verb_filtered' 12 | load_path = 'logs/squad_saved_data/model_14.pyt7' 13 | 14 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB) 15 | language_model = torch_utils.load_model(load_path).cuda() 16 | language_model.config['save_directory'] = 'logs/newsqa_unsupervised_verb_filtered' 17 | 18 | language_wrapper 
= LanguageWrapper(language_model, language_model_loader.get_vocab()) 19 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader) 20 | 21 | #test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 22 | # epoch_num=10, max_length=20) 23 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION, 24 | # epoch_num=10, max_length=10) 25 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN, 26 | epoch_num=12, max_length=15, 27 | beam_size=5) 28 | 29 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/train_predictions_epoch_6_verb_filtered.txt') 30 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/validation_predictions_epoch_6.txt') 31 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/test_predictions_epoch_6.txt') 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /tests/newsqa_trainer_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import variable 3 | from data_loaders.language_model_loader import LanguageModelLoader 4 | from models.language_model import LanguageModel 5 | from models.language_trainer import LanguageTrainer 6 | from models.language_wrapper import LanguageWrapper 7 | from helpers import constants, torch_utils 8 | 9 | base_path = 'datasets/newsqa_train' 10 | 11 | language_model_loader = LanguageModelLoader(base_path, tokenizer_type=constants.TOKENIZER_TAB) 12 | 13 | config = {} 14 | config['vocab_size'] = language_model_loader.get_vocab().size() 15 | config['hidden_size'] = 100 16 | config['embedding_size'] = 300 17 | config['num_layers'] = 1 18 | config['dropout'] = 0.0 19 | config['batch_first'] = False 20 | config['batch_size'] = 10 21 | config['learning_rate'] = 1e-3 22 | config['log_path'] = 'logs.txt' 23 | config['save_directory'] = 'logs/newsqa_train_saved_data' 24 | config['use_pretrained_embeddings'] = True 25 | config['pretrained_embeddings_path'] = 'datasets/newsqa_train/word_embeddings.npy' 26 | config['finetune_embeddings'] = False 27 | config['load_model'] = True 28 | config['saved_epoch'] = 1 29 | config['load_path'] = 'logs/newsqa_train_saved_data/model_1.pyt7' 30 | 31 | language_model = LanguageModel(config) 32 | if config['load_model']: 33 | language_model = torch_utils.load_model(config['load_path']) 34 | 35 | language_model.cuda() 36 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 37 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader) 38 | 39 | for i in range(0, 100): 40 | loss, accuracy, predictions = language_trainer.train(epoch_num=i) 41 | 42 | if i % 3 == 1: 43 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 44 | epoch_num=10, max_length=20) 45 | language_trainer.save(i + config['saved_epoch']) 46 | language_trainer.save_predictions(i + config['saved_epoch'], predictions) 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /tests/pointer_network_test.py: -------------------------------------------------------------------------------- 1 | from data_loaders.language_model_loader import LanguageModelLoader 2 | from models.pointer_network import PointerNetwork 3 | from helpers import constants 4 | import torch 5 | from torch.autograd import variable 6 | from torch import optim 7 | from torch import nn 8 | 9 | 
base_path = 'datasets/newsqa_train' 10 | language_model_loader = LanguageModelLoader(base_path, tokenizer_type=constants.TOKENIZER_NLTK) 11 | language_model_loader.mix_indices() 12 | 13 | config = {} 14 | config['vocab_size'] = language_model_loader.get_vocab().size() 15 | config['hidden_size'] = 100 16 | config['embedding_size'] = 300 17 | config['num_layers'] = 1 18 | config['dropout'] = 0.0 19 | config['batch_first'] = False 20 | config['batch_size'] = 24 21 | config['learning_rate'] = 1e-3 22 | config['log_path'] = 'logs.txt' 23 | config['save_directory'] = 'logs/squad_saved_data' 24 | config['use_pretrained_embeddings'] = True 25 | config['pretrained_embeddings_path'] = 'datasets/squad/word_embeddings.npy' 26 | config['finetune_embeddings'] = False 27 | config['load_model'] = True 28 | config['load_path'] = 'logs/squad_saved_data/model_7_old.pyt7' 29 | 30 | pointer_network = PointerNetwork(config).cuda() 31 | 32 | 33 | criterion1 = nn.CrossEntropyLoss().cuda() 34 | criterion2 = nn.CrossEntropyLoss().cuda() 35 | optimizer = optim.Adam(pointer_network.parameters(), 1e-2) 36 | 37 | 38 | batch = language_model_loader.get_batch(dataset_type=constants.DATASET_TRAIN, batch_size=config['batch_size']) 39 | 40 | large_negative_number = -1.e-10 41 | while batch is not None: 42 | optimizer.zero_grad() 43 | input_lengths = variable.Variable(torch.from_numpy(batch['context_lengths'])).cuda() 44 | input_vals = variable.Variable(torch.from_numpy(batch['context_tokens'])).cuda() 45 | answer_starts = variable.Variable(torch.from_numpy(batch['answer_starts'])).cuda() 46 | answer_ends = variable.Variable(torch.from_numpy(batch['answer_ends'])).cuda() 47 | masks = variable.Variable(torch.from_numpy(batch['context_masks'].T).float()).cuda() 48 | 49 | p_start, p_end = pointer_network.forward(input_vals, input_lengths, masks) 50 | 51 | # Batch first 52 | loss = criterion1(p_start, answer_starts) + \ 53 | criterion2(p_end, answer_ends) 54 | 55 | print(loss) 56 | loss.backward() 57 | optimizer.step() 58 | batch = language_model_loader.get_batch(dataset_type=constants.DATASET_TRAIN, batch_size=config['batch_size']) 59 | 60 | 61 | -------------------------------------------------------------------------------- /tests/question_discriminator_test.py: -------------------------------------------------------------------------------- 1 | from models.language_model import TextFieldPredictor, LanguageModel, LanguageDiscriminator 2 | from dnn_units.lstm_attention import LSTMAttentionDot 3 | from torch import nn 4 | from torch import optim 5 | from helpers import torch_utils 6 | import torch 7 | from torch.autograd import variable 8 | 9 | load_path = 'logs/squad_saved_data/model_6.pyt7' 10 | language_model = torch_utils.load_model(load_path) 11 | language_model = language_model.cuda() 12 | 13 | batch_size = 3 14 | 15 | embeddings = language_model.embedder 16 | text_field_predictor = language_model.text_field_predictor 17 | base_lstm = language_model.base_lstm 18 | 19 | discriminator = LanguageDiscriminator(language_model.config, 20 | embeddings, text_field_predictor, base_lstm).cuda() 21 | 22 | discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=3e-2) 23 | discriminator_criterion = nn.BCELoss() 24 | 25 | contexts = variable.Variable(torch.LongTensor([[1, 2, 3], [2, 3, 4], [4, 5, 6]])).cuda() 26 | answer_features = variable.Variable(torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]])).cuda() 27 | inputs = variable.Variable(torch.LongTensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])).cuda() 28 | 29 | 
desired_indices = variable.Variable(torch.FloatTensor([1, 1, 1])).cuda() 30 | 31 | for i in range(0, 100): 32 | discriminator_optimizer.zero_grad() 33 | pred = discriminator.forward(inputs, contexts, answer_features) 34 | bce_loss = discriminator_criterion(pred, desired_indices) 35 | bce_loss.backward() 36 | 37 | print(bce_loss) 38 | discriminator_optimizer.step() 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /tests/squad_discriminator_test.py: -------------------------------------------------------------------------------- 1 | from data_loaders.language_model_loader import LanguageModelLoader 2 | from models.language_model import LanguageModel 3 | from models.language_discriminator_trainer import LanguageDiscriminatorTrainer 4 | from models.language_wrapper import LanguageWrapper 5 | from helpers import constants 6 | import torch 7 | from helpers import torch_utils, utils 8 | from torch.autograd import variable 9 | 10 | dataset_path = 'datasets/squad' 11 | load_path = 'logs/squad_saved_data/model_6.pyt7' 12 | 13 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB) 14 | language_model = torch_utils.load_model(load_path).cuda() 15 | language_model.config['save_directory'] = 'logs/newsqa_saved_data' 16 | 17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 18 | language_trainer = LanguageDiscriminatorTrainer(language_model.config, language_wrapper, language_model_loader) 19 | 20 | for i in range(0, 100): 21 | language_trainer.predict(dataset_type=constants.DATASET_TRAIN, epoch_num=1, max_length=20) 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/squad_loader_test_v2.py: -------------------------------------------------------------------------------- 1 | from data_loaders.language_model_loader import LanguageModelLoader 2 | from models.language_model import LanguageModel 3 | from helpers import constants 4 | 5 | base_path = 'datasets/newsqa' 6 | language_model_loader = LanguageModelLoader(base_path, tokenizer_type=constants.TOKENIZER_NLTK) 7 | language_model_loader.reset_indices() 8 | batch = language_model_loader.get_batch(dataset_type=constants.DATASET_TRAIN, batch_size=10) 9 | 10 | config = {} 11 | config['vocab_size'] = language_model_loader.get_vocab().size() 12 | config['hidden_size'] = 100 13 | config['embedding_size'] = 300 14 | config['num_layers'] = 1 15 | config['dropout'] = 0.0 16 | config['batch_first'] = False 17 | config['batch_size'] = 24 18 | config['learning_rate'] = 1e-3 19 | config['log_path'] = 'logs.txt' 20 | config['save_directory'] = 'logs/squad_saved_data' 21 | config['use_pretrained_embeddings'] = True 22 | config['pretrained_embeddings_path'] = 'datasets/squad/word_embeddings.npy' 23 | config['finetune_embeddings'] = False 24 | config['load_model'] = True 25 | config['load_path'] = 'logs/squad_saved_data/model_7_old.pyt7' 26 | 27 | language_model = LanguageModel(config) 28 | 29 | -------------------------------------------------------------------------------- /tests/squad_predictor_test.py: -------------------------------------------------------------------------------- 1 | from data_loaders.language_model_loader import LanguageModelLoader 2 | from models.language_model import LanguageModel 3 | from models.language_trainer import LanguageTrainer 4 | from models.language_wrapper import LanguageWrapper 5 | from helpers import constants 6 | import torch 7 | 
from helpers import torch_utils, utils 8 | from torch.autograd import variable 9 | 10 | dataset_path = 'datasets/newsqa' 11 | load_path = 'logs/squad_saved_data/model_12.pyt7' 12 | 13 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB) 14 | language_model = torch_utils.load_model(load_path).cuda() 15 | language_model.config['save_directory'] = 'logs/newsqa_saved_data' 16 | 17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 18 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader) 19 | 20 | test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 21 | epoch_num=10, max_length=20) 22 | dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION, 23 | epoch_num=10, max_length=10) 24 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN, 25 | epoch_num=10, max_length=10) 26 | 27 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/dummy8_train_predictions_epoch_.txt') 28 | utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/dummy8_validation_predictions_epoch_6.txt') 29 | utils.save_lines(test_predictions, 'logs/newsqa_saved_data/dummy8_test_predictions_epoch_6.txt') 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /tests/squad_predictor_truncated_test.py: -------------------------------------------------------------------------------- 1 | from data_loaders.language_model_loader_truncate import LanguageModelLoaderTruncate 2 | from models.language_model import LanguageModel 3 | from models.language_trainer import LanguageTrainer 4 | from models.language_wrapper import LanguageWrapper 5 | from helpers import constants 6 | import torch 7 | from helpers import torch_utils, utils 8 | from torch.autograd import variable 9 | 10 | dataset_path = 'datasets/newsqa' 11 | load_path = 'logs/squad_saved_data_truncated/model_2.pyt7' 12 | 13 | language_model_loader = LanguageModelLoaderTruncate(dataset_path, tokenizer_type=constants.TOKENIZER_TAB) 14 | language_model = torch_utils.load_model(load_path).cuda() 15 | language_model.config['save_directory'] = 'logs/newsqa_saved_data' 16 | 17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 18 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader) 19 | 20 | #test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 21 | # epoch_num=10, max_length=20) 22 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION, 23 | # epoch_num=10, max_length=10) 24 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN, 25 | epoch_num=10, max_length=10) 26 | 27 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/dummy5_train_predictions_epoch_6.txt') 28 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/dummy5_validation_predictions_epoch_6.txt') # dev_predictions is commented out above 29 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/dummy5_test_predictions_epoch_6.txt') # test_predictions is commented out above 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /tests/squad_trainer_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import variable 3 | from data_loaders.language_model_loader import LanguageModelLoader 4 | from models.language_model 
import LanguageModel 5 | from models.language_trainer import LanguageTrainer 6 | from models.language_wrapper import LanguageWrapper 7 | from helpers import constants, torch_utils, io_utils 8 | 9 | base_path = 'datasets/squad/' 10 | language_model_loader = LanguageModelLoader(base_path, tokenizer_type=constants.TOKENIZER_TAB) 11 | 12 | config = {} 13 | config['vocab_size'] = language_model_loader.get_vocab().size() 14 | config['hidden_size'] = 100 15 | config['embedding_size'] = 300 16 | config['num_layers'] = 1 17 | config['dropout'] = 0.0 18 | config['batch_first'] = False 19 | config['batch_size'] = 24 20 | config['learning_rate'] = 1e-3 21 | config['beam_size'] = 5 22 | config['log_path'] = 'logs.txt' 23 | config['save_directory'] = 'logs/squad_saved_data' 24 | config['use_pretrained_embeddings'] = True 25 | config['pretrained_embeddings_path'] = 'datasets/squad/word_embeddings.npy' 26 | config['finetune_embeddings'] = False 27 | config['load_model'] = False 28 | config['gpu_mode'] = True 29 | config['load_path'] = 'logs/squad_saved_data/model_6.pyt7' # CHANGE THIS TO WHATEVER PATH YOU WANT 30 | 31 | io_utils.check_dir('logs/squad_saved_data') 32 | 33 | language_model = LanguageModel(config) 34 | if config['load_model']: 35 | language_model = torch_utils.load_model(config['load_path']) 36 | 37 | language_model.cuda() 38 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 39 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader) 40 | 41 | for i in range(0, 15): 42 | loss, accuracy, predictions = language_trainer.train(epoch_num=i) 43 | 44 | if i % 3 == 2: 45 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 46 | epoch_num=10, max_length=20, beam_size=config['beam_size']) 47 | language_trainer.save(i) 48 | language_trainer.save_predictions(i, predictions) 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /tests/squad_trainer_truncated_expanded_test.py: -------------------------------------------------------------------------------- 1 | from data_loaders.language_model_loader_truncate import LanguageModelLoaderTruncate 2 | from models.language_model import LanguageModel 3 | from models.language_trainer import LanguageTrainer 4 | from models.language_wrapper import LanguageWrapper 5 | from helpers import constants, torch_utils 6 | import torch 7 | from torch.autograd import variable 8 | 9 | base_path = 'datasets/squad_expanded_vocab' 10 | 11 | 12 | language_model_loader = LanguageModelLoaderTruncate(base_path, tokenizer_type=constants.TOKENIZER_TAB) 13 | 14 | config = {} 15 | config['vocab_size'] = language_model_loader.get_vocab().size() 16 | config['hidden_size'] = 100 17 | config['embedding_size'] = 300 18 | config['num_layers'] = 1 19 | config['dropout'] = 0.0 20 | config['batch_first'] = False 21 | config['batch_size'] = 20 22 | config['learning_rate'] = 1e-3 23 | config['log_path'] = 'logs.txt' 24 | config['save_directory'] = 'logs/squad_saved_data_truncated_expanded_vocab' 25 | config['use_pretrained_embeddings'] = True 26 | config['pretrained_embeddings_path'] = 'datasets/squad_expanded_vocab/word_embeddings.npy' 27 | config['finetune_embeddings'] = False 28 | config['load_model'] = False 29 | config['beam_size'] = 5 30 | config['load_path'] = 'logs/squad_saved_data_truncated/model_0.pyt7' # CHANGE THIS TO ONE OF THE SAVED MODEL PATHS 31 | 32 | language_model = LanguageModel(config) 33 | if config['load_model']: 34 | language_model = 
torch_utils.load_model(config['load_path']) 35 | 36 | language_model.cuda() 37 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 38 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader) 39 | 40 | for i in range(0, 10): 41 | loss, accuracy, predictions = language_trainer.train(epoch_num=i) 42 | 43 | if i % 2 == 0: 44 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 45 | epoch_num=10, max_length=20) 46 | language_trainer.save(i) 47 | language_trainer.save_predictions(i, predictions) 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /tests/squad_trainer_truncated_test.py: -------------------------------------------------------------------------------- 1 | from data_loaders.language_model_loader_truncate import LanguageModelLoaderTruncate 2 | from models.language_model import LanguageModel 3 | from models.language_trainer import LanguageTrainer 4 | from models.language_wrapper import LanguageWrapper 5 | from helpers import constants, torch_utils 6 | import torch 7 | from torch.autograd import variable 8 | 9 | base_path = 'datasets/squad' 10 | 11 | 12 | language_model_loader = LanguageModelLoaderTruncate(base_path, tokenizer_type=constants.TOKENIZER_TAB) 13 | 14 | config = {} 15 | config['vocab_size'] = language_model_loader.get_vocab().size() 16 | config['hidden_size'] = 100 17 | config['embedding_size'] = 300 18 | config['num_layers'] = 1 19 | config['dropout'] = 0.0 20 | config['batch_first'] = False 21 | config['batch_size'] = 24 22 | config['learning_rate'] = 1e-3 23 | config['log_path'] = 'logs.txt' 24 | config['save_directory'] = 'logs/squad_saved_data_truncated' 25 | config['use_pretrained_embeddings'] = True 26 | config['pretrained_embeddings_path'] = 'datasets/squad/word_embeddings.npy' 27 | config['finetune_embeddings'] = False 28 | config['load_model'] = True 29 | config['load_path'] = 'logs/squad_saved_data_truncated/model_0.pyt7' # CHANGE THIS TO WHATEVER YOU WANT 30 | 31 | language_model = LanguageModel(config) 32 | if config['load_model']: 33 | language_model = torch_utils.load_model(config['load_path']) 34 | 35 | language_model.cuda() 36 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab()) 37 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader) 38 | 39 | for i in range(0, 100): 40 | loss, accuracy, predictions = language_trainer.train(epoch_num=i) 41 | 42 | if i % 2 == 0: 43 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST, 44 | epoch_num=10, max_length=20) 45 | language_trainer.save(i) 46 | language_trainer.save_predictions(i, predictions) 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /tests/test_expand_dims.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import variable 4 | 5 | x = torch.Tensor([[1], [2], [3]]) 6 | print(x.size()) 7 | torch.Size([3, 1]) 8 | print(x.expand(3, 1, 1)) -------------------------------------------------------------------------------- /tests/test_load_dataset.py: -------------------------------------------------------------------------------- 1 | from data_loaders.card_loader import CardLoader 2 | 3 | card_loader = CardLoader(base_path='card2code/third_party/magic') 4 | print(card_loader.train_dataset['inputs'][0]) 5 | 
print(card_loader.train_dataset['outputs'][0]) -------------------------------------------------------------------------------- /tests/test_lstm_attention.py: -------------------------------------------------------------------------------- 1 | from dnn_units.lstm_attention import LSTMAttentionDot, LSTMAttention 2 | import torch 3 | from torch import nn 4 | from torch.autograd import variable 5 | from torch import optim 6 | 7 | batch_size = 25 8 | input_size = 125 9 | input_length = 25 10 | hidden_size = 250 11 | ctx_length = 230 12 | 13 | net = LSTMAttentionDot(input_size=input_size, 14 | hidden_size=hidden_size, 15 | batch_first=False).cuda() 16 | 17 | inputs = variable.Variable(torch.randn(input_length, batch_size, input_size)).cuda() 18 | hidden = variable.Variable(torch.randn(batch_size, hidden_size)).cuda() 19 | cell = variable.Variable(torch.randn(batch_size, hidden_size)).cuda() 20 | context = variable.Variable(torch.randn(ctx_length, batch_size, hidden_size)).cuda() 21 | desired = variable.Variable(torch.randn(batch_size, hidden_size)).cuda() 22 | 23 | criterion = nn.MSELoss() 24 | 25 | optimizer = optim.Adam(net.parameters(), lr=3e-2) 26 | 27 | for i in range(0, 1000): 28 | print(i) 29 | optimizer.zero_grad() 30 | out, h = net.forward(inputs, [hidden, cell], context) 31 | loss = criterion(h[0], desired) 32 | loss.backward() 33 | optimizer.step() 34 | -------------------------------------------------------------------------------- /tests/test_lstm_attention_dot.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.autograd import variable 4 | from models.language_model import TextFieldPredictor, SoftmaxPredictor 5 | 6 | config = {} 7 | config['vocab_size'] = 12 8 | config['embedding_size'] = 20 9 | config['hidden_size'] = 50 10 | config['num_layers'] = 1 11 | config['dropout'] = 0.0 12 | config['batch_first'] = True 13 | 14 | # First test text field predictor 15 | inp = variable.Variable(torch.LongTensor([[1, 2, 3], [4, 5, 6]])) 16 | hidden = variable.Variable(torch.randn(2, config['hidden_size'])) 17 | predictor = TextFieldPredictor(config) 18 | lstm_embeddings = predictor.forward_prepro(inp) 19 | h_tilde, attentions, inp = predictor.forward_similarity(hidden) 20 | 21 | inp1 = variable.Variable(torch.LongTensor(2, config['vocab_size'] - 3).zero_()) 22 | inp2 = variable.Variable(torch.zeros(2, config['vocab_size'] - 3)) 23 | stacked_inps = torch.cat((inp, inp1), 1) 24 | stacked_attentions = torch.cat((attentions, inp2), 1) 25 | 26 | # Second test softma predictor 27 | softmax_predictor = SoftmaxPredictor(config) 28 | softmax_logits = softmax_predictor.forward(hidden) 29 | 30 | res = variable.Variable(torch.zeros(2, config['vocab_size'])) 31 | res.scatter_(1, stacked_inps, stacked_attentions) 32 | 33 | tmp = softmax_logits + res 34 | 35 | print(tmp) 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /tests/test_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.optim 5 | from torch.autograd import variable 6 | 7 | from models.card_model import CardModel 8 | 9 | config = {} 10 | config['vocab_size'] = 52 11 | config['embedding_size'] = 23 12 | 13 | model = CardModel(config) 14 | 15 | emb1 = nn.Embedding(config['vocab_size'], config['embedding_size']) 16 | 17 | desired = variable.Variable(torch.randn(3, 23)) 18 | tmp = 
variable.Variable(torch.LongTensor([1,2,3])) 19 | tmp1 = emb1(tmp) 20 | tmp2 = emb1(tmp) 21 | 22 | criterion = nn.MSELoss() 23 | loss = criterion(tmp1 + tmp2, desired) 24 | loss.backward() -------------------------------------------------------------------------------- /tests/test_model_saving.py: -------------------------------------------------------------------------------- 1 | from models.language_model import LanguageModel 2 | from helpers import torch_utils 3 | 4 | config = {} 5 | config['vocab_size'] = 12 6 | config['embedding_size'] = 20 7 | config['hidden_size'] = 50 8 | config['num_layers'] = 1 9 | config['dropout'] = 0.0 10 | config['batch_first'] = True 11 | 12 | model = LanguageModel(config) 13 | 14 | torch_utils.save_model(model, path='test.model') 15 | model = torch_utils.load_model(path='test.model') -------------------------------------------------------------------------------- /tests/test_padded_sequence.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import random 4 | import unittest 5 | import itertools 6 | import contextlib 7 | from copy import deepcopy 8 | from itertools import repeat, product 9 | from functools import wraps 10 | 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.nn.parallel as dp 14 | import torch.nn.utils.rnn as rnn_utils 15 | from torch.nn.utils import clip_grad_norm 16 | from torch.autograd import Variable 17 | from torch.nn import Parameter 18 | 19 | lengths = [10, 10, 6, 2, 2, 1, 1] 20 | lengths_tensor = Variable(torch.LongTensor(lengths)) 21 | max_length = lengths[0] 22 | x = Variable(torch.randn(max_length, len(lengths), 3), requires_grad=True) 23 | lstm = nn.LSTM(3, 4, bidirectional=True, num_layers=2, batch_first=False) 24 | 25 | packed = rnn_utils.pack_padded_sequence(x, lengths) 26 | packed_out, packed_hidden = lstm(packed) 27 | unpacked, unpacked_len = rnn_utils.pad_packed_sequence(packed_out) 28 | 29 | def sort_sequence(tensor, lengths, batch_first=False): 30 | """ 31 | Sorts sequence in descending order 32 | tensor: Padded tensor of variable length stuff (Torch tensor) 33 | lengths: Lengths of padded tensor (Torch LongTensor) 34 | batch_first: Boolean, whether tensor is batch_first or not 35 | """ 36 | idx = None 37 | if batch_first: 38 | idx = 0 39 | else: 40 | idx = 1 41 | 42 | sorted_lengths, indices = torch.sort(lengths, dim=0, descending=True) 43 | new_tensor = torch.index_select(tensor, idx, indices) 44 | return new_tensor, sorted_lengths, indices 45 | 46 | def unsort_sequence(tensor, indices, batch_first=False): 47 | """ 48 | Unsort a tensor according to indices and idx 49 | """ 50 | if batch_first: 51 | idx = 0 52 | else: 53 | idx = 1 54 | unsorted_tensor = torch.index_select(tensor, idx, indices) 55 | return unsorted_tensor 56 | 57 | def pack_forward(rnn, tensor, lengths, batch_first=False): 58 | """ 59 | Forwards a padded tensor with lengths lengths thru rnn 60 | rnn: Cell to forward through 61 | tensor: Tensor to use 62 | lengths: Lengths to use 63 | batch_first: Whether tensor is batch first or not 64 | """ 65 | 66 | sorted_tensor, sorted_lengths, sorted_indices = sort_sequence(tensor, lengths, batch_first) 67 | packed = rnn_utils.pack_padded_sequence(sorted_tensor, sorted_lengths.data.numpy()) 68 | packed_out, packed_hidden = lstm(packed) 69 | unpacked, unpacked_len = rnn_utils.pad_packed_sequence(packed_out) 70 | unsorted_out = unsort_sequence(unpacked, sorted_indices, batch_first=False) 71 | unsorted_hidden = 
list(map(lambda idx: unsort_sequence(packed_hidden[idx], sorted_indices, batch_first=False), [0, 1])) 72 | return unsorted_out, unsorted_hidden 73 | 74 | sorted_tensor, sorted_indices, sorted_idx = sort_sequence(x, lengths_tensor, batch_first=False) 75 | unsorted_tensor = unsort_sequence(sorted_tensor, sorted_idx) 76 | 77 | unsorted_out, unsorted_hidden = pack_forward(lstm, x, lengths_tensor, ) 78 | print(packed_out[0].size()) 79 | print(unsorted_out[0].size()) 80 | 81 | -------------------------------------------------------------------------------- /trainers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/trainers/__init__.py -------------------------------------------------------------------------------- /trainers/iob_predictor.py: -------------------------------------------------------------------------------- 1 | from data_loaders.iob_loader import IOBLoader 2 | from models.iob.iob_model import IOBModel 3 | from helpers import constants, utils 4 | import os 5 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152 6 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 7 | 8 | embeddings = utils.load_matrix('datasets/squad_iob/word_embeddings.npy') 9 | base_directory = 'datasets/newsqa_iob' 10 | config_path = 'iob/logs/squad/config.json' 11 | params_path = 'iob/logs/squad/model_params_3.ckpt' 12 | predictions_save_path = 'iob/logs/newsqa/train_predictions_1.txt' 13 | 14 | data_loader = IOBLoader(base_directory, tokenizer_type=constants.TOKENIZER_SPECIAL_DELIMITER, 15 | input_max_length=2100)#00) 16 | 17 | config = utils.load_json(config_path) 18 | config['batch_size'] = 25 19 | config['input_max_length'] = data_loader.input_max_length 20 | model = IOBModel(config, embeddings=embeddings) 21 | model.restore(params_path) 22 | 23 | num_steps = 0 24 | 25 | data_loader.reset_indices() 26 | total_predictions = [] 27 | num_steps = 0 28 | 29 | while True: 30 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size']) 31 | num_steps += config['batch_size'] 32 | print(num_steps) 33 | if batch is None: 34 | break 35 | predictions = model.predict(batch) 36 | texts = data_loader.label_vocab.tokens_list(predictions) 37 | for i in range(0, len(texts)): 38 | cur_input_length = batch['input_lengths'][i] 39 | cur_text = texts[i] 40 | 41 | text_str = " ".join(cur_text[0:cur_input_length]) 42 | total_predictions.append(text_str) 43 | 44 | utils.save_lines(total_predictions, predictions_save_path) -------------------------------------------------------------------------------- /trainers/iob_trainer.py: -------------------------------------------------------------------------------- 1 | from data_loaders.iob_loader import IOBLoader 2 | from models.iob.iob_model import IOBModel 3 | from helpers import constants, utils 4 | import os 5 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152 6 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 7 | 8 | base_directory = 'datasets/squad_iob' 9 | 10 | data_loader = IOBLoader(base_directory, tokenizer_type=constants.TOKENIZER_SPACE) 11 | data_loader.mix_indices() 12 | 13 | config = { 14 | 'input_max_length': data_loader.input_max_length, 15 | 'vocab_size': data_loader.vocab.size(), 16 | 'embeddings_size': 300, 17 | 'hidden_size': 150, 18 | 'out_size': 100, 19 | 'num_classes': data_loader.label_vocab.size(), 20 | 'batch_size': 25, 21 | 'learning_rate': 1e-2, 22 | 'save_path': 'iob/logs'} 23 | 24 | 
embeddings = utils.load_matrix('%s/word_embeddings.npy' % base_directory) 25 | config_path = 'iob/logs/squad/config.json' 26 | params_path = 'iob/logs/squad/model_params_%s.ckpt' 27 | 28 | model = IOBModel(config, embeddings=embeddings) 29 | model.save(config_path, params_path) # sanity check: save and immediately restore (note params_path still contains the literal '%s' here) 30 | model.restore(params_path) 31 | 32 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size']) 33 | 34 | num_steps = 0 35 | 36 | for i in range(0, 100): 37 | while batch is not None: 38 | loss, predictions = model.forward(batch) 39 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size']) 40 | num_steps += config['batch_size'] 41 | 42 | print(num_steps) 43 | print(loss) 44 | 45 | if i % 3 == 0: 46 | model.save(config_path, params_path % i) 47 | data_loader.reset_indices() 48 | total_predictions = [] 49 | while True: 50 | batch = data_loader.get_batch(constants.DATASET_TEST, config['batch_size']) 51 | if batch is None: 52 | break 53 | predictions = model.predict(batch) 54 | texts = data_loader.label_vocab.tokens_list(predictions) 55 | for j in range(0, len(texts)): # j, not i, so the epoch counter is not shadowed 56 | cur_input_length = batch['input_lengths'][j] 57 | cur_text = texts[j] 58 | text_str = " ".join(cur_text[0:cur_input_length]) 59 | total_predictions.append(text_str) 60 | utils.save_lines(total_predictions, \ 61 | '%s/predictions_test_%s.txt' % (config['save_path'], i)) 62 | 63 | data_loader.mix_indices() 64 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size']) 65 | 66 | 67 | 68 | 69 | 70 | 71 | --------------------------------------------------------------------------------
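A minimal follow-up sketch (an illustration, not a file from this repository): once trainers/iob_trainer.py has written iob/logs/predictions_test_<epoch>.txt, token-level tag accuracy against the gold IOB labels could be estimated roughly as below. This assumes that datasets/squad_iob/test/labels.txt holds one space-separated tag sequence per line, aligned line-for-line with the saved predictions, and that predicted lines may be truncated to the input length; the tag_accuracy helper and the epoch number in the example paths are hypothetical.

# Illustrative evaluation sketch; paths and helper name are assumptions, not repo code.
def tag_accuracy(pred_path, gold_path):
    correct, total = 0, 0
    with open(pred_path) as pred_file, open(gold_path) as gold_file:
        for pred_line, gold_line in zip(pred_file, gold_file):
            pred_tags = pred_line.strip().split(' ')
            gold_tags = gold_line.strip().split(' ')
            # Predictions are cut off at input_lengths, so score the overlapping prefix only.
            for p, g in zip(pred_tags, gold_tags):
                correct += int(p == g)
                total += 1
    return correct / float(total) if total else 0.0

if __name__ == '__main__':
    print(tag_accuracy('iob/logs/predictions_test_3.txt',
                       'datasets/squad_iob/test/labels.txt'))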