├── License.txt
├── README.md
├── bidaf
├── README.md
├── basic
│ ├── __init__.py
│ ├── cli.py
│ ├── combiner.py
│ ├── compare.py
│ ├── ensemble.py
│ ├── ensemble_fast.py
│ ├── evaluator.py
│ ├── graph_handler.py
│ ├── main.py
│ ├── model.py
│ ├── read_data.py
│ ├── run_ensemble.sh
│ ├── run_single.sh
│ ├── superhighway.py
│ ├── templates
│ │ └── visualizer.html
│ ├── trainer.py
│ └── visualizer.py
├── basic_cnn
│ ├── __init__.py
│ ├── cli.py
│ ├── evaluator.py
│ ├── graph_handler.py
│ ├── main.py
│ ├── model.py
│ ├── read_data.py
│ ├── superhighway.py
│ ├── templates
│ │ └── visualizer.html
│ ├── trainer.py
│ └── visualizer.py
├── cnn_dm
│ ├── __init__.py
│ ├── eda.ipynb
│ ├── evaluate.py
│ └── prepro.py
├── data
│ └── squad
│ │ ├── data_dev.json
│ │ ├── data_test.json
│ │ ├── data_train.json
│ │ ├── shared_dev.json
│ │ ├── shared_test.json
│ │ └── shared_train.json
├── download.sh
├── helpers
│ ├── __init__.py
│ ├── constants.py
│ ├── file_logger.py
│ ├── math_utils.py
│ ├── spacy_tokenizer.py
│ └── utils.py
├── install_tensorflow.sh
├── my
│ ├── __init__.py
│ ├── corenlp_interface.py
│ ├── nltk_utils.py
│ ├── tensorflow
│ │ ├── __init__.py
│ │ ├── general.py
│ │ ├── nn.py
│ │ ├── rnn.py
│ │ └── rnn_cell.py
│ ├── utils.py
│ └── zip_save.py
├── newsqa
│ ├── __init__.py
│ ├── evaluate.py
│ └── prepro.py
├── newsqa_unsupervised_old
│ └── data_train.json
├── newsqa_unsupervised_old_verb_filtered
│ └── data_train.json
├── out
│ └── basic
│ │ └── 06
│ │ │ ├── save
│ │ │ │ ├── basic-40000.data-00000-of-00001
│ │ │ │ ├── basic-40000.index
│ │ │ │ ├── basic-40000.meta
│ │ │ │ └── checkpoint
│ │ │ └── shared.json
├── requirements.txt
├── run.sh
├── scripts.sh
├── scripts
│ ├── compare_models.sh
│ ├── evaluate_baseline_models.sh
│ ├── evaluate_run.sh
│ ├── finetune_newsqa.sh
│ ├── finetune_squad.sh
│ ├── install_tensorflow.sh
│ ├── run.sh
│ ├── run_ensemble_unsupervised.sh
│ ├── run_evaluation.sh
│ ├── run_huge_evaluation.sh
│ ├── run_intra_evaluation.sh
│ ├── run_intra_helper.sh
│ └── run_new.sh
├── squad
│ ├── __init__.py
│ ├── aug_squad.py
│ ├── eda_aug_dev.ipynb
│ ├── eda_aug_train.ipynb
│ ├── evaluate-v1.1.py
│ ├── evaluate.py
│ ├── prepro.py
│ ├── prepro_aug.py
│ └── utils.py
├── tests
│ ├── __init__.py
│ ├── check_results.py
│ ├── create_bidaf_dataset.py
│ ├── create_bidaf_old_dataset.py
│ └── create_generation_dataset_unsupervised.py
├── tree
│ ├── __init__.py
│ ├── cli.py
│ ├── evaluator.py
│ ├── graph_handler.py
│ ├── main.py
│ ├── model.py
│ ├── read_data.py
│ ├── templates
│ │ └── visualizer.html
│ ├── test.ipynb
│ ├── trainer.py
│ └── visualizer.py
└── visualization
│ ├── compare_models.py
│ └── compare_models_newsqa.py
├── data_loaders
├── __init__.py
├── iob_loader.py
├── language_model_loader.py
└── language_model_loader_truncate.py
├── datasets
├── iob_test
│ ├── label_vocab.txt
│ ├── test
│ │ ├── inputs.txt
│ │ └── labels.txt
│ ├── train
│ │ ├── inputs.txt
│ │ └── labels.txt
│ ├── validation
│ │ ├── inputs.txt
│ │ └── labels.txt
│ └── vocab.txt
├── newsqa_unsupervised
│ ├── test
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ └── outputs.txt
│ ├── train
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ └── outputs.txt
│ ├── validation
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ └── outputs.txt
│ ├── vocab.txt
│ └── word_embeddings.npy
├── newsqa_unsupervised_large
│ ├── test
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ └── outputs.txt
│ ├── train
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ └── outputs.txt
│ ├── validation
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ └── outputs.txt
│ └── vocab.txt
├── newsqa_unsupervised_old
│ ├── test
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ └── outputs.txt
│ ├── train
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ ├── outputs.txt
│ │ └── predictions.txt
│ ├── validation
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ └── outputs.txt
│ └── vocab.txt
├── question_generator
│ ├── test
│ │ ├── indices.txt
│ │ ├── inputs.txt
│ │ └── outputs.txt
│ ├── train
│ │ ├── indices.txt
│ │ ├── inputs.txt
│ │ └── outputs.txt
│ ├── validation
│ │ ├── indices.txt
│ │ ├── inputs.txt
│ │ └── outputs.txt
│ ├── vocab.txt
│ └── word_embeddings.npy
├── squad
│ ├── test
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ ├── inputs.txt
│ │ └── outputs.txt
│ ├── train
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ ├── inputs.txt
│ │ └── outputs.txt
│ ├── validation
│ │ ├── answer_ends.txt
│ │ ├── answer_starts.txt
│ │ ├── ids.txt
│ │ ├── indices.txt
│ │ ├── inputs.txt
│ │ └── outputs.txt
│ ├── vocab.txt
│ └── word_embeddings.npy
└── squad_iob
│ ├── label_vocab.txt
│ ├── test
│ │ ├── inputs.txt
│ │ ├── label_vocab.txt
│ │ ├── labels.txt
│ │ └── vocab.txt
│ ├── train
│ │ ├── inputs.txt
│ │ └── labels.txt
│ ├── validation
│ │ ├── inputs.txt
│ │ ├── label_vocab.txt
│ │ ├── labels.txt
│ │ └── vocab.txt
│ └── vocab.txt
├── dnn_units
├── __init__.py
└── lstm_attention.py
├── helpers
├── __init__.py
├── constants.py
├── io_utils.py
├── logger.py
├── proc_wrapper.py
├── tokenizer.py
├── torch_utils.py
├── twitter_tokenizer.py
├── utils.py
└── vocab.py
├── install.sh
├── iob
└── logs
│ └── README.md
├── logs
└── results
│ ├── answer_out_of_domain_baseline.json
│ ├── context_aoracle.json
│ ├── double_model.json
│ ├── newsqa
│ │ ├── data_test.json
│ │ └── evaluate.py
│ ├── script.sh
│ ├── single_model.json
│ ├── single_model_result_run_42_with_baseline.json
│ ├── single_model_result_run_43_with_baseline.json
│ ├── single_model_result_run_44_with_baseline.json
│ ├── single_model_result_run_45_with_baseline.json
│ ├── single_model_result_run_46_with_baseline.json
│ ├── single_model_result_run_47_with_baseline.json
│ ├── single_model_result_run_48_with_baseline.json
│ ├── single_model_result_run_49_with_baseline.json
│ ├── single_model_results_42.json
│ ├── single_model_results_43.json
│ ├── single_model_results_44.json
│ ├── single_model_results_45.json
│ ├── single_model_results_46.json
│ ├── single_model_results_47.json
│ ├── single_model_results_48.json
│ └── single_model_results_49.json
├── models
├── __init__.py
├── iob
│ ├── __init__.py
│ └── iob_model.py
├── language_model.py
├── language_trainer.py
└── language_wrapper.py
├── pretrained_models
├── nltk
│ └── tokenizers
│ │ ├── punkt.zip
│ │ └── punkt
│ │ │ ├── PY3
│ │ │ │ ├── README
│ │ │ │ ├── czech.pickle
│ │ │ │ ├── danish.pickle
│ │ │ │ ├── dutch.pickle
│ │ │ │ ├── english.pickle
│ │ │ │ ├── estonian.pickle
│ │ │ │ ├── finnish.pickle
│ │ │ │ ├── french.pickle
│ │ │ │ ├── german.pickle
│ │ │ │ ├── greek.pickle
│ │ │ │ ├── italian.pickle
│ │ │ │ ├── norwegian.pickle
│ │ │ │ ├── polish.pickle
│ │ │ │ ├── portuguese.pickle
│ │ │ │ ├── slovene.pickle
│ │ │ │ ├── spanish.pickle
│ │ │ │ ├── swedish.pickle
│ │ │ │ └── turkish.pickle
│ │ │ ├── README
│ │ │ ├── czech.pickle
│ │ │ ├── danish.pickle
│ │ │ ├── dutch.pickle
│ │ │ ├── english.pickle
│ │ │ ├── estonian.pickle
│ │ │ ├── finnish.pickle
│ │ │ ├── french.pickle
│ │ │ ├── german.pickle
│ │ │ ├── greek.pickle
│ │ │ ├── italian.pickle
│ │ │ ├── norwegian.pickle
│ │ │ ├── polish.pickle
│ │ │ ├── portuguese.pickle
│ │ │ ├── slovene.pickle
│ │ │ ├── spanish.pickle
│ │ │ ├── swedish.pickle
│ │ │ └── turkish.pickle
└── scripts
│ ├── __init__.py
│ ├── create_glove_embeddings.sh
│ ├── download_glove_embeddings.py
│ └── transfer_glove_embeddings.py
├── requirements.txt
├── scripts.sh
├── tests
├── __init__.py
├── gather_test.py
├── iob_loader_test.py
├── iob_test.py
├── iob_trainer_test.py
├── language_model_memory_test.py
├── language_model_predict_test.py
├── language_model_test.py
├── language_model_trainer_test.py
├── load_questions.py
├── newsqa_predictor_test_unsup.py
├── newsqa_predictor_test_unsup_large.py
├── newsqa_predictor_test_unsup_truncated.py
├── newsqa_predictor_test_verb.py
├── newsqa_trainer_test.py
├── pointer_network_test.py
├── question_discriminator_test.py
├── squad_discriminator_test.py
├── squad_loader_test_v2.py
├── squad_predictor_test.py
├── squad_predictor_truncated_test.py
├── squad_trainer_test.py
├── squad_trainer_truncated_expanded_test.py
├── squad_trainer_truncated_test.py
├── test_expand_dims.py
├── test_load_dataset.py
├── test_lstm_attention.py
├── test_lstm_attention_dot.py
├── test_model.py
├── test_model_saving.py
└── test_padded_sequence.py
└── trainers
├── __init__.py
├── iob_predictor.py
└── iob_trainer.py
/License.txt:
--------------------------------------------------------------------------------
1 | NewsQA Code
2 | Copyright (c) Microsoft Corporation
3 | All rights reserved.
4 | MIT License
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/bidaf/README.md:
--------------------------------------------------------------------------------
1 | # Finetuning BiDAF with SynthNets:
2 |
3 | - This repository implements finetuning of a [Bi-directional Attention Flow for Machine Comprehension] (Seo et al., 2016) model, trained on a source collection of documents, to answer questions on a target set of documents using [Two-stage SynthNets]. It assumes a SynthNet has already generated question-answer tuples over the target set.
4 |
5 | ## 0. Requirements
6 | #### General
7 | - Python (verified on 3.5.2. Issues have been reported with Python 2!)
8 | - unzip, wget (for running `download.sh` only)
9 |
10 | #### Python Packages
11 | - tensorflow (deep learning library, verified on r0.11)
12 | - nltk (NLP tools, verified on 3.2.1)
13 | - tqdm (progress bar, verified on 4.7.4)
14 | - jinja2 (for visualization; not needed if you only train and test)
15 |
16 | ## 1. Downloading Data
17 | Run:
18 | ```
19 | git lfs pull
20 | ```
21 |
22 | ## Scripts
23 | All commands used to train and test models are stored under [scripts].
24 |
25 | Each command inside each script file should be run from the root directory of the repository.
26 |
27 | ## 2. Training
28 | To finetune a pretrained [SQUAD] BIDAF model on [NewsQA], see the scripts at [scripts/finetune_newsqa].
29 |
30 | To finetune a pretrained [NewsQA] model on [SQUAD], see the scripts at [scripts/finetune_squad].
31 |
32 | ## 3. Test
33 | To evaluate single models, see the scripts at scripts/evaluate_*.sh.
34 |
35 | To evaluate intra-run averaged models, ensembles, etc., see the scripts at scripts/*_evaluation.sh.
36 |
37 | [Two-stage SynthNets]: https://arxiv.org/TODO
38 | [Bi-directional Attention Flow for Machine Comprehension]: https://github.com/allenai/bi-att-flow
39 | [scripts]: https://github.com/davidgolub/ReadingComprehension/tree/master/scripts
40 | [scripts/finetune_newsqa]: https://github.com/davidgolub/ReadingComprehension/tree/master/scripts/finetune_newsqa.sh
41 | [scripts/finetune_squad]: https://github.com/davidgolub/ReadingComprehension/tree/master/scripts/finetune_squad.sh
42 | [code]: https://github.com/allenai/bi-att-flow
43 | [multi-gpu]: https://www.tensorflow.org/versions/r0.11/tutorials/deep_cnn/index.html#training-a-model-using-multiple-gpu-cards
44 | [SQUAD]: http://stanford-qa.com
45 | [NEWSQA]: https://datasets.maluuba.com/NewsQA
46 | [paper]: https://arxiv.org/abs/1611.01603
47 | [davidgolub]: https://davidgolub.github.io
48 | [davidgolub-github]: https://github.com/davidgolub
49 |
--------------------------------------------------------------------------------
/bidaf/basic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/basic/__init__.py
--------------------------------------------------------------------------------
/bidaf/basic/combiner.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 |
4 | third_path = sys.argv[1]
5 | other_paths = sys.argv[2:]
6 |
7 | others = [json.load(open(path, 'r')) for path in other_paths]
8 |
9 |
10 | c = {}
11 |
12 | assert min(map(len, others)) == max(map(len, others)), list(map(len, others))
13 |
14 | for key in others[0].keys():
15 | if key == 'scores':
16 | continue
17 | probs = [other['scores'][key] for other in others]
18 | vals = [other[key] for other in others]
19 | largest_val = max(zip(vals, probs), key=lambda pair: pair[1])[0]
20 | c[key] = largest_val
21 |
22 | json.dump(c, open(third_path, 'w'))
23 |
--------------------------------------------------------------------------------
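A note on the format combiner.py assumes: each input JSON maps question ids to answer strings and also carries a `scores` entry mapping the same ids to model confidences; per id, the answer from the most confident file wins. A minimal sketch of that selection on toy data (the ids, answers, and scores below are hypothetical):

```python
# Toy prediction dicts in the shape combiner.py loads from disk:
# answers keyed by question id, plus a "scores" map over the same ids.
model_a = {"q1": "Seattle", "q2": "1998", "scores": {"q1": 0.62, "q2": 0.40}}
model_b = {"q1": "in Seattle", "q2": "1997", "scores": {"q1": 0.55, "q2": 0.71}}

others = [model_a, model_b]
combined = {}
for key in others[0]:
    if key == "scores":
        continue
    probs = [other["scores"][key] for other in others]
    vals = [other[key] for other in others]
    # keep the answer whose source model is most confident, as combiner.py does
    combined[key] = max(zip(vals, probs), key=lambda pair: pair[1])[0]

print(combined)  # {'q1': 'Seattle', 'q2': '1997'}
```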
/bidaf/basic/compare.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 |
4 | first_path = sys.argv[1]
5 | second_path = sys.argv[2]
6 |
7 | a = json.load(open(first_path, 'r'))
8 | b = json.load(open(second_path, 'r'))
9 |
10 | assert len(a) == len(b)
11 |
12 | diff_count = 0
13 |
14 | for key, val in a.items():
15 | b_val = b[key]
16 | if val != b_val:
17 | print(val, "|||", b_val)
18 | diff_count += 1
19 |
20 | print("{}/{} = {}".format(diff_count, len(a), diff_count/len(a)))
--------------------------------------------------------------------------------
/bidaf/basic/ensemble.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import functools
3 | import gzip
4 | import json
5 | import pickle
6 | from collections import defaultdict
7 | from operator import mul
8 |
9 | from tqdm import tqdm
10 | from squad.utils import get_phrase, get_best_span
11 |
12 |
13 | def get_args():
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('paths', nargs='+')
16 | parser.add_argument('-o', '--out', default='ensemble.json')
17 | parser.add_argument("--data_path", default="data/squad/data_test.json")
18 | parser.add_argument("--shared_path", default="data/squad/shared_test.json")
19 | args = parser.parse_args()
20 | return args
21 |
22 |
23 | def ensemble(args):
24 | e_list = []
25 | for path in tqdm(args.paths):
26 | with gzip.open(path, 'r') as fh:
27 | e = pickle.load(fh)
28 | e_list.append(e)
29 |
30 | with open(args.data_path, 'r') as fh:
31 | data = json.load(fh)
32 |
33 | with open(args.shared_path, 'r') as fh:
34 | shared = json.load(fh)
35 |
36 | out = {}
37 | for idx, (id_, rx) in tqdm(enumerate(zip(data['ids'], data['*x'])), total=len(e['yp'])):
38 | if idx >= len(e['yp']):
39 | # for debugging purpose
40 | break
41 | context = shared['p'][rx[0]][rx[1]]
42 | wordss = shared['x'][rx[0]][rx[1]]
43 | yp_list = [e['yp'][idx] for e in e_list]
44 | yp2_list = [e['yp2'][idx] for e in e_list]
45 | answer = ensemble3(context, wordss, yp_list, yp2_list)
46 | out[id_] = answer
47 |
48 | with open(args.out, 'w') as fh:
49 | json.dump(out, fh)
50 |
51 |
52 | def ensemble1(context, wordss, y1_list, y2_list):
53 | """
54 |
55 | :param context: Original context
56 | :param wordss: tokenized words (nested 2D list)
57 |     :param y1_list: list of start index probs (each element corresponds to probs from a single model)
58 | :param y2_list: list of stop index probs
59 | :return:
60 | """
61 | sum_y1 = combine_y_list(y1_list)
62 | sum_y2 = combine_y_list(y2_list)
63 | span, score = get_best_span(sum_y1, sum_y2)
64 | return get_phrase(context, wordss, span)
65 |
66 |
67 | def ensemble2(context, wordss, y1_list, y2_list):
68 | start_dict = defaultdict(float)
69 | stop_dict = defaultdict(float)
70 | for y1, y2 in zip(y1_list, y2_list):
71 | span, score = get_best_span(y1, y2)
72 | start_dict[span[0]] += y1[span[0][0]][span[0][1]]
73 | stop_dict[span[1]] += y2[span[1][0]][span[1][1]]
74 | start = max(start_dict.items(), key=lambda pair: pair[1])[0]
75 | stop = max(stop_dict.items(), key=lambda pair: pair[1])[0]
76 | best_span = (start, stop)
77 | return get_phrase(context, wordss, best_span)
78 |
79 |
80 | def ensemble3(context, wordss, y1_list, y2_list):
81 | d = defaultdict(float)
82 | for y1, y2 in zip(y1_list, y2_list):
83 | span, score = get_best_span(y1, y2)
84 | phrase = get_phrase(context, wordss, span)
85 | d[phrase] += score
86 | return max(d.items(), key=lambda pair: pair[1])[0]
87 |
88 |
89 | def combine_y_list(y_list, op='*'):
90 | if op == '+':
91 | func = sum
92 | elif op == '*':
93 | def func(l): return functools.reduce(mul, l)
94 | else:
95 | func = op
96 | return [[func(yij_list) for yij_list in zip(*yi_list)] for yi_list in zip(*y_list)]
97 |
98 |
99 | def main():
100 | args = get_args()
101 | ensemble(args)
102 |
103 | if __name__ == "__main__":
104 | main()
105 |
106 |
107 |
--------------------------------------------------------------------------------
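ensemble1 above merges the per-model start/stop probability grids with combine_y_list before span selection. A self-contained check of that element-wise combination, with the function restated so the snippet runs outside the repo (the two toy one-sentence, three-token distributions are made up):

```python
import functools
from operator import mul

# Mirrors combine_y_list from basic/ensemble.py: combine a list of per-model
# probability grids element-wise, by product ('*') or by sum ('+').
def combine_y_list(y_list, op='*'):
    if op == '+':
        func = sum
    elif op == '*':
        def func(l):
            return functools.reduce(mul, l)
    else:
        func = op
    return [[func(yij_list) for yij_list in zip(*yi_list)]
            for yi_list in zip(*y_list)]

# Two models' start-index probabilities over one sentence of three tokens.
model_1 = [[0.8, 0.1, 0.1]]
model_2 = [[0.6, 0.3, 0.1]]

print(combine_y_list([model_1, model_2]))       # product: ~[[0.48, 0.03, 0.01]]
print(combine_y_list([model_1, model_2], '+'))  # sum:     ~[[1.4, 0.4, 0.2]]
```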
/bidaf/basic/ensemble_fast.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | from collections import Counter, defaultdict
4 | import re
5 |
6 | def key_func(pair):
7 | return pair[1]
8 |
9 |
10 | def get_func(vals, probs):
11 | counter = Counter(vals)
12 | # return max(zip(vals, probs), key=lambda pair: pair[1])[0]
13 | # return max(zip(vals, probs), key=lambda pair: pair[1] * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0]
14 | # return max(zip(vals, probs), key=lambda pair: pair[1] + 0.7 * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0]
15 | d = defaultdict(float)
16 | for val, prob in zip(vals, probs):
17 | d[val] += prob
18 | d[''] = 0
19 | return max(d.items(), key=lambda pair: pair[1])[0]
20 |
21 | third_path = sys.argv[1]
22 | other_paths = sys.argv[2:]
23 |
24 | others = [json.load(open(path, 'r')) for path in other_paths]
25 |
26 |
27 | c = {}
28 |
29 | assert min(map(len, others)) == max(map(len, others)), list(map(len, others))
30 |
31 | for key in others[0].keys():
32 | if key == 'scores':
33 | continue
34 | probs = [other['scores'][key] for other in others]
35 | vals = [other[key] for other in others]
36 | largest_val = get_func(vals, probs)
37 | c[key] = largest_val
38 |
39 | json.dump(c, open(third_path, 'w'))
--------------------------------------------------------------------------------
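get_func above votes by summed probability mass per distinct answer string and forces the empty answer to lose. A toy illustration of that behaviour, with the logic restated under a hypothetical name so it runs standalone (the answers and scores are invented):

```python
from collections import defaultdict

# Mirrors get_func from basic/ensemble_fast.py: vote by summed probability
# mass per distinct answer string, and never pick the empty answer.
def pick_answer(vals, probs):
    d = defaultdict(float)
    for val, prob in zip(vals, probs):
        d[val] += prob
    d[''] = 0
    return max(d.items(), key=lambda pair: pair[1])[0]

# Three models: two low-confidence votes for "1998" together outweigh
# one higher-confidence vote for "1997".
print(pick_answer(["1998", "1997", "1998"], [0.35, 0.6, 0.3]))  # 1998
```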
/bidaf/basic/graph_handler.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import json
3 | from json import encoder
4 | import os
5 |
6 | import tensorflow as tf
7 |
8 | from basic.evaluator import Evaluation, F1Evaluation
9 | from my.utils import short_floats
10 |
11 | import pickle
12 |
13 |
14 | class GraphHandler(object):
15 | def __init__(self, config):
16 | self.config = config
17 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
18 | self.writer = None
19 | self.save_path = os.path.join(config.save_dir, config.model_name)
20 |
21 | def initialize(self, sess):
22 | if self.config.load:
23 | self._load(sess)
24 | else:
25 | sess.run(tf.initialize_all_variables())
26 |
27 | if self.config.mode == 'train':
28 | self.writer = tf.train.SummaryWriter(self.config.log_dir, graph=tf.get_default_graph())
29 |
30 | def save(self, sess, global_step=None):
31 | self.saver.save(sess, self.save_path, global_step=global_step)
32 |
33 | def _load(self, sess):
34 | config = self.config
35 | if config.load_path:
36 | save_path = config.load_path
37 | elif config.load_step > 0:
38 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
39 | else:
40 | save_dir = config.save_dir
41 | checkpoint = tf.train.get_checkpoint_state(save_dir)
42 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
43 | save_path = checkpoint.model_checkpoint_path
44 | print("Loading saved model from {}".format(save_path))
45 | self.saver.restore(sess, save_path)
46 |
47 | def add_summary(self, summary, global_step):
48 | self.writer.add_summary(summary, global_step)
49 |
50 | def add_summaries(self, summaries, global_step):
51 | for summary in summaries:
52 | self.add_summary(summary, global_step)
53 |
54 | def dump_eval(self, e, precision=2, path=None):
55 | assert isinstance(e, Evaluation)
56 | if self.config.dump_pickle:
57 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6)))
58 | with gzip.open(path, 'wb', compresslevel=3) as fh:
59 | pickle.dump(e.dict, fh)
60 | else:
61 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
62 | with open(path, 'w') as fh:
63 | json.dump(short_floats(e.dict, precision), fh)
64 |
65 | def dump_answer(self, e, path=None):
66 | assert isinstance(e, Evaluation)
67 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
68 | with open(path, 'w') as fh:
69 | json.dump(e.id2answer_dict, fh)
70 |
71 |
--------------------------------------------------------------------------------
/bidaf/basic/run_ensemble.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | source_path=$1
3 | target_path=$2
4 | inter_dir="inter_ensemble"
5 | root_dir="save"
6 |
7 | parg=""
8 | marg=""
9 | if [ "$3" = "debug" ]
10 | then
11 | parg="-d"
12 | marg="--debug"
13 | fi
14 |
15 | # Preprocess data
16 | python3.5 -m newsqa.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir .
17 |
18 | eargs=""
19 | for num in 31 33 34 35 36 37 40; do
20 | load_path="$root_dir/$num/save"
21 | shared_path="$root_dir/$num/shared.json"
22 | eval_path="$inter_dir/eval-$num.json"
23 | eargs="$eargs $eval_path"
24 | python3.5 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema &
25 | done
26 | wait
27 |
28 | # Ensemble
29 | python3.5 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eargs
30 |
--------------------------------------------------------------------------------
/bidaf/basic/run_single.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | source_path=$1
3 | target_path=$2
4 | inter_dir="inter_single"
5 | root_dir="save"
6 |
7 | parg=""
8 | marg=""
9 | if [ "$3" = "debug" ]
10 | then
11 | parg="-d"
12 | marg="--debug"
13 | fi
14 |
15 | # Preprocess data
16 | python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir .
17 |
18 | num=37
19 | load_path="$root_dir/$num/save"
20 | shared_path="$root_dir/$num/shared.json"
21 | eval_path="$inter_dir/eval.json"
22 | python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema
23 |
24 | # Ensemble (for single run, just one input)
25 | python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eval_path
26 |
27 |
28 |
--------------------------------------------------------------------------------
/bidaf/basic/superhighway.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.ops.rnn_cell import RNNCell
3 |
4 | from my.tensorflow.nn import linear
5 |
6 |
7 | class SHCell(RNNCell):
8 | """
9 | Super-Highway Cell
10 | """
11 | def __init__(self, input_size, logit_func='tri_linear', scalar=True, bias=3.0):
12 | self._state_size = input_size
13 | self._output_size = input_size
14 | self._logit_func = logit_func
15 | self._scalar = scalar
16 | self._bias = bias
17 |
18 | @property
19 | def state_size(self):
20 | return self._state_size
21 |
22 | @property
23 | def output_size(self):
24 | return self._output_size
25 |
26 | def __call__(self, inputs, state, scope=None):
27 | with tf.variable_scope(scope or "SHCell"):
28 | a_size = 1 if self._scalar else self._state_size
29 | h, u = tf.split(1, 2, inputs)
30 | if self._logit_func == 'mul_linear':
31 | args = [h * u]
32 | a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a'))
33 | r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r'))
34 | elif self._logit_func == 'linear':
35 | args = [h, u]
36 | a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a'))
37 | r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r'))
38 | elif self._logit_func == 'tri_linear':
39 | args = [h, u, h * u]
40 | a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a'))
41 | r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r'))
42 | elif self._logit_func == 'double':
43 | args = [h, u]
44 | a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True, bias_start=self._bias))
45 | r = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True, bias_start=self._bias))
46 |
47 | else:
48 | raise Exception()
49 | new_state = a * state + r * (1 - a) * h
50 | outputs = state
51 | return outputs, new_state
52 |
53 |
--------------------------------------------------------------------------------
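The TF r0.x calls above (tf.split(1, 2, inputs), variable scopes, linear from my.tensorflow.nn) tie the cell to an old API, but the recurrence itself is two sigmoid gates over a learned linear map of the two input halves. A numpy sketch of one 'tri_linear' step with made-up weights, included only to make the gate arithmetic concrete (it is not the trained model):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.default_rng(0)
d = 4                                            # per-half input size == state size
h, u = rng.normal(size=d), rng.normal(size=d)    # the two halves of `inputs`
state = np.zeros(d)

# 'tri_linear' logits: a linear map of [h, u, h*u] plus a positive bias,
# analogous to linear(args, a_size, True, bias_start=3.0) in the cell.
args = np.concatenate([h, u, h * u])
w_a, w_r = rng.normal(size=args.size), rng.normal(size=args.size)
bias = 3.0
a = sigmoid(w_a @ args + bias)                   # carry gate (scalar, since scalar=True)
r = sigmoid(w_r @ args + bias)                   # additional reset-style gate

new_state = a * state + r * (1 - a) * h          # the SHCell update above
output = state                                   # the cell emits the previous state
print(a, r, new_state)
```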
/bidaf/basic/templates/visualizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
19 |
20 |
23 |
24 | {{ title }}
25 |
26 |
27 | ID |
28 | Question |
29 | Answers |
30 | Predicted |
31 | Score |
32 | Paragraph |
33 |
34 | {% for row in rows %}
35 |
36 | {{ row.id }} |
37 |
38 | {% for qj in row.ques %}
39 | {{ qj }}
40 | {% endfor %}
41 | |
42 |
43 | {% for aa in row.a %}
44 | {{ aa }}
45 | {% endfor %}
46 | |
47 | {{ row.ap }} |
48 | {{ row.score }} |
49 |
50 |
51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
52 |
53 | {% set rowloop = loop %}
54 | {% for xjk, ypjk in zip(xj, ypj) %}
55 |
56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
57 | {{ xjk }}
58 | {% else %}
59 | {{ xjk }}
60 | {% endif %}
61 | |
62 | {% endfor %}
63 |
64 |
65 | {% for xjk, yp2jk in zip(xj, yp2j) %}
66 | - |
67 | {% endfor %}
68 |
69 | {% endfor %}
70 |
71 | |
72 |
73 | {% endfor %}
74 |
75 |
76 |
--------------------------------------------------------------------------------
/bidaf/basic_cnn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/basic_cnn/__init__.py
--------------------------------------------------------------------------------
/bidaf/basic_cnn/graph_handler.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import json
3 | from json import encoder
4 | import os
5 |
6 | import tensorflow as tf
7 |
8 | from basic_cnn.evaluator import Evaluation, F1Evaluation
9 | from my.utils import short_floats
10 |
11 | import pickle
12 |
13 |
14 | class GraphHandler(object):
15 | def __init__(self, config):
16 | self.config = config
17 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
18 | self.writer = None
19 | self.save_path = os.path.join(config.save_dir, config.model_name)
20 |
21 | def initialize(self, sess):
22 | if self.config.load:
23 | self._load(sess)
24 | else:
25 | sess.run(tf.initialize_all_variables())
26 |
27 | if self.config.mode == 'train':
28 | self.writer = tf.train.SummaryWriter(self.config.log_dir, graph=tf.get_default_graph())
29 |
30 | def save(self, sess, global_step=None):
31 | self.saver.save(sess, self.save_path, global_step=global_step)
32 |
33 | def _load(self, sess):
34 | config = self.config
35 | if config.load_path:
36 | save_path = config.load_path
37 | elif config.load_step > 0:
38 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
39 | else:
40 | save_dir = config.save_dir
41 | checkpoint = tf.train.get_checkpoint_state(save_dir)
42 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
43 | save_path = checkpoint.model_checkpoint_path
44 | print("Loading saved model from {}".format(save_path))
45 | self.saver.restore(sess, save_path)
46 |
47 | def add_summary(self, summary, global_step):
48 | self.writer.add_summary(summary, global_step)
49 |
50 | def add_summaries(self, summaries, global_step):
51 | for summary in summaries:
52 | self.add_summary(summary, global_step)
53 |
54 | def dump_eval(self, e, precision=2, path=None):
55 | assert isinstance(e, Evaluation)
56 | if self.config.dump_pickle:
57 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6)))
58 | with gzip.open(path, 'wb', compresslevel=3) as fh:
59 | pickle.dump(e.dict, fh)
60 | else:
61 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
62 | with open(path, 'w') as fh:
63 | json.dump(short_floats(e.dict, precision), fh)
64 |
65 | def dump_answer(self, e, path=None):
66 | assert isinstance(e, Evaluation)
67 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
68 | with open(path, 'w') as fh:
69 | json.dump(e.id2answer_dict, fh)
70 |
71 |
--------------------------------------------------------------------------------
/bidaf/basic_cnn/superhighway.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.ops.rnn_cell import RNNCell
3 |
4 | from my.tensorflow.nn import linear
5 |
6 |
7 | class SHCell(RNNCell):
8 | """
9 | Super-Highway Cell
10 | """
11 | def __init__(self, input_size, logit_func='tri_linear', scalar=False):
12 | self._state_size = input_size
13 | self._output_size = input_size
14 | self._logit_func = logit_func
15 | self._scalar = scalar
16 |
17 | @property
18 | def state_size(self):
19 | return self._state_size
20 |
21 | @property
22 | def output_size(self):
23 | return self._output_size
24 |
25 | def __call__(self, inputs, state, scope=None):
26 | with tf.variable_scope(scope or "SHCell"):
27 | a_size = 1 if self._scalar else self._state_size
28 | h, u = tf.split(1, 2, inputs)
29 | if self._logit_func == 'mul_linear':
30 | args = [h * u, state * u]
31 | a = tf.nn.sigmoid(linear(args, a_size, True))
32 | elif self._logit_func == 'linear':
33 | args = [h, u, state]
34 | a = tf.nn.sigmoid(linear(args, a_size, True))
35 | elif self._logit_func == 'tri_linear':
36 | args = [h, u, state, h * u, state * u]
37 | a = tf.nn.sigmoid(linear(args, a_size, True))
38 | elif self._logit_func == 'double':
39 | args = [h, u, state]
40 | a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True))
41 |
42 | else:
43 | raise Exception()
44 | new_state = a * state + (1 - a) * h
45 | outputs = state
46 | return outputs, new_state
47 |
48 |
--------------------------------------------------------------------------------
/bidaf/basic_cnn/templates/visualizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
19 |
20 |
23 |
24 | {{ title }}
25 |
26 |
27 | ID |
28 | Question |
29 | Answers |
30 | Predicted |
31 | Score |
32 | Paragraph |
33 |
34 | {% for row in rows %}
35 |
36 | {{ row.id }} |
37 |
38 | {% for qj in row.ques %}
39 | {{ qj }}
40 | {% endfor %}
41 | |
42 |
43 | {% for aa in row.a %}
44 | {{ aa }}
45 | {% endfor %}
46 | |
47 | {{ row.ap }} |
48 | {{ row.score }} |
49 |
50 |
51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
52 |
53 | {% set rowloop = loop %}
54 | {% for xjk, ypjk in zip(xj, ypj) %}
55 |
56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
57 | {{ xjk }}
58 | {% else %}
59 | {{ xjk }}
60 | {% endif %}
61 | |
62 | {% endfor %}
63 |
64 |
65 | {% for xjk, yp2jk in zip(xj, yp2j) %}
66 | - |
67 | {% endfor %}
68 |
69 | {% endfor %}
70 |
71 | |
72 |
73 | {% endfor %}
74 |
75 |
76 |
--------------------------------------------------------------------------------
/bidaf/basic_cnn/trainer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from basic_cnn.model import Model
4 | from my.tensorflow import average_gradients
5 |
6 |
7 | class Trainer(object):
8 | def __init__(self, config, model):
9 | assert isinstance(model, Model)
10 | self.config = config
11 | self.model = model
12 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
13 | self.loss = model.get_loss()
14 | self.var_list = model.get_var_list()
15 | self.global_step = model.get_global_step()
16 | self.summary = model.summary
17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
18 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
19 |
20 | def get_train_op(self):
21 | return self.train_op
22 |
23 | def step(self, sess, batch, get_summary=False):
24 | assert isinstance(sess, tf.Session)
25 | _, ds = batch
26 | feed_dict = self.model.get_feed_dict(ds, True)
27 | if get_summary:
28 | loss, summary, train_op = \
29 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
30 | else:
31 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
32 | summary = None
33 | return loss, summary, train_op
34 |
35 |
36 | class MultiGPUTrainer(object):
37 | def __init__(self, config, models):
38 | model = models[0]
39 | assert isinstance(model, Model)
40 | self.config = config
41 | self.model = model
42 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
43 | self.var_list = model.get_var_list()
44 | self.global_step = model.get_global_step()
45 | self.summary = model.summary
46 | self.models = models
47 | losses = []
48 | grads_list = []
49 | for gpu_idx, model in enumerate(models):
50 | with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/gpu:{}".format(gpu_idx)):
51 | loss = model.get_loss()
52 | grads = self.opt.compute_gradients(loss, var_list=self.var_list)
53 | losses.append(loss)
54 | grads_list.append(grads)
55 |
56 | self.loss = tf.add_n(losses)/len(losses)
57 | self.grads = average_gradients(grads_list)
58 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
59 |
60 | def step(self, sess, batches, get_summary=False):
61 | assert isinstance(sess, tf.Session)
62 | feed_dict = {}
63 | for batch, model in zip(batches, self.models):
64 | _, ds = batch
65 | feed_dict.update(model.get_feed_dict(ds, True))
66 |
67 | if get_summary:
68 | loss, summary, train_op = \
69 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
70 | else:
71 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
72 | summary = None
73 | return loss, summary, train_op
74 |
--------------------------------------------------------------------------------
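MultiGPUTrainer relies on average_gradients (re-exported from my/tensorflow/general.py, which is not reproduced in this section) to merge the per-tower (gradient, variable) lists before a single apply_gradients call. A framework-free numpy sketch of that averaging step, assuming every tower lists the same variables in the same order; the real helper operates on TensorFlow tensors:

```python
import numpy as np

def average_gradients(grads_list):
    """grads_list: one [(grad, var_name), ...] list per GPU tower,
    all towers listing the same variables in the same order."""
    averaged = []
    for grad_and_var_per_tower in zip(*grads_list):
        grads = np.stack([g for g, _ in grad_and_var_per_tower])
        var_name = grad_and_var_per_tower[0][1]
        averaged.append((grads.mean(axis=0), var_name))
    return averaged

tower_0 = [(np.array([1.0, 2.0]), "w"), (np.array([0.5]), "b")]
tower_1 = [(np.array([3.0, 4.0]), "w"), (np.array([1.5]), "b")]
for grad, name in average_gradients([tower_0, tower_1]):
    print(name, grad)   # w [2. 3.]   b [1.]
```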
/bidaf/cnn_dm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/cnn_dm/__init__.py
--------------------------------------------------------------------------------
/bidaf/cnn_dm/evaluate.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 |
5 | root_dir = sys.argv[1]
6 | answer_path = sys.argv[2]
7 | file_names = os.listdir(root_dir)
8 |
9 | num_correct = 0
10 | num_wrong = 0
11 |
12 | with open(answer_path, 'r') as fh:
13 | id2answer_dict = json.load(fh)
14 |
15 | for file_name in file_names:
16 | if not file_name.endswith(".question"):
17 | continue
18 | with open(os.path.join(root_dir, file_name), 'r') as fh:
19 | url = fh.readline().strip()
20 | _ = fh.readline()
21 | para = fh.readline().strip()
22 | _ = fh.readline()
23 | ques = fh.readline().strip()
24 | _ = fh.readline()
25 | answer = fh.readline().strip()
26 | _ = fh.readline()
27 | if file_name in id2answer_dict:
28 | pred = id2answer_dict[file_name]
29 | if pred == answer:
30 | num_correct += 1
31 | else:
32 | num_wrong += 1
33 | else:
34 | num_wrong += 1
35 |
36 | total = num_correct + num_wrong
37 | acc = float(num_correct) / total
38 | print("{} = {} / {}".format(acc, num_correct, total))
--------------------------------------------------------------------------------
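The readline sequence above implies the CNN/DailyMail *.question layout: URL, blank line, passage, blank line, question, blank line, answer. A toy file in that layout (the contents are invented) parses exactly as the script expects:

```python
import os
import tempfile

# A toy *.question file with the line layout cnn_dm/evaluate.py reads:
# url, blank, passage, blank, question, blank, answer.
content = "\n".join([
    "http://example.com/story",
    "",
    "@entity1 beat @entity2 in the final .",
    "",
    "who won the final ? @placeholder",
    "",
    "@entity1",
    "",
])
path = os.path.join(tempfile.mkdtemp(), "toy.question")
with open(path, "w") as fh:
    fh.write(content)

with open(path, "r") as fh:
    url = fh.readline().strip()
    _ = fh.readline()
    para = fh.readline().strip()
    _ = fh.readline()
    ques = fh.readline().strip()
    _ = fh.readline()
    answer = fh.readline().strip()

print(url, "|", ques, "|", answer)  # http://example.com/story | who won ... | @entity1
```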
/bidaf/data/squad/data_dev.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:43899b7e7a5098aab61718e162399c26d7d2927a0323ae7e96ccd836ec71689a
3 | size 6486869
4 |
--------------------------------------------------------------------------------
/bidaf/data/squad/data_test.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:43899b7e7a5098aab61718e162399c26d7d2927a0323ae7e96ccd836ec71689a
3 | size 6486869
4 |
--------------------------------------------------------------------------------
/bidaf/data/squad/data_train.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:72faa687d3686a6a07b289c4ab13f53a846ac2034d0b1b75ab025b55ca5c9ca4
3 | size 43602519
4 |
--------------------------------------------------------------------------------
/bidaf/data/squad/shared_dev.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3c3c2245183b060a88184d4e0144c56ceb581a9d85de9555a534cf86d32f1bf1
3 | size 57027001
4 |
--------------------------------------------------------------------------------
/bidaf/data/squad/shared_test.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3c3c2245183b060a88184d4e0144c56ceb581a9d85de9555a534cf86d32f1bf1
3 | size 57027001
4 |
--------------------------------------------------------------------------------
/bidaf/data/squad/shared_train.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:aca764b536502380700ce7dc0a9c5f8609a1d3a0f2c400afb659eddb7eb43c5d
3 | size 244088547
4 |
--------------------------------------------------------------------------------
/bidaf/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | DATA_DIR=$HOME/data
4 | mkdir $DATA_DIR
5 |
6 | # Download SQuAD
7 | SQUAD_DIR=$DATA_DIR/squad
8 | mkdir $SQUAD_DIR
9 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $SQUAD_DIR/train-v1.1.json
10 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $SQUAD_DIR/dev-v1.1.json
11 |
12 |
13 | # Download CNN and DailyMail
14 | # Download at: http://cs.nyu.edu/~kcho/DMQA/
15 |
16 |
17 | # Download GloVe
18 | GLOVE_DIR=$DATA_DIR/glove
19 | mkdir $GLOVE_DIR
20 | wget http://nlp.stanford.edu/data/glove.6B.zip -O $GLOVE_DIR/glove.6B.zip
21 | unzip $GLOVE_DIR/glove.6B.zip -d $GLOVE_DIR
22 |
23 | # Download NLTK (for tokenizer)
24 | # Make sure that nltk is installed!
25 | python3 -m nltk.downloader -d $HOME/nltk_data punkt
26 |
--------------------------------------------------------------------------------
/bidaf/helpers/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/bidaf/helpers/constants.py:
--------------------------------------------------------------------------------
1 | """
2 | Constants for easier reference
3 | """
4 |
5 | TOKENIZER_TEXTBLOB = 'TOKENIZER_TEXTBLOB'
6 | TOKENIZER_NLTK = 'TOKENIZER_NLTK'
7 | TOKENIZER_REGEX = 'TOKENIZER_REGEX'
8 | TOKENIZER_TWITTER = 'TOKENIZER_TWITTER'
9 | TOKENIZER_STANFORD_NLP = 'TOKENIZER_STANFORD_NLP'
10 |
11 | TRAIN_INDEX = 0
12 | VAL_INDEX = 1
13 | TEST_INDEX = 2
14 | JOB_ENDPOINT = 'http://ec2-52-33-179-156.us-west-2.compute.amazonaws.com:8000/api/v1' #'https://104.155.188.251:8080/api/v1'
15 | JOB_ENDPOINT = 'http://104.155.132.60:8080/api/v1'
16 |
17 | ##
18 | ACCESS_TOKEN = '49553f53ef5178db88e7cf4e192a1db1a77cfdbb'
19 |
20 | TRAIN_MODE = 0
21 | TEST_MODE = 1
22 |
23 | WORD_LEVEL = 'WORD_LEVEL'
24 | CHAR_LEVEL = 'CHAR_LEVEL'
25 | WORD_CHAR_LEVEL = 'WORD_CHAR_LEVEL' #Word embeddings with char. lvl lstms
26 | WORD_HASHING_LEVEL = 'WORD_HASHING_LEVEL'
27 | WORD_HASHING_CONSTANT = '%'
28 |
29 | DATASET_TRAIN = 'DATASET_TRAIN'
30 | DATASET_TEST = 'DATASET_TEST'
31 | DATASET_VALIDATION = 'DATASET_VALIDATION'
32 |
33 | GPU_MODE = 'GPU_MODE'
34 | CPU_MODE = 'CPU_MODE'
35 |
36 | CLOUD_MODEL_DIR = 'softmax_models'
37 | CLOUD_MODEL_ENDPOINT = 's3.amazonaws.com'
38 |
39 | AWS_KEY = 'AKIAJ3OQL4ACVRTLQSJA'
40 | AWS_SECRET = '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'
41 |
42 | LOCAL_MODEL_DIR = 'softmax_models'
43 |
44 | PREPROCESS_TYPE_INCEPTION = 'PREPROCESS_TYPE_INCEPTION'
45 | PREPROCESS_TYPE_GOOGLENET = 'PREPROCESS_TYPE_GOOGLENET'
46 | PREPROCESS_TYPE_RESNET = 'PREPROCESS_TYPE_RESNET'
47 |
48 | PREPROCESS_TYPE_RESNET_50 = 'PREPROCESS_TYPE_RESNET_50'
49 | PREPROCESS_TYPE_RESNET_101 = 'PREPROCESS_TYPE_RESNET_101'
50 | PREPROCESS_TYPE_RESNET_152 = 'PREPROCESS_TYPE_RESNET_152'
51 |
52 | NETWORK_TYPE_INCEPTION = 'NETWORK_TYPE_INCEPTION'
53 | NETWORK_TYPE_GOOGLENET = 'NETWORK_TYPE_GOOGLENET'
54 | NETWORK_TYPE_RESNET = 'NETWORK_TYPE_RESNET'
55 |
56 | NETWORK_TYPE_RESNET_30 = 'NETWORK_TYPE_RESNET_30'
57 | NETWORK_TYPE_RESNET_50 = 'NETWORK_TYPE_RESNET_50'
58 | NETWORK_TYPE_RESNET_101 = 'NETWORK_TYPE_RESNET_101'
59 | NETWORK_TYPE_RESNET_152 = 'NETWORK_TYPE_RESNET_152'
60 |
61 | OPTIMIZER_RMSPROP = 'OPTIMIZER_RMSPROP'
62 | OPTIMIZER_ADAM = 'OPTIMIZER_ADAM'
63 | OPTIMIZER_SGD = 'OPTIMIZER_SGD'
64 |
65 | NLTK_DATA_PATH = '../../pretrained_models/nltk'
66 |
67 | # Dependency embeddings path
68 | PRETRAINED_EMBEDDINGS_PATH = '../../pretrained_models/word_embeddings/dependency_embeddings/embeddings.npy'
69 | PRETRAINED_VOCAB_PATH = '../../pretrained_models/word_embeddings/dependency_embeddings/vocab.txt'
70 |
71 | # Part of speech vocab path
72 | POS_VOCAB_PATH = '../../pretrained_models/word_embeddings/pos_tags/vocab.txt'
73 | STANFORD_CORENLP_PATH = '../../pretrained_models/stanford_corenlp/2015-12-09/*'
74 |
75 | MACHINE_READING_MODEL_JOINT_HARD_NEGATIVES_OP = "MACHINE_READING_MODEL_JOINT_HARD_NEGATIVES_OP"
76 | MACHINE_READING_MODEL_JOINT_OP = "MACHINE_READING_MODEL_JOINT_OP"
77 | MACHINE_READING_MODEL_SENTENCE_OP = "MACHINE_READING_MODEL_SENTENCE_OP"
78 | MACHINE_READING_MODEL_ANSWER_OP = "MACHINE_READING_MODEL_ANSWER_OP"
79 |
80 | # Initializer for weights (zero, uniform and random)
81 | INITIALIZER_ZERO = 'INITIALIZER_ZERO'
82 | INITIALIZER_UNIFORM_RANDOM = 'INITIALIZER_UNIFORM_RANDOM'
83 |
84 | # To load vocab things
85 | PATH_NPY_ARRAY = 'PATH_NPY_ARRAY'
86 | PATH_TEXT_ARRAY = 'PATH_TEXT_ARRAY'
--------------------------------------------------------------------------------
/bidaf/helpers/file_logger.py:
--------------------------------------------------------------------------------
1 | class FileLogger(object):
2 |     """ Simple logger that writes lines to a file """
3 | def __init__(self, path):
4 | self.file = open(path, 'w')
5 |
6 | def write(self, text, print_text=True):
7 | if print_text:
8 | print("FILE LOGGER: %s" % text)
9 | self.file.write(str(text) + "\n")
10 | self.file.flush()
--------------------------------------------------------------------------------
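FileLogger opens its path in 'w' mode (truncating any previous log) and flushes after every line, so interrupted runs still leave a readable log. A typical use, assuming the bidaf directory is on PYTHONPATH and using a hypothetical log path:

```python
from helpers.file_logger import FileLogger

logger = FileLogger("/tmp/train_metrics.log")
logger.write("step=100 f1=0.41")                    # echoed to stdout and written to the file
logger.write("step=200 f1=0.47", print_text=False)  # written to the file only
```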
/bidaf/helpers/math_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def top_k_spans(start_probs, end_probs, n, k):
4 | """
5 |     Returns the top k non-overlapping spans for a passage,
6 |     sorted by start/end probabilities
7 | """
8 | probs = []
9 | argmax_spans = []
10 | for i in range(k + 1):
11 | probs.append([])
12 | argmax_spans.append([])
13 | for j in range(n + 1):
14 | probs[i].append(0)
15 | argmax_spans[i].append([-1, -1])
16 |
17 | for i in range(k + 1):
18 |         probs[i][0] = 0
19 |
20 | for j in range(n + 1):
21 | probs[0][j] = 0
22 |
23 | # fill the table in bottom-up fashion
24 | for i in range(1, k + 1):
25 | prev_diff = -10000
26 | prev_idx = -1
27 | for j in range(1, n):
28 | if prev_diff < probs[i-1][j-1] - start_probs[j-1]:
29 | prev_diff = probs[i-1][j-1] - start_probs[j-1]
30 | prev_idx = j-1
31 | if probs[i][j-1] > end_probs[j] + prev_diff:
32 | probs[i][j] = probs[i][j-1]
33 | argmax_spans[i][j] = argmax_spans[i][j-1]
34 | else:
35 | probs[i][j] = end_probs[j] + prev_diff
36 | argmax_spans[i][j] = (prev_idx, j)
37 |
38 | max_probs = probs[k][n-1]
39 | cur_probs = max_probs
40 | cur_spans = argmax_spans[k][n-1]
41 | start_end_idxs = []
42 | start_end_probs = []
43 |
44 | while cur_probs > 0:
45 | cur_indices = cur_spans
46 | cur_prob = end_probs[cur_indices[1]] - start_probs[cur_indices[0]]
47 | start_end_probs.append(cur_prob)
48 | cur_probs = cur_probs - cur_prob
49 | start_end_idxs.append(cur_indices)
50 | cur_spans = argmax_spans[k][cur_indices[0]]
51 |
52 | return max_probs, start_end_idxs, start_end_probs
--------------------------------------------------------------------------------
/bidaf/install_tensorflow.sh:
--------------------------------------------------------------------------------
1 | sudo pip uninstall tensorflow
2 | export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.1-cp35-cp35m-linux_x86_64.whl
3 |
4 | sudo pip install --upgrade pip
5 | sudo pip install --upgrade $TF_BINARY_URL
6 |
7 | sudo pip install requests
8 | sudo pip install tqdm
9 | sudo pip install pandas
10 | sudo pip install nltk
11 |
12 | sudo apt-get update
13 | sudo apt-get install python-software-properties
14 | sudo add-apt-repository ppa:git-core/ppa
15 | curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
16 | sudo apt-get install git-lfs
17 | git lfs install
--------------------------------------------------------------------------------
/bidaf/my/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/my/__init__.py
--------------------------------------------------------------------------------
/bidaf/my/corenlp_interface.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import requests
4 | import nltk
5 | import json
6 | import networkx as nx
7 | import time
8 |
9 |
10 | class CoreNLPInterface(object):
11 | def __init__(self, url, port):
12 | self._url = url
13 | self._port = port
14 |
15 | def get(self, type_, in_, num_max_requests=100):
16 | in_ = in_.encode("utf-8")
17 | url = "http://{}:{}/{}".format(self._url, self._port, type_)
18 | out = None
19 | for _ in range(num_max_requests):
20 | try:
21 | r = requests.post(url, data=in_)
22 | out = r.content.decode('utf-8')
23 | if out == 'error':
24 | out = None
25 | break
26 | except:
27 | time.sleep(1)
28 | return out
29 |
30 | def split_doc(self, doc):
31 | out = self.get("doc", doc)
32 | return out if out is None else json.loads(out)
33 |
34 | def split_sent(self, sent):
35 | out = self.get("sent", sent)
36 | return out if out is None else json.loads(out)
37 |
38 | def get_dep(self, sent):
39 | out = self.get("dep", sent)
40 | return out if out is None else json.loads(out)
41 |
42 | def get_const(self, sent):
43 | out = self.get("const", sent)
44 | return out
45 |
46 | def get_const_tree(self, sent):
47 | out = self.get_const(sent)
48 | return out if out is None else nltk.tree.Tree.fromstring(out)
49 |
50 | @staticmethod
51 | def dep2tree(dep):
52 | tree = nx.DiGraph()
53 | for dep, i, gov, j, label in dep:
54 | tree.add_edge(gov, dep, label=label)
55 | return tree
56 |
--------------------------------------------------------------------------------
/bidaf/my/nltk_utils.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | import numpy as np
3 |
4 |
5 | def _set_span(t, i):
6 | if isinstance(t[0], str):
7 | t.span = (i, i+len(t))
8 | else:
9 | first = True
10 | for c in t:
11 | cur_span = _set_span(c, i)
12 | i = cur_span[1]
13 | if first:
14 | min_ = cur_span[0]
15 | first = False
16 | max_ = cur_span[1]
17 | t.span = (min_, max_)
18 | return t.span
19 |
20 |
21 | def set_span(t):
22 | assert isinstance(t, nltk.tree.Tree)
23 | try:
24 | return _set_span(t, 0)
25 | except:
26 | print(t)
27 | exit()
28 |
29 |
30 | def tree_contains_span(tree, span):
31 | """
32 | Assumes that tree span has been set with set_span
33 |     Returns True if any subtree of tree has exactly the given span
34 |     :param tree:
35 | :param span:
36 | :return bool:
37 | """
38 | return span in set(t.span for t in tree.subtrees())
39 |
40 |
41 | def span_len(span):
42 | return span[1] - span[0]
43 |
44 |
45 | def span_overlap(s1, s2):
46 | start = max(s1[0], s2[0])
47 | stop = min(s1[1], s2[1])
48 | if stop > start:
49 | return start, stop
50 | return None
51 |
52 |
53 | def span_prec(true_span, pred_span):
54 | overlap = span_overlap(true_span, pred_span)
55 | if overlap is None:
56 | return 0
57 | return span_len(overlap) / span_len(pred_span)
58 |
59 |
60 | def span_recall(true_span, pred_span):
61 | overlap = span_overlap(true_span, pred_span)
62 | if overlap is None:
63 | return 0
64 | return span_len(overlap) / span_len(true_span)
65 |
66 |
67 | def span_f1(true_span, pred_span):
68 | p = span_prec(true_span, pred_span)
69 | r = span_recall(true_span, pred_span)
70 | if p == 0 or r == 0:
71 | return 0.0
72 | return 2 * p * r / (p + r)
73 |
74 |
75 | def find_max_f1_span(tree, span):
76 | return find_max_f1_subtree(tree, span).span
77 |
78 |
79 | def find_max_f1_subtree(tree, span):
80 | return max(((t, span_f1(span, t.span)) for t in tree.subtrees()), key=lambda p: p[1])[0]
81 |
82 |
83 | def tree2matrix(tree, node2num, row_size=None, col_size=None, dtype='int32'):
84 | set_span(tree)
85 | D = tree.height() - 1
86 | B = len(tree.leaves())
87 | row_size = row_size or D
88 | col_size = col_size or B
89 | matrix = np.zeros([row_size, col_size], dtype=dtype)
90 | mask = np.zeros([row_size, col_size, col_size], dtype='bool')
91 |
92 | for subtree in tree.subtrees():
93 | row = subtree.height() - 2
94 | col = subtree.span[0]
95 | matrix[row, col] = node2num(subtree)
96 | for subsub in subtree.subtrees():
97 | if isinstance(subsub, nltk.tree.Tree):
98 | mask[row, col, subsub.span[0]] = True
99 | if not isinstance(subsub[0], nltk.tree.Tree):
100 | c = subsub.span[0]
101 | for r in range(row):
102 | mask[r, c, c] = True
103 | else:
104 | mask[row, col, col] = True
105 |
106 | return matrix, mask
107 |
108 |
109 | def load_compressed_tree(s):
110 |
111 | def compress_tree(tree):
112 | assert not isinstance(tree, str)
113 | if len(tree) == 1:
114 | if isinstance(tree[0], nltk.tree.Tree):
115 | return compress_tree(tree[0])
116 | else:
117 | return tree
118 | else:
119 | for i, t in enumerate(tree):
120 | if isinstance(t, nltk.tree.Tree):
121 | tree[i] = compress_tree(t)
122 | else:
123 | tree[i] = t
124 | return tree
125 |
126 | return compress_tree(nltk.tree.Tree.fromstring(s))
127 |
128 |
129 |
130 |
--------------------------------------------------------------------------------
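The span metrics above treat a span as a half-open (start, stop) token range, with precision and recall measured as overlap length over the predicted and true span lengths. A compact restatement of that arithmetic as a standalone check (the spans are made up):

```python
# Token spans are half-open (start, stop) index pairs, as in my/nltk_utils.py.
def span_overlap(s1, s2):
    start, stop = max(s1[0], s2[0]), min(s1[1], s2[1])
    return (start, stop) if stop > start else None

def span_f1(true_span, pred_span):
    overlap = span_overlap(true_span, pred_span)
    if overlap is None:
        return 0.0
    p = (overlap[1] - overlap[0]) / (pred_span[1] - pred_span[0])  # precision
    r = (overlap[1] - overlap[0]) / (true_span[1] - true_span[0])  # recall
    return 2 * p * r / (p + r) if p and r else 0.0

print(span_f1((0, 5), (2, 7)))   # overlap is (2, 5): p = r = 0.6, f1 = 0.6
print(span_f1((0, 2), (3, 4)))   # disjoint spans: 0.0
```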
/bidaf/my/tensorflow/__init__.py:
--------------------------------------------------------------------------------
1 | from my.tensorflow.general import *
--------------------------------------------------------------------------------
/bidaf/my/tensorflow/rnn.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.ops.rnn import dynamic_rnn as _dynamic_rnn, \
3 | bidirectional_dynamic_rnn as _bidirectional_dynamic_rnn
4 | from tensorflow.python.ops.rnn import bidirectional_rnn as _bidirectional_rnn
5 |
6 | from my.tensorflow import flatten, reconstruct
7 |
8 |
9 | def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
10 | dtype=None, parallel_iterations=None, swap_memory=False,
11 | time_major=False, scope=None):
12 | assert not time_major # TODO : to be implemented later!
13 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
14 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
15 |
16 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len,
17 | initial_state=initial_state, dtype=dtype,
18 | parallel_iterations=parallel_iterations, swap_memory=swap_memory,
19 | time_major=time_major, scope=scope)
20 |
21 | outputs = reconstruct(flat_outputs, inputs, 2)
22 | return outputs, final_state
23 |
24 |
25 | def bw_dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
26 | dtype=None, parallel_iterations=None, swap_memory=False,
27 | time_major=False, scope=None):
28 | assert not time_major # TODO : to be implemented later!
29 |
30 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
31 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
32 |
33 | flat_inputs = tf.reverse(flat_inputs, 1) if sequence_length is None \
34 | else tf.reverse_sequence(flat_inputs, sequence_length, 1)
35 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len,
36 | initial_state=initial_state, dtype=dtype,
37 | parallel_iterations=parallel_iterations, swap_memory=swap_memory,
38 | time_major=time_major, scope=scope)
39 | flat_outputs = tf.reverse(flat_outputs, 1) if sequence_length is None \
40 | else tf.reverse_sequence(flat_outputs, sequence_length, 1)
41 |
42 | outputs = reconstruct(flat_outputs, inputs, 2)
43 | return outputs, final_state
44 |
45 |
46 | def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
47 | initial_state_fw=None, initial_state_bw=None,
48 | dtype=None, parallel_iterations=None,
49 | swap_memory=False, time_major=False, scope=None):
50 | assert not time_major
51 |
52 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
53 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
54 |
55 | (flat_fw_outputs, flat_bw_outputs), final_state = \
56 | _bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
57 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
58 | dtype=dtype, parallel_iterations=parallel_iterations, swap_memory=swap_memory,
59 | time_major=time_major, scope=scope)
60 |
61 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
62 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
63 | # FIXME : final state is not reshaped!
64 | return (fw_outputs, bw_outputs), final_state
65 |
66 |
67 | def bidirectional_rnn(cell_fw, cell_bw, inputs,
68 | initial_state_fw=None, initial_state_bw=None,
69 | dtype=None, sequence_length=None, scope=None):
70 |
71 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
72 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
73 |
74 | (flat_fw_outputs, flat_bw_outputs), final_state = \
75 | _bidirectional_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
76 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
77 | dtype=dtype, scope=scope)
78 |
79 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
80 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
81 | # FIXME : final state is not reshaped!
82 | return (fw_outputs, bw_outputs), final_state
83 |
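The wrappers above exist so that batched, nested inputs (e.g. [batch, num_sents, seq_len, dim]) can be fed to TensorFlow's rank-3 RNN ops: flatten collapses the leading axes before the call and reconstruct restores them afterwards. A minimal NumPy sketch of that round trip, under the assumption that flatten(x, 2) keeps only the last two axes and reconstruct(out, ref, 2) restores ref's leading axes (the real TensorFlow versions are the ones imported from my.tensorflow above):

    import numpy as np

    def flatten_np(x, keep):
        # Collapse all leading axes into one, keeping the last `keep` axes:
        # [N, M, J, d] with keep=2 -> [N*M, J, d].
        lead = int(np.prod(x.shape[:-keep]))
        return x.reshape((lead,) + x.shape[-keep:])

    def reconstruct_np(flat, ref, keep):
        # Restore `ref`'s leading axes while keeping `flat`'s last `keep` axes:
        # [N*M, J, 2d] with ref [N, M, J, d] -> [N, M, J, 2d].
        return flat.reshape(ref.shape[:-keep] + flat.shape[-keep:])

    x = np.zeros((4, 3, 7, 5))          # [batch, sents, words, features]
    flat = flatten_np(x, 2)             # (12, 7, 5) -- what the RNN cell actually sees
    rnn_out = np.zeros((12, 7, 10))     # pretend the RNN produced 10-dim outputs
    restored = reconstruct_np(rnn_out, x, 2)
    assert restored.shape == (4, 3, 7, 10)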
--------------------------------------------------------------------------------
/bidaf/my/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | from collections import deque
3 |
4 | import numpy as np
5 | from tqdm import tqdm
6 |
7 |
8 | def mytqdm(list_, desc="", show=True):
9 | if show:
10 | pbar = tqdm(list_)
11 | pbar.set_description(desc)
12 | return pbar
13 | return list_
14 |
15 |
16 | def json_pretty_dump(obj, fh):
17 | return json.dump(obj, fh, sort_keys=True, indent=2, separators=(',', ': '))
18 |
19 |
20 | def index(l, i):
21 | return index(l[i[0]], i[1:]) if len(i) > 1 else l[i[0]]
22 |
23 |
24 | def fill(l, shape, dtype=None):
25 | out = np.zeros(shape, dtype=dtype)
26 | stack = deque()
27 | stack.appendleft(((), l))
28 | while len(stack) > 0:
29 | indices, cur = stack.pop()
30 | if len(indices) < len(shape):
31 | for i, sub in enumerate(cur):
32 | stack.appendleft([indices + (i,), sub])
33 | else:
34 | out[indices] = cur
35 | return out
36 |
37 |
38 | def short_floats(o, precision):
39 | class ShortFloat(float):
40 | def __repr__(self):
41 | return '%.{}g'.format(precision) % self
42 |
43 | def _short_floats(obj):
44 | if isinstance(obj, float):
45 | return ShortFloat(obj)
46 | elif isinstance(obj, dict):
47 | return dict((k, _short_floats(v)) for k, v in obj.items())
48 | elif isinstance(obj, (list, tuple)):
49 | return tuple(map(_short_floats, obj))
50 | return obj
51 |
52 | return _short_floats(o)
53 |
54 |
55 | def argmax(x):
56 | return np.unravel_index(x.argmax(), x.shape)
57 |
58 |
59 |
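For reference, a small sketch of how fill pads a ragged nested list into a fixed-shape array (hypothetical values, not taken from the repo's data):

    ragged = [[1, 2, 3], [4], [5, 6]]
    padded = fill(ragged, (3, 3), dtype='int64')
    # array([[1, 2, 3],
    #        [4, 0, 0],
    #        [5, 6, 0]])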
--------------------------------------------------------------------------------
/bidaf/my/zip_save.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import shutil
5 | from zipfile import ZipFile
6 |
7 | from tqdm import tqdm
8 |
9 |
10 | def get_args():
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('paths', nargs='+')
13 | parser.add_argument('-o', '--out', default='save.zip')
14 | args = parser.parse_args()
15 | return args
16 |
17 |
18 | def zip_save(args):
19 | temp_dir = "."
20 | save_dir = os.path.join(temp_dir, "save")
21 | if not os.path.exists(save_dir):
22 | os.makedirs(save_dir)
23 | for save_source_path in tqdm(args.paths):
24 | # path = "out/basic/30/save/basic-18000"
25 | # target_path = "save_dir/30/save"
26 | # also output full path name to "save_dir/30/readme.txt"
27 | # need to also extract "out/basic/30/shared.json"
28 | temp, _ = os.path.split(save_source_path) # "out/basic/30/save", _
29 | model_dir, _ = os.path.split(temp) # "out/basic/30", _
30 | _, model_name = os.path.split(model_dir)
31 | cur_dir = os.path.join(save_dir, model_name)
32 | if not os.path.exists(cur_dir):
33 | os.makedirs(cur_dir)
34 | save_target_path = os.path.join(cur_dir, "save")
35 | shared_target_path = os.path.join(cur_dir, "shared.json")
36 | readme_path = os.path.join(cur_dir, "readme.txt")
37 | shared_source_path = os.path.join(model_dir, "shared.json")
38 | shutil.copy(save_source_path, save_target_path)
39 | shutil.copy(shared_source_path, shared_target_path)
40 | with open(readme_path, 'w') as fh:
41 | fh.write(save_source_path)
42 |
43 | os.system("zip {} -r {}".format(args.out, save_dir))
44 |
45 | def main():
46 | args = get_args()
47 | zip_save(args)
48 |
49 | if __name__ == "__main__":
50 | main()
51 |
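A hedged usage sketch: zip_save can also be driven without the CLI by handing it an argparse-style namespace; the checkpoint prefix below is the hypothetical example from the comments above:

    import argparse

    # Equivalent to: python3 -m my.zip_save out/basic/30/save/basic-18000 -o save.zip
    args = argparse.Namespace(paths=['out/basic/30/save/basic-18000'], out='save.zip')
    zip_save(args)  # copies that file plus shared.json into ./save/30, then zips ./save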
--------------------------------------------------------------------------------
/bidaf/newsqa/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/newsqa/__init__.py
--------------------------------------------------------------------------------
/bidaf/newsqa/evaluate.py:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:64f332e08e64422da089d47013f59a9596f94cadd6b1c5a142ba8aee47421ee5
3 | size 3226
4 |
--------------------------------------------------------------------------------
/bidaf/newsqa/prepro.py:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9685e6d5f587e0d34c27a5f8e521810a0f9dd62968286d86825140a78fcfbcde
3 | size 11421
4 |
--------------------------------------------------------------------------------
/bidaf/newsqa_unsupervised_old/data_train.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:23ec75cb6b82dc2a6c5cbb704f146a5e0529e6ba25e344d5063a8c5a0e2af6c7
3 | size 332564109
4 |
--------------------------------------------------------------------------------
/bidaf/newsqa_unsupervised_old_verb_filtered/data_train.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a
3 | size 2
4 |
--------------------------------------------------------------------------------
/bidaf/out/basic/06/save/basic-40000.data-00000-of-00001:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c74b188ef72f2c71c487b5cea41802d532d0ab7ac78c9012d840615fbaf91b61
3 | size 33047428
4 |
--------------------------------------------------------------------------------
/bidaf/out/basic/06/save/basic-40000.index:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:99cf4a33e7a4e28497ba709d20880721fff350f605442b541067cc7fda38c180
3 | size 4232
4 |
--------------------------------------------------------------------------------
/bidaf/out/basic/06/save/basic-40000.meta:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3f6867f121ca4743b851303041e54976501f58a177e8d4cd4844091037f37b98
3 | size 8040647
4 |
--------------------------------------------------------------------------------
/bidaf/out/basic/06/save/checkpoint:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5aa608ba2b2392385c457bc79abdd062bc9772de47dc6def4b7ecb6729fba431
3 | size 79
4 |
--------------------------------------------------------------------------------
/bidaf/out/basic/06/shared.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b48087abac725c6079e62d86253b294db5d331d0de98697de24d8d182634948a
3 | size 38884
4 |
--------------------------------------------------------------------------------
/bidaf/requirements.txt:
--------------------------------------------------------------------------------
1 | nltk
2 | tqdm
3 | jinja2
--------------------------------------------------------------------------------
/bidaf/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Once that's finished, run evaluations on the saved models
3 | # Creates pklz files that can be used for final eval.
4 | for i in 41 42 43 44 45 46 47 48 49 51 52 53 54 55 56 57 58 59;
5 | do
6 | for j in 17 18 19;
7 | do
8 | python3 -m basic.cli \
9 | --run_id $j \
10 | --shared_path out/basic/06/shared.json \
11 | --load_path "out/basic/$j/save/basic-"$i"000" \
12 | --k 10 \
13 | --use_special_token False \
14 | --load_ema False --gpu_idx 3 \
15 | --mode test --data_dir newsqa \
16 | --len_opt --batch_size 10 --num_steps 40000 \
17 | --eval_period 1000 --save_period 1000 \
18 | --sent_size_th 2100 --para_size_th 2100
19 | done
20 | done
21 | for num in 40 41 42 43 44 45; do
22 | eval_path="out/basic/14/eval/test-0${num}000.pklz"
23 | eargs="$eargs $eval_path"
24 | done
25 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results_30.json $eargs
26 | python3 newsqa/evaluate.py newsqa/data_test.json new_results_30.json
27 |
--------------------------------------------------------------------------------
/bidaf/scripts.sh:
--------------------------------------------------------------------------------
1 | # Now do evaluations on the pklz files with predictions
2 | model_id=14
3 | eargs=""
4 |
5 | for num in 40; do
6 | eval_path="out/basic/${model_id}/eval/test-0${num}000.pklz"
7 | eargs="$eargs $eval_path"
8 | done
9 | #for num in 41 42 43 46; do
10 | # eval_path="out/basic/${model_id_2}/eval/test-0${num}000.pklz"
11 | # eargs="$eargs $eval_path"
12 | #done
13 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results_30.json $eargs
14 | python3 newsqa/evaluate.py newsqa/data_test.json new_results_30.json
15 |
--------------------------------------------------------------------------------
/bidaf/scripts/compare_models.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python3 -m visualization.compare_models_newsqa \
3 | -dataset newsqa/data_test.json \
4 | -model1 out/basic/06/answer/test-040000.json \
5 | -model2 out/basic/00/answer/test-040000.json \
6 | -name1 "BIDAF out-domain" \
7 | -name2 "BIDAF in-domain" \
8 | -output "BIDAF_results/outdomain_vs_indomain"
9 |
10 | python3 -m visualization.compare_models_newsqa \
11 | -dataset newsqa/data_test.json \
12 | -model1 out/basic/30/answer/test-044000.json \
13 | -model2 out/basic/00/answer/test-040000.json \
14 | -name1 "BIDAF Synthetic, k=5, fake a, fake q" \
15 | -name2 "BIDAF on NewsQA" \
16 | -output "BIDAF_results/k_5_single_vs_indomain"
17 |
18 | python3 -m visualization.compare_models_newsqa \
19 | -dataset newsqa/data_test.json \
20 | -model1 29_all.json \
21 | -model2 out/basic/06/answer/test-040000.json \
22 | -name1 "BIDAF Synthetic, k=3, intra ensemble" \
23 | -name2 "BIDAF on SQUAD -> Newsqa" \
24 | -output "BIDAF_results/synthetic_k_3_intra_out_domain"
25 |
26 | python3 -m visualization.compare_models_newsqa \
27 | -dataset newsqa/data_test.json \
28 | -model1 30_all.json \
29 | -model2 out/basic/06/answer/test-040000.json \
30 | -name1 "BIDAF Synthetic, k=5, intra ensemble" \
31 | -name2 "BIDAF on SQUAD -> Newsqa" \
32 | -output "BIDAF_results/synthetic_k_5_intra_out_domain"
33 |
34 | python3 -m visualization.compare_models_newsqa \
35 | -dataset newsqa/data_test.json \
36 | -model1 30_all.json \
37 | -model2 26_all.json \
38 | -name1 "BIDAF Synthetic, k=5, intra ensemble" \
39 | -name2 "BIDAF Synthetic, k=0, intra ensemble" \
40 | -output "BIDAF_results/synthetic_k_5_intra_k_0_intra"
41 |
42 |
43 | python3 -m visualization.compare_models_newsqa \
44 | -dataset newsqa/data_test.json \
45 | -model1 30_all.json \
46 | -model2 29_all.json \
47 | -name1 "BIDAF Synthetic, k=5, intra ensemble" \
48 | -name2 "BIDAF Synthetic, k=3, intra ensemble" \
49 | -output "BIDAF_results/synthetic_k_5_intra_k_3_intra"
50 |
51 | python3 -m visualization.compare_models_newsqa \
52 | -dataset newsqa/data_test.json \
53 | -model1 30_all.json \
54 | -model2 out/basic/00/answer/test-040000.json \
55 | -name1 "BIDAF Synthetic, k=5, fake a, fake q" \
56 | -name2 "BIDAF on SQUAD -> Newsqa" \
57 | -output "BIDAF_results/synthetic_k_5_intra_vs_indomain"
58 |
59 | python3 -m visualization.compare_models_newsqa \
60 | -dataset newsqa/data_test.json \
61 | -model2 30_all.json \
62 | -model1 12_all.json \
63 | -name1 "BIDAF Synthetic, k=5, fake a, fake q" \
64 | -name2 "BIDAF on SQUAD -> Newsqa" \
65 | -output "BIDAF_results/synthetic_k_5_intra_vs_k_0_real_ans"
66 |
67 |
--------------------------------------------------------------------------------
/bidaf/scripts/evaluate_run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | for i in 42 43 44 45 46 47 48 49;
5 | do
6 | python3 -m basic_old.cli \
7 | --run_id 18 \
8 | --shared_path out/basic/06/shared.json \
9 | --load_path "out/basic/18/save/basic-"$i"000" \
10 | --k 10 \
11 | --use_special_token False \
12 | --load_ema False --gpu_idx 1 \
13 | --mode test --data_dir newsqa \
14 | --len_opt --batch_size 15 --num_steps 40000 \
15 | --eval_period 1000 --save_period 1000 \
16 | --sent_size_th 2100 --para_size_th 2100
17 | done
18 |
19 | #for i in 42 43 44 45 46 47 48 49;
20 | #do
21 | # python3 -m basic_old.cli \
22 | # --run_id 14 \
23 | # --shared_path out/basic/06/shared.json \
24 | # --load_path "out/basic/14/save/basic-"$i"000" \
25 | # --k 10 \
26 | # --use_special_token False \
27 | # --load_ema False --gpu_idx 3 \
28 | # --mode test --data_dir newsqa \
29 | # --len_opt --batch_size 15 --num_steps 40000 \
30 | # --eval_period 1000 --save_period 1000 \
31 | # --sent_size_th 2100 --para_size_th 2100
32 | #done
--------------------------------------------------------------------------------
/bidaf/scripts/finetune_squad.sh:
--------------------------------------------------------------------------------
1 | python3 -m basic_old.cli \
2 | --run_id 29 \
3 | --use_special_token False \
4 | --sup_unsup_ratio 5 \
5 | --shared_path out/basic/06/shared.json \
6 | --load_path out/basic/06/save/basic-40000 \
7 | --k 10 \
8 | --baseline_dir newsqa \
9 | --load_ema False --gpu_idx 0 \
10 | --num_gpus 0 \
11 | --mode train \
12 | --data_dir squad_train_unsupervised_verb_filter \
13 | --len_opt --batch_size 30 \
14 | --num_steps 40000 \
15 | --eval_period 1000 --save_period 1000 \
16 | --sent_size_th 300 --para_size_th 300
17 |
18 | python3 -m basic_old.cli \
19 | --run_id 30 \
20 | --use_special_token False \
21 | --sup_unsup_ratio 3 \
22 | --shared_path out/basic/06/shared.json \
23 | --load_path out/basic/06/save/basic-40000 \
24 | --k 10 \
25 | --baseline_dir newsqa \
26 | --load_ema False --gpu_idx 0 \
27 | --num_gpus 0 \
28 | --mode train \
29 | --data_dir squad_train_unsupervised_verb_filter \
30 | --len_opt --batch_size 30 \
31 | --num_steps 40000 \
32 | --eval_period 1000 --save_period 1000 \
33 | --sent_size_th 300 --para_size_th 300
34 |
35 | python3 -m basic_old.cli \
36 | --run_id 31 \
37 | --use_special_token False \
38 | --sup_unsup_ratio 5 \
39 | --shared_path out/basic/06/shared.json \
40 | --load_path out/basic/06/save/basic-40000 \
41 | --k 10 \
42 | --baseline_dir newsqa \
43 | --load_ema False --gpu_idx 0 \
44 | --num_gpus 0 \
45 | --mode train \
46 | --data_dir squad_train_unsupervised_verb_filter_iob \
47 | --len_opt --batch_size 30 \
48 | --num_steps 40000 \
49 | --eval_period 1000 --save_period 1000 \
50 | --sent_size_th 300 --para_size_th 300
51 |
--------------------------------------------------------------------------------
/bidaf/scripts/install_tensorflow.sh:
--------------------------------------------------------------------------------
1 | sudo pip uninstall tensorflow
2 | export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.1-cp34-cp34m-linux_x86_64.whl
3 |
4 | sudo pip install --upgrade pip
5 | sudo pip install --upgrade $TF_BINARY_URL
6 |
7 | sudo pip install requests
8 | sudo pip install tqdm
9 | sudo pip install pandas
10 | sudo pip install nltk
11 |
12 | sudo apt-get update
13 | sudo apt-get install python-software-properties
14 | sudo add-apt-repository ppa:git-core/ppa
15 | curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
16 | sudo apt-get install git-lfs
17 | git lfs install
--------------------------------------------------------------------------------
/bidaf/scripts/run_ensemble_unsupervised.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_id=06
4 | eargs=""
5 | for num in 40 41 42 43 44; do
6 | eval_path="out/basic/${model_id}/eval/test-0${num}000.pklz"
7 | eargs="$eargs $eval_path"
8 | done
9 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results.json $eargs
10 |
11 |
--------------------------------------------------------------------------------
/bidaf/scripts/run_evaluation.sh:
--------------------------------------------------------------------------------
1 | model_id=$1
2 | model_id_1=$2
3 | model_id_2=26
4 | eargs=""
5 |
6 |
7 | #for num in 41 42 43 44 45 46 48 50 51 52 53 54 55; do
8 | # eval_path="out/basic/${model_id}/eval/test-0${num}000.pklz"
9 | # eargs="$eargs $eval_path"
10 | #done
11 | #"""
12 | for num in 41 42 43 44 45 46 48 50 51; do
13 | eval_path="out/basic/${model_id_1}/eval/test-0${num}000.pklz"
14 | eargs="$eargs $eval_path"
15 | done
16 | #for num in 41 42 43 46; do
17 | # eval_path="out/basic/${model_id_2}/eval/test-0${num}000.pklz"
18 | # eargs="$eargs $eval_path"
19 | #done
20 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results_30.json $eargs
21 | python3 newsqa/evaluate.py newsqa/data_test.json new_results_30.json
22 |
23 |
--------------------------------------------------------------------------------
/bidaf/scripts/run_huge_evaluation.sh:
--------------------------------------------------------------------------------
1 | model_id=30
2 | model_id_1=29
3 | model_id_2=34
4 | model_id_3=36
5 | model_id_4=32
6 | model_id_5=37
7 | eargs=""
8 |
9 | for num in 40 41 42 43 44 45 46 48 50 51 52 53 54 55; do
10 | eval_path="out/basic/${model_id}/eval/test-0${num}000.pklz"
11 | eargs="$eargs $eval_path"
12 | done
13 |
14 | for num in 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59; do
15 | eval_path="out/basic/${model_id_1}/eval/test-0${num}000.pklz"
16 | eargs="$eargs $eval_path"
17 | done
18 |
19 | for num in 40 41 42 43 44 45 46 47 48 49 50 51 52 53; do
20 | eval_path="out/basic/${model_id_2}/eval/test-0${num}000.pklz"
21 | eargs="$eargs $eval_path"
22 | done
23 |
24 | for num in 40 41 42 43 44 45 46 47 48 49; do
25 | eval_path="out/basic/${model_id_3}/eval/test-0${num}000.pklz"
26 | eargs="$eargs $eval_path"
27 | done
28 |
29 | for num in 43 44 45 46 47; do
30 | eval_path="out/basic/${model_id_4}/eval/test-0${num}000.pklz"
31 | eargs="$eargs $eval_path"
32 | done
33 |
34 | for num in 40 43 44 45 46 47 48; do
35 | eval_path="out/basic/${model_id_5}/eval/test-0${num}000.pklz"
36 | eargs="$eargs $eval_path"
37 | done
38 | #for num in 41 42 43 46; do
39 | # eval_path="out/basic/${model_id_2}/eval/test-0${num}000.pklz"
40 | # eargs="$eargs $eval_path"
41 | #done
42 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o new_results_1.json $eargs
43 | python3 newsqa/evaluate.py newsqa/data_test.json new_results_1.json
44 |
45 |
--------------------------------------------------------------------------------
/bidaf/scripts/run_intra_evaluation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | model_id=$1
3 | save_path="${model_id}_all.json"
4 | eval_paths="out/basic/${model_id}/eval/test-*"
5 | eargs=""
6 | for eval_path in $eval_paths;
7 | do
8 | eargs="$eargs $eval_path"
9 | done
10 |
11 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o $save_path $eargs
12 | python3 newsqa/evaluate.py newsqa/data_test.json $save_path
--------------------------------------------------------------------------------
/bidaf/scripts/run_intra_helper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Script to evaluate single model performance
3 |
4 | for id in 14 16 17 18 29 30 37;
5 | do
6 | echo "on run $id"
7 | ./scripts/run_new.sh $id
8 | done
--------------------------------------------------------------------------------
/bidaf/scripts/run_new.sh:
--------------------------------------------------------------------------------
1 | model_id=$1
2 | save_path="${model_id}_all.json"
3 | eargs=""
4 |
5 | eval_paths="out/basic/${model_id}/eval/test-*"
6 | count=0
7 | max=5
8 | for eval_path in $eval_paths;
9 | do
10 | ((count++))
11 | if (("$count" < "$max"))
12 | then
13 | eargs="$eargs $eval_path"
14 | fi
15 | done
16 | python3 -m basic.ensemble --data_path newsqa/data_test.json --shared_path newsqa/shared_test.json -o $save_path $eargs
17 | python3 newsqa/evaluate.py newsqa/data_test.json $save_path
--------------------------------------------------------------------------------
/bidaf/squad/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/squad/__init__.py
--------------------------------------------------------------------------------
/bidaf/squad/aug_squad.py:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c3fa27b0b86b8b9142b9f83461e1cf968abceac19b7acb9228852aa56cdf34f9
3 | size 5725
4 |
--------------------------------------------------------------------------------
/bidaf/squad/eda_aug_dev.ipynb:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9d5d2057a12994d20542a10918edc301f8a5eba9812d593b649d3f8a3c88ee1a
3 | size 7050
4 |
--------------------------------------------------------------------------------
/bidaf/squad/eda_aug_train.ipynb:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2ba686b66274ea73d50d8c8e47f1327197214453740d7d90972a9d87473a3295
3 | size 7812
4 |
--------------------------------------------------------------------------------
/bidaf/squad/evaluate-v1.1.py:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f5a673dbbd173e29e9ea38f1b2091d883583b77b3a4c17144b223fb0f2f9bd09
3 | size 3419
4 |
--------------------------------------------------------------------------------
/bidaf/squad/evaluate.py:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ff1d15e1c750cb63c0bd95a87a8fe943934ab11a3e14b19e689f2bc5ffb48a95
3 | size 3456
4 |
--------------------------------------------------------------------------------
/bidaf/squad/prepro.py:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:022af2fd83bd1b72cddf31dddbbf53b8ee32d860d0827d9e2ca57d33de1dc59c
3 | size 9271
4 |
--------------------------------------------------------------------------------
/bidaf/squad/prepro_aug.py:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:14f328397ec2d1710e875ea0c4a0e3914d52008e1a0829aa8fc83a49bf4c094b
3 | size 6858
4 |
--------------------------------------------------------------------------------
/bidaf/squad/utils.py:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6a39505c6fd5861912ff3d7bc959e2e01c26926f214e6aa6c2c2344a400c7adb
3 | size 3492
4 |
--------------------------------------------------------------------------------
/bidaf/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/tests/__init__.py
--------------------------------------------------------------------------------
/bidaf/tests/check_results.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import gzip
3 |
4 | for path in [42, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54]:
5 | save_path = 'out/basic/19/eval/test-0%s000.pklz' % path
6 | f = gzip.open(save_path,'rb')
7 | res= pickle.load(f)
8 | f.close()
9 |
10 | print(save_path)
11 | print(res['f1'])
12 | print(res['acc'])
13 |
14 |
15 | #restore the object
16 | #out/basic/19/eval
17 | for path in ['041000']:
18 | save_path = 'out/basic/17/eval/test-%s.pklz' % path
19 | f = gzip.open(save_path,'rb')
20 | res= pickle.load(f)
21 | f.close()
22 |
23 | print(save_path)
24 | print(res['f1'])
25 | print(res['acc'])
26 |
27 | for path in ['041000', '042000', '043000', '044000', '045000']:
28 | save_path = 'out/basic/14/eval/test-%s.pklz' % path
29 | f = gzip.open(save_path,'rb')
30 | res= pickle.load(f)
31 | f.close()
32 |
33 | print(save_path)
34 | print(res['f1'])
35 | print(res['acc'])
36 |
37 | # out/basic/25/eval
38 | for path in ['044000', '045000', '046000', '047000', '048000', '049000', '050000', '051000', '052000']:
39 | save_path = 'out/basic/14/eval/dev-%s.pklz' % path
40 | f = gzip.open(save_path,'rb')
41 | res= pickle.load(f)
42 | f.close()
43 |
44 | print(save_path)
45 | print(res['f1'])
46 | print(res['acc'])
47 |
48 | for path in ['041000', '042000']:
49 | save_path = 'out/basic/18/eval/dev-%s.pklz' % path
50 | f = gzip.open(save_path,'rb')
51 | res= pickle.load(f)
52 | f.close()
53 |
54 | print(save_path)
55 | print(res['f1'])
56 | print(res['acc'])
57 |
58 | for path in ['041000', '042000', '043000', '044000', '045000', '046000', '047000', '048000', '049000']:
59 | save_path = 'out/basic/17/eval/dev-%s.pklz' % path
60 | f = gzip.open(save_path,'rb')
61 | res= pickle.load(f)
62 | f.close()
63 |
64 | print(save_path)
65 | print(res['f1'])
66 | print(res['acc'])
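The blocks above repeat the same load-and-print steps for each run; a minimal sketch of the same check factored into a helper (using one of the eval directories already listed above):

    def report(save_path):
        # Load one gzipped evaluation pickle and print its F1 and accuracy.
        with gzip.open(save_path, 'rb') as f:
            res = pickle.load(f)
        print(save_path)
        print(res['f1'])
        print(res['acc'])

    for step in ['041000', '042000', '043000', '044000', '045000']:
        report('out/basic/14/eval/test-%s.pklz' % step)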
--------------------------------------------------------------------------------
/bidaf/tests/create_bidaf_old_dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | import argparse
3 | import os
4 | import re
5 | import sys
6 | import time
7 | import urllib
8 | from urllib.parse import quote
9 | from bs4 import BeautifulSoup
10 | from urllib.request import urlopen
11 | from helpers import utils
12 | from collections import defaultdict
13 | from itertools import groupby
14 |
15 | def dedup(q):
16 | grouped_L = [[k, sum(1 for i in g)] for k,g in groupby(q)]
17 | deduped_q = list(map(lambda l: l[0], grouped_L))
18 | #if "?" not in deduped_q:
19 | # print("Adding new question")
20 | # deduped_q.append("?")
21 | return deduped_q
22 |
23 | def invalid_question(q):
24 | string_q = ' '.join(q)
25 | cnn_test = "CNN" in string_q
26 | unk_test = "" in string_q
27 | q_test = "?" not in string_q
28 | small_q_test = len(dedup(q)) < 5
29 | is_invalid = cnn_test or small_q_test #or q_test
30 |
31 | return is_invalid
32 |
33 | def save_results(dev_path,
34 | shared_path,
35 | gen_questions_path,
36 | gen_answers_start_path,
37 | gen_answers_end_path,
38 | gen_idxs_path,
39 | gen_ids_path,
40 | save_path):
41 | print("Loading dev json: %s and shared: %s" % (dev_path, shared_path))
42 | dev_json = json.load(open(dev_path))
43 | shared_json = json.load(open(shared_path))
44 | print("Done loading dev json and shared")
45 | questions = utils.read_lines(gen_questions_path)
46 | answer_starts = utils.read_lines(gen_answers_start_path)
47 | answer_ends = utils.read_lines(gen_answers_end_path)
48 | idxs = utils.read_lines(gen_idxs_path)
49 | ids = utils.read_lines(gen_ids_path)
50 |
51 | keys = dev_json.keys()
52 | dataset = defaultdict(list)
53 |
54 | idx = 54
55 |
56 | for i in range(0, len(questions)):
57 | cur_q = questions[i].split(" ")
58 | if invalid_question(cur_q):
59 | continue
60 | cur_q = dedup(cur_q)
61 | cur_ans_start = int(answer_starts[i])
62 | cur_ans_end = int(answer_ends[i])
63 | idx = int(idxs[i])
64 | id = int(ids[i])
65 | cur_par = shared_json['x'][idx][0][0]
66 | cy_0 = 0
67 | cy_1 = len(cur_par[cur_ans_end - 1])
68 | cy = [[cy_0, cy_1]]
69 |
70 | answerss = [cur_par[cur_ans_start:cur_ans_end]]
71 | cur_q_char = list(map(lambda token: token.split(), cur_q))
72 |
73 | dataset['idxs'].append(idx)
74 | dataset['ids'].append(len(dataset['ids']))
75 | dataset['cy'].append(cy)
76 | dataset['answerss'].append(answerss)
77 | dataset['span_answerss'].append(answerss)
78 | dataset['*x'].append([idx, 0])
79 | dataset['*cx'].append([idx, 0])
80 | dataset['*p'].append([idx, 0])
81 |
82 | shared_json['x'][idx]
83 | dataset['y'].append([[[0, cur_ans_start], [0, cur_ans_end]]])
84 | dataset['q'].append(cur_q)
85 | dataset['cq'].append(cur_q_char)
86 |
87 | print("Saving to path %s" % save_path)
88 | utils.save_json(dataset, save_path)
89 |
90 | save_directory = 'newsqa_unsupervised_old'
91 | utils.check_dir(save_directory)
92 |
93 | shared_path='newsqa/shared_train.json'
94 | dev_path= 'newsqa/data_train.json'
95 | base_path = '../datasets/newsqa_unsupervised_old/train'
96 | gen_questions_path = '%s/predictions.txt' % base_path#'%s/outputs.txt' % base_path#, 'newsqa/', 'newsqa/']
97 | gen_answers_start_path = '%s/answer_starts.txt' % base_path
98 | gen_answers_end_path = '%s/answer_ends.txt' % base_path
99 | gen_ids_path = '%s/ids.txt' % base_path
100 | gen_idxs_path = '%s/indices.txt' % base_path
101 | save_path = '%s/data_train.json' % save_directory
102 |
103 | save_results(dev_path=dev_path,
104 | shared_path=shared_path,
105 | gen_questions_path=gen_questions_path,
106 | gen_answers_start_path=gen_answers_start_path,
107 | gen_answers_end_path=gen_answers_end_path,
108 | gen_ids_path=gen_ids_path,
109 | gen_idxs_path=gen_idxs_path,
110 | save_path=save_path)
111 |
112 |
113 | """
114 | dev_paths = []#'newsqa/data_train.json', 'newsqa/data_dev.json', 'newsqa/data_test.json'] #'data/squad/data_train.json'
115 | save_paths = ['newsqa_gen_filtered_v2/data_train.json']#'newsqa_gen/data_train.json', 'newsqa_gen/data_dev.json', 'newsqa_gen/data_test.json'] #'data/squad/web_data_train.json'
116 | shared_paths = ['newsqa/shared_train.json']
117 |
118 | #json_data = json.load(open(save_paths[0]))
119 | #shared_data = json.load(open(shared_paths[0]))
120 | #original_data = json.load(open(dev_paths[0]))
121 |
122 | print(shared_data.keys())
123 | for idx in range(100, 101):
124 | print(json_data['q'][idx])
125 | print(original_data['q'][idx])
126 | print(original_data['answerss'][idx])
127 | print(json_data['answerss'][idx])
128 |
129 |
130 | for dev_path, gen_questions_path, save_path in zip(dev_paths, gen_questions_paths, save_paths):
131 | save_results(dev_path, gen_questions_path, save_path)
132 |
133 |
134 | """
--------------------------------------------------------------------------------
/bidaf/tests/create_generation_dataset_unsupervised.py:
--------------------------------------------------------------------------------
1 | import json
2 | from squad.utils import get_2d_spans
3 | from helpers import utils
4 | from helpers import spacy_tokenizer
5 |
6 | def create_dataset(save_dir, data_path, shared_path):
7 | print("Loading data from path %s" % data_path)
8 | data = json.load(open(data_path))
9 | print("Done loading data")
10 | shared_data = json.load(open(shared_path))
11 | print("Done loading shared data from path %s" % shared_path)
12 |
13 | def count_sums(up_to_idx):
14 | total_len = 0
15 | for i in range(0, up_to_idx):
16 | total_len += len(shared_data['x'][i])
17 | return total_len
18 |
19 | idxs = []
20 | xs = []
21 | answer_starts = []
22 | answer_ends = []
23 | indices = []
24 | questions = []
25 |
26 | for i in range(len(shared_data['x'])):
27 | print("On %s of %s" % (i, len(shared_data['x'])))
28 | for j in range(len(shared_data['x'][i])):
29 | cur_tokens = shared_data['x'][i][j][0]
30 | cur_text = " ".join(cur_tokens)
31 | cur_ans_starts, cur_ans_ends = spacy_tokenizer.extract_phrases(cur_text, 2)
32 | answer_starts.extend([str(ans) for ans in cur_ans_starts])
33 | answer_ends.extend([str(ans) for ans in cur_ans_ends])
34 | idxs.extend(range(len(idxs), len(idxs) + len(cur_ans_starts)))
35 | questions.extend([""] * len(cur_ans_starts))
36 | indices.extend([str(len(xs))] * len(cur_ans_starts))
37 | xs.append('\t'.join(cur_tokens))
38 |
39 | idxs = list(map(lambda idx: str(idx), idxs))
40 | utils.save_lines(idxs, '%s/ids.txt' % save_dir)
41 | utils.save_lines(questions, '%s/outputs.txt' % save_dir)
42 | utils.save_lines(answer_starts, '%s/answer_starts.txt' % save_dir)
43 | utils.save_lines(answer_ends, '%s/answer_ends.txt' % save_dir)
44 | utils.save_lines(xs, '%s/inputs.txt' % save_dir)
45 | utils.save_lines(indices, '%s/indices.txt' % save_dir)
46 |
47 |
48 | # Create squad dataset
49 | create_dataset(save_dir='../datasets/newsqa_unsupervised/',
50 | data_path='newsqa/data_train.json',
51 | shared_path='newsqa/shared_train.json')
52 |
53 | create_dataset(save_dir='../datasets/newsqa_unsupervised/train',
54 | data_path='newsqa/data_train.json',
55 | shared_path='newsqa/shared_train.json')
56 |
57 | create_dataset(save_dir='../datasets/newsqa_unsupervised/validation',
58 | data_path='newsqa/data_validation.json',
59 | shared_path='newsqa/shared_validation.json')
60 |
61 | create_dataset(save_dir='../datasets/newsqa_unsupervised/test',
62 | data_path='newsqa/data_test.json',
63 | shared_path='newsqa/shared_test.json')
64 |
65 |
--------------------------------------------------------------------------------
/bidaf/tree/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/bidaf/tree/__init__.py
--------------------------------------------------------------------------------
/bidaf/tree/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pprint import pprint
3 |
4 | import tensorflow as tf
5 |
6 | from tree.main import main as m
7 |
8 | flags = tf.app.flags
9 |
10 | flags.DEFINE_string("model_name", "tree", "Model name [tree]")
11 | flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]")
12 | flags.DEFINE_integer("run_id", 0, "Run ID [0]")
13 |
14 | flags.DEFINE_integer("batch_size", 128, "Batch size [128]")
15 | flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
16 | flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]")
17 | flags.DEFINE_integer("num_steps", 0, "Number of steps [0]")
18 | flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]")
19 | flags.DEFINE_integer("load_step", 0, "load step [0]")
20 | flags.DEFINE_integer("early_stop", 4, "early stop [4]")
21 |
22 | flags.DEFINE_string("mode", "test", "train | test | forward [test]")
23 | flags.DEFINE_boolean("load", True, "load saved data? [True]")
24 | flags.DEFINE_boolean("progress", True, "Show progress? [True]")
25 | flags.DEFINE_integer("log_period", 100, "Log period [100]")
26 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
27 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
28 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]")
29 |
30 | flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]")
31 |
32 | flags.DEFINE_integer("hidden_size", 32, "Hidden size [32]")
33 | flags.DEFINE_float("input_keep_prob", 0.5, "Input keep prob [0.5]")
34 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
35 | flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]")
36 | flags.DEFINE_float("wd", 0.0001, "Weight decay [0.001]")
37 | flags.DEFINE_bool("lower_word", True, "lower word [True]")
38 | flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
39 |
40 | flags.DEFINE_integer("word_count_th", 100, "word count th [100]")
41 | flags.DEFINE_integer("char_count_th", 500, "char count th [500]")
42 | flags.DEFINE_integer("sent_size_th", 64, "sent size th [64]")
43 | flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]")
44 | flags.DEFINE_integer("ques_size_th", 64, "ques size th [64]")
45 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
46 | flags.DEFINE_integer("tree_height_th", 16, "tree height th [16]")
47 |
48 |
49 | def main(_):
50 | config = flags.FLAGS
51 |
52 | config.out_dir = os.path.join("out", config.model_name, str(config.run_id).zfill(2))
53 |
54 | m(config)
55 |
56 | if __name__ == "__main__":
57 | tf.app.run()
58 |
--------------------------------------------------------------------------------
/bidaf/tree/graph_handler.py:
--------------------------------------------------------------------------------
1 | import json
2 | from json import encoder
3 | import os
4 |
5 | import tensorflow as tf
6 |
7 | from tree.evaluator import Evaluation
8 | from my.utils import short_floats
9 |
10 |
11 | class GraphHandler(object):
12 | def __init__(self, config):
13 | self.config = config
14 | self.saver = tf.train.Saver()
15 | self.writer = None
16 | self.save_path = os.path.join(config.save_dir, config.model_name)
17 |
18 | def initialize(self, sess):
19 | if self.config.load:
20 | self._load(sess)
21 | else:
22 | sess.run(tf.initialize_all_variables())
23 |
24 | if self.config.mode == 'train':
25 | self.writer = tf.train.SummaryWriter(self.config.log_dir, graph=tf.get_default_graph())
26 |
27 | def save(self, sess, global_step=None):
28 | self.saver.save(sess, self.save_path, global_step=global_step)
29 |
30 | def _load(self, sess):
31 | config = self.config
32 | if config.load_step > 0:
33 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
34 | else:
35 | save_dir = config.save_dir
36 | checkpoint = tf.train.get_checkpoint_state(save_dir)
37 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
38 | save_path = checkpoint.model_checkpoint_path
39 | print("Loading saved model from {}".format(save_path))
40 | self.saver.restore(sess, save_path)
41 |
42 | def add_summary(self, summary, global_step):
43 | self.writer.add_summary(summary, global_step)
44 |
45 | def add_summaries(self, summaries, global_step):
46 | for summary in summaries:
47 | self.add_summary(summary, global_step)
48 |
49 | def dump_eval(self, e, precision=2):
50 | assert isinstance(e, Evaluation)
51 | path = os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
52 | with open(path, 'w') as fh:
53 | json.dump(short_floats(e.dict, precision), fh)
54 |
55 |
--------------------------------------------------------------------------------
/bidaf/tree/templates/visualizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
19 |
20 |
23 |
24 | {{ title }}
25 |
26 |
27 | ID |
28 | Question |
29 | Answer |
30 | Paragraph |
31 |
32 | {% for row in rows %}
33 |
34 | {{ row.id }} |
35 |
36 | {% for qj in row.ques %}
37 | {{ qj }}
38 | {% endfor %}
39 | |
40 | {{ row.a }} |
41 |
42 |
43 | {% for xj, yj, y2j, ypj, yp2j in zip(row.para, row.y, row.y2, row.yp, row.yp2) %}
44 |
45 | {% for xjk, yjk, y2jk, ypjk in zip(xj, yj, y2j, ypj) %}
46 |
47 | {% if yjk or y2jk %}
48 | {{ xjk }}
49 | {% else %}
50 | {{ xjk }}
51 | {% endif %}
52 | |
53 | {% endfor %}
54 |
55 |
56 | {% for xjk, yp2jk in zip(xj, yp2j) %}
57 | - |
58 | {% endfor %}
59 |
60 | {% endfor %}
61 |
62 | |
63 |
64 | {% endfor %}
65 |
66 |
67 |
--------------------------------------------------------------------------------
/bidaf/tree/trainer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from tree.model import Model
4 |
5 |
6 | class Trainer(object):
7 | def __init__(self, config, model):
8 | assert isinstance(model, Model)
9 | self.config = config
10 | self.model = model
11 | self.opt = tf.train.AdagradOptimizer(config.init_lr)
12 | self.loss = model.get_loss()
13 | self.var_list = model.get_var_list()
14 | self.global_step = model.get_global_step()
15 | self.ema_op = model.ema_op
16 | self.summary = model.summary
17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
18 | opt_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
19 |
20 | # Define train op
21 | with tf.control_dependencies([opt_op]):
22 | self.train_op = tf.group(self.ema_op)
23 |
24 | def get_train_op(self):
25 | return self.train_op
26 |
27 | def step(self, sess, batch, get_summary=False):
28 | assert isinstance(sess, tf.Session)
29 | feed_dict = self.model.get_feed_dict(batch, True)
30 | if get_summary:
31 | loss, summary, train_op = \
32 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
33 | else:
34 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
35 | summary = None
36 | return loss, summary, train_op
37 |
--------------------------------------------------------------------------------
/bidaf/tree/visualizer.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from collections import OrderedDict
3 | import http.server
4 | import socketserver
5 | import argparse
6 | import json
7 | import os
8 | import numpy as np
9 | from tqdm import tqdm
10 |
11 | from jinja2 import Environment, FileSystemLoader
12 |
13 |
14 | def bool_(string):
15 | if string == 'True':
16 | return True
17 | elif string == 'False':
18 | return False
19 | else:
20 | raise Exception()
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--model_name", type=str, default='basic')
25 | parser.add_argument("--data_type", type=str, default='dev')
26 | parser.add_argument("--step", type=int, default=5000)
27 | parser.add_argument("--template_name", type=str, default="visualizer.html")
28 | parser.add_argument("--num_per_page", type=int, default=100)
29 | parser.add_argument("--data_dir", type=str, default="data/squad")
30 | parser.add_argument("--port", type=int, default=8000)
31 | parser.add_argument("--host", type=str, default="0.0.0.0")
32 | parser.add_argument("--open", type=str, default='False')
33 | parser.add_argument("--run_id", type=str, default="0")
34 |
35 | args = parser.parse_args()
36 | return args
37 |
38 |
39 | def _decode(decoder, sent):
40 | return " ".join(decoder[idx] for idx in sent)
41 |
42 |
43 | def accuracy2_visualizer(args):
44 | model_name = args.model_name
45 | data_type = args.data_type
46 | num_per_page = args.num_per_page
47 | data_dir = args.data_dir
48 | run_id = args.run_id.zfill(2)
49 | step = args.step
50 |
51 | eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6)))
52 | eval_ = json.load(open(eval_path, 'r'))
53 |
54 | _id = 0
55 | html_dir = "/tmp/list_results%d" % _id
56 | while os.path.exists(html_dir):
57 | _id += 1
58 | html_dir = "/tmp/list_results%d" % _id
59 |
60 | if os.path.exists(html_dir):
61 | shutil.rmtree(html_dir)
62 | os.mkdir(html_dir)
63 |
64 | cur_dir = os.path.dirname(os.path.realpath(__file__))
65 | templates_dir = os.path.join(cur_dir, 'templates')
66 | env = Environment(loader=FileSystemLoader(templates_dir))
67 | env.globals.update(zip=zip, reversed=reversed)
68 | template = env.get_template(args.template_name)
69 |
70 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
71 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
72 | data = json.load(open(data_path, 'r'))
73 | shared = json.load(open(shared_path, 'r'))
74 |
75 | rows = []
76 | for i, (idx, yi, ypi) in enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp')])):
77 | id_, q, rx = (data[key][idx] for key in ('ids', 'q', '*x'))
78 | x = shared['x'][rx[0]][rx[1]]
79 | ques = [" ".join(q)]
80 | para = [[word for word in sent] for sent in x]
81 | row = {
82 | 'id': id_,
83 | 'title': "Hello world!",
84 | 'ques': ques,
85 | 'para': para,
86 | 'y': yi,
87 | 'y2': yi,
88 | 'yp': ypi,
89 | 'yp2': ypi,
90 | 'a': ""
91 | }
92 | rows.append(row)
93 |
94 | if i % num_per_page == 0:
95 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
96 |
97 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
98 | var_dict = {'title': "Accuracy Visualization",
99 | 'rows': rows
100 | }
101 | with open(html_path, "wb") as f:
102 | f.write(template.render(**var_dict).encode('UTF-8'))
103 | rows = []
104 |
105 | os.chdir(html_dir)
106 | port = args.port
107 | host = args.host
108 | # Overriding to suppress log message
109 | class MyHandler(http.server.SimpleHTTPRequestHandler):
110 | def log_message(self, format, *args):
111 | pass
112 | handler = MyHandler
113 | httpd = socketserver.TCPServer((host, port), handler)
114 | if args.open == 'True':
115 | os.system("open http://%s:%d" % (args.host, args.port))
116 | print("serving at %s:%d" % (host, port))
117 | httpd.serve_forever()
118 |
119 |
120 | if __name__ == "__main__":
121 | ARGS = get_args()
122 | accuracy2_visualizer(ARGS)
--------------------------------------------------------------------------------
/data_loaders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/data_loaders/__init__.py
--------------------------------------------------------------------------------
/datasets/iob_test/label_vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9e6f95acefd53c7d31c37565bb19a57c8a5804bd8cbc903735fcc921eff7323d
3 | size 14
4 |
--------------------------------------------------------------------------------
/datasets/iob_test/test/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ed0525e09813b3b5c00dcf14d5ad24a046687f803862acd21443b3941bc2ff1a
3 | size 339
4 |
--------------------------------------------------------------------------------
/datasets/iob_test/test/labels.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:21c784fbc151c08fc6f6face1461c7f8c1d27b7d722cdec8a0925e44758ab3dc
3 | size 374
4 |
--------------------------------------------------------------------------------
/datasets/iob_test/train/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ed0525e09813b3b5c00dcf14d5ad24a046687f803862acd21443b3941bc2ff1a
3 | size 339
4 |
--------------------------------------------------------------------------------
/datasets/iob_test/train/labels.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:21c784fbc151c08fc6f6face1461c7f8c1d27b7d722cdec8a0925e44758ab3dc
3 | size 374
4 |
--------------------------------------------------------------------------------
/datasets/iob_test/validation/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ed0525e09813b3b5c00dcf14d5ad24a046687f803862acd21443b3941bc2ff1a
3 | size 339
4 |
--------------------------------------------------------------------------------
/datasets/iob_test/validation/labels.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:21c784fbc151c08fc6f6face1461c7f8c1d27b7d722cdec8a0925e44758ab3dc
3 | size 374
4 |
--------------------------------------------------------------------------------
/datasets/iob_test/vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8d364496ba649830b4e63d911f2e81b75aa3f21c5788a4e91be8fb2a3428a4c5
3 | size 40
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/test/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560
3 | size 876366
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/test/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc
3 | size 871721
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/test/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566
3 | size 1634129
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/test/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de
3 | size 1254429
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/test/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3
3 | size 1745239
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/train/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560
3 | size 876366
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/train/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc
3 | size 871721
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/train/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566
3 | size 1634129
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/train/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de
3 | size 1254429
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/train/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3
3 | size 1745239
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/validation/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560
3 | size 876366
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/validation/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc
3 | size 871721
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/validation/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566
3 | size 1634129
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/validation/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de
3 | size 1254429
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/validation/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3
3 | size 1745239
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3e7b26c130a8ba8be7bf34416bfcf472f01ab9ef641dea97f99fb7a1b2fe553c
3 | size 1006234
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised/word_embeddings.npy:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:07007189652bf43a0e3344fcab1080cb5ad6c96bb46bab61e6c5848f53213ffb
3 | size 264444080
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/test/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cd9afca6022b499bde4845e4238bda9e558c38b2f0fb531cc2b1701b2ccc3acf
3 | size 1825956
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/test/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:406c3ff9997978e5ebb39d17a93df6740bee12185bc7fe7caa4d9e79947c2ea0
3 | size 1819658
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/test/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:49aae5695d087c3e522b19198141dae59d33e9b86e8b317a32f039fa5fda79c5
3 | size 3370675
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/test/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:242a4e7181d6dd5b2adc5f7c33485ebae7895cb59de84f4f6214f3b60b6ddc74
3 | size 2502865
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/test/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3c27d065ee9912ac78f85b4fabe70ff79e6a8f43cc706c5a63881cc6261fa8d6
3 | size 3481785
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/train/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cd9afca6022b499bde4845e4238bda9e558c38b2f0fb531cc2b1701b2ccc3acf
3 | size 1825956
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/train/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:406c3ff9997978e5ebb39d17a93df6740bee12185bc7fe7caa4d9e79947c2ea0
3 | size 1819658
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/train/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:49aae5695d087c3e522b19198141dae59d33e9b86e8b317a32f039fa5fda79c5
3 | size 3370675
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/train/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:242a4e7181d6dd5b2adc5f7c33485ebae7895cb59de84f4f6214f3b60b6ddc74
3 | size 2502865
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/train/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3c27d065ee9912ac78f85b4fabe70ff79e6a8f43cc706c5a63881cc6261fa8d6
3 | size 3481785
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/validation/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cd9afca6022b499bde4845e4238bda9e558c38b2f0fb531cc2b1701b2ccc3acf
3 | size 1825956
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/validation/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:406c3ff9997978e5ebb39d17a93df6740bee12185bc7fe7caa4d9e79947c2ea0
3 | size 1819658
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/validation/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:49aae5695d087c3e522b19198141dae59d33e9b86e8b317a32f039fa5fda79c5
3 | size 3370675
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/validation/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:242a4e7181d6dd5b2adc5f7c33485ebae7895cb59de84f4f6214f3b60b6ddc74
3 | size 2502865
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/validation/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3c27d065ee9912ac78f85b4fabe70ff79e6a8f43cc706c5a63881cc6261fa8d6
3 | size 3481785
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_large/vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3e7b26c130a8ba8be7bf34416bfcf472f01ab9ef641dea97f99fb7a1b2fe553c
3 | size 1006234
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/test/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560
3 | size 876366
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/test/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc
3 | size 871721
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/test/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566
3 | size 1634129
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/test/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de
3 | size 1254429
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/test/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3
3 | size 1745239
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/train/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560
3 | size 876366
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/train/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc
3 | size 871721
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/train/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566
3 | size 1634129
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/train/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de
3 | size 1254429
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/train/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3
3 | size 1745239
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/train/predictions.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8b9c0f018ef8ffde83f03cd0bd0fc64699dcf2bc311d2e6f6c4a85a04c825f80
3 | size 11889229
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/validation/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:037af1302de3d0fa01ad348e3859528787a1197e43baf7b81c63f9141e1bb560
3 | size 876366
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/validation/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:84b6eefcf57226e557664dba3df4b92e5d6a36f41e443a40cb7f8cb0b2725cfc
3 | size 871721
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/validation/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:265789c92077c71a017dd1e6e1028e5a022ab34efddf091969dd2f6110d90566
3 | size 1634129
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/validation/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e75f8c969460d99eea735258e2f2f3655c277daa4c59145cdad907f9630e43de
3 | size 1254429
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/validation/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2bf4737b32f5adf2faddde48402bf66f5e75f88c06defb2f6809f3fa571dabf3
3 | size 1745239
4 |
--------------------------------------------------------------------------------
/datasets/newsqa_unsupervised_old/vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3e7b26c130a8ba8be7bf34416bfcf472f01ab9ef641dea97f99fb7a1b2fe553c
3 | size 1006234
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/test/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6392c376c4f872a61dcb12d065140b89519bb4832e3dff5dff814dfed667a14b
3 | size 34
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/test/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:87d6699635e7201069b652f774311aea81adf07963fbfcda3f0ff6dc948841e0
3 | size 399
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/test/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:aa65a846ec104385828b4f4a230ee3ebe0671c553369d264a680044e824e8da3
3 | size 464
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/train/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6392c376c4f872a61dcb12d065140b89519bb4832e3dff5dff814dfed667a14b
3 | size 34
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/train/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:87d6699635e7201069b652f774311aea81adf07963fbfcda3f0ff6dc948841e0
3 | size 399
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/train/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:aa65a846ec104385828b4f4a230ee3ebe0671c553369d264a680044e824e8da3
3 | size 464
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/validation/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6392c376c4f872a61dcb12d065140b89519bb4832e3dff5dff814dfed667a14b
3 | size 34
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/validation/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:87d6699635e7201069b652f774311aea81adf07963fbfcda3f0ff6dc948841e0
3 | size 399
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/validation/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:aa65a846ec104385828b4f4a230ee3ebe0671c553369d264a680044e824e8da3
3 | size 464
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b7ab460035a08214f7706daa086faa0147b84bdcf41084d76696f39d0c8a17e4
3 | size 47
4 |
--------------------------------------------------------------------------------
/datasets/question_generator/word_embeddings.npy:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:836c6845d79fe9afbb6ce4f04519490147b01a0804d2b6c12736b6de88eb795c
3 | size 40880
4 |
--------------------------------------------------------------------------------
/datasets/squad/test/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:76794bda0b4ce21b2d7d296979a3440e693f10b2d203ca9ce7cd6ddb44ab63cf
3 | size 32729
4 |
--------------------------------------------------------------------------------
/datasets/squad/test/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:79497c26f0078664229f9c4950535ac6349c2d55e5bf32b6832d2d386e8bbfb4
3 | size 32223
4 |
--------------------------------------------------------------------------------
/datasets/squad/test/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cc732368ef367f6f04f8efa42a79c6e130f34a88d0270c990f87ed9bcba0ad85
3 | size 264249
4 |
--------------------------------------------------------------------------------
/datasets/squad/test/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b58d8062546a48ab079ad3af87be637ed279192189e15baf1bd7800e6d1c92e1
3 | size 46137
4 |
--------------------------------------------------------------------------------
/datasets/squad/test/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e58b81a330876cc0b7f7fda075f99054514fc24aa8360ebd0ec55786c7a42dc9
3 | size 5104041
4 |
--------------------------------------------------------------------------------
/datasets/squad/test/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:31dfa0b3d1a8aa10e1b663d7abc1cf9db35b0fb0fd4f5a393f7aa8f1f6174ad8
3 | size 657565
4 |
--------------------------------------------------------------------------------
/datasets/squad/train/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:aa3fc91e476ee541944dc71f849cf974181663af5a6e7b48611f876067790644
3 | size 270983
4 |
--------------------------------------------------------------------------------
/datasets/squad/train/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:04f6da5479a495f05d07abec5902bfc1c4f6a10f155114823059141db3134ab0
3 | size 266277
4 |
--------------------------------------------------------------------------------
/datasets/squad/train/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f04ce115e64ab652c743fcf7b8bfcd2666a70ae58b9acb6148cc57c10ad1db45
3 | size 2189974
4 |
--------------------------------------------------------------------------------
/datasets/squad/train/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:33f7fb47d5657b5ae647dddb91cf064a5ab74cd16f392aad8d7a0e08146c8058
3 | size 472305
4 |
--------------------------------------------------------------------------------
/datasets/squad/train/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c09deb8e15b21d4f7b81afd1f07e52dbe3476ff0eec7809d1dd00a5e702e790e
3 | size 44472596
4 |
--------------------------------------------------------------------------------
/datasets/squad/train/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a349f3ab5037f45f997d178064165c6d8d8a60f672203540bd3fc804a7e05cc5
3 | size 5384930
4 |
--------------------------------------------------------------------------------
/datasets/squad/validation/answer_ends.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:aa3fc91e476ee541944dc71f849cf974181663af5a6e7b48611f876067790644
3 | size 270983
4 |
--------------------------------------------------------------------------------
/datasets/squad/validation/answer_starts.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:04f6da5479a495f05d07abec5902bfc1c4f6a10f155114823059141db3134ab0
3 | size 266277
4 |
--------------------------------------------------------------------------------
/datasets/squad/validation/ids.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f04ce115e64ab652c743fcf7b8bfcd2666a70ae58b9acb6148cc57c10ad1db45
3 | size 2189974
4 |
--------------------------------------------------------------------------------
/datasets/squad/validation/indices.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:33f7fb47d5657b5ae647dddb91cf064a5ab74cd16f392aad8d7a0e08146c8058
3 | size 472305
4 |
--------------------------------------------------------------------------------
/datasets/squad/validation/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7c67cb58a444013cc2ef584a46f1738a793c16cf24e226c93c490436b3ed57c9
3 | size 14281184
4 |
--------------------------------------------------------------------------------
/datasets/squad/validation/outputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a349f3ab5037f45f997d178064165c6d8d8a60f672203540bd3fc804a7e05cc5
3 | size 5384930
4 |
--------------------------------------------------------------------------------
/datasets/squad/vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8efc25e3ba7d0a4137fd27d0824166c0d80c8bd5b652c118541d1e4633bb1bd4
3 | size 1006231
4 |
--------------------------------------------------------------------------------
/datasets/squad/word_embeddings.npy:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:07007189652bf43a0e3344fcab1080cb5ad6c96bb46bab61e6c5848f53213ffb
3 | size 264444080
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/label_vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:606f0ee17c5d6a8218cf46208a6608f610fe3a84beb18ba4c24e94fd59bf5809
3 | size 38
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/test/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6d02ca31b2dd2be6b3d7f4208fc24cfa347743b673294090db085396d5314b99
3 | size 1642029
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/test/label_vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:606f0ee17c5d6a8218cf46208a6608f610fe3a84beb18ba4c24e94fd59bf5809
3 | size 38
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/test/labels.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7b79a2270314f3a85db52a5a09db919bb534bebe0eda7164f28f7dbae2bf0a60
3 | size 2431581
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/test/vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9e07f5f2c65709ae66fc2452cddbd5e73d9376faa100502aca0de37f0e643ad8
3 | size 245190
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/train/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:cf9bc9665878d7202115103d702f2c6667329bc62657b4f7746759032e065ab8
3 | size 14278820
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/train/labels.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:85732a286a29d505151910dbf27ec689a734d374eaaf0cce36aa33b17b497240
3 | size 21134049
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/validation/inputs.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6d02ca31b2dd2be6b3d7f4208fc24cfa347743b673294090db085396d5314b99
3 | size 1642029
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/validation/label_vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:606f0ee17c5d6a8218cf46208a6608f610fe3a84beb18ba4c24e94fd59bf5809
3 | size 38
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/validation/labels.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1e2d073ab6528d38d6d2ca74b48b0604090ce81b0bfb9da3515cdcdd3dacaa32
3 | size 2429470
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/validation/vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9e07f5f2c65709ae66fc2452cddbd5e73d9376faa100502aca0de37f0e643ad8
3 | size 245190
4 |
--------------------------------------------------------------------------------
/datasets/squad_iob/vocab.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:af38a3df050779170fb1ca1d2c42e7b9cbf02a09a01da5b13420984fb3e9e429
3 | size 1005904
4 |
--------------------------------------------------------------------------------
/dnn_units/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/dnn_units/__init__.py
--------------------------------------------------------------------------------
/helpers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/helpers/__init__.py
--------------------------------------------------------------------------------
/helpers/constants.py:
--------------------------------------------------------------------------------
1 | """
2 | Constants for easier reference
3 | """
4 |
5 | NLTK_DATA_PATH = 'pretrained_models/nltk'
6 | TOKENIZER_TEXTBLOB = 'TOKENIZER_TEXTBLOB'
7 | TOKENIZER_NLTK = 'TOKENIZER_NLTK'
8 | TOKENIZER_REGEX = 'TOKENIZER_REGEX'
9 | TOKENIZER_TWITTER = 'TOKENIZER_TWITTER'
10 | TOKENIZER_STANFORD_NLP = 'TOKENIZER_STANFORD_NLP'
11 | TOKENIZER_TAB = 'TOKENIZER_TAB'
12 | TOKENIZER_SPECIAL_DELIMITER = 'TOKENIZER_SPECIAL_DELIMITER'
13 | TOKENIZER_SPACE = ' '
14 |
15 | TRAIN_INDEX = 0
16 | VAL_INDEX = 1
17 | TEST_INDEX = 2
18 |
19 | TRAIN_MODE = 0
20 | TEST_MODE = 1
21 |
22 | MODEL_TYPE_LANGUAGE_MODEL = 'MODEL_TYPE_LANGUAGE_MODEL'
23 |
24 | WORD_LEVEL = 'WORD_LEVEL'
25 | CHAR_LEVEL = 'CHAR_LEVEL'
26 | WORD_CHAR_LEVEL = 'WORD_CHAR_LEVEL'  # Word embeddings combined with character-level LSTMs
27 | WORD_HASHING_LEVEL = 'WORD_HASHING_LEVEL'
28 | WORD_HASHING_CONSTANT = '%'
29 |
30 | DATASET_TRAIN = 'DATASET_TRAIN'
31 | DATASET_TEST = 'DATASET_TEST'
32 | DATASET_VALIDATION = 'DATASET_VALIDATION'
33 |
34 | GPU_MODE = 'GPU_MODE'
35 | CPU_MODE = 'CPU_MODE'
36 |
37 | PREPROCESS_TYPE_INCEPTION = 'PREPROCESS_TYPE_INCEPTION'
38 | PREPROCESS_TYPE_GOOGLENET = 'PREPROCESS_TYPE_GOOGLENET'
39 | PREPROCESS_TYPE_RESNET = 'PREPROCESS_TYPE_RESNET'
40 |
41 | PREPROCESS_TYPE_RESNET_50 = 'PREPROCESS_TYPE_RESNET_50'
42 | PREPROCESS_TYPE_RESNET_101 = 'PREPROCESS_TYPE_RESNET_101'
43 | PREPROCESS_TYPE_RESNET_152 = 'PREPROCESS_TYPE_RESNET_152'
44 |
45 | NETWORK_TYPE_INCEPTION = 'NETWORK_TYPE_INCEPTION'
46 | NETWORK_TYPE_GOOGLENET = 'NETWORK_TYPE_GOOGLENET'
47 | NETWORK_TYPE_RESNET = 'NETWORK_TYPE_RESNET'
48 |
49 | NETWORK_TYPE_RESNET_30 = 'NETWORK_TYPE_RESNET_30'
50 | NETWORK_TYPE_RESNET_50 = 'NETWORK_TYPE_RESNET_50'
51 | NETWORK_TYPE_RESNET_101 = 'NETWORK_TYPE_RESNET_101'
52 | NETWORK_TYPE_RESNET_152 = 'NETWORK_TYPE_RESNET_152'
53 |
54 | OPTIMIZER_RMSPROP = 'OPTIMIZER_RMSPROP'
55 | OPTIMIZER_ADAM = 'OPTIMIZER_ADAM'
56 | OPTIMIZER_SGD = 'OPTIMIZER_SGD'
57 |
58 | # Weight initializers (zero or uniform random)
59 | INITIALIZER_ZERO = 'INITIALIZER_ZERO'
60 | INITIALIZER_UNIFORM_RANDOM = 'INITIALIZER_UNIFORM_RANDOM'
61 |
62 | # Path types for loading vocab/embedding arrays
63 | PATH_NPY_ARRAY = 'PATH_NPY_ARRAY'
64 | PATH_TEXT_ARRAY = 'PATH_TEXT_ARRAY'
65 |
66 | # Predictor types
67 | PREDICTOR_TEXT_FIELD = 'TEXT_FIELD_PREDICTOR'
68 | PREDICTOR_SINGULAR_FIELD = 'SINGULAR_FIELD_PREDICTOR'
69 | PREDICTOR_CHARACTER = 'CHARACTER_PREDICTOR'
70 |
--------------------------------------------------------------------------------
/helpers/io_utils.py:
--------------------------------------------------------------------------------
1 | import zipfile
2 | import glob
3 | import datetime
4 | import os
5 | import urllib.request
6 | import shutil
7 | import os
8 | import pickle
9 |
10 | def pickle_save(data, save_path):
11 | print("Saving data to path %s" % save_path)
12 | save_file = open(save_path, 'wb')  # pickle requires binary mode
13 | pickle.dump(data, save_file)
14 | save_file.close()
15 |
16 | def pickle_load(load_path):
17 | print("Loading data from path %s" % load_path)
18 |
19 | load_file = open(load_path, 'rb')  # pickle requires binary mode
20 | data = pickle.load(load_file)
21 | load_file.close()
22 |
23 | print("Done loading data from path %s" % load_path)
24 | return data
25 |
26 | def get_subdirs(src_dir):
27 | return [os.path.join(src_dir, name) for name in os.listdir(src_dir) \
28 | if os.path.isdir(os.path.join(src_dir, name))]
29 |
30 | def copy_files(src_dir, dest_dir):
31 | for filename in glob.glob(os.path.join(src_dir, '*.*')):
32 | shutil.copy(filename, dest_dir)
33 |
34 | def copy_file(src_name, dest_name):
35 | shutil.copyfile(src_name, dest_name)
36 |
37 | def get_files(src_dir):
38 | """
39 | Gets all files from source directory
40 | """
41 | files = glob.glob(os.path.join(src_dir, '*.*'))
42 | return files
43 |
44 | def download_file(url, save_path):
45 | """ Downloads url to save_path """
46 | # urlretrieve fetches the URL and writes the response body to save_path
47 | urllib.request.urlretrieve(url, save_path)
48 |
49 | def check_dir(save_dir):
50 | """ Creates dir if not exists"""
51 | if not os.path.exists(save_dir):
52 | print("Directory %s does not exist, making it now" % save_dir)
53 | os.makedirs(save_dir)
54 | return False
55 | else:
56 | print("Directory %s exists, all good" % save_dir)
57 | return True
58 |
59 | def get_matching_files(regex):
60 | files = glob.glob(regex)
61 | return files
62 |
63 | def zip_files(file_list, save_path):
64 | print('creating archive into path %s' % save_path)
65 | zf = zipfile.ZipFile(save_path, mode='w')
66 |
67 | for f in file_list:
68 | print(f)
69 | zf.write(f)
70 | zf.close()
71 | print_info(save_path)
72 |
73 | def unzip_files(zip_path, directory_to_extract_to):
74 | print("Unzipping files from path %s to dir %s" \
75 | % (zip_path, directory_to_extract_to))
76 | zip_ref = zipfile.ZipFile(zip_path, 'r')
77 | zip_ref.extractall(directory_to_extract_to)
78 | zip_ref.close()
79 |
80 | def print_info(archive_name):
81 | zf = zipfile.ZipFile(archive_name)
82 | for info in zf.infolist():
83 | print(info.filename)
84 | print('\tComment:\t', info.comment)
85 | print('\tModified:\t', datetime.datetime(*info.date_time))
86 | print('\tSystem:\t\t', info.create_system, '(0 = Windows, 3 = Unix)')
87 | print('\tZIP version:\t', info.create_version)
88 | print('\tCompressed:\t', info.compress_size, 'bytes')
89 | print('\tUncompressed:\t', info.file_size, 'bytes')
90 | print()
91 |
92 |
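A minimal usage sketch for these helpers; the paths and the saved dictionary below are illustrative assumptions, not files in this repository:

    # Hypothetical usage of helpers/io_utils.py; all paths are illustrative.
    from helpers import io_utils

    io_utils.check_dir('outputs/run_01')                        # creates the directory if it is missing
    io_utils.pickle_save({'step': 100, 'loss': 1.23}, 'outputs/run_01/state.pkl')
    state = io_utils.pickle_load('outputs/run_01/state.pkl')    # round-trips the saved dict

    files = io_utils.get_matching_files('outputs/run_01/*.pkl')
    io_utils.zip_files(files, 'outputs/run_01.zip')             # writes the archive and prints its contents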
--------------------------------------------------------------------------------
/helpers/logger.py:
--------------------------------------------------------------------------------
1 | class FileLogger(object):
2 | """ Simple logger to insert stuff into a file """
3 | def __init__(self, path):
4 | self.file = open(path, 'w')
5 |
6 | def write(self, text, print_text=False):
7 | if print_text:
8 | print("FILE LOGGER: %s" % text)
9 | self.file.write(text + "\n")
10 | self.file.flush()
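A short usage sketch; the log path and message are hypothetical:

    # Hypothetical usage of helpers/logger.py; path and message are illustrative.
    from helpers.logger import FileLogger

    logger = FileLogger('logs/train_run_01.log')
    logger.write('epoch 1: loss 2.31', print_text=True)   # echoed to stdout and flushed to the file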
--------------------------------------------------------------------------------
/helpers/proc_wrapper.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import time
3 | import re
4 | from subprocess import Popen
5 | import urllib2, urllib
6 | import json
7 | import httplib
8 | import os
9 | import urlparse
10 | from optparse import OptionParser
11 |
12 | # Wraps a spawned process: reads its stdout for job metadata and posts any stderr to a remote error-log endpoint
13 | def spawner(cmd_list):
14 | print("Spawning process")
15 | p = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
16 |
17 | print("Done spawning")
18 | # Parameters needed for logging
19 | job_id = -1
20 | job_id_got = False
21 |
22 | job_endpoint = 'NONE'
23 | job_endpoint_got = False
24 |
25 | error_log_endpoint = 'NONE'
26 | error_log_endpoint_got = False
27 |
28 | weird_constant = '\x1b[0m'  # ANSI reset code that the child process appends to every line
29 |
30 | while True:
31 | print("Reading line out")
32 | line = p.stdout.readline()
33 | print(line)
34 | if line != '':
35 | #the real code does filtering here
36 | stripped_line = line.rstrip()
37 | stripped_line = stripped_line.replace(weird_constant, '')
38 |
39 | print('STDOUT ' + stripped_line)
40 | if not job_id_got:
41 | print("Got job id")
42 | job_id_got, job_id = get_job_id(stripped_line)
43 |
44 | if not job_endpoint_got:
45 | print("Got job endpoint")
46 | job_endpoint_got, job_endpoint = get_job_endpoint(stripped_line)
47 |
48 | if not error_log_endpoint_got:
49 | print("Getting error log endpoint")
50 | error_log_endpoint_got, error_log_endpoint = get_error_log_endpoint(stripped_line)
51 | else:
52 | break
53 |
54 | stdout, stderr = p.communicate()
55 |
56 | if stderr is not None and stderr.strip() != '':  # only report if the child actually wrote to stderr
57 | # Job url: http://api_endpoint/api/job/id/
58 | # job_endpoint: http://api_endpoint/api/job/
59 |
60 | print('STDERR ' + stderr)
61 | job_url = "{0}{1}{2}".format(job_endpoint, int(job_id), '/')
62 |
63 | print("JOB URL IS " + job_url)
64 | data = {}
65 | data['job'] = job_url
66 | data['text'] = stderr
67 |
68 | send_post_message(error_log_endpoint.rstrip(), data)
69 | else:
70 | print("Job finished successfully.")
71 |
72 | def send_post_message(url, data):
73 | # split url into base and relative path
74 | result = urlparse.urlparse(url)
75 | base_path = result.netloc
76 | relative_path = result.path
77 | encoded_data = urllib.urlencode(data)
78 |
79 | h = httplib.HTTPConnection(base_path)
80 |
81 | headers = {"Content-type": "application/x-www-form-urlencoded"}
82 |
83 | h.request('POST', '/api/v1/error_log/', encoded_data, headers)
84 |
85 | r = h.getresponse()
86 | print(r.read())
87 |
88 |
89 | def get_job_id(message):
90 | m = re.search('(?<=job with id )[0-9]*', message)
91 | if m is not None:
92 | job_id = m.group(0)
93 | print("Got job id %s" % job_id)
94 | return True, job_id
95 | else:
96 | return False, -1
97 |
98 | def get_job_endpoint(message):
99 | m = re.search('(?<=Job endpoint ).*', message)
100 | if m is not None:
101 | print("Got job endpoint " + message)
102 | job_endpoint = m.group(0)
103 | print("Job endpoint %s" % job_endpoint)
104 | return True, job_endpoint
105 | else:
106 | return False, ''
107 |
108 | def get_error_log_endpoint(message):
109 | m = re.search('(?<=Error log endpoint ).*', message)
110 | if m is not None:
111 | print("Got error log endpoint")
112 | error_endpoint = m.group(0)
113 | print("Endpoint is %s" % error_endpoint)
114 | return True, error_endpoint
115 | else:
116 | return False, ''
117 |
118 | if __name__ == '__main__':
119 | print("Starting stuff")
120 | parser = OptionParser()
121 | parser.add_option("--command", "--command", dest="command",
122 | help="Command to execute")
123 |
124 | parser.add_option("--args", "--args", dest="args",
125 | help="Args for command")
126 |
127 | (options, args) = parser.parse_args()
128 |
129 | split_args = options.args.split(' ')
130 |
131 | concatenated_args = [options.command]
132 | concatenated_args.extend(split_args)
133 | spawner(concatenated_args)
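As a sketch of how the wrapper is driven, spawner() takes the command and its arguments as one list; the wrapped training command below is an assumption, and the module itself targets Python 2 (urllib2/httplib imports):

    # Hypothetical direct call to spawner(); the child command is illustrative.
    from helpers.proc_wrapper import spawner

    spawner(['python3', 'train.py', '--dataset', 'squad'])

The __main__ block builds the same list from the --command and --args options, e.g. --command python3 --args "train.py --dataset squad".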
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pip install tqdm
3 | pip install unidecode
4 | pip install textblob
5 | pip3 install tqdm
6 | pip3 install unidecode
7 | pip3 install textblob
8 | pip3 install http://download.pytorch.org/whl/cu80/torch-0.1.12.post2-cp35-cp35m-linux_x86_64.whl
9 | export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.12.1-cp35-cp35m-linux_x86_64.whl
10 | sudo pip3 install --upgrade pip
11 | sudo pip3 install --upgrade $TF_BINARY_URL
12 | sudo pip install tensorflow-gpu
13 | pip install spacy && python -m spacy download en
14 | pip3 install spacy && python3 -m spacy download en
15 | cd bidaf
16 | ./download.sh
17 | cd ../
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/iob/logs/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/iob/logs/README.md
--------------------------------------------------------------------------------
/logs/results/newsqa/evaluate.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | unanswered = 0
57 | for i in range(0, len(dataset['answerss'])):
58 | question_id = str(dataset['ids'][i])
59 | question = dataset['q'][i]
60 | ground_truths = dataset['answerss'][i]
61 | if len(ground_truths) == 0:
62 | ground_truths = [""]
63 | #total += 1
64 |
65 |
66 | if question_id not in predictions:
67 | unanswered += 1
68 | message = 'Unanswered question ' + str(question_id) + \
69 | ' will receive score 0.'
70 | print(message, file=sys.stderr)
71 | continue
72 | total += 1
73 | prediction = predictions[question_id]
74 | exact_match += metric_max_over_ground_truths(
75 | exact_match_score, prediction, ground_truths)
76 | f1 += metric_max_over_ground_truths(
77 | f1_score, prediction, ground_truths)
78 |
79 | exact_match = 100.0 * exact_match / total
80 | f1 = 100.0 * f1 / total
81 | print("Number unanswered %s" % unanswered)
82 |
83 | return {'exact_match': exact_match, 'f1': f1}
84 |
85 |
86 | if __name__ == '__main__':
87 | expected_version = '1.1'
88 | parser = argparse.ArgumentParser(
89 | description='Evaluation for SQuAD ' + expected_version)
90 | parser.add_argument('dataset_file', help='Dataset file')
91 | parser.add_argument('prediction_file', help='Prediction File')
92 | args = parser.parse_args()
93 | with open(args.dataset_file) as dataset_file:
94 | dataset = json.load(dataset_file)
95 | with open(args.prediction_file) as prediction_file:
96 | predictions = json.load(prediction_file)
97 | print(json.dumps(evaluate(dataset, predictions)))
98 |
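For intuition, a small worked example of the token-overlap F1 this script computes; the strings are illustrative:

    # f1_score("The cat sat.", "A cat sat on the mat"):
    #   normalized prediction tokens:   ['cat', 'sat']
    #   normalized ground-truth tokens: ['cat', 'sat', 'on', 'mat']
    #   overlap = 2 -> precision = 2/2 = 1.0, recall = 2/4 = 0.5
    #   F1 = 2 * 1.0 * 0.5 / (1.0 + 0.5) = 0.667, while exact_match is False
    print(f1_score("The cat sat.", "A cat sat on the mat"))           # ~0.667
    print(exact_match_score("The cat sat.", "A cat sat on the mat"))  # False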
--------------------------------------------------------------------------------
/logs/results/script.sh:
--------------------------------------------------------------------------------
1 | # Evaluate out of domain baseline
2 | echo "out of domain baseline"
3 | python3 newsqa/evaluate.py newsqa/data_test.json answer_out_of_domain_baseline.json
4 |
5 | # Evaluate single model results (for steps 2k-10k)
6 |
7 | # Evaluate single model results + baseline (for steps 2k-10k)
8 |
9 | # Evaluate single-model result (44.5 F1)
10 | echo "A_(gen + Ner)"
11 | python3 newsqa/evaluate.py newsqa/data_test.json single_model.json
12 |
13 | # Evaluate two-model result (45.6 F1)
14 | echo "Double model result-2 A_(gen + ner) + A_ner"
15 | python3 newsqa/evaluate.py newsqa/data_test.json double_model.json
16 |
17 | # Evaluate AOracle + Context
18 | echo "Answer oracle with context for question generation, single model"
19 | python3 newsqa/evaluate.py newsqa/data_test.json context_aoracle.json
20 |
21 | echo "Single BiDAF model finetuned on NewsQA 4k steps"
22 | python3 newsqa/evaluate.py newsqa/data_test.json "single_model_results_44.json"
23 |
24 | echo "Single BiDaf finetuned on NewsQA 4k steps ensembled w. baseline results"
25 | python3 newsqa/evaluate.py newsqa/data_test.json "single_model_result_run_44_with_baseline.json"
26 |
27 | # Evaluate single model result of BiDAF finetuned on NewsQA
28 | echo "Single BiDAF model finetuned on NewsQA results"
29 | for num in 42 43 44 45 46 47 48 49; do
30 | python3 newsqa/evaluate.py newsqa/data_test.json "single_model_results_${num}.json"
31 | done
32 |
33 | echo "Single BiDAF model finetuned on NewsQA ensembled w. baseline results"
34 | # Evaluate single model ensembled with baseline result of BiDAF finetuned on NewsQA
35 | for num in 42 43 44 45 46 47 48 49; do
36 | python3 newsqa/evaluate.py newsqa/data_test.json "single_model_result_run_${num}_with_baseline.json"
37 | done
38 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/models/__init__.py
--------------------------------------------------------------------------------
/models/iob/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/models/iob/__init__.py
--------------------------------------------------------------------------------
/models/language_wrapper.py:
--------------------------------------------------------------------------------
1 | from models.language_model import LanguageModel, LanguageDiscriminator
2 | from helpers import vocab
3 | import numpy as np
4 | import torch
5 | from torch.autograd import variable
6 |
7 | class LanguageWrapper(object):
8 | def __init__(self, model, vocab):
9 | self.model = model
10 | self.vocab = vocab
11 | self.create_discriminator()
12 |
13 | def create_discriminator(self):
14 | # Hack: build the discriminator by reusing the model's internal
15 | # components directly, which bypasses the intended interface boundaries.
16 | embeddings = self.model.embedder
17 | text_field_predictor = self.model.text_field_predictor
18 | base_lstm = self.model.base_lstm
19 |
20 | self.discriminator = LanguageDiscriminator(self.model.config,
21 | embeddings, text_field_predictor, base_lstm).cuda()
22 |
23 | def get_discriminator(self):
24 | return self.discriminator
25 |
26 | def get_model(self):
27 | return self.model
28 |
29 | def predict(self, context_tokens, answer_features, max_length, pad=False):
30 | input_token = variable.Variable(torch.LongTensor([[self.vocab.start_index]])).cuda()
31 | end_token = torch.LongTensor([[self.vocab.end_index]]).cuda()
32 | context_tokens = variable.Variable(torch.LongTensor(context_tokens)).cuda()
33 | answer_features = variable.Variable(torch.from_numpy(answer_features)).cuda()
34 |
35 | predictions = self.model.predict(input_token=input_token,
36 | context_tokens=context_tokens,
37 | end_token=end_token,
38 | answer_features=answer_features,
39 | max_length=max_length)
40 |
41 | if pad:
42 | pad_token = variable.Variable(torch.LongTensor([self.vocab.pad_index]).cuda())
43 | while len(predictions) < max_length:
44 | predictions.append(pad_token)
45 |
46 | stacked_predictions = torch.stack(predictions, 0)
47 | tokens = self.get_tokens_single(stacked_predictions.cpu())
48 | sentence = " ".join(tokens)
49 | return sentence, stacked_predictions
50 |
51 | def get_tokens_single(self, predictions):
52 | numpy_predictions = torch.squeeze(predictions).data.numpy()
53 | tokens = self.vocab.tokens(numpy_predictions)
54 | return tokens
55 |
56 | def get_tokens(self, predictions):
57 | numpy_predictions = torch.squeeze(predictions).data.numpy()
58 | tokens = self.vocab.tokens_list(numpy_predictions)
59 | return tokens
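A rough usage sketch, assuming a trained LanguageModel, a vocab object, and a CUDA device are already available; the variable names and the source of context_tokens/answer_features are assumptions, not defined in this file:

    # Hypothetical usage of LanguageWrapper; model, vocab and the input tensors are
    # assumed to come from the training / data-loading code elsewhere in the repo.
    from models.language_wrapper import LanguageWrapper

    wrapper = LanguageWrapper(model, vocab)        # model: trained LanguageModel, vocab: helpers vocab instance
    discriminator = wrapper.get_discriminator()    # LanguageDiscriminator sharing the model's weights

    # context_tokens / answer_features come from a data-loader batch; shapes depend on the model.
    sentence, predictions = wrapper.predict(context_tokens, answer_features, max_length=30, pad=True)
    print(sentence)                                # generated question as a whitespace-joined string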
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt.zip
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/czech.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/czech.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/danish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/danish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/dutch.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/dutch.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/english.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/english.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/estonian.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/estonian.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/finnish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/finnish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/french.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/french.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/german.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/german.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/greek.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/greek.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/italian.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/italian.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/norwegian.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/norwegian.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/polish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/polish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/portuguese.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/portuguese.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/slovene.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/slovene.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/spanish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/spanish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/swedish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/swedish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/PY3/turkish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/PY3/turkish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/czech.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/czech.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/danish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/danish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/dutch.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/dutch.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/estonian.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/estonian.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/finnish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/finnish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/french.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/french.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/german.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/german.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/italian.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/italian.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/norwegian.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/norwegian.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/polish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/polish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/portuguese.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/portuguese.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/slovene.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/slovene.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/spanish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/spanish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/swedish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/swedish.pickle
--------------------------------------------------------------------------------
/pretrained_models/nltk/tokenizers/punkt/turkish.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/nltk/tokenizers/punkt/turkish.pickle
--------------------------------------------------------------------------------
/pretrained_models/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/pretrained_models/scripts/__init__.py
--------------------------------------------------------------------------------
/pretrained_models/scripts/create_glove_embeddings.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m pretrained_models.scripts.download_glove_embeddings
3 | python -m pretrained_models.scripts.transfer_glove_embeddings --path 'pretrained_models/word_embeddings/glove/glove.840B.300d.txt' \
4 | --save_word_path 'datasets/squad/vocab.txt' \
5 | --save_embeddings_path 'datasets/squad/word_embeddings.npy'
6 |
7 | python -m pretrained_models.scripts.transfer_glove_embeddings --path 'pretrained_models/word_embeddings/glove/glove.840B.300d.txt' \
8 | --save_word_path 'datasets/newsqa_unsupervised/vocab.txt' \
9 | --save_embeddings_path 'datasets/newsqa_unsupervised/word_embeddings.npy'
10 |
11 | python -m pretrained_models.scripts.transfer_glove_embeddings --path 'pretrained_models/word_embeddings/glove/glove.840B.300d.txt' \
12 | --save_word_path 'datasets/newsqa_unsupervised_large/vocab.txt' \
13 | --save_embeddings_path 'datasets/newsqa_unsupervised_large/word_embeddings.npy'
--------------------------------------------------------------------------------
/pretrained_models/scripts/download_glove_embeddings.py:
--------------------------------------------------------------------------------
1 | """
2 | Downloads the following:
3 | - Stanford parser
4 | - Stanford POS tagger
5 | - Glove vectors
6 | - SICK dataset (semantic relatedness task)
7 | - Stanford Sentiment Treebank (sentiment classification task)
8 | """
9 |
10 | from __future__ import print_function
11 | import urllib2
12 | import sys
13 | import os
14 | import shutil
15 | import zipfile
16 | import gzip
17 | from helpers import io_utils
18 |
19 | def download(url, dirpath):
20 | io_utils.check_dir(dirpath)
21 | filename = url.split('/')[-1]
22 | filepath = os.path.join(dirpath, filename)
23 | try:
24 | u = urllib2.urlopen(url)
25 | except:
26 | print("URL %s failed to open" %url)
27 | raise Exception
28 | try:
29 | f = open(filepath, 'wb')
30 | except:
31 | print("Cannot write %s" %filepath)
32 | raise Exception
33 | try:
34 | filesize = int(u.info().getheaders("Content-Length")[0])
35 | except:
36 | print("URL %s failed to report length" %url)
37 | raise Exception
38 | print("Downloading: %s Bytes: %s" % (filename, filesize))
39 |
40 | downloaded = 0
41 | block_sz = 8192
42 | status_width = 70
43 | while True:
44 | buf = u.read(block_sz)
45 | if not buf:
46 | print('')
47 | break
48 | else:
49 | print('', end='\r')
50 | downloaded += len(buf)
51 | f.write(buf)
52 | status = (("[%-" + str(status_width + 1) + "s] %3.2f%%") %
53 | ('=' * int(float(downloaded) / filesize * status_width) + '>', downloaded * 100. / filesize))
54 | print(status, end='')
55 | sys.stdout.flush()
56 | f.close()
57 | return filepath
58 |
59 | def unzip(filepath):
60 | print("Extracting: " + filepath)
61 | dirpath = os.path.dirname(filepath)
62 | with zipfile.ZipFile(filepath) as zf:
63 | zf.extractall(dirpath)
64 | os.remove(filepath)
65 |
66 | def download_wordvecs(dirpath):
67 | url = 'http://www-nlp.stanford.edu/data/glove.840B.300d.zip'
68 | unzip(download(url, dirpath))
69 |
70 | def create_glove_vocab(dirpath):
71 | glove_path = os.path.join(dirpath, 'glove.840B.300d.txt')
72 | with open(glove_path) as f:
73 | line = f.readline().split(' ')
74 | word = line[0]
75 | vecs = map(lambda item: float(item), line[1:])
76 | print(word)
77 | print(vecs)
78 |
79 | if __name__ == '__main__':
80 | base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
81 |
82 | # data
83 | data_dir = os.path.join(base_dir, 'word_embeddings')
84 | wordvec_dir = os.path.join(data_dir, 'glove')
85 |
86 | # download dependencies
87 | download_wordvecs(wordvec_dir)
88 |
89 | # sanity-check the downloaded embeddings by printing the first word and vector
90 | create_glove_vocab(wordvec_dir)
--------------------------------------------------------------------------------
/pretrained_models/scripts/transfer_glove_embeddings.py:
--------------------------------------------------------------------------------
1 | from helpers import utils
2 | from helpers import constants
3 | from helpers.vocab import Vocab
4 |
5 | import numpy as np
6 |
7 | from optparse import OptionParser
8 |
9 | parser = OptionParser()
10 | parser.add_option("--path", "--path", dest="path",
11 | help="Path to save words from")
12 | parser.add_option("--save_word_path", "--save_word_path", dest="save_word_path",
13 | help="Where to save vocab")
14 | parser.add_option("--save_embeddings_path", "--save_embeddings_path", dest="save_embeddings_path",
15 | default = 'save_embeddings_path', help="Where to save embeddings to")
16 | (options, args) = parser.parse_args()
17 |
18 | path = options.path
19 | save_word_path = options.save_word_path
20 | save_embeddings_path = options.save_embeddings_path
21 | # the embedding matrix itself is written later via utils.save_matrix
22 |
23 | # Test loading it into vocab
24 | vocab = Vocab(vocab_type=constants.WORD_LEVEL, add_start_end=True)
25 | vocab.init_from_path(path=save_word_path)
26 |
27 | token_to_embedding = dict()
28 | num_items = []
29 | original_item = []
30 | original_embedding = []
31 | cur_index = 0
32 | def read_line(line):
33 | items = line.strip().split(' ')
34 | embed_size = len(items) - 1 # First item is word
35 | cur_index = len(num_items)
36 |
37 | word = items[0]
38 | embedding_vector = items[1:]
39 | if cur_index % 100 == 0:
40 | print(cur_index)
41 | if len(original_item) == 0:
42 | original_item.append(word)
43 | original_embedding.append(map(lambda vec: float(vec), embedding_vector))
44 | num_items.append(embed_size)
45 |
46 | if word in vocab.token_to_idx:
47 | token_to_embedding[word] = map(lambda vec: float(vec), embedding_vector)
48 |
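# read_line keeps an embedding only for words already in the vocabulary, records the
# embedding width of every line in num_items, and stashes the first (word, vector)
# pair so the saved matrix can be spot-checked at the end of the script.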
49 | print("Reading embeddings")
50 | # Read in raw embeddings
51 | utils.read_lines_with_func(func=read_line, path=path)
52 |
53 | print("Done reading embeddings, now creating save matrix")
54 | original_embedding_size = num_items[0]
55 | word_embedding_matrix = []
56 | num_items_saved = 0
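# Build the matrix in vocabulary index order; tokens without a GloVe entry keep an all-zero vector.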
57 | for i in range(0, vocab.size()):
58 | if i % 100 == 0:
59 | print("On index %s from %s" % (i, vocab.size()))
60 | cur_token = vocab.token(i)
61 | embedding_vector = np.zeros(original_embedding_size)
62 | if cur_token in token_to_embedding:
63 | embedding_vector = token_to_embedding[cur_token]
64 | num_items_saved = num_items_saved + 1
65 | word_embedding_matrix.append(embedding_vector)
66 |
67 | utils.save_matrix(matrix=word_embedding_matrix, path=save_embeddings_path)
68 | vocab.init_embeddings(embeddings_path=save_embeddings_path, path_type=constants.PATH_NPY_ARRAY)
69 |
70 | print("Saved %s of %s tokens" % (num_items_saved, vocab.size()))
71 | print("Testing embeddings for original token %s" % original_item)
72 | index = vocab.index(original_item[0])
73 | embedding = vocab.embeddings[index]
74 |
75 | print(index)
76 | print(embedding)
77 |
78 | tmp = np.array(original_embedding[0])
79 | diff = embedding - tmp
80 | print(diff)
81 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requirements.txt
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/tests/__init__.py
--------------------------------------------------------------------------------
/tests/gather_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | input = torch.LongTensor([[1, 2], [3, 4], [5,6]])
3 | dim = 0
4 | index = torch.LongTensor([[1, 0], [2, 1]])  # index must have the same number of dims as input
5 | res = torch.gather(input, dim, index)
6 | print(res)
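# With dim=0, gather selects res[i][j] = input[index[i][j]][j],
# so the index above should give [[3, 2], [5, 4]].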
--------------------------------------------------------------------------------
/tests/iob_loader_test.py:
--------------------------------------------------------------------------------
1 | from data_loaders.iob_loader import IOBLoader
2 | from helpers import constants
3 |
4 | base_directory = 'datasets/iob_test'
5 |
6 | tmp = IOBLoader(base_directory, tokenizer_type=constants.TOKENIZER_NLTK)
7 | tmp.mix_indices()
8 |
9 | batch = tmp.get_batch(constants.DATASET_TRAIN, 2)
10 |
11 | print(batch)
--------------------------------------------------------------------------------
/tests/iob_test.py:
--------------------------------------------------------------------------------
1 | from models.iob.iob_model import IOBModel
2 | import numpy as np
3 |
4 | config = {
5 | 'input_max_length': 20,
6 | 'vocab_size': 10,
7 | 'embeddings_size': 25,
8 | 'hidden_size': 30,
9 | 'out_size': 5,
10 | 'num_classes': 3,
11 | 'batch_size': 5,
12 | 'learning_rate': 1e-2
13 | }
14 |
15 | model = IOBModel(config)
16 | inputs = np.random.random_integers(0, config['vocab_size'] - 1,
17 | size=[config['batch_size'], config['input_max_length']])
18 | input_lengths = np.ones((config['batch_size']), dtype=np.int32) * 1
19 | input_masks = np.ones((config['batch_size'], config['input_max_length']), dtype=np.int32)
20 |
21 | for i in range(0, config['batch_size']):
22 | cur_input_length = input_lengths[i]
23 | input_masks[i][cur_input_length:] = 0
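# input_masks now holds 1s for positions within each sequence's length and 0s for padding.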
24 |
25 | labels = np.random.random_integers(0, config['num_classes'] - 1,
26 | size=[config['batch_size'], config['input_max_length']])
27 |
28 | batch = { 'inputs': inputs,
29 | 'input_lengths': input_lengths,
30 | 'input_masks': input_masks,
31 | 'labels': labels
32 | }
33 |
34 | for i in range(0, 100):
35 | loss, predictions = model.forward(batch)
36 | print(loss)
37 | print(predictions)
38 | print(labels)
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/tests/iob_trainer_test.py:
--------------------------------------------------------------------------------
1 | from data_loaders.iob_loader import IOBLoader
2 | from models.iob.iob_model import IOBModel
3 | from helpers import constants, utils
4 |
5 | base_directory = 'datasets/iob_test'
6 |
7 | data_loader = IOBLoader(base_directory, tokenizer_type=constants.TOKENIZER_SPACE)
8 | data_loader.mix_indices()
9 |
10 |
11 | config = {
12 | 'input_max_length': data_loader.input_max_length,
13 | 'vocab_size': data_loader.vocab.size(),
14 | 'embeddings_size': 25,
15 | 'hidden_size': 30,
16 | 'out_size': 5,
17 | 'num_classes': data_loader.label_vocab.size(),
18 | 'batch_size': 3,
19 | 'learning_rate': 1e-2,
20 | 'save_path': 'iob/logs'}
21 |
22 | config_path = 'iob/logs/config.json'
23 | params_path = 'iob/logs/model_params.ckpt'
24 |
25 | model = IOBModel(config, embeddings=None)
26 | model.save(config_path, params_path)
27 | model.restore(params_path)
28 |
29 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
30 |
31 | for i in range(0, 100):
32 | while batch is not None:
33 | loss, predictions = model.forward(batch)
34 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
35 | print(predictions)
36 | print(loss)
37 |
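# Every third epoch, decode the full test split and write the predicted label sequences to disk.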
38 | if i % 3 == 0:
39 | data_loader.reset_indices()
40 | total_predictions = []
41 | while True:
42 | batch = data_loader.get_batch(constants.DATASET_TEST, config['batch_size'])
43 | if batch is None:
44 | break
45 | predictions = model.predict(batch)
46 | texts = data_loader.label_vocab.tokens_list(predictions)
47 | for j in range(0, len(texts)):  # separate index so the epoch counter i is not clobbered
48 | cur_input_length = batch['input_lengths'][j]
49 | cur_text = texts[j]
50 | text_str = " ".join(cur_text[0:cur_input_length])
51 | total_predictions.append(text_str)
52 | utils.save_lines(total_predictions, \
53 | '%s/predictions_test_%s.txt' % (config['save_path'], i))
54 |
55 | data_loader.mix_indices()
56 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
57 |
58 |
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/tests/language_model_memory_test.py:
--------------------------------------------------------------------------------
1 | from models.language_model import LanguageModel
2 | import torch
3 | from torch import nn
4 | from torch import optim
5 | from torch.autograd import variable
6 | from helpers import torch_utils
7 |
8 | config = {}
9 | config['vocab_size'] = 110000
10 | config['hidden_size'] = 150
11 | config['embedding_size'] = 100
12 | config['num_layers'] = 1
13 | config['dropout'] = 0.0
14 | config['batch_first'] = False
15 | config['use_pretrained_embeddings'] = False
16 | config['finetune_embeddings'] = True
17 |
18 | language_model = LanguageModel(config).cuda()
19 |
20 | # contexts: context_length x batch_size
21 | # inputs: input_length x batch_size
22 | # desired_inputs: input_length x batch_size
23 |
24 |
25 | optimizer = optim.Adam(language_model.parameters(), lr=3e-2)
26 | criterion = nn.NLLLoss()
27 |
28 | for i in range(0, 1000):
29 | optimizer.zero_grad()
30 | inputs = variable.Variable(torch.LongTensor([[1, 2, 3, 4, 5, 6, 7]] * 100)).cuda()
31 | contexts = variable.Variable(torch.LongTensor([[4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10]])).cuda()
32 | context_masks = variable.Variable(torch.FloatTensor([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])).cuda()
33 | desired_inputs = variable.Variable(torch.LongTensor([[1, 2, 3, 4, 5, 6, 7]] * 100)).cuda()
34 | input_masks = variable.Variable(torch.FloatTensor([[1, 1, 1, 1, 1, 1, 1]] * 100)).cuda()
35 | answer_features = variable.Variable(torch.LongTensor([[4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10], [4, 5, 6, 7, 8, 9, 10]])).cuda()
36 | print("On index %s" % i)
37 |
38 | optimizer.zero_grad()
39 | language_probs = language_model.forward(inputs, contexts, context_masks, answer_features=None)
40 | reshaped_inputs = desired_inputs.view(-1)
41 | reshaped_language_probs = language_probs.view(-1, config['vocab_size'])
42 | loss = criterion(reshaped_language_probs, reshaped_inputs)
43 | loss.backward()
44 | optimizer.step()
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/tests/language_model_predict_test.py:
--------------------------------------------------------------------------------
1 | from models.language_model import LanguageModel
2 | import torch
3 | from torch import nn
4 | from torch import optim
5 | from torch.autograd import variable
6 | from helpers import torch_utils
7 |
8 | config = {}
9 | config['vocab_size'] = 25
10 | config['hidden_size'] = 50
11 | config['embedding_size'] = 10
12 | config['num_layers'] = 1
13 | config['dropout'] = 0.0
14 | config['batch_first'] = False
15 |
16 | language_model = LanguageModel(config)
17 | language_model.cuda()
18 | # contexts: context_length x batch_size
19 | # inputs: input_length x batch_size
20 | # desired_inputs: input_length x batch_size
21 |
22 | input_token = variable.Variable(torch.LongTensor([[1]]))
23 | context_tokens = variable.Variable(torch.LongTensor([[2], [3], [4], [5], [6], [7], [8]]))
24 | language_model.predict(input_token, context_tokens, torch.LongTensor([[1]]))
--------------------------------------------------------------------------------
/tests/language_model_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch import optim
4 | from torch.autograd import variable
5 | from helpers import torch_utils
6 | from models.language_model import LanguageModel
7 |
8 | config = {}
9 | config['vocab_size'] = 25
10 | config['hidden_size'] = 50
11 | config['embedding_size'] = 10
12 | config['num_layers'] = 1
13 | config['dropout'] = 0.0
14 | config['batch_first'] = False
15 | config['use_pretrained_embeddings'] = False
16 | config['gpu_mode'] = True
17 |
18 | language_model = LanguageModel(config)
19 |
20 | # contexts: context_length x batch_size
21 | # inputs: input_length x batch_size
22 | # desired_inputs: input_length x batch_size
23 |
24 | inputs = variable.Variable(torch.LongTensor([[1, 2, 3], [4,5,6]])).cuda()
25 | contexts = variable.Variable(torch.LongTensor([[4, 5, 6], [7, 8, 9], [4, 5, 6], [7, 8, 9]])).cuda()
26 | desired_inputs = variable.Variable(torch.LongTensor([[2, 3, 4], [5, 6, 7]])).cuda()
27 |
28 | optimizer = optim.Adam(language_model.parameters(), lr=3e-2)
29 | criterion = nn.NLLLoss()
30 | language_model.cuda()
31 |
32 | for i in range(0, 100):
33 | optimizer.zero_grad()
34 | language_probs = language_model.forward(inputs, contexts, context_masks=None, answer_features=contexts.float())
35 | reshaped_inputs = desired_inputs.view(-1)
36 | reshaped_language_probs = language_probs.view(-1, config['vocab_size'])
37 |
38 | max_likelihoods, best_indices = torch.max(language_probs, 2)
39 | diff = torch.eq(torch.squeeze(best_indices).data,desired_inputs.data)
40 | accuracy = (diff.sum()) / torch_utils.num_elements(diff)
41 |
42 | loss = criterion(reshaped_language_probs, reshaped_inputs)
43 | loss.backward()
44 | optimizer.step()
45 |
46 | print(loss)
47 | print(accuracy)
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/tests/language_model_trainer_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd import variable
3 | from data_loaders.language_model_loader import LanguageModelLoader
4 | from models.language_model import LanguageModel
5 | from models.language_trainer import LanguageTrainer
6 | from models.language_wrapper import LanguageWrapper
7 | from helpers import constants, torch_utils
8 |
9 |
10 | base_path = 'datasets/question_generator'
11 |
12 |
13 | language_model_loader = LanguageModelLoader(base_path)
14 |
15 | config = {}
16 | config['vocab_size'] = language_model_loader.get_vocab().size()
17 | config['hidden_size'] = 100
18 | config['embedding_size'] = 300
19 | config['num_layers'] = 1
20 | config['dropout'] = 0.0
21 | config['batch_first'] = False
22 | config['batch_size'] = 3
23 | config['learning_rate'] = 1e-3
24 | config['log_path'] = 'logs.txt'
25 | config['save_directory'] = 'logs/saved_data'
26 | config['use_pretrained_embeddings'] = True
27 | config['pretrained_embeddings_path'] = 'datasets/question_generator/word_embeddings.npy'
28 | config['finetune_embeddings'] = True
29 | config['gpu_mode'] = True
30 |
31 | language_model = LanguageModel(config).cuda()
32 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
33 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader)
34 |
35 | for i in range(0, 100):
36 | loss, accuracy, predictions = language_trainer.train(epoch_num=i)
37 |
38 | if i % 10 == 0:
39 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,epoch_num=10, max_length=20)
40 | language_trainer.save_predictions(i, predictions)
41 | language_trainer.save(i)
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/tests/load_questions.py:
--------------------------------------------------------------------------------
1 | from helpers import utils
2 | from helpers import spacy_tokenizer
3 | import numpy as np
4 |
5 | delimiter='*@#$*($#@*@#$'
6 | def func(l):
7 | items = l.strip().split(delimiter)
8 | return items
9 |
10 | def gen_func(l):
11 | items = l.strip().split(" ")
12 | return items
13 |
14 | answer_starts_path = 'datasets/newsqa/train/answer_starts.txt'
15 | answer_ends_path = 'datasets/newsqa/train/answer_ends.txt'
16 | input_path = 'datasets/newsqa/train/inputs.txt'
17 | output_path = 'datasets/newsqa/train/outputs.txt'
18 | generated_path = 'logs/newsqa_saved_data/dummy5_train_predictions_epoch_6.txt'
19 | indices_path = 'datasets/newsqa/train/indices.txt'
20 |
21 |
22 | inputs = utils.read_lines_with_func(func, input_path)
23 | outputs = utils.read_tabbed_lines(output_path)
24 | generated = utils.read_lines_with_func(gen_func, generated_path)
25 | answer_starts = list(map(lambda l: int(l), utils.read_lines(answer_starts_path)))
26 | answer_ends = list(map(lambda l: int(l), utils.read_lines(answer_ends_path)))
27 | indices = list(map(lambda l: int(l), utils.read_lines(indices_path)))
28 |
29 | answers = []
30 | truncated_contexts = []
31 | questions = []
32 | generated_questions = []
33 |
34 | num_overlap = []
35 | num_items = len(generated)
36 |
37 | question_counter = 0
38 | generated_question_counter = 0
39 | filtered_words = ["a", "the", "who", "what", "when", "where", "why", "it"]
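# Common stop/question words are skipped when counting how many question tokens also appear in the truncated context.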
40 | for i in range(num_items):
41 | start_idx = answer_starts[i]
42 | end_idx = answer_ends[i]
43 | idx = indices[i]
44 | padded_start_idx = np.max([0, start_idx-10])
45 | padded_end_idx = np.min([end_idx + 10, len(inputs[idx])])
46 | truncated_context = inputs[idx][padded_start_idx:padded_end_idx]
47 |
48 | answers.append(inputs[idx][start_idx:end_idx])
49 | truncated_contexts.append(truncated_context)
50 |
51 | question = outputs[i]
52 | generated_question = generated[i]
53 |
54 | questions.append(question)
55 | generated_questions.append(generated_question)
56 |
57 | for t in question:
58 | if t not in filtered_words:
59 | if t in truncated_context:
60 | question_counter += 1
61 |
62 | for t in generated_question:
63 | if t not in filtered_words:
64 | if t in truncated_context:
65 | generated_question_counter += 1
66 |
67 |
68 | #ner_tokens = spacy_tokenizer.extract_NER(' '.join(truncated_context))
69 | #assert(False)
70 |
71 | utils.save_tabbed_lines(questions, "analysis/questions.txt")
72 | utils.save_tabbed_lines(generated_questions, "analysis/generated_questions.txt")
73 | utils.save_tabbed_lines(answers, "analysis/answers.txt")
74 | utils.save_tabbed_lines(truncated_contexts, "analysis/truncated_contexts.txt")
75 |
76 | num_tokens_q = question_counter / float(num_items)
77 | num_tokens_generated_q = generated_question_counter / float(num_items)
78 |
79 |
80 | print(num_tokens_q)
81 | print(num_tokens_generated_q)
82 |
83 | # 1.8647080433936984
84 | # 2.5078769935600986
85 | #
86 | # 2.958032588494619
87 | # 4.5196546656869945
88 | #
89 | # 2.469334831654925
90 | # 4.094059298958379
91 | #utils.save_lines()
92 | #utils.save_lines():
93 |
94 | # The numbers above appear to be (num_tokens_q, num_tokens_generated_q) pairs from
95 | # earlier runs: the average count of question words overlapping the truncated context,
96 | # for human-written questions versus synthetically generated ones.
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
--------------------------------------------------------------------------------
/tests/newsqa_predictor_test_unsup.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from data_loaders.language_model_loader import LanguageModelLoader
3 | from models.language_model import LanguageModel
4 | from models.language_trainer import LanguageTrainer
5 | from models.language_wrapper import LanguageWrapper
6 | from helpers import constants
7 | from helpers import torch_utils, utils
8 | from torch.autograd import variable
9 |
10 | dataset_path = 'datasets/newsqa_unsupervised_large'
11 | load_path = 'logs/squad_saved_data/model_14.pyt7'
12 |
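# Load the checkpoint saved from the SQuAD training run and decode the unsupervised NewsQA
# train split with beam search (dev/test decoding is left commented out below).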
13 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB,
14 | context_tokenizer_type=constants.TOKENIZER_TAB)
15 | language_model = torch_utils.load_model(load_path).cuda()
16 | language_model.config['save_directory'] = 'logs/newsqa_unsupervised_saved_data'
17 | language_model.config['gpu_mode'] = True
18 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
19 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader)
20 |
21 | #test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
22 | # epoch_num=10, max_length=20)
23 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
24 | # epoch_num=10, max_length=10)
25 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN,
26 | epoch_num=10, max_length=25, beam_size=5)
27 |
28 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/train_predictions_epoch_8.txt')
29 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/validation_predictions_epoch_6.txt')
30 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/test_predictions_epoch_6.txt')
31 |
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/tests/newsqa_predictor_test_unsup_large.py:
--------------------------------------------------------------------------------
1 | from data_loaders.language_model_loader import LanguageModelLoader
2 | from models.language_model import LanguageModel
3 | from models.language_trainer import LanguageTrainer
4 | from models.language_wrapper import LanguageWrapper
5 | from helpers import constants
6 | import torch
7 | from helpers import torch_utils, utils
8 | from torch.autograd import variable
9 |
10 | dataset_path = 'datasets/newsqa_unsupervised_large_verb_filtered'
11 | load_path = 'logs/squad_saved_data/model_8.pyt7' # CHANGE THIS TO WHATEVER YOU WANT
12 |
13 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
14 | language_model = torch_utils.load_model(load_path).cuda()
15 | language_model.config['save_directory'] = 'logs/newsqa_unsupervised_saved_data'
16 |
17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
18 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader)
19 |
20 | ##test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
21 | # epoch_num=10, max_length=20, beam_size=5)
22 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
23 | # epoch_num=10, max_length=10, beam_size=5)
24 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN,
25 | epoch_num=10, max_length=10)
26 |
27 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/train_predictions_epoch_6.txt')
28 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/validation_predictions_epoch_6.txt')
29 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/test_predictions_epoch_6.txt')
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/tests/newsqa_predictor_test_unsup_truncated.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from data_loaders.language_model_loader_truncate import LanguageModelLoaderTruncate
3 | from models.language_model import LanguageModel
4 | from models.language_trainer import LanguageTrainer
5 | from models.language_wrapper import LanguageWrapper
6 | from helpers import constants
7 | from helpers import torch_utils, utils
8 | from torch.autograd import variable
9 |
10 | dataset_path = 'datasets/newsqa_unsupervised_verb_filtered'
11 | load_path = 'logs/squad_saved_data_truncated/model_14.pyt7' # CHANGE THIS TO WHATEVER YOU WANT
12 |
13 | language_model_loader = LanguageModelLoaderTruncate(dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
14 | language_model = torch_utils.load_model(load_path).cuda()
15 | language_model.config['save_directory'] = 'logs/newsqa_saved_data'
16 |
17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
18 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader)
19 |
20 | #test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
21 | # epoch_num=10, max_length=20)
22 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
23 | # epoch_num=10, max_length=10)
24 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN,
25 | epoch_num=10, max_length=10)
26 |
27 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/dummy5_unsup_train_predictions_epoch_6.txt')
28 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/dummy5_unsup_validation_predictions_epoch_6.txt')
29 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/dummy5_unsup_test_predictions_epoch_6.txt')
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/tests/newsqa_predictor_test_verb.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from helpers import torch_utils, utils
3 | from torch.autograd import variable
4 | from data_loaders.language_model_loader import LanguageModelLoader
5 | from models.language_model import LanguageModel
6 | from models.language_trainer import LanguageTrainer
7 | from models.language_wrapper import LanguageWrapper
8 | from helpers import constants
9 |
10 |
11 | dataset_path = 'datasets/newsqa_unsupervised_verb_filtered'
12 | load_path = 'logs/squad_saved_data/model_14.pyt7'
13 |
14 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
15 | language_model = torch_utils.load_model(load_path).cuda()
16 | language_model.config['save_directory'] = 'logs/newsqa_unsupervised_verb_filtered'
17 |
18 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
19 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader)
20 |
21 | #test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
22 | # epoch_num=10, max_length=20)
23 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
24 | # epoch_num=10, max_length=10)
25 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN,
26 | epoch_num=12, max_length=15,
27 | beam_size=5)
28 |
29 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/train_predictions_epoch_6_verb_filtered.txt')
30 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/validation_predictions_epoch_6.txt')
31 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/test_predictions_epoch_6.txt')
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/tests/newsqa_trainer_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd import variable
3 | from data_loaders.language_model_loader import LanguageModelLoader
4 | from models.language_model import LanguageModel
5 | from models.language_trainer import LanguageTrainer
6 | from models.language_wrapper import LanguageWrapper
7 | from helpers import constants, torch_utils
8 |
9 | base_path = 'datasets/newsqa_train'
10 |
11 | language_model_loader = LanguageModelLoader(base_path, tokenizer_type=constants.TOKENIZER_TAB)
12 |
13 | config = {}
14 | config['vocab_size'] = language_model_loader.get_vocab().size()
15 | config['hidden_size'] = 100
16 | config['embedding_size'] = 300
17 | config['num_layers'] = 1
18 | config['dropout'] = 0.0
19 | config['batch_first'] = False
20 | config['batch_size'] = 10
21 | config['learning_rate'] = 1e-3
22 | config['log_path'] = 'logs.txt'
23 | config['save_directory'] = 'logs/newsqa_train_saved_data'
24 | config['use_pretrained_embeddings'] = True
25 | config['pretrained_embeddings_path'] = 'datasets/newsqa_train/word_embeddings.npy'
26 | config['finetune_embeddings'] = False
27 | config['load_model'] = True
28 | config['saved_epoch'] = 1
29 | config['load_path'] = 'logs/newsqa_train_saved_data/model_1.pyt7'
30 |
31 | language_model = LanguageModel(config)
32 | if config['load_model']:
33 | language_model = torch_utils.load_model(config['load_path'])
34 |
35 | language_model.cuda()
36 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
37 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader)
38 |
39 | for i in range(0, 100):
40 | loss, accuracy, predictions = language_trainer.train(epoch_num=i)
41 |
42 | if i % 3 == 1:
43 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
44 | epoch_num=10, max_length=20)
45 | language_trainer.save(i + config['saved_epoch'])
46 | language_trainer.save_predictions(i + config['saved_epoch'], predictions)
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/tests/pointer_network_test.py:
--------------------------------------------------------------------------------
1 | from data_loaders.language_model_loader import LanguageModelLoader
2 | from models.pointer_network import PointerNetwork
3 | from helpers import constants
4 | import torch
5 | from torch.autograd import variable
6 | from torch import optim
7 | from torch import nn
8 |
9 | base_path = 'datasets/newsqa_train'
10 | language_model_loader = LanguageModelLoader(base_path, tokenizer_type=constants.TOKENIZER_NLTK)
11 | language_model_loader.mix_indices()
12 |
13 | config = {}
14 | config['vocab_size'] = language_model_loader.get_vocab().size()
15 | config['hidden_size'] = 100
16 | config['embedding_size'] = 300
17 | config['num_layers'] = 1
18 | config['dropout'] = 0.0
19 | config['batch_first'] = False
20 | config['batch_size'] = 24
21 | config['learning_rate'] = 1e-3
22 | config['log_path'] = 'logs.txt'
23 | config['save_directory'] = 'logs/squad_saved_data'
24 | config['use_pretrained_embeddings'] = True
25 | config['pretrained_embeddings_path'] = 'datasets/squad/word_embeddings.npy'
26 | config['finetune_embeddings'] = False
27 | config['load_model'] = True
28 | config['load_path'] = 'logs/squad_saved_data/model_7_old.pyt7'
29 |
30 | pointer_network = PointerNetwork(config).cuda()
31 |
32 |
33 | criterion1 = nn.CrossEntropyLoss().cuda()
34 | criterion2 = nn.CrossEntropyLoss().cuda()
35 | optimizer = optim.Adam(pointer_network.parameters(), 1e-2)
36 |
37 |
38 | batch = language_model_loader.get_batch(dataset_type=constants.DATASET_TRAIN, batch_size=config['batch_size'])
39 |
40 | large_negative_number = -1.e10
41 | while batch is not None:
42 | optimizer.zero_grad()
43 | input_lengths = variable.Variable(torch.from_numpy(batch['context_lengths'])).cuda()
44 | input_vals = variable.Variable(torch.from_numpy(batch['context_tokens'])).cuda()
45 | answer_starts = variable.Variable(torch.from_numpy(batch['answer_starts'])).cuda()
46 | answer_ends = variable.Variable(torch.from_numpy(batch['answer_ends'])).cuda()
47 | masks = variable.Variable(torch.from_numpy(batch['context_masks'].T).float()).cuda()
48 |
49 | p_start, p_end = pointer_network.forward(input_vals, input_lengths, masks)
50 |
51 | # Batch first
52 | loss = criterion1(p_start, answer_starts) + \
53 | criterion2(p_end, answer_ends)
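# CrossEntropyLoss applies the softmax itself, so p_start/p_end are assumed to be
# unnormalized scores over context positions.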
54 |
55 | print(loss)
56 | loss.backward()
57 | optimizer.step()
58 | batch = language_model_loader.get_batch(dataset_type=constants.DATASET_TRAIN, batch_size=config['batch_size'])
59 |
60 |
61 |
--------------------------------------------------------------------------------
/tests/question_discriminator_test.py:
--------------------------------------------------------------------------------
1 | from models.language_model import TextFieldPredictor, LanguageModel, LanguageDiscriminator
2 | from dnn_units.lstm_attention import LSTMAttentionDot
3 | from torch import nn
4 | from torch import optim
5 | from helpers import torch_utils
6 | import torch
7 | from torch.autograd import variable
8 |
9 | load_path = 'logs/squad_saved_data/model_6.pyt7'
10 | language_model = torch_utils.load_model(load_path)
11 | language_model = language_model.cuda()
12 |
13 | batch_size = 3
14 |
15 | embeddings = language_model.embedder
16 | text_field_predictor = language_model.text_field_predictor
17 | base_lstm = language_model.base_lstm
18 |
19 | discriminator = LanguageDiscriminator(language_model.config,
20 | embeddings, text_field_predictor, base_lstm).cuda()
21 |
22 | discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=3e-2)
23 | discriminator_criterion = nn.BCELoss()
24 |
25 | contexts = variable.Variable(torch.LongTensor([[1, 2, 3], [2, 3, 4], [4, 5, 6]])).cuda()
26 | answer_features = variable.Variable(torch.FloatTensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]])).cuda()
27 | inputs = variable.Variable(torch.LongTensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])).cuda()
28 |
29 | desired_indices = variable.Variable(torch.FloatTensor([1, 1, 1])).cuda()
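# Targets of 1.0 train the discriminator to score these (input, context) pairs as positive;
# BCELoss expects the predictions to be probabilities in [0, 1].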
30 |
31 | for i in range(0, 100):
32 | discriminator_optimizer.zero_grad()
33 | pred = discriminator.forward(inputs, contexts, answer_features)
34 | bce_loss = discriminator_criterion(pred, desired_indices)
35 | bce_loss.backward()
36 |
37 | print(bce_loss)
38 | discriminator_optimizer.step()
39 |
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/tests/squad_discriminator_test.py:
--------------------------------------------------------------------------------
1 | from data_loaders.language_model_loader import LanguageModelLoader
2 | from models.language_model import LanguageModel
3 | from models.language_discriminator_trainer import LanguageDiscriminatorTrainer
4 | from models.language_wrapper import LanguageWrapper
5 | from helpers import constants
6 | import torch
7 | from helpers import torch_utils, utils
8 | from torch.autograd import variable
9 |
10 | dataset_path = 'datasets/squad'
11 | load_path = 'logs/squad_saved_data/model_6.pyt7'
12 |
13 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
14 | language_model = torch_utils.load_model(load_path).cuda()
15 | language_model.config['save_directory'] = 'logs/newsqa_saved_data'
16 |
17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
18 | language_trainer = LanguageDiscriminatorTrainer(language_model.config, language_wrapper, language_model_loader)
19 |
20 | for i in range(0, 100):
21 | language_trainer.predict(dataset_type=constants.DATASET_TRAIN, epoch_num=1, max_length=20)
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/tests/squad_loader_test_v2.py:
--------------------------------------------------------------------------------
1 | from data_loaders.language_model_loader import LanguageModelLoader
2 | from models.language_model import LanguageModel
3 | from helpers import constants
4 |
5 | base_path = 'datasets/newsqa'
6 | language_model_loader = LanguageModelLoader(base_path, tokenizer_type=constants.TOKENIZER_NLTK)
7 | language_model_loader.reset_indices()
8 | batch = language_model_loader.get_batch(dataset_type=constants.DATASET_TRAIN, batch_size=10)
9 |
10 | config = {}
11 | config['vocab_size'] = language_model_loader.get_vocab().size()
12 | config['hidden_size'] = 100
13 | config['embedding_size'] = 300
14 | config['num_layers'] = 1
15 | config['dropout'] = 0.0
16 | config['batch_first'] = False
17 | config['batch_size'] = 24
18 | config['learning_rate'] = 1e-3
19 | config['log_path'] = 'logs.txt'
20 | config['save_directory'] = 'logs/squad_saved_data'
21 | config['use_pretrained_embeddings'] = True
22 | config['pretrained_embeddings_path'] = 'datasets/squad/word_embeddings.npy'
23 | config['finetune_embeddings'] = False
24 | config['load_model'] = True
25 | config['load_path'] = 'logs/squad_saved_data/model_7_old.pyt7'
26 |
27 | language_model = LanguageModel(config)
28 |
29 |
--------------------------------------------------------------------------------
/tests/squad_predictor_test.py:
--------------------------------------------------------------------------------
1 | from data_loaders.language_model_loader import LanguageModelLoader
2 | from models.language_model import LanguageModel
3 | from models.language_trainer import LanguageTrainer
4 | from models.language_wrapper import LanguageWrapper
5 | from helpers import constants
6 | import torch
7 | from helpers import torch_utils, utils
8 | from torch.autograd import variable
9 |
10 | dataset_path = 'datasets/newsqa'
11 | load_path = 'logs/squad_saved_data/model_12.pyt7'
12 |
13 | language_model_loader = LanguageModelLoader(dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
14 | language_model = torch_utils.load_model(load_path).cuda()
15 | language_model.config['save_directory'] = 'logs/newsqa_saved_data'
16 |
17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
18 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader)
19 |
20 | test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
21 | epoch_num=10, max_length=20)
22 | dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
23 | epoch_num=10, max_length=10)
24 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN,
25 | epoch_num=10, max_length=10)
26 |
27 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/dummy8_train_predictions_epoch_.txt')
28 | utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/dummy8_validation_predictions_epoch_6.txt')
29 | utils.save_lines(test_predictions, 'logs/newsqa_saved_data/dummy8_test_predictions_epoch_6.txt')
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/tests/squad_predictor_truncated_test.py:
--------------------------------------------------------------------------------
1 | from data_loaders.language_model_loader_truncate import LanguageModelLoaderTruncate
2 | from models.language_model import LanguageModel
3 | from models.language_trainer import LanguageTrainer
4 | from models.language_wrapper import LanguageWrapper
5 | from helpers import constants
6 | import torch
7 | from helpers import torch_utils, utils
8 | from torch.autograd import variable
9 |
10 | dataset_path = 'datasets/newsqa'
11 | load_path = 'logs/squad_saved_data_truncated/model_2.pyt7'
12 |
13 | language_model_loader = LanguageModelLoaderTruncate(dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
14 | language_model = torch_utils.load_model(load_path).cuda()
15 | language_model.config['save_directory'] = 'logs/newsqa_saved_data'
16 |
17 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
18 | language_trainer = LanguageTrainer(language_model.config, language_wrapper, language_model_loader)
19 |
20 | #test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
21 | # epoch_num=10, max_length=20)
22 | #dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
23 | # epoch_num=10, max_length=10)
24 | train_predictions = language_trainer.predict(dataset_type=constants.DATASET_TRAIN,
25 | epoch_num=10, max_length=10)
26 |
27 | utils.save_lines(train_predictions, 'logs/newsqa_saved_data/dummy5_train_predictions_epoch_6.txt')
28 | #utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/dummy5_validation_predictions_epoch_6.txt')
29 | #utils.save_lines(test_predictions, 'logs/newsqa_saved_data/dummy5_test_predictions_epoch_6.txt')
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/tests/squad_trainer_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd import variable
3 | from data_loaders.language_model_loader import LanguageModelLoader
4 | from models.language_model import LanguageModel
5 | from models.language_trainer import LanguageTrainer
6 | from models.language_wrapper import LanguageWrapper
7 | from helpers import constants, torch_utils, io_utils
8 |
9 | base_path = 'datasets/squad/'
10 | language_model_loader = LanguageModelLoader(base_path, tokenizer_type=constants.TOKENIZER_TAB)
11 |
12 | config = {}
13 | config['vocab_size'] = language_model_loader.get_vocab().size()
14 | config['hidden_size'] = 100
15 | config['embedding_size'] = 300
16 | config['num_layers'] = 1
17 | config['dropout'] = 0.0
18 | config['batch_first'] = False
19 | config['batch_size'] = 24
20 | config['learning_rate'] = 1e-3
21 | config['beam_size'] = 5
22 | config['log_path'] = 'logs.txt'
23 | config['save_directory'] = 'logs/squad_saved_data'
24 | config['use_pretrained_embeddings'] = True
25 | config['pretrained_embeddings_path'] = 'datasets/squad/word_embeddings.npy'
26 | config['finetune_embeddings'] = False
27 | config['load_model'] = False
28 | config['gpu_mode'] = True
29 | config['load_path'] = 'logs/squad_saved_data/model_6.pyt7' # CHANGE THIS TO WHATEVER PATH YOU WANT
30 |
31 | io_utils.check_dir('logs/squad_saved_data')
32 |
33 | language_model = LanguageModel(config)
34 | if config['load_model']:
35 | language_model = torch_utils.load_model(config['load_path'])
36 |
37 | language_model.cuda()
38 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
39 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader)
40 |
41 | for i in range(0, 15):
42 | loss, accuracy, predictions = language_trainer.train(epoch_num=i)
43 |
44 | if i % 3 == 2:
45 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
46 | epoch_num=10, max_length=20, beam_size=config['beam_size'])
47 | language_trainer.save(i)
48 | language_trainer.save_predictions(i, predictions)
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/tests/squad_trainer_truncated_expanded_test.py:
--------------------------------------------------------------------------------
1 | from data_loaders.language_model_loader_truncate import LanguageModelLoaderTruncate
2 | from models.language_model import LanguageModel
3 | from models.language_trainer import LanguageTrainer
4 | from models.language_wrapper import LanguageWrapper
5 | from helpers import constants, torch_utils
6 | import torch
7 | from torch.autograd import variable
8 |
9 | base_path = 'datasets/squad_expanded_vocab'
10 |
11 |
12 | language_model_loader = LanguageModelLoaderTruncate(base_path, tokenizer_type=constants.TOKENIZER_TAB)
13 |
14 | config = {}
15 | config['vocab_size'] = language_model_loader.get_vocab().size()
16 | config['hidden_size'] = 100
17 | config['embedding_size'] = 300
18 | config['num_layers'] = 1
19 | config['dropout'] = 0.0
20 | config['batch_first'] = False
21 | config['batch_size'] = 20
22 | config['learning_rate'] = 1e-3
23 | config['log_path'] = 'logs.txt'
24 | config['save_directory'] = 'logs/squad_saved_data_truncated_expanded_vocab'
25 | config['use_pretrained_embeddings'] = True
26 | config['pretrained_embeddings_path'] = 'datasets/squad_expanded_vocab/word_embeddings.npy'
27 | config['finetune_embeddings'] = False
28 | config['load_model'] = False
29 | config['beam_size'] = 5
30 | config['load_path'] = 'logs/squad_saved_data_truncated/model_0.pyt7' # CHANGE THIS TO ONE OF THE SAVED MODEL PATHS
31 |
32 | language_model = LanguageModel(config)
33 | if config['load_model']:
34 | language_model = torch_utils.load_model(config['load_path'])
35 |
36 | language_model.cuda()
37 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
38 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader)
39 |
40 | for i in range(0, 10):
41 | loss, accuracy, predictions = language_trainer.train(epoch_num=i)
42 |
43 | if i % 2 == 0:
44 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
45 | epoch_num=10, max_length=20)
46 | language_trainer.save(i)
47 | language_trainer.save_predictions(i, predictions)
48 |
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/tests/squad_trainer_truncated_test.py:
--------------------------------------------------------------------------------
1 | from data_loaders.language_model_loader_truncate import LanguageModelLoaderTruncate
2 | from models.language_model import LanguageModel
3 | from models.language_trainer import LanguageTrainer
4 | from models.language_wrapper import LanguageWrapper
5 | from helpers import constants, torch_utils
6 | import torch
7 | from torch.autograd import variable
8 |
9 | base_path = 'datasets/squad'
10 |
11 |
12 | language_model_loader = LanguageModelLoaderTruncate(base_path, tokenizer_type=constants.TOKENIZER_TAB)
13 |
14 | config = {}
15 | config['vocab_size'] = language_model_loader.get_vocab().size()
16 | config['hidden_size'] = 100
17 | config['embedding_size'] = 300
18 | config['num_layers'] = 1
19 | config['dropout'] = 0.0
20 | config['batch_first'] = False
21 | config['batch_size'] = 24
22 | config['learning_rate'] = 1e-3
23 | config['log_path'] = 'logs.txt'
24 | config['save_directory'] = 'logs/squad_saved_data_truncated'
25 | config['use_pretrained_embeddings'] = True
26 | config['pretrained_embeddings_path'] = 'datasets/squad/word_embeddings.npy'
27 | config['finetune_embeddings'] = False
28 | config['load_model'] = True
29 | config['load_path'] = 'logs/squad_saved_data_truncated/model_0.pyt7' # CHANGE THIS TO WHATEVER YOU WANT
30 |
31 | language_model = LanguageModel(config)
32 | if config['load_model']:
33 | language_model = torch_utils.load_model(config['load_path'])
34 |
35 | language_model.cuda()
36 | language_wrapper = LanguageWrapper(language_model, language_model_loader.get_vocab())
37 | language_trainer = LanguageTrainer(config, language_wrapper, language_model_loader)
38 |
39 | for i in range(0, 100):
40 | loss, accuracy, predictions = language_trainer.train(epoch_num=i)
41 |
42 | if i % 2 == 0:
43 | predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
44 | epoch_num=10, max_length=20)
45 | language_trainer.save(i)
46 | language_trainer.save_predictions(i, predictions)
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/tests/test_expand_dims.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch.autograd import variable
4 |
5 | x = torch.Tensor([[1], [2], [3]])
6 | print(x.size())
7 | # prints: torch.Size([3, 1])
8 | print(x.expand(2, 3, 1))  # expand to 3 dims: new dims are prepended; existing dims must match or be 1
--------------------------------------------------------------------------------
/tests/test_load_dataset.py:
--------------------------------------------------------------------------------
1 | from data_loaders.card_loader import CardLoader
2 |
3 | card_loader = CardLoader(base_path='card2code/third_party/magic')
4 | print(card_loader.train_dataset['inputs'][0])
5 | print(card_loader.train_dataset['outputs'][0])
--------------------------------------------------------------------------------
/tests/test_lstm_attention.py:
--------------------------------------------------------------------------------
1 | from dnn_units.lstm_attention import LSTMAttentionDot, LSTMAttention
2 | import torch
3 | from torch import nn
4 | from torch.autograd import variable
5 | from torch import optim
6 |
7 | batch_size = 25
8 | input_size = 125
9 | input_length = 25
10 | hidden_size = 250
11 | ctx_length = 230
12 |
13 | net = LSTMAttentionDot(input_size=input_size,
14 | hidden_size=hidden_size,
15 | batch_first=False).cuda()
16 |
17 | inputs = variable.Variable(torch.randn(input_length, batch_size, input_size)).cuda()
18 | hidden = variable.Variable(torch.randn(batch_size, hidden_size)).cuda()
19 | cell = variable.Variable(torch.randn(batch_size, hidden_size)).cuda()
20 | context = variable.Variable(torch.randn(ctx_length, batch_size, hidden_size)).cuda()
21 | desired = variable.Variable(torch.randn(batch_size, hidden_size)).cuda()
22 |
23 | criterion = nn.MSELoss()
24 |
25 | optimizer = optim.Adam(net.parameters(), lr=3e-2)
26 |
27 | for i in range(0, 1000):
28 | print(i)
29 | optimizer.zero_grad()
30 | out, h = net.forward(inputs, [hidden, cell], context)
31 | loss = criterion(h[0], desired)
32 | loss.backward()
33 | optimizer.step()
34 |
--------------------------------------------------------------------------------
/tests/test_lstm_attention_dot.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from torch.autograd import variable
4 | from models.language_model import TextFieldPredictor, SoftmaxPredictor
5 |
6 | config = {}
7 | config['vocab_size'] = 12
8 | config['embedding_size'] = 20
9 | config['hidden_size'] = 50
10 | config['num_layers'] = 1
11 | config['dropout'] = 0.0
12 | config['batch_first'] = True
13 |
14 | # First test text field predictor
15 | inp = variable.Variable(torch.LongTensor([[1, 2, 3], [4, 5, 6]]))
16 | hidden = variable.Variable(torch.randn(2, config['hidden_size']))
17 | predictor = TextFieldPredictor(config)
18 | lstm_embeddings = predictor.forward_prepro(inp)
19 | h_tilde, attentions, inp = predictor.forward_similarity(hidden)
20 |
21 | inp1 = variable.Variable(torch.LongTensor(2, config['vocab_size'] - 3).zero_())
22 | inp2 = variable.Variable(torch.zeros(2, config['vocab_size'] - 3))
23 | stacked_inps = torch.cat((inp, inp1), 1)
24 | stacked_attentions = torch.cat((attentions, inp2), 1)
25 |
26 | # Second, test the softmax predictor
27 | softmax_predictor = SoftmaxPredictor(config)
28 | softmax_logits = softmax_predictor.forward(hidden)
29 |
30 | res = variable.Variable(torch.zeros(2, config['vocab_size']))
31 | res.scatter_(1, stacked_inps, stacked_attentions)
32 |
33 | tmp = softmax_logits + res
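# res holds the attention weights scattered into vocabulary-sized rows at the attended
# token indices, so tmp mixes these copy-style scores with the softmax predictor's logits.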
34 |
35 | print(tmp)
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/tests/test_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.optim
5 | from torch.autograd import variable
6 |
7 | from models.card_model import CardModel
8 |
9 | config = {}
10 | config['vocab_size'] = 52
11 | config['embedding_size'] = 23
12 |
13 | model = CardModel(config)
14 |
15 | emb1 = nn.Embedding(config['vocab_size'], config['embedding_size'])
16 |
17 | desired = variable.Variable(torch.randn(3, 23))
18 | tmp = variable.Variable(torch.LongTensor([1,2,3]))
19 | tmp1 = emb1(tmp)
20 | tmp2 = emb1(tmp)
21 |
22 | criterion = nn.MSELoss()
23 | loss = criterion(tmp1 + tmp2, desired)
24 | loss.backward()
--------------------------------------------------------------------------------
/tests/test_model_saving.py:
--------------------------------------------------------------------------------
1 | from models.language_model import LanguageModel
2 | from helpers import torch_utils
3 |
4 | config = {}
5 | config['vocab_size'] = 12
6 | config['embedding_size'] = 20
7 | config['hidden_size'] = 50
8 | config['num_layers'] = 1
9 | config['dropout'] = 0.0
10 | config['batch_first'] = True
11 |
12 | model = LanguageModel(config)
13 |
14 | torch_utils.save_model(model, path='test.model')
15 | model = torch_utils.load_model(path='test.model')
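
helpers/torch_utils is not reproduced in this dump, so the following is only a rough sketch of save/load helpers that would satisfy the two calls above; it is an assumption about the helpers, not their actual implementation, which may save state_dicts or track the optimizer as well.

import torch

def save_model(model, path):
    # Pickles the whole module object; simple, but the file can only be loaded
    # where the model class definition is importable.
    torch.save(model, path)

def load_model(path):
    return torch.load(path)

A state_dict-based variant (torch.save(model.state_dict(), path) plus model.load_state_dict(torch.load(path))) is the more portable alternative.
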
--------------------------------------------------------------------------------
/tests/test_padded_sequence.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import random
4 | import unittest
5 | import itertools
6 | import contextlib
7 | from copy import deepcopy
8 | from itertools import repeat, product
9 | from functools import wraps
10 |
11 | import torch.nn as nn
12 | import torch.nn.functional as F
13 | import torch.nn.parallel as dp
14 | import torch.nn.utils.rnn as rnn_utils
15 | from torch.nn.utils import clip_grad_norm
16 | from torch.autograd import Variable
17 | from torch.nn import Parameter
18 |
19 | lengths = [10, 10, 6, 2, 2, 1, 1]
20 | lengths_tensor = Variable(torch.LongTensor(lengths))
21 | max_length = lengths[0]
22 | x = Variable(torch.randn(max_length, len(lengths), 3), requires_grad=True)
23 | lstm = nn.LSTM(3, 4, bidirectional=True, num_layers=2, batch_first=False)
24 |
25 | packed = rnn_utils.pack_padded_sequence(x, lengths)
26 | packed_out, packed_hidden = lstm(packed)
27 | unpacked, unpacked_len = rnn_utils.pad_packed_sequence(packed_out)
28 |
29 | def sort_sequence(tensor, lengths, batch_first=False):
30 | """
31 | Sorts sequence in descending order
32 | tensor: Padded tensor of variable length stuff (Torch tensor)
33 | lengths: Lengths of padded tensor (Torch LongTensor)
34 | batch_first: Boolean, whether tensor is batch_first or not
35 | """
36 | idx = None
37 | if batch_first:
38 | idx = 0
39 | else:
40 | idx = 1
41 |
42 | sorted_lengths, indices = torch.sort(lengths, dim=0, descending=True)
43 | new_tensor = torch.index_select(tensor, idx, indices)
44 | return new_tensor, sorted_lengths, indices
45 |
46 | def unsort_sequence(tensor, indices, batch_first=False):
47 | """
48 | Unsort a tensor according to indices and idx
49 | """
50 | if batch_first:
51 | idx = 0
52 | else:
53 | idx = 1
54 | unsorted_tensor = torch.index_select(tensor, idx, indices)
55 | return unsorted_tensor
56 |
57 | def pack_forward(rnn, tensor, lengths, batch_first=False):
58 | """
59 | Forwards a padded tensor with the given lengths through rnn
60 | rnn: RNN module to forward through
61 | tensor: Padded tensor to forward
62 | lengths: Lengths of the sequences in tensor
63 | batch_first: Whether tensor is batch first or not
64 | """
65 | 
66 | sorted_tensor, sorted_lengths, sorted_indices = sort_sequence(tensor, lengths, batch_first)
67 | packed = rnn_utils.pack_padded_sequence(sorted_tensor, sorted_lengths.data.tolist(), batch_first=batch_first)
68 | packed_out, packed_hidden = rnn(packed)  # forward through the rnn argument, not the module-level lstm
69 | unpacked, unpacked_len = rnn_utils.pad_packed_sequence(packed_out, batch_first=batch_first)
70 | unsorted_out = unsort_sequence(unpacked, sorted_indices, batch_first=batch_first)
71 | unsorted_hidden = [unsort_sequence(h, sorted_indices, batch_first=False) for h in packed_hidden]  # hidden/cell states are (layers, batch, hidden): always unsort along dim 1
72 | return unsorted_out, unsorted_hidden
73 |
74 | sorted_tensor, sorted_lengths, sort_indices = sort_sequence(x, lengths_tensor, batch_first=False)
75 | unsorted_tensor = unsort_sequence(sorted_tensor, sort_indices)
76 | 
77 | unsorted_out, unsorted_hidden = pack_forward(lstm, x, lengths_tensor)
78 | print(packed_out[0].size())
79 | print(unsorted_out[0].size())
80 |
81 |
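
A quick way to sanity-check the sort/pack/unsort pipeline in this file is to confirm that inverting the sort permutation restores the original batch order. The following is a small self-contained check using the packing utilities directly (written against current PyTorch, where Variables are no longer needed; the sizes are arbitrary), not the repo's helpers.

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils

lengths = [3, 5, 2]                 # deliberately not sorted
x = torch.randn(5, 3, 4)            # (max_len, batch, features)
rnn = nn.LSTM(4, 6)

sorted_lengths, sort_idx = torch.sort(torch.LongTensor(lengths), descending=True)
packed = rnn_utils.pack_padded_sequence(x.index_select(1, sort_idx), sorted_lengths.tolist())
out, _ = rnn_utils.pad_packed_sequence(rnn(packed)[0])

# argsort of the sort indices gives the inverse permutation back to the original order
unsort_idx = torch.sort(sort_idx)[1]
out = out.index_select(1, unsort_idx)
print(out.size())                   # torch.Size([5, 3, 6]); rows line up with the original batch
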
--------------------------------------------------------------------------------
/trainers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidgolub/QuestionGeneration/6b31e1a8855774230051093ca24ba0a7750a6712/trainers/__init__.py
--------------------------------------------------------------------------------
/trainers/iob_predictor.py:
--------------------------------------------------------------------------------
1 | from data_loaders.iob_loader import IOBLoader
2 | from models.iob.iob_model import IOBModel
3 | from helpers import constants, utils
4 | import os
5 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152
6 | os.environ["CUDA_VISIBLE_DEVICES"]="1"
7 |
8 | embeddings = utils.load_matrix('datasets/squad_iob/word_embeddings.npy')
9 | base_directory = 'datasets/newsqa_iob'
10 | config_path = 'iob/logs/squad/config.json'
11 | params_path = 'iob/logs/squad/model_params_3.ckpt'
12 | predictions_save_path = 'iob/logs/newsqa/train_predictions_1.txt'
13 |
14 | data_loader = IOBLoader(base_directory, tokenizer_type=constants.TOKENIZER_SPECIAL_DELIMITER,
15 | input_max_length=2100)
16 |
17 | config = utils.load_json(config_path)
18 | config['batch_size'] = 25
19 | config['input_max_length'] = data_loader.input_max_length
20 | model = IOBModel(config, embeddings=embeddings)
21 | model.restore(params_path)
22 |
23 | num_steps = 0
24 |
25 | data_loader.reset_indices()
26 | total_predictions = []
27 | num_steps = 0
28 |
29 | while True:
30 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
31 | num_steps += config['batch_size']
32 | print(num_steps)
33 | if batch is None:
34 | break
35 | predictions = model.predict(batch)
36 | texts = data_loader.label_vocab.tokens_list(predictions)
37 | for i in range(0, len(texts)):
38 | cur_input_length = batch['input_lengths'][i]
39 | cur_text = texts[i]
40 |
41 | text_str = " ".join(cur_text[0:cur_input_length])
42 | total_predictions.append(text_str)
43 |
44 | utils.save_lines(total_predictions, predictions_save_path)
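
The loop above (pull batches until get_batch returns None, decode each prediction, and truncate it to its input length) is the same pattern the trainer below repeats on the test set. As a sketch, it can be factored into two small helpers; only get_batch, reset_indices, predict, label_vocab.tokens_list and batch['input_lengths'] are taken from this file, while the helper names themselves are illustrative.

def iterate_batches(data_loader, dataset, batch_size):
    # Yield batches until the loader is exhausted.
    data_loader.reset_indices()
    while True:
        batch = data_loader.get_batch(dataset, batch_size)
        if batch is None:
            return
        yield batch

def decode_predictions(model, data_loader, dataset, batch_size):
    # Run the model over a whole split and return one space-joined label string per example.
    lines = []
    for batch in iterate_batches(data_loader, dataset, batch_size):
        predictions = model.predict(batch)
        texts = data_loader.label_vocab.tokens_list(predictions)
        for tokens, length in zip(texts, batch['input_lengths']):
            lines.append(" ".join(tokens[:length]))
    return lines

With these, the body of this script reduces to utils.save_lines(decode_predictions(model, data_loader, constants.DATASET_TRAIN, config['batch_size']), predictions_save_path).
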
--------------------------------------------------------------------------------
/trainers/iob_trainer.py:
--------------------------------------------------------------------------------
1 | from data_loaders.iob_loader import IOBLoader
2 | from models.iob.iob_model import IOBModel
3 | from helpers import constants, utils
4 | import os
5 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152
6 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
7 |
8 | base_directory = 'datasets/squad_iob'
9 |
10 | data_loader = IOBLoader(base_directory, tokenizer_type=constants.TOKENIZER_SPACE)
11 | data_loader.mix_indices()
12 |
13 | config = {
14 | 'input_max_length': data_loader.input_max_length,
15 | 'vocab_size': data_loader.vocab.size(),
16 | 'embeddings_size': 300,
17 | 'hidden_size': 150,
18 | 'out_size': 100,
19 | 'num_classes': data_loader.label_vocab.size(),
20 | 'batch_size': 25,
21 | 'learning_rate': 1e-2,
22 | 'save_path': 'iob/logs'}
23 |
24 | embeddings = utils.load_matrix('%s/word_embeddings.npy' % base_directory)
25 | config_path = 'iob/logs/squad/config.json'
26 | params_path = 'iob/logs/squad/model_params_%s.ckpt'
27 |
28 | model = IOBModel(config, embeddings=embeddings)
29 | model.save(config_path, params_path)
30 | model.restore(params_path)
31 |
32 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
33 |
34 | num_steps = 0
35 |
36 | for i in range(0, 100):
37 | while batch is not None:
38 | loss, predictions = model.forward(batch)
39 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
40 | num_steps += config['batch_size']
41 |
42 | print(num_steps)
43 | print(loss)
44 |
45 | if i % 3 == 0:
46 | model.save(config_path, params_path % i)
47 | data_loader.reset_indices()
48 | total_predictions = []
49 | while True:
50 | batch = data_loader.get_batch(constants.DATASET_TEST, config['batch_size'])
51 | if batch is None:
52 | break
53 | predictions = model.predict(batch)
54 | texts = data_loader.label_vocab.tokens_list(predictions)
55 | for j in range(0, len(texts)):  # use j so the epoch counter i is not overwritten before the save paths below
56 | cur_input_length = batch['input_lengths'][j]
57 | cur_text = texts[j]
58 | text_str = " ".join(cur_text[0:cur_input_length])
59 | total_predictions.append(text_str)
60 | utils.save_lines(total_predictions, \
61 | '%s/predictions_test_%s.txt' % (config['save_path'], i))
62 |
63 | data_loader.mix_indices()
64 | batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
65 |
66 |
67 |
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------