├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── api_docs ├── APIDocGeneration │ ├── Makefile │ ├── conf.py │ ├── index.rst │ ├── jtr.format.convert.rst │ ├── jtr.format.rst │ ├── jtr.load.embeddings.rst │ ├── jtr.load.rst │ ├── jtr.nn.rst │ ├── jtr.rst │ ├── jtr.util.rst │ ├── modules.rst │ └── setup.rst ├── How_to_apidocstrings2htmlmd ├── How_to_contribute.md ├── genindex.html ├── index.html ├── jtr.format.convert.html ├── jtr.format.html ├── jtr.html ├── jtr.load.embeddings.html ├── jtr.load.html ├── jtr.nn.html ├── jtr.util.html ├── modules.html ├── objects.inv ├── py-modindex.html ├── search.html ├── searchindex.js ├── setup.html ├── sources │ ├── index.rst.txt │ ├── jtr.format.convert.rst.txt │ ├── jtr.format.rst.txt │ ├── jtr.load.embeddings.rst.txt │ ├── jtr.load.rst.txt │ ├── jtr.nn.rst.txt │ ├── jtr.rst.txt │ ├── jtr.util.rst.txt │ ├── modules.rst.txt │ └── setup.rst.txt └── static │ ├── ajax-loader.gif │ ├── alabaster.css │ ├── basic.css │ ├── comment-bright.png │ ├── comment-close.png │ ├── comment.png │ ├── custom.css │ ├── doctools.js │ ├── down-pressed.png │ ├── down.png │ ├── file.png │ ├── jquery-3.1.0.js │ ├── jquery.js │ ├── minus.png │ ├── plus.png │ ├── pygments.css │ ├── searchtools.js │ ├── underscore-1.3.1.js │ ├── underscore.js │ ├── up-pressed.png │ ├── up.png │ └── websupport.js ├── bin ├── create-squad-predictions.py ├── jack-eval.py ├── jack-train.py ├── mmap-cli.py └── squad_evaluate-v1.1.py ├── conf ├── jack.yaml ├── lp │ ├── complex.yaml │ ├── complex_fb.yaml │ ├── complex_wn18rr.yaml │ ├── distmult.yaml │ ├── distmult_fb.yaml │ ├── distmult_test.yaml │ ├── distmult_wn18rr.yaml │ ├── transe.yaml │ ├── transe_fb.yaml │ └── transe_wn18rr.yaml ├── nli │ ├── cbilstm.yaml │ ├── dam.yaml │ ├── esim.yaml │ ├── modular_nli.yaml │ ├── multinli │ │ ├── abstract_multinli.yaml │ │ ├── cbilstm.yaml │ │ ├── dam.yaml │ │ └── esim.yaml │ └── snli │ │ ├── abstract_snli.yaml │ │ ├── cbilstm.yaml │ │ ├── dam.yaml │ │ └── esim.yaml └── qa │ ├── bidaf.yaml │ ├── fastqa.yaml │ ├── jackqa.yaml │ ├── modular_qa.yaml │ ├── squad │ ├── abstract_squad.yaml │ ├── bidaf.yaml │ ├── fastqa.yaml │ └── jackqa.yaml │ └── triviaqa │ ├── web │ ├── abstract_triviaqa.yaml │ ├── bidaf.yaml │ ├── fastqa.yaml │ └── jackqa.yaml │ └── wiki │ ├── abstract_triviaqa.yaml │ ├── bidaf.yaml │ ├── fastqa.yaml │ └── jackqa.yaml ├── data ├── CBT │ ├── .gitignore │ ├── download.sh │ └── snippet.jtr.json ├── FB15k-237 │ ├── .gitignore │ ├── download.sh │ └── snippet.jtr.json ├── GloVe │ ├── download.sh │ ├── download_small.sh │ └── glove.the.50d.txt ├── LS │ ├── download.sh │ └── snippet.jtr.json ├── MCTest │ ├── .gitignore │ ├── download.sh │ └── snippet.jtr.json ├── MultiNLI │ ├── download.sh │ └── snippet.jtr.json ├── NYT │ ├── download.sh │ └── naacl2013_snippet.jtr.json ├── NewsQA │ └── download.sh ├── PTB │ └── download.sh ├── QAngaroo │ ├── instructions.md │ └── qangaroo2squad.py ├── SNLI │ ├── .gitignore │ ├── README.md │ ├── download.sh │ ├── snippet.json │ ├── snippet.jtr_v1.json │ └── snippet.jtr_v2.json ├── SQuAD │ ├── .gitignore │ ├── download.sh │ ├── snippet.json │ └── snippet.jtr.json ├── TBD │ ├── SemEval2017Task10 │ │ ├── S0022311514005480.ann │ │ └── S0022311514005480.txt │ ├── StoryCloze │ │ ├── debug_shuffled.tsv │ │ ├── dev_shuffled.tsv │ │ ├── test_shuffled.tsv │ │ └── train_shuffled.tsv │ ├── TACKBP │ │ └── tackbp_snippet.json │ └── scienceQA │ │ ├── scienceQA_cloze_snippet.json │ │ └── scienceQA_cloze_snippet.txt ├── WN18 │ ├── download.sh │ ├── snippet.jtr │ └── snippet.txt 
├── WN18RR │ └── download.sh ├── emoji2vec │ ├── download.sh │ └── visualize.py ├── rc-data │ ├── .gitignore │ ├── README.md │ ├── cnn_snippet.jtr.json │ └── post_download.sh ├── sentihood │ ├── download.sh │ ├── sentihood-dev.json │ ├── sentihood-test.json │ ├── sentihood-train.json │ └── single_jtr.json ├── simpleQuestions │ ├── README │ ├── download.sh │ ├── snippet.jtr.json │ └── snippet.txt ├── triviaqa │ ├── README │ ├── config.py │ ├── convert2jack.py │ └── download.sh └── word2vec │ └── download.sh ├── docs ├── CLI.md ├── Encoder_Modules.md ├── Formats_for_Embeddings.md ├── How_to_test.md ├── TensorPorts.md └── Understanding_Jack_the_Reader.md ├── jack ├── __init__.py ├── core │ ├── __init__.py │ ├── data_structures.py │ ├── input_module.py │ ├── model_module.py │ ├── output_module.py │ ├── reader.py │ ├── shared_resources.py │ ├── tensorflow.py │ ├── tensorport.py │ └── torch.py ├── eval │ ├── __init__.py │ ├── base.py │ ├── classification.py │ ├── extractive_qa.py │ ├── link_prediction.py │ └── output_schema.json ├── io │ ├── CBT2jtr.py │ ├── FB15K2jtr.py │ ├── MCTest2jtr.py │ ├── NYT2jtr.py │ ├── SNLI2jtr.py │ ├── SNLI2jtr_concat.py │ ├── SQuAD2jtr.py │ ├── WN182jtr.py │ ├── __init__.py │ ├── bAbI2JTR.py │ ├── dataset_schema.json │ ├── embeddings │ │ ├── __init__.py │ │ ├── embeddings.py │ │ ├── fasttext.py │ │ ├── glove.py │ │ ├── memory_map.py │ │ └── word_to_vec.py │ ├── load.py │ ├── ls2jtr.py │ ├── merge_JTR_data_files.py │ ├── multiNLI2jtr.py │ ├── newsqa2jtr.py │ ├── newsqa2squad.py │ ├── rc-data2jtr.py │ ├── read_semeval2017Task10.py │ ├── scienceQA2jtr.py │ ├── sentihood2jtr.py │ ├── simpleQuestions2jtr.py │ └── validate.py ├── readers │ ├── __init__.py │ ├── classification │ │ ├── __init__.py │ │ ├── shared.py │ │ └── util.py │ ├── extractive_qa │ │ ├── __init__.py │ │ ├── shared.py │ │ ├── tensorflow │ │ │ ├── __init__.py │ │ │ ├── abstract_model.py │ │ │ ├── answer_layer.py │ │ │ ├── fastqa.py │ │ │ └── modular_qa_model.py │ │ ├── torch │ │ │ ├── __init__.py │ │ │ └── fastqa.py │ │ └── util.py │ ├── implementations.py │ ├── link_prediction │ │ ├── __init__.py │ │ ├── models.py │ │ ├── scores.py │ │ └── similarities.py │ └── natural_language_inference │ │ ├── __init__.py │ │ ├── conditional_bilstm.py │ │ ├── decomposable_attention.py │ │ ├── modular_nli_model.py │ │ └── prediction_layer.py ├── train_reader.py └── util │ ├── __init__.py │ ├── batch.py │ ├── hooks.py │ ├── map.py │ ├── preprocessing.py │ ├── random.py │ ├── tf │ ├── __init__.py │ ├── activations.py │ ├── attention.py │ ├── dropout.py │ ├── embedding.py │ ├── highway.py │ ├── interaction_layer.py │ ├── masking.py │ ├── misc.py │ ├── modular_encoder.py │ ├── pairwise_losses.py │ ├── rnn.py │ ├── segment.py │ ├── sequence_encoder.py │ ├── simple.py │ └── xqa.py │ ├── torch │ ├── __init__.py │ ├── embedding.py │ ├── highway.py │ ├── misc.py │ ├── rnn.py │ ├── segment.py │ └── xqa.py │ └── vocab.py ├── notebooks ├── model_implementation.ipynb ├── model_training.ipynb ├── prettyprint.py └── quick_start.ipynb ├── projects └── knowledge_integration │ ├── README.md │ ├── __init__.py │ ├── conf │ ├── nli │ │ ├── multinli │ │ │ └── cbilstm_assertion.yaml │ │ └── snli │ │ │ └── cbilstm_assertion.yaml │ └── qa │ │ ├── bilstm_assertion.yaml │ │ ├── squad │ │ ├── bilstm_assertion.yaml │ │ └── bilstm_assertion_definition.yaml │ │ └── triviaqa │ │ ├── web │ │ ├── bilstm_assertion.yaml │ │ └── bilstm_assertion_definition.yaml │ │ └── wiki │ │ ├── bilstm_assertion.yaml │ │ └── bilstm_assertion_definition.yaml │ ├── 
knowledge_store.py │ ├── nli.py │ ├── qa │ ├── __init__.py │ ├── definition_model.py │ └── shared.py │ ├── readers.py │ ├── scripts │ ├── __init__.py │ ├── extract_conceptnet.py │ ├── extract_side_information_for_dataset.py │ └── extract_wikipedia_short_abstract.py │ ├── shared.py │ └── tfutil.py ├── pytest.ini ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── conftest.py ├── jack │ ├── debug │ │ └── test_debug.py │ ├── eval │ │ └── test_kbp_eval.py │ ├── load │ │ └── test_loaders.py │ ├── preprocess │ │ ├── test_batch.py │ │ ├── test_map.py │ │ └── test_vocab_prune.py │ ├── readers │ │ ├── extractive_qa │ │ │ └── test_util.py │ │ ├── multiple_choice │ │ │ └── test_simple_mcqa.py │ │ ├── test_fastqa.py │ │ ├── test_fastqa_loop.py │ │ ├── test_kbp.py │ │ ├── test_models.py │ │ ├── test_readers.py │ │ └── test_serialization.py │ ├── test_core.py │ └── test_embeddings.py ├── test_conf │ ├── dam_test.yaml │ ├── fastqa_test.yaml │ └── snli_small_adagrad_test.yaml ├── test_data │ ├── MultiNLI │ │ ├── 1000_samples_dev_jtr.json │ │ ├── 2000_samples_train_jtr.json │ │ └── overfit.json │ ├── SNLI │ │ ├── 1000_samples_dev_jtr_v1.json │ │ ├── 1000_samples_snli_1.0_train.jsonl │ │ ├── 2000_samples_test_jtr_v1.json │ │ ├── 2000_samples_train_jtr_v1.json │ │ ├── dev.json │ │ ├── overfit.json │ │ ├── test.json │ │ └── train.json │ ├── WN18 │ │ └── wn18-snippet.jack.json │ ├── glove.500.50d.txt │ ├── glove.840B.300d_top256.txt │ ├── sentihood │ │ ├── overfit.json │ │ ├── sentihood-dev.json │ │ ├── sentihood-test.json │ │ └── sentihood-train.json │ ├── snli.json │ ├── snli_1k.json │ ├── snli_3k.json │ ├── squad │ │ ├── dev.json │ │ ├── overfit.json │ │ ├── snippet_jtr.json │ │ ├── test.json │ │ └── train.json │ └── wiki.json ├── test_readme.py └── test_results │ ├── dam_test │ ├── checkpoint │ ├── model_module.data-00000-of-00001 │ ├── model_module.index │ ├── model_module.meta │ └── shared_resources │ │ ├── answer_vocab │ │ ├── config.yaml │ │ ├── remainder │ │ └── vocab │ ├── fastqa_test │ ├── checkpoint │ ├── model_module.data-00000-of-00001 │ ├── model_module.index │ ├── model_module.meta │ └── shared_resources │ │ ├── config.yaml │ │ ├── embeddings │ │ └── config.yaml │ │ ├── remainder │ │ └── vocab │ ├── overfit_test │ ├── SNLI │ │ ├── dam │ │ │ └── expected_results.txt │ │ └── esim │ │ │ └── expected_results.txt │ └── squad │ │ └── fastqa │ │ └── expected_results.txt │ ├── rename_recursively.py │ └── smalldata_test │ ├── SNLI │ ├── dam │ │ └── expected_results.txt │ └── esim │ │ └── expected_results.txt │ └── squad │ └── fastqa │ └── expected_results.txt └── wercker.yml /.gitignore: -------------------------------------------------------------------------------- 1 | # OSX specific 2 | .DS_Store 3 | __MACOSX 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # IPython Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # Idea 96 | .idea/ 97 | 98 | # TensorBoard dirs 99 | .tb 100 | 101 | # Vim buffer files 102 | *.swp 103 | 104 | # Test result files 105 | testresult_* 106 | 107 | # Snippets 108 | snippets/ 109 | 110 | # Datasets 111 | data/WN18/*.tgz 112 | data/WN18/wordnet-mlj12 113 | tests/test_results/fastqa_reader_test/ 114 | tests/test_results/dam_reader_test/ 115 | data/GloVe/glove.840B.300d.* 116 | data/MultiNLI/multinli_1.0/ 117 | 118 | data/WN18/train.jtr 119 | data/WN18/valid.jtr 120 | data/WN18/test.jtr 121 | data/WN18/wn18* 122 | 123 | data/WN18RR/test.jtr 124 | data/WN18RR/test.txt 125 | data/WN18RR/train.jtr 126 | data/WN18RR/train.txt 127 | data/WN18RR/valid.jtr 128 | data/WN18RR/valid.txt 129 | data/WN18RR/wn18.tgz 130 | 131 | saved_reader/ 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 UCL Machine Reading 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # simple makefile to simplify repetitive build env management tasks under posix 2 | PYTHON := python3 3 | PIP := $(PYTHON) -m pip 4 | PYTEST := $(PYTHON) -m pytest 5 | 6 | init: 7 | $(PIP) install -r requirements.txt 8 | install: 9 | $(PYTHON) setup.py install 10 | install-develop: 11 | $(PYTHON) setup.py develop 12 | install-user: 13 | $(PYTHON) setup.py install --user 14 | clean: 15 | $(PYTHON) setup.py clean --all 16 | unittest: 17 | $(PYTEST) tests -v -m "not (overfit or smalldata)" -k "not test_pipeline" 18 | test: 19 | $(PYTEST) tests -v -m "not (smalldata)" 20 | 21 | # FIXME: this should probably be test-overfit rather than overfit 22 | overfit: 23 | $(PYTEST) tests -v -m "overfit" 24 | smalldata: 25 | $(PYTEST) tests -v -m "smalldata" 26 | 27 | SNLI: 28 | $(PYTEST) tests -v -m SNLI 29 | doctests: 30 | $(PYTEST) --doctest-modules jtr/preprocess/vocab.py 31 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = jtr 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/index.rst: -------------------------------------------------------------------------------- 1 | .. jtr documentation master file, created by 2 | sphinx-quickstart on Mon Jan 9 17:30:20 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to jtr's documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 4 11 | :caption: Contents: 12 | 13 | modules 14 | 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/jtr.format.convert.rst: -------------------------------------------------------------------------------- 1 | jtr.format.convert package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | jtr.format.convert.nyt module 8 | ----------------------------- 9 | 10 | .. automodule:: jtr.format.convert.nyt 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | jtr.format.convert.squad module 16 | ------------------------------- 17 | 18 | .. automodule:: jtr.format.convert.squad 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. 
automodule:: jtr.format.convert 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/jtr.format.rst: -------------------------------------------------------------------------------- 1 | jtr.format package 2 | ================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | jtr.format.convert 10 | 11 | Submodules 12 | ---------- 13 | 14 | jtr.format.validate module 15 | -------------------------- 16 | 17 | .. automodule:: jtr.format.validate 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: jtr.format 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/jtr.load.embeddings.rst: -------------------------------------------------------------------------------- 1 | jtr.load.embeddings package 2 | =========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | jtr.load.embeddings.embeddings module 8 | ------------------------------------- 9 | 10 | .. automodule:: jtr.load.embeddings.embeddings 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | jtr.load.embeddings.glove module 16 | -------------------------------- 17 | 18 | .. automodule:: jtr.load.embeddings.glove 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | jtr.load.embeddings.vocabulary module 24 | ------------------------------------- 25 | 26 | .. automodule:: jtr.load.embeddings.vocabulary 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | jtr.load.embeddings.word_to_vec module 32 | -------------------------------------- 33 | 34 | .. automodule:: jtr.load.embeddings.word_to_vec 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | 40 | Module contents 41 | --------------- 42 | 43 | .. automodule:: jtr.load.embeddings 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/jtr.load.rst: -------------------------------------------------------------------------------- 1 | jtr.load package 2 | ================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | jtr.load.embeddings 10 | 11 | Submodules 12 | ---------- 13 | 14 | jtr.load.FB15K2jtr module 15 | ------------------------- 16 | 17 | .. automodule:: jtr.load.FB15K2jtr 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | jtr.load.NYT2jtr module 23 | ----------------------- 24 | 25 | .. automodule:: jtr.load.NYT2jtr 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | jtr.load.SNLI2jtr_v1 module 31 | --------------------------- 32 | 33 | .. automodule:: jtr.load.SNLI2jtr_v1 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | jtr.load.SNLI2jtr_v2 module 39 | --------------------------- 40 | 41 | .. automodule:: jtr.load.SNLI2jtr_v2 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | jtr.load.create_dummpy_scienceQA module 47 | --------------------------------------- 48 | 49 | .. automodule:: jtr.load.create_dummpy_scienceQA 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | jtr.load.ls2jtr module 55 | ---------------------- 56 | 57 | .. 
automodule:: jtr.load.ls2jtr 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | jtr.load.read_jtr module 63 | ------------------------ 64 | 65 | .. automodule:: jtr.load.read_jtr 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | jtr.load.read_semeval2017Task10 module 71 | -------------------------------------- 72 | 73 | .. automodule:: jtr.load.read_semeval2017Task10 74 | :members: 75 | :undoc-members: 76 | :show-inheritance: 77 | 78 | jtr.load.scienceQA2jtr module 79 | ----------------------------- 80 | 81 | .. automodule:: jtr.load.scienceQA2jtr 82 | :members: 83 | :undoc-members: 84 | :show-inheritance: 85 | 86 | jtr.load.sentihood2jtr module 87 | ----------------------------- 88 | 89 | .. automodule:: jtr.load.sentihood2jtr 90 | :members: 91 | :undoc-members: 92 | :show-inheritance: 93 | 94 | jtr.load.simpleQuestions2jtr module 95 | ----------------------------------- 96 | 97 | .. automodule:: jtr.load.simpleQuestions2jtr 98 | :members: 99 | :undoc-members: 100 | :show-inheritance: 101 | 102 | 103 | Module contents 104 | --------------- 105 | 106 | .. automodule:: jtr.load 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/jtr.nn.rst: -------------------------------------------------------------------------------- 1 | jtr.nn package 2 | ============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | jtr.nn.core_models module 8 | ------------------------- 9 | 10 | .. automodule:: jtr.nn.core_models 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | jtr.nn.models module 16 | -------------------- 17 | 18 | .. automodule:: jtr.nn.models 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: jtr.nn 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/jtr.rst: -------------------------------------------------------------------------------- 1 | jtr package 2 | =========== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | jtr.format 10 | jtr.load 11 | jtr.nn 12 | jtr.util 13 | 14 | Submodules 15 | ---------- 16 | 17 | jtr.pairwise_losses module 18 | -------------------------- 19 | 20 | .. automodule:: jtr.pairwise_losses 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | jtr.pipelines module 26 | -------------------- 27 | 28 | .. automodule:: jtr.pipelines 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | jtr.train module 34 | ---------------- 35 | 36 | .. automodule:: jtr.train 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | jtr.training_pipeline module 42 | ---------------------------- 43 | 44 | .. automodule:: jtr.training_pipeline 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | 50 | Module contents 51 | --------------- 52 | 53 | .. automodule:: jtr 54 | :members: 55 | :undoc-members: 56 | :show-inheritance: 57 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/jtr.util.rst: -------------------------------------------------------------------------------- 1 | jtr.util package 2 | ================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | jtr.util.gen_data module 8 | ------------------------ 9 | 10 | .. 
automodule:: jtr.util.gen_data 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | jtr.util.hooks module 16 | --------------------- 17 | 18 | .. automodule:: jtr.util.hooks 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | jtr.util.rs module 24 | ------------------ 25 | 26 | .. automodule:: jtr.util.rs 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | jtr.util.tfutil module 32 | ---------------------- 33 | 34 | .. automodule:: jtr.util.tfutil 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | jtr.util.util module 40 | -------------------- 41 | 42 | .. automodule:: jtr.util.util 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | 48 | Module contents 49 | --------------- 50 | 51 | .. automodule:: jtr.util 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/modules.rst: -------------------------------------------------------------------------------- 1 | jtr 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | jtr 8 | setup 9 | -------------------------------------------------------------------------------- /api_docs/APIDocGeneration/setup.rst: -------------------------------------------------------------------------------- 1 | setup module 2 | ============ 3 | 4 | .. automodule:: setup 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /api_docs/How_to_apidocstrings2htmlmd: -------------------------------------------------------------------------------- 1 | # How to Generate New Docs From Scratch 2 | Install [Sphinx](http://www.sphinx-doc.org/en/1.5.1/install.html) and its requirements. Then execute the steps below, adapted from [this StackOverflow question](http://stackoverflow.com/questions/20354768/python-sphinx-how-to-document-one-file-with-functions) 3 | 4 | Here is a step-by-step list: 5 | 6 | 1. Create documentation folder: `mkdir doc` 7 | 2. Enter doc/: `cd doc` 8 | 3. Execute sphinx-quickstart (Be sure to select autodoc: y, Makefile: y) 9 | 4. Edit conf.py to specify sys.path: `sys.path.insert(0, os.path.abspath('..'))` 10 | 5. Edit index.rst and specify modules in the toctree: 11 | ``` 12 | .. toctree:: 13 | :maxdepth: 2 14 | 15 | modules 16 | ``` 17 | 6. Execute sphinx-apidoc -o . .. 18 | 7. Generate the html output: make html 19 | 8. 
View your documentation: firefox _build/html/index.html 20 | -------------------------------------------------------------------------------- /api_docs/How_to_contribute.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | In the following, you can see a simple baseline model for JTR: 4 | 5 | TODO: describe nvocab, **options 6 | 7 | ```python 8 | def boe_nosupport_cands_reader_model(placeholders, nvocab, **options): 9 | """ 10 | Bag of embedding reader with pairs of (question, support) and candidates 11 | """ 12 | 13 | # Model 14 | # [batch_size, max_seq1_length] 15 | question = placeholders['question'] 16 | 17 | # [batch_size, candidate_size] 18 | targets = placeholders['targets'] 19 | 20 | # [batch_size, max_num_cands] 21 | candidates = placeholders['candidates'] 22 | 23 | with tf.variable_scope("embedders") as varscope: 24 | question_embedded = nvocab(question) 25 | varscope.reuse_variables() 26 | candidates_embedded = nvocab(candidates) 27 | 28 | logger.info('TRAINABLE VARIABLES (only embeddings): {}'.format(get_total_trainable_variables())) 29 | question_encoding = tf.reduce_sum(question_embedded, 1) 30 | 31 | scores = logits = tf.reduce_sum(tf.expand_dims(question_encoding, 1) * candidates_embedded, 2) 32 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(scores, targets), name='predictor_loss') 33 | predict = tf.arg_max(tf.nn.softmax(logits), 1, name='prediction') 34 | 35 | logger.info('TRAINABLE VARIABLES (embeddings + model): {}'.format(get_total_trainable_variables())) 36 | logger.info('ALL VARIABLES (embeddings + model): {}'.format(get_total_variables())) 37 | 38 | return logits, loss, predict 39 | ``` 40 | -------------------------------------------------------------------------------- /api_docs/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/objects.inv -------------------------------------------------------------------------------- /api_docs/sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. jtr documentation master file, created by 2 | sphinx-quickstart on Mon Jan 9 17:30:20 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to jtr's documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 4 11 | :caption: Contents: 12 | 13 | modules 14 | 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /api_docs/sources/jtr.format.convert.rst.txt: -------------------------------------------------------------------------------- 1 | jtr.format.convert package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | jtr.format.convert.nyt module 8 | ----------------------------- 9 | 10 | .. automodule:: jtr.format.convert.nyt 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | jtr.format.convert.squad module 16 | ------------------------------- 17 | 18 | .. automodule:: jtr.format.convert.squad 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. 
automodule:: jtr.format.convert 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /api_docs/sources/jtr.format.rst.txt: -------------------------------------------------------------------------------- 1 | jtr.format package 2 | ================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | jtr.format.convert 10 | 11 | Submodules 12 | ---------- 13 | 14 | jtr.format.validate module 15 | -------------------------- 16 | 17 | .. automodule:: jtr.format.validate 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: jtr.format 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /api_docs/sources/jtr.load.embeddings.rst.txt: -------------------------------------------------------------------------------- 1 | jtr.load.embeddings package 2 | =========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | jtr.load.embeddings.embeddings module 8 | ------------------------------------- 9 | 10 | .. automodule:: jtr.load.embeddings.embeddings 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | jtr.load.embeddings.glove module 16 | -------------------------------- 17 | 18 | .. automodule:: jtr.load.embeddings.glove 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | jtr.load.embeddings.vocabulary module 24 | ------------------------------------- 25 | 26 | .. automodule:: jtr.load.embeddings.vocabulary 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | jtr.load.embeddings.word_to_vec module 32 | -------------------------------------- 33 | 34 | .. automodule:: jtr.load.embeddings.word_to_vec 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | 40 | Module contents 41 | --------------- 42 | 43 | .. automodule:: jtr.load.embeddings 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | -------------------------------------------------------------------------------- /api_docs/sources/jtr.load.rst.txt: -------------------------------------------------------------------------------- 1 | jtr.load package 2 | ================ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | jtr.load.embeddings 10 | 11 | Submodules 12 | ---------- 13 | 14 | jtr.load.FB15K2jtr module 15 | ------------------------- 16 | 17 | .. automodule:: jtr.load.FB15K2jtr 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | jtr.load.NYT2jtr module 23 | ----------------------- 24 | 25 | .. automodule:: jtr.load.NYT2jtr 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | jtr.load.SNLI2jtr_v1 module 31 | --------------------------- 32 | 33 | .. automodule:: jtr.load.SNLI2jtr_v1 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | jtr.load.SNLI2jtr_v2 module 39 | --------------------------- 40 | 41 | .. automodule:: jtr.load.SNLI2jtr_v2 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | jtr.load.create_dummpy_scienceQA module 47 | --------------------------------------- 48 | 49 | .. automodule:: jtr.load.create_dummpy_scienceQA 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | jtr.load.ls2jtr module 55 | ---------------------- 56 | 57 | .. 
automodule:: jtr.load.ls2jtr 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | jtr.load.read_jtr module 63 | ------------------------ 64 | 65 | .. automodule:: jtr.load.read_jtr 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | jtr.load.read_semeval2017Task10 module 71 | -------------------------------------- 72 | 73 | .. automodule:: jtr.load.read_semeval2017Task10 74 | :members: 75 | :undoc-members: 76 | :show-inheritance: 77 | 78 | jtr.load.scienceQA2jtr module 79 | ----------------------------- 80 | 81 | .. automodule:: jtr.load.scienceQA2jtr 82 | :members: 83 | :undoc-members: 84 | :show-inheritance: 85 | 86 | jtr.load.sentihood2jtr module 87 | ----------------------------- 88 | 89 | .. automodule:: jtr.load.sentihood2jtr 90 | :members: 91 | :undoc-members: 92 | :show-inheritance: 93 | 94 | jtr.load.simpleQuestions2jtr module 95 | ----------------------------------- 96 | 97 | .. automodule:: jtr.load.simpleQuestions2jtr 98 | :members: 99 | :undoc-members: 100 | :show-inheritance: 101 | 102 | 103 | Module contents 104 | --------------- 105 | 106 | .. automodule:: jtr.load 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | -------------------------------------------------------------------------------- /api_docs/sources/jtr.nn.rst.txt: -------------------------------------------------------------------------------- 1 | jtr.nn package 2 | ============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | jtr.nn.core_models module 8 | ------------------------- 9 | 10 | .. automodule:: jtr.nn.core_models 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | jtr.nn.models module 16 | -------------------- 17 | 18 | .. automodule:: jtr.nn.models 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: jtr.nn 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /api_docs/sources/jtr.rst.txt: -------------------------------------------------------------------------------- 1 | jtr package 2 | =========== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | jtr.format 10 | jtr.load 11 | jtr.nn 12 | jtr.util 13 | 14 | Submodules 15 | ---------- 16 | 17 | jtr.pairwise_losses module 18 | -------------------------- 19 | 20 | .. automodule:: jtr.pairwise_losses 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | jtr.pipelines module 26 | -------------------- 27 | 28 | .. automodule:: jtr.pipelines 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | jtr.train module 34 | ---------------- 35 | 36 | .. automodule:: jtr.train 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | jtr.training_pipeline module 42 | ---------------------------- 43 | 44 | .. automodule:: jtr.training_pipeline 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | 50 | Module contents 51 | --------------- 52 | 53 | .. automodule:: jtr 54 | :members: 55 | :undoc-members: 56 | :show-inheritance: 57 | -------------------------------------------------------------------------------- /api_docs/sources/jtr.util.rst.txt: -------------------------------------------------------------------------------- 1 | jtr.util package 2 | ================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | jtr.util.gen_data module 8 | ------------------------ 9 | 10 | .. 
automodule:: jtr.util.gen_data 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | jtr.util.hooks module 16 | --------------------- 17 | 18 | .. automodule:: jtr.util.hooks 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | jtr.util.rs module 24 | ------------------ 25 | 26 | .. automodule:: jtr.util.rs 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | jtr.util.tfutil module 32 | ---------------------- 33 | 34 | .. automodule:: jtr.util.tfutil 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | jtr.util.util module 40 | -------------------- 41 | 42 | .. automodule:: jtr.util.util 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | 48 | Module contents 49 | --------------- 50 | 51 | .. automodule:: jtr.util 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | -------------------------------------------------------------------------------- /api_docs/sources/modules.rst.txt: -------------------------------------------------------------------------------- 1 | jtr 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | jtr 8 | setup 9 | -------------------------------------------------------------------------------- /api_docs/sources/setup.rst.txt: -------------------------------------------------------------------------------- 1 | setup module 2 | ============ 3 | 4 | .. automodule:: setup 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /api_docs/static/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/ajax-loader.gif -------------------------------------------------------------------------------- /api_docs/static/comment-bright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/comment-bright.png -------------------------------------------------------------------------------- /api_docs/static/comment-close.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/comment-close.png -------------------------------------------------------------------------------- /api_docs/static/comment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/comment.png -------------------------------------------------------------------------------- /api_docs/static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally left blank. 
*/ 2 | -------------------------------------------------------------------------------- /api_docs/static/down-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/down-pressed.png -------------------------------------------------------------------------------- /api_docs/static/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/down.png -------------------------------------------------------------------------------- /api_docs/static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/file.png -------------------------------------------------------------------------------- /api_docs/static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/minus.png -------------------------------------------------------------------------------- /api_docs/static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/plus.png -------------------------------------------------------------------------------- /api_docs/static/up-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/up-pressed.png -------------------------------------------------------------------------------- /api_docs/static/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/api_docs/static/up.png -------------------------------------------------------------------------------- /bin/create-squad-predictions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import logging 6 | import os 7 | import sys 8 | 9 | import tensorflow as tf 10 | 11 | from jack.io.load import loaders 12 | from jack.readers.implementations import reader_from_file 13 | 14 | logger = logging.getLogger(os.path.basename(sys.argv[0])) 15 | logging.basicConfig(level=logging.INFO) 16 | 17 | tf.app.flags.DEFINE_string('dataset', None, 'dataset file') 18 | tf.app.flags.DEFINE_string('loader', 'squad', 'either squad or jack') 19 | tf.app.flags.DEFINE_string('load_dir', None, 'directory to saved model') 20 | tf.app.flags.DEFINE_string('out', "results.json", 'Result file path.') 21 | tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size') 22 | tf.app.flags.DEFINE_string('overwrite', '{}', 'json string that can overwrite configuration.') 23 | 24 | FLAGS = tf.app.flags.FLAGS 25 | 26 | logger.info("Creating and loading reader from {}...".format(FLAGS.load_dir)) 27 | config = {"max_support_length": None} 28 | config.update(json.loads(FLAGS.overwrite)) 29 | reader = reader_from_file(FLAGS.load_dir, **config) 30 | 31 | dataset = loaders[FLAGS.loader](FLAGS.dataset) 32 | 33 | logger.info("Start!") 34 | answers = 
reader.process_dataset(dataset, FLAGS.batch_size, silent=False) 35 | results = {dataset[i][0].id: a.text for i, a in enumerate(answers)} 36 | with open(FLAGS.out, "w") as out_file: 37 | json.dump(results, out_file) 38 | 39 | logger.info("Done!") 40 | -------------------------------------------------------------------------------- /bin/jack-eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import logging 6 | import os 7 | import sys 8 | 9 | import tensorflow as tf 10 | 11 | from jack.eval import evaluate_reader, pretty_print_results 12 | from jack.io.load import loaders 13 | from jack.readers import reader_from_file 14 | 15 | logger = logging.getLogger(os.path.basename(sys.argv[0])) 16 | logging.basicConfig(level=logging.INFO) 17 | 18 | tf.app.flags.DEFINE_string('dataset', None, 'dataset file') 19 | tf.app.flags.DEFINE_string('loader', 'jack', 'name of loader') 20 | tf.app.flags.DEFINE_string('load_dir', None, 'directory to saved model') 21 | tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size') 22 | tf.app.flags.DEFINE_integer('max_examples', None, 'maximum number of examples to evaluate') 23 | tf.app.flags.DEFINE_string('overwrite', '{}', 'json string that overwrites configuration.') 24 | 25 | FLAGS = tf.app.flags.FLAGS 26 | 27 | logger.info("Creating and loading reader from {}...".format(FLAGS.load_dir)) 28 | 29 | kwargs = json.loads(FLAGS.overwrite) 30 | 31 | reader = reader_from_file(FLAGS.load_dir, **kwargs) 32 | dataset = loaders[FLAGS.loader](FLAGS.dataset) 33 | if FLAGS.max_examples: 34 | dataset = dataset[:FLAGS.max_examples] 35 | 36 | logger.info("Start!") 37 | result_dict = evaluate_reader(reader, dataset, FLAGS.batch_size) 38 | 39 | 40 | logger.info("############### RESULTS ##############") 41 | pretty_print_results(result_dict) 42 | -------------------------------------------------------------------------------- /bin/mmap-cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | 7 | from jack.io.embeddings import load_embeddings 8 | from jack.io.embeddings.memory_map import save_as_memory_map_dir 9 | 10 | import logging 11 | logger = logging.getLogger(os.path.basename(sys.argv[0])) 12 | 13 | 14 | def main(): 15 | import argparse 16 | parser = argparse.ArgumentParser(description='Convert embeddings to memory map directory') 17 | parser.add_argument("input_file", help="The input embedding file.") 18 | parser.add_argument("output_dir", 19 | help="The name of the directory to store the memory map in. 
Will be created if it doesn't " 20 | "exist.") 21 | parser.add_argument("-f", "--input_format", help="Format of input embeddings.", default="glove", 22 | choices=["glove", "word2vec", "memory_map_dir"]) 23 | args = parser.parse_args() 24 | input_name = args.input_file 25 | output_dir = args.output_dir 26 | embeddings = load_embeddings(input_name, typ=args.input_format) 27 | logging.info("Loaded embeddings from {}".format(input_name)) 28 | save_as_memory_map_dir(output_dir, embeddings) 29 | logging.info("Stored embeddings to {}".format(output_dir)) 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /bin/squad_evaluate-v1.1.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """ 2 | from __future__ import print_function 3 | 4 | import argparse 5 | import json 6 | 7 | from jack.eval.extractive_qa_eval import * 8 | 9 | 10 | def evaluate(dataset, predictions): 11 | f1 = exact_match = total = 0 12 | for article in dataset: 13 | for paragraph in article['paragraphs']: 14 | for qa in paragraph['qas']: 15 | total += 1 16 | if qa['id'] not in predictions: 17 | message = 'Unanswered question ' + qa['id'] + \ 18 | ' will receive score 0.' 19 | print(message, file=sys.stderr) 20 | continue 21 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 22 | prediction = predictions[qa['id']] 23 | exact_match += metric_max_over_ground_truths( 24 | exact_match_score, prediction, ground_truths) 25 | f1 += metric_max_over_ground_truths( 26 | f1_score, prediction, ground_truths) 27 | 28 | exact_match = 100.0 * exact_match / total 29 | f1 = 100.0 * f1 / total 30 | 31 | return {'exact_match': exact_match, 'f1': f1} 32 | 33 | 34 | if __name__ == '__main__': 35 | expected_version = '1.1' 36 | parser = argparse.ArgumentParser( 37 | description='Evaluation for SQuAD ' + expected_version) 38 | parser.add_argument('dataset_file', help='Dataset file') 39 | parser.add_argument('prediction_file', help='Prediction File') 40 | args = parser.parse_args() 41 | with open(args.dataset_file) as dataset_file: 42 | dataset_json = json.load(dataset_file) 43 | if (dataset_json['version'] != expected_version): 44 | print('Evaluation expects v-' + expected_version + 45 | ', but got dataset with v-' + dataset_json['version'], 46 | file=sys.stderr) 47 | dataset = dataset_json['data'] 48 | with open(args.prediction_file) as prediction_file: 49 | predictions = json.load(prediction_file) 50 | print(json.dumps(evaluate(dataset, predictions))) 51 | -------------------------------------------------------------------------------- /conf/lp/complex.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'complex' 8 | debug: False 9 | 10 | reader: 'complex_reader' 11 | 12 | train: 'data/WN18/train.jtr' 13 | dev: 'data/WN18/valid.jtr' 14 | test: 'data/WN18/test.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 200 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 32 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | 
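Like the other files under conf/, the link-prediction config above builds on `conf/jack.yaml` through its `parent_config` key, so a child file only has to state the values it overrides (and some configs, e.g. `conf/nli/multinli/dam.yaml`, list several parents). The snippet below is an illustrative sketch of how such a `parent_config` chain could be resolved into one flat dictionary; it is not Jack's actual configuration loader, and the helper name `load_config` is made up for this example.

```python
# Hypothetical sketch of resolving `parent_config` inheritance for the YAML
# files under conf/. NOT Jack's actual loader; it only illustrates the
# override behaviour implied by the configs (child keys win over parent keys).
import yaml  # PyYAML


def load_config(path):
    """Load a YAML config and merge it on top of its parent_config chain."""
    with open(path) as f:
        config = yaml.safe_load(f) or {}

    parents = config.pop('parent_config', None)
    if parents is None:
        return config
    if isinstance(parents, str):
        parents = [parents]

    merged = {}
    for parent_path in parents:  # e.g. './conf/jack.yaml'
        merged.update(load_config(parent_path))
    merged.update(config)        # child values override parent values
    return merged


# Example (assuming the working directory is the repository root):
# config = load_config('conf/lp/complex.yaml')
# print(config['reader'], config['repr_dim'])  # -> complex_reader 200
```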
-------------------------------------------------------------------------------- /conf/lp/complex_fb.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'complex' 8 | debug: False 9 | 10 | reader: 'complex_reader' 11 | 12 | train: 'data/FB15k-237/train.jtr' 13 | dev: 'data/FB15k-237/valid.jtr' 14 | test: 'data/FB15k-237/test.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 200 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 32 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | -------------------------------------------------------------------------------- /conf/lp/complex_wn18rr.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'complex' 8 | debug: False 9 | 10 | reader: 'complex_reader' 11 | 12 | train: 'data/WN18RR/train.jtr' 13 | dev: 'data/WN18RR/valid.jtr' 14 | test: 'data/WN18RR/test.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 200 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 32 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | -------------------------------------------------------------------------------- /conf/lp/distmult.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'distmult' 8 | debug: False 9 | 10 | reader: 'distmult_reader' 11 | 12 | train: 'data/WN18/train.jtr' 13 | dev: 'data/WN18/valid.jtr' 14 | test: 'data/WN18/test.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 200 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 8192 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | -------------------------------------------------------------------------------- /conf/lp/distmult_fb.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'distmult' 8 | debug: False 9 | 10 | reader: 'distmult_reader' 11 | 12 | train: 'data/FB15k-237/train.jtr' 13 | dev: 'data/FB15k-237/valid.jtr' 14 | test: 'data/FB15k-237/test.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 200 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 8192 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | -------------------------------------------------------------------------------- /conf/lp/distmult_test.yaml: 
-------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'distmult' 8 | debug: False 9 | 10 | reader: 'distmult_reader' 11 | 12 | train: 'data/WN18/snippet.jtr' 13 | dev: 'data/WN18/snippet.jtr' 14 | test: 'data/WN18/snippet.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 20 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 32 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | -------------------------------------------------------------------------------- /conf/lp/distmult_wn18rr.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'distmult' 8 | debug: False 9 | 10 | reader: 'distmult_reader' 11 | 12 | train: 'data/WN18RR/train.jtr' 13 | dev: 'data/WN18RR/valid.jtr' 14 | test: 'data/WN18RR/test.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 200 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 8192 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | -------------------------------------------------------------------------------- /conf/lp/transe.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'transe' 8 | debug: False 9 | 10 | reader: 'transe_reader' 11 | 12 | train: 'data/WN18/train.jtr' 13 | dev: 'data/WN18/valid.jtr' 14 | test: 'data/WN18/test.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 200 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 32 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | -------------------------------------------------------------------------------- /conf/lp/transe_fb.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'transe' 8 | debug: False 9 | 10 | reader: 'transe_reader' 11 | 12 | train: 'data/FB15k-237/train.jtr' 13 | dev: 'data/FB15k-237/valid.jtr' 14 | test: 'data/FB15k-237/test.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 200 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 32 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | -------------------------------------------------------------------------------- /conf/lp/transe_wn18rr.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | DistMult - 
https://www.microsoft.com/en-us/research/publication/embedding-entities-and-relations-for-learning-and-inference-in-knowledge-bases/ 3 | 4 | parent_config: 'conf/jack.yaml' 5 | loader: jack 6 | 7 | name: 'transe' 8 | debug: False 9 | 10 | reader: 'transe_reader' 11 | 12 | train: 'data/WN18RR/train.jtr' 13 | dev: 'data/WN18RR/valid.jtr' 14 | test: 'data/WN18RR/test.jtr' 15 | 16 | seed: 1337 17 | epochs: 100 18 | repr_dim: 200 19 | num_negative: 1 20 | with_char_embeddings: False 21 | prune: False 22 | lowercase: False 23 | 24 | batch_size: 32 25 | optimizer: adam 26 | learning_rate: 0.005 27 | learning_rate_decay: 1 28 | -------------------------------------------------------------------------------- /conf/nli/cbilstm.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | Conditional BiLSTM, processes premise with BiLSTM conditioned on processed hypothesis. Max pooling over the processed 3 | premise states is used prior to classification. 4 | 5 | parent_config: './conf/jack.yaml' 6 | 7 | # Reader model to use, see jack/readers/implementations.py for options 8 | reader: 'cbilstm_nli_reader' 9 | -------------------------------------------------------------------------------- /conf/nli/dam.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | Decomposable Attention Model https://arxiv.org/abs/1606.01933 3 | 4 | parent_config: './conf/jack.yaml' 5 | 6 | # Reader model to use, see jack/readers/implementations.py for options 7 | reader: 'dam_snli_reader' 8 | 9 | normalize_embeddings: True 10 | -------------------------------------------------------------------------------- /conf/nli/esim.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | ESIM model https://arxiv.org/abs/1609.06038 3 | 4 | parent_config: './conf/nli/modular_nli.yaml' 5 | 6 | name: 'esim_reader' 7 | 8 | model: 9 | encoder_layer: 10 | # BiLSTM 11 | - input: 'hypothesis' 12 | module: 'lstm' 13 | with_projection: True # not in original model but helps 14 | activation: 'tanh' 15 | name: 'encoder' 16 | dropout: True 17 | 18 | # BiLSTM 19 | - input: 'premise' 20 | module: 'lstm' 21 | with_projection: True # not in original model but helps 22 | activation: 'tanh' 23 | name: 'encoder' 24 | dropout: True 25 | 26 | # Attention 27 | - input: 'premise' 28 | dependent: 'hypothesis' 29 | output: 'hypothesis_attn' 30 | module: 'attention_matching' 31 | attn_type: 'dot' 32 | concat: False 33 | - input: 'hypothesis' 34 | dependent: 'premise' 35 | output: 'premise_attn' 36 | module: 'attention_matching' 37 | attn_type: 'dot' 38 | concat: False 39 | 40 | - input: ['premise', 'hypothesis_attn'] 41 | output: 'premise_mul' 42 | module: 'mul' 43 | - input: ['premise', 'hypothesis_attn'] 44 | output: 'premise_sub' 45 | module: 'sub' 46 | - input: ['premise', 'hypothesis_attn', 'premise_mul', 'premise_sub'] 47 | output: 'premise' 48 | module: 'concat' 49 | - input: 'premise' 50 | module: 'dense' 51 | name: 'projection' 52 | activation: 'relu' 53 | dropout: True 54 | 55 | - input: ['hypothesis', 'premise_attn'] 56 | output: 'hypothesis_mul' 57 | module: 'mul' 58 | - input: ['hypothesis', 'premise_attn'] 59 | output: 'hypothesis_sub' 60 | module: 'sub' 61 | - input: ['hypothesis', 'premise_attn', 'hypothesis_mul', 'hypothesis_sub'] 62 | output: 'hypothesis' 63 | module: 'concat' 64 | - input: 'hypothesis' 65 | module: 'dense' 66 | name: 'projection' 67 | activation: 'relu' 68 | dropout: True 
69 | 70 | # inference composition 71 | # BiLSTM 72 | - input: 'hypothesis' 73 | module: 'lstm' 74 | name: 'composition' 75 | 76 | # BiLSTM 77 | - input: 'premise' 78 | module: 'lstm' 79 | name: 'composition' 80 | 81 | prediction_layer: 82 | module: 'max_avg_mlp' 83 | dropout: True 84 | -------------------------------------------------------------------------------- /conf/nli/modular_nli.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | Modular NLI reader base configuration, inheriting from the default jack.yaml 3 | 4 | parent_config: './conf/jack.yaml' 5 | 6 | # Reader model to use, see jack/readers/implementations.py for options 7 | reader: 'modular_nli_reader' 8 | 9 | with_char_embeddings: False 10 | model: 11 | encoder_layer: null # list of encoder modules with input key ('question' or 'support' in beginning) 12 | prediction_layer: null # 'max_avg_mlp' (h_max, h_avg, p_max, p_avg), 'max_mlp' (h_max, p_max), 'max_interaction_mlp' (h_max, p_max, h_max - p_max, h_max * p_max) 13 | 14 | # encoder modules can be combined as desired and are defined by the following keys 15 | # * input - required, string indicating what input to encode (starts with possibilities 'question' or 'support') 16 | # * output - optional, set to input by default but can be overwritten to something else => after defining a new output key it can be used later as input somewhere else 17 | # * repr_dim - dimensionality of output 18 | # * module - BiRNNs: 'lstm', 'gru', 'sru' ('with_projection: True' will also employ a projection layer on top of the BiRNNs which is recommended) 19 | # CONVs: 'gldr' (gated linear dilated residual network), 'conv' (convolution) 20 | # MISC: 'projection' (linear projection), 'self_attn', 'concat' (use 'input' to define list of keys to concatenate) 21 | # * residual - whether this encoder should be residually employed 22 | # * num_layers - number of times this encoder is applied consecutively 23 | # 'conv' requires another parameter, 'conv_width' (3 by default) and can have an 'activation' 24 | # 'gldr' requires additional parameters, 'conv_width' (3 by default) and 'dilations' 25 | # (a list of dilations for each layer of the gldr network) 26 | # 'projection' has an additional 'activation' attribute which can be 'relu', 'tanh', 'sigmoid', etc (everything in tf.nn) 27 | # 'self_attn' supports attn types: 'dot', 'bilinear', 'diagonal_bilinear', 'mlp' 28 | # 'dot', 'bilinear', 'diagonal_bilinear' have an additional 'scale' attribute which scales attn scores by sqrt of repr_dim 29 | # of input states; it is recommended to use it for 'dot' and 'diagonal_bilinear' 30 | # 'mlp' has additional 'repr_dim' and 'activation' properties for the dimensionality and activation of the hidden layer 31 | # you can set the number of parallel attention heads using num_attn_heads 32 | # 33 | # You can reuse encoders (i.e., their parameters) by giving them the same name and setting 'reuse: True' 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /conf/nli/multinli/abstract_multinli.yaml: -------------------------------------------------------------------------------- 1 | parent_config: './conf/jack.yaml' 2 | 3 | # loader for the dataset, ['jack', 'squad', 'snli'] are supported. For everything else convert to jtr format first.
4 | loader: snli 5 | 6 | # MultiNLI training file 7 | train: 'data/MultiNLI/multinli_1.0/multinli_1.0_train.jsonl' 8 | 9 | # MultiNLI dev file 10 | dev: 'data/MultiNLI/multinli_1.0/multinli_1.0_dev.jsonl' 11 | 12 | # MultiNLI test file 13 | test: null 14 | 15 | epochs: 20 16 | 17 | embedding_format: 'memory_map_dir' 18 | embedding_file: 'data/GloVe/glove.840B.300d.memory_map_dir' 19 | 20 | # Use fixed vocab of pretrained embeddings 21 | vocab_from_embeddings: True 22 | 23 | # Use also character based embeddings in readers which support it 24 | with_char_embeddings: False 25 | 26 | batch_size: 64 27 | 28 | lowercase: False 29 | 30 | learning_rate: 0.001 31 | min_learning_rate: 0.0001 32 | learning_rate_decay: 0.8 33 | -------------------------------------------------------------------------------- /conf/nli/multinli/cbilstm.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/nli/cbilstm.yaml', './conf/nli/multinli/abstract_multinli.yaml'] 2 | 3 | # fixed experiment seed 4 | seed: 1337 5 | dropout: 0.2 6 | repr_dim: 300 7 | -------------------------------------------------------------------------------- /conf/nli/multinli/dam.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/nli/dam.yaml', './conf/nli/multinli/abstract_multinli.yaml'] 2 | 3 | # fixed experiment seed 4 | seed: 1337 5 | dropout: 0.2 6 | repr_dim: 300 7 | epochs: 100 8 | batch_size: 32 9 | optimizer: adagrad 10 | learning_rate: 0.05 11 | learning_rate_decay: 1.0 12 | -------------------------------------------------------------------------------- /conf/nli/multinli/esim.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/nli/esim.yaml', './conf/nli/multinli/abstract_multinli.yaml'] 2 | 3 | # fixed experiment seed 4 | seed: 1337 5 | dropout: 0.2 6 | repr_dim: 300 7 | validation_interval: 2000 8 | learning_rate: 0.0004 9 | learning_rate_decay: 1.0 10 | batch_size: 32 11 | epochs: 10 12 | -------------------------------------------------------------------------------- /conf/nli/snli/abstract_snli.yaml: -------------------------------------------------------------------------------- 1 | parent_config: './conf/jack.yaml' 2 | 3 | # loader for the dataset, ['jack', 'squad', 'snli'] are supported. For everything else convert to jtr format first.
4 | loader: snli 5 | 6 | # SNLI training file 7 | train: 'data/SNLI/snli_1.0/snli_1.0_train.jsonl' 8 | 9 | # SNLI dev file 10 | dev: 'data/SNLI/snli_1.0/snli_1.0_dev.jsonl' 11 | 12 | # SNLI test file 13 | test: 'data/SNLI/snli_1.0/snli_1.0_test.jsonl' 14 | 15 | epochs: 20 16 | 17 | repr_dim: 300 18 | 19 | embedding_format: 'memory_map_dir' 20 | embedding_file: 'data/GloVe/glove.840B.300d.memory_map_dir' 21 | 22 | # Use fixed vocab of pretrained embeddings 23 | vocab_from_embeddings: True 24 | 25 | # Use also character based embeddings in readers which support it 26 | with_char_embeddings: False 27 | 28 | batch_size: 64 29 | lowercase: False 30 | learning_rate: 0.001 31 | min_learning_rate: 0.0001 32 | learning_rate_decay: 0.8 33 | 34 | -------------------------------------------------------------------------------- /conf/nli/snli/cbilstm.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/nli/cbilstm.yaml', './conf/nli/snli/abstract_snli.yaml'] 2 | 3 | # fixed experiment seed 4 | seed: 1337 5 | dropout: 0.2 6 | repr_dim: 300 7 | -------------------------------------------------------------------------------- /conf/nli/snli/dam.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/nli/dam.yaml', './conf/nli/snli/abstract_snli.yaml'] 2 | 3 | # fixed experiment seed 4 | seed: 1337 5 | dropout: 0.2 6 | repr_dim: 200 7 | epochs: 100 8 | batch_size: 32 9 | optimizer: adagrad 10 | learning_rate: 0.05 11 | learning_rate_decay: 1.0 12 | -------------------------------------------------------------------------------- /conf/nli/snli/esim.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/nli/esim.yaml', './conf/nli/snli/abstract_snli.yaml'] 2 | 3 | # fixed experiment seed 4 | seed: 1337 5 | dropout: 0.2 6 | repr_dim: 300 7 | validation_interval: 2000 8 | learning_rate: 0.0004 9 | learning_rate_decay: 1.0 10 | batch_size: 32 11 | epochs: 10 12 | -------------------------------------------------------------------------------- /conf/qa/bidaf.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | BiDAF reader implementation as described in https://arxiv.org/abs/1611.01603. This is a slightly adapted version. 
3 | 4 | parent_config: './conf/qa/modular_qa.yaml' 5 | 6 | # Name of this reader configuration 7 | name: 'bidaf_reader' 8 | 9 | # fixed experiment seed 10 | seed: 1337 11 | 12 | # where to store the reader 13 | save_dir: './bidaf_reader' 14 | 15 | with_char_embeddings: True 16 | 17 | max_span_size: 16 18 | 19 | model: 20 | encoder_layer: 21 | # Embedding computation 22 | # Support 23 | - input: ['support', 'char_support'] 24 | output: 'support' 25 | module: 'concat' 26 | - input: 'support' 27 | name: 'embedding_highway' 28 | module: 'highway' 29 | num_layers: 2 30 | 31 | # Question 32 | - input: ['question', 'char_question'] 33 | output: 'question' 34 | module: 'concat' 35 | - input: 'question' 36 | name: 'embedding_highway' # use same network as support 37 | module: 'highway' 38 | num_layers: 2 39 | 40 | # Contextual Encoding 41 | - input: 'question' 42 | module: 'lstm' 43 | name: 'contextual_encoding' 44 | with_projection: True # not in the original bidaf implementation, but helps 45 | dropout: True 46 | - input: 'support' 47 | module: 'lstm' 48 | with_projection: True # not in the original bidaf implementation, but helps 49 | name: 'contextual_encoding' # shared encoding at this point helps 50 | dropout: True 51 | 52 | # Attention Encoding 53 | - input: 'support' 54 | dependent: 'question' 55 | module: 'bidaf' 56 | 57 | - input: 'support' 58 | module: 'lstm' 59 | with_projection: True # not in the original bidaf implementation, but helps 60 | num_layers: 2 61 | dropout: True 62 | 63 | answer_layer: 64 | module: 'bidaf' 65 | encoder: # only needed for bidaf answer layer 66 | module: 'lstm' 67 | 68 | -------------------------------------------------------------------------------- /conf/qa/fastqa.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | FastQA configuration to train a model on SQuAD as described in https://arxiv.org/abs/1703.04816. 3 | 4 | parent_config: './conf/jack.yaml' 5 | 6 | # Reader model to use, see jack/readers/implementations.py for options 7 | reader: 'fastqa_reader' 8 | 9 | # fixed experiment seed 10 | seed: 1337 11 | 12 | # where to store the reader 13 | save_dir: './fastqa_reader' 14 | 15 | # 'lstm', 'gru', 'sru' (simple recurrent unit) 16 | encoder: 'lstm' 17 | with_char_embeddings: True 18 | 19 | # 'conditional' (original fastqa, end score conditioned on predicted start), 'conditional_bilinear', 'bilinear' 20 | # bilinear has shown best performance 21 | answer_layer: bilinear 22 | -------------------------------------------------------------------------------- /conf/qa/modular_qa.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | Modular QA Reader base configuration.
3 | 4 | parent_config: './conf/jack.yaml' 5 | 6 | reader: 'modular_qa_reader' 7 | 8 | with_char_embeddings: True 9 | model: 10 | encoder_layer: null # list of encoder modules with input key ('question' or 'support' in beginning) 11 | answer_layer: null # 'bilinear', 'mlp', 'conditional' (i.e., fastqa), 'conditional_bilinear', 'bidaf', 'san' (https://arxiv.org/pdf/1712.03556.pdf) 12 | 13 | 14 | # encoder modules can be combined as desired and are defined by the following keys 15 | # * input - required, string indicating what input to encode (starts with possibilities 'question' or 'support') 16 | # * output - optional, set to input by default but can be overwritten to something else => after defining a new output key it can be used later as input somewhere else 17 | # * repr_dim - dimensionality of output 18 | # * module - BiRNNs: 'lstm', 'gru', 'sru' ('with_projection: True' will also employ a projection layer on top of the BiRNNs which is recommended) 19 | # CONVs: 'gldr' (gated linear dilated residual network), 'conv' (convolution) 20 | # MISC: 'projection' (linear projection), 'self_attn', 'concat' (use 'input' to define list of keys to concatenate) 21 | # * residual - whether this encoder should be residually employed 22 | # * num_layers - number of times this encoder is applied consecutively 23 | # 'conv' requires another parameter, 'conv_width' (3 by default) and can have an 'activation' 24 | # 'gldr' requires additional parameters, 'conv_width' (3 by default) and 'dilations' 25 | # (a list of dilations for each layer of the gldr network) 26 | # 'projection' has an additional 'activation' attribute which can be 'relu', 'tanh', 'sigmoid', etc (everything in tf.nn) 27 | # 'self_attn' supports attn types: 'dot', 'bilinear', 'diagonal_bilinear', 'mlp' 28 | # 'dot', 'bilinear', 'diagonal_bilinear' have an additional 'scale' attribute which scales attn scores by sqrt of repr_dim 29 | # of input states; it is recommended to use it for 'dot' and 'diagonal_bilinear' 30 | # 'mlp' has additional 'repr_dim' and 'activation' properties for the dimensionality and activation of the hidden layer 31 | # you can set the number of parallel attention heads using num_attn_heads 32 | # 33 | # You can reuse encoders (i.e., their parameters) by giving them the same name and setting 'reuse: True' 34 | 35 | -------------------------------------------------------------------------------- /conf/qa/squad/abstract_squad.yaml: -------------------------------------------------------------------------------- 1 | parent_config: './conf/jack.yaml' 2 | 3 | # loader to use in experiment 4 | loader: 'squad' 5 | 6 | # Maximum support length. Can be used for cutting or filtering QA examples 7 | max_support_length: -1 8 | 9 | train: 'data/SQuAD/train-v1.1.json' 10 | dev: 'data/SQuAD/dev-v1.1.json' 11 | test: null 12 | 13 | # [word2vec], [glove] or [memory_map_dir] format of embeddings to be loaded 14 | embedding_format: 'memory_map_dir' 15 | 16 | # embeddings to be loaded 17 | embedding_file: 'data/GloVe/glove.840B.300d.memory_map_dir' 18 | 19 | # Use fixed vocab of pretrained embeddings 20 | vocab_from_embeddings: True 21 | 22 | epochs: 20 23 | 24 | dropout: 0.2 25 | 26 | batch_size: 64 27 | lowercase: False 28 | 29 | max_span_size: 16 30 | 31 | learning_rate: 0.001 32 | min_learning_rate: 0.0001 33 | learning_rate_decay: 0.5 34 | 35 | # 'sum' (loss for summed prob.
over all possible gold answer spans), 'max' (loss for best span) 36 | loss: 'sum' 37 | -------------------------------------------------------------------------------- /conf/qa/squad/bidaf.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/qa/bidaf.yaml', './conf/qa/squad/abstract_squad.yaml'] 2 | seed: 1337 3 | 4 | repr_dim: 100 5 | dropout: 0.2 6 | -------------------------------------------------------------------------------- /conf/qa/squad/fastqa.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/qa/fastqa.yaml', './conf/qa/squad/abstract_squad.yaml'] 2 | seed: 1337 3 | 4 | repr_dim: 150 5 | dropout: 0.2 6 | -------------------------------------------------------------------------------- /conf/qa/squad/jackqa.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/qa/jackqa.yaml', './conf/qa/squad/abstract_squad.yaml'] 2 | seed: 1337 3 | 4 | repr_dim: 100 5 | dropout: 0.2 6 | -------------------------------------------------------------------------------- /conf/qa/triviaqa/web/abstract_triviaqa.yaml: -------------------------------------------------------------------------------- 1 | parent_config: './conf/jack.yaml' 2 | 3 | # loader to use in experiment 4 | loader: 'jack' 5 | 6 | # Maximum support length. Can be used for cutting or filtering QA examples 7 | max_support_length: 600 8 | 9 | train: 'data/triviaqa/web-train.json' 10 | dev: 'data/triviaqa/web-dev.json' 11 | test: null 12 | 13 | # cache preprocessed examples on file in JACK_TEMP to avoid RAM problems 14 | file_cache: True 15 | 16 | # [word2vec], [glove] or [memory_map_dir] format of embeddings to be loaded 17 | embedding_format: 'memory_map_dir' 18 | 19 | # embeddings to be loaded 20 | embedding_file: 'data/GloVe/glove.840B.300d.memory_map_dir' 21 | 22 | # Use fixed vocab of pretrained embeddings 23 | vocab_from_embeddings: True 24 | 25 | epochs: 3 26 | 27 | dropout: 0.2 28 | 29 | batch_size: 16 30 | 31 | lowercase: False 32 | 33 | # take all supports by default; if set to >0, only the top supports are kept, ranked by TF-IDF similarity with the question 34 | max_num_support: 6 35 | # set to -1 if you want to use all paragraphs during training (which will of course demand more training time) 36 | # paragraphs are subsampled from the top `max_num_support`; the best paragraph is sampled twice as likely as the rest 37 | max_training_support: 2 38 | 39 | max_span_size: 8 40 | 41 | learning_rate: 0.001 42 | min_learning_rate: 0.0001 43 | learning_rate_decay: 0.5 44 | validation_interval: 2000 45 | num_dev_examples: 1000 46 | 47 | # 'sum' (loss for summed prob.
over all possible gold answer spans), 'max' (loss for best span) 48 | loss: 'sum' 49 | -------------------------------------------------------------------------------- /conf/qa/triviaqa/web/bidaf.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/qa/bidaf.yaml', './conf/qa/triviaqa/web/abstract_triviaqa.yaml'] 2 | seed: 1337 3 | 4 | repr_dim: 100 5 | dropout: 0.2 6 | -------------------------------------------------------------------------------- /conf/qa/triviaqa/web/fastqa.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/qa/fastqa.yaml', './conf/qa/triviaqa/web/abstract_triviaqa.yaml'] 2 | seed: 1337 3 | 4 | repr_dim: 150 5 | dropout: 0.2 6 | -------------------------------------------------------------------------------- /conf/qa/triviaqa/web/jackqa.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/qa/jackqa.yaml', './conf/qa/triviaqa/web/abstract_triviaqa.yaml'] 2 | seed: 1337 3 | 4 | repr_dim: 100 5 | dropout: 0.2 6 | -------------------------------------------------------------------------------- /conf/qa/triviaqa/wiki/abstract_triviaqa.yaml: -------------------------------------------------------------------------------- 1 | parent_config: './conf/jack.yaml' 2 | 3 | # loader to use in experiment 4 | loader: 'jack' 5 | 6 | # Maximum support length. Can be used for cutting or filtering QA examples 7 | max_support_length: 600 8 | 9 | train: 'data/triviaqa/wiki-train.json' 10 | dev: 'data/triviaqa/wiki-dev.json' 11 | test: null 12 | 13 | # [word2vec], [glove] or [memory_map_dir] format of embeddings to be loaded 14 | embedding_format: 'memory_map_dir' 15 | 16 | # embeddings to be loaded 17 | embedding_file: 'data/GloVe/glove.840B.300d.memory_map_dir' 18 | 19 | # Use fixed vocab of pretrained embeddings 20 | vocab_from_embeddings: True 21 | 22 | epochs: 20 23 | 24 | dropout: 0.2 25 | 26 | batch_size: 16 27 | 28 | lowercase: False 29 | 30 | # take all supports by default; if set to >0, only the top supports are kept, ranked by TF-IDF similarity with the question 31 | max_num_support: 6 32 | # set to -1 if you want to use all paragraphs during training (which will of course demand more training time) 33 | # paragraphs are subsampled from the top `max_num_support`; the best paragraph is sampled twice as likely as the rest 34 | max_training_support: 2 35 | 36 | max_span_size: 8 37 | 38 | learning_rate: 0.001 39 | min_learning_rate: 0.0001 40 | learning_rate_decay: 0.5 41 | validation_interval: 2000 42 | num_dev_examples: 1000 43 | 44 | # 'sum' (loss for summed prob.
over all possible gold answer spans), 'max' (loss for best span) 45 | loss: 'sum' 46 | -------------------------------------------------------------------------------- /conf/qa/triviaqa/wiki/bidaf.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/qa/bidaf.yaml', './conf/qa/triviaqa/wiki/abstract_triviaqa.yaml'] 2 | seed: 1337 3 | 4 | repr_dim: 100 5 | dropout: 0.2 6 | -------------------------------------------------------------------------------- /conf/qa/triviaqa/wiki/fastqa.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/qa/fastqa.yaml', './conf/qa/triviaqa/wiki/abstract_triviaqa.yaml'] 2 | seed: 1337 3 | 4 | repr_dim: 150 5 | dropout: 0.2 6 | -------------------------------------------------------------------------------- /conf/qa/triviaqa/wiki/jackqa.yaml: -------------------------------------------------------------------------------- 1 | parent_config: ['./conf/qa/jackqa.yaml', './conf/qa/triviaqa/wiki/abstract_triviaqa.yaml'] 2 | seed: 1337 3 | 4 | repr_dim: 100 5 | dropout: 0.2 6 | -------------------------------------------------------------------------------- /data/CBT/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | 4 | # ...except: 5 | !.gitignore 6 | !download.sh 7 | !snippet.json 8 | !snippet.jtr.json 9 | -------------------------------------------------------------------------------- /data/CBT/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget -P data/CBT http://www.thespermwhale.com/jaseweston/babi/CBTest.tgz 3 | tar -xzf data/CBT/CBTest.tgz -C data/CBT/ 4 | -------------------------------------------------------------------------------- /data/FB15k-237/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | 4 | # ...except: 5 | !.gitignore 6 | !download.sh 7 | !snippet.jtr.json 8 | -------------------------------------------------------------------------------- /data/FB15k-237/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget -O data/FB15k-237/fb15k-237.zip https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.2.zip 3 | unzip data/FB15k-237/fb15k-237.zip -d data/FB15k-237/ 4 | #rm data/FB15k-237/fb15k-237.zip 5 | -------------------------------------------------------------------------------- /data/FB15k-237/snippet.jtr.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta": "FB15K with entity neighbours as supporting facts.", 3 | "instances": [ 4 | { 5 | "support": [ 6 | { 7 | "text": "1 0 2" 8 | } 9 | ], 10 | "questions": [ 11 | { 12 | "answers": [ 13 | { 14 | "text": "2" 15 | } 16 | ], 17 | "candidates": [], 18 | "question": "1 0" 19 | } 20 | ] 21 | }, 22 | { 23 | "support": [ 24 | { 25 | "text": "0 1 3" 26 | } 27 | ], 28 | "questions": [ 29 | { 30 | "answers": [ 31 | { 32 | "text": "3" 33 | } 34 | ], 35 | "candidates": [], 36 | "question": "0 1" 37 | } 38 | ] 39 | } 40 | ], 41 | "globals": { 42 | "candidates": [ 43 | { 44 | "text": "0" 45 | }, 46 | { 47 | "text": "1" 48 | }, 49 | { 50 | "text": "2" 51 | }, 52 | { 53 | "text": "3" 54 | } 55 | ] 56 | } 57 | } 58 |
-------------------------------------------------------------------------------- /data/GloVe/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ -f "data/GloVe/glove.840B.300d.txt" ] 4 | then 5 | echo "data/GloVe/glove.840B.300d.txt already exists! Doing nothing!" 6 | else 7 | # echo "Downloading glove.840B.300d.txt!" 8 | # wget -c -P data/GloVe/ http://nlp.stanford.edu/data/glove.840B.300d.zip 9 | # unzip -d data/GloVe/ data/GloVe/glove.840B.300d.zip 10 | echo "Downloading glove.840B.300d.memory_map_dir!" 11 | wget -c -P data/GloVe/ http://data.neuralnoise.com/jack/embeddings/glove.840B.300d.memory_map_dir.tar.gz 12 | tar xvfz data/GloVe/glove.840B.300d.memory_map_dir.tar.gz -C data/GloVe/ 13 | fi 14 | -------------------------------------------------------------------------------- /data/GloVe/download_small.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ -f "data/GloVe/glove.6B.50d.txt" ] 4 | then 5 | echo "glove.6B.50d.txt already exists! Doing nothing!" 6 | else 7 | echo "Downloading glove.6B.50d.txt!" 8 | wget -c -P data/GloVe/ http://nlp.stanford.edu/data/glove.6B.zip 9 | unzip data/GloVe/glove.6B.zip -d data/GloVe 10 | fi 11 | -------------------------------------------------------------------------------- /data/GloVe/glove.the.50d.txt: -------------------------------------------------------------------------------- 1 | the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581 -------------------------------------------------------------------------------- /data/LS/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget http://www.dianamccarthy.co.uk/files/task10data.tar.gz 3 | wget http://nlp.cs.swarthmore.edu/semeval/tasks/task10/data/trial.tar.gz 4 | wget http://nlp.cs.swarthmore.edu/semeval/tasks/task10/data/test.tar.gz 5 | wget http://nlp.cs.swarthmore.edu/semeval/tasks/task10/data/key.tar.gz 6 | tar -xzf task10data.tar.gz 7 | tar -xzf trial.tar.gz 8 | tar -xzf test.tar.gz 9 | tar -xzf key.tar.gz 10 | curl -O -L https://raw.githubusercontent.com/gaurav324/English-Lexicalized-Text-Substituion/master/TaskTestData/test/lexsub_test_cleaned.xml 11 | curl -O -L https://raw.githubusercontent.com/gaurav324/English-Lexicalized-Text-Substituion/master/TaskTestData/trial/lexsub_trial_cleaned.xml 12 | mv lexsub_test_cleaned.xml ./test/ 13 | mv lexsub_trial_cleaned.xml ./trial/ -------------------------------------------------------------------------------- /data/MCTest/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | 4 | # ...except: 5 | !.gitignore 6 | !download.sh 7 | !snippet.jtr.json 8 | -------------------------------------------------------------------------------- /data/MCTest/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd data/MCTest 3 | wget 
http://research-srv.microsoft.com/en-us/um/redmond/projects/mctest/data/MCTest.zip 4 | unzip MCTest.zip 5 | cd ../.. 6 | -------------------------------------------------------------------------------- /data/MultiNLI/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget -P data/MultiNLI/ https://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip 3 | unzip data/MultiNLI/multinli_1.0.zip -d data/MultiNLI 4 | rm data/MultiNLI/multinli_1.0.zip 5 | 6 | # Create joint dev set 7 | cat data/MultiNLI/multinli_1.0/multinli_1.0_dev_matched.jsonl data/MultiNLI/multinli_1.0/multinli_1.0_dev_mismatched.jsonl > data/MultiNLI/multinli_1.0/multinli_1.0_dev.jsonl 8 | -------------------------------------------------------------------------------- /data/MultiNLI/snippet.jtr.json: -------------------------------------------------------------------------------- 1 | { 2 | "instances": [], 3 | "globals": { 4 | "candidates": [ 5 | { 6 | "text": "entailment" 7 | }, 8 | { 9 | "text": "neutral" 10 | }, 11 | { 12 | "text": "contradiction" 13 | } 14 | ] 15 | }, 16 | "meta": "MultiNLI" 17 | } -------------------------------------------------------------------------------- /data/NYT/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget -O data/NYT/naacl2013.txt.zip https://www.dropbox.com/s/5iulumlihydo1k7/naacl2013.txt.zip?dl=1 3 | unzip data/NYT/naacl2013.txt.zip -d data/NYT/ 4 | rm data/NYT/naacl2013.txt.zip -------------------------------------------------------------------------------- /data/NewsQA/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | 5 | echo "Cloning NewsQA repo to newsqa..." 6 | git clone https://github.com/Maluuba/newsqa.git 7 | 8 | cd newsqa 9 | pip2 install --requirement requirements.txt 10 | 11 | # download cnn 12 | echo "Download the CNN stories manually to the maluuba/newsqa folder (don't extract them) from: http://cs.nyu.edu/~kcho/DMQA/" 13 | echo "Press [Enter] when done..." 14 | read a 15 | 16 | echo "Download the questions and answers to the maluuba/newsqa folder manually from: https://datasets.maluuba.com/NewsQA/dl..." 17 | echo "Press [Enter] when done..." 18 | read a 19 | 20 | cd maluuba/newsqa 21 | tar xzf newsqa-data-v1.tar.gz 22 | 23 | cd ../.. 24 | 25 | # fix a bug 26 | sed -ie 's/\\r/\\n/g' maluuba/newsqa/data_processing.py 27 | rm maluuba/newsqa/data_processing.pye 28 | 29 | python2 maluuba/newsqa/example.py 30 | python2 maluuba/newsqa/split_dataset.py 31 | 32 | mv newsqa/maluuba/newsqa/* .
33 | rm -r newsqa 34 | 35 | echo "Find resulting dataset in data/NewsQA/newsqa/maluuba/newsqa/[train,dev,test]_story_ids.csv and newsqa/maluuba/newsqa/split_data" 36 | echo "These can be used as input to the conversion scripts in jack/io/NewsQA2*.py" 37 | -------------------------------------------------------------------------------- /data/PTB/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 3 | tar -xzf simple-examples.tgz -------------------------------------------------------------------------------- /data/QAngaroo/instructions.md: -------------------------------------------------------------------------------- 1 | You have to download the dataset manually from 2 | 3 | http://qangaroo.cs.ucl.ac.uk 4 | 5 | by clicking the _Download Dataset_ button. 6 | 7 | After unzipping the contents, you can convert the dataset into a SQuAD-like format. 8 | 9 | E.g. 10 | 11 | `python3 qangaroo2squad.py qangaroo_v1.1/wikihop/dev.json wikihop_dev.squad_format.json` 12 | 13 | 14 | -------------------------------------------------------------------------------- /data/QAngaroo/qangaroo2squad.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | 5 | def load_json(path): 6 | with open(path, 'r') as f: 7 | return json.load(f) 8 | 9 | 10 | def convert2SQUAD_format(hoppy_data, write_file_name): 11 | """ 12 | Converts QAngaroo data (hoppy_data) into SQuAD format. 13 | The SQuAD-formatted data is written to disk at write_file_name. 14 | Note: All given support documents per example are concatenated 15 | into one super-document. All text is lowercased. 16 | """ 17 | # adapt the JSON tree structure used in SQUAD. 18 | squad_formatted_content = dict() 19 | squad_formatted_content['version'] = 'hoppy_squad_format' 20 | data = [] 21 | 22 | # loop over dataset 23 | for datum in hoppy_data: 24 | 25 | # Format is deeply nested JSON -- prepare data structures 26 | data_ELEMENT = dict() 27 | data_ELEMENT['title'] = 'dummyTitle' 28 | paragraphs = [] 29 | paragraphs_ELEMENT = dict() 30 | qas = [] 31 | qas_ELEMENT = dict() 32 | qas_ELEMENT_ANSWERS = [] 33 | ANSWERS_ELEMENT = dict() 34 | 35 | 36 | ### content start 37 | qas_ELEMENT['id'] = datum['id'] 38 | qas_ELEMENT['question'] = datum['query'] 39 | 40 | # concatenate all support documents into one superdocument 41 | superdocument = " ".join(datum['supports']).lower() 42 | 43 | # where is the answer in the superdocument? 
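# (note: only the first occurrence of the lowercased answer string is used as the gold span; examples whose answer never appears verbatim in the superdocument are skipped below)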
44 | answer_position = superdocument.find(datum['answer'].lower()) 45 | if answer_position == -1: 46 | continue 47 | 48 | ANSWERS_ELEMENT['answer_start'] = answer_position 49 | ANSWERS_ELEMENT['text'] = datum['answer'].lower() 50 | ### content end 51 | 52 | 53 | # recursively fill in content into the nested SQuAD data format 54 | paragraphs_ELEMENT['context'] = superdocument 55 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 56 | 57 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 58 | qas.append(qas_ELEMENT) 59 | 60 | paragraphs_ELEMENT['qas'] = qas 61 | paragraphs.append(paragraphs_ELEMENT) 62 | 63 | data_ELEMENT['paragraphs'] = paragraphs 64 | data.append(data_ELEMENT) 65 | 66 | squad_formatted_content['data'] = data 67 | 68 | with open(write_file_name, 'w') as f: 69 | json.dump(squad_formatted_content, f, indent=1) 70 | 71 | print('Done writing SQuAD-formatted data to: ',write_file_name) 72 | 73 | 74 | 75 | 76 | def main(): 77 | input_path = sys.argv[1] 78 | output_path = sys.argv[2] 79 | convert2SQUAD_format(load_json(input_path), output_path) 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /data/SNLI/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | 4 | # ...except: 5 | !.gitignore 6 | !download.sh 7 | !snippet.json 8 | !snippet.jtr.json 9 | -------------------------------------------------------------------------------- /data/SNLI/README.md: -------------------------------------------------------------------------------- 1 | - Mapping to jtr format 2 | `$ python3 jtr/io/SNLI2jtr_v1.py` 3 | - Validating format 4 | `$ python3 jtr/io/validate.py ./jtr/data/snippet/SNLI_v1/snippet_jtrformat.json jtr/io/dataset_schema.json` 5 | - Debugging 6 | `$ python3 jtr/model/reader.py --train jtr/data/SNLI/snli_1.0/snli_1.0_debug_jtr.jsonl --test jtr/data/SNLI/snli_1.0/snli_1.0_debug_jtr.jsonl` -------------------------------------------------------------------------------- /data/SNLI/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget -P data/SNLI/ http://nlp.stanford.edu/projects/snli/snli_1.0.zip 3 | unzip data/SNLI/snli_1.0.zip -d data/SNLI 4 | rm data/SNLI/snli_1.0.zip 5 | -------------------------------------------------------------------------------- /data/SNLI/snippet.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotator_labels": [ 3 | "neutral" 4 | ], 5 | "captionID": "3416050480.jpg#4", 6 | "gold_label": "neutral", 7 | "pairID": "3416050480.jpg#4r1n", 8 | "sentence1": "A person on a horse jumps over a broken down airplane.", 9 | "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", 10 | "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", 11 | "sentence2": "A person is training his horse for a competition.", 12 | "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", 13 | "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. 
.)))" 14 | } 15 | -------------------------------------------------------------------------------- /data/SQuAD/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | 4 | # ...except: 5 | !.gitignore 6 | !download.sh 7 | !snippet.json 8 | !snippet.jtr.json 9 | -------------------------------------------------------------------------------- /data/SQuAD/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | if [ -f "data/SQuAD/dev-v1.1.json" ] 5 | then 6 | echo "Already downloaded." 7 | else 8 | wget -P data/SQuAD/ https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json 9 | wget -P data/SQuAD/ https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json 10 | fi 11 | -------------------------------------------------------------------------------- /data/TBD/SemEval2017Task10/S0022311514005480.ann: -------------------------------------------------------------------------------- 1 | T1 Task 1232 1286 biaxial compressive and triaxial tensile stress states 2 | T2 Task 4 23 second stress state 3 | T3 Task 29 53 tri-axial tensile stress 4 | R1 Hyponym-of Arg1:T2 Arg2:T3 5 | T4 Process 259 282 Finite element analysis 6 | T5 Task 875 935 simulated by applying tensile stress in direction 1, 2 and 3 7 | T6 Material 97 116 advancing crack tip 8 | T7 Material 171 182 oxide layer 9 | T8 Material 227 248 metal–oxide interface 10 | T9 Material 321 327 cracks 11 | T10 Material 348 374 localised tensile stresses 12 | T11 Material 762 804 manufactured partially stabilised zirconia 13 | T12 Material 1185 1217 manufactured stabilized zirconia 14 | T13 Process 999 1023 applied tensile stresses 15 | T14 Material 394 415 metal–oxide interface 16 | T15 Process 521 566 tetragonal to monoclinic phase transformation 17 | T16 Material 664 669 oxide 18 | T17 Process 676 686 de-bonding 19 | T18 Process 706 729 triaxial tensile stress 20 | T19 Process 849 865 tetragonal phase 21 | T20 Process 1137 1145 fracture 22 | T21 Process 888 911 applying tensile stress 23 | T22 Process 949 963 maximum stress 24 | -------------------------------------------------------------------------------- /data/TBD/SemEval2017Task10/S0022311514005480.txt: -------------------------------------------------------------------------------- 1 | The second stress state is a tri-axial tensile stress designed to represent the zone ahead of an advancing crack tip. Micro-scale lateral cracks have been observed in the oxide layer, and appear to form very close to or at the metal–oxide interface (Fig. 1). Finite element analysis by Parise et al. indicated that these cracks form as a result of localised tensile stresses above peaks in the metal–oxide interface roughness [31]. These cracks are considered separate to any nano-scale cracks that might result from the tetragonal to monoclinic phase transformation. An assumption is made here that whether the micro-scale lateral cracks form via fracture of the oxide or by de-bonding at the interface a triaxial tensile stress state will still be present. In manufactured partially stabilised zirconia cracks would be expected to destabilise the tetragonal phase. This is simulated by applying tensile stress in direction 1, 2 and 3. 
As this the maximum stress at the crack tip is not known, the applied tensile stresses cover a range from 0.1GPa up to a maximum stress value of 2.2GPa as it is approximately equal to three times the fracture strength of bulk fracture strength for manufactured stabilized zirconia [34]. For the biaxial compressive and triaxial tensile stress states it is the trends in behaviour rather than the absolute values that are considered of greatest importance for this work. 2 | -------------------------------------------------------------------------------- /data/WN18/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget -O data/WN18/wn18.zip https://github.com/ttrouill/complex/raw/master/datasets/wn18.zip 3 | unzip data/WN18/wn18.zip -d data/WN18/ 4 | -------------------------------------------------------------------------------- /data/WN18/snippet.txt: -------------------------------------------------------------------------------- 1 | 03964744 _hyponym 04371774 2 | 00260881 _hypernym 00260622 3 | 02199712 _member_holonym 02188065 4 | 01332730 _derivationally_related_form 03122748 5 | 06066555 _derivationally_related_form 00645415 6 | 09322930 _instance_hypernym 09360122 7 | 11575425 _hyponym 12255934 8 | 07193596 _derivationally_related_form 00784342 9 | 05726596 _hyponym 06162979 10 | 01768969 _derivationally_related_form 02636811 11 | 02557199 _hyponym 02557790 12 | 01455754 _hypernym 01974062 13 | 02716866 _hyponym 03032576 14 | 03214670 _hyponym 04423288 15 | 07554856 _hypernym 07553301 16 | 11669921 _hyponym 11992806 17 | 01291069 _hyponym 01530678 18 | 07965085 _hyponym 08278169 19 | 00057306 _hypernym 00056912 20 | 10341660 _derivationally_related_form 02661252 21 | 13219258 _hypernym 13167078 22 | 01698271 _also_see 01754576 23 | 08189659 _hyponym 08077292 24 | 10499355 _hypernym 10083823 25 | 02222318 _hyponym 02223238 26 | 02103406 _hypernym 02084071 27 | 07190941 _hypernym 07185325 28 | 12090318 _member_meronym 12093769 29 | 08620061 _hyponym 08620763 30 | 03562126 _hyponym 03318438 31 | 12213635 _member_meronym 12214245 32 | 02651424 _derivationally_related_form 02672371 33 | -------------------------------------------------------------------------------- /data/WN18RR/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget -O data/WN18RR/wn18.tgz https://github.com/TimDettmers/ConvE/raw/master/WN18RR.tar.gz 3 | tar -xzf data/WN18RR/wn18.tgz -C data/WN18RR/ 4 | -------------------------------------------------------------------------------- /data/emoji2vec/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | wget https://github.com/uclmr/emoji2vec/raw/master/pre-trained/emoji2vec.txt 3 | -------------------------------------------------------------------------------- /data/emoji2vec/visualize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.tensorboard.plugins import projector 5 | import os 6 | 7 | import numpy as np 8 | 9 | dir = "./jack/data/emoji2vec/" 10 | emojis = [] 11 | vecs = [] 12 | with open(dir + "metadata.tsv", "w") as f_out: 13 | # f_out.write("emoji\n") 14 | with open(dir + "emoji2vec.txt", "r") as f_in: 15 | for ix, line in enumerate(f_in.readlines()[1:]): 16 | splits = line.strip().split(" ") 17 | emoji = splits[0] 18 | vec = [float(x) 
for x in splits[1:]] 19 | assert len(vec) == 300 20 | # print(emoji, vec) 21 | emojis.append(emoji) 22 | vecs.append(vec) 23 | f_out.write(emoji+"\n") 24 | f_in.close() 25 | f_out.close() 26 | 27 | emoji2vec = tf.constant(np.array(vecs)) 28 | tf_emoji2vec = tf.get_variable("emoji2vec", [len(vecs), 300], tf.float64) 29 | 30 | # save embeddings to file 31 | with tf.Session() as sess: 32 | sess.run(tf.global_variables_initializer()) 33 | sess.run(tf_emoji2vec.assign(emoji2vec)) 34 | 35 | saver = tf.train.Saver() 36 | saver.save(sess, os.path.join(dir, "model.ckpt"), 0) 37 | 38 | # Use the same LOG_DIR where you stored your checkpoint. 39 | summary_writer = tf.summary.FileWriter(dir) 40 | 41 | # Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto 42 | config = projector.ProjectorConfig() 43 | 44 | # You can add multiple embeddings. Here we add only one. 45 | embedding = config.embeddings.add() 46 | embedding.tensor_name = tf_emoji2vec.name 47 | # Link this tensor to its metadata file (e.g. labels). 48 | embedding.metadata_path = os.path.join(dir, 'metadata.tsv') 49 | 50 | # Saves a configuration file that TensorBoard will read during startup. 51 | projector.visualize_embeddings(summary_writer, config) 52 | -------------------------------------------------------------------------------- /data/rc-data/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | 4 | # ...except: 5 | !.gitignore 6 | !post_download.sh 7 | !README.md 8 | !cnn_snippet.jtr.json 9 | -------------------------------------------------------------------------------- /data/rc-data/README.md: -------------------------------------------------------------------------------- 1 | Download data here: 2 | http://cs.nyu.edu/~kcho/DMQA/ 3 | -------------------------------------------------------------------------------- /data/rc-data/post_download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # download the data per instructions in README.md and then execute this script 3 | cd data/rc-data 4 | 5 | tar -xvzf cnn.tgz 6 | tar -xvzf cnn_stories.tgz 7 | tar -xvzf dailymail.tgz 8 | tar -xvzf dailymail_stories.tgz 9 | wget https://github.com/deepmind/rc-data/raw/master/generate_questions.py 10 | 11 | # obtained from: https://github.com/deepmind/rc-data/blob/master/README.md 12 | virtualenv venv 13 | source venv/bin/activate 14 | wget https://github.com/deepmind/rc-data/raw/master/requirements.txt 15 | pip install -r requirements.txt 16 | python generate_questions.py --corpus=cnn --mode=generate 17 | python generate_questions.py --corpus=dailymail --mode=generate 18 | deactivate 19 | cd ../.. -------------------------------------------------------------------------------- /data/sentihood/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget http://annotate-neighborhood.com/download/sentihood-train.json 3 | wget http://annotate-neighborhood.com/download/sentihood-dev.json 4 | wget http://annotate-neighborhood.com/download/sentihood-test.json 5 | -------------------------------------------------------------------------------- /data/simpleQuestions/README: -------------------------------------------------------------------------------- 1 | Note that the full context for all questions are the triples in the subfolder freebase-subsets. 
2 | The exact triples selected as full context for each question are not distributed, only the Freebase triple the question was based on. 3 | From this triple, the full context can be looked up. 4 | Therefore the conversion script only converts the subject and rel of the triple as context. 5 | See the paper [http://arxiv.org/pdf/1506.02075v1.pdf] for how the full context for each question is obtained. -------------------------------------------------------------------------------- /data/simpleQuestions/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | wget -P data/simpleQuestions/ https://www.dropbox.com/s/tohrsllcfy7rch4/SimpleQuestions_v2.tgz 4 | tar -xzvf data/simpleQuestions/SimpleQuestions_v2.tgz -C data/simpleQuestions/ -------------------------------------------------------------------------------- /data/simpleQuestions/snippet.jtr.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta": "simpleQuestions.json", 3 | "instances": [ 4 | { 5 | "questions": [ 6 | { 7 | "answers": [ 8 | "www.freebase.com/m/01cj3p" 9 | ], 10 | "question": "what is the book e about" 11 | } 12 | ], 13 | "support": [ 14 | { 15 | "text": "www.freebase.com/m/04whkz5 www.freebase.com/book/written_work/subjects" 16 | } 17 | ] 18 | }, 19 | { 20 | "questions": [ 21 | { 22 | "answers": [ 23 | "www.freebase.com/m/0sjc7c1" 24 | ], 25 | "question": "to what release does the release track cardiac arrest come from" 26 | } 27 | ], 28 | "support": [ 29 | { 30 | "text": "www.freebase.com/m/0tp2p24 www.freebase.com/music/release_track/release" 31 | } 32 | ] 33 | }, 34 | { 35 | "questions": [ 36 | { 37 | "answers": [ 38 | "www.freebase.com/m/07ssc" 39 | ], 40 | "question": "what country was the film the debt from" 41 | } 42 | ], 43 | "support": [ 44 | { 45 | "text": "www.freebase.com/m/04j0t75 www.freebase.com/film/film/country" 46 | } 47 | ] 48 | }, 49 | { 50 | "questions": [ 51 | { 52 | "answers": [ 53 | "www.freebase.com/m/0p600l" 54 | ], 55 | "question": "what songs have nobuo uematsu produced?" 56 | } 57 | ], 58 | "support": [ 59 | { 60 | "text": "www.freebase.com/m/0ftqr www.freebase.com/music/producer/tracks_produced" 61 | } 62 | ] 63 | }, 64 | { 65 | "questions": [ 66 | { 67 | "answers": [ 68 | "www.freebase.com/m/0677ng" 69 | ], 70 | "question": "Who produced eve-olution?" 71 | } 72 | ], 73 | "support": [ 74 | { 75 | "text": "www.freebase.com/m/036p007 www.freebase.com/music/release/producers" 76 | } 77 | ] 78 | } 79 | ] 80 | } -------------------------------------------------------------------------------- /data/simpleQuestions/snippet.txt: -------------------------------------------------------------------------------- 1 | www.freebase.com/m/04whkz5 www.freebase.com/book/written_work/subjects www.freebase.com/m/01cj3p what is the book e about 2 | www.freebase.com/m/0tp2p24 www.freebase.com/music/release_track/release www.freebase.com/m/0sjc7c1 to what release does the release track cardiac arrest come from 3 | www.freebase.com/m/04j0t75 www.freebase.com/film/film/country www.freebase.com/m/07ssc what country was the film the debt from 4 | www.freebase.com/m/0ftqr www.freebase.com/music/producer/tracks_produced www.freebase.com/m/0p600l what songs have nobuo uematsu produced? 5 | www.freebase.com/m/036p007 www.freebase.com/music/release/producers www.freebase.com/m/0677ng Who produced eve-olution? 
6 | -------------------------------------------------------------------------------- /data/triviaqa/README: -------------------------------------------------------------------------------- 1 | The download script does the following: 2 | * downloads and unpacks triviaqa 3 | * clones github.com/allenai/document-qa temporarily (third party) 4 | * preprocesses triviaqa with third party 5 | * converts the third-party datasets to Jack format while sub-sampling supporting paragraphs using TF-IDF, following [1] 6 | 7 | You can set the parallelism with the first argument to the download script and the download directory of TriviaQA 8 | with the 2nd argument. Try to use an SSD to speed things up. The whole setup can take a few hours. 9 | 10 | 11 | [1] https://arxiv.org/pdf/1710.10723.pdf 12 | -------------------------------------------------------------------------------- /data/triviaqa/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | 4 | """ 5 | Global config options 6 | """ 7 | 8 | TRIVIA_QA = os.environ.get('TRIVIAQA_HOME', None) 9 | TRIVIA_QA_UNFILTERED = os.environ.get('TRIVIAQA_UNFILTERED_HOME', None) 10 | 11 | CORPUS_DIR = join(os.environ.get('TRIVIAQA_HOME', ''), "preprocessed") 12 | 13 | VEC_DIR = '' 14 | -------------------------------------------------------------------------------- /data/triviaqa/download.sh: -------------------------------------------------------------------------------- 1 | echo "This script might take a while (a couple of hours)." 2 | echo "You can set the parallelism with the first argument and the working dir with the 2nd argument. Try to use an SSD to speed things up." 3 | 4 | #set parallelism 5 | if [ $# -lt 1 ]; then 6 | N=`nproc --all` 7 | else 8 | N=$1 9 | fi 10 | 11 | if [ $# -lt 2 ]; then 12 | DOWNLOADPATH=data/triviaqa 13 | else 14 | DOWNLOADPATH=$2 15 | fi 16 | 17 | export TRIVIAQA_HOME=$DOWNLOADPATH/triviaqa-rc 18 | if [ ! -d $TRIVIAQA_HOME ]; then 19 | echo "Downloading and extracting dataset..." 20 | wget -P $DOWNLOADPATH http://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz 21 | tar xf $DOWNLOADPATH/triviaqa-rc.tar.gz -C $DOWNLOADPATH 22 | fi 23 | 24 | git clone https://github.com/dirkweissenborn/document-qa.git data/triviaqa/document-qa 25 | 26 | cp data/triviaqa/config.py data/triviaqa/document-qa/docqa/ 27 | export PYTHONPATH=data/triviaqa/document-qa:$PYTHONPATH 28 | 29 | echo "Third-party preprocessing..." 30 | python3 data/triviaqa/document-qa/docqa/triviaqa/evidence_corpus.py -n $N 31 | python3 data/triviaqa/document-qa/docqa/triviaqa/build_span_corpus.py wiki --n_processes $N 32 | python3 data/triviaqa/document-qa/docqa/triviaqa/build_span_corpus.py web --n_processes $N 33 | 34 | echo "Converting to Jack format..." 35 | # For training we only extract the top (TF-IDF) paragraphs (merged/split to a maximum of 600 tokens each) to save disk space. 36 | # In case you want all paragraphs, change the paragraph count (4/6 below) to -1. 37 | 38 | # for dev and test take all paragraphs 39 | python3 data/triviaqa/convert2jack.py web-dev $N -1 600 40 | python3 data/triviaqa/convert2jack.py wiki-dev $N -1 600 41 | python3 data/triviaqa/convert2jack.py web-test $N -1 600 42 | python3 data/triviaqa/convert2jack.py wiki-test $N -1 600 43 | 44 | # for training we only need the top k paragraphs 45 | python3 data/triviaqa/convert2jack.py web-train $N 4 600 46 | python3 data/triviaqa/convert2jack.py wiki-train $N 6 600 47 | 48 | echo "Removing data/triviaqa/document-qa repository, since it is not needed anymore."
49 | rm -rf data/triviaqa/document-qa 50 | 51 | echo "Find prepared datasets in data/triviaqa/. If you want, you can safely remove $DOWNLOADPATH/triviaqa-rc now." 52 | -------------------------------------------------------------------------------- /data/word2vec/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | 5 | wget https://www.dropbox.com/s/bnm0trligffakd9/GoogleNews-vectors-negative300.bin.gz 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/Formats_for_Embeddings.md: -------------------------------------------------------------------------------- 1 | # Formats for Embeddings 2 | 3 | Jack supports loading of various embedding formats, including GloVe and word2vec. These can be specified in the 4 | configuration files or command line parameters of your models via the `embedding_format` parameter. In particular, 5 | we support 6 | 7 | * `glove`: the original GloVe format, either as txt file or zipped 8 | * `word2vec`: word2vec format 9 | * `fasttext`: fasttext format 10 | * `memory_map_dir`: a directory that contains the embeddings as a numpy memory map, and meta information necessary to 11 | instantiate it. 12 | 13 | ## Memory Map Directories 14 | For large embeddings (large dimensions, many words), loading embeddings into memory can both take up a lot of 15 | CPU memory and be very slow. NumPy provides a memory-mapped file format for matrices that loads vectors on the fly. In Jack 16 | this functionality is used via the `memory_map_dir` format. 17 | 18 | You can convert your embeddings into this format via the `bin/mmap-cli.py` script. For example, to convert GloVe embeddings, 19 | assuming you are in the top-level jack directory, write: 20 | 21 | ```bash 22 | $ export PYTHONPATH=$PYTHONPATH:. 23 | $ python3 bin/mmap-cli.py --help 24 | $ python3 bin/mmap-cli.py data/GloVe/glove.840B.300d.txt data/GloVe/glove.840B.300d.memory_map_dir 25 | ``` 26 | 27 | This creates a directory `data/GloVe/glove.840B.300d.memory_map_dir` that stores the memory map and some necessary 28 | meta information. 29 | 30 | Using this format can substantially reduce start-up times and memory footprint.
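Once converted, point your training configuration at the directory via the embedding parameters used throughout `conf/`. The snippet below is a minimal excerpt mirroring `conf/qa/squad/abstract_squad.yaml`; adjust the path to wherever your memory map directory lives:

```yaml
# format of the pretrained embeddings: 'glove', 'word2vec', 'fasttext' or 'memory_map_dir'
embedding_format: 'memory_map_dir'

# embeddings to be loaded
embedding_file: 'data/GloVe/glove.840B.300d.memory_map_dir'

# Use fixed vocab of pretrained embeddings
vocab_from_embeddings: True
```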
31 | -------------------------------------------------------------------------------- /jack/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from jack.train_reader import train 4 | 5 | __all__ = [ 6 | 'train' 7 | ] 8 | -------------------------------------------------------------------------------- /jack/core/__init__.py: -------------------------------------------------------------------------------- 1 | from jack.core.input_module import * 2 | from jack.core.model_module import * 3 | from jack.core.output_module import * 4 | from jack.core.reader import * 5 | from jack.core.tensorport import * 6 | from jack.core.shared_resources import * 7 | -------------------------------------------------------------------------------- /jack/core/output_module.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from abc import abstractmethod 4 | from typing import Sequence, Mapping 5 | 6 | import numpy as np 7 | 8 | from jack.core.data_structures import QASetting, Answer 9 | from jack.core.tensorport import TensorPort 10 | 11 | 12 | class OutputModule: 13 | """ 14 | An output module takes the output (numpy) tensors of the model module and turns them into 15 | jack data structures. 16 | """ 17 | 18 | @property 19 | @abstractmethod 20 | def input_ports(self) -> Sequence[TensorPort]: 21 | """Returns: ports corresponding to a subset of the output ports of the model module.""" 22 | raise NotImplementedError 23 | 24 | @abstractmethod 25 | def __call__(self, questions: Sequence[QASetting], tensors: Mapping[TensorPort, np.array]) \ 26 | -> Sequence[Answer]: 27 | """ 28 | Process the tensors corresponding to the defined `input_ports` for a batch to produce a list of answers. 29 | The module has access to the original inputs. 30 | Args: 31 | questions: the original question settings of the batch. 32 | tensors: mapping from input ports to the corresponding numpy tensors produced by the model module. 33 | 34 | Returns: 35 | the produced answers, one per question setting. 36 | """ 37 | raise NotImplementedError 38 | 39 | @abstractmethod 40 | def setup(self): 41 | pass 42 | 43 | def store(self, path): 44 | """Store the state of this module. Default is that there is no state, so nothing to store.""" 45 | pass 46 | 47 | def load(self, path): 48 | """Load the state of this module.
Default is that there is no state, so nothing to load.""" 49 | pass 50 | -------------------------------------------------------------------------------- /jack/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from jack.eval import extractive_qa, link_prediction, classification 4 | from jack.eval.base import evaluators, evaluate_reader, pretty_print_results 5 | -------------------------------------------------------------------------------- /jack/eval/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from jack.eval import extractive_qa, link_prediction, classification 4 | 5 | evaluators = { 6 | 'extractive_qa': extractive_qa.evaluate, 7 | 'link_prediction': link_prediction.evaluate, 8 | 'classification': None 9 | } 10 | 11 | 12 | def evaluate_reader(reader, dataset, batch_size): 13 | from jack.readers.implementations import extractive_qa_readers, classification_readers, link_prediction_readers 14 | reader_name = reader.shared_resources.config.get('reader') 15 | if reader_name in extractive_qa_readers: 16 | return extractive_qa.evaluate(reader, dataset, batch_size) 17 | elif reader_name in link_prediction_readers: 18 | return link_prediction.evaluate(reader, dataset, batch_size) 19 | elif reader_name in classification_readers: 20 | return classification.evaluate(reader, dataset, batch_size) 21 | 22 | 23 | def pretty_print_results(d, prefix=''): 24 | for k, v in sorted(d.items(), key=lambda x: x[0]): 25 | if isinstance(v, dict): 26 | print(prefix + k + ":") 27 | pretty_print_results(v, prefix + '\t') 28 | elif '\n' in str(v): 29 | print(prefix + k + ":") 30 | print(str(v).replace('\n', '\n' + prefix + '\t')) 31 | else: 32 | print(prefix + k + ":", str(v)) 33 | -------------------------------------------------------------------------------- /jack/eval/classification.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | 4 | def evaluate(reader, dataset, batch_size): 5 | answers = reader.process_dataset(dataset, batch_size, silent=False) 6 | 7 | confusion_matrix = defaultdict(lambda: defaultdict(int)) 8 | 9 | for (q, a), pa in zip(dataset, answers): 10 | confusion_matrix[a[0].text][pa.text] += 1 11 | 12 | classes = sorted(confusion_matrix.keys()) 13 | max_class = max(6, len(max(classes, key=len))) 14 | 15 | precision = dict() 16 | recall = dict() 17 | f1 = dict() 18 | 19 | confusion_matrix_string = ['\n', ' ' * max_class] 20 | for c in classes: 21 | confusion_matrix_string.append('\t') 22 | confusion_matrix_string.append(c) 23 | confusion_matrix_string.append(' ' * (max_class - len(c))) 24 | confusion_matrix_string.append('\n') 25 | for c1 in classes: 26 | confusion_matrix_string.append(c1) 27 | confusion_matrix_string.append(' ' * (max_class - len(c1))) 28 | for c2 in classes: 29 | confusion_matrix_string.append('\t') 30 | ct = str(confusion_matrix[c1][c2]) 31 | confusion_matrix_string.append(ct) 32 | confusion_matrix_string.append(' ' * (max_class - len(ct))) 33 | confusion_matrix_string.append('\n') 34 | precision[c1] = confusion_matrix[c1][c1] / max(1.0, sum(p[c1] for p in confusion_matrix.values())) 35 | recall[c1] = confusion_matrix[c1][c1] / max(1.0, sum(confusion_matrix[c1].values())) 36 | f1[c1] = 2 * precision[c1] * recall[c1] / max(1.0, precision[c1] + recall[c1]) 37 | 38 | accuracy = sum(confusion_matrix[c][c] for c in classes) / max( 39 | 1.0, 
sum(sum(vs.values()) for vs in confusion_matrix.values())) 40 | 41 | return { 42 | 'Accuracy': accuracy, 43 | 'Precision': precision, 44 | 'Recall': recall, 45 | 'F1': f1, 46 | 'Confusion Matrix': ''.join(confusion_matrix_string) 47 | } 48 | -------------------------------------------------------------------------------- /jack/eval/extractive_qa.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import string 4 | from collections import Counter 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def evaluate(reader, dataset, batch_size): 10 | answers = reader.process_dataset(dataset, batch_size, silent=False) 11 | 12 | f1 = exact_match = 0 13 | for pa, (q, ass) in zip(answers, dataset): 14 | ground_truth = [a.text for a in ass] 15 | f1 += metric_max_over_ground_truths(f1_score, pa.text, ground_truth) 16 | exact_match += metric_max_over_ground_truths(exact_match_score, pa.text, ground_truth) 17 | 18 | f1 /= len(answers) 19 | exact_match /= len(answers) 20 | 21 | return {'F1': f1, 'Exact': exact_match} 22 | 23 | 24 | def normalize_answer(s): 25 | """Lower text and remove punctuation, articles and extra whitespace.""" 26 | 27 | def remove_articles(text): 28 | return re.sub(r'\b(a|an|the)\b', ' ', text) 29 | 30 | def white_space_fix(text): 31 | return ' '.join(text.split()) 32 | 33 | def remove_punc(text): 34 | exclude = set(string.punctuation) 35 | return ''.join(ch for ch in text if ch not in exclude) 36 | 37 | def lower(text): 38 | return text.lower() 39 | 40 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 41 | 42 | 43 | def f1_score(prediction, ground_truth): 44 | prediction_tokens = normalize_answer(prediction).split() 45 | ground_truth_tokens = normalize_answer(ground_truth).split() 46 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 47 | num_same = sum(common.values()) 48 | if num_same == 0: 49 | return 0 50 | precision = 1.0 * num_same / len(prediction_tokens) 51 | recall = 1.0 * num_same / len(ground_truth_tokens) 52 | f1 = (2 * precision * recall) / (precision + recall) 53 | return f1 54 | 55 | 56 | def exact_match_score(prediction, ground_truth): 57 | return normalize_answer(prediction) == normalize_answer(ground_truth) 58 | 59 | 60 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 61 | scores_for_ground_truths = [0.0] 62 | for ground_truth in ground_truths: 63 | score = metric_fn(prediction, ground_truth) 64 | scores_for_ground_truths.append(score) 65 | return max(scores_for_ground_truths) 66 | -------------------------------------------------------------------------------- /jack/eval/output_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "title": "output schema", 4 | "description": "The schema for predictions of a model.", 5 | "type": "array", 6 | "items": { 7 | "instances": { 8 | "type": "array", 9 | "items": { 10 | "type":"array", 11 | "items": { 12 | "type":"object", 13 | "properties": { 14 | "candidates": { 15 | "type":"array", 16 | "items": { 17 | "type":"object", 18 | "properties": { 19 | "text": {"type":"string" }, 20 | "label": {"type": "string" }, 21 | "score": {"type": "number"}, 22 | "span": { 23 | "type": "array", 24 | "items": { 25 | "type":"integer" 26 | } 27 | } 28 | } 29 | } 30 | } 31 | } 32 | } 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- 
/jack/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/jack/io/__init__.py -------------------------------------------------------------------------------- /jack/io/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from jack.io.embeddings.embeddings import Embeddings, load_embeddings 4 | from jack.io.embeddings.glove import load_glove 5 | from jack.io.embeddings.word_to_vec import load_word2vec, get_word2vec_vocabulary 6 | __all__ = [ 7 | 'Embeddings', 8 | 'load_embeddings', 9 | 'load_word2vec', 10 | 'get_word2vec_vocabulary', 11 | 'load_glove', 12 | ] 13 | -------------------------------------------------------------------------------- /jack/io/embeddings/fasttext.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | import numpy as np 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def load_fasttext(stream, vocab=None): 11 | """Loads a fastText file and merges it with the optional vocabulary if given. 12 | Args: 13 | stream (iterable): An opened filestream to the fastText file. 14 | vocab (dict=None): Word2idx dict of existing vocabulary. 15 | Returns: 16 | return_vocab (Vocabulary), lookup (matrix); Vocabulary contains the 17 | word2idx and the matrix contains the embedded words. 18 | """ 19 | logger.info('Loading fastText vectors ..') 20 | 21 | word2idx = {} 22 | vec_n, vec_size = map(int, stream.readline().split()) 23 | lookup = np.empty([vocab.get_size() if vocab is not None else vec_n, vec_size], dtype=np.float) 24 | n = 0 25 | for line in stream: 26 | word, vec = line.rstrip().split(maxsplit=1) 27 | if vocab is None or word in vocab and word not in word2idx: 28 | word = word.decode('utf-8') 29 | idx = len(word2idx) 30 | word2idx[word] = idx 31 | # if idx > np.size(lookup, axis=0) - 1: 32 | # lookup.resize([lookup.shape[0] + 500000, lookup.shape[1]]) 33 | lookup[idx] = np.fromstring(vec, sep=' ') 34 | n += 1 35 | # lookup.resize([len(word2idx), dim]) 36 | logger.info('Loading fastText vectors completed.') 37 | return word2idx, lookup 38 | -------------------------------------------------------------------------------- /jack/io/embeddings/glove.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | import numpy as np 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def load_glove(stream, vocab=None): 11 | """Loads a GloVe file and merges it with the optional vocabulary if given. 12 | Args: 13 | stream (iterable): An opened filestream to the GloVe file. 14 | vocab (dict=None): Word2idx dict of existing vocabulary. 15 | Returns: 16 | return_vocab (Vocabulary), lookup (matrix); Vocabulary contains the 17 | word2idx and the matrix contains the embedded words.
18 | """ 19 | logger.info('Loading GloVe vectors ..') 20 | 21 | word2idx = {} 22 | first_line = stream.readline() 23 | dim = len(first_line.split()) - 1 24 | lookup = np.empty([500000, dim], dtype=np.float) 25 | lookup[0] = np.fromstring(first_line.split(maxsplit=1)[1], sep=' ') 26 | word2idx[first_line.split(maxsplit=1)[0].decode('utf-8')] = 0 27 | n = 1 28 | for line in stream: 29 | word, vec = line.rstrip().split(maxsplit=1) 30 | if vocab is None or word in vocab and word not in word2idx: 31 | word = word.decode('utf-8') 32 | idx = len(word2idx) 33 | word2idx[word] = idx 34 | if idx > np.size(lookup, axis=0) - 1: 35 | lookup.resize([lookup.shape[0] + 500000, lookup.shape[1]]) 36 | lookup[idx] = np.fromstring(vec, sep=' ') 37 | n += 1 38 | lookup.resize([len(word2idx), dim]) 39 | logger.info('Loading GloVe vectors completed.') 40 | return word2idx, lookup 41 | -------------------------------------------------------------------------------- /jack/io/embeddings/memory_map.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import os 5 | 6 | import numpy as np 7 | 8 | from jack.io.embeddings import Embeddings 9 | 10 | 11 | def load_memory_map_dir(directory: str) -> Embeddings: 12 | """ 13 | Loads embeddings from a memory map directory to allow lazy loading (and reduce the memory usage). 14 | Args: 15 | directory: a file prefix. This function loads two files in the directory: a meta json file with shape information 16 | and the vocabulary, and the actual memory map file. 17 | 18 | Returns: 19 | Embeddings object with a lookup matrix that is backed by a memory map. 20 | 21 | """ 22 | meta_file = os.path.join(directory, "meta.json") 23 | mem_map_file = os.path.join(directory, "memory_map") 24 | with open(meta_file, "r") as f: 25 | meta = json.load(f) 26 | shape = tuple(meta['shape']) 27 | vocab = meta['vocab'] 28 | mem_map = np.memmap(mem_map_file, dtype='float32', mode='r+', shape=shape) 29 | result = Embeddings(vocab, mem_map, filename=directory, emb_format="memory_map_dir") 30 | return result 31 | 32 | 33 | def save_as_memory_map_dir(directory: str, emb: Embeddings): 34 | """ 35 | Saves the given embeddings as memory map file and corresponding meta data in a directory. 36 | Args: 37 | directory: the directory to store the memory map file in (called `memory_map`) and the meta file (called 38 | `meta.json` that stores the shape of the memory map and the actual vocabulary. 39 | emb: the embeddings to store. 40 | """ 41 | if not os.path.exists(directory): 42 | os.makedirs(directory) 43 | 44 | meta_file = os.path.join(directory, "meta.json") 45 | mem_map_file = os.path.join(directory, "memory_map") 46 | with open(meta_file, "w") as f: 47 | json.dump({ 48 | "vocab": emb.vocabulary, 49 | "shape": emb.shape 50 | }, f) 51 | mem_map = np.memmap(mem_map_file, dtype='float32', mode='w+', shape=emb.shape) 52 | mem_map[:] = emb.lookup[:] 53 | mem_map.flush() 54 | del mem_map 55 | -------------------------------------------------------------------------------- /jack/io/embeddings/word_to_vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import gzip 4 | import numpy as np 5 | 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def load_word2vec(filename, vocab=None, normalise=True): 12 | """Loads a word2vec file and merges existing vocabulary. 13 | 14 | Args: 15 | filename (string): Path to the word2vec file. 
16 | vocab (Vocabulary=None): Existing vocabulary to be merged. 17 | normalise (bool=True): If the word embeddings should be unit 18 | normalized or not. 19 | Returns: 20 | return_vocab (dict), lookup (matrix): The dict is a word2idx dict and 21 | the lookup matrix is the matrix of embedding vectors. 22 | """ 23 | logger.info("Loading word2vec vectors ..") 24 | with gzip.open(filename, 'rb') as f: 25 | vec_n, vec_size = map(int, f.readline().split()) 26 | byte_size = vec_size * 4 27 | lookup = np.empty([vocab.get_size() if vocab is not None else vec_n, vec_size], dtype=np.float32) 28 | word2idx = {} 29 | idx = 0 30 | for n in range(vec_n): 31 | word = b'' 32 | while True: 33 | c = f.read(1) 34 | if c == b' ': 35 | break 36 | else: 37 | word += c 38 | 39 | word = word.decode('utf-8') 40 | vector = np.fromstring(f.read(byte_size), dtype=np.float32) 41 | if vocab is None or vocab.contains_word(word): 42 | word2idx[word] = idx 43 | lookup[idx] = _normalise(vector) if normalise else vector 44 | idx += 1 45 | 46 | lookup.resize([idx, vec_size]) 47 | logger.info('Loading word2vec vectors completed.') 48 | return word2idx, lookup 49 | 50 | 51 | def _normalise(x): 52 | """Unit normalize x with L2 norm.""" 53 | return (1.0 / np.linalg.norm(x, ord=2)) * x 54 | 55 | 56 | def get_word2vec_vocabulary(fname): 57 | """Loads word2vec file and returns the vocabulary as dict word2idx.""" 58 | voc, _ = load_word2vec(fname) 59 | return voc 60 | 61 | 62 | if __name__ == "__main__": 63 | pickle_tokens = False 64 | vocab, _ = load_word2vec('../../data/word2vec/GoogleNews-vectors-negative300.bin.gz') 65 | 66 | # pickle token set 67 | if pickle_tokens: 68 | import pickle 69 | w2v_words = set(vocab.get_all_words()) 70 | pickle.dump(w2v_words, open('./data/w2v_tokens.pickle', 'wb')) 71 | -------------------------------------------------------------------------------- /jack/io/load.py: -------------------------------------------------------------------------------- 1 | """Implementation of loaders for common datasets.""" 2 | 3 | import json 4 | 5 | from jack.core.data_structures import * 6 | from jack.io.SNLI2jtr import convert_snli 7 | from jack.io.SQuAD2jtr import convert_squad 8 | 9 | loaders = dict() 10 | 11 | 12 | def _register(name): 13 | def _decorator(f): 14 | loaders[name] = f 15 | return f 16 | 17 | return _decorator 18 | 19 | 20 | @_register('jack') 21 | def load_jack(path, max_count=None): 22 | """ 23 | This function loads a jack json file from a specific location. 24 | Args: 25 | path: the location to load from. 26 | max_count: how many instances to load at most 27 | 28 | Returns: 29 | A list of input-answer pairs. 30 | 31 | """ 32 | # We load json directly instead 33 | with open(path) as f: 34 | jtr_data = json.load(f) 35 | 36 | return jack_to_qasetting(jtr_data, max_count) 37 | 38 | 39 | @_register('squad') 40 | def load_squad(path, max_count=None): 41 | """ 42 | This function loads a squad json file from a specific location. 43 | Args: 44 | path: the location to load from. 45 | max_count: how many instances to load at most 46 | 47 | Returns: 48 | A list of input-answer pairs. 49 | """ 50 | # We load to jtr dict and convert to qa settings for now 51 | jtr_data = convert_squad(path) 52 | return jack_to_qasetting(jtr_data, max_count) 53 | 54 | 55 | @_register('snli') 56 | def load_snli(path, max_count=None): 57 | """ 58 | This function loads a jack json file with labelled answers from a specific location. 59 | Args: 60 | path: the location to load from. 
61 | max_count: how many instances to load at most 62 | 63 | Returns: 64 | A list of input-answer pairs. 65 | """ 66 | # We load to jtr dict and convert to qa settings for now 67 | jtr_data = convert_snli(path) 68 | return jack_to_qasetting(jtr_data, max_count) 69 | -------------------------------------------------------------------------------- /jack/io/merge_JTR_data_files.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file merges two data files, both in JTR format, into a single JTR data file. 3 | It assumes that the structure of instances is identical for both input files 4 | and only concatenates the two instances lists. 5 | It also assumes that the global variables are identical in both input files. 6 | """ 7 | 8 | import json 9 | import sys 10 | 11 | 12 | def main(): 13 | 14 | if len(sys.argv) != 4: 15 | print('Wrong arguments for merging two data files in Jack format into one. Usage:') 16 | print('\tpython3 merge_JTR_data_files.py input1.json input2.json output.json') 17 | else: 18 | # load input 1 19 | with open(sys.argv[1], 'r') as inputfile1: 20 | content1 = json.load(inputfile1) 21 | 22 | # load input 2 23 | with open(sys.argv[2], 'r') as inputfile2: 24 | content2 = json.load(inputfile2) 25 | 26 | # define new 'meta' field 27 | meta_ = "Merged Content of {} and {}".format(content1['meta'], content2['meta']) 28 | 29 | # define new 'globals' field. Note: so far assuming same globals in both input files. 30 | assert (content1['globals']) == content2['globals'] 31 | globals_ = content1['globals'] 32 | 33 | # concatenating instances of both input files 34 | instances_ = content1['instances'] + content2['instances'] 35 | 36 | # defining the dictionary for dumping into json 37 | merged_content = {'meta': meta_, 'globals': globals_, 'instances': instances_} 38 | 39 | # sanity check: nothing unexpected got lost or added 40 | assert len(content1['instances']) + len(content2['instances']) == len(merged_content['instances']) 41 | 42 | # summary print 43 | print('Merged file {} with {} into {}'.format(sys.argv[1],sys.argv[2],sys.argv[3])) 44 | print('Number of instances: input1: {} input2: {} output: {}'\ 45 | .format(len(content1['instances']), len(content2['instances']), len(merged_content['instances']))) 46 | 47 | # dump merged content into JTR output file.
48 | with open(sys.argv[3], 'w') as outputfile: 49 | json.dump(merged_content, outputfile) 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /jack/io/newsqa2squad.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import sys 4 | from collections import Counter 5 | 6 | input_fn = sys.argv[1] 7 | output_fn = sys.argv[2] 8 | 9 | dataset = [] 10 | squad_style_dataset = {"data": dataset, "version": "1"} 11 | 12 | with open(input_fn, "r") as f: 13 | reader = csv.reader(f) 14 | reader.__next__() 15 | for row in reader: 16 | [story_id, question, answer_char_ranges, is_answer_absent, is_question_bad, validated_answers, story_text] = row 17 | 18 | spans = None 19 | if validated_answers: 20 | answers = json.loads(validated_answers) 21 | spans = [k for k, v in answers.items() if ":" in k] 22 | else: 23 | answers = Counter() 24 | for rs in answer_char_ranges.split("|"): 25 | for r in set(rs.split(",")): 26 | if ":" in r: 27 | answers[r] += 1 28 | spans = [k for k, v in answers.items() if ":" in k and v >= 2] 29 | 30 | if spans: 31 | example = {"title": story_id, "paragraphs": [ 32 | { 33 | "context": story_text, 34 | "qas": [{ 35 | "question": question, 36 | "id": story_id + "_" + question.replace(" ", "_"), 37 | "answers": [{ 38 | "answer_start": int(span.split(":")[0]), 39 | "text": story_text[int(span.split(":")[0]):int(span.split(":")[1])] 40 | } for span in spans] 41 | }] 42 | } 43 | ]} 44 | dataset.append(example) 45 | # else: 46 | # print("No span found for %s" % story_id) 47 | 48 | with open(output_fn, "w") as f: 49 | json.dump(squad_style_dataset, f) 50 | -------------------------------------------------------------------------------- /jack/io/read_semeval2017Task10.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def readAnn(textfolder="../data/SemEval2017Task10/"): 5 | ''' 6 | Read .ann files and look up corresponding spans in .txt files 7 | 8 | Args: 9 | textfolder: 10 | ''' 11 | 12 | flist = os.listdir(textfolder) 13 | for f in flist: 14 | if not f.endswith(".ann"): 15 | continue 16 | 17 | f_anno = open(os.path.join(textfolder, f), "rU") 18 | f_text = open(os.path.join(textfolder, f.replace(".ann", ".txt")), "rU") 19 | 20 | # there's only one line, as each .ann file is one text paragraph 21 | for l in f_text: 22 | text = l 23 | 24 | #@TODO: collect all keyphrase and relation annotations, create pairs of all keyphrase that appear in same sentence for USchema style RE 25 | 26 | for l in f_anno: 27 | anno_inst = l.strip().split("\t") 28 | if len(anno_inst) == 3: 29 | keytype, start, end = anno_inst[1].split(" ") 30 | if not keytype.endswith("-of"): 31 | 32 | # look up span in text and print error message if it doesn't match the .ann span text 33 | keyphr_text_lookup = text[int(start):int(end)] 34 | keyphr_ann = anno_inst[2] 35 | if keyphr_text_lookup != keyphr_ann: 36 | print("Spans don't match for anno " + l.strip() + " in file " + f) 37 | 38 | #if keytype.endswith("-of"): 39 | 40 | 41 | if __name__ == '__main__': 42 | readAnn() -------------------------------------------------------------------------------- /jack/io/scienceQA2jtr.py: -------------------------------------------------------------------------------- 1 | import json 2 | import io 3 | import random 4 | 5 | def convert_scienceCloze_to_jtr(scienceQAFile): 6 | 7 | instances = [] 8 | 9 | f = 
io.open(scienceQAFile, "r", encoding="utf-8") 10 | 11 | for l in f: 12 | l = l.strip().lower().split("\t")  # do the lower case preprocessing here 13 | try: 14 | quest, answs, cands, context, contextID = l 15 | except ValueError: 16 | print(l) 17 | continue 18 | 19 | context = context[2:-2].split('\', \'') 20 | 21 | support = [] 22 | for i, c in enumerate(context): 23 | support.append({"id": contextID + "_" + str(i), "text": c}) 24 | candidates = cands[2:-2].split('\', \'') 25 | 26 | qdict = { 27 | 'question': quest, 28 | 'candidates': [ 29 | { 30 | 'text': cand 31 | } for cand in candidates 32 | ], 33 | 'answers': [{'text': answs}] 34 | } 35 | qset_dict = { 36 | 'support': support, 37 | 'questions': [qdict] 38 | } 39 | 40 | instances.append(qset_dict) 41 | 42 | 43 | 44 | random.shuffle(instances) 45 | 46 | corpus_dict = { 47 | 'meta': "scienceQA.json", 48 | 'instances': instances 49 | } 50 | 51 | f.close() 52 | 53 | return corpus_dict 54 | 55 | 56 | 57 | if __name__ == "__main__": 58 | corpus = convert_scienceCloze_to_jtr("../data/scienceQA/clozeSummaryLocal_test.txt") 59 | with open("../data/scienceQA/scienceQA_clozeSummaryLocal_test.json", 'w') as outfile: 60 | json.dump(corpus, outfile, indent=2, ensure_ascii=False) -------------------------------------------------------------------------------- /jack/io/sentihood2jtr.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import defaultdict 3 | import sys 4 | import os 5 | 6 | 7 | def main(): 8 | # = parse_cbt_example(instances[0]) 9 | if len(sys.argv) == 2: 10 | with open(sys.argv[1], 'r') as f: 11 | sentihood_data = json.load(f) 12 | 13 | convert_to_jtr(sentihood_data) 14 | elif len(sys.argv) == 1: 15 | data_path = '../data/sentihood/' 16 | filenames = ['sentihood-train.json', 'sentihood-dev.json', 17 | 'sentihood-test.json'] 18 | for i, f in enumerate(filenames): 19 | raw_data = json.load(open(os.path.join(data_path, f))) 20 | instances = convert_to_jtr(raw_data) 21 | 22 | if i == 0: # training data -> write overfit set 23 | json.dump(wrap_into_jtr_global(instances[:100]), 24 | open('../../tests/test_data/sentihood/overfit.json','w'), 25 | indent=2) 26 | 27 | # write data sets for smalldata tests 28 | json.dump(wrap_into_jtr_global(instances[:1000]), 29 | open(os.path.join('../../tests/test_data/sentihood/',f),'w'), 30 | indent=2) 31 | 32 | def wrap_into_jtr_global(instances): 33 | reading_dataset = { 34 | 'globals': { 35 | 'candidates': [ 36 | {'text': 'Negative'}, 37 | {'text': 'Positive'}, 38 | {'text': 'Neutral'} 39 | ] 40 | }, 41 | 'instances': instances 42 | } 43 | return reading_dataset 44 | 45 | 46 | 47 | def convert_to_jtr(sentihood_data, exhaustive=True): 48 | instances = [] 49 | # collect all aspects 50 | aspects = set() 51 | for instance in sentihood_data: 52 | if 'opinions' in instance.keys(): 53 | for opinion in instance['opinions']: 54 | aspects.add(opinion['aspect']) 55 | for instance in sentihood_data: 56 | text = instance['text'] 57 | answers = defaultdict(lambda: 'Neutral') 58 | if 'opinions' in instance.keys(): 59 | for opinion in instance['opinions']: 60 | aspect = opinion['aspect'] 61 | answers[aspect] = opinion['sentiment'] 62 | 63 | for aspect in aspects if exhaustive else answers.keys(): 64 | reading_instance = { 65 | 'support': [{'text': text}], 66 | 'questions': [{'question': aspect, 'answers': [{'text': answers[aspect]}]}] 67 | } 68 | instances.append(reading_instance) 69 | 70 | return instances 71 | 72 | 73 | if 
__name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /jack/io/simpleQuestions2jtr.py: -------------------------------------------------------------------------------- 1 | import json 2 | import io 3 | 4 | 5 | def create_snippet(file_path, first_n=5): 6 | with open(file_path, 'r') as f: 7 | return [next(f) for _ in range(first_n)] 8 | 9 | 10 | def create_jtr_snippet(file_path): 11 | return convert_simplequestions(file_path, first_n=5) 12 | 13 | 14 | def convert_simplequestions(file_path, first_n=None): 15 | instances = [] 16 | f = io.open(file_path, "r") 17 | i = 0 18 | for l in f: 19 | i += 1 20 | if first_n and i > first_n: 21 | break 22 | subj, rel, obj, qu = l.strip().split("\t") 23 | 24 | support = [" ".join([subj, rel])] 25 | qdict = { 26 | 'question': qu, 27 | 'answers': [obj] 28 | } 29 | qset_dict = { 30 | 'support': [{'text': supp} for supp in support], 31 | 'questions': [qdict] 32 | } 33 | instances.append(qset_dict) 34 | 35 | corpus_dict = { 36 | 'meta': "simpleQuestions.json", 37 | 'instances': instances 38 | } 39 | 40 | f.close() 41 | 42 | return corpus_dict 43 | 44 | 45 | def main(): 46 | # some tests: 47 | # raw_data = load_cbt_file(path=None, part='valid', mode='NE') 48 | # instances = split_cbt(raw_data) 49 | # = parse_cbt_example(instances[0]) 50 | 51 | import sys 52 | if len(sys.argv) == 3: 53 | # corpus = create_jtr_snippet(sys.argv[1]) 54 | # out = create_snippet(sys.argv[1]) 55 | # with open(sys.argv[2], 'w') as outfile: 56 | # outfile.writelines(out) 57 | corpus = convert_simplequestions(sys.argv[1]) 58 | with open(sys.argv[2], 'w') as outfile: 59 | json.dump(corpus, outfile, indent=2) 60 | else: 61 | print("Usage: python3 simpleQuestions2jtr.py path/to/simpleQuestions save/to/simpleQuestions.jack.json") 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /jack/io/validate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import jsonschema 5 | from sys import argv 6 | 7 | def main(arg1, arg2): 8 | with open(arg1) as f: 9 | data = json.load(f) 10 | 11 | with open(arg2) as f: 12 | schema = json.load(f) 13 | 14 | try: 15 | jsonschema.validate(data, schema) 16 | return 'JSON successfully validated.' 
17 | except jsonschema.ValidationError as e: 18 | return e.message 19 | except jsonschema.SchemaError as e: 20 | return e 21 | 22 | 23 | if __name__ == '__main__': 24 | response = main(argv[1], argv[2]) 25 | print(response) 26 | -------------------------------------------------------------------------------- /jack/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from jack.readers.implementations import * 2 | -------------------------------------------------------------------------------- /jack/readers/classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/jack/readers/classification/__init__.py -------------------------------------------------------------------------------- /jack/readers/classification/util.py: -------------------------------------------------------------------------------- 1 | """Shared utilities for multiple choice.""" 2 | from typing import Iterable 3 | 4 | from jack.core.data_structures import QASetting, Answer 5 | from jack.util.vocab import Vocab 6 | 7 | 8 | def create_answer_vocab(qa_settings: Iterable[QASetting] = None, answers: Iterable[Answer] = None): 9 | vocab = Vocab(unk=None) 10 | if qa_settings is not None: 11 | for qa in qa_settings: 12 | if qa.candidates: 13 | for c in qa.candidates: 14 | vocab(c) 15 | if answers is not None: 16 | for a in answers: 17 | vocab(a.text) 18 | return vocab 19 | 20 | 21 | def candidate_one_hot(candidates, answer_str): 22 | return [1.0 if candidates[answer_str] == cand else 0.0 for cand in candidates] 23 | -------------------------------------------------------------------------------- /jack/readers/extractive_qa/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /jack/readers/extractive_qa/tensorflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/jack/readers/extractive_qa/tensorflow/__init__.py -------------------------------------------------------------------------------- /jack/readers/extractive_qa/tensorflow/abstract_model.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | from jack.core import Ports, TensorPort, TensorPortTensors 4 | from jack.core.tensorflow import TFModelModule 5 | from jack.readers.extractive_qa.shared import XQAPorts 6 | from jack.util.tf.xqa import xqa_crossentropy_loss 7 | 8 | 9 | class AbstractXQAModelModule(TFModelModule): 10 | _input_ports = [XQAPorts.emb_question, XQAPorts.question_length, 11 | XQAPorts.emb_support, XQAPorts.support_length, XQAPorts.support2question, 12 | # char embedding inputs 13 | XQAPorts.word_chars, XQAPorts.word_char_length, 14 | XQAPorts.question_batch_words, XQAPorts.support_batch_words, 15 | # feature input 16 | XQAPorts.word_in_question, 17 | # optional input, provided only during training 18 | XQAPorts.correct_start, XQAPorts.answer2support_training, 19 | XQAPorts.is_eval] 20 | 21 | _output_ports = [XQAPorts.start_scores, XQAPorts.end_scores, 22 | XQAPorts.answer_span] 23 | _training_input_ports = [XQAPorts.start_scores, XQAPorts.end_scores, 24 | XQAPorts.answer_span_target, XQAPorts.answer2support_training, XQAPorts.support2question] 25 
| _training_output_ports = [Ports.loss] 26 | 27 | @property 28 | def output_ports(self) -> Sequence[TensorPort]: 29 | return self._output_ports 30 | 31 | @property 32 | def input_ports(self) -> Sequence[TensorPort]: 33 | return self._input_ports 34 | 35 | @property 36 | def training_input_ports(self) -> Sequence[TensorPort]: 37 | return self._training_input_ports 38 | 39 | @property 40 | def training_output_ports(self) -> Sequence[TensorPort]: 41 | return self._training_output_ports 42 | 43 | def create_training_output(self, shared_resources, input_tensors): 44 | tensors = TensorPortTensors(input_tensors) 45 | return { 46 | Ports.loss: xqa_crossentropy_loss(tensors.start_scores, tensors.end_scores, 47 | tensors.answer_span_target, tensors.answer2support, 48 | tensors.support2question, 49 | use_sum=shared_resources.config.get('loss', 'sum') == 'sum') 50 | } 51 | -------------------------------------------------------------------------------- /jack/readers/extractive_qa/torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/jack/readers/extractive_qa/torch/__init__.py -------------------------------------------------------------------------------- /jack/readers/link_prediction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/jack/readers/link_prediction/__init__.py -------------------------------------------------------------------------------- /jack/readers/link_prediction/similarities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | 5 | import tensorflow as tf 6 | 7 | 8 | def negative_l1_distance(x1, x2, axis=1): 9 | """ 10 | Negative L1 Distance. 11 | 12 | .. math:: L = - \\sum_i \\abs(x1_i - x2_i) 13 | 14 | Args: 15 | x1: First term. 16 | x2: Second term. 17 | axis: Reduction Indices. 18 | 19 | Returns: 20 | Similarity Value. 21 | """ 22 | distance = tf.reduce_sum(tf.abs(x1 - x2), axis=axis) 23 | return - distance 24 | 25 | 26 | def negative_l2_distance(x1, x2, axis=1): 27 | """ 28 | Negative L2 Distance. 29 | 30 | .. math:: L = - \\sqrt{\\sum_i (x1_i - x2_i)^2} 31 | 32 | Args: 33 | x1: First term. 34 | x2: Second term. 35 | axis: Reduction Indices. 36 | 37 | Returns: 38 | Similarity Value. 39 | """ 40 | 41 | distance = tf.sqrt(tf.reduce_sum(tf.square(x1 - x2), axis=axis)) 42 | return - distance 43 | 44 | 45 | def negative_square_l2_distance(x1, x2, axis=1): 46 | """ 47 | Negative Square L2 Distance. 48 | 49 | .. math:: L = - \\sum_i (x1_i - x2_i)^2 50 | 51 | Args: 52 | x1: First term. 53 | x2: Second term. 54 | axis: Reduction Indices. 55 | 56 | Returns: 57 | Similarity Value. 58 | """ 59 | distance = tf.reduce_sum(tf.square(x1 - x2), axis=axis) 60 | return - distance 61 | 62 | 63 | def dot_product(x1, x2, axis=1): 64 | """ 65 | Dot Product. 66 | 67 | .. math:: L = \\sum_i x1_i x2_i 68 | 69 | Args: 70 | x1: First term. 71 | x2: Second term. 72 | axis: Reduction Indices. 73 | 74 | Returns: 75 | Similarity Value. 
76 | """ 77 | 78 | similarity = tf.reduce_sum(x1 * x2, axis=axis) 79 | return similarity 80 | 81 | 82 | # Aliases 83 | l1 = L1 = negative_l1_distance 84 | l2 = L2 = negative_l2_distance 85 | l2_sqr = L2_SQR = negative_square_l2_distance 86 | dot = DOT = dot_product 87 | 88 | 89 | def get_function(function_name): 90 | this_module = sys.modules[__name__] 91 | if not hasattr(this_module, function_name): 92 | raise ValueError('Unknown similarity function: {}'.format(function_name)) 93 | return getattr(this_module, function_name) 94 | -------------------------------------------------------------------------------- /jack/readers/natural_language_inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/jack/readers/natural_language_inference/__init__.py -------------------------------------------------------------------------------- /jack/readers/natural_language_inference/conditional_bilstm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from jack.readers.classification.shared import AbstractSingleSupportClassificationModel 4 | from jack.util.tf.rnn import fused_birnn 5 | 6 | 7 | class ConditionalBiLSTMClassificationModel(AbstractSingleSupportClassificationModel): 8 | def forward_pass(self, shared_resources, embedded_question, embedded_support, num_classes, tensors): 9 | # question - hypothesis; support - premise 10 | repr_dim = shared_resources.config['repr_dim'] 11 | dropout = shared_resources.config.get("dropout", 0.0) 12 | 13 | with tf.variable_scope('embedding_projection') as vs: 14 | embedded_question = tf.layers.dense(embedded_question, repr_dim, tf.tanh, name='projection') 15 | vs.reuse_variables() 16 | embedded_support = tf.layers.dense(embedded_support, repr_dim, tf.tanh, name='projection') 17 | # keep dropout mask constant over time 18 | dropout_shape = [tf.shape(embedded_question)[0], 1, tf.shape(embedded_question)[2]] 19 | embedded_question = tf.nn.dropout(embedded_question, 1.0 - dropout, dropout_shape) 20 | embedded_support = tf.nn.dropout(embedded_support, 1.0 - dropout, dropout_shape) 21 | 22 | fused_rnn = tf.contrib.rnn.LSTMBlockFusedCell(repr_dim) 23 | # [batch, 2*output_dim] -> [batch, num_classes] 24 | _, q_states = fused_birnn(fused_rnn, embedded_question, sequence_length=tensors.question_length, 25 | dtype=tf.float32, time_major=False, scope="question_rnn") 26 | 27 | outputs, _ = fused_birnn(fused_rnn, embedded_support, sequence_length=tensors.support_length, 28 | dtype=tf.float32, initial_state=q_states, time_major=False, scope="support_rnn") 29 | 30 | # [batch, T, 2 * dim] -> [batch, dim] 31 | outputs = tf.concat([outputs[0], outputs[1]], axis=2) 32 | hidden = tf.layers.dense(outputs, repr_dim, tf.nn.relu, name="hidden") * tf.expand_dims( 33 | tf.sequence_mask(tensors.support_length, maxlen=tf.shape(outputs)[1], dtype=tf.float32), 2) 34 | hidden = tf.reduce_max(hidden, axis=1) 35 | # [batch, dim] -> [batch, num_classes] 36 | outputs = tf.layers.dense(hidden, num_classes, name="classification") 37 | return outputs 38 | -------------------------------------------------------------------------------- /jack/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/jack/util/__init__.py 
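The similarity functions and the `get_function` helper in `jack/readers/link_prediction/similarities.py` above are resolved by name (or by one of the aliases such as `dot`, `l1`, `l2`). A minimal usage sketch under TensorFlow 1.x (as used throughout this repository), with randomly generated embeddings that are purely illustrative:

```python
# Minimal sketch: resolve a similarity function by name and score a batch of
# illustrative subject/object embedding pairs (TF 1.x session-style execution).
import tensorflow as tf

from jack.readers.link_prediction.similarities import get_function

subj_emb = tf.random_normal([4, 8])   # hypothetical batch of 4 embeddings, dim 8
obj_emb = tf.random_normal([4, 8])

score_fn = get_function('dot_product')  # 'l1', 'l2', 'l2_sqr' or their aliases also resolve
scores = score_fn(subj_emb, obj_emb)    # shape [4]: one similarity score per pair

with tf.Session() as sess:
    print(sess.run(scores))
```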
-------------------------------------------------------------------------------- /jack/util/map.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | import numpy as np 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def get_list_shape(xs): 10 | if isinstance(xs, int): 11 | shape = [] 12 | else: 13 | shape = [len(xs)] 14 | for i, x in enumerate(xs): 15 | if isinstance(x, list) or isinstance(x, tuple): 16 | if len(shape) == 1: 17 | shape.append(0) 18 | shape[1] = max(len(x), shape[1]) 19 | for j, y in enumerate(x): 20 | if isinstance(y, list): 21 | if len(shape) == 2: 22 | shape.append(0) 23 | shape[2] = max(len(y), shape[2]) 24 | return shape 25 | 26 | 27 | def numpify(xs, pad=0, keys=None, dtypes=None): 28 | """Converts a dict or list of Python data into a dict of numpy arrays.""" 29 | is_dict = isinstance(xs, dict) 30 | xs_np = {} if is_dict else [0] * len(xs) 31 | xs_iter = xs.items() if is_dict else enumerate(xs) 32 | 33 | for i, (key, x) in enumerate(xs_iter): 34 | try: 35 | if (keys is None or key in keys) and not isinstance(x, np.ndarray): 36 | shape = get_list_shape(x) 37 | dtype = dtypes[i] if dtypes is not None else np.int64 38 | x_np = np.full(shape, pad, dtype) 39 | 40 | nb_dims = len(shape) 41 | 42 | if nb_dims == 0: 43 | x_np = x 44 | else: 45 | def f(tensor, values): 46 | t_shp = tensor.shape 47 | if len(t_shp) > 1: 48 | for _i, _values in enumerate(values): 49 | f(tensor[_i], _values) 50 | else: 51 | tensor[0:len(values)] = [v for v in values] 52 | 53 | f(x_np, x) 54 | 55 | xs_np[key] = x_np 56 | else: 57 | xs_np[key] = x 58 | except Exception as e: 59 | logger.error('Error numpifying value ' + str(x) + ' of key ' + str(key)) 60 | raise e 61 | return xs_np 62 | -------------------------------------------------------------------------------- /jack/util/random.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | 6 | def singleton(cls): 7 | instances = {} 8 | 9 | def getinstance(*args, **kwargs): 10 | if cls not in instances: 11 | instances[cls] = cls(*args, **kwargs) 12 | return instances[cls] 13 | return getinstance 14 | 15 | 16 | @singleton 17 | class DefaultRandomState(np.random.RandomState): 18 | def __init__(self, seed=None): 19 | super().__init__(seed) 20 | -------------------------------------------------------------------------------- /jack/util/tf/__init__.py: -------------------------------------------------------------------------------- 1 | """The tf package should contain all tf functionality of jtr for maximal reuse""" 2 | -------------------------------------------------------------------------------- /jack/util/tf/activations.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def parametric_relu(x, name=None): 7 | alphas = tf.get_variable('{}/alpha'.format(name) if name else 'alpha', 8 | x.get_shape()[-1], 9 | initializer=tf.constant_initializer(0.0), 10 | dtype=tf.float32) 11 | return tf.nn.relu(x) + alphas * (x - abs(x)) * 0.5 12 | 13 | 14 | def selu(x, name=None): 15 | with tf.name_scope('{}/elu'.format(name) if name else 'elu') as _: 16 | alpha = 1.6732632423543772848170429916717 17 | scale = 1.0507009873554804934193349852946 18 | return scale*tf.where(x >= 0.0, x, alpha*tf.nn.elu(x)) 19 | 20 | 21 | # Aliases 22 | prelu = parametric_relu 23 | 24 | 25 | def 
activation_from_string(activation_str): 26 | if activation_str is None: 27 | return tf.identity 28 | return getattr(tf.nn, activation_str) 29 | -------------------------------------------------------------------------------- /jack/util/tf/dropout.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def fixed_dropout(xs, keep_prob, noise_shape, seed=None): 7 | """ 8 | Apply dropout with same mask over all inputs 9 | Args: 10 | xs: list of tensors 11 | keep_prob: 12 | noise_shape: 13 | seed: 14 | 15 | Returns: 16 | list of dropped inputs 17 | """ 18 | with tf.name_scope("dropout", values=xs): 19 | noise_shape = noise_shape 20 | # uniform [keep_prob, 1.0 + keep_prob) 21 | random_tensor = keep_prob 22 | random_tensor += tf.random_uniform(noise_shape, seed=seed, dtype=xs[0].dtype) 23 | # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob) 24 | binary_tensor = tf.floor(random_tensor) 25 | outputs = [] 26 | for x in xs: 27 | ret = tf.div(x, keep_prob) * binary_tensor 28 | ret.set_shape(x.get_shape()) 29 | outputs.append(ret) 30 | return outputs 31 | -------------------------------------------------------------------------------- /jack/util/tf/highway.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def highway_layer(inputs, activation, name=None): 7 | with tf.variable_scope(name or "highway_layer"): 8 | d = inputs.get_shape()[-1].value 9 | trans_gate = tf.contrib.layers.fully_connected(inputs, 2 * d, activation_fn=None, weights_initializer=None, 10 | scope='trans_gate') 11 | trans, gate = tf.split(trans_gate, 2, len(inputs.get_shape()) - 1) 12 | trans, gate = activation(trans), tf.sigmoid(gate) 13 | out = gate * trans + (1 - gate) * inputs 14 | return out 15 | 16 | 17 | def highway_network(inputs, num_layers, activation=tf.tanh, name=None, reuse=False): 18 | with tf.variable_scope(name or "highway_network", reuse=reuse): 19 | prev = inputs 20 | cur = None 21 | for layer_idx in range(num_layers): 22 | cur = highway_layer(prev, activation, name="layer_{}".format(layer_idx)) 23 | prev = cur 24 | return cur 25 | -------------------------------------------------------------------------------- /jack/util/tf/masking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def mask_3d(sequences, sequence_lengths, mask_value, dimension=2): 7 | """ 8 | Given a batch of matrices, each with shape m x n, mask the values in each 9 | row after the positions indicated in sentence_sizes. 10 | This function is supposed to mask the last columns in the raw attention 11 | matrix (e_{i, j}) in cases where the sentence2 is smaller than the 12 | maximum. 
13 | 14 | Args: 15 | sequences: tensor with shape (batch_size, m, n) 16 | sequence_lengths: tensor with shape (batch_size) containing the sentence sizes that 17 | should be limited 18 | mask_value: scalar value to assign to items after sentence size 19 | dimension: over which dimension to mask values 20 | Returns: 21 | A tensor with the same shape as `sequences` 22 | """ 23 | if dimension == 1: 24 | sequences = tf.transpose(sequences, [0, 2, 1]) 25 | time_steps1, time_steps2 = tf.shape(sequences)[1], tf.shape(sequences)[2] 26 | ones = tf.ones_like(sequences, dtype=tf.int32) 27 | pad_values = mask_value * tf.cast(ones, tf.float32) 28 | mask = tf.sequence_mask(sequence_lengths, time_steps2) 29 | # mask is (batch_size, sentence2_size). we have to tile it for 3d 30 | mask3d = tf.tile(tf.expand_dims(mask, 1), (1, time_steps1, 1)) 31 | masked = tf.where(mask3d, sequences, pad_values) 32 | return tf.transpose(masked, [0, 2, 1]) if dimension == 1 else masked 33 | -------------------------------------------------------------------------------- /jack/util/tf/misc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def mask_for_lengths(lengths, max_length=None, mask_right=True, value=-1000.0): 7 | """ 8 | Creates a [batch_size x max_length] mask. 9 | 10 | Args: 11 | lengths: int32 1-dim tensor of batch_size lengths 12 | max_length: int32 0-dim tensor or python int 13 | mask_right: if True, everything before "lengths" becomes zero and the 14 | rest "value", else vice versa 15 | value: value for the mask 16 | 17 | Returns: 18 | [batch_size x max_length] mask of zeros and "value"s 19 | """ 20 | mask = tf.sequence_mask(lengths, max_length, dtype=tf.float32) 21 | if mask_right: 22 | mask = 1.0 - mask 23 | mask *= value 24 | return mask 25 | -------------------------------------------------------------------------------- /jack/util/tf/simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def fully_connected_projection(inputs, output_size): 7 | """Projects inputs onto target dimension and returns the logits. 8 | 9 | Creates a fully connected projection layer without a non-linearity; applying a 10 | softmax cross-entropy loss and argmax predictions is left to the caller. 11 | Args: 12 | inputs (tensor): Input into the projection layer. 13 | output_size (int): Size of the targets (used in projection layer).
14 | """ 15 | init = tf.contrib.layers.xavier_initializer(uniform=True) #uniform=False for truncated normal 16 | logits = tf.contrib.layers.fully_connected(inputs, output_size, weights_initializer=init, activation_fn=None) 17 | return logits 18 | -------------------------------------------------------------------------------- /jack/util/tf/xqa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | from jack.util.tf.segment import segment_softmax 6 | 7 | 8 | def xqa_crossentropy_loss(start_scores, end_scores, answer_span, answer2support, support2question, use_sum=True): 9 | """Very common XQA loss function.""" 10 | num_questions = tf.reduce_max(support2question) + 1 11 | 12 | start, end = answer_span[:, 0], answer_span[:, 1] 13 | 14 | start_probs = segment_softmax(start_scores, support2question) 15 | start_probs = tf.gather_nd(start_probs, tf.stack([answer2support, start], 1)) 16 | 17 | # only start probs are normalized on multi-paragraph, end probs conditioned on start only on per support level 18 | num_answers = tf.shape(answer_span)[0] 19 | is_aligned = tf.equal(tf.shape(end_scores)[0], num_answers) 20 | end_probs = tf.cond( 21 | is_aligned, 22 | lambda: tf.gather_nd(tf.nn.softmax(end_scores), tf.stack([tf.range(num_answers, dtype=tf.int32), end], 1)), 23 | lambda: tf.gather_nd(segment_softmax(end_scores, support2question), tf.stack([answer2support, end], 1)) 24 | ) 25 | 26 | answer2question = tf.gather(support2question, answer2support) 27 | # compute losses individually 28 | if use_sum: 29 | span_probs = tf.unsorted_segment_sum( 30 | start_probs, answer2question, num_questions) * tf.unsorted_segment_sum( 31 | end_probs, answer2question, num_questions) 32 | else: 33 | span_probs = tf.unsorted_segment_max( 34 | start_probs, answer2question, num_questions) * tf.unsorted_segment_max( 35 | end_probs, answer2question, num_questions) 36 | 37 | return -tf.reduce_mean(tf.log(tf.maximum(1e-6, span_probs + 1e-6))) 38 | -------------------------------------------------------------------------------- /jack/util/torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/jack/util/torch/__init__.py -------------------------------------------------------------------------------- /jack/util/torch/embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import math 4 | 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional 8 | 9 | from jack.util.torch import misc 10 | 11 | 12 | class ConvCharEmbeddingModule(nn.Module): 13 | def __init__(self, num_chars, size, conv_width=5): 14 | super(ConvCharEmbeddingModule, self).__init__() 15 | self._size = size 16 | self._conv_width = conv_width 17 | self._embeddings = torch.nn.Embedding(num_chars, size) 18 | self._embeddings.weight.data.mul_(0.1) 19 | self._conv = torch.nn.Conv1d(size, size, conv_width, padding=math.floor(conv_width / 2)) 20 | 21 | def forward(self, unique_word_chars, unique_word_lengths, sequences_as_uniqs=None): 22 | long_tensor = torch.cuda.LongTensor if torch.cuda.device_count() > 0 else torch.LongTensor 23 | embedded_chars = self._embeddings(unique_word_chars.type(long_tensor)) 24 | # [N, S, L] 25 | conv_out = self._conv(embedded_chars.transpose(1, 2)) 26 | # [N, L] 27 | conv_mask = misc.mask_for_lengths(unique_word_lengths) 28 
| conv_out = conv_out + conv_mask.unsqueeze(1) 29 | embedded_words = conv_out.max(2)[0] 30 | 31 | if sequences_as_uniqs is None: 32 | return embedded_words 33 | else: 34 | if not isinstance(sequences_as_uniqs, list): 35 | sequences_as_uniqs = [sequences_as_uniqs] 36 | 37 | all_embedded = [] 38 | for word_idxs in sequences_as_uniqs: 39 | all_embedded.append(functional.embedding( 40 | word_idxs.type(long_tensor), embedded_words)) 41 | return all_embedded 42 | -------------------------------------------------------------------------------- /jack/util/torch/highway.py: -------------------------------------------------------------------------------- 1 | """Credits: https://github.com/kefirski/pytorch_Highway""" 2 | 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class Highway(nn.Module): 8 | def __init__(self, size, num_layers, f=F.tanh): 9 | super(Highway, self).__init__() 10 | 11 | self.num_layers = num_layers 12 | 13 | self.nonlinear = nn.ModuleList([nn.Linear(size, size) for _ in range(num_layers)]) 14 | 15 | self.linear = nn.ModuleList([nn.Linear(size, size) for _ in range(num_layers)]) 16 | 17 | self.gate = nn.ModuleList([nn.Linear(size, size) for _ in range(num_layers)]) 18 | 19 | self.f = f 20 | 21 | def forward(self, x): 22 | """ 23 | :param x: tensor with shape of [batch_size, size] 24 | :return: tensor with shape of [batch_size, size] 25 | applies σ(x) ⨀ (f(G(x))) + (1 - σ(x)) ⨀ (Q(x)) transformation | G and Q is affine transformation, 26 | f is non-linear transformation, σ(x) is affine transformation with sigmoid non-linearition 27 | and ⨀ is element-wise multiplication 28 | """ 29 | 30 | for layer in range(self.num_layers): 31 | gate = F.sigmoid(self.gate[layer](x)) 32 | 33 | nonlinear = self.f(self.nonlinear[layer](x)) 34 | linear = self.linear[layer](x) 35 | 36 | x = gate * nonlinear + (1 - gate) * linear 37 | 38 | return x 39 | -------------------------------------------------------------------------------- /jack/util/torch/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class BiLSTM(nn.Module): 6 | def __init__(self, input_size, size, start_state_given=False): 7 | super(BiLSTM, self).__init__() 8 | self._size = size 9 | self._bilstm = nn.LSTM(input_size, size, 1, bidirectional=True, batch_first=True) 10 | self._bilstm.bias_ih_l0.data[size:2 * size].fill_(1.0) 11 | self._bilstm.bias_ih_l0_reverse.data[size:2 * size].fill_(1.0) 12 | self._start_state_given = start_state_given 13 | if not start_state_given: 14 | self._lstm_start_hidden = nn.Parameter(torch.zeros(2, size)) 15 | self._lstm_start_state = nn.Parameter(torch.zeros(2, size)) 16 | 17 | def forward(self, inputs, lengths=None, start_state=None): 18 | if not self._start_state_given: 19 | batch_size = inputs.size(0) 20 | start_hidden = self._lstm_start_hidden.unsqueeze(1).expand(2, batch_size, self._size).contiguous() 21 | start_state = self._lstm_start_state.unsqueeze(1).expand(2, batch_size, self._size).contiguous() 22 | start_state = (start_hidden, start_state) 23 | 24 | if lengths is not None: 25 | new_lengths, indices = torch.sort(lengths, dim=0, descending=True) 26 | inputs = torch.index_select(inputs, 0, indices) 27 | if self._start_state_given: 28 | start_state = (torch.index_select(start_state[0], 1, indices), 29 | torch.index_select(start_state[1], 1, indices)) 30 | new_lengths = [l.data[0] for l in new_lengths] 31 | inputs = nn.utils.rnn.pack_padded_sequence(inputs, new_lengths, 
batch_first=True) 32 | 33 | output, (h_n, c_n) = self._bilstm(inputs, start_state) 34 | 35 | if lengths is not None: 36 | output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0] 37 | _, back_indices = torch.sort(indices, dim=0) 38 | output = torch.index_select(output, 0, back_indices) 39 | h_n = torch.index_select(h_n, 1, back_indices) 40 | c_n = torch.index_select(c_n, 1, back_indices) 41 | 42 | return output, (h_n, c_n) 43 | -------------------------------------------------------------------------------- /jack/util/torch/xqa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | 8 | class XQAMinCrossentropyLossModule(nn.Module): 9 | def forward(self, start_scores, end_scores, answer_span, answer_to_question): 10 | """very common XQA loss function.""" 11 | long_tensor = torch.cuda.LongTensor if torch.cuda.device_count() > 0 else torch.LongTensor 12 | answer_span = answer_span.type(long_tensor) 13 | start, end = answer_span[:, 0], answer_span[:, 1] 14 | 15 | batch_size1 = start.data.shape[0] 16 | batch_size2 = start_scores.data.shape[0] 17 | is_aligned = batch_size1 == batch_size2 18 | 19 | start_scores = start_scores if is_aligned else torch.index_select(start_scores, dim=0, index=answer_to_question) 20 | end_scores = end_scores if is_aligned else torch.index_select(end_scores, dim=0, index=answer_to_question) 21 | 22 | partitioned_loss = [] 23 | for i, j in enumerate(answer_to_question): 24 | j = j.data[0] 25 | while j >= len(partitioned_loss): 26 | partitioned_loss.append([]) 27 | loss = -torch.index_select(F.log_softmax(start_scores[i], dim=0), dim=0, index=start[i]) 28 | loss -= torch.index_select(F.log_softmax(end_scores[i], dim=0), dim=0, index=end[i]) 29 | partitioned_loss[j].append(loss) 30 | 31 | for j, l in enumerate(partitioned_loss): 32 | partitioned_loss[j] = torch.stack(l).min() 33 | 34 | loss = torch.stack(partitioned_loss).mean() 35 | return loss 36 | -------------------------------------------------------------------------------- /notebooks/prettyprint.py: -------------------------------------------------------------------------------- 1 | class QAPrettyPrint: 2 | def __init__(self, support, span): 3 | self.support = support 4 | self.span = span 5 | 6 | def _repr_html_(self): 7 | start, end = self.span 8 | pre_highlight = self.support[:start] 9 | highlight = self.support[start:end] 10 | post_highlight = self.support[end:] 11 | 12 | def _highlight(text): 13 | return '' + text + '' 14 | 15 | text = pre_highlight + _highlight(highlight) + post_highlight 16 | return text.replace('\n', '
') 17 | 18 | def print_nli(premise, hypothesis, label): 19 | print('{}\t--({})-->\t{}'.format(premise, label, hypothesis)) 20 | -------------------------------------------------------------------------------- /projects/knowledge_integration/__init__.py: -------------------------------------------------------------------------------- 1 | import projects.knowledge_integration.readers 2 | -------------------------------------------------------------------------------- /projects/knowledge_integration/conf/nli/multinli/cbilstm_assertion.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | Basic multiple choice configuration. 3 | 4 | parent_config: './conf/jack.yaml' 5 | 6 | assertion_dir: 'data/knowledge_integration/knowledge_store' 7 | assertion_limit: 20 8 | assertion_sources: ['conceptnet'] 9 | 10 | reading_module: 11 | - input: 'text' 12 | module: 'lstm' 13 | with_projection: True 14 | activation: relu 15 | 16 | 17 | seed: 1337 18 | 19 | reader: 'cbilstm_nli_assertion_reader' 20 | save_dir: './cbilstm_nli_assertion_reader' 21 | 22 | loader: snli 23 | 24 | train: 'data/MultiNLI/multinli_1.0/multinli_1.0_train.jsonl' 25 | dev: 'data/MultiNLI/multinli_1.0/multinli_1.0_dev.jsonl' 26 | 27 | embedding_format: 'memory_map_dir' 28 | embedding_file: 'data/GloVe/glove.840B.300d.memory_map_dir' 29 | 30 | vocab_from_embeddings: True 31 | with_char_embeddings: True 32 | 33 | repr_dim: 300 34 | epochs: 20 35 | dropout: 0.2 36 | batch_size: 128 37 | -------------------------------------------------------------------------------- /projects/knowledge_integration/conf/nli/snli/cbilstm_assertion.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | Basic multiple choice configuration. 3 | 4 | parent_config: './conf/jack.yaml' 5 | 6 | assertion_dir: 'data/knowledge_integration/knowledge_store' 7 | assertion_limit: 20 8 | assertion_sources: ['conceptnet'] 9 | 10 | reading_module: 11 | - input: 'text' 12 | module: 'lstm' 13 | with_projection: True 14 | activation: relu 15 | 16 | 17 | seed: 1337 18 | 19 | reader: 'cbilstm_nli_assertion_reader' 20 | save_dir: './cbilstm_nli_assertion_reader' 21 | 22 | loader: snli 23 | 24 | train: 'data/SNLI/snli_1.0/snli_1.0_train.jsonl' 25 | dev: 'data/SNLI/snli_1.0/snli_1.0_dev.jsonl' 26 | test: 'data/SNLI/snli_1.0/snli_1.0_test.jsonl' 27 | 28 | embedding_format: 'memory_map_dir' 29 | embedding_file: 'data/GloVe/glove.840B.300d.memory_map_dir' 30 | 31 | vocab_from_embeddings: True 32 | with_char_embeddings: True 33 | 34 | repr_dim: 300 35 | epochs: 20 36 | dropout: 0.2 37 | batch_size: 128 38 | -------------------------------------------------------------------------------- /projects/knowledge_integration/conf/qa/bilstm_assertion.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | BiLSTM with assertions. 
3 | 4 | parent_config: './conf/qa/squad/abstract_squad.yaml' 5 | 6 | name: 'bilstm_assertion_reader' 7 | reader: 'modular_assertion_qa_reader' 8 | 9 | assertion_dir: 'data/knowledge_integration/knowledge_store' 10 | assertion_limit: 50 11 | assertion_sources: ['conceptnet'] 12 | no_reading: False 13 | 14 | heuristic: 'pair' 15 | 16 | reading_module: 17 | - input: 'text' 18 | module: 'lstm' 19 | name: 'reading' 20 | with_projection: True 21 | activation: 'relu' 22 | 23 | dropout: 0.2 24 | repr_dim: 150 25 | max_span_size: 16 26 | 27 | model: 28 | encoder_layer: 29 | 30 | - input: 'support' 31 | module: 'lstm' 32 | name: 'encoder' 33 | activation: 'tanh' 34 | with_projection: True 35 | dropout: True 36 | 37 | - input: 'question' 38 | module: 'lstm' 39 | name: 'encoder' 40 | with_projection: True 41 | activation: 'tanh' 42 | dropout: True 43 | 44 | answer_layer: 45 | module: 'mlp' 46 | -------------------------------------------------------------------------------- /projects/knowledge_integration/conf/qa/squad/bilstm_assertion.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | BiLSTM with assertions. 3 | 4 | parent_config: ['./projects/knowledge_integration/conf/qa/bilstm_assertion.yaml', './conf/qa/squad/abstract_squad.yaml'] 5 | 6 | # fixed experiment seed 7 | seed: 1337 8 | dropout: 0.2 9 | repr_dim: 150 10 | -------------------------------------------------------------------------------- /projects/knowledge_integration/conf/qa/squad/bilstm_assertion_definition.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | BiLSTM with assertions. 3 | 4 | parent_config: ['./projects/knowledge_integration/conf/qa/squad/bilstm_assertion.yaml', './conf/qa/squad/abstract_squad.yaml'] 5 | 6 | name: 'bilstm_assertion_definition_reader' 7 | reader: 'modular_assertion_definition_qa_reader' 8 | 9 | topk: 16 10 | 11 | # fraction of training batches where we extract definitions (1.0 means all, 0.0 means never) 12 | # can be lowered to speed up training, because using definitions requires running the model twice 13 | training_fraction_with_definition: 0.3 14 | -------------------------------------------------------------------------------- /projects/knowledge_integration/conf/qa/triviaqa/web/bilstm_assertion.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | BiLSTM with assertions. 3 | 4 | parent_config: ['./projects/knowledge_integration/conf/qa/bilstm_assertion.yaml', './conf/qa/triviaqa/wiki/abstract_triviaqa.yaml'] 5 | 6 | # fixed experiment seed 7 | seed: 1337 8 | dropout: 0.2 9 | repr_dim: 150 10 | -------------------------------------------------------------------------------- /projects/knowledge_integration/conf/qa/triviaqa/web/bilstm_assertion_definition.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | BiLSTM with assertions. 
3 | 4 | parent_config: ['./projects/knowledge_integration/conf/qa/triviaqa/wiki/bilstm_assertion.yaml', './conf/qa/triviaqa/wiki/abstract_triviaqa.yaml'] 5 | 6 | name: 'bilstm_assertion_definition_reader' 7 | reader: 'modular_assertion_definition_qa_reader' 8 | 9 | topk: 16 10 | 11 | # fraction of training batches where we extract definitions (1.0 means all, 0.0 means never) 12 | # can be lowered to speed up training, because using definitions requires running the model twice 13 | training_fraction_with_definition: 0.3 14 | -------------------------------------------------------------------------------- /projects/knowledge_integration/conf/qa/triviaqa/wiki/bilstm_assertion.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | BiLSTM with assertions. 3 | 4 | parent_config: ['./projects/knowledge_integration/conf/qa/bilstm_assertion.yaml', './conf/qa/triviaqa/wiki/abstract_triviaqa.yaml'] 5 | 6 | # fixed experiment seed 7 | seed: 1337 8 | dropout: 0.2 9 | repr_dim: 150 10 | -------------------------------------------------------------------------------- /projects/knowledge_integration/conf/qa/triviaqa/wiki/bilstm_assertion_definition.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | BiLSTM with assertions. 3 | 4 | parent_config: ['./projects/knowledge_integration/conf/qa/triviaqa/wiki/bilstm_assertion.yaml', './conf/qa/triviaqa/wiki/abstract_triviaqa.yaml'] 5 | 6 | name: 'bilstm_assertion_definition_reader' 7 | reader: 'modular_assertion_definition_qa_reader' 8 | 9 | topk: 16 10 | 11 | # fraction of training batches where we extract definitions (1.0 means all, 0.0 means never) 12 | # can be lowered to speed up training, because using definitions requires running the model twice 13 | training_fraction_with_definition: 0.3 14 | -------------------------------------------------------------------------------- /projects/knowledge_integration/qa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/projects/knowledge_integration/qa/__init__.py -------------------------------------------------------------------------------- /projects/knowledge_integration/readers.py: -------------------------------------------------------------------------------- 1 | """Reader definitions that use background knowledge.""" 2 | 3 | from jack.core.tensorflow import TFReader 4 | from jack.readers.implementations import nli_reader, create_shared_resources, extractive_qa_reader 5 | 6 | 7 | @extractive_qa_reader 8 | def modular_assertion_qa_reader(resources_or_conf=None): 9 | from projects.knowledge_integration.qa.shared import XQAAssertionInputModule 10 | from jack.readers.extractive_qa.shared import XQAOutputModule 11 | from projects.knowledge_integration.qa.shared import ModularAssertionQAModel 12 | shared_resources = create_shared_resources(resources_or_conf) 13 | 14 | input_module = XQAAssertionInputModule(shared_resources) 15 | model_module = ModularAssertionQAModel(shared_resources) 16 | output_module = XQAOutputModule() 17 | return TFReader(shared_resources, input_module, model_module, output_module) 18 | 19 | 20 | @extractive_qa_reader 21 | def modular_assertion_definition_qa_reader(resources_or_conf=None): 22 | from projects.knowledge_integration.qa.definition_model import XQAAssertionDefinitionInputModule 23 | from projects.knowledge_integration.qa.definition_model import 
ModularAssertionDefinitionQAModel 24 | from jack.readers.extractive_qa.shared import XQAOutputModule 25 | shared_resources = create_shared_resources(resources_or_conf) 26 | 27 | input_module = XQAAssertionDefinitionInputModule(shared_resources) 28 | model_module = ModularAssertionDefinitionQAModel(shared_resources) 29 | output_module = XQAOutputModule() 30 | reader = TFReader(shared_resources, input_module, model_module, output_module) 31 | input_module.set_reader(reader) 32 | return reader  # return the same reader instance the input module references instead of constructing a second TFReader 33 | 34 | 35 | @nli_reader 36 | def cbilstm_nli_assertion_reader(resources_or_conf=None): 37 | from projects.knowledge_integration.nli import NLIAssertionModel 38 | from projects.knowledge_integration.nli import MultipleChoiceAssertionInputModule 39 | from jack.readers.classification.shared import SimpleClassificationOutputModule 40 | shared_resources = create_shared_resources(resources_or_conf) 41 | input_module = MultipleChoiceAssertionInputModule(shared_resources) 42 | model_module = NLIAssertionModel(shared_resources) 43 | output_module = SimpleClassificationOutputModule(shared_resources) 44 | return TFReader(shared_resources, input_module, model_module, output_module) 45 | -------------------------------------------------------------------------------- /projects/knowledge_integration/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/projects/knowledge_integration/scripts/__init__.py -------------------------------------------------------------------------------- /projects/knowledge_integration/shared.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from jack.core import TensorPort, Ports 4 | 5 | 6 | class AssertionMRPorts: 7 | # When feeding embeddings directly 8 | question_length = Ports.Input.question_length 9 | support_length = Ports.Input.support_length 10 | 11 | # but also ids, for char-based embeddings 12 | question = Ports.Input.question 13 | support = Ports.Input.support 14 | 15 | word_char_length = TensorPort(np.int32, [None], "word_char_length", "length of each word in characters", "[U]") 16 | 17 | token_char_offsets = TensorPort(np.int32, [None, None], "token_char_offsets", 18 | "Character offsets of tokens in support.", "[S, support_length]") 19 | 20 | keep_prob = Ports.keep_prob 21 | is_eval = Ports.is_eval 22 | 23 | word_embeddings = TensorPort(np.float32, [None, None], "word_embeddings", 24 | "Embeddings only for words occurring in batch.", "[None, N]") 25 | 26 | assertion_lengths = TensorPort(np.int32, [None], "assertion_lengths", "Length of assertion.", "[R]") 27 | 28 | assertions = TensorPort(np.int32, [None, None], "assertions", 29 | "Represents batch dependent assertion word ids.", 30 | "[R, L]") 31 | assertion2question = TensorPort(np.int32, [None], "assertion2question", "Question idx per assertion", "[R]") 32 | 33 | word2lemma = TensorPort(np.int32, [None], "word2lemma", "Lemma idx per word", "[U]") 34 | 35 | word_chars = TensorPort(np.int32, [None, None], "word_chars", "Represents words as sequence of chars", 36 | "[U, max_num_chars]") 37 | 38 | question_arg_span = TensorPort(np.int32, [None, 2], "question_arg_span", 39 | "span of an argument in the question", "[Q, 2]") 40 | 41 | support_arg_span = TensorPort(np.int32, [None, 2], "support_arg_span", 42 | "span of an argument in the support", "[S, 2]") 43 | 44 | 
assertion2question_arg_span = TensorPort(np.int32, [None], "assertion2question_arg_span", 45 | "assertion to question span mapping", "[A]") 46 | assertion2support_arg_span = TensorPort(np.int32, [None], "assertion2support_arg_span", 47 | "assertion to support span mapping", "[A]") 48 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # Configuration of py.test 2 | [pytest] 3 | addopts=-v --forked --numprocesses=auto 4 | -n 4 5 | 6 | # Do not run tests in the build folder 7 | norecursedirs = docs *.egg-info .git appdir .tox 8 | 9 | # PEP-8 The following are ignored: 10 | # E501 line too long (82 > 79 characters) 11 | # E402 module level import not at top of file - temporary measure to continue adding ros python packaged in sys.path 12 | # E731 do not assign a lambda expression, use a def 13 | 14 | pep8ignore=* E501 \ 15 | * E402 \ 16 | * E731 \ 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jsonschema 2 | numpy 3 | parse 4 | scipy 5 | sklearn 6 | typing 7 | sacred==0.7.2 8 | sqlalchemy 9 | pyyaml 10 | progressbar2 11 | spacy==1.9 12 | diskcache 13 | pytest 14 | pytest-runner 15 | pytest-xdist 16 | pytest-pep8 17 | pytest-xdist 18 | pytest-cov 19 | codecov 20 | diskcache 21 | progressbar 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from setuptools import find_packages 4 | from setuptools import setup 5 | from setuptools.command.develop import develop as _develop 6 | from setuptools.command.install import install as _install 7 | 8 | 9 | def spacy_download_en(): 10 | import spacy 11 | try: 12 | spacy.load('en') 13 | except: 14 | import subprocess 15 | args = ['python3 -m spacy download en'] 16 | subprocess.call(args, shell=True) 17 | 18 | 19 | class Install(_install): 20 | def run(self): 21 | _install.do_egg_install(self) 22 | spacy_download_en() 23 | _install.run(self) 24 | 25 | 26 | class Develop(_develop): 27 | def run(self): 28 | spacy_download_en() 29 | _develop.run(self) 30 | 31 | 32 | with open('requirements.txt', 'r') as f: 33 | install_requires = [l for l in f.readlines() if not l.startswith('http://')] 34 | 35 | extras_require = { 36 | 'tf': ['tensorflow==1.8.0'], 37 | 'tf_gpu': ['tensorflow-gpu==1.8.0'], 38 | 'torch': ['torch'] 39 | } 40 | 41 | with open("README.md", "r+", encoding="utf-8") as f: 42 | long_description = f.read() 43 | 44 | setup(name='uclmr-jack', 45 | version='0.2.1', 46 | description='Jack the Reader is a Python framework for Machine Reading', 47 | long_description=long_description, 48 | long_description_content_type="text/markdown", 49 | author='UCL Machine Reading', 50 | author_email='s.riedel@cs.ucl.ac.uk', 51 | url='https://github.com/uclmr/jack', 52 | test_suite='tests', 53 | license='MIT', 54 | packages=find_packages(), 55 | cmdclass={ 56 | 'install': Install, 57 | 'develop': Develop 58 | }, 59 | install_requires=install_requires, 60 | extras_require=extras_require, 61 | setup_requires=install_requires, 62 | 
tests_require=install_requires, 63 | classifiers=[ 64 | 'Development Status :: 4 - Beta', 65 | 'Intended Audience :: Developers', 66 | 'Intended Audience :: Education', 67 | 'Intended Audience :: Science/Research', 68 | 'License :: OSI Approved :: MIT License', 69 | 'Programming Language :: Python :: 3', 70 | 'Programming Language :: Python :: 3.6', 71 | 'Topic :: Software Development :: Libraries', 72 | 'Topic :: Software Development :: Libraries :: Python Modules', 73 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 74 | 'Operating System :: OS Independent' 75 | ], 76 | keywords='tensorflow machine learning natural language processing question answering') 77 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_collection_modifyitems(items): 5 | for item in items: 6 | if "sentihood" in item.nodeid: 7 | item.add_marker(pytest.mark.sentihood) 8 | elif "SNLI" in item.nodeid: 9 | item.add_marker(pytest.mark.SNLI) 10 | 11 | if "overfit" in item.nodeid: 12 | item.add_marker(pytest.mark.overfit) 13 | elif "smalldata" in item.nodeid: 14 | item.add_marker(pytest.mark.smalldata) 15 | elif "readme" in item.nodeid: 16 | item.add_marker(pytest.mark.readme) 17 | -------------------------------------------------------------------------------- /tests/jack/debug/test_debug.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /tests/jack/eval/test_kbp_eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from jack.eval.link_prediction import compute_ranks 4 | 5 | triple_to_score_map = { 6 | ('a', 'p', 'a'): 1, 7 | ('a', 'p', 'b'): 2, 8 | ('a', 'p', 'c'): 3, 9 | ('a', 'p', 'd'): 4 10 | } 11 | 12 | triples = sorted(triple for triple, _ in triple_to_score_map.items()) 13 | entity_set = {s for (s, _, _) in triples} | {o for (_, _, o) in triples} 14 | 15 | 16 | def scoring_function(triples): 17 | return [triple_to_score_map.get(triple, 0) for triple in triples] 18 | 19 | 20 | def test_kbp_eval(): 21 | ranks, f_ranks = compute_ranks(scoring_function=scoring_function, triples=triples, entity_set=entity_set) 22 | 23 | ranks_l, ranks_r = ranks 24 | f_ranks_l, f_ranks_r = f_ranks 25 | 26 | assert ranks_l == [1, 1, 1, 1] 27 | assert ranks_r == [4, 3, 2, 1] 28 | 29 | assert f_ranks_l == ranks_l 30 | assert f_ranks_r == ranks_r 31 | -------------------------------------------------------------------------------- /tests/jack/preprocess/test_batch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from jack.util import batch 4 | 5 | 6 | def test_get_buckets(): 7 | data = { 8 | 'data0': [i * [i] for i in range(1, 10)], 9 | 'data1': [i * [i] for i in range(3, 12)] 10 | } 11 | 12 | buckets2ids, ids2buckets = batch.get_buckets(data=data, 13 | order=('data0', 'data1'), 14 | structure=(2, 2)) 15 | 16 | assert buckets2ids == { 17 | '(1, 0)': [5, 6], 18 | '(1, 1)': [7, 8], 19 | '(0, 0)': [0, 1, 2], 20 | '(0, 1)': [3, 4] 21 | } 22 | assert ids2buckets == { 23 | 0: '(0, 0)', 24 | 1: '(0, 0)', 25 | 2: '(0, 0)', 26 | 3: '(0, 1)', 27 | 4: '(0, 1)', 28 | 5: '(1, 0)', 29 | 6: '(1, 0)', 30 | 7: '(1, 1)', 31 | 8: '(1, 1)' 32 | } 33 | 34 | 35 | def test_get_batches(): 
36 | data = { 37 | 'data0': [[i] * 2 for i in range(10)], 38 | 'data1': [[i] * 3 for i in range(10)] 39 | } 40 | 41 | batch_generator = batch.get_batches(data, batch_size=3, exact_epoch=True) 42 | batches = list(batch_generator) 43 | 44 | assert batches[0]['data0'].shape == batches[1]['data0'].shape == batches[2]['data0'].shape == (3, 2) 45 | assert batches[0]['data1'].shape == batches[1]['data1'].shape == batches[2]['data1'].shape == (3, 3) 46 | 47 | assert batches[3]['data0'].shape == (1, 2) 48 | assert batches[3]['data1'].shape == (1, 3) 49 | 50 | assert len(batches) == 4 51 | 52 | batch_generator = batch.get_batches(data, batch_size=3, exact_epoch=False) 53 | batches = list(batch_generator) 54 | 55 | assert len(batches) == 3 56 | -------------------------------------------------------------------------------- /tests/jack/preprocess/test_map.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | from jack.util import map 6 | from jack.util import preprocessing 7 | 8 | text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et ' \ 9 | 'dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ' \ 10 | 'ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat ' \ 11 | 'nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit ' \ 12 | 'anim id est laborum.' 13 | 14 | tokenized_text = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', ',', 'sed', 15 | 'do', 'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliqua', '.', 16 | 'Ut', 'enim', 'ad', 'minim', 'veniam', ',', 'quis', 'nostrud', 'exercitation', 'ullamco', 17 | 'laboris', 'nisi', 'ut', 'aliquip', 'ex', 'ea', 'commodo', 'consequat', '.', 'Duis', 'aute', 18 | 'irure', 'dolor', 'in', 'reprehenderit', 'in', 'voluptate', 'velit', 'esse', 'cillum', 'dolore', 19 | 'eu', 'fugiat', 'nulla', 'pariatur', '.', 'Excepteur', 'sint', 'occaecat', 'cupidatat', 'non', 20 | 'proident', ',', 'sunt', 'in', 'culpa', 'qui', 'officia', 'deserunt', 'mollit', 'anim', 'id', 21 | 'est', 'laborum', '.'] 22 | 23 | 24 | def test_tokenize(): 25 | assert preprocessing.tokenize(text) == tokenized_text 26 | question_text = "where is the cat?" 
27 | desired_tokenised_question = ["where","is","the","cat","?"] 28 | assert preprocessing.tokenize(question_text) == desired_tokenised_question 29 | 30 | 31 | def test_get_list_shape(): 32 | data = [[1, 2, 3], [4, 5]] 33 | assert map.get_list_shape(data) == [2, 3] 34 | 35 | data = [[[1, 2, 3]], [[4, 5], [6, 7]]] 36 | assert map.get_list_shape(data) == [2, 2, 3] 37 | 38 | 39 | def test_numpify(): 40 | def _fillna(xs): 41 | data = np.array(xs) 42 | lens = np.array([len(i) for i in data]) 43 | mask = np.arange(lens.max()) < lens[:, None] 44 | out = np.zeros(mask.shape, dtype=data.dtype) 45 | out[mask] = np.concatenate(data) 46 | return out 47 | 48 | data = [[1, 2, 3], [4, 5], [6, 7, 8]] 49 | data_np = map.numpify(data) 50 | 51 | for a, b in zip([np.array(x) for x in data], data_np): 52 | assert (a == b).all() 53 | 54 | data = {0: [[1, 2, 3]], 1: [[4, 5], [6, 7, 8]], 2: [[6, 7, 8]]} 55 | data_np = map.numpify(data) 56 | 57 | for ak, bk in zip(data.keys(), data_np.keys()): 58 | a, b = data[ak], data_np[bk] 59 | assert (_fillna(a) == b).all() 60 | -------------------------------------------------------------------------------- /tests/jack/preprocess/test_vocab_prune.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from pprint import pprint 4 | 5 | from jack.core import QASetting 6 | from jack.util import preprocessing 7 | 8 | 9 | def test_vocab(): 10 | train_data = [ 11 | QASetting(question='A person is training his horse for a competition.', 12 | support=['A person on a horse jumps over a broken down airplane.'], 13 | candidates=['entailment', 'neutral', 'contradiction']) 14 | ] 15 | 16 | print('build vocab based on train data') 17 | train_vocab = preprocessing.fill_vocab(train_data) 18 | train_vocab.freeze() 19 | pprint(train_vocab._sym2freqs) 20 | pprint(train_vocab._sym2id) 21 | 22 | MIN_VOCAB_FREQ, MAX_VOCAB_CNT = 2, 10 23 | train_vocab = train_vocab.prune(MIN_VOCAB_FREQ, MAX_VOCAB_CNT) 24 | 25 | pprint(train_vocab._sym2freqs) 26 | pprint(train_vocab._sym2id) 27 | 28 | print('encode train data') 29 | train_data = preprocessing.nlp_preprocess(train_data[0].question, train_vocab)[0] 30 | print(train_data) 31 | -------------------------------------------------------------------------------- /tests/jack/readers/extractive_qa/test_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from jack.core import QASetting, Answer 4 | from jack.readers.extractive_qa.util import prepare_data 5 | from jack.util.vocab import Vocab 6 | 7 | qa_setting = QASetting(question="What is the answer?", 8 | support=["It is not A.", "It is B."]) 9 | answers = [Answer(text="B", span=(6, 7), doc_idx=1)] 10 | 11 | 12 | def test_prepare_data(): 13 | 14 | result = prepare_data(qa_setting, answers, Vocab(), 15 | with_answers=True) 16 | 17 | question_tokens, question_ids, question_lemmas, question_length, \ 18 | support_tokens, support_ids, support_lemmas, support_length, \ 19 | word_in_question, token_offsets, answer_spans = result 20 | 21 | assert question_tokens == ['What', 'is', 'the', 'answer', '?'] 22 | assert question_ids == [1, 2, 3, 4, 5] 23 | assert question_lemmas is None 24 | assert question_length == 5 25 | 26 | assert support_tokens == [['It', 'is', 'not', 'A', '.', ], ['It', 'is', 'B', '.']] 27 | assert support_ids == [[6, 2, 7, 8, 9], [6, 2, 10, 9]] 28 | assert support_lemmas == [None, None] 29 | assert support_length == [5, 4] 30 | assert word_in_question == [[0.0, 
1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]] 31 | assert token_offsets == [[0, 3, 6, 10, 11], [0, 3, 6, 7]] 32 | assert answer_spans == [[], [(2, 2)]] 33 | -------------------------------------------------------------------------------- /tests/jack/readers/multiple_choice/test_simple_mcqa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from jack.readers.classification.shared import * 4 | 5 | from jack.util.vocab import Vocab 6 | 7 | 8 | def test_single_support_fixed_class_inputs(): 9 | import logging 10 | logging.basicConfig(level=logging.INFO) 11 | data_set = [ 12 | (QASetting("Where is the cat?", ["the cat is on the mat."]), [Answer("mat")]) 13 | ] 14 | shared_resources = SharedResources(Vocab(), {}) 15 | input_module = ClassificationSingleSupportInputModule(shared_resources) 16 | input_module.setup_from_data(data_set) 17 | input_module.setup() 18 | 19 | assert len(input_module.shared_resources.answer_vocab) == 1 20 | assert len(input_module.shared_resources.vocab) == 9 21 | 22 | tensor_data_set = list(input_module.batch_generator(data_set, batch_size=3, is_eval=False)) 23 | 24 | expected_support = ["the", "cat", "is", "on", "the", "mat", "."] 25 | expected_support_ids = [[shared_resources.vocab.get_id(sym) for sym in expected_support]] 26 | first_instance = tensor_data_set[0] 27 | actual_support_ids = first_instance[Ports.Input.support] 28 | assert np.array_equal(actual_support_ids, expected_support_ids) 29 | assert first_instance[Ports.Input.support_length][0] == len(expected_support) 30 | 31 | actual_answer_ids = first_instance[Ports.Target.target_index] 32 | expected_answer = [input_module.shared_resources.answer_vocab.get_id("mat")] 33 | assert np.array_equal(actual_answer_ids, expected_answer) 34 | 35 | actual_question_ids = first_instance[Ports.Input.question] 36 | expected_question = ["where", "is", "the", "cat", "?"] 37 | expected_question_ids = [[shared_resources.vocab.get_id(sym) for sym in expected_question]] 38 | assert np.array_equal(actual_question_ids, expected_question_ids) 39 | assert first_instance[Ports.Input.question_length][0] == len(expected_question) 40 | -------------------------------------------------------------------------------- /tests/jack/readers/test_fastqa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | import jack.readers as readers 7 | from jack.core import SharedResources 8 | from jack.io.embeddings.embeddings import Embeddings 9 | from jack.io.load import load_jack 10 | from jack.readers.extractive_qa.util import tokenize 11 | from jack.util.vocab import Vocab 12 | 13 | 14 | def test_fastqa(): 15 | tf.reset_default_graph() 16 | 17 | data = load_jack('tests/test_data/squad/snippet_jtr.json') 18 | questions = [] 19 | # fast qa must be initialized with existing embeddings, so we create some 20 | vocab = dict() 21 | for question, _ in data: 22 | questions.append(question) 23 | for t in tokenize(question.question): 24 | if t not in vocab: 25 | vocab[t] = len(vocab) 26 | embeddings = Embeddings(vocab, np.random.random([len(vocab), 10])) 27 | 28 | # we need a vocabulary (with embeddings for our fastqa_reader, but this is not always necessary) 29 | vocab = Vocab(vocab=vocab) 30 | 31 | # ... 
and a config 32 | config = {"batch_size": 1, "repr_dim": 10, "with_char_embeddings": True} 33 | 34 | # create/setup reader 35 | shared_resources = SharedResources(vocab, config, embeddings) 36 | fastqa_reader = readers.fastqa_reader(shared_resources) 37 | fastqa_reader.setup_from_data(data) 38 | 39 | answers = fastqa_reader(questions) 40 | 41 | assert answers, "FastQA reader should produce answers" 42 | -------------------------------------------------------------------------------- /tests/jack/readers/test_fastqa_loop.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | from jack.core import SharedResources 6 | from jack.core.tensorflow import TFReader 7 | from jack.core.tensorport import Ports 8 | from jack.io.embeddings.embeddings import load_embeddings 9 | from jack.io.load import load_jack 10 | from jack.readers.extractive_qa.shared import XQAInputModule, XQAOutputModule 11 | from jack.readers.extractive_qa.tensorflow.fastqa import FastQAModule 12 | from jack.util.vocab import Vocab 13 | 14 | 15 | def test_fastqa(): 16 | tf.reset_default_graph() 17 | 18 | data = load_jack('tests/test_data/squad/snippet_jtr.json') 19 | 20 | # fast qa must be initialized with existing embeddings, so we create some 21 | embeddings = load_embeddings('./tests/test_data/glove.840B.300d_top256.txt', 'glove') 22 | 23 | # we need a vocabulary (with embeddings for our fastqa_reader, but this is not always necessary) 24 | vocab = Vocab(vocab=embeddings.vocabulary) 25 | 26 | # ... and a config 27 | config = { 28 | "batch_size": 1, 29 | "repr_dim": 10, 30 | "with_char_embeddings": True 31 | } 32 | 33 | # create/setup reader 34 | shared_resources = SharedResources(vocab, config, embeddings) 35 | 36 | input_module = XQAInputModule(shared_resources) 37 | model_module = FastQAModule(shared_resources) 38 | output_module = XQAOutputModule() 39 | 40 | reader = TFReader(shared_resources, input_module, model_module, output_module) 41 | reader.setup_from_data(data, is_training=True) 42 | 43 | loss = reader.model_module.tensors[Ports.loss] 44 | optimizer = tf.train.AdagradOptimizer(learning_rate=0.01) 45 | min_op = optimizer.minimize(loss) 46 | 47 | session = model_module.tf_session 48 | session.run(tf.global_variables_initializer()) 49 | 50 | for epoch in range(0, 10): 51 | for batch in reader.input_module.batch_generator(data, 1, False): 52 | feed_dict = reader.model_module.convert_to_feed_dict(batch) 53 | loss_value, _ = session.run((loss, min_op), feed_dict=feed_dict) 54 | print(loss_value) 55 | -------------------------------------------------------------------------------- /tests/jack/readers/test_kbp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | import jack.readers as readers 6 | from jack.io.load import loaders 7 | 8 | 9 | def test_kbp(): 10 | data = loaders['jack']('tests/test_data/WN18/wn18-snippet.jack.json') 11 | questions = [question for question, _ in data] 12 | 13 | for model_name in ['transe', 'distmult', 'complex']: 14 | 15 | with tf.variable_scope(model_name): 16 | config = { 17 | 'batch_size': 1, 18 | 'repr_dim': 10 19 | } 20 | 21 | reader = readers.readers['{}_reader'.format(model_name)](config) 22 | reader.setup_from_data(data) 23 | 24 | answers = reader(questions) 25 | 26 | assert len(answers) == 5000 27 | 28 | assert answers, 'KBP reader should produce answers' 29 | 
-------------------------------------------------------------------------------- /tests/jack/readers/test_readers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Smoke test: train all readers for one iteration & run inference.""" 4 | 5 | from functools import partial 6 | 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | from jack import readers 11 | from jack.core.data_structures import QASetting, Answer 12 | from jack.core.shared_resources import SharedResources 13 | from jack.core.tensorflow import TFReader 14 | from jack.io.embeddings import Embeddings 15 | from jack.readers.extractive_qa.util import tokenize 16 | from jack.util.vocab import Vocab 17 | 18 | 19 | def teardown_function(_): 20 | tf.reset_default_graph() 21 | 22 | 23 | def build_vocab(questions): 24 | """Since some readers require an initialized vocabulary, initialize it here.""" 25 | 26 | vocab = dict() 27 | for question in questions: 28 | for t in tokenize(question.question): 29 | if t not in vocab: 30 | vocab[t] = len(vocab) 31 | embeddings = Embeddings(vocab, np.random.random([len(vocab), 10])) 32 | 33 | vocab = Vocab(vocab=embeddings.vocabulary) 34 | return vocab, embeddings 35 | 36 | 37 | def smoke_test(reader_name): 38 | """Instantiate the reader, train for one epoch, and run inference.""" 39 | 40 | data_set = [ 41 | (QASetting( 42 | question="Which is it?", 43 | support=["While b seems plausible, answer a is correct."], 44 | id="1", 45 | candidates=["a", "b", "c"]), 46 | [Answer("a", (6, 6))]) 47 | ] 48 | questions = [q for q, _ in data_set] 49 | v, e = build_vocab(questions) 50 | shared_resources = SharedResources(v, {"repr_dim": 10, "dropout": 0.5}, e) 51 | tf.reset_default_graph() 52 | reader = readers.readers[reader_name](shared_resources) 53 | if isinstance(reader, TFReader): 54 | reader.train(tf.train.AdamOptimizer(), data_set, batch_size=1, max_epochs=1) 55 | else: 56 | import torch 57 | reader.setup_from_data(data_set, is_training=True) 58 | params = list(reader.model_module.prediction_module.parameters()) 59 | params.extend(reader.model_module.loss_module.parameters()) 60 | optimizer = torch.optim.Adam(params, lr=0.01) 61 | reader.train(optimizer, data_set, batch_size=1, max_epochs=1) 62 | 63 | answers = reader(questions) 64 | 65 | assert answers, "{} should produce answers".format(reader_name) 66 | 67 | 68 | BLACKLIST = ['fastqa_reader_torch', 'modular_qa_reader', 'modular_nli_reader'] 69 | READERS = [r for r in readers.readers.keys() 70 | if r not in BLACKLIST] 71 | 72 | # Dynamically generate one test for each reader 73 | current_module = __import__(__name__) 74 | 75 | for reader_name in READERS: 76 | setattr(current_module, "test_{}".format(reader_name), partial(smoke_test, reader_name)) 77 | -------------------------------------------------------------------------------- /tests/jack/readers/test_serialization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tempfile 4 | 5 | import tensorflow as tf 6 | 7 | from jack.io.embeddings import load_embeddings 8 | from jack.io.load import loaders 9 | from jack.readers.implementations import * 10 | from jack.util.vocab import Vocab 11 | 12 | 13 | def test_serialization(): 14 | all_readers = [ 15 | fastqa_reader, 16 | modular_qa_reader, 17 | # fastqa_reader_torch, 18 | dam_snli_reader, 19 | cbilstm_nli_reader, 20 | modular_nli_reader, 21 | distmult_reader, 22 | complex_reader, 23 | transe_reader, 24 | 
] 25 | 26 | for reader in all_readers: 27 | vocab, config = {}, {} 28 | 29 | data = None 30 | if reader in {distmult_reader, complex_reader, transe_reader}: 31 | data = loaders['jack']('tests/test_data/WN18/wn18-snippet.jack.json') 32 | config['repr_dim'] = 50 33 | elif reader in {cbilstm_nli_reader, dam_snli_reader}: 34 | data = loaders['snli']('tests/test_data/SNLI/1000_samples_snli_1.0_train.jsonl') 35 | 36 | embeddings = load_embeddings("data/GloVe/glove.the.50d.txt", 'glove') 37 | vocab = Vocab(vocab=embeddings.vocabulary) 38 | config['repr_dim'] = 50 39 | elif reader in {fastqa_reader}: 40 | data = loaders['squad']('data/SQuAD/snippet.json') 41 | 42 | embeddings = load_embeddings("data/GloVe/glove.the.50d.txt", 'glove') 43 | vocab = Vocab(vocab=embeddings.vocabulary) 44 | config['repr_dim'] = 50 45 | 46 | if data is not None: 47 | tf.reset_default_graph() 48 | 49 | shared_resources = SharedResources(vocab, config, embeddings) 50 | reader_instance = reader(shared_resources) 51 | reader_instance.setup_from_data(data) 52 | 53 | temp_dir_path = tempfile.mkdtemp() 54 | reader_instance.store(temp_dir_path) 55 | 56 | reader_instance.load(temp_dir_path) 57 | 58 | assert reader_instance is not None 59 | -------------------------------------------------------------------------------- /tests/jack/test_core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | from jack.core import SharedResources 6 | from jack.io.embeddings import load_embeddings 7 | from jack.util.vocab import Vocab 8 | 9 | 10 | def test_shared_resources_store(): 11 | embeddings_file = "data/GloVe/glove.the.50d.txt" 12 | embeddings = load_embeddings(embeddings_file, 'glove') 13 | config = { 14 | "embedding_file": embeddings_file, 15 | "embedding_format": "glove" 16 | } 17 | some_vocab = Vocab(vocab=embeddings.vocabulary) 18 | some_vocab('foo') 19 | shared_resources = SharedResources(some_vocab, config, embeddings) 20 | 21 | import tempfile 22 | with tempfile.TemporaryDirectory() as tmp_dir: 23 | path = tmp_dir + "_resources" 24 | shared_resources.store(path) 25 | 26 | new_shared_resources = SharedResources() 27 | new_shared_resources.load(path) 28 | 29 | type_a, type_b = type(new_shared_resources.vocab), type(shared_resources.vocab) 30 | assert type_a == type_b 31 | 32 | for k in new_shared_resources.vocab.__dict__: 33 | assert new_shared_resources.vocab.__dict__[k] == shared_resources.vocab.__dict__[k] 34 | assert new_shared_resources.config == shared_resources.config 35 | assert new_shared_resources.embeddings.lookup.shape == embeddings.lookup.shape 36 | assert np.array_equal(new_shared_resources.embeddings.get(b"the"), embeddings.get(b"the")) 37 | -------------------------------------------------------------------------------- /tests/jack/test_embeddings.py: -------------------------------------------------------------------------------- 1 | from jack.io.embeddings import load_embeddings 2 | import numpy as np 3 | 4 | 5 | def test_memory_map_dir(): 6 | import tempfile 7 | from jack.io.embeddings.memory_map import save_as_memory_map_dir, load_memory_map_dir 8 | embeddings_file = "data/GloVe/glove.the.50d.txt" 9 | embeddings = load_embeddings(embeddings_file, 'glove') 10 | with tempfile.TemporaryDirectory() as tmp_dir: 11 | mem_map_dir = tmp_dir + "/glove.the.50d.memmap" 12 | save_as_memory_map_dir(mem_map_dir, embeddings) 13 | loaded_embeddings = load_memory_map_dir(mem_map_dir) 14 | assert loaded_embeddings.shape == 
embeddings.shape 15 | assert len(loaded_embeddings.vocabulary) == 1 16 | assert loaded_embeddings.vocabulary["the"] == 0 17 | assert "foo" not in loaded_embeddings.vocabulary 18 | assert np.isclose(loaded_embeddings.get("the"), embeddings.get("the"), 1.e-5).all() 19 | -------------------------------------------------------------------------------- /tests/test_conf/dam_test.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | A configuration inheriting from the default jack.yaml 3 | 4 | parent_config: './conf/jack.yaml' 5 | 6 | name: "dam_nli" 7 | 8 | seed: 1337 9 | 10 | # where to store the reader 11 | save_dir: './tests/test_results/dam_reader_test' 12 | 13 | # jack training file 14 | train: 'tests/test_data/SNLI/train.json' 15 | 16 | # jack dev file 17 | dev: 'tests/test_data/SNLI/dev.json' 18 | 19 | # jack test file 20 | test: 'tests/test_data/SNLI/test.json' 21 | 22 | # Reading model to use 23 | reader: 'dam_snli_reader' 24 | 25 | repr_dim: 50 26 | 27 | repr_dim_task_embedding: 50 28 | 29 | max_epochs: 10 30 | 31 | dropout: 0 32 | 33 | batch_size: 64 34 | -------------------------------------------------------------------------------- /tests/test_conf/fastqa_test.yaml: -------------------------------------------------------------------------------- 1 | description: > 2 | A configuration inheriting from the default jack.yaml 3 | 4 | parent_config: './conf/jack.yaml' 5 | 6 | name: "fastqa_reader" 7 | 8 | seed: 1337 9 | 10 | # where to store the reader 11 | save_dir: './tests/test_results/fastqa_reader_test' 12 | 13 | # How large the support should be. Can be used for cutting or filtering QA examples 14 | max_support_length: -1 15 | 16 | # Use also character based embeddings in readers which support it 17 | with_char_embeddings: True 18 | 19 | # jack training file 20 | train: 'data/SQuAD/snippet.jtr.json' 21 | 22 | # jack dev file 23 | dev: 'data/SQuAD/snippet.jtr.json' 24 | 25 | # jack test file 26 | test: null 27 | 28 | # Reading model to use 29 | reader: 'fastqa_reader' 30 | 31 | # [word2vec] or [glove] format of embeddings to be loaded 32 | embedding_format: 'glove' 33 | 34 | # format of embeddings to be loaded 35 | embedding_file: 'tests/test_data/glove.840B.300d_top256.txt' 36 | 37 | # Use fixed vocab of pretrained embeddings 38 | vocab_from_embeddings: True 39 | 40 | repr_dim: 300 41 | 42 | max_epochs: 20 43 | 44 | dropout: 0 45 | 46 | batch_size: 64 47 | -------------------------------------------------------------------------------- /tests/test_conf/snli_small_adagrad_test.yaml: -------------------------------------------------------------------------------- 1 | parent_config: 'conf/jack.yaml' 2 | 3 | reader: 'dam_snli_reader' 4 | 5 | seed: 1337 6 | train: 'data/SNLI/snippet.jtr_v1.json' 7 | dev: 'data/SNLI/snippet.jtr_v1.json' 8 | test: 'data/SNLI/snippet.jtr_v1.json' 9 | 10 | epochs: 20 11 | optimizer: 'adagrad' 12 | learning_rate: 0.001 13 | batch_size: 32 14 | learning_rate_decay: 0.99 15 | l2: 0.0 16 | dev_batch_size: 32 17 | -------------------------------------------------------------------------------- /tests/test_data/MultiNLI/2000_samples_train_jtr.json: -------------------------------------------------------------------------------- 1 | { 2 | "instances": [], 3 | "globals": { 4 | "candidates": [ 5 | { 6 | "text": "entailment" 7 | }, 8 | { 9 | "text": "neutral" 10 | }, 11 | { 12 | "text": "contradiction" 13 | } 14 | ] 15 | }, 16 | "meta": "MultiSNLI" 17 | } 
-------------------------------------------------------------------------------- /tests/test_data/MultiNLI/overfit.json: -------------------------------------------------------------------------------- 1 | { 2 | "instances": [], 3 | "globals": { 4 | "candidates": [ 5 | { 6 | "text": "entailment" 7 | }, 8 | { 9 | "text": "neutral" 10 | }, 11 | { 12 | "text": "contradiction" 13 | } 14 | ] 15 | }, 16 | "meta": "MultiSNLI" 17 | } -------------------------------------------------------------------------------- /tests/test_data/wiki.json: -------------------------------------------------------------------------------- 1 | ["Who is this?", "Born and raised in a Hindu merchant caste family in coastal Gujarat, western India, and trained in law at the Inner Temple, London, Gandhi first employed nonviolent civil disobedience as an expatriate lawyer in South Africa, in the resident Indian community's struggle for civil rights. After his return to India in 1915, he set about organising peasants, farmers, and urban labourers to protest against excessive land-tax and discrimination. Assuming leadership of the Indian National Congress in 1921, Gandhi led nationwide campaigns for easing poverty, expanding women's rights, building religious and ethnic amity, ending untouchability, but above all for achieving Swaraj or self-rule.\n", "Gandhi"] 2 | ["Who is this?", "Near the beginning of his career, Einstein thought that Newtonian mechanics was no longer enough to reconcile the laws of classical mechanics with the laws of the electromagnetic field. This led him to develop his special theory of relativity. He realized, however, that the principle of relativity could also be extended to gravitational fields, and with his subsequent theory of gravitation in 1916, he published a paper on general relativity. He continued to deal with problems of statistical mechanics and quantum theory, which led to his explanations of particle theory and the motion of molecules. He also investigated the thermal properties of light which laid the foundation of the photon theory of light. In 1917, Einstein applied the general theory of relativity to model the large-scale structure of the universe.\n", "Albert Einstein"] 3 | ["Who is this?", "He was a pioneer of the application of operator theory to quantum mechanics, in the development of functional analysis, and a key figure in the development of game theory and the concepts of cellular automata, the universal constructor and the digital computer. He published over 150 papers in his life: about 60 in pure mathematics, 20 in physics, and 60 in applied mathematics, the remainder being on special mathematical subjects or non-mathematical ones. 
His last work, an unfinished manuscript written while in the hospital, was later published in book form as The Computer and the Brain.", "John von Neumann"] 4 | -------------------------------------------------------------------------------- /tests/test_readme.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import subprocess 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from jack import readers 9 | from jack.core.data_structures import QASetting 10 | 11 | 12 | def test_readme_fastqa(): 13 | args = ['python3', './bin/jack-train.py', 'with', 'config=tests/test_conf/fastqa_test.yaml'] 14 | p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 15 | out, err = p.communicate() 16 | 17 | tf.reset_default_graph() 18 | 19 | fastqa_reader = readers.fastqa_reader() 20 | fastqa_reader.load_and_setup("tests/test_results/fastqa_reader_test") 21 | 22 | support = """"Architecturally, the school has a Catholic character. 23 | Atop the Main Building's gold dome is a golden statue of the Virgin Mary. 24 | Immediately in front of the Main Building and facing it, is a copper statue of 25 | Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the 26 | Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, 27 | a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, 28 | France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. 29 | At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), 30 | is a simple, modern stone statue of Mary.""" 31 | 32 | answers = fastqa_reader([QASetting( 33 | question="To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?", 34 | support=[support] 35 | )]) 36 | 37 | assert answers[0][0].text is not None 38 | 39 | 40 | def test_readme_dam(): 41 | args = ['python3', './bin/jack-train.py', 'with', 'config=tests/test_conf/dam_test.yaml'] 42 | p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 43 | out, err = p.communicate() 44 | 45 | tf.reset_default_graph() 46 | 47 | dam_reader = readers.dam_snli_reader() 48 | dam_reader.load_and_setup("tests/test_results/dam_reader_test") 49 | 50 | atomic_candidates = ['entailment', 'neutral', 'contradiction'] 51 | answers = dam_reader([QASetting( 52 | question="The boy plays with the ball.", 53 | support=["The boy plays with the ball."], 54 | candidates=atomic_candidates 55 | )]) 56 | 57 | assert answers[0] is not None 58 | assert isinstance(answers[0][0].score, np.float32) 59 | assert answers[0][0].text in atomic_candidates 60 | -------------------------------------------------------------------------------- /tests/test_results/dam_test/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_module" 2 | all_model_checkpoint_paths: "model_module" 3 | -------------------------------------------------------------------------------- /tests/test_results/dam_test/model_module.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/dam_test/model_module.data-00000-of-00001 -------------------------------------------------------------------------------- /tests/test_results/dam_test/model_module.index: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/dam_test/model_module.index -------------------------------------------------------------------------------- /tests/test_results/dam_test/model_module.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/dam_test/model_module.meta -------------------------------------------------------------------------------- /tests/test_results/dam_test/shared_resources/answer_vocab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/dam_test/shared_resources/answer_vocab -------------------------------------------------------------------------------- /tests/test_results/dam_test/shared_resources/config.yaml: -------------------------------------------------------------------------------- 1 | {batch_size: 64, clip_value: 0.0, config: tests/test_conf/dam_test.yaml, debug: false, 2 | debug_examples: 10, description: 'A configuration inheriting from the default jack.yaml 3 | 4 | ', dev: tests/test_data/SNLI/dev.json, dev_batch_size: null, dropout: 0, embedding_file: null, 5 | embedding_format: null, epochs: 5, file_cache: false, l2: 0.0, learning_rate: 0.001, 6 | learning_rate_decay: 1.0, load_dir: null, loader: jack, log_interval: 100, lowercase: true, 7 | max_epochs: 10, max_num_support: null, min_learning_rate: 0.0001, name: dam_nli, 8 | num_dev_examples: null, num_train_examples: null, optimizer: adam, output_dir: ./out/, 9 | parent_config: ./conf/jack.yaml, reader: dam_snli_reader, repr_dim: 50, repr_dim_task_embedding: 50, 10 | save_dir: ./tests/test_results/dam_reader_test, seed: 1337, tensorboard_folder: null, 11 | test: tests/test_data/SNLI/test.json, train: tests/test_data/SNLI/train.json, validation_interval: null, 12 | vocab_from_embeddings: false, with_char_embeddings: true, write_metrics_to: null} 13 | -------------------------------------------------------------------------------- /tests/test_results/dam_test/shared_resources/remainder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/dam_test/shared_resources/remainder -------------------------------------------------------------------------------- /tests/test_results/dam_test/shared_resources/vocab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/dam_test/shared_resources/vocab -------------------------------------------------------------------------------- /tests/test_results/fastqa_test/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_module" 2 | all_model_checkpoint_paths: "model_module" 3 | -------------------------------------------------------------------------------- /tests/test_results/fastqa_test/model_module.data-00000-of-00001: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/fastqa_test/model_module.data-00000-of-00001 -------------------------------------------------------------------------------- /tests/test_results/fastqa_test/model_module.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/fastqa_test/model_module.index -------------------------------------------------------------------------------- /tests/test_results/fastqa_test/model_module.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/fastqa_test/model_module.meta -------------------------------------------------------------------------------- /tests/test_results/fastqa_test/shared_resources/config.yaml: -------------------------------------------------------------------------------- 1 | {batch_size: 64, clip_value: 0.0, config: tests/test_conf/fastqa_test.yaml, debug: false, 2 | debug_examples: 10, description: 'A configuration inheriting from the default jack.yaml 3 | 4 | ', dev: data/SQuAD/snippet.jtr.json, dev_batch_size: null, dropout: 0, embedding_file: tests/test_data/glove.840B.300d_top256.txt, 5 | embedding_format: glove, epochs: 5, file_cache: false, l2: 0.0, learning_rate: 0.001, 6 | learning_rate_decay: 1.0, load_dir: null, loader: jack, log_interval: 100, lowercase: true, 7 | max_epochs: 20, max_num_support: null, max_support_length: -1, min_learning_rate: 0.0001, 8 | name: fastqa_reader, num_dev_examples: null, num_train_examples: null, optimizer: adam, 9 | output_dir: ./out/, parent_config: ./conf/jack.yaml, reader: fastqa_reader, repr_dim: 300, 10 | repr_dim_task_embedding: 0, save_dir: ./tests/test_results/fastqa_reader_test, seed: 1337, 11 | tensorboard_folder: null, test: null, train: data/SQuAD/snippet.jtr.json, validation_interval: null, 12 | vocab_from_embeddings: true, with_char_embeddings: true, write_metrics_to: null} 13 | -------------------------------------------------------------------------------- /tests/test_results/fastqa_test/shared_resources/embeddings/config.yaml: -------------------------------------------------------------------------------- 1 | {emb_format: glove, embedding_file: tests/test_data/glove.840B.300d_top256.txt} 2 | -------------------------------------------------------------------------------- /tests/test_results/fastqa_test/shared_resources/remainder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/fastqa_test/shared_resources/remainder -------------------------------------------------------------------------------- /tests/test_results/fastqa_test/shared_resources/vocab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/jack/9e5ffbd4fb2b0bd6b816fe6e14b9045ac776bb8e/tests/test_results/fastqa_test/shared_resources/vocab -------------------------------------------------------------------------------- /tests/test_results/overfit_test/SNLI/dam/expected_results.txt: -------------------------------------------------------------------------------- 1 | 2018-06-10 14:40:49.637297 ClassificationEvalHook_Accuracy 0.35 2 | 2018-06-10 14:40:49.842983 
ClassificationEvalHook_Accuracy 0.35 3 | 2018-06-10 14:40:50.056031 ClassificationEvalHook_Accuracy 0.35 4 | 2018-06-10 14:40:50.247261 ClassificationEvalHook_Accuracy 0.35 5 | 2018-06-10 14:40:50.431764 ClassificationEvalHook_Accuracy 0.35 6 | 2018-06-10 14:40:50.621095 ClassificationEvalHook_Accuracy 0.35 7 | 2018-06-10 14:40:50.810825 ClassificationEvalHook_Accuracy 0.35 8 | 2018-06-10 14:40:50.997291 ClassificationEvalHook_Accuracy 0.34 9 | 2018-06-10 14:40:51.269345 ClassificationEvalHook_Accuracy 0.39 10 | 2018-06-10 14:40:51.475754 ClassificationEvalHook_Accuracy 0.36 11 | 2018-06-10 14:40:51.682479 ClassificationEvalHook_Accuracy 0.34 12 | 2018-06-10 14:40:51.868328 ClassificationEvalHook_Accuracy 0.37 13 | 2018-06-10 14:40:52.053837 ClassificationEvalHook_Accuracy 0.39 14 | 2018-06-10 14:40:52.244374 ClassificationEvalHook_Accuracy 0.4 15 | 2018-06-10 14:40:52.450705 ClassificationEvalHook_Accuracy 0.41 16 | -------------------------------------------------------------------------------- /tests/test_results/overfit_test/SNLI/esim/expected_results.txt: -------------------------------------------------------------------------------- 1 | 2018-06-29 10:12:25.512333 ClassificationEvalHook_Accuracy 0.35 2 | 2018-06-29 10:12:25.841608 ClassificationEvalHook_Accuracy 0.34 3 | 2018-06-29 10:12:26.446310 ClassificationEvalHook_Accuracy 0.47 4 | 2018-06-29 10:12:27.306957 ClassificationEvalHook_Accuracy 0.38 5 | 2018-06-29 10:12:27.892721 ClassificationEvalHook_Accuracy 0.45 6 | 2018-06-29 10:12:28.400163 ClassificationEvalHook_Accuracy 0.52 7 | 2018-06-29 10:12:28.761276 ClassificationEvalHook_Accuracy 0.49 8 | 2018-06-29 10:12:29.185469 ClassificationEvalHook_Accuracy 0.45 9 | 2018-06-29 10:12:29.592369 ClassificationEvalHook_Accuracy 0.47 10 | 2018-06-29 10:12:29.962783 ClassificationEvalHook_Accuracy 0.5 11 | 2018-06-29 10:12:30.432859 ClassificationEvalHook_Accuracy 0.52 12 | 2018-06-29 10:12:30.866125 ClassificationEvalHook_Accuracy 0.61 13 | 2018-06-29 10:12:31.676597 ClassificationEvalHook_Accuracy 0.55 14 | 2018-06-29 10:12:32.151324 ClassificationEvalHook_Accuracy 0.62 15 | 2018-06-29 10:12:32.631887 ClassificationEvalHook_Accuracy 0.58 -------------------------------------------------------------------------------- /tests/test_results/overfit_test/squad/fastqa/expected_results.txt: -------------------------------------------------------------------------------- 1 | 2018-04-18 15:12:57.105613 XQAEvalHook_exact 0.01852 2 | 2018-04-18 15:12:57.105753 XQAEvalHook_f1 0.11447 3 | 2018-04-18 15:12:58.238815 XQAEvalHook_exact 0.0463 4 | 2018-04-18 15:12:58.238969 XQAEvalHook_f1 0.11974 5 | 2018-04-18 15:12:59.478085 XQAEvalHook_exact 0.06481 6 | 2018-04-18 15:12:59.478308 XQAEvalHook_f1 0.14797 7 | 2018-04-18 15:13:00.865977 XQAEvalHook_exact 0.0463 8 | 2018-04-18 15:13:00.866217 XQAEvalHook_f1 0.13511 9 | 2018-04-18 15:13:02.067794 XQAEvalHook_exact 0.12963 10 | 2018-04-18 15:13:02.068049 XQAEvalHook_f1 0.19646 11 | 2018-04-18 15:13:03.339741 XQAEvalHook_exact 0.12037 12 | 2018-04-18 15:13:03.339996 XQAEvalHook_f1 0.2186 13 | 2018-04-18 15:13:04.727538 XQAEvalHook_exact 0.17593 14 | 2018-04-18 15:13:04.727785 XQAEvalHook_f1 0.298 15 | 2018-04-18 15:13:05.944528 XQAEvalHook_exact 0.13889 16 | 2018-04-18 15:13:05.944777 XQAEvalHook_f1 0.27456 17 | 2018-04-18 15:13:07.160099 XQAEvalHook_exact 0.24074 18 | 2018-04-18 15:13:07.160367 XQAEvalHook_f1 0.32948 19 | 2018-04-18 15:13:08.417764 XQAEvalHook_exact 0.25926 20 | 2018-04-18 15:13:08.418021 XQAEvalHook_f1 0.39285 21 | 2018-04-18 
15:13:09.710954 XQAEvalHook_exact 0.2963 22 | 2018-04-18 15:13:09.711209 XQAEvalHook_f1 0.42949 23 | 2018-04-18 15:13:10.970876 XQAEvalHook_exact 0.33333 24 | 2018-04-18 15:13:10.971064 XQAEvalHook_f1 0.42985 25 | 2018-04-18 15:13:12.242883 XQAEvalHook_exact 0.33333 26 | 2018-04-18 15:13:12.243865 XQAEvalHook_f1 0.42266 27 | 2018-04-18 15:13:13.419180 XQAEvalHook_exact 0.40741 28 | 2018-04-18 15:13:13.419408 XQAEvalHook_f1 0.53319 29 | 2018-04-18 15:13:14.851027 XQAEvalHook_exact 0.4537 30 | 2018-04-18 15:13:14.851192 XQAEvalHook_f1 0.56119 31 | -------------------------------------------------------------------------------- /tests/test_results/rename_recursively.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | path = sys.argv[1] 5 | execute = False 6 | try: 7 | execute = sys.argv[2] == '1' 8 | except: 9 | pass 10 | 11 | root_dir = os.getcwd() 12 | 13 | files = [] 14 | dirs = [] 15 | for root, directories, filenames in os.walk(path): 16 | for filename in filenames: 17 | dirs.append(os.path.join(root_dir, root)) 18 | files.append(os.path.join(root_dir, root, filename)) 19 | 20 | 21 | for f in files: 22 | if 'expected_results.txt' in f: 23 | if execute: 24 | os.remove(f) 25 | 26 | for f, d in zip(files, dirs): 27 | if not 'expected_results.txt' in f: 28 | if execute: 29 | os.rename(f,os.path.join(d,'expected_results.txt')) 30 | else: 31 | print('{0} --> {1}'.format(f, 32 | os.path.join(d,'expected_results.txt'))) 33 | -------------------------------------------------------------------------------- /wercker.yml: -------------------------------------------------------------------------------- 1 | box: python:3.6 2 | 3 | no-response-timeout: 60 4 | command-timeout: 60 5 | build: 6 | steps: 7 | - pip-install 8 | 9 | - script: 10 | name: install 11 | code: | 12 | sudo apt-get update 13 | sudo apt-get -y install libtk8.6 14 | 15 | # pip install --upgrade -r requirements.txt 16 | pip install -e .[tf] --upgrade 17 | 18 | - script: 19 | name: echo python information 20 | code: | 21 | echo "python version $(python --version) running" 22 | echo "pip version $(pip --version) running" 23 | ls 24 | 25 | - script: 26 | name: Run all tests 27 | code: | 28 | pytest tests -v --cov=jack --max-slave-restart=2 29 | 30 | - script: 31 | name: Code coverage upload 32 | code: | 33 | codecov 34 | --------------------------------------------------------------------------------