├── .gitignore ├── .gitmodules ├── LICENSE ├── Readme.md ├── data ├── cnn-dailymail │ ├── Readme.md │ └── setup.sh ├── deutsch2019 │ ├── Readme.md │ └── setup.sh ├── gigaword │ └── Readme.md ├── kedzie2018 │ └── cnn-dailymail │ │ └── setup.sh ├── onmt │ ├── Readme.md │ ├── convert_to_jsonl.py │ └── setup.sh └── wikicite │ ├── Readme.md │ └── setup.sh ├── experiments ├── deutsch2019 │ ├── Readme.md │ ├── abstractive-step │ │ ├── Readme.md │ │ ├── coverage │ │ │ ├── .gitignore │ │ │ ├── evaluate.sh │ │ │ ├── model.jsonnet │ │ │ ├── predict.sh │ │ │ ├── run.sh │ │ │ └── train.sh │ │ └── pointer-generator │ │ │ ├── .gitignore │ │ │ ├── evaluate.sh │ │ │ ├── model.jsonnet │ │ │ ├── predict.sh │ │ │ ├── run.sh │ │ │ └── train.sh │ ├── baselines │ │ ├── lead │ │ │ ├── .gitignore │ │ │ └── run.sh │ │ ├── open-ai │ │ │ ├── .gitignore │ │ │ ├── Readme.md │ │ │ ├── run.sh │ │ │ └── setup.sh │ │ └── oracle │ │ │ ├── .gitignore │ │ │ └── run.sh │ ├── demo.ipynb │ └── extractive-step │ │ ├── Readme.md │ │ ├── bm25 │ │ ├── .gitignore │ │ ├── calculate-df.sh │ │ ├── evaluate.sh │ │ └── predict.sh │ │ ├── extractive-model │ │ ├── .gitignore │ │ ├── Readme.md │ │ ├── evaluate.sh │ │ ├── model.jsonnet │ │ ├── predict.sh │ │ ├── preprocess.sh │ │ └── train.sh │ │ ├── lead │ │ ├── .gitignore │ │ ├── Readme.md │ │ ├── preprocess.sh │ │ └── run.sh │ │ ├── oracle │ │ ├── .gitignore │ │ ├── Readme.md │ │ ├── preprocess.sh │ │ └── run.sh │ │ └── sumfocus │ │ ├── .gitignore │ │ ├── Readme.md │ │ ├── analyze_results.py │ │ ├── run-max-sents.sh │ │ ├── run-max-words.sh │ │ └── run-parameter-sweep.sh ├── kedzie2018 │ ├── Readme.md │ └── cnn-dailymail │ │ ├── extractive-model │ │ ├── .gitignore │ │ ├── evaluate.sh │ │ ├── model.jsonnet │ │ ├── predict.sh │ │ └── train.sh │ │ ├── lead │ │ ├── .gitignore │ │ └── run.sh │ │ └── oracle │ │ ├── .gitignore │ │ └── run.sh ├── onmt │ ├── Readme.md │ ├── convert_to_jsonl.py │ ├── demo.ipynb │ ├── pointer-generator │ │ ├── .gitignore │ │ ├── Readme.md │ │ ├── evaluate.sh │ │ ├── model.jsonnet │ │ ├── predict.sh │ │ ├── replace-config.sh │ │ ├── run.sh │ │ └── train.sh │ └── seq2seq │ │ ├── .gitignore │ │ ├── Readme.md │ │ ├── evaluate.sh │ │ ├── model.jsonnet │ │ ├── predict.sh │ │ ├── replace-config.sh │ │ ├── run.sh │ │ └── train.sh └── wikicite │ └── analysis │ ├── document-distribution │ ├── Readme.md │ └── run.py │ └── topic-distribution │ ├── .gitignore │ ├── Readme.md │ └── run.py ├── external ├── ROUGE-1.5.5 │ ├── .gitignore │ └── Readme.md └── meteor │ ├── .gitignore │ ├── Readme.md │ └── setup.sh ├── requirements.txt ├── runtime.txt └── summarize ├── __init__.py ├── common ├── __init__.py ├── tempdir.py ├── testing.py └── util.py ├── data ├── __init__.py ├── dataset_readers │ ├── __init__.py │ ├── cloze │ │ ├── __init__.py │ │ ├── abstractive.py │ │ ├── extractive.py │ │ └── pointer_generator.py │ ├── sds │ │ ├── __init__.py │ │ ├── abstractive.py │ │ ├── extractive.py │ │ └── pointer_generator.py │ └── util.py ├── dataset_setup │ ├── __init__.py │ ├── cnn_dailymail.py │ ├── deutsch2019.py │ ├── gigaword.py │ ├── kedzie2018.py │ ├── tokenize.py │ ├── util.py │ └── wikicite.py ├── dataset_stats │ ├── __init__.py │ └── sds.py ├── io │ ├── __init__.py │ ├── jsonl_reader.py │ ├── jsonl_writer.py │ └── util.py └── paragraph_tokenizers │ ├── __init__.py │ ├── paragraph_tokenizer.py │ └── paragraph_word_tokenizer.py ├── metrics ├── __init__.py ├── meteor.py ├── python_rouge.py └── rouge.py ├── models ├── __init__.py ├── cloze │ ├── __init__.py │ ├── bm25 │ │ ├── __init__.py │ 
│ ├── bm25.py │ │ └── calculate_df.py │ ├── extractive_baseline.py │ ├── lead.py │ ├── open_ai_language_model.py │ ├── oracle.py │ ├── pointer_generator.py │ ├── seq2seq.py │ └── sumfocus.py └── sds │ ├── __init__.py │ ├── extractive_baseline.py │ ├── lead.py │ ├── oracle.py │ ├── pointer_generator.py │ └── seq2seq.py ├── modules ├── __init__.py ├── bridge.py ├── coverage_matrix_attention │ ├── __init__.py │ ├── coverage_matrix_attention.py │ ├── matrix_attention_wrapper.py │ └── mlp.py ├── generate_probability_functions │ ├── __init__.py │ ├── generate_probability_function.py │ ├── onmt.py │ └── see2017.py ├── matrix_attention │ ├── __init__.py │ └── mlp.py ├── rnns │ ├── __init__.py │ ├── gru.py │ ├── lstm.py │ ├── rnn.py │ └── util.py └── sentence_extractors │ ├── __init__.py │ ├── rnn.py │ └── sentence_extractor.py ├── nn ├── __init__.py ├── beam_search │ ├── __init__.py │ ├── beam_search.py │ ├── coverage_penalizers │ │ ├── __init__.py │ │ ├── coverage_penalizer.py │ │ └── onmt.py │ ├── length_penalizers │ │ ├── __init__.py │ │ ├── average.py │ │ ├── length_penalizer.py │ │ └── wu.py │ └── relaxed.py └── util.py ├── predictors ├── __init__.py ├── cloze │ ├── __init__.py │ ├── abstractive.py │ └── extractive.py └── sds │ ├── __init__.py │ ├── abstractive.py │ └── extractive.py ├── tests ├── __init__.py ├── common │ ├── __init__.py │ └── tempdir_test.py ├── data │ ├── __init__.py │ ├── dataset_readers │ │ ├── __init__.py │ │ ├── cloze │ │ │ ├── __init__.py │ │ │ ├── abstractive_test.py │ │ │ ├── extractive_test.py │ │ │ └── pointer_generator_test.py │ │ └── sds │ │ │ ├── __init__.py │ │ │ ├── abstractive_test.py │ │ │ ├── extractive_test.py │ │ │ └── pointer_generator_test.py │ ├── dataset_setup │ │ ├── __init__.py │ │ └── tokenize_test.py │ ├── io │ │ ├── __init__.py │ │ ├── jsonl_reader_test.py │ │ ├── jsonl_writer_test.py │ │ └── util_test.py │ └── paragraph_tokenizers │ │ ├── __init__.py │ │ └── paragraph_word_tokenizer_test.py ├── fixtures │ ├── configs │ │ ├── cloze │ │ │ ├── extractive-baseline.jsonnet │ │ │ ├── pointer-generator.jsonnet │ │ │ └── seq2seq.jsonnet │ │ └── sds │ │ │ ├── extractive-baseline.jsonnet │ │ │ ├── pointer-generator.jsonnet │ │ │ └── seq2seq.jsonnet │ └── data │ │ ├── chen2018 │ │ ├── Readme.md │ │ ├── gold.jsonl │ │ └── model.jsonl │ │ ├── cloze.jsonl │ │ ├── hong2014 │ │ ├── centroid.jsonl │ │ ├── classy04.jsonl │ │ ├── classy11.jsonl │ │ ├── dpp.jsonl │ │ ├── freq-sum.jsonl │ │ ├── greedy-kl.jsonl │ │ ├── icsi-summ.jsonl │ │ ├── lexrank.jsonl │ │ ├── occams-v.jsonl │ │ ├── reg-sum.jsonl │ │ ├── setup.py │ │ ├── submodular.jsonl │ │ └── ts-sum.jsonl │ │ └── sds.jsonl ├── metrics │ ├── __init__.py │ ├── meteor_test.py │ ├── python_rouge_test.py │ └── rouge_test.py ├── models │ ├── __init__.py │ ├── cloze │ │ ├── __init__.py │ │ ├── bm25 │ │ │ ├── __init__.py │ │ │ ├── bm25_test.py │ │ │ └── calculate_df_test.py │ │ ├── extractive_baseline_test.py │ │ ├── lead_test.py │ │ ├── open_ai_language_model_test.py │ │ ├── pointer_generator_test.py │ │ ├── seq2seq_test.py │ │ └── sumfocus_test.py │ └── sds │ │ ├── __init__.py │ │ ├── extractive_baseline_test.py │ │ ├── lead_test.py │ │ ├── pointer_generator_test.py │ │ └── seq2seq_test.py ├── modules │ ├── __init__.py │ ├── bridge_test.py │ ├── coverage_matrix_attention │ │ ├── __init__.py │ │ └── mlp_test.py │ ├── rnns │ │ ├── __init__.py │ │ ├── gru_test.py │ │ ├── lstm_test.py │ │ ├── rnn_test.py │ │ └── util.py │ └── sentence_extractors │ │ ├── __init__.py │ │ └── rnn_test.py ├── nn │ ├── __init__.py │ ├── 
beam_search │ │ ├── __init__.py │ │ ├── beam_search_test.py │ │ ├── coverage_penalizers │ │ │ ├── __init__.py │ │ │ └── onmt_test.py │ │ ├── length_penalizers │ │ │ ├── __init__.py │ │ │ ├── average_test.py │ │ │ └── wu_test.py │ │ └── relaxed_test.py │ └── util_test.py └── training │ ├── __init__.py │ └── metrics │ ├── __init__.py │ ├── binary_f1_measure_test.py │ └── python_rouge_metric_test.py ├── training ├── __init__.py └── metrics │ ├── __init__.py │ ├── binary_f1_measure.py │ ├── cross_entropy_metric.py │ └── python_rouge_metric.py └── utils ├── __init__.py ├── copy_jsonl_fields.py ├── extract_cloze_from_labels.py ├── extract_summary_from_labels.py └── replace_config.py /.gitignore: -------------------------------------------------------------------------------- 1 | .pytest_cache 2 | __pycache__ 3 | .DS_Store 4 | /data 5 | .ipynb_checkpoints 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/gpt-2"] 2 | path = external/gpt-2 3 | url = https://github.com/openai/gpt-2 4 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # Summarize 2 | Summarize is a PyTorch-based package for automatic summarization built on AllenNLP. 3 | It contains implementations of end-to-end extractive models and abstractive models, including the standard Seq2Seq and Pointer-Generator models. 4 | 5 | This repository also contains the code for the "Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization" paper. 6 | Please see `experiments/deutsch2019` for more details about how to run the models. 7 | -------------------------------------------------------------------------------- /data/cnn-dailymail/setup.sh: -------------------------------------------------------------------------------- 1 | python -m summarize.data.dataset_setup.cnn_dailymail data/cnn-dailymail 2 | for split in train valid test; do 3 | for dataset in cnn dailymail cnn-dailymail; do 4 | python -m summarize.data.dataset_setup.tokenize \ 5 | data/cnn-dailymail/${dataset}/${split}.jsonl.gz \ 6 | data/cnn-dailymail/${dataset}/${split}.tokenized.jsonl.gz \ 7 | document summary \ 8 | --backend nltk 9 | done 10 | done 11 | -------------------------------------------------------------------------------- /data/deutsch2019/Readme.md: -------------------------------------------------------------------------------- 1 | This directory contains the script to preprocess the WikiCite dataset for "Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization." 2 | 3 | First, run the setup script under the `data/wikicite` directory. 4 | Then, run the `setup.sh` script to compute the ROUGE-based heuristic extractive labels for the dataset. 5 | The processing speed is somewhat slow, so it may take several hours to process the data. 6 | Alternatively, the preprocessed data can be downloaded here: 7 | train, 8 | valid, 9 | test. 
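The experiment scripts under `experiments/deutsch2019` read these splits directly from S3; a sketch of downloading them locally, assuming the same URL scheme those scripts use:
```
# URL pattern copied from the experiments/deutsch2019 run scripts.
for split in train valid test; do
    wget https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \
        -P data/deutsch2019/
done
```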
10 | 
--------------------------------------------------------------------------------
/data/deutsch2019/setup.sh:
--------------------------------------------------------------------------------
1 | for split in train valid test; do
2 |     python -m summarize.data.dataset_setup.deutsch2019 \
3 |         data/wikicite/${split}.tokenized.v1.1.jsonl.gz \
4 |         data/deutsch2019/${split}.v1.1.jsonl.gz \
5 |         --num-cores 8
6 | done
7 | 
--------------------------------------------------------------------------------
/data/gigaword/Readme.md:
--------------------------------------------------------------------------------
1 | # Gigaword
2 | ## Setup
3 | To set up the Gigaword corpus, run the following command:
4 | ```
5 | python -m summarize.data.dataset_setup.gigaword \
6 |     data/gigaword
7 | ```
8 | The script downloads the data from https://github.com/harvardnlp/sent-summary, replaces the `UNK` token with the AllenNLP special token for out-of-vocabulary words, and saves the data in the jsonl format.
9 | 
10 | There are 3,803,957 training, 189,651 validation, and 1,951 testing examples.
11 | 
12 | This is the dataset used to train the [OpenNMT-py Gigaword summarization models](http://opennmt.net/Models-py/#summarization).
13 | I assume it is also the data used by [Rush et al. (2015)](https://www.aclweb.org/anthology/D15-1044), but the paper does not link to a dataset or code, nor does it specify the sizes of the dataset splits.
14 | The follow-up work, [Ranzato et al. (2016)](https://arxiv.org/pdf/1511.06732.pdf), also uses Gigaword, but the dataset split sizes are very different (179,414 training, 22,568 validation, and 22,259 testing examples).
15 | The [corresponding repository](https://github.com/facebookarchive/MIXER) only has instructions and code for the machine translation experiments.
16 | 
--------------------------------------------------------------------------------
/data/kedzie2018/cnn-dailymail/setup.sh:
--------------------------------------------------------------------------------
1 | for split in train valid test; do
2 |     python -m summarize.data.dataset_setup.kedzie2018 \
3 |         https://s3.amazonaws.com/danieldeutsch/summarize/data/cnn-dailymail/cnn-dailymail/${split}.tokenized.v1.0.jsonl.gz \
4 |         data/kedzie2018/cnn-dailymail/${split}.jsonl.gz \
5 |         100 \
6 |         --num-cores 16
7 | done
8 | 
--------------------------------------------------------------------------------
/data/onmt/Readme.md:
--------------------------------------------------------------------------------
1 | # OpenNMT CNN/DailyMail
2 | This dataset is the CNN/DailyMail dataset as prepared by the OpenNMT library.
3 | The preprocessed data can be downloaded here:
4 | 
5 | - https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/train.v1.0.jsonl.gz
6 | - https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/valid.v1.0.jsonl.gz
7 | - https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/test.v1.0.jsonl.gz
8 | 
--------------------------------------------------------------------------------
/data/onmt/convert_to_jsonl.py:
--------------------------------------------------------------------------------
1 | # Edit the system path so the summarize library can be imported
2 | import sys
3 | sys.path.append('.')
4 | 
5 | import argparse
6 | import json
7 | import re
8 | 
9 | from summarize.data.io import JsonlWriter
10 | 
11 | 
12 | def main(args):
13 |     with JsonlWriter(args.output_jsonl) as out:
14 |         with open(args.src_tsv, 'r') as f_src:
15 |             with open(args.tgt_tsv, 'r') as f_tgt:
16 |                 for src, tgt in zip(f_src, f_tgt):
17 |                     if len(src.strip()) == 0:
18 |                         continue
19 | 
20 |                     document = [src.strip()]
21 |                     summary = []
22 |                     for match in re.findall(r'<t> (.+?) </t>', tgt):  # <t> tags delimit sentences in the .tagged files
23 |                         summary.append(match)
24 |                     out.write({'document': document, 'summary': summary})
25 | 
26 | 
27 | if __name__ == '__main__':
28 |     argp = argparse.ArgumentParser()
29 |     argp.add_argument('src_tsv')
30 |     argp.add_argument('tgt_tsv')
31 |     argp.add_argument('output_jsonl')
32 |     args = argp.parse_args()
33 |     main(args)
34 | 
--------------------------------------------------------------------------------
/data/onmt/setup.sh:
--------------------------------------------------------------------------------
1 | wget https://s3.amazonaws.com/opennmt-models/Summary/cnndm.tar.gz -O data/onmt/cnndm.tar.gz
2 | mkdir data/onmt/onmt
3 | tar -xzvf data/onmt/cnndm.tar.gz -C data/onmt/onmt
4 | 
5 | python data/onmt/convert_to_jsonl.py \
6 |     data/onmt/onmt/train.txt.src \
7 |     data/onmt/onmt/train.txt.tgt.tagged \
8 |     data/onmt/train.jsonl.gz
9 | 
10 | python data/onmt/convert_to_jsonl.py \
11 |     data/onmt/onmt/val.txt.src \
12 |     data/onmt/onmt/val.txt.tgt.tagged \
13 |     data/onmt/valid.jsonl.gz
14 | 
15 | python data/onmt/convert_to_jsonl.py \
16 |     data/onmt/onmt/test.txt.src \
17 |     data/onmt/onmt/test.txt.tgt.tagged \
18 |     data/onmt/test.jsonl.gz
19 | 
--------------------------------------------------------------------------------
/data/wikicite/Readme.md:
--------------------------------------------------------------------------------
1 | # WikiCite
2 | The WikiCite dataset is a collection of summary cloze instances collected from Wikipedia.
3 | For more details, please see https://github.com/danieldeutsch/wikicite.
4 | 
5 | ## Setup
6 | The `setup.sh` script downloads the original dataset and tokenizes the text fields.
7 | The original dataset and tokenized versions can be downloaded here:
8 | 
| Corpus | Train | Valid | Test |
|--------|-------|-------|------|
| Original | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/train.v1.1.jsonl.gz | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/valid.v1.1.jsonl.gz | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/test.v1.1.jsonl.gz |
| Tokenized | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/train.tokenized.v1.1.jsonl.gz | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/valid.tokenized.v1.1.jsonl.gz | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/test.tokenized.v1.1.jsonl.gz |
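To verify a download, the files can be read with the repository's JSONL helpers; a minimal sketch, assuming `JsonlReader` mirrors the context-manager interface of the `JsonlWriter` used in `data/onmt/convert_to_jsonl.py` and infers gzip compression from the `.gz` extension:
```
from summarize.data.io import JsonlReader

# Print the field names of the first few instances. The keys themselves are
# whatever the dataset defines, so treat the printed output as the reference.
with JsonlReader('data/wikicite/train.v1.1.jsonl.gz') as f:
    for i, instance in enumerate(f):
        print(sorted(instance.keys()))
        if i >= 2:
            break
```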
33 | 34 | 35 | ## Citation 36 | If you use this dataset, please cite the following paper: 37 | ``` 38 | @inproceedings{DeutschRo19, 39 | author = {Daniel Deutsch and Dan Roth}, 40 | title = {{Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization}}, 41 | booktitle = {Proc. of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 42 | year = {2019}, 43 | url = "https://cogcomp.seas.upenn.edu/papers/DeutschRo19.pdf", 44 | funding = {ARL}, 45 | } 46 | ``` 47 | -------------------------------------------------------------------------------- /data/wikicite/setup.sh: -------------------------------------------------------------------------------- 1 | for split in train valid test; do 2 | python -m summarize.data.dataset_setup.wikicite \ 3 | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/${split}.v1.1.jsonl.gz \ 4 | data/wikicite/${split}.tokenized.jsonl.gz 5 | done 6 | -------------------------------------------------------------------------------- /experiments/deutsch2019/Readme.md: -------------------------------------------------------------------------------- 1 | # Deutsch 2019 2 | This directory contains the experiments related to "Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization" by Deutsch and Roth (2019). 3 | 4 | ## Demo 5 | A demo of the final models (with the topics and context) can be viewed by clicking this badge: 6 | 7 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/danieldeutsch/summarize/21054f43de1b363aba1e1283d62736e5117877bf?filepath=experiments%2Fdeutsch2019%2Fdemo.ipynb) 8 | 9 | If you run the Jupyter Notebook on the MyBinder servers, the abstractive model takes around 30 to 60 seconds to produce the output. 10 | 11 | ## Instructions 12 | First, it may be necessary to checkout [this commit](https://github.com/danieldeutsch/summarize/releases/tag/emnlp2019) since there could have been breaking changes to the code since the original models were trained. 13 | 14 | Then, setup the WikiCite dataset by running the setup script in `data/deutsch2019`. 15 | 16 | Each of the directories contains the scripts to run the different models from the paper. 17 | The `baselines` directory contains code for some baseline models, such as the lead, oracle, and language model baselines. 18 | The `extractive-step` directory contains the code for the extractive models and extractive preprocessing steps. 19 | The `abstractive-step` directory contains the code for training the abstractive models, both the base Pointer-Generator model and the fine-tuned model with the coverage loss. 20 | The directories contain documentation with extra information, results, and saved models. 21 | 22 | If you use any of the code or data from this experiment, please cite the following paper: 23 | ``` 24 | @inproceedings{DeutschRo19, 25 | author = {Daniel Deutsch and Dan Roth}, 26 | title = {{Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization}}, 27 | booktitle = {Proc. 
of the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
28 |     year = {2019},
29 |     url = "https://cogcomp.seas.upenn.edu/papers/DeutschRo19.pdf",
30 |     funding = {ARL},
31 | }
32 | ```
33 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/evaluate.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
2 | 
3 | if [ "$#" -ne 2 ]; then
4 |     echo "Usage: sh evaluate.sh <preprocessing-dataset> <use-context>"
5 |     exit
6 | fi
7 | 
8 | preprocessing_dataset=$1
9 | use_context=$2
10 | if [ "${preprocessing_dataset}" == "lead" ]; then
11 |     preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
12 | elif [ "${preprocessing_dataset}" == "oracle" ]; then
13 |     preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
14 | elif [ "${preprocessing_dataset}" == "extractive-model" ]; then
15 |     preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
16 | else
17 |     echo "Invalid preprocessing dataset: ${preprocessing_dataset}"
18 |     exit
19 | fi
20 | 
21 | if [ "${use_context}" == "true" ]; then
22 |     context_dir="context"
23 | else
24 |     context_dir="no-context"
25 | fi
26 | 
27 | output_dir=${expt_dir}/output/${preprocessing_dataset}/${context_dir}
28 | results_dir=${expt_dir}/results/${preprocessing_dataset}/${context_dir}
29 | mkdir -p ${results_dir}
30 | 
31 | for split in valid test; do
32 |     python -m summarize.metrics.rouge \
33 |         ${preprocess_dir}/${split}.jsonl.gz \
34 |         ${output_dir}/${split}.jsonl \
35 |         --gold-summary-field-name cloze \
36 |         --model-summary-field-name cloze \
37 |         --add-gold-wrapping-list \
38 |         --add-model-wrapping-list \
39 |         --compute-rouge-l \
40 |         --silent \
41 |         --output-file ${results_dir}/${split}.metrics.json
42 | done
43 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/predict.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
2 | 
3 | if [ "$#" -ne 2 ]; then
4 |     echo "Usage: sh predict.sh <preprocessing-dataset> <use-context>"
5 |     exit
6 | fi
7 | 
8 | preprocessing_dataset=$1
9 | use_context=$2
10 | if [ "${preprocessing_dataset}" == "lead" ]; then
11 |     preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
12 | elif [ "${preprocessing_dataset}" == "oracle" ]; then
13 |     preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
14 | elif [ "${preprocessing_dataset}" == "extractive-model" ]; then
15 |     preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
16 | else
17 |     echo "Invalid preprocessing dataset: ${preprocessing_dataset}"
18 |     exit
19 | fi
20 | 
21 | if [ "${use_context}" == "true" ]; then
22 |     context_dir="context"
23 | else
24 |     context_dir="no-context"
25 | fi
26 | 
27 | model_dir=${expt_dir}/model/${preprocessing_dataset}/${context_dir}
28 | model_file=${model_dir}/model.tar.gz
29 | output_dir=${expt_dir}/output/${preprocessing_dataset}/${context_dir}
30 | results_dir=${expt_dir}/results/${preprocessing_dataset}/${context_dir}
31 | mkdir -p ${output_dir} ${results_dir}
32 | 
33 | for split in valid test; do
34 |     allennlp predict \
35 |         --include-package summarize \
36 |         --output-file ${output_dir}/${split}.jsonl \
37 |         --predictor cloze-abstractive-predictor \
38 |         --silent \
39 |         --use-dataset-reader \
40 |         --cuda-device 0 \
41 |         --batch-size 16 \
42 |         ${model_file} \
43 |         ${preprocess_dir}/${split}.jsonl.gz
44 | done
45 | 
46 | allennlp evaluate \
47 |     --include-package summarize \
48 |     --output-file ${results_dir}/test.evaluate.metrics.json \
49 |     --cuda-device 0 \
50 |     --overrides '{"validation_iterator.instances_per_epoch": null, "model.beam_search.beam_size": 1}' \
51 |     ${model_file} \
52 |     ${preprocess_dir}/test.jsonl.gz
53 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/run.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
2 | 
3 | if [ "$#" -ne 2 ]; then
4 |     echo "Usage: sh run.sh <preprocessing-dataset> <use-context>"
5 |     exit
6 | fi
7 | 
8 | sh ${expt_dir}/train.sh $1 $2
9 | sh ${expt_dir}/predict.sh $1 $2
10 | sh ${expt_dir}/evaluate.sh $1 $2
11 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/train.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
2 | 
3 | if [ "$#" -ne 2 ]; then
4 |     echo "Usage: sh train.sh <preprocessing-dataset> <use-context>"
5 |     exit
6 | fi
7 | 
8 | preprocessing_dataset=$1
9 | use_context=$2
10 | if [ "${preprocessing_dataset}" == "lead" ]; then
11 |     preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
12 | elif [ "${preprocessing_dataset}" == "oracle" ]; then
13 |     preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
14 | elif [ "${preprocessing_dataset}" == "extractive-model" ]; then
15 |     preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
16 | else
17 |     echo "Invalid preprocessing dataset: ${preprocessing_dataset}"
18 |     exit
19 | fi
20 | 
21 | if [ "${use_context}" == "true" ]; then
22 |     context_dir="context"
23 | else
24 |     context_dir="no-context"
25 | fi
26 | 
27 | model_dir=${expt_dir}/model/${preprocessing_dataset}/${context_dir}
28 | pretrained_dir=${expt_dir}/../pointer-generator/model/${preprocessing_dataset}/${context_dir}
29 | model_config=${expt_dir}/model.jsonnet
30 | 
31 | if [ -d ${model_dir} ]; then
32 |     read -p "remove directory ${model_dir}? [y/n] " yn
33 |     case $yn in
34 |         [Yy]* ) rm -rf ${model_dir};;
35 |         [Nn]* ) ;;
36 |         * ) echo "Please answer yes or no.";;
37 |     esac
38 | fi
39 | 
40 | export DATA_DIR=${preprocess_dir}
41 | export USE_CONTEXT=${use_context}
42 | export PRETRAINED_DIR=${pretrained_dir}
43 | allennlp train \
44 |     --include-package summarize \
45 |     --serialization-dir ${model_dir} \
46 |     ${model_config}
47 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/evaluate.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
2 | 
3 | if [ "$#" -ne 2 ]; then
4 |     echo "Usage: sh evaluate.sh <preprocessing-dataset> <use-context>"
5 |     exit
6 | fi
7 | 
8 | preprocessing_dataset=$1
9 | use_context=$2
10 | if [ "${preprocessing_dataset}" == "lead" ]; then
11 |     preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
12 | elif [ "${preprocessing_dataset}" == "oracle" ]; then
13 |     preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
14 | elif [ "${preprocessing_dataset}" == "extractive-model" ]; then
15 |     preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
16 | else
17 |     echo "Invalid preprocessing dataset: ${preprocessing_dataset}"
18 |     exit
19 | fi
20 | 
21 | if [ "${use_context}" == "true" ]; then
22 |     context_dir="context"
23 | else
24 |     context_dir="no-context"
25 | fi
26 | 
27 | output_dir=${expt_dir}/output/${preprocessing_dataset}/${context_dir}
28 | results_dir=${expt_dir}/results/${preprocessing_dataset}/${context_dir}
29 | mkdir -p ${results_dir}
30 | 
31 | for split in valid test; do
32 |     python -m summarize.metrics.rouge \
33 |         ${preprocess_dir}/${split}.jsonl.gz \
34 |         ${output_dir}/${split}.jsonl \
35 |         --gold-summary-field-name cloze \
36 |         --model-summary-field-name cloze \
37 |         --add-gold-wrapping-list \
38 |         --add-model-wrapping-list \
39 |         --compute-rouge-l \
40 |         --silent \
41 |         --output-file ${results_dir}/${split}.metrics.json
42 | done
43 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/predict.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
2 | 
3 | if [ "$#" -ne 2 ]; then
4 |     echo "Usage: sh predict.sh <preprocessing-dataset> <use-context>"
5 |     exit
6 | fi
7 | 
8 | preprocessing_dataset=$1
9 | use_context=$2
10 | if [ "${preprocessing_dataset}" == "lead" ]; then
11 |     preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
12 | elif [ "${preprocessing_dataset}" == "oracle" ]; then
13 |     preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
14 | elif [ "${preprocessing_dataset}" == "extractive-model" ]; then
15 |     preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
16 | else
17 |     echo "Invalid preprocessing dataset: ${preprocessing_dataset}"
18 |     exit
19 | fi
20 | 
21 | if [ "${use_context}" == "true" ]; then
22 |     context_dir="context"
23 | else
24 |     context_dir="no-context"
25 | fi
26 | 
27 | model_dir=${expt_dir}/model/${preprocessing_dataset}/${context_dir}
28 | model_file=${model_dir}/model.tar.gz
29 | output_dir=${expt_dir}/output/${preprocessing_dataset}/${context_dir}
30 | mkdir -p ${output_dir}
31 | 
32 | for split in valid test; do
33 |     allennlp predict \
34 |         --include-package summarize \
35 |         --output-file ${output_dir}/${split}.jsonl \
36 |         --predictor cloze-abstractive-predictor \
37 |         --silent \
38 |         --use-dataset-reader \
39 |         --cuda-device 0 \
40 |         --batch-size 16 \
41 |         ${model_file} \
42 |         ${preprocess_dir}/${split}.jsonl.gz
43 | done
44 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/run.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
2 | 
3 | if [ "$#" -ne 2 ]; then
4 |     echo "Usage: sh run.sh <preprocessing-dataset> <use-context>"
5 |     exit
6 | fi
7 | 
8 | sh ${expt_dir}/train.sh $1 $2
9 | sh ${expt_dir}/predict.sh $1 $2
10 | sh ${expt_dir}/evaluate.sh $1 $2
11 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/train.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
2 | 
3 | if [ "$#" -ne 2 ]; then
4 |     echo "Usage: sh train.sh <preprocessing-dataset> <use-context>"
5 |     exit
6 | fi
7 | 
8 | preprocessing_dataset=$1
9 | use_context=$2
10 | if [ "${preprocessing_dataset}" == "lead" ]; then
11 |     preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
12 | elif [ "${preprocessing_dataset}" == "oracle" ]; then
13 |     preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
14 | elif [ "${preprocessing_dataset}" == "extractive-model" ]; then
15 |     preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
16 | else
17 |     echo "Invalid preprocessing dataset: ${preprocessing_dataset}"
18 |     exit
19 | fi
20 | 
21 | if [ "${use_context}" == "true" ]; then
22 |     context_dir="context"
23 | else
24 |     context_dir="no-context"
25 | fi
26 | 
27 | model_dir=${expt_dir}/model/${preprocessing_dataset}/${context_dir}
28 | model_config=${expt_dir}/model.jsonnet
29 | 
30 | if [ -d ${model_dir} ]; then
31 |     read -p "remove directory ${model_dir}?
[y/n] " yn 32 | case $yn in 33 | [Yy]* ) rm -rf ${model_dir};; 34 | [Nn]* ) ;; 35 | * ) echo "Please answer yes or no.";; 36 | esac 37 | fi 38 | 39 | export DATA_DIR=${preprocess_dir} 40 | export USE_CONTEXT=${use_context} 41 | allennlp train \ 42 | --include-package summarize \ 43 | --serialization-dir ${model_dir} \ 44 | ${model_config} 45 | -------------------------------------------------------------------------------- /experiments/deutsch2019/baselines/lead/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | results 3 | -------------------------------------------------------------------------------- /experiments/deutsch2019/baselines/lead/run.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | results_dir="${expt_dir}/results" 4 | mkdir -p ${output_dir} ${results_dir} 5 | 6 | for split in valid test; do 7 | python -m summarize.models.cloze.lead \ 8 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 9 | ${output_dir}/${split}.jsonl \ 10 | --max-sentences 1 11 | 12 | python -m summarize.metrics.rouge \ 13 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 14 | ${output_dir}/${split}.jsonl \ 15 | --gold-summary-field-name cloze \ 16 | --model-summary-field-name cloze \ 17 | --add-gold-wrapping-list \ 18 | --add-model-wrapping-list \ 19 | --compute-rouge-l \ 20 | --silent \ 21 | --output-file ${results_dir}/${split}.metrics.json 22 | done 23 | -------------------------------------------------------------------------------- /experiments/deutsch2019/baselines/open-ai/.gitignore: -------------------------------------------------------------------------------- 1 | models 2 | output 3 | results 4 | -------------------------------------------------------------------------------- /experiments/deutsch2019/baselines/open-ai/Readme.md: -------------------------------------------------------------------------------- 1 | # OpenAI Language Model 2 | The OpenAI Language Model ([Radford et al., 2019](https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf)) serves as a baseline for the summary cloze task. 3 | The language model conditions on the context of the summary and generates the next sentence. 4 | The cited document is not used at all. 5 | The purpose of the experiment is to measure how well a system can do without access to the reference text. 6 | 7 | ## Setup 8 | Before using the OpenAI language model, you first need to download the model 9 | ``` 10 | sh experiments/deutsch2019/baselines/open-ai/setup.sh 11 | ``` 12 | For more documentation on the model and its parameters, see the official [Github repository](https://github.com/openai/gpt-2). 
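Under the assumptions of `setup.sh` below (it runs the GPT-2 repository's `download_model.py` from inside this experiment directory), the weights should land in a `models/345M` subdirectory, which is the path that `run.sh` passes to the baseline; the expected file listing is an assumption based on that download script:
```
ls experiments/deutsch2019/baselines/open-ai/models/345M
# expected: checkpoint, encoder.json, hparams.json, model.ckpt.*, vocab.bpe
```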
13 | -------------------------------------------------------------------------------- /experiments/deutsch2019/baselines/open-ai/run.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | results_dir="${expt_dir}/results" 4 | mkdir -p ${output_dir} ${results_dir} 5 | 6 | model="345M" 7 | 8 | for split in valid test; do 9 | python -m summarize.models.cloze.open_ai_language_model \ 10 | ${expt_dir}/models/${model} \ 11 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 12 | ${output_dir}/${split}.jsonl \ 13 | 1 \ 14 | 40 15 | 16 | python -m summarize.metrics.rouge \ 17 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 18 | ${output_dir}/${split}.jsonl \ 19 | --gold-summary-field-name cloze \ 20 | --model-summary-field-name cloze \ 21 | --add-gold-wrapping-list \ 22 | --add-model-wrapping-list \ 23 | --silent \ 24 | --output-file ${results_dir}/${split}.metrics.json 25 | done 26 | -------------------------------------------------------------------------------- /experiments/deutsch2019/baselines/open-ai/setup.sh: -------------------------------------------------------------------------------- 1 | cwd=$(pwd) 2 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 3 | 4 | pushd ${expt_dir} 5 | python ${cwd}/external/gpt-2/download_model.py 345M 6 | -------------------------------------------------------------------------------- /experiments/deutsch2019/baselines/oracle/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | results 3 | -------------------------------------------------------------------------------- /experiments/deutsch2019/baselines/oracle/run.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | results_dir="${expt_dir}/results" 4 | mkdir -p ${output_dir} ${results_dir} 5 | 6 | for split in valid test; do 7 | for metric in "R1-F1" "R2-F1" "RL-F1"; do 8 | python -m summarize.models.cloze.oracle \ 9 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 10 | ${output_dir}/${split}.${metric}.jsonl \ 11 | ${metric} \ 12 | --max-sentences 1 \ 13 | --cloze-only 14 | 15 | python -m summarize.metrics.rouge \ 16 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 17 | ${output_dir}/${split}.${metric}.jsonl \ 18 | --gold-summary-field-name cloze \ 19 | --model-summary-field-name cloze \ 20 | --add-gold-wrapping-list \ 21 | --add-model-wrapping-list \ 22 | --compute-rouge-l \ 23 | --silent \ 24 | --output-file ${results_dir}/${split}.${metric}.metrics.json 25 | done 26 | done 27 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/Readme.md: -------------------------------------------------------------------------------- 1 | # Extractive Step 2 | This directory contains the scripts to train the extractive models. 
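For example, the BM25 baseline below is run in three steps; the order is inferred from the scripts themselves, since `predict.sh` consumes the `df.jsonl.gz` file that `calculate-df.sh` writes:
```
sh experiments/deutsch2019/extractive-step/bm25/calculate-df.sh
sh experiments/deutsch2019/extractive-step/bm25/predict.sh
sh experiments/deutsch2019/extractive-step/bm25/evaluate.sh
```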
3 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/bm25/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | results 3 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/bm25/calculate-df.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | mkdir -p ${output_dir} 4 | 5 | python -m summarize.models.cloze.bm25.calculate_df \ 6 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/train.v1.1.jsonl.gz \ 7 | ${output_dir}/df.jsonl.gz 8 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/bm25/evaluate.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | results_dir="${expt_dir}/results" 4 | mkdir -p ${results_dir} 5 | 6 | for split in valid test; do 7 | python -m summarize.metrics.rouge \ 8 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 9 | ${output_dir}/${split}.max-words.jsonl \ 10 | --gold-summary-field-name cloze \ 11 | --model-summary-field-name cloze \ 12 | --add-gold-wrapping-list \ 13 | --compute-rouge-l \ 14 | --silent \ 15 | --max-words 200 \ 16 | --output-file ${results_dir}/${split}.max-words.metrics.json 17 | 18 | python -m summarize.metrics.rouge \ 19 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 20 | ${output_dir}/${split}.max-sents.jsonl \ 21 | --gold-summary-field-name cloze \ 22 | --model-summary-field-name cloze \ 23 | --add-gold-wrapping-list \ 24 | --add-model-wrapping-list \ 25 | --compute-rouge-l \ 26 | --silent \ 27 | --output-file ${results_dir}/${split}.max-sents.metrics.json 28 | done 29 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/bm25/predict.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | mkdir -p ${output_dir} 4 | 5 | max_words=200 6 | max_sents=1 7 | 8 | for split in valid test; do 9 | python -m summarize.models.cloze.bm25.bm25 \ 10 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 11 | ${output_dir}/df.jsonl.gz \ 12 | ${output_dir}/${split}.max-words.jsonl \ 13 | --max-words ${max_words} 14 | 15 | python -m summarize.models.cloze.bm25.bm25 \ 16 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 17 | ${output_dir}/df.jsonl.gz \ 18 | ${output_dir}/${split}.max-sents.jsonl \ 19 | --max-sentences ${max_sents} \ 20 | --flatten 21 | done 22 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/extractive-model/.gitignore: -------------------------------------------------------------------------------- 1 | model 2 | output 3 | results 4 | preprocessed 5 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/extractive-model/evaluate.sh: 
--------------------------------------------------------------------------------
1 | if [ "$#" -ne 2 ]; then
2 |     echo "Usage: sh evaluate.sh <use-topics> <use-context>"
3 |     exit
4 | fi
5 | 
6 | use_topics=$1
7 | use_context=$2
8 | if [ "${use_topics}" == "true" ]; then
9 |     topics_dir="topics"
10 | else
11 |     topics_dir="no-topics"
12 | fi
13 | if [ "${use_context}" == "true" ]; then
14 |     context_dir="context"
15 | else
16 |     context_dir="no-context"
17 | fi
18 | 
19 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
20 | output_dir=${expt_dir}/output/${topics_dir}/${context_dir}
21 | results_dir=${expt_dir}/results/${topics_dir}/${context_dir}
22 | mkdir -p ${results_dir}
23 | 
24 | for split in valid test; do
25 |     python -m summarize.metrics.rouge \
26 |         https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \
27 |         ${output_dir}/${split}.max-tokens.jsonl \
28 |         --gold-summary-field-name cloze \
29 |         --model-summary-field-name cloze \
30 |         --add-gold-wrapping-list \
31 |         --compute-rouge-l \
32 |         --silent \
33 |         --max-words 200 \
34 |         --output-file ${results_dir}/${split}.max-tokens.metrics.json
35 | 
36 |     python -m summarize.metrics.rouge \
37 |         https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \
38 |         ${output_dir}/${split}.max-sents.jsonl \
39 |         --gold-summary-field-name cloze \
40 |         --model-summary-field-name cloze \
41 |         --add-gold-wrapping-list \
42 |         --compute-rouge-l \
43 |         --silent \
44 |         --output-file ${results_dir}/${split}.max-sents.metrics.json
45 | done
46 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/extractive-model/predict.sh:
--------------------------------------------------------------------------------
1 | if [ "$#" -ne 2 ]; then
2 |     echo "Usage: sh predict.sh <use-topics> <use-context>"
3 |     exit
4 | fi
5 | 
6 | use_topics=$1
7 | use_context=$2
8 | if [ "${use_topics}" == "true" ]; then
9 |     topics_dir="topics"
10 | else
11 |     topics_dir="no-topics"
12 | fi
13 | if [ "${use_context}" == "true" ]; then
14 |     context_dir="context"
15 | else
16 |     context_dir="no-context"
17 | fi
18 | 
19 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
20 | model_file=${expt_dir}/model/${topics_dir}/${context_dir}/model.tar.gz
21 | output_dir=${expt_dir}/output/${topics_dir}/${context_dir}
22 | mkdir -p ${output_dir}
23 | 
24 | for split in valid test; do
25 |     allennlp predict \
26 |         --include-package summarize \
27 |         --predictor cloze-extractive-predictor \
28 |         --output-file ${output_dir}/${split}.max-tokens.jsonl \
29 |         --cuda-device 0 \
30 |         --batch-size 1 \
31 |         --silent \
32 |         --use-dataset-reader \
33 |         --overrides '{"dataset_reader.max_num_sentences": null}' \
34 |         ${model_file} \
35 |         https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz
36 | 
37 |     allennlp predict \
38 |         --include-package summarize \
39 |         --predictor cloze-extractive-predictor \
40 |         --output-file ${output_dir}/${split}.max-sents.jsonl \
41 |         --cuda-device 0 \
42 |         --batch-size 1 \
43 |         --silent \
44 |         --use-dataset-reader \
45 |         --overrides '{"dataset_reader.max_num_sentences": null, "model.max_words": null, "model.max_sents": 1}' \
46 |         ${model_file} \
47 |         https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz
48 | done
49 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/extractive-model/preprocess.sh:
--------------------------------------------------------------------------------
1 | if [ "$#" -ne 2 ]; then
2 |     echo "Usage: sh preprocess.sh <use-topics> <use-context>"
3 |     exit
4 | fi
5 | 
6 | use_topics=$1
7 | use_context=$2
8 | if [ "${use_topics}" == "true" ]; then
9 |     topics_dir="topics"
10 | else
11 |     topics_dir="no-topics"
12 | fi
13 | if [ "${use_context}" == "true" ]; then
14 |     context_dir="context"
15 | else
16 |     context_dir="no-context"
17 | fi
18 | 
19 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
20 | model_file=${expt_dir}/model/${topics_dir}/${context_dir}/model.tar.gz
21 | preprocess_dir=${expt_dir}/preprocessed/${topics_dir}/${context_dir}
22 | mkdir -p ${preprocess_dir}
23 | 
24 | for split in train valid test; do
25 |     temp_file=$(mktemp)
26 |     allennlp predict \
27 |         --include-package summarize \
28 |         --predictor cloze-extractive-predictor \
29 |         --output-file ${temp_file} \
30 |         --cuda-device 0 \
31 |         --batch-size 1 \
32 |         --silent \
33 |         --use-dataset-reader \
34 |         --overrides '{"dataset_reader.max_num_sentences": null}' \
35 |         ${model_file} \
36 |         https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz
37 | 
38 |     python -m summarize.utils.copy_jsonl_fields \
39 |         ${temp_file} \
40 |         https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \
41 |         ${preprocess_dir}/${split}.jsonl.gz \
42 |         --field-names cloze document
43 | 
44 |     rm ${temp_file}
45 | done
46 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/extractive-model/train.sh:
--------------------------------------------------------------------------------
1 | if [ "$#" -ne 2 ]; then
2 |     echo "Usage: sh train.sh <use-topics> <use-context>"
3 |     exit
4 | fi
5 | 
6 | use_topics=$1
7 | use_context=$2
8 | if [ "${use_topics}" == "true" ]; then
9 |     topics_dir="topics"
10 | else
11 |     topics_dir="no-topics"
12 | fi
13 | if [ "${use_context}" == "true" ]; then
14 |     context_dir="context"
15 | else
16 |     context_dir="no-context"
17 | fi
18 | 
19 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
20 | model_dir=${expt_dir}/model/${topics_dir}/${context_dir}
21 | model_config=${expt_dir}/model.jsonnet
22 | 
23 | if [ -d ${model_dir} ]; then
24 |     read -p "remove directory ${model_dir}? [y/n] " yn
25 |     case $yn in
26 |         [Yy]* ) rm -rf ${model_dir};;
27 |         [Nn]* ) ;;
28 |         * ) echo "Please answer yes or no.";;
29 |     esac
30 | fi
31 | 
32 | export USE_TOPICS=${use_topics}
33 | export USE_CONTEXT=${use_context}
34 | allennlp train \
35 |     --include-package summarize \
36 |     --serialization-dir ${model_dir} \
37 |     ${model_config}
38 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/lead/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | preprocessed
3 | results
4 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/lead/Readme.md:
--------------------------------------------------------------------------------
1 | # Lead
2 | This directory contains the scripts to run the lead model preprocessing.
3 | The preprocessed data can be found here: [train](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/lead/preprocessed/train.jsonl.gz), [valid](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/lead/preprocessed/valid.jsonl.gz), [test](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/lead/preprocessed/test.jsonl.gz). 4 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/lead/preprocess.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | preprocess_dir="${expt_dir}/preprocessed" 3 | mkdir -p ${preprocess_dir} 4 | 5 | for split in train valid test; do 6 | temp_file=$(mktemp) 7 | python -m summarize.models.cloze.lead \ 8 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 9 | ${temp_file} \ 10 | --max-tokens 200 \ 11 | --field-name document \ 12 | --keep-sentences 13 | 14 | python -m summarize.utils.copy_jsonl_fields \ 15 | ${temp_file} \ 16 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 17 | ${preprocess_dir}/${split}.jsonl.gz \ 18 | --field-names document document 19 | 20 | rm ${temp_file} 21 | done 22 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/lead/run.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | results_dir="${expt_dir}/results" 4 | mkdir -p ${output_dir} ${results_dir} 5 | 6 | for split in valid test; do 7 | python -m summarize.models.cloze.lead \ 8 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 9 | ${output_dir}/${split}.jsonl \ 10 | --max-tokens 200 11 | 12 | python -m summarize.metrics.rouge \ 13 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 14 | ${output_dir}/${split}.jsonl \ 15 | --gold-summary-field-name cloze \ 16 | --model-summary-field-name cloze \ 17 | --add-gold-wrapping-list \ 18 | --add-model-wrapping-list \ 19 | --compute-rouge-l \ 20 | --silent \ 21 | --max-words 200 \ 22 | --output-file ${results_dir}/${split}.metrics.json 23 | done 24 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/oracle/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | preprocessed 3 | results 4 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/oracle/Readme.md: -------------------------------------------------------------------------------- 1 | # Oracle 2 | This directory contains the scripts to run the oracle model preprocessing. 
3 | The preprocessed data can be found here: [train](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/oracle/preprocessed/train.jsonl.gz), [valid](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/oracle/preprocessed/valid.jsonl.gz), [test](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/oracle/preprocessed/test.jsonl.gz). 4 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/oracle/preprocess.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | preprocess_dir="${expt_dir}/preprocessed" 3 | mkdir -p ${preprocess_dir} 4 | 5 | for split in train valid test; do 6 | temp_file=$(mktemp) 7 | python -m summarize.utils.extract_cloze_from_labels \ 8 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 9 | ${temp_file} \ 10 | --field-name document \ 11 | --keep-sentences 12 | 13 | python -m summarize.utils.copy_jsonl_fields \ 14 | ${temp_file} \ 15 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 16 | ${preprocess_dir}/${split}.jsonl.gz \ 17 | --field-names document document 18 | 19 | rm ${temp_file} 20 | done 21 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/oracle/run.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | results_dir="${expt_dir}/results" 4 | mkdir -p ${output_dir} ${results_dir} 5 | 6 | for split in valid test; do 7 | python -m summarize.utils.extract_cloze_from_labels \ 8 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 9 | ${output_dir}/${split}.jsonl 10 | 11 | python -m summarize.metrics.rouge \ 12 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz \ 13 | ${output_dir}/${split}.jsonl \ 14 | --gold-summary-field-name cloze \ 15 | --model-summary-field-name cloze \ 16 | --add-gold-wrapping-list \ 17 | --add-model-wrapping-list \ 18 | --compute-rouge-l \ 19 | --silent \ 20 | --max-words 200 \ 21 | --output-file ${results_dir}/${split}.metrics.json 22 | done 23 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/sumfocus/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | sweep 3 | logs 4 | -------------------------------------------------------------------------------- /experiments/deutsch2019/extractive-step/sumfocus/Readme.md: -------------------------------------------------------------------------------- 1 | # SumFocus 2 | An implementation of SumFocus from [Vanderwende et al. (2007)](https://www.cis.upenn.edu/~nenkova/papers/ipm.pdf). 3 | `run-parameter-sweep.sh` will run a parameter sweep to find the best settings of the unigram probability distribution smoothing parameter (`beta` in the code) and the interpolation parameters between the document, topic, and context (`topic_lambda` and `context_lambda` in the code) using the NLP Grid for parallelization. 
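For intuition, the model scores sentences SumBasic-style under an interpolated unigram distribution; the sketch below is illustrative only (the helper names are not the module's actual API, and the redundancy update follows the SumBasic heuristic):
```
from collections import Counter
from typing import Dict, List

Sentence = List[str]


def smoothed_unigrams(sentences: List[Sentence], beta: float) -> Dict[str, float]:
    # Unigram distribution with add-beta smoothing (beta as in the code).
    counts = Counter(word.lower() for sentence in sentences for word in sentence)
    total = sum(counts.values()) + beta * len(counts)
    return {word: (count + beta) / total for word, count in counts.items()}


def select_sentence(document: List[Sentence],
                    p_doc: Dict[str, float],
                    p_topic: Dict[str, float],
                    p_context: Dict[str, float],
                    topic_lambda: float,
                    context_lambda: float) -> Sentence:
    # Interpolate the document, topic, and context distributions, then pick
    # the sentence with the highest average word probability (SumBasic rule).
    doc_lambda = 1.0 - topic_lambda - context_lambda

    def prob(word: str) -> float:
        return (doc_lambda * p_doc.get(word, 0.0)
                + topic_lambda * p_topic.get(word, 0.0)
                + context_lambda * p_context.get(word, 0.0))

    best = max(document,
               key=lambda s: sum(prob(w.lower()) for w in s) / max(len(s), 1))
    # Discourage redundancy before the next selection by squaring the
    # probability of every word that was just used.
    for word in best:
        p_doc[word.lower()] = p_doc.get(word.lower(), 0.0) ** 2
    return best
```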
4 | To analyze the results, run the python script `analyze_results.py`, which will output the best hyperparameter settings for all variations of using and not using the topic and context.
5 | 
6 | After the best hyperparameter settings are found, you can run the model on the test data to compute Rouge by running
7 | ```
8 | sh experiments/deutsch2019/extractive-step/sumfocus/run-max-words.sh \
9 |     https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/test.v1.1.jsonl.gz \
10 |     experiments/deutsch2019/extractive-step/sumfocus/output/test.max-words.jsonl \
11 |     experiments/deutsch2019/extractive-step/sumfocus/output/test.max-words.metrics.jsonl \
12 |     <best-beta> \
13 |     <best-topic-lambda> \
14 |     <best-context-lambda> \
15 |     200
16 | 
17 | sh experiments/deutsch2019/extractive-step/sumfocus/run-max-sents.sh \
18 |     https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/test.v1.1.jsonl.gz \
19 |     experiments/deutsch2019/extractive-step/sumfocus/output/test.max-sents.jsonl \
20 |     experiments/deutsch2019/extractive-step/sumfocus/output/test.max-sents.metrics.jsonl \
21 |     <best-beta> \
22 |     <best-topic-lambda> \
23 |     <best-context-lambda> \
24 |     1
25 | ```
26 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/sumfocus/run-max-sents.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #$ -cwd
3 | if [ "$#" -ne 7 ]; then
4 |     echo "Usage: sh run-max-sents.sh"
5 |     echo "    <input-file> <output-file> <metrics-file> <beta> <topic-lambda> <context-lambda> <max-sents>"
6 |     exit
7 | fi
8 | 
9 | input_file=$1
10 | output_file=$2
11 | metrics_file=$3
12 | beta=$4
13 | topic_lambda=$5
14 | context_lambda=$6
15 | max_sents=$7
16 | 
17 | mkdir -p $(dirname ${output_file})
18 | python -m summarize.models.cloze.sumfocus \
19 |     ${input_file} \
20 |     ${output_file} \
21 |     ${beta} \
22 |     ${topic_lambda} \
23 |     ${context_lambda} \
24 |     --max-sentences ${max_sents}
25 | 
26 | mkdir -p $(dirname ${metrics_file})
27 | python -m summarize.metrics.rouge \
28 |     ${input_file} \
29 |     ${output_file} \
30 |     --gold-summary-field-name cloze \
31 |     --model-summary-field-name cloze \
32 |     --add-gold-wrapping-list \
33 |     --add-model-wrapping-list \
34 |     --compute-rouge-l \
35 |     --silent \
36 |     --output-file ${metrics_file}
37 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/sumfocus/run-max-words.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #$ -cwd
3 | if [ "$#" -ne 7 ]; then
4 |     echo "Usage: sh run-max-words.sh"
5 |     echo "    <input-file> <output-file> <metrics-file> <beta> <topic-lambda> <context-lambda> <max-words>"
6 |     exit
7 | fi
8 | 
9 | input_file=$1
10 | output_file=$2
11 | metrics_file=$3
12 | beta=$4
13 | topic_lambda=$5
14 | context_lambda=$6
15 | max_words=$7
16 | 
17 | mkdir -p $(dirname ${output_file})
18 | python -m summarize.models.cloze.sumfocus \
19 |     ${input_file} \
20 |     ${output_file} \
21 |     ${beta} \
22 |     ${topic_lambda} \
23 |     ${context_lambda} \
24 |     --max-words ${max_words}
25 | 
26 | mkdir -p $(dirname ${metrics_file})
27 | python -m summarize.metrics.rouge \
28 |     ${input_file} \
29 |     ${output_file} \
30 |     --gold-summary-field-name cloze \
31 |     --model-summary-field-name cloze \
32 |     --add-gold-wrapping-list \
33 |     --add-model-wrapping-list \
34 |     --compute-rouge-l \
35 |     --silent \
36 |     --max-words 200 \
37 |     --output-file ${metrics_file}
38 | 
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/sumfocus/run-parameter-sweep.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname 
"${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | max_words_output_dir="${expt_dir}/sweep/max-words" 3 | max_sents_output_dir="${expt_dir}/sweep/max-sents" 4 | log_dir="${expt_dir}/logs" 5 | mkdir -p ${max_words_output_dir} ${max_sents_output_dir} ${log_dir} 6 | 7 | max_words=200 8 | max_num_sents=1 9 | 10 | for beta in 0.1 0.5 1.0 2.0; do 11 | for topic_lambda in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0; do 12 | for context_lambda in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0; do 13 | for split in valid; do 14 | name="beta_${beta}.topic-lambda-${topic_lambda}.context-lambda-${context_lambda}" 15 | gold_file="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/${split}.v1.1.jsonl.gz" 16 | model_file="${max_words_output_dir}/${split}.${name}.jsonl" 17 | metrics_file="${max_words_output_dir}/${split}.${name}.metrics.json" 18 | 19 | stdout=${log_dir}/${name}-words.stdout 20 | stderr=${log_dir}/${name}-words.stderr 21 | qsub -N ${name} -o ${stdout} -e ${stderr} \ 22 | ${expt_dir}/run-max-words.sh ${gold_file} ${model_file} ${metrics_file} ${beta} ${topic_lambda} ${context_lambda} ${max_words} 23 | 24 | model_file="${max_sents_output_dir}/${split}.${name}.jsonl" 25 | metrics_file="${max_sents_output_dir}/${split}.${name}.metrics.json" 26 | 27 | stdout=${log_dir}/${name}-sents.stdout 28 | stderr=${log_dir}/${name}-sents.stderr 29 | qsub -N ${name} -o ${stdout} -e ${stderr} \ 30 | ${expt_dir}/run-max-sents.sh ${gold_file} ${model_file} ${metrics_file} ${beta} ${topic_lambda} ${context_lambda} ${max_num_sents} 31 | done 32 | done 33 | done 34 | done 35 | -------------------------------------------------------------------------------- /experiments/kedzie2018/Readme.md: -------------------------------------------------------------------------------- 1 | # Kedzie 2018 2 | This is a partial reimplementation of [Content Selection in Deep Learning Models of Summarization](https://arxiv.org/abs/1810.12343) by Kedzie et al. (2018). 3 | 4 | ## Instructions 5 | First, prepare the necessary data under `data/kedzie2018`. 6 | Then, each directory of the experiment corresponds to a different dataset and model with its own script to train, predict, and evaluate. 7 | 8 | ## Results 9 | Below are the reproduction results for the CNN/DailyMail dataset. 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 |
ExtractorEncoderR2-RecallSaved Model
ReportedReproduced
Lead-24.424.4-
RNNAvg25.425.5Link
RNN25.425.4Link
CNN25.1--
Oracle-36.237.3-
59 | 
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 | 
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/evaluate.sh:
--------------------------------------------------------------------------------
1 | if [ "$#" -ne 2 ]; then
2 | echo "Usage: sh evaluate.sh <encoder> <extractor>"
3 | exit
4 | fi
5 | 
6 | encoder=$1
7 | extractor=$2
8 | 
9 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
10 | output_dir=${expt_dir}/output/${encoder}/${extractor}
11 | results_dir=${expt_dir}/results/${encoder}/${extractor}
12 | 
13 | mkdir -p ${results_dir}
14 | 
15 | for split in valid test; do
16 | python -m summarize.metrics.rouge \
17 | https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/${split}.v1.0.jsonl.gz \
18 | ${output_dir}/${split}.jsonl \
19 | --silent \
20 | --max-ngram 2 \
21 | --max-words 100 \
22 | --remove-stopwords \
23 | --output-file ${results_dir}/${split}.metrics.json
24 | done
25 | 
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/model.jsonnet:
--------------------------------------------------------------------------------
1 | local encoder = std.extVar("ENCODER");
2 | 
3 | // The size of the decoder's input changes based on the encoder choice
4 | local decoder_input_size =
5 | if encoder == "avg" then 200
6 | else if encoder == "rnn" then 400;
7 | 
8 | {
9 | "dataset_reader": {
10 | "type": "sds-extractive",
11 | "tokenizer": {
12 | "type": "word",
13 | "word_splitter": {
14 | "type": "just_spaces"
15 | }
16 | },
17 | "token_indexers": {
18 | "tokens": {
19 | "type": "single_id",
20 | "lowercase_tokens": true
21 | }
22 | },
23 | "max_num_sentences": 50
24 | },
25 | "vocabulary": {
26 | "pretrained_files": {
27 | "tokens": "(http://nlp.stanford.edu/data/glove.6B.zip)#glove.6B.200d.txt"
28 | },
29 | "only_include_pretrained_words": true
30 | },
31 | "train_data_path": "https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/train.v1.0.jsonl.gz",
32 | "validation_data_path": "https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/valid.v1.0.jsonl.gz",
33 | "model": {
34 | "type": "sds-extractive-baseline",
35 | "token_embedder": {
36 | "token_embedders": {
37 | "tokens": {
38 | "type": "embedding",
39 | "embedding_dim": 200,
40 | "trainable": false,
41 | "pretrained_file": "(http://nlp.stanford.edu/data/glove.6B.zip)#glove.6B.200d.txt",
42 | }
43 | }
44 | },
45 | "sentence_encoder":
46 | if encoder == "avg" then {
47 | "type": "boe",
48 | "embedding_dim": 200,
49 | "averaged": true
50 | }
51 | else if encoder == "rnn" then {
52 | "type": "gru",
53 | "input_size": 200,
54 | "hidden_size": 200,
55 | "bidirectional": true
56 | }
57 | ,
58 | "sentence_extractor": {
59 | "type": "rnn",
60 | "rnn": {
61 | "type": "gru",
62 | "input_size": decoder_input_size,
63 | "hidden_size": 300,
64 | "bidirectional": true,
65 | },
66 | "feed_forward": {
67 | "input_dim": 600,
68 | "num_layers": 2,
69 | "hidden_dims": [100, 1],
70 | "activations": ["relu", "linear"],
71 | "dropout": [0.25, 0.0]
72 | },
73 | "dropout": 0.25
74 | },
75 | "max_words": 100,
76 | "dropout": 0.25,
77 | "metrics": [
78 | {
79 | "type": "python-rouge",
80 | "ngram_orders": [2],
81 | "max_words": 100,
82 | "remove_stopwords": true
83 | }
84 | ]
85 | },
86 | "iterator": {
87 | "type": "bucket",
88 | "batch_size": 32,
89 | "sorting_keys": [["document", "num_fields"]]
90 | },
91 | "validation_iterator": {
92 | "type": "bucket",
93 | "batch_size": 32,
94 | "sorting_keys": [["document", "num_fields"]]
95 | },
96 | "trainer": {
97 | "optimizer": {
98 | "type": "adam",
99 | "lr": 0.0001
100 | },
101 | "grad_norm": 5,
102 | "num_epochs": 20,
103 | "validation_metric": "+R2-R",
104 | "cuda_device": 0
105 | }
106 | }
107 | 
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/predict.sh:
--------------------------------------------------------------------------------
1 | if [ "$#" -ne 2 ]; then
2 | echo "Usage: sh predict.sh <encoder> <extractor>"
3 | exit
4 | fi
5 | 
6 | encoder=$1
7 | extractor=$2
8 | 
9 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
10 | model_file=${expt_dir}/model/${encoder}/${extractor}/model.tar.gz
11 | output_dir=${expt_dir}/output/${encoder}/${extractor}
12 | 
13 | mkdir -p ${output_dir}
14 | 
15 | for split in valid test; do
16 | allennlp predict \
17 | --include-package summarize \
18 | --predictor sds-extractive-predictor \
19 | --output-file ${output_dir}/${split}.jsonl \
20 | --cuda-device 0 \
21 | --batch-size 16 \
22 | --silent \
23 | --use-dataset-reader \
24 | ${model_file} \
25 | https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/${split}.v1.0.jsonl.gz
26 | done
27 | 
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/train.sh:
--------------------------------------------------------------------------------
1 | if [ "$#" -ne 2 ]; then
2 | echo "Usage: sh train.sh <encoder> <extractor>"
3 | exit
4 | fi
5 | 
6 | encoder=$1
7 | extractor=$2
8 | 
9 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
10 | model_dir=${expt_dir}/model/${encoder}/${extractor}
11 | model_config=${expt_dir}/model.jsonnet
12 | 
13 | if [ -d ${model_dir} ]; then
14 | read -p "remove directory ${model_dir}?
[y/n] " yn 15 | case $yn in 16 | [Yy]* ) rm -rf ${model_dir};; 17 | [Nn]* ) ;; 18 | * ) echo "Please answer yes or no.";; 19 | esac 20 | fi 21 | 22 | export ENCODER=${encoder} 23 | export EXTRACTOR=${extractor} 24 | allennlp train \ 25 | --include-package summarize \ 26 | --serialization-dir ${model_dir} \ 27 | ${model_config} 28 | -------------------------------------------------------------------------------- /experiments/kedzie2018/cnn-dailymail/lead/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | results 3 | -------------------------------------------------------------------------------- /experiments/kedzie2018/cnn-dailymail/lead/run.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | results_dir="${expt_dir}/results" 4 | mkdir -p ${output_dir} 5 | mkdir -p ${results_dir} 6 | 7 | for split in valid test; do 8 | python -m summarize.models.sds.lead \ 9 | https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/${split}.v1.0.jsonl.gz \ 10 | ${output_dir}/${split}.jsonl \ 11 | --max-tokens 100 12 | 13 | python -m summarize.metrics.rouge \ 14 | https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/${split}.v1.0.jsonl.gz \ 15 | ${output_dir}/${split}.jsonl \ 16 | --silent \ 17 | --max-ngram 2 \ 18 | --remove-stopwords \ 19 | --max-words 100 \ 20 | --output-file ${results_dir}/${split}.metrics.json 21 | done 22 | -------------------------------------------------------------------------------- /experiments/kedzie2018/cnn-dailymail/oracle/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | results 3 | -------------------------------------------------------------------------------- /experiments/kedzie2018/cnn-dailymail/oracle/run.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir="${expt_dir}/output" 3 | results_dir="${expt_dir}/results" 4 | mkdir -p ${output_dir} 5 | mkdir -p ${results_dir} 6 | 7 | for split in valid test; do 8 | python -m summarize.utils.extract_summary_from_labels \ 9 | https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/${split}.v1.0.jsonl.gz \ 10 | ${output_dir}/${split}.jsonl 11 | 12 | python -m summarize.metrics.rouge \ 13 | https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/${split}.v1.0.jsonl.gz \ 14 | ${output_dir}/${split}.jsonl \ 15 | --silent \ 16 | --max-ngram 2 \ 17 | --remove-stopwords \ 18 | --max-words 100 \ 19 | --output-file ${results_dir}/${split}.metrics.json 20 | done 21 | -------------------------------------------------------------------------------- /experiments/onmt/Readme.md: -------------------------------------------------------------------------------- 1 | # OpenNMT Parity Experiment 2 | This experiment aims to compare the performance of the Summarize and OpenNMT models with as close to identical setups as possible to ensure parity between libraries. 3 | The tests train and evaluate the sequence-to-sequence and pointer-generator models which are based on RNNs. 4 | There is a directory for each model that includes more details and the specific commands to reproduce the results. 
5 | The OpenNMT commands come from the [summarization example](http://opennmt.net/OpenNMT-py/Summarization.html).
6 | 
7 | ## Demo
8 | The final trained model can be demoed using the MyBinder Jupyter Notebook by clicking on this badge:
9 | 
10 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/danieldeutsch/summarize/574d0027262573291724c72641a3e4967e018030?filepath=experiments%2Fonmt%2Fdemo.ipynb)
11 | 
12 | Please note that generating the summary can take about 1 minute on the MyBinder servers.
13 | 
14 | 
15 | ## OpenNMT Data Setup
16 | The preprocessing of the CNN/DailyMail dataset is common between both OpenNMT models.
17 | ```
18 | git clone https://github.com/OpenNMT/OpenNMT-py
19 | cd OpenNMT-py
20 | wget https://s3.amazonaws.com/opennmt-models/Summary/cnndm.tar.gz
21 | mkdir data/cnndm
22 | tar -xzvf cnndm.tar.gz -C data/cnndm
23 | 
24 | python preprocess.py \
25 | -train_src data/cnndm/train.txt.src \
26 | -train_tgt data/cnndm/train.txt.tgt.tagged \
27 | -valid_src data/cnndm/val.txt.src \
28 | -valid_tgt data/cnndm/val.txt.tgt.tagged \
29 | -save_data data/cnndm/CNNDM \
30 | -src_seq_length 10000 \
31 | -tgt_seq_length 10000 \
32 | -src_seq_length_trunc 400 \
33 | -tgt_seq_length_trunc 100 \
34 | -dynamic_dict \
35 | -share_vocab \
36 | -shard_size 100000
37 | ```
38 | 
--------------------------------------------------------------------------------
/experiments/onmt/convert_to_jsonl.py:
--------------------------------------------------------------------------------
1 | """
2 | Converts the output of the OpenNMT models to the jsonl format that
3 | is necessary for evaluation. Additionally, the script will remove the
4 | sentence delimiters (``<t>`` and ``</t>``) from the output.
5 | """
6 | # Edit the system path so the summarize library can be imported
7 | import sys
8 | sys.path.append('.')
9 | 
10 | import argparse
11 | import json
12 | 
13 | from summarize.data.io import JsonlWriter
14 | 
15 | 
16 | def main(args):
17 | with JsonlWriter(args.output_jsonl) as out:
18 | with open(args.input_tsv, 'r') as f:
19 | for line in f:
20 | line = line.strip()
21 | line = line.replace('<t>', '').replace('</t>', '')
22 | line = ' '.join(line.split())
23 | summary = [line]
24 | out.write({'summary': summary})
25 | 
26 | 
27 | if __name__ == '__main__':
28 | argp = argparse.ArgumentParser()
29 | argp.add_argument('input_tsv', help='The output from the OpenNMT model')
30 | argp.add_argument('output_jsonl', help='The converted jsonl file')
31 | args = argp.parse_args()
32 | main(args)
33 | 
--------------------------------------------------------------------------------
/experiments/onmt/pointer-generator/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 | 
--------------------------------------------------------------------------------
/experiments/onmt/pointer-generator/evaluate.sh:
--------------------------------------------------------------------------------
1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
2 | output_dir=${expt_dir}/output
3 | results_dir=${expt_dir}/results
4 | 
5 | mkdir -p ${results_dir}
6 | 
7 | for split in valid test; do
8 | for constraints in min-length repeated-trigrams length coverage; do
9 | python -m summarize.metrics.rouge \
10 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz \
11 | ${output_dir}/${split}.${constraints}.jsonl \
12 | --silent \
13 | --compute-rouge-l \
14 | --output-file ${results_dir}/${split}.${constraints}.metrics.json
15 | done 16 | done 17 | -------------------------------------------------------------------------------- /experiments/onmt/pointer-generator/predict.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | model_file=${expt_dir}/model/model.tar.gz 3 | output_dir=${expt_dir}/output 4 | 5 | mkdir -p ${output_dir} 6 | 7 | for split in valid test; do 8 | # add minimum length 9 | allennlp predict \ 10 | --include-package summarize \ 11 | --predictor sds-abstractive-predictor \ 12 | --output-file ${output_dir}/${split}.min-length.jsonl \ 13 | --cuda-device 0 \ 14 | --batch-size 16 \ 15 | --silent \ 16 | --use-dataset-reader \ 17 | --overrides '{"model.beam_search.disallow_repeated_ngrams": null, "model.beam_search.repeated_ngrams_exceptions": null, "model.beam_search.length_penalizer": null, "model.beam_search.coverage_penalizer": null}' \ 18 | ${model_file} \ 19 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz 20 | 21 | # add disallow repeated trigrams 22 | allennlp predict \ 23 | --include-package summarize \ 24 | --predictor sds-abstractive-predictor \ 25 | --output-file ${output_dir}/${split}.repeated-trigrams.jsonl \ 26 | --cuda-device 0 \ 27 | --batch-size 16 \ 28 | --silent \ 29 | --use-dataset-reader \ 30 | --overrides '{"model.beam_search.length_penalizer": null, "model.beam_search.coverage_penalizer": null}' \ 31 | ${model_file} \ 32 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz 33 | 34 | # add length penalizer 35 | allennlp predict \ 36 | --include-package summarize \ 37 | --predictor sds-abstractive-predictor \ 38 | --output-file ${output_dir}/${split}.length.jsonl \ 39 | --cuda-device 0 \ 40 | --batch-size 16 \ 41 | --silent \ 42 | --use-dataset-reader \ 43 | --overrides '{"model.beam_search.coverage_penalizer": null}' \ 44 | ${model_file} \ 45 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz 46 | 47 | # add coverage penalizer 48 | allennlp predict \ 49 | --include-package summarize \ 50 | --predictor sds-abstractive-predictor \ 51 | --output-file ${output_dir}/${split}.coverage.jsonl \ 52 | --cuda-device 0 \ 53 | --batch-size 16 \ 54 | --silent \ 55 | --use-dataset-reader \ 56 | ${model_file} \ 57 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz 58 | done 59 | -------------------------------------------------------------------------------- /experiments/onmt/pointer-generator/replace-config.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | model_config=${expt_dir}/model.jsonnet 3 | model_tar=${expt_dir}/model/model.tar.gz 4 | 5 | python -m summarize.utils.replace_config \ 6 | ${model_tar} \ 7 | ${model_tar} \ 8 | ${model_config} 9 | -------------------------------------------------------------------------------- /experiments/onmt/pointer-generator/run.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | 3 | sh ${expt_dir}/train.sh 4 | sh ${expt_dir}/replace-config.sh 5 | sh ${expt_dir}/predict.sh 6 | sh ${expt_dir}/evaluate.sh 7 | -------------------------------------------------------------------------------- /experiments/onmt/pointer-generator/train.sh: 
-------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | model_dir=${expt_dir}/model 3 | model_config=${expt_dir}/model.jsonnet 4 | 5 | if [ -d ${model_dir} ]; then 6 | read -p "remove directory ${model_dir}? [y/n] " yn 7 | case $yn in 8 | [Yy]* ) rm -rf ${model_dir};; 9 | [Nn]* ) ;; 10 | * ) echo "Please answer yes or no.";; 11 | esac 12 | fi 13 | 14 | allennlp train \ 15 | --include-package summarize \ 16 | --serialization-dir ${model_dir} \ 17 | --overrides '{"model.run_beam_search": false}' \ 18 | --file-friendly-logging \ 19 | ${model_config} 20 | -------------------------------------------------------------------------------- /experiments/onmt/seq2seq/.gitignore: -------------------------------------------------------------------------------- 1 | model 2 | output 3 | results 4 | -------------------------------------------------------------------------------- /experiments/onmt/seq2seq/evaluate.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | output_dir=${expt_dir}/output 3 | results_dir=${expt_dir}/results 4 | 5 | mkdir -p ${results_dir} 6 | 7 | for split in valid test; do 8 | for constraints in min-length repeated-trigrams length coverage; do 9 | python -m summarize.metrics.rouge \ 10 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz \ 11 | ${output_dir}/${split}.${constraints}.jsonl \ 12 | --silent \ 13 | --compute-rouge-l \ 14 | --output-file ${results_dir}/${split}.${constraints}.metrics.json 15 | done 16 | done 17 | -------------------------------------------------------------------------------- /experiments/onmt/seq2seq/predict.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | model_file=${expt_dir}/model/model.tar.gz 3 | output_dir=${expt_dir}/output 4 | 5 | mkdir -p ${output_dir} 6 | 7 | for split in valid test; do 8 | # add minimum length 9 | allennlp predict \ 10 | --include-package summarize \ 11 | --predictor sds-abstractive-predictor \ 12 | --output-file ${output_dir}/${split}.min-length.jsonl \ 13 | --cuda-device 0 \ 14 | --batch-size 16 \ 15 | --silent \ 16 | --use-dataset-reader \ 17 | --overrides '{"model.beam_search.disallow_repeated_ngrams": null, "model.beam_search.repeated_ngrams_exceptions": null, "model.beam_search.length_penalizer": null, "model.beam_search.coverage_penalizer": null}' \ 18 | ${model_file} \ 19 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz 20 | 21 | # add disallow repeated trigrams 22 | allennlp predict \ 23 | --include-package summarize \ 24 | --predictor sds-abstractive-predictor \ 25 | --output-file ${output_dir}/${split}.repeated-trigrams.jsonl \ 26 | --cuda-device 0 \ 27 | --batch-size 16 \ 28 | --silent \ 29 | --use-dataset-reader \ 30 | --overrides '{"model.beam_search.length_penalizer": null, "model.beam_search.coverage_penalizer": null}' \ 31 | ${model_file} \ 32 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz 33 | 34 | # add length penalizer 35 | allennlp predict \ 36 | --include-package summarize \ 37 | --predictor sds-abstractive-predictor \ 38 | --output-file ${output_dir}/${split}.length.jsonl \ 39 | --cuda-device 0 \ 40 | --batch-size 16 \ 41 | --silent \ 42 | --use-dataset-reader \ 43 | 
--overrides '{"model.beam_search.coverage_penalizer": null}' \ 44 | ${model_file} \ 45 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz 46 | 47 | # add coverage penalizer 48 | allennlp predict \ 49 | --include-package summarize \ 50 | --predictor sds-abstractive-predictor \ 51 | --output-file ${output_dir}/${split}.coverage.jsonl \ 52 | --cuda-device 0 \ 53 | --batch-size 16 \ 54 | --silent \ 55 | --use-dataset-reader \ 56 | ${model_file} \ 57 | https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz 58 | done 59 | -------------------------------------------------------------------------------- /experiments/onmt/seq2seq/replace-config.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | model_config=${expt_dir}/model.jsonnet 3 | model_tar=${expt_dir}/model/model.tar.gz 4 | 5 | python -m summarize.utils.replace_config \ 6 | ${model_tar} \ 7 | ${model_tar} \ 8 | ${model_config} 9 | -------------------------------------------------------------------------------- /experiments/onmt/seq2seq/run.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | 3 | sh ${expt_dir}/train.sh 4 | sh ${expt_dir}/replace-config.sh 5 | sh ${expt_dir}/predict.sh 6 | sh ${expt_dir}/evaluate.sh 7 | -------------------------------------------------------------------------------- /experiments/onmt/seq2seq/train.sh: -------------------------------------------------------------------------------- 1 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 2 | model_dir=${expt_dir}/model 3 | model_config=${expt_dir}/model.jsonnet 4 | 5 | if [ -d ${model_dir} ]; then 6 | read -p "remove directory ${model_dir}? [y/n] " yn 7 | case $yn in 8 | [Yy]* ) rm -rf ${model_dir};; 9 | [Nn]* ) ;; 10 | * ) echo "Please answer yes or no.";; 11 | esac 12 | fi 13 | 14 | allennlp train \ 15 | --include-package summarize \ 16 | --serialization-dir ${model_dir} \ 17 | --overrides '{"model.run_beam_search": false}' \ 18 | ${model_config} 19 | -------------------------------------------------------------------------------- /experiments/wikicite/analysis/document-distribution/Readme.md: -------------------------------------------------------------------------------- 1 | This experiment calculates statistics about the reference documents. 
2 | It can be run as follows:
3 | ```
4 | python experiments/wikicite/analysis/document-distribution/run.py \
5 | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/train.tokenized.v1.1.jsonl.gz
6 | ```
7 | 
--------------------------------------------------------------------------------
/experiments/wikicite/analysis/document-distribution/run.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | import argparse
3 | import sys
4 | from collections import defaultdict, Counter
5 | from tqdm import tqdm
6 | 
7 | sys.path.append('../summarize')
8 | 
9 | from summarize.data.io import JsonlReader
10 | 
11 | 
12 | def main(args):
13 | # The number of times each document appears
14 | document_to_num_occurrences = Counter()
15 | # The histogram of the document set sizes
16 | document_set_sizes = Counter()
17 | # The mapping from the document to the page ids
18 | document_to_page_ids = defaultdict(set)
19 | 
20 | with JsonlReader(args.input_jsonl) as f:
21 | for instance in tqdm(f):
22 | page_id = instance['page_id']
23 | documents = instance['documents']
24 | document_set_sizes[len(documents)] += 1
25 | 
26 | for document in documents:
27 | url = document['canonical_url']
28 | document_to_num_occurrences[url] += 1
29 | document_to_page_ids[url].add(page_id)
30 | 
31 | # The histogram for the number of times a document appears
32 | num_occurrences_to_num_documents = Counter()
33 | for count in document_to_num_occurrences.values():
34 | num_occurrences_to_num_documents[count] += 1
35 | 
36 | # The histogram for the number of pages a document appears in
37 | num_pages_to_num_documents = Counter()
38 | for page_ids in document_to_page_ids.values():
39 | num_pages_to_num_documents[len(page_ids)] += 1
40 | 
41 | num_instances = sum(document_set_sizes.values())
42 | num_multidoc = num_instances - document_set_sizes[1]
43 | 
44 | num_unique_documents = len(document_to_num_occurrences)
45 | num_documents_multiple_times = num_unique_documents - num_occurrences_to_num_documents[1]
46 | 
47 | num_documents_multiple_pages = num_unique_documents - num_pages_to_num_documents[1]
48 | 
49 | print(f'Total unique documents: {num_unique_documents}')
50 | print(f'Total multi-document: {num_multidoc} ({num_multidoc / num_instances * 100:.2f}%)')
51 | print(f'Total documents appear more than once: {num_documents_multiple_times} ({num_documents_multiple_times / num_unique_documents * 100:.2f}%)')
52 | print(f'Total documents that appear in more than one page: {num_documents_multiple_pages} ({num_documents_multiple_pages / num_unique_documents * 100:.2f}%)')
53 | 
54 | 
55 | if __name__ == '__main__':
56 | argp = argparse.ArgumentParser()
57 | argp.add_argument('input_jsonl', help='The WikiCite dataset to analyze')
58 | args = argp.parse_args()
59 | main(args)
60 | 
--------------------------------------------------------------------------------
/experiments/wikicite/analysis/topic-distribution/.gitignore:
--------------------------------------------------------------------------------
1 | plots
2 | 
--------------------------------------------------------------------------------
/experiments/wikicite/analysis/topic-distribution/Readme.md:
--------------------------------------------------------------------------------
1 | This experiment calculates statistics about the topic frequencies in the WikiCite dataset.
2 | It can be run as follows:
3 | ```
4 | python experiments/wikicite/analysis/topic-distribution/run.py \
5 | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/train.tokenized.v1.1.jsonl.gz \
6 | experiments/wikicite/analysis/topic-distribution/plots
7 | ```
8 | 
--------------------------------------------------------------------------------
/experiments/wikicite/analysis/topic-distribution/run.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | import argparse
3 | import matplotlib
4 | matplotlib.use('Agg')
5 | import matplotlib.pyplot as plt
6 | import os
7 | import sys
8 | from collections import Counter
9 | from tqdm import tqdm
10 | 
11 | sys.path.append('../summarize')
12 | 
13 | from summarize.data.io import JsonlReader
14 | 
15 | 
16 | def main(args):
17 | os.makedirs(args.output_dir, exist_ok=True)
18 | 
19 | topic_counts = Counter()
20 | with JsonlReader(args.input_jsonl) as f:
21 | for instance in tqdm(f):
22 | headings = instance['headings']
23 | for topic in headings:
24 | topic_counts[topic.lower()] += 1
25 | 
26 | total_topk = 0
27 | topk_topics = []
28 | topk_counts = []
29 | for topic, count in topic_counts.most_common(15):
30 | topk_topics.append(topic)
31 | topk_counts.append(count)
32 | total_topk += count
33 | 
34 | other_count = sum(topic_counts.values()) - total_topk
35 | topk_topics.append('other')
36 | topk_counts.append(other_count)
37 | 
38 | for i in range(len(topk_counts)):
39 | topk_counts[i] /= 1000
40 | 
41 | plt.figure()
42 | fig, ax = plt.subplots()
43 | x = list(reversed(range(len(topk_counts))))
44 | ax.barh(x, topk_counts)
45 | ax.set_yticks(x)
46 | ax.set_yticklabels(topk_topics)
47 | ax.set_xlabel('Thousands of Occurrences')
48 | ax.set_title('Topic Frequencies')
49 | plt.tight_layout()
50 | plt.savefig(f'{args.output_dir}/topic-distribution.png', dpi=1000)
51 | 
52 | count_histogram = [0] * 10
53 | for count in topic_counts.values():
54 | if count >= 10:
55 | count_histogram[-1] += 1
56 | else:
57 | count_histogram[count - 1] += 1
58 | 
59 | plt.figure()
60 | fig, ax = plt.subplots()
61 | x = list(range(len(count_histogram)))
62 | labels = list(range(1, len(count_histogram))) + ['10+']
63 | ax.bar(x, count_histogram)
64 | ax.set_xticks(x)
65 | ax.set_xticklabels(labels)
66 | ax.set_xlabel('Number of Occurrences')
67 | ax.set_ylabel('Number of Topics')
68 | ax.set_title('Topic Frequency Histogram')
69 | plt.tight_layout()
70 | plt.savefig(f'{args.output_dir}/frequency-histogram.png', dpi=1000)
71 | 
72 | print('Total unique topics: ', len(topic_counts))
73 | 
74 | print('Sample unique topics')
75 | print('--------------------')
76 | for topic, _ in topic_counts.most_common()[-50:]:
77 | print(topic)
78 | 
79 | 
80 | if __name__ == '__main__':
81 | argp = argparse.ArgumentParser()
82 | argp.add_argument('input_jsonl', help='The WikiCite dataset to analyze')
83 | argp.add_argument('output_dir', help='The directory where the plot should be written')
84 | args = argp.parse_args()
85 | main(args)
86 | 
--------------------------------------------------------------------------------
/external/ROUGE-1.5.5/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | 
--------------------------------------------------------------------------------
/external/ROUGE-1.5.5/Readme.md:
--------------------------------------------------------------------------------
1 | # ROUGE-1.5.5
2 | Unfortunately due to licensing issues, we cannot release the original source
code to compute ROUGE.
3 | If you have a copy, place the contents of the ROUGE-1.5.5 directory here.
4 | 
--------------------------------------------------------------------------------
/external/meteor/.gitignore:
--------------------------------------------------------------------------------
1 | meteor-1.5
2 | 
--------------------------------------------------------------------------------
/external/meteor/Readme.md:
--------------------------------------------------------------------------------
1 | # Meteor
2 | [Meteor](https://www.cs.cmu.edu/~alavie/METEOR/) is an evaluation metric for machine translation that is commonly used in summarization.
3 | To set up Meteor, run the `setup.sh` script from the root of the repository.
4 | 
--------------------------------------------------------------------------------
/external/meteor/setup.sh:
--------------------------------------------------------------------------------
1 | wget https://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz -O external/meteor/meteor-1.5.tar.gz
2 | tar xzvf external/meteor/meteor-1.5.tar.gz -C external/meteor
3 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | allennlp==0.9.0
2 | enforce
3 | 
--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.6
2 | 
--------------------------------------------------------------------------------
/summarize/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/__init__.py
--------------------------------------------------------------------------------
/summarize/common/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.common.tempdir import TemporaryDirectory
2 | 
--------------------------------------------------------------------------------
/summarize/common/tempdir.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import tempfile
3 | from typing import Optional
4 | 
5 | 
6 | class TemporaryDirectory(object):
7 | """
8 | Creates a temporary directory that works with a context manager (the python
9 | ``with`` statement). This class was created because the user is responsible for
10 | deleting the directory created by ``tempfile.mkdtemp``. Instead, the context
11 | manager ensures the directory is deleted at the end.
12 | 
13 | Example usage::
14 | 
15 | with TemporaryDirectory() as temp_dir:
16 | with open(temp_dir + '/file.txt', 'w') as out:
17 | ...
18 | 
19 | Parameters
20 | ----------
21 | root: ``str``, optional (default = ``None``)
22 | The root directory where the temporary directory should be created. If ``None``,
23 | the ``tempfile.mkdtemp`` default location is used.
24 | persist: ``bool``, optional (default = False)
25 | Indicates whether or not the directory should persist on disk after the
26 | context closes.
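For example, ``TemporaryDirectory(persist=True)`` leaves the directory on
disk so its contents can be inspected after the ``with`` block exits.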
27 | """ 28 | def __init__(self, 29 | root: Optional[str] = None, 30 | persist: bool = False) -> None: 31 | self.root = root 32 | self.persist = persist 33 | 34 | def __enter__(self): 35 | self.path = tempfile.mkdtemp(dir=self.root) 36 | return self.path 37 | 38 | def __exit__(self, *args): 39 | if not self.persist: 40 | shutil.rmtree(self.path) 41 | -------------------------------------------------------------------------------- /summarize/common/testing.py: -------------------------------------------------------------------------------- 1 | FIXTURES_ROOT = 'summarize/tests/fixtures' 2 | -------------------------------------------------------------------------------- /summarize/common/util.py: -------------------------------------------------------------------------------- 1 | # These symbols are used when beginning- and end-of-sentence tags are required 2 | # in addition to START_SYMBOL and END_SYMBOL, which mark the starting and 3 | # ending of full sequences. 4 | SENT_START_SYMBOL = '@sent_start@' 5 | SENT_END_SYMBOL = '@sent_end@' 6 | 7 | # This symbol represents the copy token in the Pointer-Generator model 8 | COPY_SYMBOL = '@copy@' 9 | -------------------------------------------------------------------------------- /summarize/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/data/__init__.py -------------------------------------------------------------------------------- /summarize/data/dataset_readers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/data/dataset_readers/__init__.py -------------------------------------------------------------------------------- /summarize/data/dataset_readers/cloze/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.data.dataset_readers.cloze.abstractive import AbstractiveClozeDatasetReader 2 | from summarize.data.dataset_readers.cloze.extractive import ExtractiveClozeDatasetReader 3 | from summarize.data.dataset_readers.cloze.pointer_generator import PointerGeneratorClozeDatasetReader 4 | -------------------------------------------------------------------------------- /summarize/data/dataset_readers/sds/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.data.dataset_readers.sds.abstractive import AbstractiveDatasetReader 2 | from summarize.data.dataset_readers.sds.extractive import ExtractiveDatasetReader 3 | from summarize.data.dataset_readers.sds.pointer_generator import PointerGeneratorDatasetReader 4 | -------------------------------------------------------------------------------- /summarize/data/dataset_setup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/data/dataset_setup/__init__.py -------------------------------------------------------------------------------- /summarize/data/dataset_setup/deutsch2019.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from joblib import Parallel, delayed 3 | from tqdm import tqdm 4 | from typing import Dict, List 5 | 6 | from summarize.data.io import JsonlReader, 
JsonlWriter 7 | from summarize.metrics.python_rouge import PythonRouge 8 | from summarize.metrics.rouge import R1_RECALL 9 | from summarize.models.sds.oracle import get_greedy_oracle_summary 10 | 11 | _BATCH_SIZE = 100 12 | 13 | 14 | def _process_batch(parallel: Parallel, 15 | batch: List[Dict[str, List[str]]], 16 | python_rouge: PythonRouge, 17 | out: JsonlWriter) -> None: 18 | jobs = [] 19 | documents = [] 20 | for instance in batch: 21 | document = [sentence for document in instance['documents'] 22 | for paragraph in document['paragraphs'] 23 | for sentence in paragraph] 24 | cloze = instance['cloze'] 25 | job = delayed(get_greedy_oracle_summary)(document, [cloze], R1_RECALL, 26 | use_porter_stemmer=True, 27 | remove_stopwords=False, 28 | python_rouge=python_rouge) 29 | jobs.append(job) 30 | documents.append(document) 31 | 32 | results = parallel(jobs) 33 | for instance, document, (_, labels) in zip(batch, documents, results): 34 | id_ = instance['id'] 35 | page_title = instance['page_title'] 36 | headings = instance['headings'] 37 | topics = [page_title] + headings 38 | context = instance['left_context'] 39 | cloze = instance['cloze'] 40 | output_data = { 41 | 'id': id_, 42 | 'topics': topics, 43 | 'document': document, 44 | 'context': context, 45 | 'cloze': cloze, 46 | 'labels': labels 47 | } 48 | out.write(output_data) 49 | 50 | 51 | def main(args): 52 | python_rouge = PythonRouge() 53 | with JsonlWriter(args.output_jsonl) as out: 54 | with JsonlReader(args.input_jsonl) as f: 55 | with Parallel(n_jobs=args.num_cores) as parallel: 56 | batch = [] 57 | for instance in tqdm(f): 58 | batch.append(instance) 59 | if len(batch) == _BATCH_SIZE: 60 | _process_batch(parallel, batch, python_rouge, out) 61 | batch.clear() 62 | 63 | if batch: 64 | _process_batch(parallel, batch, python_rouge, out) 65 | 66 | 67 | if __name__ == '__main__': 68 | argp = argparse.ArgumentParser() 69 | argp.add_argument('input_jsonl', help='The input file to preprocess') 70 | argp.add_argument('output_jsonl', help='The output file') 71 | argp.add_argument('--num-cores', type=int, default=1, help='The number of cores to use') 72 | args = argp.parse_args() 73 | main(args) 74 | -------------------------------------------------------------------------------- /summarize/data/dataset_setup/kedzie2018.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prepares the datasets to reproduce Kedzie 2018 by computing greedy oracle 3 | summaries by optimizing ROUGE-1 recall. 
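The oracle is constructed greedily: sentences are added to the summary one at
a time, each time choosing the sentence that most improves ROUGE-1 recall
against the reference summary, until the ``max_tokens`` budget is reached.
The selected labels are written to each instance's ``labels`` field.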
4 | """ 5 | import argparse 6 | from joblib import Parallel, delayed 7 | from tqdm import tqdm 8 | from typing import Dict, List 9 | 10 | from summarize.data.io import JsonlReader, JsonlWriter 11 | from summarize.metrics.python_rouge import PythonRouge 12 | from summarize.metrics.rouge import R1_RECALL 13 | from summarize.models.sds.oracle import get_greedy_oracle_summary 14 | 15 | _BATCH_SIZE = 100 16 | 17 | 18 | def _process_batch(parallel: Parallel, 19 | batch: List[Dict[str, List[str]]], 20 | max_tokens: int, 21 | python_rouge: PythonRouge, 22 | out: JsonlWriter) -> None: 23 | jobs = [] 24 | for instance in batch: 25 | document = instance['document'] 26 | summary = instance['summary'] 27 | job = delayed(get_greedy_oracle_summary)(document, summary, 28 | R1_RECALL, 29 | max_tokens=max_tokens, 30 | use_porter_stemmer=True, 31 | remove_stopwords=True, 32 | python_rouge=python_rouge) 33 | jobs.append(job) 34 | 35 | results = parallel(jobs) 36 | for instance, (_, labels) in zip(batch, results): 37 | instance['labels'] = labels 38 | out.write(instance) 39 | 40 | 41 | def main(args): 42 | python_rouge = PythonRouge() 43 | with JsonlWriter(args.output_jsonl) as out: 44 | with JsonlReader(args.input_jsonl) as f: 45 | with Parallel(n_jobs=args.num_cores) as parallel: 46 | batch = [] 47 | for instance in tqdm(f): 48 | batch.append(instance) 49 | if len(batch) == _BATCH_SIZE: 50 | _process_batch(parallel, batch, args.max_tokens, python_rouge, out) 51 | batch.clear() 52 | 53 | if batch: 54 | _process_batch(parallel, batch, args.max_tokens, python_rouge, out) 55 | 56 | 57 | if __name__ == '__main__': 58 | argp = argparse.ArgumentParser() 59 | argp.add_argument('input_jsonl', help='The dataset to setup') 60 | argp.add_argument('output_jsonl', help='The output file') 61 | argp.add_argument('max_tokens', type=int, help='The maximum number of tokens to take in the greedy summary') 62 | argp.add_argument('--num-cores', type=int, default=1, help='The number of cores to use') 63 | args = argp.parse_args() 64 | main(args) 65 | -------------------------------------------------------------------------------- /summarize/data/dataset_setup/tokenize.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tokenizes fields in a jsonl dataset file with the English spacy tokenizer. 3 | """ 4 | import argparse 5 | import nltk 6 | import spacy 7 | from tqdm import tqdm 8 | from typing import Callable, Iterable, T 9 | 10 | from summarize.data.io import JsonlReader, JsonlWriter 11 | 12 | 13 | def tokenize(tokenize_func: Callable[[str], Iterable[T]], field): 14 | """ 15 | Tokenizes text using the a tokenizer function. The ``field`` argument can be 16 | a string or a nested list of strings. The method will return the same level of nesting 17 | with the tokens whitespace separated in a string. 18 | 19 | The ``tokenize_func`` should be some function which returns iterable of tokens 20 | which can be cast to strings. For example, the ``nlp`` object from spacy or 21 | the ``word_tokenize`` function from nltk both work. 22 | 23 | For example:: 24 | 25 | nlp = spacy.load('en') 26 | tokenize(nlp, "Hi, I'm Dan.") 27 | >>> "Hi , I 'm Dan ." 28 | tokenize(nlp, [['The first.', 'The second.'], 'The third.']) 29 | >>> [['The first .', 'The second .'], 'The third .'] 30 | 31 | from nltk import word_tokenize 32 | tokenize(word_tokenize, 'This is the NLTK version.') 33 | >>> 'This is the NLTK version .' 
34 | 35 | Parameters 36 | ---------- 37 | tokenize_func: ``Callable[[str], Iterable[T]]``, required. 38 | The tokenization function. See above for a more detailed explanation. 39 | field: required. 40 | The text to tokenize. See above for the type explanation. 41 | 42 | Returns 43 | ------- 44 | The tokenized text. 45 | """ 46 | if isinstance(field, str): 47 | return ' '.join([str(token) for token in tokenize_func(field)]) 48 | elif isinstance(field, list): 49 | return [tokenize(tokenize_func, item) for item in field] 50 | else: 51 | raise TypeError(f'Unknown ``field`` type {type(field)}') 52 | 53 | 54 | def main(args): 55 | if args.backend == 'spacy': 56 | nlp = spacy.load('en', disable=['tagger', 'parser', 'ner']) 57 | elif args.backend == 'nltk': 58 | nlp = nltk.word_tokenize 59 | 60 | with JsonlWriter(args.output_file) as out: 61 | with JsonlReader(args.input_file) as f: 62 | for instance in tqdm(f, desc=f'Tokenizing {args.input_file}'): 63 | for field in args.fields: 64 | instance[field] = tokenize(nlp, instance[field]) 65 | out.write(instance) 66 | 67 | 68 | if __name__ == '__main__': 69 | argp = argparse.ArgumentParser() 70 | argp.add_argument('input_file', help='The jsonl file with fields to tokenize') 71 | argp.add_argument('output_file', help='The output jsonl file with the tokenized data') 72 | argp.add_argument('fields', nargs='+') 73 | argp.add_argument('--backend', default='spacy', choices=['spacy', 'nltk'], 74 | help='Indicates which library should be used for tokenization') 75 | args = argp.parse_args() 76 | main(args) 77 | -------------------------------------------------------------------------------- /summarize/data/dataset_setup/wikicite.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocesses the original WikiCite dataset by tokenizing all of the text fields. 
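Specifically, the section ``headings``, each document's ``title`` and
``paragraphs``, and the ``left_context``, ``cloze``, and ``right_context``
fields are tokenized.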
3 | """ 4 | import argparse 5 | import spacy 6 | from tqdm import tqdm 7 | 8 | from summarize.data.dataset_setup.tokenize import tokenize 9 | from summarize.data.io import JsonlReader, JsonlWriter 10 | 11 | 12 | def main(args): 13 | nlp = spacy.load('en', disable=['tagger', 'parser', 'ner']) 14 | 15 | with JsonlWriter(args.output_jsonl) as out: 16 | with JsonlReader(args.input_jsonl) as f: 17 | for instance in tqdm(f): 18 | instance['headings'] = [tokenize(nlp, heading) for heading in instance['headings']] 19 | for document in instance['documents']: 20 | if document['title']: 21 | document['title'] = tokenize(nlp, document['title']) 22 | document['paragraphs'] = tokenize(nlp, document['paragraphs']) 23 | 24 | instance['left_context'] = tokenize(nlp, instance['left_context']) 25 | instance['cloze'] = tokenize(nlp, instance['cloze']) 26 | instance['right_context'] = tokenize(nlp, instance['right_context']) 27 | out.write(instance) 28 | 29 | 30 | if __name__ == '__main__': 31 | argp = argparse.ArgumentParser() 32 | argp.add_argument('input_jsonl', help='The input file to setup') 33 | argp.add_argument('output_jsonl', help='The output file') 34 | args = argp.parse_args() 35 | main(args) 36 | -------------------------------------------------------------------------------- /summarize/data/dataset_stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/data/dataset_stats/__init__.py -------------------------------------------------------------------------------- /summarize/data/io/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.data.io.jsonl_reader import JsonlReader 2 | from summarize.data.io.jsonl_writer import JsonlWriter 3 | -------------------------------------------------------------------------------- /summarize/data/io/jsonl_reader.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import gzip 3 | import json 4 | from allennlp.common.file_utils import cached_path 5 | from typing import Any, List 6 | 7 | from summarize.data.io.util import is_gz_file 8 | 9 | 10 | class JsonlReader(object): 11 | """ 12 | The `JsonlReader` is a layer of abstraction around reading serialized 13 | objects from a jsonl file. The reader will automatically deserialize and return 14 | one object from each line in the file. The data in the file will be decoded 15 | from a binary file depending on the extension of the file name. Current 16 | supported binary formats are gzip (``.gz``) and bz2 (``.bz2``). For gzip only, 17 | this will also inspect the file to see if it's gzipped in addition to checking 18 | the extension. 19 | 20 | The class should be used the same way that a built-in file handler works:: 21 | 22 | with JsonlReader('/path/to/file.jsonl.gz') as f: 23 | for data in f: 24 | ... 25 | 26 | The class uses the cached path functionality from AllenNLP, so it is also 27 | possible to pass a url to the constructor. 28 | 29 | Parameters 30 | ---------- 31 | file_path: ``str`` 32 | The path to the file where the data should be read. 
33 | """ 34 | def __init__(self, file_path: str) -> None: 35 | self.file_path = cached_path(file_path) 36 | 37 | def __enter__(self): 38 | self.binary = False 39 | if self.file_path.endswith('.gz') or is_gz_file(self.file_path): 40 | self.file_handler = gzip.open(self.file_path, 'rb') 41 | self.binary = True 42 | elif self.file_path.endswith('.bz2'): 43 | self.file_handler = bz2.open(self.file_path, 'rb') 44 | self.binary = True 45 | else: 46 | self.file_handler = open(self.file_path, 'r') 47 | self.binary = False 48 | return self 49 | 50 | def __iter__(self): 51 | return self 52 | 53 | def __next__(self) -> Any: 54 | for line in self.file_handler: 55 | if self.binary: 56 | line = line.decode() 57 | return json.loads(line) 58 | raise StopIteration 59 | 60 | def __exit__(self, *args): 61 | self.file_handler.close() 62 | 63 | def read(self) -> List[Any]: 64 | """Reads all of the instances into a list.""" 65 | with self: 66 | return [instance for instance in self] 67 | -------------------------------------------------------------------------------- /summarize/data/io/jsonl_writer.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import gzip 3 | import json 4 | import os 5 | from typing import Any 6 | 7 | 8 | class JsonlWriter(object): 9 | """ 10 | The ``JsonlWriter`` is a layer of abstraction around writing data to jsonl 11 | files. The writer will automatically serialize the input objects into json 12 | strings, then write them to an output file, one object per line. The data 13 | will be written as plain text or as bytes, depending on the extension of 14 | the output file. Current supported binary formats are gzip (``.gz``) and 15 | bz2 (``.bz2``). All other extensions will use plain text. 16 | 17 | The class should be used the same way that a built-in file handler works:: 18 | 19 | with JsonlWriter('/path/to/file.jsonl.gz') as out: 20 | data = ... # some data to serialize 21 | out.write(data) 22 | 23 | Parameters 24 | ---------- 25 | file_path: ``str`` 26 | The path to the file where the data should be written. 27 | """ 28 | def __init__(self, file_path: str) -> None: 29 | self.file_path = file_path 30 | 31 | def __enter__(self): 32 | dirname = os.path.dirname(self.file_path) 33 | if dirname: 34 | os.makedirs(dirname, exist_ok=True) 35 | if self.file_path.endswith('.gz'): 36 | self.file_handler = gzip.open(self.file_path, 'wb') 37 | self.binary = True 38 | elif self.file_path.endswith('.bz2'): 39 | self.file_handler = bz2.open(self.file_path, 'wb') 40 | self.binary = True 41 | else: 42 | self.file_handler = open(self.file_path, 'w') 43 | self.binary = False 44 | return self 45 | 46 | def write(self, object: Any) -> None: 47 | """ 48 | Serializes the input object to a json string and writes it to the file. 49 | 50 | Parameters 51 | ---------- 52 | object: ``Any`` 53 | The object to write to the file. 54 | """ 55 | string = json.dumps(object) 56 | if self.binary: 57 | self.file_handler.write(string.encode() + b'\n') 58 | else: 59 | self.file_handler.write(string + '\n') 60 | 61 | def __exit__(self, *args): 62 | self.file_handler.close() 63 | -------------------------------------------------------------------------------- /summarize/data/io/util.py: -------------------------------------------------------------------------------- 1 | import binascii 2 | 3 | 4 | def is_gz_file(file_path: str): 5 | """ 6 | Tests to see if a file is gzipped or not. 
This was taken from 7 | https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed 8 | 9 | Returns 10 | ------- 11 | True if it is gzipped, False otherwise. 12 | """ 13 | with open(file_path, 'rb') as f: 14 | return binascii.hexlify(f.read(2)) == b'1f8b' 15 | -------------------------------------------------------------------------------- /summarize/data/paragraph_tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.data.paragraph_tokenizers.paragraph_tokenizer import ParagraphTokenizer 2 | from summarize.data.paragraph_tokenizers.paragraph_word_tokenizer import ParagraphWordTokenizer 3 | -------------------------------------------------------------------------------- /summarize/data/paragraph_tokenizers/paragraph_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from allennlp.common import Registrable 4 | from allennlp.data.tokenizers.token import Token 5 | 6 | 7 | class ParagraphTokenizer(Registrable): 8 | """ 9 | A ``ParagraphTokenizer`` is a wrapper around an AllenNLP ``Tokenizer`` for tokenizing 10 | a list of strings into tokens. The primary use is for tokenizing a pre-sentence-split 11 | paragraph into a single list of tokens. Having this abstraction at the paragraph-level 12 | allows for additional functionality, like adding tokens in between the sentences. 13 | """ 14 | def tokenize(self, texts: List[str]) -> List[Token]: 15 | """ 16 | Actually implements splitting sentences into tokens. 17 | 18 | Returns 19 | ------- 20 | tokens : ``List[Token]`` 21 | """ 22 | raise NotImplementedError 23 | -------------------------------------------------------------------------------- /summarize/data/paragraph_tokenizers/paragraph_word_tokenizer.py: -------------------------------------------------------------------------------- 1 | from allennlp.data.tokenizers import Token, WordTokenizer 2 | from allennlp.data.tokenizers.word_filter import WordFilter, PassThroughWordFilter 3 | from allennlp.data.tokenizers.word_splitter import WordSplitter 4 | from allennlp.data.tokenizers.word_stemmer import WordStemmer, PassThroughWordStemmer 5 | from overrides import overrides 6 | from typing import List 7 | 8 | from summarize.data.paragraph_tokenizers import ParagraphTokenizer 9 | 10 | 11 | @ParagraphTokenizer.register('word') 12 | class ParagraphWordTokenizer(ParagraphTokenizer): 13 | """ 14 | A ``ParagraphWordTokenizer`` is a wrapper around the ``WordTokenizer`` at the 15 | paragraph-level. It includes the ability to insert tokens in between the 16 | sentence tokens. 17 | 18 | Parameters 19 | ---------- 20 | word_splitter: ``WordSplitter``, optional (default = ``None``) 21 | See ``WordTokenizer`` 22 | word_filter: ``WordFilter``, optional (default = ``PassThroughWordFilter()``) 23 | See ``WordTokenizer`` 24 | word_stemmer: ``WordStemmer``, optional (default = ``PassThroughWordStemmer()``) 25 | See ``WordTokenizer`` 26 | start_tokens: ``List[str]``, optional (default = ``[]``) 27 | See ``WordTokenizer`` 28 | end_tokens: ``List[str]``, optional (default = ``[]``) 29 | See ``WordTokenizer`` 30 | in_between_tokens: ``List[str]``, optional (default = ``[]``) 31 | The tokens to insert in between sentences. 
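Example (a sketch of the intended behavior; the returned values are
``Token`` objects, rendered here as plain strings)::

    tokenizer = ParagraphWordTokenizer(in_between_tokens=['@sent_end@'])
    tokenizer.tokenize(['First sentence.', 'Second sentence.'])
    >>> ['First', 'sentence', '.', '@sent_end@', 'Second', 'sentence', '.']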
32 | """ 33 | def __init__(self, 34 | word_splitter: WordSplitter = None, 35 | word_filter: WordFilter = PassThroughWordFilter(), 36 | word_stemmer: WordStemmer = PassThroughWordStemmer(), 37 | start_tokens: List[str] = None, 38 | end_tokens: List[str] = None, 39 | in_between_tokens: List[str] = None): 40 | self.tokenizer = WordTokenizer(word_splitter=word_splitter, 41 | word_filter=word_filter, 42 | word_stemmer=word_stemmer) 43 | self.start_tokens = start_tokens or [] 44 | self.start_tokens = [Token(token) for token in self.start_tokens] 45 | self.end_tokens = end_tokens or [] 46 | self.end_tokens = [Token(token) for token in self.end_tokens] 47 | self.in_between_tokens = in_between_tokens or [] 48 | self.in_between_tokens = [Token(token) for token in self.in_between_tokens] 49 | 50 | @overrides 51 | def tokenize(self, texts: List[str]) -> List[Token]: 52 | tokenized_texts = [self.tokenizer.tokenize(text) for text in texts] 53 | tokens = [] 54 | if self.start_tokens: 55 | tokens.extend(self.start_tokens) 56 | for i, tokenized_text in enumerate(tokenized_texts): 57 | tokens.extend(tokenized_text) 58 | 59 | # Add the in-between tokens if this is not the last sentence 60 | if i != len(tokenized_texts) - 1: 61 | tokens.extend(self.in_between_tokens) 62 | if self.end_tokens: 63 | tokens.extend(self.end_tokens) 64 | return tokens 65 | -------------------------------------------------------------------------------- /summarize/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/metrics/__init__.py -------------------------------------------------------------------------------- /summarize/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/models/__init__.py -------------------------------------------------------------------------------- /summarize/models/cloze/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.models.cloze.extractive_baseline import ClozeExtractiveBaselineModel 2 | from summarize.models.cloze.pointer_generator import ClozePointerGeneratorModel 3 | from summarize.models.cloze.seq2seq import ClozeSeq2SeqModel 4 | from summarize.models.cloze.open_ai_language_model import OpenAILanguageModel 5 | -------------------------------------------------------------------------------- /summarize/models/cloze/bm25/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/models/cloze/bm25/__init__.py -------------------------------------------------------------------------------- /summarize/models/cloze/bm25/calculate_df.py: -------------------------------------------------------------------------------- 1 | """ 2 | Computes the document-frequency term for calculating BM25. The model will consider 3 | the cloze context as the query and the reference document sentences as the 4 | documents that need to be ranked. Therefore, one sentence is a "document" in the 5 | BM25 equation, and thus the document frequencies should be based on the document 6 | sentences. 
7 | """ 8 | import argparse 9 | from collections import Counter 10 | from tqdm import tqdm 11 | 12 | from summarize.data.io import JsonlReader, JsonlWriter 13 | 14 | 15 | def main(args): 16 | dfs = Counter() 17 | total_document_length = 0 18 | num_documents = 0 19 | 20 | with JsonlReader(args.input_jsonl) as f: 21 | for instance in tqdm(f, desc='Calculating document frequencies'): 22 | document = instance['document'] 23 | for sentence in document: 24 | tokens = sentence.lower().split() 25 | total_document_length += len(tokens) 26 | num_documents += 1 27 | for token in set(tokens): 28 | dfs[token] += 1 29 | 30 | average_document_length = total_document_length / num_documents 31 | with JsonlWriter(args.output_jsonl) as out: 32 | out.write({'num_documents': num_documents, 'average_document_length': average_document_length}) 33 | for token, df in dfs.items(): 34 | out.write({'token': token, 'df': df}) 35 | 36 | 37 | if __name__ == '__main__': 38 | argp = argparse.ArgumentParser() 39 | argp.add_argument('input_jsonl') 40 | argp.add_argument('output_jsonl') 41 | args = argp.parse_args() 42 | main(args) 43 | -------------------------------------------------------------------------------- /summarize/models/cloze/lead.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from summarize.data.io import JsonlReader, JsonlWriter 4 | from summarize.models.sds.lead import get_lead_summary 5 | 6 | 7 | def main(args): 8 | with JsonlWriter(args.output_jsonl) as out: 9 | with JsonlReader(args.input_jsonl) as f: 10 | for instance in f: 11 | document = instance['document'] 12 | cloze = get_lead_summary(document, 13 | max_sentences=args.max_sentences, 14 | max_tokens=args.max_tokens, 15 | max_bytes=args.max_bytes) 16 | if not args.keep_sentences: 17 | cloze = ' '.join(cloze) 18 | out.write({args.field_name: cloze}) 19 | 20 | 21 | if __name__ == '__main__': 22 | argp = argparse.ArgumentParser() 23 | argp.add_argument('input_jsonl', help='The input documents') 24 | argp.add_argument('output_jsonl', help='The output file') 25 | argp.add_argument('--max-sentences', type=int, help='The number of sentences to take') 26 | argp.add_argument('--max-tokens', type=int, help='The number of tokens to take') 27 | argp.add_argument('--max-bytes', type=int, help='The number of bytes to take') 28 | argp.add_argument('--field-name', default='cloze', help='The name of the output field') 29 | argp.add_argument('--keep-sentences', action='store_true', help='Indicates if the output field should be left as sentences or flattened') 30 | args = argp.parse_args() 31 | main(args) 32 | -------------------------------------------------------------------------------- /summarize/models/sds/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.models.sds.extractive_baseline import ExtractiveBaselineModel 2 | from summarize.models.sds.pointer_generator import PointerGeneratorModel 3 | from summarize.models.sds.seq2seq import Seq2SeqModel 4 | -------------------------------------------------------------------------------- /summarize/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/modules/__init__.py -------------------------------------------------------------------------------- /summarize/modules/coverage_matrix_attention/__init__.py: 
-------------------------------------------------------------------------------- 1 | from summarize.modules.coverage_matrix_attention.coverage_matrix_attention import CoverageMatrixAttention 2 | from summarize.modules.coverage_matrix_attention.matrix_attention_wrapper import MatrixAttentionWrapper 3 | from summarize.modules.coverage_matrix_attention.mlp import MLPCoverageAttention 4 | -------------------------------------------------------------------------------- /summarize/modules/coverage_matrix_attention/coverage_matrix_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from allennlp.common.registrable import Registrable 3 | from typing import Tuple 4 | 5 | 6 | class CoverageMatrixAttention(torch.nn.Module, Registrable): 7 | """ 8 | The ``CoverageMatrixAttention`` computes a matrix of attention probabilities 9 | between the encoder and decoder outputs. The attention function has access 10 | to the cumulative probabilities that the attention has assigned to each 11 | input token previously. In addition to the attention probabilities, the function 12 | should return the coverage vectors which were used to compute the distribution 13 | at each time step as well as the new coverage vector which takes into account 14 | the function's computation. 15 | 16 | The module must compute the probabilities instead of the raw scores (like 17 | the ``MatrixAttention`` module does) because the coverage vector contains 18 | the accumulated probabilities. 19 | """ 20 | def forward(self, 21 | decoder_outputs: torch.Tensor, 22 | encoder_outputs: torch.Tensor, 23 | encoder_mask: torch.Tensor, 24 | coverage_vector: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 25 | """ 26 | Computes a matrix of attention scores and updates the coverage vector. 27 | 28 | Parameters 29 | ---------- 30 | decoder_outputs: (batch_size, num_decoder_tokens, hidden_dim) 31 | The decoder's outputs. 32 | encoder_outputs: (batch_size, num_encoder_tokens, hidden_dim) 33 | The encoder's outputs. 34 | encoder_mask: (batch_size, num_encoder_tokens) 35 | The encoder token mask. 36 | coverage_vector: (batch_size, num_encoder_tokens) 37 | The cumulative attention probability assigned to each input token 38 | thus far. 39 | 40 | Returns 41 | ------- 42 | torch.Tensor: (batch_size, num_decoder_tokens, num_encoder_tokens) 43 | The attention probabilities between each pair of decoder and encoder hidden representations. 44 | torch.Tensor: (batch_size, num_decoder_tokens, num_encoder_tokens) 45 | The coverage vectors used to compute the corresponding attention probabilities.
46 | torch.Tensor: (batch_size, num_encoder_tokens) 47 | The latest coverage vector after computing the attention probabilities. 48 | """ 49 | raise NotImplementedError 50 | -------------------------------------------------------------------------------- /summarize/modules/coverage_matrix_attention/matrix_attention_wrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from allennlp.modules.matrix_attention import MatrixAttention 3 | from allennlp.nn.util import masked_softmax 4 | from overrides import overrides 5 | from typing import Tuple 6 | 7 | from summarize.modules.coverage_matrix_attention import CoverageMatrixAttention 8 | 9 | 10 | @CoverageMatrixAttention.register('matrix-attention') 11 | class MatrixAttentionWrapper(CoverageMatrixAttention): 12 | """ 13 | Wraps the ``MatrixAttention`` module from AllenNLP so the attention functions 14 | which do not use coverage can implement the ``CoverageMatrixAttention`` module 15 | interface. 16 | 17 | Parameters 18 | ---------- 19 | matrix_attention: ``MatrixAttention`` 20 | The underlying ``MatrixAttention`` to use. 21 | """ 22 | def __init__(self, matrix_attention: MatrixAttention) -> None: 23 | super().__init__() 24 | self.matrix_attention = matrix_attention 25 | 26 | @overrides 27 | def forward(self, 28 | decoder_outputs: torch.Tensor, 29 | encoder_outputs: torch.Tensor, 30 | encoder_mask: torch.Tensor, 31 | coverage_vector: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 32 | # shape: (batch_size, num_summary_tokens, num_document_tokens) 33 | affinities = self.matrix_attention(decoder_outputs, encoder_outputs) 34 | # shape: (batch_size, num_summary_tokens, num_document_tokens) 35 | probabilities = masked_softmax(affinities, encoder_mask) 36 | 37 | # Create dummy coverage vectors to return 38 | batch_size, num_summary_tokens, num_document_tokens = affinities.size() 39 | # shape: (batch_size, num_summary_tokens, num_document_tokens) 40 | coverage_vectors = coverage_vector.new_zeros(batch_size, num_summary_tokens, num_document_tokens) 41 | # shape: (batch_size, num_document_tokens) 42 | coverage_vector = coverage_vector.new_zeros(batch_size, num_document_tokens) 43 | 44 | return probabilities, coverage_vectors, coverage_vector 45 | -------------------------------------------------------------------------------- /summarize/modules/generate_probability_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.modules.generate_probability_functions.generate_probability_function import GenerateProbabilityFunction 2 | from summarize.modules.generate_probability_functions.onmt import ONMTGenerateProbabilityFunction 3 | from summarize.modules.generate_probability_functions.see2017 import See2017GenerateProbabilityFunction 4 | -------------------------------------------------------------------------------- /summarize/modules/generate_probability_functions/generate_probability_function.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from allennlp.common.registrable import Registrable 3 | 4 | 5 | class GenerateProbabilityFunction(torch.nn.Module, Registrable): 6 | def forward(self, 7 | input_embeddings: torch.Tensor, 8 | pre_attention_decoder_outputs: torch.Tensor, 9 | post_attention_decoder_outputs: torch.Tensor, 10 | attention_context: torch.Tensor) -> torch.Tensor: 11 | """ 12 | Computes the probability of generating a token, the soft switch from 13 | See et al. (2017).
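In the pointer-generator model, this probability (often written p_gen) interpolates between generating from the vocabulary and copying from the source via the attention distribution a: P(w) = p_gen * P_vocab(w) + (1 - p_gen) * sum over {i : w_i = w} of a_i (equation 9 of See et al., 2017).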
14 | 15 | Parameters 16 | ---------- 17 | input_embeddings: (batch_size, num_summary_tokens, embedding_dim) 18 | The embeddings which are passed as input to the decoder. 19 | pre_attention_decoder_outputs: (batch_size, num_summary_tokens, hidden_dim) 20 | The direct output from the decoder, which does not include any attention. 21 | post_attention_decoder_outputs: (batch_size, num_summary_tokens, hidden_dim) 22 | The output of the decoder after attention has been included. 23 | attention_context: (batch_size, num_summary_tokens, encoder_hidden_dim) 24 | The attention context (the weighted average of the encoder hidden states 25 | based on the attention distribution). 26 | 27 | Returns 28 | ------- 29 | (batch_size, num_summary_tokens): 30 | The generation probability. 31 | """ 32 | raise NotImplementedError 33 | -------------------------------------------------------------------------------- /summarize/modules/generate_probability_functions/onmt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from overrides import overrides 3 | 4 | from summarize.modules.generate_probability_functions import GenerateProbabilityFunction 5 | 6 | 7 | @GenerateProbabilityFunction.register('onmt') 8 | class ONMTGenerateProbabilityFunction(GenerateProbabilityFunction): 9 | """ 10 | Computes the generation probability according to the function used by the 11 | OpenNMT framework. The probability is a function of only the final decoder 12 | hidden states (with attention). 13 | 14 | Parameters 15 | ---------- 16 | decoder_dim: ``int`` 17 | The size of the decoder's hidden state. 18 | """ 19 | def __init__(self, decoder_dim: int) -> None: 20 | super().__init__() 21 | self.hidden_layer = torch.nn.Linear(decoder_dim, 1) 22 | 23 | @overrides 24 | def forward(self, 25 | input_embeddings: torch.Tensor, 26 | pre_attention_decoder_outputs: torch.Tensor, 27 | post_attention_decoder_outputs: torch.Tensor, 28 | attention_context: torch.Tensor) -> torch.Tensor: 29 | # shape: (batch_size, num_summary_tokens) 30 | return torch.sigmoid(self.hidden_layer(post_attention_decoder_outputs).squeeze(2)) 31 | -------------------------------------------------------------------------------- /summarize/modules/generate_probability_functions/see2017.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from overrides import overrides 3 | 4 | from summarize.modules.generate_probability_functions import GenerateProbabilityFunction 5 | 6 | 7 | @GenerateProbabilityFunction.register('see2017') 8 | class See2017GenerateProbabilityFunction(GenerateProbabilityFunction): 9 | """ 10 | Computes the generation probability according to See et al. (2017). The probability 11 | is a linear function of the input embedding, the output from the decoder (without attention), 12 | and the attention context vector. 13 | 14 | Parameters 15 | ---------- 16 | embedding_dim: ``int`` 17 | The size of the input embeddings to the decoder. 18 | encoder_dim: ``int`` 19 | The size of the encoder's hidden state. 20 | decoder_dim: ``int`` 21 | The size of the decoder's hidden state.
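Concretely, this corresponds to equation 8 of See et al. (2017): p_gen = sigmoid(w_c^T h_t^* + w_s^T s_t + w_x^T x_t + b), where h_t^* is ``attention_context``, s_t is ``pre_attention_decoder_outputs``, x_t is ``input_embeddings``, and each w is a learned linear projection.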
22 | """ 23 | def __init__(self, embedding_dim: int, encoder_dim: int, decoder_dim: int) -> None: 24 | super().__init__() 25 | self.input_layer = torch.nn.Linear(embedding_dim, 1) 26 | self.hidden_layer = torch.nn.Linear(decoder_dim, 1) 27 | self.context_layer = torch.nn.Linear(encoder_dim, 1) 28 | 29 | @overrides 30 | def forward(self, 31 | input_embeddings: torch.Tensor, 32 | pre_attention_decoder_outputs: torch.Tensor, 33 | post_attention_decoder_outputs: torch.Tensor, 34 | attention_context: torch.Tensor) -> torch.Tensor: 35 | # shape: (batch_size, num_summary_tokens) 36 | input_score = self.input_layer(input_embeddings).squeeze(2) 37 | # shape: (batch_size, num_summary_tokens) 38 | hidden_score = self.hidden_layer(pre_attention_decoder_outputs).squeeze(2) 39 | # shape: (batch_size, num_summary_tokens) 40 | context_score = self.context_layer(attention_context).squeeze(2) 41 | # shape: (batch_size, num_summary_tokens) 42 | probability = torch.sigmoid(context_score + hidden_score + input_score) 43 | 44 | # In my experience, the generation probability can sometimes be equal 45 | # to 1.0 or 0.0 (with really large/small scores) even with reasonably sized 46 | # parameter values. This causes problems with the log which is called 47 | # later on. Therefore, we move the probability closer to 0.5 by a small 48 | # number for stability. 49 | # shape: (batch_size, num_summary_tokens) 50 | geq_one_half_mask = (probability >= 0.5).float() 51 | # shape: (batch_size, num_summary_tokens) 52 | probability = (probability - 1e-3) * (geq_one_half_mask) + (probability + 1e-3) * (1 - geq_one_half_mask) 53 | return probability 54 | -------------------------------------------------------------------------------- /summarize/modules/matrix_attention/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.modules.matrix_attention.mlp import MLPAttention 2 | -------------------------------------------------------------------------------- /summarize/modules/matrix_attention/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from allennlp.modules.matrix_attention import MatrixAttention 3 | from overrides import overrides 4 | 5 | 6 | @MatrixAttention.register('mlp') 7 | class MLPAttention(MatrixAttention): 8 | """ 9 | An implementation of the "concat" attention from the arvix version of 10 | Luong et al. (2015) (https://arxiv.org/pdf/1508.04025.pdf). For some reason, 11 | the "concat" attention is different in the version in the ACL Anthology. 12 | 13 | Parameters 14 | ---------- 15 | encoder_size: ``int`` 16 | The size of the encoder hidden states. 17 | decoder_size: ``int`` 18 | The size of the decoder hidden states. 19 | attention_size: ``int`` 20 | The size of the intermediate attention hidden size. 
21 | """ 22 | def __init__(self, 23 | encoder_size: int, 24 | decoder_size: int, 25 | attention_size: int) -> None: 26 | super().__init__() 27 | self.linear_context = torch.nn.Linear(encoder_size, attention_size, bias=False) 28 | self.linear_query = torch.nn.Linear(decoder_size, attention_size, bias=True) 29 | self.v = torch.nn.Linear(attention_size, 1, bias=False) 30 | 31 | @overrides 32 | def forward(self, 33 | decoder_outputs: torch.Tensor, 34 | encoder_outputs: torch.Tensor) -> torch.Tensor: 35 | """ 36 | Parameters 37 | ---------- 38 | decoder_outputs: ``torch.Tensor``, ``(batch_size, num_summary_tokens, decoder_size)`` 39 | The decoder outputs 40 | encoder_outputs: ``torch.Tensor``, ``(batch_size, num_document_tokens, encoder_size)`` 41 | 42 | Returns 43 | ------- 44 | A ``(batch_size, num_summary_tokens, num_document_tokens)``-sized tensor with the 45 | unnormalized attention scores. 46 | """ 47 | num_decoder_tokens = decoder_outputs.size(1) 48 | num_encoder_tokens = encoder_outputs.size(1) 49 | 50 | # shape: (batch_size, num_summary_tokens, 1, decoder_size) 51 | decoder_outputs = decoder_outputs.unsqueeze(2) 52 | # shape: (batch_size, 1, num_document_tokens, encoder_size) 53 | encoder_outputs = encoder_outputs.unsqueeze(1) 54 | 55 | # shape: (batch_size, num_summary_tokens, 1, attention_size) 56 | decoder_projection = self.linear_query(decoder_outputs) 57 | # shape: (batch_size, 1, num_document_tokens, attention_size) 58 | encoder_projection = self.linear_context(encoder_outputs) 59 | 60 | # shape: (batch_size, num_summary_tokens, num_document_tokens, attention_size) 61 | decoder_projection = decoder_projection.expand(-1, -1, num_encoder_tokens, -1) 62 | # shape: (batch_size, num_summary_tokens, num_document_tokens, attention_size) 63 | encoder_projection = encoder_projection.expand(-1, num_decoder_tokens, -1, -1) 64 | 65 | # shape: (batch_size, num_summary_tokens, num_document_tokens) 66 | affinities = self.v(torch.tanh(decoder_projection + encoder_projection)).squeeze(-1) 67 | return affinities 68 | -------------------------------------------------------------------------------- /summarize/modules/rnns/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.modules.rnns.rnn import RNN 2 | from summarize.modules.rnns.lstm import LSTM 3 | from summarize.modules.rnns.gru import GRU 4 | -------------------------------------------------------------------------------- /summarize/modules/rnns/gru.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from overrides import overrides 3 | 4 | from summarize.modules.rnns import RNN 5 | 6 | 7 | @RNN.register('gru') 8 | class GRU(RNN): 9 | """ 10 | A wrapper around the ``torch.nn.GRU`` module. 11 | 12 | Parameters 13 | ---------- 14 | input_size: ``int``, required 15 | The size of the input dimension. 16 | hidden_size: ``hidden_size``, required 17 | The size of the hidden dimension. If bidirectional, each direction will 18 | be this hidden size. 19 | num_layers: ``num_layers``, required 20 | The number of layers. 21 | bidirectional: ``bool``, required 22 | Indicates if the RNN is bidirectional or not. 23 | dropout: ``float``, optional (default = ``0.0``) 24 | The dropout parameter in between RNN layers. 
25 | """ 26 | def __init__(self, 27 | input_size: int, 28 | hidden_size: int, 29 | num_layers: int = 1, 30 | bidirectional: bool = False, 31 | dropout: float = 0.0) -> None: 32 | rnn = torch.nn.GRU(input_size, hidden_size, 33 | bidirectional=bidirectional, 34 | batch_first=True, 35 | num_layers=num_layers, 36 | dropout=dropout) 37 | super().__init__(input_size, hidden_size, num_layers, bidirectional, rnn) 38 | 39 | @overrides 40 | def has_memory(self) -> bool: 41 | return False 42 | 43 | @overrides 44 | def reshape_hidden_for_decoder(self, hidden: torch.Tensor) -> torch.Tensor: 45 | if self.num_layers != 1: 46 | # Not entirely sure what to do here. AllenNLP just returns the last 47 | # layer, but I don't know if that's correct. 48 | raise NotImplementedError 49 | 50 | num_directions = 2 if self.bidirectional else 1 51 | batch_size = hidden.size(1) 52 | 53 | # Separate the layers from the number of directions 54 | # shape: (num_layers, num_directions, batch_size, hidden_size) 55 | hidden = hidden.view(self.num_layers, num_directions, batch_size, self.hidden_size) 56 | 57 | # If this is uni-directional, then we can remove the directions 58 | # dimension and return 59 | if num_directions == 1: 60 | # shape: (1, batch_size, hidden_size) 61 | hidden = hidden.squeeze(0) 62 | return hidden 63 | else: 64 | # Otherwise, we have to concatenate the two directions into one vector 65 | # shape: (num_layers, batch_size, hidden_size * 2) 66 | hidden = torch.cat([hidden[:, 0, :, :], hidden[:, 1, :, :]], dim=2) 67 | return hidden 68 | -------------------------------------------------------------------------------- /summarize/modules/sentence_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.modules.sentence_extractors.sentence_extractor import SentenceExtractor 2 | from summarize.modules.sentence_extractors.rnn import RNNSentenceExtractor 3 | -------------------------------------------------------------------------------- /summarize/modules/sentence_extractors/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from allennlp.common.checks import ConfigurationError 3 | from allennlp.modules import FeedForward, Seq2SeqEncoder 4 | from overrides import overrides 5 | 6 | from summarize.modules.sentence_extractors import SentenceExtractor 7 | 8 | 9 | @SentenceExtractor.register('rnn') 10 | class RNNSentenceExtractor(SentenceExtractor): 11 | """ 12 | The RNNSentenceExtractor calculates extraction scores by running an RNN 13 | over the sentence representations followed by a feed-forward layer 14 | on the new hidden states. 15 | 16 | Parameters 17 | ---------- 18 | rnn: 19 | The RNN to use (or any Seq2SeqEncoder) 20 | feed_forward: 21 | The feed-forward layer, which must have output dimension 1. 22 | dropout: 23 | The dropout to apply on the RNN hidden states. 
24 | """ 25 | def __init__(self, 26 | rnn: Seq2SeqEncoder, 27 | feed_forward: FeedForward, 28 | dropout: float = 0.0) -> None: 29 | super().__init__() 30 | self.rnn = rnn 31 | self.feed_forward = feed_forward 32 | self.dropout = torch.nn.Dropout(dropout) 33 | 34 | if rnn.get_output_dim() != feed_forward.get_input_dim(): 35 | raise ConfigurationError('The RNN and feed-forward layers have incompatible dimensions') 36 | if feed_forward.get_output_dim() != 1: 37 | raise ConfigurationError('The feed-foward network must have output size 1') 38 | 39 | @overrides 40 | def forward(self, 41 | sentence_encodings: torch.Tensor, 42 | mask: torch.Tensor) -> torch.Tensor: 43 | # shape: (batch_size, num_sents, hidden_size) 44 | hidden_encodings = self.rnn(sentence_encodings, mask) 45 | hidden_encodings = self.dropout(hidden_encodings) 46 | # shape: (batch_size, num_sents) 47 | extraction_scores = self.feed_forward(hidden_encodings).squeeze(-1) 48 | return extraction_scores 49 | -------------------------------------------------------------------------------- /summarize/modules/sentence_extractors/sentence_extractor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from allennlp.common import Registrable 3 | 4 | 5 | class SentenceExtractor(torch.nn.Module, Registrable): 6 | def forward(self, 7 | sentence_encodings: torch.Tensor, 8 | mask: torch.Tensor) -> torch.Tensor: 9 | """ 10 | Calculates the probability of each sentence being extracted from the 11 | sentence encodings. 12 | 13 | Parameters 14 | ---------- 15 | sentence_encodings: (batch_size, num_sents, hidden_dim) 16 | The encoding of each sentence 17 | mask: (batch_size, num_sents) 18 | The sentence mask 19 | 20 | Returns 21 | ------- 22 | A (batch_size, num_sents) tensor with the raw extraction scores for each 23 | input sentence. 24 | """ 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /summarize/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/nn/__init__.py -------------------------------------------------------------------------------- /summarize/nn/beam_search/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.nn.beam_search.beam_search import BeamSearch 2 | from summarize.nn.beam_search.relaxed import RelaxedBeamSearch 3 | -------------------------------------------------------------------------------- /summarize/nn/beam_search/coverage_penalizers/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.nn.beam_search.coverage_penalizers.coverage_penalizer import CoveragePenalizer 2 | from summarize.nn.beam_search.coverage_penalizers.onmt import ONMTCoveragePenalizer 3 | -------------------------------------------------------------------------------- /summarize/nn/beam_search/coverage_penalizers/coverage_penalizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``CoveragePenalizer``s are used to rerank the output of beam search by adding 3 | a penalty to the score of each prediction at each step of decoding. 
4 | """ 5 | import torch 6 | 7 | from allennlp.common import Registrable 8 | 9 | 10 | class CoveragePenalizer(Registrable): 11 | def __call__(self, coverage: torch.Tensor) -> torch.Tensor: 12 | """ 13 | Computes the factor that should be added to the log-probability of 14 | each output step. 15 | 16 | Parameters 17 | ---------- 18 | coverage: ``torch.Tensor``, (..., num_document_tokens) 19 | A tensor that represents the accumulated attention probabilities 20 | assigned to each document token thus far in decoding. The tensor 21 | may have any number of leading dimensions. 22 | 23 | Returns 24 | ------- 25 | ``torch.Tensor``: 26 | A tensor with the coverage penalties, the same size as the leading 27 | dimensions as the coverage tensor. 28 | """ 29 | raise NotImplementedError 30 | -------------------------------------------------------------------------------- /summarize/nn/beam_search/coverage_penalizers/onmt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from overrides import overrides 3 | 4 | from summarize.nn.beam_search.coverage_penalizers import CoveragePenalizer 5 | 6 | 7 | @CoveragePenalizer.register('onmt') 8 | class ONMTCoveragePenalizer(CoveragePenalizer): 9 | """ 10 | An implementation of the "summary" coverage penalty in the OpenNMT machine 11 | translation library (https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/translate/penalties.py). 12 | Because we add the coverage penalty to the log-probabilies (as in Wu et al.), 13 | instead of subtracting (as in ONMT), the sign of this penalty is the opposite 14 | as the ONMT implementation. 15 | 16 | The penalty discourages the coverage from attending to any one token too often. 17 | 18 | Parameters 19 | ---------- 20 | beta: ``float`` 21 | The scaling factor. 22 | """ 23 | def __init__(self, beta: float) -> None: 24 | self.beta = beta 25 | 26 | @overrides 27 | def __call__(self, coverage: torch.Tensor) -> torch.Tensor: 28 | num_document_tokens = coverage.size(-1) 29 | penalty = num_document_tokens - torch.clamp(coverage, 1.0).sum(dim=-1) 30 | return self.beta * penalty 31 | -------------------------------------------------------------------------------- /summarize/nn/beam_search/length_penalizers/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.nn.beam_search.length_penalizers.length_penalizer import LengthPenalizer 2 | from summarize.nn.beam_search.length_penalizers.average import AverageLengthPenalizer 3 | from summarize.nn.beam_search.length_penalizers.wu import WuLengthPenalizer 4 | -------------------------------------------------------------------------------- /summarize/nn/beam_search/length_penalizers/average.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from overrides import overrides 3 | 4 | from summarize.nn.beam_search.length_penalizers import LengthPenalizer 5 | 6 | 7 | @LengthPenalizer.register('average') 8 | class AverageLengthPenalizer(LengthPenalizer): 9 | """ 10 | Penalizes by predictions length of the sequence, thus causing the score 11 | to be the average log-probability per token. 
12 | """ 13 | @overrides 14 | def __call__(self, length: torch.Tensor) -> torch.Tensor: 15 | return length.float() 16 | -------------------------------------------------------------------------------- /summarize/nn/beam_search/length_penalizers/length_penalizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``LengthPenalizer``s are used to rerank the output of beam search. After all 3 | the top-k hypotheses have been found, their log-probability scores are divided 4 | by a length penalty to adjust for different lengths. 5 | """ 6 | import torch 7 | 8 | from allennlp.common import Registrable 9 | 10 | 11 | class LengthPenalizer(Registrable): 12 | def __call__(self, lengths: torch.Tensor) -> torch.Tensor: 13 | """ 14 | Computes the factor that the log-probability of the output sequence 15 | should be divded by based on its length. 16 | 17 | Parameters 18 | ---------- 19 | lengths: ``torch.Tensor`` 20 | A tensor of the lengths, which can be any size. 21 | 22 | Returns 23 | ------- 24 | ``torch.Tensor``: 25 | A tensor with the length penalties, the same size as the input tensor. 26 | """ 27 | raise NotImplementedError 28 | -------------------------------------------------------------------------------- /summarize/nn/beam_search/length_penalizers/wu.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from overrides import overrides 4 | 5 | from summarize.nn.beam_search.length_penalizers import LengthPenalizer 6 | 7 | 8 | @LengthPenalizer.register('wu') 9 | class WuLengthPenalizer(LengthPenalizer): 10 | """ 11 | Implements the length penalty in Wu et al. (2016) (https://arxiv.org/pdf/1609.08144.pdf), 12 | section 7. 13 | 14 | Parameters 15 | ---------- 16 | alpha: ``float`` 17 | The value of alpha in the length penalty. 18 | """ 19 | def __init__(self, alpha: float) -> None: 20 | self.alpha = alpha 21 | 22 | @overrides 23 | def __call__(self, length: torch.Tensor) -> torch.Tensor: 24 | return torch.pow(5.0 + length.float(), self.alpha) / np.power(6.0, self.alpha) 25 | -------------------------------------------------------------------------------- /summarize/nn/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def normalize_losses(losses: torch.Tensor, 5 | losses_mask: torch.Tensor, 6 | instance_normalization: str, 7 | batch_normalization: str) -> torch.Tensor: 8 | """ 9 | Normalizes the input losses based on the type of normalization specified 10 | by `instance_normalization` and `batch_normalization`. 11 | 12 | Parameters 13 | ---------- 14 | losses: (batch_size, num_tokens) 15 | The loss per summary token. 16 | losses_mask: (batch_size, num_tokens) 17 | The mask which indicates which losses are valid. 18 | instance_normalization: 19 | The method of normalizing each item in the batch, either "sum" or "average", 20 | which will sum or average the losses per summary. 21 | batch_normalization: 22 | The method of normalizing the losses per summary, either "sum" or "average". 23 | After the loss for each instance is compuated via the method specified 24 | by `instance_normalization`, the subsequent losses are either summed 25 | or averaged. 26 | 27 | Returns 28 | ------- 29 | The normalized loss. 
30 | """ 31 | # First, apply the loss mask to 0-out any invalid losses 32 | losses = losses * losses_mask.float() 33 | 34 | if instance_normalization == 'sum': 35 | # shape: (batch_size,) 36 | loss_per_summary = losses.sum(dim=1) 37 | elif instance_normalization == 'average': 38 | # shape: (batch_size,) 39 | lengths = losses_mask.float().sum(dim=1) 40 | # shape: (batch_size,) 41 | loss_per_summary = losses.sum(dim=1) / lengths 42 | else: 43 | raise Exception(f'Unknown type of instance normalization: {instance_normalization}') 44 | 45 | if batch_normalization == 'sum': 46 | loss = loss_per_summary.sum() 47 | elif batch_normalization == 'average': 48 | loss = loss_per_summary.mean() 49 | else: 50 | raise Exception(f'Unknown type of batch normalization: {batch_normalization}') 51 | 52 | return loss 53 | -------------------------------------------------------------------------------- /summarize/predictors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/predictors/__init__.py -------------------------------------------------------------------------------- /summarize/predictors/cloze/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.predictors.cloze.abstractive import ClozeAbstractivePredictor 2 | from summarize.predictors.cloze.extractive import ClozeExtractivePredictor 3 | -------------------------------------------------------------------------------- /summarize/predictors/cloze/abstractive.py: -------------------------------------------------------------------------------- 1 | import json 2 | from allennlp.common.util import JsonDict 3 | from allennlp.data import Instance 4 | from allennlp.service.predictors.predictor import Predictor 5 | from overrides import overrides 6 | 7 | 8 | @Predictor.register('cloze-abstractive-predictor') 9 | class ClozeAbstractivePredictor(Predictor): 10 | @overrides 11 | def _json_to_instance(self, json_dict: JsonDict) -> Instance: 12 | document = json_dict['document'] 13 | topics = json_dict['topics'] 14 | context = json_dict['context'] 15 | return self._dataset_reader.text_to_instance(document=document, 16 | topics=topics, 17 | context=context) 18 | 19 | @overrides 20 | def dump_line(self, outputs: JsonDict) -> str: 21 | cloze = outputs['cloze'] 22 | output_data = {'cloze': cloze} 23 | return json.dumps(output_data) + '\n' 24 | -------------------------------------------------------------------------------- /summarize/predictors/cloze/extractive.py: -------------------------------------------------------------------------------- 1 | import json 2 | from allennlp.common.util import JsonDict 3 | from allennlp.data import Instance 4 | from allennlp.service.predictors.predictor import Predictor 5 | from overrides import overrides 6 | 7 | 8 | @Predictor.register('cloze-extractive-predictor') 9 | class ClozeExtractivePredictor(Predictor): 10 | @overrides 11 | def _json_to_instance(self, json_dict: JsonDict) -> Instance: 12 | document = json_dict['document'] 13 | topics = json_dict['topics'] 14 | context = json_dict['context'] 15 | return self._dataset_reader.text_to_instance(document=document, 16 | topics=topics, 17 | context=context) 18 | 19 | @overrides 20 | def dump_line(self, outputs: JsonDict) -> str: 21 | indices = outputs['predicted_indices'] 22 | document = outputs['metadata']['document'] 23 | cloze = [document[index] for index in indices] 24 | 
output_data = {'cloze': cloze} 25 | return json.dumps(output_data) + '\n' 26 | -------------------------------------------------------------------------------- /summarize/predictors/sds/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.predictors.sds.abstractive import AbstractivePredictor 2 | from summarize.predictors.sds.extractive import ExtractivePredictor 3 | -------------------------------------------------------------------------------- /summarize/predictors/sds/abstractive.py: -------------------------------------------------------------------------------- 1 | import json 2 | from allennlp.common.util import JsonDict 3 | from allennlp.data import Instance 4 | from allennlp.service.predictors.predictor import Predictor 5 | from overrides import overrides 6 | 7 | 8 | @Predictor.register('sds-abstractive-predictor') 9 | class AbstractivePredictor(Predictor): 10 | @overrides 11 | def _json_to_instance(self, json_dict: JsonDict) -> Instance: 12 | document = json_dict['document'] 13 | return self._dataset_reader.text_to_instance(document=document) 14 | 15 | @overrides 16 | def dump_line(self, outputs: JsonDict) -> str: 17 | summary = outputs['summary'] 18 | output_data = {'summary': [summary]} 19 | return json.dumps(output_data) + '\n' 20 | -------------------------------------------------------------------------------- /summarize/predictors/sds/extractive.py: -------------------------------------------------------------------------------- 1 | import json 2 | from allennlp.common.util import JsonDict 3 | from allennlp.service.predictors.predictor import Predictor 4 | from overrides import overrides 5 | 6 | 7 | @Predictor.register('sds-extractive-predictor') 8 | class ExtractivePredictor(Predictor): 9 | @overrides 10 | def dump_line(self, outputs: JsonDict) -> str: 11 | indices = outputs['predicted_indices'] 12 | document = outputs['metadata']['document'] 13 | summary = [document[index] for index in indices] 14 | output_data = {'summary': summary} 15 | return json.dumps(output_data) + '\n' 16 | -------------------------------------------------------------------------------- /summarize/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/__init__.py -------------------------------------------------------------------------------- /summarize/tests/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/common/__init__.py -------------------------------------------------------------------------------- /summarize/tests/common/tempdir_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | from summarize.common import TemporaryDirectory 6 | 7 | 8 | class TestTemporaryDirectory(unittest.TestCase): 9 | def test_temporary_directory(self): 10 | with TemporaryDirectory() as temp_dir: 11 | assert os.path.exists(temp_dir) 12 | assert os.path.isdir(temp_dir) 13 | assert not os.path.exists(temp_dir) 14 | 15 | def test_temporary_directory_root(self): 16 | # Create two temporary directories with one inside the other 17 | # to make sure it was created in the correct location 18 | with TemporaryDirectory() as root_temp_dir: 19 | with 
TemporaryDirectory(root=root_temp_dir) as temp_dir: 20 | assert os.path.exists(temp_dir) 21 | assert os.path.isdir(temp_dir) 22 | assert temp_dir.startswith(root_temp_dir) 23 | 24 | def test_temporary_directory_persist(self): 25 | with TemporaryDirectory(persist=True) as temp_dir: 26 | assert os.path.exists(temp_dir) 27 | assert os.path.isdir(temp_dir) 28 | assert os.path.exists(temp_dir) 29 | shutil.rmtree(temp_dir) 30 | assert not os.path.exists(temp_dir) 31 | -------------------------------------------------------------------------------- /summarize/tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/__init__.py -------------------------------------------------------------------------------- /summarize/tests/data/dataset_readers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/dataset_readers/__init__.py -------------------------------------------------------------------------------- /summarize/tests/data/dataset_readers/cloze/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/dataset_readers/cloze/__init__.py -------------------------------------------------------------------------------- /summarize/tests/data/dataset_readers/cloze/abstractive_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from allennlp.data.tokenizers import WordTokenizer 3 | from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter 4 | 5 | from summarize.common.testing import FIXTURES_ROOT 6 | from summarize.data.dataset_readers.cloze import AbstractiveClozeDatasetReader 7 | from summarize.data.paragraph_tokenizers import ParagraphWordTokenizer 8 | 9 | 10 | class TestAbstractiveClozeDatasetReader(unittest.TestCase): 11 | def test_read_from_file(self): 12 | word_tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter()) 13 | paragraph_tokenizer = ParagraphWordTokenizer(word_splitter=JustSpacesWordSplitter()) 14 | reader = AbstractiveClozeDatasetReader(document_tokenizer=paragraph_tokenizer, 15 | topic_tokenizer=word_tokenizer, 16 | max_document_length=10, 17 | max_context_length=7, 18 | max_cloze_length=5) 19 | instances = list(reader.read(f'{FIXTURES_ROOT}/data/cloze.jsonl')) 20 | 21 | instance0 = { 22 | 'document': ['NEW', 'YORK', ',', 'Jan.', '8', ',', '2016', '/PRNewswire/', '--', 'Businessman'], 23 | 'topics': [['Ken', 'Fields'], ['Politics']], 24 | 'context': ['%', 'Renewable', 'Energy', 'in', '20', 'Years', '.'], 25 | 'cloze': ['Picking', 'as', 'his', 'campaign', 'slogan'] 26 | } 27 | 28 | assert len(instances) == 25 29 | fields = instances[0].fields 30 | assert [t.text for t in fields['document'].tokens] == instance0['document'] 31 | assert len(fields['topics'].field_list) == len(instance0['topics']) 32 | for topic_field, topic in zip(fields['topics'].field_list, instance0['topics']): 33 | assert [t.text for t in topic_field.tokens] == topic 34 | assert [t.text for t in fields['context'].tokens] == instance0['context'] 35 | assert [t.text for t in fields['cloze'].tokens] == instance0['cloze'] 36 | metadata = fields['metadata'] 37 
| assert 'document' in metadata 38 | assert 'topics' in metadata 39 | assert 'context' in metadata 40 | assert 'cloze' in metadata 41 | -------------------------------------------------------------------------------- /summarize/tests/data/dataset_readers/cloze/extractive_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | from allennlp.data.tokenizers import WordTokenizer 4 | from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter 5 | 6 | from summarize.common.testing import FIXTURES_ROOT 7 | from summarize.data.dataset_readers.cloze import ExtractiveClozeDatasetReader 8 | 9 | 10 | class TestExtractiveClozeDatasetReader(unittest.TestCase): 11 | def test_read_from_file(self): 12 | tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter()) 13 | reader = ExtractiveClozeDatasetReader(tokenizer=tokenizer, max_num_sentences=5, 14 | max_sentence_length=6, max_context_length=4) 15 | instances = list(reader.read(f'{FIXTURES_ROOT}/data/cloze.jsonl')) 16 | 17 | instance1 = { 18 | 'document': [ 19 | ['Drew', 'Sheneman', 'has', 'been', 'the', 'editorial'], 20 | ['J.', ')'], 21 | ['since', '1998', '.'], 22 | ['With', 'exceptional', 'artistry', ',', 'his', 'cartoons'], 23 | ['Sheneman', 'began', 'cartooning', 'in', 'college', 'and'] 24 | ], 25 | 'topics': [['Drew', 'Sheneman']], 26 | 'context': ['American', 'editorial', 'cartoonist', '.'], 27 | 'labels': [1, 0, 1, 0, 1] 28 | } 29 | 30 | assert len(instances) == 25 31 | fields = instances[1].fields 32 | assert len(fields['document'].field_list) == 5 33 | for sentence, sentence_field in zip(instance1['document'], fields['document'].field_list): 34 | assert [t.text for t in sentence_field.tokens] == sentence 35 | assert len(fields['topics'].field_list) == 1 36 | for topic, topic_field in zip(instance1['topics'], fields['topics'].field_list): 37 | assert [t.text for t in topic_field.tokens] == topic 38 | assert [t.text for t in fields['context']] == instance1['context'] 39 | assert np.array_equal(fields['labels'].array, instance1['labels']) 40 | metadata = fields['metadata'] 41 | assert 'document' in metadata 42 | assert 'topics' in metadata 43 | assert 'context' in metadata 44 | assert 'cloze' in metadata 45 | -------------------------------------------------------------------------------- /summarize/tests/data/dataset_readers/sds/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/dataset_readers/sds/__init__.py -------------------------------------------------------------------------------- /summarize/tests/data/dataset_readers/sds/abstractive_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter 3 | 4 | from summarize.common.testing import FIXTURES_ROOT 5 | from summarize.data.dataset_readers.sds import AbstractiveDatasetReader 6 | from summarize.data.paragraph_tokenizers import ParagraphWordTokenizer 7 | 8 | 9 | class TestAbstractiveDatasetReader(unittest.TestCase): 10 | def test_read_from_file(self): 11 | tokenizer = ParagraphWordTokenizer(word_splitter=JustSpacesWordSplitter()) 12 | reader = AbstractiveDatasetReader(document_tokenizer=tokenizer, max_document_length=10, max_summary_length=5) 13 | instances = 
list(reader.read(f'{FIXTURES_ROOT}/data/sds.jsonl')) 14 | 15 | instance0 = { 16 | 'document': ['Editor', '\'s', 'note', ':', 'In', 'our', 'Behind', 'the', 'Scenes', 'series'], 17 | 'summary': ['Mentally', 'ill', 'inmates', 'in', 'Miami'] 18 | } 19 | 20 | assert len(instances) == 25 21 | fields = instances[0].fields 22 | assert [t.text for t in fields['document'].tokens] == instance0['document'] 23 | assert [t.text for t in fields['summary'].tokens] == instance0['summary'] 24 | metadata = fields['metadata'] 25 | assert 'document' in metadata 26 | assert len(metadata['document']) == 20 27 | assert 'summary' in metadata 28 | assert len(metadata['summary']) == 4 29 | -------------------------------------------------------------------------------- /summarize/tests/data/dataset_readers/sds/extractive_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | from allennlp.data.tokenizers import WordTokenizer 4 | from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter 5 | 6 | from summarize.common.testing import FIXTURES_ROOT 7 | from summarize.data.dataset_readers.sds import ExtractiveDatasetReader 8 | 9 | 10 | class TestExtractiveDatasetReader(unittest.TestCase): 11 | def test_read_from_file(self): 12 | tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter()) 13 | reader = ExtractiveDatasetReader(tokenizer=tokenizer, max_num_sentences=5, max_sentence_length=6) 14 | instances = list(reader.read(f'{FIXTURES_ROOT}/data/sds.jsonl')) 15 | 16 | instance0 = { 17 | 'document': [ 18 | ['Editor', '\'s', 'note', ':', 'In', 'our'], 19 | ['An', 'inmate', 'housed', 'on', 'the', '``'], 20 | ['MIAMI', ',', 'Florida', '(', 'CNN', ')'], 21 | ['Most', 'often', ',', 'they', 'face', 'drug'], 22 | ['So', ',', 'they', 'end', 'up', 'on'] 23 | ] 24 | } 25 | 26 | assert len(instances) == 25 27 | fields = instances[0].fields 28 | assert len(fields['document'].field_list) == 5 29 | for sentence, sentence_field in zip(instance0['document'], fields['document'].field_list): 30 | assert [t.text for t in sentence_field.tokens] == sentence 31 | assert np.array_equal(fields['labels'].array, [0, 0, 1, 1, 0]) 32 | metadata = fields['metadata'] 33 | assert 'document' in metadata 34 | assert len(metadata['document']) == 5 35 | assert 'summary' in metadata 36 | assert len(metadata['summary']) == 4 37 | -------------------------------------------------------------------------------- /summarize/tests/data/dataset_setup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/dataset_setup/__init__.py -------------------------------------------------------------------------------- /summarize/tests/data/dataset_setup/tokenize_test.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import unittest 3 | from nltk import word_tokenize 4 | 5 | from summarize.data.dataset_setup.tokenize import tokenize 6 | 7 | 8 | class TestTokenize(unittest.TestCase): 9 | def test_spacy_tokenize(self): 10 | nlp = spacy.load('en', disable=['tagger', 'parser', 'ner']) 11 | field = "Hi, I'm Dan." 12 | expected = "Hi , I 'm Dan ." 
13 | actual = tokenize(nlp, field) 14 | assert expected == actual 15 | 16 | field = [['The first.', 'The second.'], 'The third.'] 17 | expected = [['The first .', 'The second .'], 'The third .'] 18 | actual = tokenize(nlp, field) 19 | assert expected == actual 20 | 21 | def test_nltk_tokenize(self): 22 | field = "Hi, I'm Dan." 23 | expected = "Hi , I 'm Dan ." 24 | actual = tokenize(word_tokenize, field) 25 | assert expected == actual 26 | -------------------------------------------------------------------------------- /summarize/tests/data/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/io/__init__.py -------------------------------------------------------------------------------- /summarize/tests/data/io/jsonl_writer_test.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import gzip 3 | import json 4 | import tempfile 5 | import unittest 6 | 7 | from summarize.data.io import JsonlWriter 8 | 9 | 10 | class TestJsonlWriter(unittest.TestCase): 11 | def setUp(self): 12 | self.data = [ 13 | {'a': 4, 'b': 'testing'}, 14 | {'c': [1, 2, 3]} 15 | ] 16 | 17 | def test_plain_file(self): 18 | # Write the data to a file 19 | temp_file = tempfile.NamedTemporaryFile(suffix='.jsonl') 20 | with JsonlWriter(temp_file.name) as out: 21 | for item in self.data: 22 | out.write(item) 23 | 24 | # Load from file, ensure it is correct 25 | actual_data = [] 26 | with open(temp_file.name, 'r') as f: 27 | for line in f: 28 | actual_data.append(json.loads(line)) 29 | self.assertEqual(self.data, actual_data) 30 | 31 | def test_gzip_file(self): 32 | # Write the data to a file 33 | temp_file = tempfile.NamedTemporaryFile(suffix='.jsonl.gz') 34 | with JsonlWriter(temp_file.name) as out: 35 | for item in self.data: 36 | out.write(item) 37 | 38 | # Load from file, ensure it is correct 39 | actual_data = [] 40 | with gzip.open(temp_file.name, 'rb') as f: 41 | for line in f: 42 | actual_data.append(json.loads(line.decode())) 43 | self.assertEqual(self.data, actual_data) 44 | 45 | def test_bz2_file(self): 46 | # Write the data to a file 47 | temp_file = tempfile.NamedTemporaryFile(suffix='.jsonl.bz2') 48 | with JsonlWriter(temp_file.name) as out: 49 | for item in self.data: 50 | out.write(item) 51 | 52 | # Load from file, ensure it is correct 53 | actual_data = [] 54 | with bz2.open(temp_file.name, 'rb') as f: 55 | for line in f: 56 | actual_data.append(json.loads(line.decode())) 57 | self.assertEqual(self.data, actual_data) 58 | -------------------------------------------------------------------------------- /summarize/tests/data/io/util_test.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import tempfile 3 | import unittest 4 | 5 | from summarize.data.io.util import is_gz_file 6 | 7 | 8 | class TestUtil(unittest.TestCase): 9 | def test_is_gz_file(self): 10 | with tempfile.NamedTemporaryFile() as temp: 11 | # Write a plain text file 12 | with open(temp.name, 'w') as out: 13 | out.write('plain text') 14 | assert is_gz_file(temp.name) is False 15 | 16 | # Write a gzipped file 17 | with gzip.open(temp.name, 'wb') as out: 18 | out.write(b'gzipped') 19 | assert is_gz_file(temp.name) is True 20 | -------------------------------------------------------------------------------- /summarize/tests/data/paragraph_tokenizers/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/paragraph_tokenizers/__init__.py -------------------------------------------------------------------------------- /summarize/tests/data/paragraph_tokenizers/paragraph_word_tokenizer_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from summarize.data.paragraph_tokenizers import ParagraphWordTokenizer 4 | 5 | 6 | class TestParagraphWordTokenizer(unittest.TestCase): 7 | def test_in_between_tokens(self): 8 | texts = [ 9 | 'This is the first sentence.', 10 | 'Followed by the second.', 11 | 'And the third!' 12 | ] 13 | 14 | tokenizer = ParagraphWordTokenizer() 15 | expected = [ 16 | 'This', 'is', 'the', 'first', 'sentence', '.', 17 | 'Followed', 'by', 'the', 'second', '.', 18 | 'And', 'the', 'third', '!' 19 | ] 20 | tokens = tokenizer.tokenize(texts) 21 | actual = list(map(str, tokens)) 22 | assert expected == actual 23 | 24 | tokenizer = ParagraphWordTokenizer(start_tokens=['@start@'], 25 | end_tokens=['@end@'], 26 | in_between_tokens=['', '']) 27 | expected = [ 28 | '@start@', 'This', 'is', 'the', 'first', 'sentence', '.', '', '', 29 | 'Followed', 'by', 'the', 'second', '.', '', '', 30 | 'And', 'the', 'third', '!', '@end@' 31 | ] 32 | tokens = tokenizer.tokenize(texts) 33 | actual = list(map(str, tokens)) 34 | assert expected == actual 35 | -------------------------------------------------------------------------------- /summarize/tests/fixtures/configs/cloze/extractive-baseline.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "type": "cloze-extractive", 4 | "max_num_sentences": 50, 5 | "max_sentence_length": 15, 6 | "max_context_length": 20, 7 | "tokenizer": { 8 | "type": "word", 9 | "word_splitter": { 10 | "type": "just_spaces" 11 | } 12 | }, 13 | "token_indexers": { 14 | "tokens": { 15 | "type": "single_id", 16 | "lowercase_tokens": true 17 | } 18 | } 19 | }, 20 | "train_data_path": "summarize/tests/fixtures/data/cloze.jsonl", 21 | "validation_data_path": "summarize/tests/fixtures/data/cloze.jsonl", 22 | "model": { 23 | "type": "cloze-extractive-baseline", 24 | "token_embedder": { 25 | "tokens": { 26 | "type": "embedding", 27 | "embedding_dim": 20 28 | } 29 | }, 30 | "sentence_encoder": { 31 | "type": "lstm", 32 | "input_size": 20, 33 | "hidden_size": 20, 34 | "bidirectional": true 35 | }, 36 | "sentence_extractor": { 37 | "type": "rnn", 38 | "rnn": { 39 | "type": "lstm", 40 | "input_size": 40, 41 | "hidden_size": 20, 42 | "bidirectional": true 43 | }, 44 | "feed_forward": { 45 | "input_dim": 40, 46 | "hidden_dims": 1, 47 | "num_layers": 1, 48 | "activations": "linear" 49 | } 50 | }, 51 | "topic_encoder": { 52 | "type": "lstm", 53 | "input_size": 20, 54 | "hidden_size": 20, 55 | "bidirectional": true 56 | }, 57 | "topic_layer": { 58 | "input_dim": 40, 59 | "hidden_dims": 40, 60 | "num_layers": 1, 61 | "activations": "linear" 62 | }, 63 | "context_encoder": { 64 | "type": "lstm", 65 | "input_size": 20, 66 | "hidden_size": 20, 67 | "bidirectional": true 68 | }, 69 | "attention": { 70 | "type": "mlp", 71 | "encoder_size": 40, 72 | "decoder_size": 40, 73 | "attention_size": 40 74 | }, 75 | "attention_layer": { 76 | "input_dim": 40 + 40, 77 | "hidden_dims": 40, 78 | "num_layers": 1, 79 | "activations": "linear" 80 | }, 81 | "use_topics": true, 82 | 
"use_context": true, 83 | "max_words": 20, 84 | "metrics": [ 85 | { 86 | "type": "python-rouge", 87 | "ngram_orders": [2] 88 | } 89 | ] 90 | }, 91 | "iterator": { 92 | "type": "basic", 93 | "batch_size": 4, 94 | "instances_per_epoch": 2 95 | }, 96 | "trainer": { 97 | "optimizer": "adam", 98 | "num_epochs": 5, 99 | "cuda_device": -1 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /summarize/tests/fixtures/configs/sds/extractive-baseline.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "type": "sds-extractive", 4 | "max_num_sentences": 50, 5 | "max_sentence_length": 15, 6 | "tokenizer": { 7 | "type": "word", 8 | "word_splitter": { 9 | "type": "just_spaces" 10 | } 11 | }, 12 | "token_indexers": { 13 | "tokens": { 14 | "type": "single_id", 15 | "lowercase_tokens": true 16 | } 17 | } 18 | }, 19 | "train_data_path": "summarize/tests/fixtures/data/sds.jsonl", 20 | "validation_data_path": "summarize/tests/fixtures/data/sds.jsonl", 21 | "model": { 22 | "type": "sds-extractive-baseline", 23 | "token_embedder": { 24 | "tokens": { 25 | "type": "embedding", 26 | "embedding_dim": 20 27 | } 28 | }, 29 | "sentence_encoder": { 30 | "type": "lstm", 31 | "input_size": 20, 32 | "hidden_size": 20, 33 | "bidirectional": true 34 | }, 35 | "sentence_extractor": { 36 | "type": "rnn", 37 | "rnn": { 38 | "type": "lstm", 39 | "input_size": 40, 40 | "hidden_size": 20, 41 | "bidirectional": true 42 | }, 43 | "feed_forward": { 44 | "input_dim": 40, 45 | "hidden_dims": 1, 46 | "num_layers": 1, 47 | "activations": "linear" 48 | } 49 | }, 50 | "max_words": 20, 51 | "metrics": [ 52 | { 53 | "type": "python-rouge", 54 | "ngram_orders": [2] 55 | } 56 | ] 57 | }, 58 | "iterator": { 59 | "type": "basic", 60 | "batch_size": 4, 61 | "instances_per_epoch": 2 62 | }, 63 | "trainer": { 64 | "optimizer": "adam", 65 | "num_epochs": 5, 66 | "cuda_device": -1 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /summarize/tests/fixtures/data/chen2018/Readme.md: -------------------------------------------------------------------------------- 1 | The `gold.jsonl` and `model.jsonl` are 10 reference and system summaries from "Fast Abstractive Summarization with Reinforce-Selected Sentence Rewriting" by Chen and Bansal (2018). 
2 | -------------------------------------------------------------------------------- /summarize/tests/fixtures/data/chen2018/gold.jsonl: -------------------------------------------------------------------------------- 1 | {"summary": ["marseille prosecutor says `` so far no videos were used in the crash investigation '' despite media reports .", "journalists at bild and paris match are `` very confident '' the video clip is real , an editor says .", "andreas lubitz had informed his lufthansa training school of an episode of severe depression , airline says ."]} 2 | {"summary": ["membership gives the icc jurisdiction over alleged crimes committed in palestinian territories since last june .", "israel and the united states opposed the move , which could open the door to war crimes investigations against israelis ."]} 3 | {"summary": ["college-bound basketball star asks girl with down syndrome to high school prom .", "pictures of the two during the `` prom-posal '' have gone viral ."]} 4 | {"summary": ["don mclean 's `` american pie '' lyrics auctioned for $ 1.2 million .", "the song is dense with symbolism ; mclean says lyrics , notes will reveal meaning .", "`` pie '' is mclean 's biggest hit , was no. 1 in 1972 ."]} 5 | {"summary": ["gov. mike pence is making the right call to fix indiana 's religious freedom law , which can be used for discrimination .", "mark goldfeder : indiana should aim to be a shining beacon of cooperation : the real `` crossroads of america ''"]} 6 | {"summary": ["cameron hooker had kidnapped young hitchhiker colleen stan in 1977 .", "over the next seven years victim was tortured and raped as his captive .", "hooker , now 61 , was sentenced to a 104-year prison term jail in 1985 .", "he applied for early parole but was told he 'd spent at least 15 years in jail ."]} 7 | {"summary": ["figures show that while millions still tune in they listen for shorter bursts .", "average listener spent ten hours a week tuning in last three months of 2014 .", "this was 14 % down on decade earlier , when people tuned in for 11.6 hours .", "the bbc trust has cleared the way for firms to buy their way into lifestyle programmes on the world news channel in a product placement experiment . for example , publishers could pay to have their books reviewed on talking books . 
the bbc trust will review the scheme in a year ."]} 8 | {"summary": ["s300 barely takes off before plunging back to the ground .", "minute-long clip shows people dashing for cover as rocket hits ground .", "mishap comes shortly after footage of crash killing missile engineers ."]} 9 | {"summary": ["david letterman made the joke while warming up his late show audience .", "college staffer asked what advice the ` scandal-scarred ' comic could give .", "the host told them ` treat a lady like a wh -- e , and a wh -- e like a lady '", "joke was met with stunned silence with some branding it ` disrespectful '"]} 10 | {"summary": ["winds swept the ocean foam off lashing waves before mixing it with sand from the shore line .", "the result was a bizarre and grotesque yellow , thick , jelly-like foam substance which coated the entire beach .", "it stretched more than 15 metres up avoca beach in the central coast and onto the pathways and shrubbery .", "sylvia freedman , who was holidaying there when the storm hit , captured the strange phenomenon on her camera ."]} 11 | -------------------------------------------------------------------------------- /summarize/tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/metrics/__init__.py -------------------------------------------------------------------------------- /summarize/tests/metrics/meteor_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import unittest 4 | 5 | from summarize.common.testing import FIXTURES_ROOT 6 | from summarize.data.io import JsonlReader 7 | from summarize.metrics.meteor import DEFAULT_METEOR_JAR_PATH, run_meteor 8 | 9 | 10 | @pytest.mark.skipif(not os.path.exists(DEFAULT_METEOR_JAR_PATH), reason='Meteor jar does not exist') 11 | class TestMeteor(unittest.TestCase): 12 | def test_meteor_runs(self): 13 | gold_summaries = [ 14 | 'This is the gold summary for the first instance.', 15 | 'And this is for the second one.' 16 | ] 17 | model_summaries = [ 18 | 'This is the model output.', 19 | 'And this is the one for the second document.' 20 | ] 21 | assert run_meteor(gold_summaries, model_summaries) > 0.0 22 | 23 | def test_chen2018(self): 24 | """ 25 | Tests to ensure that Meteor returns the expected score on the 26 | Chen 2018 data subset. I ran Meteor on the full data (~11k examples) 27 | which takes too long to run for a unit test. After confirming the numbers 28 | are the same as what is reported in the paper, I ran the code on just 29 | the subset, and this test ensures those numbers are returned. 
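On this 10-example subset, the expected score (asserted below) is 18.28372.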
30 | """ 31 | gold_file_path = f'{FIXTURES_ROOT}/data/chen2018/gold.jsonl' 32 | model_file_path = f'{FIXTURES_ROOT}/data/chen2018/model.jsonl' 33 | 34 | gold = JsonlReader(gold_file_path).read() 35 | model = JsonlReader(model_file_path).read() 36 | 37 | gold = [' '.join(summary['summary']) for summary in gold] 38 | model = [' '.join(summary['summary']) for summary in model] 39 | 40 | score = run_meteor(gold, model) 41 | assert abs(score - 18.28372) < 1e-5 42 | -------------------------------------------------------------------------------- /summarize/tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/models/__init__.py -------------------------------------------------------------------------------- /summarize/tests/models/cloze/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/models/cloze/__init__.py -------------------------------------------------------------------------------- /summarize/tests/models/cloze/bm25/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/models/cloze/bm25/__init__.py -------------------------------------------------------------------------------- /summarize/tests/models/cloze/bm25/bm25_test.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import unittest 3 | from collections import namedtuple 4 | 5 | from summarize.data.io import JsonlReader 6 | from summarize.common.testing import FIXTURES_ROOT 7 | from summarize.models.cloze.bm25 import calculate_df, bm25 8 | 9 | 10 | class TestBM25(unittest.TestCase): 11 | def test_bm25_runs(self): 12 | with tempfile.NamedTemporaryFile(suffix='.jsonl') as df_file: 13 | with tempfile.NamedTemporaryFile(suffix='.jsonl') as bm25_file: 14 | Args = namedtuple('Args', ['input_jsonl', 'output_jsonl']) 15 | args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', df_file.name) 16 | calculate_df.main(args) 17 | 18 | Args = namedtuple('Args', ['input_jsonl', 'df_jsonl', 'output_jsonl', 19 | 'k', 'b', 'max_words', 'max_sentences', 'flatten']) 20 | args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', df_file.name, bm25_file.name, 21 | 1.2, 0.75, None, 1, True) 22 | bm25.main(args) 23 | 24 | instances = JsonlReader(bm25_file.name).read() 25 | assert len(instances) == 25 26 | for instance in instances: 27 | assert 'cloze' in instance 28 | assert isinstance(instance['cloze'], str) 29 | -------------------------------------------------------------------------------- /summarize/tests/models/cloze/bm25/calculate_df_test.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import unittest 3 | from collections import namedtuple 4 | 5 | from summarize.data.io import JsonlReader 6 | from summarize.common.testing import FIXTURES_ROOT 7 | from summarize.models.cloze.bm25 import calculate_df 8 | 9 | 10 | class TestCalculateDF(unittest.TestCase): 11 | def test_calculate_df_runs(self): 12 | with tempfile.NamedTemporaryFile(suffix='.jsonl') as df_file: 13 | Args = namedtuple('Args', ['input_jsonl', 'output_jsonl']) 14 | args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', 
df_file.name) 15 | calculate_df.main(args) 16 | 17 | lines = JsonlReader(df_file.name).read() 18 | assert len(lines) > 0 19 | metadata = lines[0] 20 | assert 'num_documents' in metadata 21 | assert 'average_document_length' in metadata 22 | for count in lines[1:]: 23 | assert 'token' in count 24 | assert 'df' in count 25 | -------------------------------------------------------------------------------- /summarize/tests/models/cloze/extractive_baseline_test.py: -------------------------------------------------------------------------------- 1 | from allennlp.common.testing import ModelTestCase 2 | 3 | # Some imports necessary in order to register the dataset reader, model, and modules 4 | import summarize.data.dataset_readers.cloze 5 | import summarize.models.cloze 6 | import summarize.modules.matrix_attention 7 | import summarize.training.metrics 8 | from summarize.common.testing import FIXTURES_ROOT 9 | 10 | 11 | class ExtractiveBaselineModelModelTest(ModelTestCase): 12 | def setUp(self): 13 | super().setUp() 14 | self.set_up_model(f'{FIXTURES_ROOT}/configs/cloze/extractive-baseline.jsonnet', 15 | f'{FIXTURES_ROOT}/data/cloze.jsonl') 16 | 17 | def test_cloze_extractive_baseline_can_train_save_and_load(self): 18 | self.ensure_model_can_train_save_and_load(self.param_file) 19 | 20 | def test_batch_predictions_are_consistent(self): 21 | self.ensure_batch_predictions_are_consistent() 22 | -------------------------------------------------------------------------------- /summarize/tests/models/cloze/lead_test.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import unittest 3 | from collections import namedtuple 4 | 5 | from summarize.data.io import JsonlReader 6 | from summarize.models.cloze import lead 7 | from summarize.common.testing import FIXTURES_ROOT 8 | 9 | 10 | class TestClozeLead(unittest.TestCase): 11 | def test_cloze_lead(self): 12 | with tempfile.NamedTemporaryFile(suffix='.jsonl') as output_file: 13 | Args = namedtuple('Args', ['input_jsonl', 'output_jsonl', 'max_sentences', 14 | 'max_tokens', 'max_bytes', 'field_name', 'keep_sentences']) 15 | args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', output_file.name, 16 | 1, None, None, 'cloze', True) 17 | lead.main(args) 18 | 19 | instances = JsonlReader(output_file.name).read() 20 | assert len(instances) == 25 21 | assert all('cloze' in instance for instance in instances) 22 | assert all(isinstance(instance['cloze'], list) for instance in instances) 23 | 24 | args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', output_file.name, 25 | 1, None, None, 'cloze', False) 26 | lead.main(args) 27 | 28 | instances = JsonlReader(output_file.name).read() 29 | assert len(instances) == 25 30 | assert all('cloze' in instance for instance in instances) 31 | assert all(isinstance(instance['cloze'], str) for instance in instances) 32 | -------------------------------------------------------------------------------- /summarize/tests/models/cloze/open_ai_language_model_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import unittest 4 | 5 | from summarize.common.testing import FIXTURES_ROOT 6 | from summarize.data.io import JsonlReader 7 | from summarize.models.cloze import OpenAILanguageModel 8 | 9 | _MODEL_DIR = 'experiments/deutsch2019/baselines/open-ai/models/345M' 10 | 11 | 12 | class TestOpenAILanguageModel(unittest.TestCase): 13 | @pytest.mark.skip(reason='Too slow') 14 | @pytest.mark.skipif(not 
os.path.exists(_MODEL_DIR), reason='OpenAI Language Model does not exist') 15 | def test_open_ai_language_model(self): 16 | """ 17 | Tests to make sure the OpenAI language model successfully loads and 18 | can process data. 19 | """ 20 | length = 100 21 | temperature = 1.0 22 | top_k = 20 23 | lm = OpenAILanguageModel(_MODEL_DIR, length, temperature, top_k) 24 | 25 | # This can be quite slow, so we only do it for 1 instance 26 | with JsonlReader(f'{FIXTURES_ROOT}/data/cloze.jsonl') as f: 27 | for instance in f: 28 | context = instance['context'] 29 | input_text = ' '.join(context) 30 | sentence = lm.sample_next_sentence(input_text) 31 | assert sentence is not None 32 | break 33 | -------------------------------------------------------------------------------- /summarize/tests/models/cloze/pointer_generator_test.py: -------------------------------------------------------------------------------- 1 | from allennlp.common.testing import ModelTestCase 2 | 3 | # Some imports necessary in order to register the dataset reader, model, and modules 4 | import summarize.data.dataset_readers.cloze 5 | import summarize.models.cloze 6 | import summarize.modules.matrix_attention 7 | import summarize.training.metrics 8 | from summarize.common.testing import FIXTURES_ROOT 9 | 10 | 11 | class TestClozePointerGeneratorModel(ModelTestCase): 12 | def setUp(self): 13 | super().setUp() 14 | self.set_up_model(f'{FIXTURES_ROOT}/configs/cloze/pointer-generator.jsonnet', 15 | f'{FIXTURES_ROOT}/data/cloze.jsonl') 16 | 17 | def test_cloze_pointer_generator_can_train_save_and_load(self): 18 | self.ensure_model_can_train_save_and_load(self.param_file) 19 | 20 | def test_batch_predictions_are_consistent(self): 21 | # The log-probabilities are often unstable 22 | self.ensure_batch_predictions_are_consistent(keys_to_ignore='log_probabilities') 23 | -------------------------------------------------------------------------------- /summarize/tests/models/cloze/seq2seq_test.py: -------------------------------------------------------------------------------- 1 | from allennlp.common.testing import ModelTestCase 2 | 3 | # Some imports necessary in order to register the dataset reader, model, and modules 4 | import summarize.data.dataset_readers.cloze 5 | import summarize.models.cloze 6 | import summarize.modules.matrix_attention 7 | import summarize.training.metrics 8 | from summarize.common.testing import FIXTURES_ROOT 9 | 10 | 11 | class TestClozeSeq2SeqModel(ModelTestCase): 12 | def setUp(self): 13 | super().setUp() 14 | self.set_up_model(f'{FIXTURES_ROOT}/configs/cloze/seq2seq.jsonnet', 15 | f'{FIXTURES_ROOT}/data/cloze.jsonl') 16 | 17 | def test_cloze_seq2seq_can_train_save_and_load(self): 18 | self.ensure_model_can_train_save_and_load(self.param_file) 19 | 20 | def test_batch_predictions_are_consistent(self): 21 | # The log-probabilities are often unstable 22 | self.ensure_batch_predictions_are_consistent(keys_to_ignore='log_probabilities') 23 | -------------------------------------------------------------------------------- /summarize/tests/models/cloze/sumfocus_test.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import unittest 3 | from collections import namedtuple 4 | 5 | from summarize.data.io import JsonlReader 6 | from summarize.common.testing import FIXTURES_ROOT 7 | from summarize.models.cloze import sumfocus 8 | 9 | 10 | class TestSumFocus(unittest.TestCase): 11 | def test_sumfocus_runs(self): 12 | with tempfile.NamedTemporaryFile(suffix='.jsonl') as 
output_file: 13 | Args = namedtuple('Args', ['input_jsonl', 'output_jsonl', 'beta', 14 | 'topic_lambda', 'context_lambda', 15 | 'max_words', 'max_sentences']) 16 | args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', output_file.name, 17 | 0.5, 0.2, 0.3, 200, None) 18 | sumfocus.main(args) 19 | 20 | instances = JsonlReader(output_file.name).read() 21 | assert len(instances) == 25 22 | for instance in instances: 23 | assert 'cloze' in instance 24 | assert isinstance(instance['cloze'], str) 25 | -------------------------------------------------------------------------------- /summarize/tests/models/sds/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/models/sds/__init__.py -------------------------------------------------------------------------------- /summarize/tests/models/sds/extractive_baseline_test.py: -------------------------------------------------------------------------------- 1 | from allennlp.common.testing import ModelTestCase 2 | 3 | # Some imports necessary in order to register the dataset reader, model, and modules 4 | import summarize.data.dataset_readers.sds 5 | import summarize.models.sds 6 | import summarize.modules.matrix_attention 7 | import summarize.training.metrics 8 | from summarize.common.testing import FIXTURES_ROOT 9 | 10 | 11 | class ExtractiveBaselineModelModelTest(ModelTestCase): 12 | def setUp(self): 13 | super().setUp() 14 | self.set_up_model(f'{FIXTURES_ROOT}/configs/sds/extractive-baseline.jsonnet', 15 | f'{FIXTURES_ROOT}/data/sds.jsonl') 16 | 17 | def test_sds_extractive_baseline_can_train_save_and_load(self): 18 | self.ensure_model_can_train_save_and_load(self.param_file) 19 | 20 | def test_batch_predictions_are_consistent(self): 21 | # The log-probabilities are often unstable 22 | self.ensure_batch_predictions_are_consistent(keys_to_ignore='log_probabilities') 23 | -------------------------------------------------------------------------------- /summarize/tests/models/sds/lead_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from summarize.models.sds.lead import get_lead_summary 4 | 5 | 6 | class TestLeadSummary(unittest.TestCase): 7 | def setUp(self): 8 | self.document = [ 9 | 'The first sentence .', 10 | 'Followed by the second .', 11 | 'Finally the third .' 
12 | ] 13 | 14 | def test_max_sentences(self): 15 | assert self.document[:1] == get_lead_summary(self.document, max_sentences=1) 16 | assert self.document[:2] == get_lead_summary(self.document, max_sentences=2) 17 | assert self.document == get_lead_summary(self.document, max_sentences=3) 18 | assert self.document == get_lead_summary(self.document, max_sentences=4) 19 | 20 | def test_max_token(self): 21 | assert ['The'] == get_lead_summary(self.document, max_tokens=1) 22 | assert ['The first sentence .'] == get_lead_summary(self.document, max_tokens=4) 23 | assert ['The first sentence .', 'Followed'] == get_lead_summary(self.document, max_tokens=5) 24 | assert ['The first sentence .', 'Followed by the second .', 'Finally the third'] == get_lead_summary(self.document, max_tokens=12) 25 | assert ['The first sentence .', 'Followed by the second .', 'Finally the third .'] == get_lead_summary(self.document, max_tokens=13) 26 | assert ['The first sentence .', 'Followed by the second .', 'Finally the third .'] == get_lead_summary(self.document, max_tokens=14) 27 | 28 | def test_max_bytes(self): 29 | assert ['T'] == get_lead_summary(self.document, max_bytes=1) 30 | assert ['The first sentence'] == get_lead_summary(self.document, max_bytes=19) 31 | assert ['The first sentence .'] == get_lead_summary(self.document, max_bytes=20) 32 | assert ['The first sentence .'] == get_lead_summary(self.document, max_bytes=21) 33 | assert ['The first sentence .', 'F'] == get_lead_summary(self.document, max_bytes=22) 34 | assert ['The first sentence .', 'Followed by the second .', 'Finally the third'] == get_lead_summary(self.document, max_bytes=64) 35 | assert ['The first sentence .', 'Followed by the second .', 'Finally the third .'] == get_lead_summary(self.document, max_bytes=65) 36 | assert ['The first sentence .', 'Followed by the second .', 'Finally the third .'] == get_lead_summary(self.document, max_bytes=66) 37 | 38 | def test_invalid_arguments(self): 39 | with self.assertRaises(Exception): 40 | get_lead_summary(self.document) 41 | with self.assertRaises(Exception): 42 | get_lead_summary(self.document, max_sentences=1, max_tokens=1) 43 | with self.assertRaises(Exception): 44 | get_lead_summary(self.document, max_sentences=1, max_bytes=1) 45 | with self.assertRaises(Exception): 46 | get_lead_summary(self.document, max_tokens=1, max_bytes=1) 47 | with self.assertRaises(Exception): 48 | get_lead_summary(self.document, max_sentences=1, max_tokens=1, max_bytes=1) 49 | -------------------------------------------------------------------------------- /summarize/tests/models/sds/pointer_generator_test.py: -------------------------------------------------------------------------------- 1 | from allennlp.common.testing import ModelTestCase 2 | 3 | # Some imports necessary in order to register the dataset reader, model, and modules 4 | import summarize.data.dataset_readers.sds 5 | import summarize.models.sds 6 | import summarize.modules.matrix_attention 7 | import summarize.training.metrics 8 | from summarize.common.testing import FIXTURES_ROOT 9 | 10 | 11 | class PointerGeneratorModelTest(ModelTestCase): 12 | def setUp(self): 13 | super().setUp() 14 | self.set_up_model(f'{FIXTURES_ROOT}/configs/sds/pointer-generator.jsonnet', 15 | f'{FIXTURES_ROOT}/data/sds.jsonl') 16 | 17 | def test_sds_pointer_generator_can_train_save_and_load(self): 18 | self.ensure_model_can_train_save_and_load(self.param_file) 19 | 20 | def test_batch_predictions_are_consistent(self): 21 | 
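# The log-probabilities are often unstable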
self.ensure_batch_predictions_are_consistent(keys_to_ignore='log_probabilities') 22 | -------------------------------------------------------------------------------- /summarize/tests/models/sds/seq2seq_test.py: -------------------------------------------------------------------------------- 1 | from allennlp.common.testing import ModelTestCase 2 | 3 | # Some imports necessary in order to register the dataset reader, model, and modules 4 | import summarize.data.dataset_readers.sds 5 | import summarize.models.sds 6 | import summarize.modules.matrix_attention 7 | import summarize.training.metrics 8 | from summarize.common.testing import FIXTURES_ROOT 9 | 10 | 11 | class Seq2SeqModelTest(ModelTestCase): 12 | def setUp(self): 13 | super().setUp() 14 | self.set_up_model(f'{FIXTURES_ROOT}/configs/sds/seq2seq.jsonnet', 15 | f'{FIXTURES_ROOT}/data/sds.jsonl') 16 | 17 | def test_sds_seq2seq_can_train_save_and_load(self): 18 | self.ensure_model_can_train_save_and_load(self.param_file) 19 | 20 | def test_batch_predictions_are_consistent(self): 21 | # The log-probabilities are often unstable 22 | self.ensure_batch_predictions_are_consistent(keys_to_ignore='log_probabilities') 23 | -------------------------------------------------------------------------------- /summarize/tests/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/modules/__init__.py -------------------------------------------------------------------------------- /summarize/tests/modules/coverage_matrix_attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/modules/coverage_matrix_attention/__init__.py -------------------------------------------------------------------------------- /summarize/tests/modules/coverage_matrix_attention/mlp_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | 4 | from summarize.modules.coverage_matrix_attention import MLPCoverageAttention 5 | 6 | 7 | class TestMLPCoverageAttention(unittest.TestCase): 8 | def test_mlp_coverage_attention(self): 9 | batch_size = 3 10 | num_encoder_tokens = 7 11 | num_decoder_tokens = 9 12 | encoder_dim = 11 13 | decoder_dim = 13 14 | attention_dim = 17 15 | 16 | encoder_outputs = torch.rand(batch_size, num_encoder_tokens, encoder_dim) 17 | encoder_mask = torch.LongTensor([ 18 | [1, 1, 1, 1, 1, 1, 1], 19 | [1, 1, 1, 1, 1, 0, 0], 20 | [1, 1, 1, 0, 0, 0, 0] 21 | ]) 22 | decoder_outputs = torch.rand(batch_size, num_decoder_tokens, decoder_dim) 23 | initial_coverage_vector = torch.zeros(batch_size, num_encoder_tokens) 24 | 25 | attention = MLPCoverageAttention(encoder_dim, decoder_dim, attention_dim) 26 | probabilities, coverage_vectors, coverage_vector = \ 27 | attention(decoder_outputs, encoder_outputs, encoder_mask, initial_coverage_vector) 28 | 29 | # It's too hard to test specific values, so we run several sanity checks 30 | assert probabilities.size() == (batch_size, num_decoder_tokens, num_encoder_tokens) 31 | assert coverage_vectors.size() == (batch_size, num_decoder_tokens, num_encoder_tokens) 32 | assert coverage_vector.size() == (batch_size, num_encoder_tokens) 33 | 34 | # Make sure the first coverage vector is the initial argument 35 | assert 
torch.equal(initial_coverage_vector, coverage_vectors[:, 0]) 36 | 37 | # Make sure the last coverage vector is the expected cumulative sum 38 | cumsum = torch.cumsum(probabilities, dim=1) 39 | assert torch.isclose(cumsum[:, -1], coverage_vector).all() 40 | 41 | # Make sure the probabilities obey the mask 42 | assert torch.equal((probabilities > 0).long(), encoder_mask.unsqueeze(1).expand_as(probabilities)) 43 | -------------------------------------------------------------------------------- /summarize/tests/modules/rnns/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/modules/rnns/__init__.py -------------------------------------------------------------------------------- /summarize/tests/modules/rnns/gru_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | from allennlp.nn.util import get_final_encoder_states 4 | 5 | from summarize.tests.modules.rnns import util 6 | 7 | 8 | class TestGRU(unittest.TestCase): 9 | def test_gru_remap_hidden(self): 10 | batch_size = 3 11 | sequence_length = 11 12 | input_size = 5 13 | hidden_size = 7 14 | num_layers = 1 15 | bidirectional = True 16 | 17 | input_data, mask = util.get_random_inputs(batch_size, sequence_length, input_size) 18 | seq2seq_encoder, rnn = util.get_rnns('gru', input_size, hidden_size, num_layers, bidirectional) 19 | 20 | # Ensure the final encoder states are the same, with and without masking 21 | ones_mask = torch.ones(mask.size()) 22 | encoder_outputs = seq2seq_encoder(input_data, None) 23 | expected_hidden = get_final_encoder_states(encoder_outputs, ones_mask, bidirectional) 24 | _, hidden = rnn(input_data, None) 25 | actual_hidden = rnn.reshape_hidden_for_decoder(hidden) 26 | assert (torch.abs(expected_hidden - actual_hidden) < 1e-5).all() 27 | 28 | encoder_outputs = seq2seq_encoder(input_data, mask) 29 | expected_hidden = get_final_encoder_states(encoder_outputs, mask, bidirectional) 30 | _, hidden = rnn(input_data, mask) 31 | actual_hidden = rnn.reshape_hidden_for_decoder(hidden) 32 | assert (torch.abs(expected_hidden - actual_hidden) < 1e-5).all() 33 | -------------------------------------------------------------------------------- /summarize/tests/modules/rnns/lstm_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | from allennlp.nn.util import get_final_encoder_states 4 | 5 | from summarize.tests.modules.rnns import util 6 | 7 | 8 | class TestLSTM(unittest.TestCase): 9 | def test_lstm_remap_hidden(self): 10 | batch_size = 3 11 | sequence_length = 11 12 | input_size = 5 13 | hidden_size = 7 14 | num_layers = 1 15 | bidirectional = True 16 | 17 | input_data, mask = util.get_random_inputs(batch_size, sequence_length, input_size) 18 | seq2seq_encoder, rnn = util.get_rnns('lstm', input_size, hidden_size, num_layers, bidirectional) 19 | 20 | # Ensure the final encoder states are the same, with and without masking 21 | ones_mask = torch.ones(mask.size()) 22 | encoder_outputs = seq2seq_encoder(input_data, None) 23 | expected_hidden = get_final_encoder_states(encoder_outputs, ones_mask, bidirectional) 24 | _, hidden = rnn(input_data, None) 25 | actual_hidden = rnn.reshape_hidden_for_decoder(hidden) 26 | actual_hidden, _ = actual_hidden 27 | assert (torch.abs(expected_hidden - actual_hidden) < 1e-5).all() 28 | 29 | 
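# Repeat the check with the random mask: the remapped hidden state must match the final encoder states taken at each sequence's true (unmasked) length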
encoder_outputs = seq2seq_encoder(input_data, mask) 30 | expected_hidden = get_final_encoder_states(encoder_outputs, mask, bidirectional) 31 | _, hidden = rnn(input_data, mask) 32 | actual_hidden = rnn.reshape_hidden_for_decoder(hidden) 33 | actual_hidden, _ = actual_hidden 34 | assert (torch.abs(expected_hidden - actual_hidden) < 1e-5).all() 35 | -------------------------------------------------------------------------------- /summarize/tests/modules/rnns/rnn_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | 4 | from summarize.tests.modules.rnns import util 5 | 6 | 7 | class TestRNN(unittest.TestCase): 8 | def test_rnn_seq2seq_encoder_are_identical(self): 9 | batch_size = 3 10 | sequence_length = 11 11 | input_size = 5 12 | hidden_size = 7 13 | num_layers = 2 14 | bidirectional = True 15 | 16 | input_data, mask = util.get_random_inputs(batch_size, sequence_length, input_size) 17 | seq2seq_encoder, rnn = util.get_rnns('gru', input_size, hidden_size, num_layers, bidirectional) 18 | 19 | # First, compare without any masking 20 | expected_outputs = seq2seq_encoder(input_data, None) 21 | actual_outputs, _ = rnn(input_data, None) 22 | assert torch.equal(expected_outputs, actual_outputs) 23 | 24 | # Now with the masking 25 | expected_outputs = seq2seq_encoder(input_data, mask) 26 | actual_outputs, _ = rnn(input_data, mask) 27 | assert torch.equal(expected_outputs, actual_outputs) 28 | 29 | def test_rnn_seq2seq_encoder_are_identical_for_loop(self): 30 | # Tests the Seq2SeqEncoder versus the RNN to make sure that when the 31 | # RNN is applied with a for loop that the final outputs are the same 32 | batch_size = 3 33 | sequence_length = 11 34 | input_size = 5 35 | hidden_size = 7 36 | num_layers = 2 37 | bidirectional = False 38 | 39 | input_data, mask = util.get_random_inputs(batch_size, sequence_length, input_size) 40 | seq2seq_encoder, rnn = util.get_rnns('gru', input_size, hidden_size, num_layers, bidirectional) 41 | 42 | expected_outputs = seq2seq_encoder(input_data, None) 43 | actual_outputs = [] 44 | hidden = None 45 | for i in range(sequence_length): 46 | input_step = input_data[:, i, :].unsqueeze(1) 47 | actual_output, hidden = rnn(input_step, None, hidden) 48 | actual_outputs.append(actual_output) 49 | actual_outputs = torch.cat(actual_outputs, dim=1) 50 | assert torch.equal(expected_outputs, actual_outputs) 51 | 52 | def test_no_mask_and_ones_mask_are_identical(self): 53 | # Tests to make sure the outputs are identical when using no mask (None) 54 | # versus a mask of just ones. 
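# A mask of all ones marks every timestep as valid, so it must behave exactly like passing no mask; both the outputs and the final hidden state are compared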
55 | batch_size = 30 56 | sequence_length = 20 57 | input_size = 5 58 | hidden_size = 7 59 | num_layers = 2 60 | bidirectional = False 61 | 62 | input_data, _ = util.get_random_inputs(batch_size, sequence_length, input_size) 63 | _, rnn = util.get_rnns('gru', input_size, hidden_size, num_layers, bidirectional) 64 | mask = torch.ones(input_data.size()[:-1]) 65 | hidden = torch.rand(num_layers, batch_size, hidden_size) 66 | 67 | no_mask_outputs, no_mask_hidden = rnn(input_data, None, hidden) 68 | masked_outputs, masked_hidden = rnn(input_data, mask, hidden) 69 | assert torch.equal(no_mask_outputs, masked_outputs) 70 | assert torch.equal(no_mask_hidden, masked_hidden) 71 | -------------------------------------------------------------------------------- /summarize/tests/modules/rnns/util.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from allennlp.modules import Seq2SeqEncoder 4 | 5 | from summarize.modules.rnns import GRU, LSTM 6 | 7 | 8 | def get_random_inputs(batch_size: int, sequence_length: int, input_size: int): 9 | """ 10 | Creates and returns random masked input data for an RNN. 11 | """ 12 | input_data = torch.randn(batch_size, sequence_length, input_size) 13 | mask = torch.ones(batch_size, sequence_length, dtype=torch.uint8) 14 | # Start with 1 to make sure one of the inputs is not masked at all 15 | for i in range(1, batch_size): 16 | index = random.randint(1, sequence_length) 17 | mask[i, index:] = 0 18 | return input_data, mask 19 | 20 | 21 | def get_rnns(rnn_type: str, input_size: int, hidden_size: int, num_layers: int, bidirectional: bool): 22 | """ 23 | Creates and returns an AllenNLP ``Seq2SeqEncoder`` and an equivalent ``RNN`` that share identical weights. 24 | """ 25 | assert num_layers in [1, 2] 26 | assert rnn_type in ['gru', 'lstm'] 27 | seq2seq_encoder = Seq2SeqEncoder.by_name(rnn_type)(input_size=input_size, hidden_size=hidden_size, 28 | num_layers=num_layers, bidirectional=bidirectional) 29 | if rnn_type == 'gru': 30 | rnn = GRU(input_size, hidden_size, num_layers, bidirectional) 31 | else: 32 | rnn = LSTM(input_size, hidden_size, num_layers, bidirectional) 33 | 34 | rnn.rnn.weight_ih_l0[:] = seq2seq_encoder._module.weight_ih_l0[:] 35 | rnn.rnn.weight_hh_l0[:] = seq2seq_encoder._module.weight_hh_l0[:] 36 | rnn.rnn.bias_ih_l0[:] = seq2seq_encoder._module.bias_ih_l0[:] 37 | rnn.rnn.bias_hh_l0[:] = seq2seq_encoder._module.bias_hh_l0[:] 38 | if bidirectional: 39 | rnn.rnn.weight_ih_l0_reverse[:] = seq2seq_encoder._module.weight_ih_l0_reverse[:] 40 | rnn.rnn.weight_hh_l0_reverse[:] = seq2seq_encoder._module.weight_hh_l0_reverse[:] 41 | rnn.rnn.bias_ih_l0_reverse[:] = seq2seq_encoder._module.bias_ih_l0_reverse[:] 42 | rnn.rnn.bias_hh_l0_reverse[:] = seq2seq_encoder._module.bias_hh_l0_reverse[:] 43 | 44 | if num_layers == 2: 45 | rnn.rnn.weight_ih_l1[:] = seq2seq_encoder._module.weight_ih_l1[:] 46 | rnn.rnn.weight_hh_l1[:] = seq2seq_encoder._module.weight_hh_l1[:] 47 | rnn.rnn.bias_ih_l1[:] = seq2seq_encoder._module.bias_ih_l1[:] 48 | rnn.rnn.bias_hh_l1[:] = seq2seq_encoder._module.bias_hh_l1[:] 49 | if bidirectional: 50 | rnn.rnn.weight_ih_l1_reverse[:] = seq2seq_encoder._module.weight_ih_l1_reverse[:] 51 | rnn.rnn.weight_hh_l1_reverse[:] = seq2seq_encoder._module.weight_hh_l1_reverse[:] 52 | rnn.rnn.bias_ih_l1_reverse[:] = seq2seq_encoder._module.bias_ih_l1_reverse[:] 53 | rnn.rnn.bias_hh_l1_reverse[:] = seq2seq_encoder._module.bias_hh_l1_reverse[:] 54 | 55 | return seq2seq_encoder, rnn 56 |
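A hedged usage sketch of the two helpers above, composed the way the RNN tests use them (all names come from this file and `summarize.modules.rnns`; nothing beyond them is assumed):

    import torch

    from summarize.tests.modules.rnns import util

    # Random inputs; every row after the first is masked from a random index on
    input_data, mask = util.get_random_inputs(3, 11, 5)
    # A weight-tied pair: an AllenNLP Seq2SeqEncoder and the repository's GRU wrapper
    seq2seq_encoder, rnn = util.get_rnns('gru', 5, 7, 1, True)

    expected_outputs = seq2seq_encoder(input_data, mask)
    actual_outputs, _ = rnn(input_data, mask)
    # Identical weights should give identical outputs, as rnn_test.py asserts
    assert torch.equal(expected_outputs, actual_outputs)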
-------------------------------------------------------------------------------- /summarize/tests/modules/sentence_extractors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/modules/sentence_extractors/__init__.py -------------------------------------------------------------------------------- /summarize/tests/modules/sentence_extractors/rnn_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | from allennlp.modules import FeedForward 4 | from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper 5 | from allennlp.nn import Activation 6 | from torch.nn import GRU 7 | 8 | from summarize.modules.sentence_extractors import RNNSentenceExtractor 9 | 10 | 11 | class RNNSentenceExtractorTest(unittest.TestCase): 12 | def test_rnn_sentence_extractor(self): 13 | # Hyperparameters 14 | batch_size = 3 15 | num_sents = 5 16 | input_hidden_size = 7 17 | hidden_size = 11 18 | 19 | # Setup a model 20 | gru = GRU(input_size=input_hidden_size, 21 | hidden_size=hidden_size, 22 | bidirectional=True, 23 | batch_first=True) 24 | rnn = PytorchSeq2SeqWrapper(gru) 25 | feed_forward = FeedForward(input_dim=hidden_size * 2, 26 | num_layers=2, 27 | hidden_dims=[10, 1], 28 | activations=[Activation.by_name('tanh')(), Activation.by_name('linear')()]) 29 | extractor = RNNSentenceExtractor(rnn, feed_forward) 30 | 31 | # Setup some dummy data 32 | sentence_encodings = torch.randn(batch_size, num_sents, input_hidden_size) 33 | mask = torch.ones(batch_size, num_sents) 34 | 35 | # Pass the data through and verify the size of the output 36 | extraction_scores = extractor(sentence_encodings, mask) 37 | assert extraction_scores.size() == (batch_size, num_sents) 38 | -------------------------------------------------------------------------------- /summarize/tests/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/nn/__init__.py -------------------------------------------------------------------------------- /summarize/tests/nn/beam_search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/nn/beam_search/__init__.py -------------------------------------------------------------------------------- /summarize/tests/nn/beam_search/coverage_penalizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/nn/beam_search/coverage_penalizers/__init__.py -------------------------------------------------------------------------------- /summarize/tests/nn/beam_search/coverage_penalizers/onmt_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | 4 | from summarize.nn.beam_search.coverage_penalizers import ONMTCoveragePenalizer 5 | 6 | 7 | class TestONMTCoveragePenalizer(unittest.TestCase): 8 | def test_onmt_coverage_penalizer(self): 9 | coverage = torch.FloatTensor([[0.4, 1.2, 0.8], [1.5, 0.7, 0.0]]) 10 | 11 | penalizer = ONMTCoveragePenalizer(0.0) 12 |
penalties = penalizer(coverage) 13 | expected_penalties = torch.FloatTensor([0.0, 0.0]) 14 | assert torch.allclose(expected_penalties, penalties) 15 | 16 | penalizer = ONMTCoveragePenalizer(0.5) 17 | penalties = penalizer(coverage) 18 | expected_penalties = torch.FloatTensor([-0.2 * 0.5, -0.5 * 0.5]) 19 | assert torch.allclose(expected_penalties, penalties, atol=1e-3) 20 | -------------------------------------------------------------------------------- /summarize/tests/nn/beam_search/length_penalizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/nn/beam_search/length_penalizers/__init__.py -------------------------------------------------------------------------------- /summarize/tests/nn/beam_search/length_penalizers/average_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | 4 | from summarize.nn.beam_search.length_penalizers import AverageLengthPenalizer 5 | 6 | 7 | class TestAverageLengthPenalizer(unittest.TestCase): 8 | def test_average_length_penalizer(self): 9 | lengths = torch.LongTensor([[1, 2], [3, 4]]) 10 | 11 | penalizer = AverageLengthPenalizer() 12 | penalties = penalizer(lengths) 13 | assert torch.equal(lengths.float(), penalties) 14 | -------------------------------------------------------------------------------- /summarize/tests/nn/beam_search/length_penalizers/wu_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | 4 | from summarize.nn.beam_search.length_penalizers import WuLengthPenalizer 5 | 6 | 7 | class TestWuLengthPenalizer(unittest.TestCase): 8 | def test_wu_length_penalizer(self): 9 | lengths = torch.LongTensor([[1, 2], [3, 4]]) 10 | 11 | penalizer = WuLengthPenalizer(0.0) 12 | penalties = penalizer(lengths) 13 | expected_penalties = torch.FloatTensor([[1.0, 1.0], [1.0, 1.0]]) 14 | assert torch.allclose(expected_penalties, penalties) 15 | 16 | penalizer = WuLengthPenalizer(0.5) 17 | penalties = penalizer(lengths) 18 | expected_penalties = torch.FloatTensor([[1.0, 1.0801], [1.1547, 1.2247]]) 19 | assert torch.allclose(expected_penalties, penalties, atol=1e-3) 20 | -------------------------------------------------------------------------------- /summarize/tests/nn/util_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | 4 | from summarize.nn.util import normalize_losses 5 | 6 | 7 | class TestUtil(unittest.TestCase): 8 | def test_normalize_losses(self): 9 | losses = torch.FloatTensor([ 10 | [1.0, 2.0, 3.0], 11 | [4.0, 5.0, 6.0] 12 | ]) 13 | mask = torch.FloatTensor([ 14 | [1.0, 1.0, 1.0], 15 | [1.0, 1.0, 0.0] 16 | ]) 17 | 18 | actual_loss = normalize_losses(losses, mask, 'sum', 'sum') 19 | expected_loss = 15.0 20 | assert expected_loss == actual_loss.item() 21 | 22 | actual_loss = normalize_losses(losses, mask, 'sum', 'average') 23 | expected_loss = 7.5 24 | assert expected_loss == actual_loss.item() 25 | 26 | actual_loss = normalize_losses(losses, mask, 'average', 'sum') 27 | expected_loss = 6.5 28 | assert expected_loss == actual_loss.item() 29 | 30 | actual_loss = normalize_losses(losses, mask, 'average', 'average') 31 | expected_loss = 3.25 32 | assert expected_loss == actual_loss.item() 33 | 34 | with self.assertRaises(Exception): 35 | normalize_losses(losses, mask, 
'unknown', 'sum') 36 | with self.assertRaises(Exception): 37 | normalize_losses(losses, mask, 'sum', 'unknown') 38 | -------------------------------------------------------------------------------- /summarize/tests/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/training/__init__.py -------------------------------------------------------------------------------- /summarize/tests/training/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/training/metrics/__init__.py -------------------------------------------------------------------------------- /summarize/tests/training/metrics/binary_f1_measure_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | 4 | from summarize.training.metrics import BinaryF1Measure 5 | 6 | 7 | class BinaryF1MeasureTest(unittest.TestCase): 8 | def test_binary_f1_measure(self): 9 | gold_labels = torch.LongTensor([ 10 | [1, 1, 0], 11 | [0, 1, 1], 12 | [0, 0, 0] 13 | ]) 14 | model_labels = torch.LongTensor([ 15 | [1, 0, 1], 16 | [1, 1, 1], 17 | [0, 1, 1] 18 | ]) 19 | mask = torch.LongTensor([ 20 | [1, 1, 1], 21 | [1, 1, 1], 22 | [0, 0, 0] 23 | ]) 24 | 25 | metric = BinaryF1Measure() 26 | expected_precision = 3 / 5 27 | expected_recall = 3 / 4 28 | expected_f1 = 2 * (expected_precision * expected_recall) / (expected_precision + expected_recall) 29 | 30 | metric(gold_labels, model_labels, mask) 31 | actual_metrics = metric.get_metric() 32 | self.assertAlmostEqual(actual_metrics['precision'], expected_precision, delta=1e-5) 33 | self.assertAlmostEqual(actual_metrics['recall'], expected_recall, delta=1e-5) 34 | self.assertAlmostEqual(actual_metrics['f1'], expected_f1, delta=1e-5) 35 | -------------------------------------------------------------------------------- /summarize/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/training/__init__.py -------------------------------------------------------------------------------- /summarize/training/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from summarize.training.metrics.binary_f1_measure import BinaryF1Measure 2 | from summarize.training.metrics.python_rouge_metric import PythonRougeMetric 3 | from summarize.training.metrics.cross_entropy_metric import CrossEntropyMetric 4 | -------------------------------------------------------------------------------- /summarize/training/metrics/binary_f1_measure.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from allennlp.training.metrics import F1Measure, Metric 3 | from overrides import overrides 4 | from typing import Dict, Optional 5 | 6 | 7 | @Metric.register('binary-f1') 8 | class BinaryF1Measure(F1Measure): 9 | """ 10 | The BinaryF1Measure allows for computing the standard F1 metric using 11 | two binary vectors, the ground-truth labels and the predictions from the 12 | model. 
The original F1Measure computation would require 13 | predictions to be a (batch_size, ..., 2) binary tensor that marks the 14 | predicted class. 15 | """ 16 | def __init__(self) -> None: 17 | super().__init__(1) 18 | 19 | @overrides 20 | def __call__(self, 21 | gold_labels: torch.Tensor, 22 | model_labels: torch.Tensor, 23 | mask: Optional[torch.Tensor] = None, 24 | **kwargs): 25 | """ 26 | Parameters 27 | ---------- 28 | gold_labels: (batch_size, ...) 29 | The ground-truth binary labels 30 | model_labels: (batch_size, ...) 31 | The binary model predictions 32 | mask: (batch_size, ...) 33 | The mask 34 | """ 35 | categorical_model_labels = model_labels.new_zeros(*model_labels.size(), 2) 36 | model_labels = model_labels.unsqueeze(-1) 37 | categorical_model_labels.scatter_(-1, model_labels, 1) 38 | super().__call__(categorical_model_labels, gold_labels, mask) 39 | 40 | @overrides 41 | def get_metric(self, reset: bool = False) -> Dict[str, float]: 42 | precision, recall, f1_measure = super().get_metric(reset) 43 | return { 44 | 'precision': precision, 45 | 'recall': recall, 46 | 'f1': f1_measure 47 | } 48 | -------------------------------------------------------------------------------- /summarize/training/metrics/cross_entropy_metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from allennlp.training.metrics import Metric 3 | from overrides import overrides 4 | from typing import Dict 5 | 6 | 7 | @Metric.register('cross-entropy') 8 | class CrossEntropyMetric(Metric): 9 | def __init__(self) -> None: 10 | self.total_loss = 0 11 | self.total_num_tokens = 0 12 | 13 | @overrides 14 | def __call__(self, loss: float, num_tokens: int) -> None: 15 | self.total_loss += loss 16 | self.total_num_tokens += num_tokens 17 | 18 | @overrides 19 | def get_metric(self, reset: bool = False) -> Dict[str, float]: 20 | cross_entropy = self.total_loss / self.total_num_tokens 21 | perplexity = np.exp(cross_entropy) 22 | if reset: 23 | self.total_loss = 0 24 | self.total_num_tokens = 0 25 | return { 26 | 'cross-entropy': cross_entropy, 27 | 'perplexity': perplexity 28 | } 29 | -------------------------------------------------------------------------------- /summarize/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/utils/__init__.py -------------------------------------------------------------------------------- /summarize/utils/copy_jsonl_fields.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from summarize.data.io import JsonlReader, JsonlWriter 4 | 5 | 6 | def main(args): 7 | with JsonlWriter(args.output_jsonl) as out: 8 | with JsonlReader(args.source_jsonl) as source: 9 | with JsonlReader(args.target_jsonl) as target: 10 | for source_instance, target_instance in zip(source, target): 11 | for source_field, target_field in args.field_names: 12 | target_instance[target_field] = source_instance[source_field] 13 | out.write(target_instance) 14 | 15 | 16 | if __name__ == '__main__': 17 | argp = argparse.ArgumentParser() 18 | argp.add_argument('source_jsonl', help='The file with the desired field') 19 | argp.add_argument('target_jsonl', help='The destination file') 20 | argp.add_argument('output_jsonl', help='The file with the target data and copied source field') 21 | argp.add_argument('--field-names', nargs=2,
action='append', 22 | help='The names of the source and target fields') 23 | args = argp.parse_args() 24 | main(args) 25 | -------------------------------------------------------------------------------- /summarize/utils/extract_cloze_from_labels.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from summarize.data.io import JsonlReader, JsonlWriter 4 | 5 | 6 | def main(args): 7 | with JsonlWriter(args.output_jsonl) as out: 8 | with JsonlReader(args.input_jsonl) as f: 9 | for instance in f: 10 | document = instance['document'] 11 | labels = instance['labels'] 12 | cloze = [document[index] for index in labels] 13 | if not args.keep_sentences: 14 | cloze = ' '.join(cloze) 15 | out.write({args.field_name: cloze}) 16 | 17 | 18 | if __name__ == '__main__': 19 | argp = argparse.ArgumentParser() 20 | argp.add_argument('input_jsonl', help='The input file with the labeled summaries.') 21 | argp.add_argument('output_jsonl', help='The output file') 22 | argp.add_argument('--field-name', default='cloze', help='The name of the output field') 23 | argp.add_argument('--keep-sentences', action='store_true', help='Indicates if the output field should be left as sentences or flattened') 24 | args = argp.parse_args() 25 | main(args) 26 | -------------------------------------------------------------------------------- /summarize/utils/extract_summary_from_labels.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from summarize.data.io import JsonlReader, JsonlWriter 4 | 5 | 6 | def main(args): 7 | with JsonlWriter(args.output_jsonl) as out: 8 | with JsonlReader(args.input_jsonl) as f: 9 | for instance in f: 10 | document = instance['document'] 11 | labels = instance['labels'] 12 | summary = [document[index] for index in labels] 13 | out.write({'summary': summary}) 14 | 15 | 16 | if __name__ == '__main__': 17 | argp = argparse.ArgumentParser() 18 | argp.add_argument('input_jsonl', help='The input file with the labeled summaries.') 19 | argp.add_argument('output_jsonl', help='The output file') 20 | args = argp.parse_args() 21 | main(args) 22 | -------------------------------------------------------------------------------- /summarize/utils/replace_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Replaces the model configuration in a model.tar.gz file with a new one. The 3 | new configuration can be a jsonnet file that will be evaluated into a json. 
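Example invocation (a hedged sketch; the three arguments are positional, in the order defined below, and the file names are illustrative):

    python summarize/utils/replace_config.py model.tar.gz new-model.tar.gz model.jsonnet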
4 | """ 5 | import argparse 6 | import tarfile 7 | import json 8 | from io import BytesIO 9 | 10 | from allennlp.common.params import Params 11 | 12 | 13 | def main(args): 14 | tar_bytes = open(args.model_file_path, 'rb').read() 15 | with tarfile.open(fileobj=BytesIO(tar_bytes), mode='r:gz') as tar: 16 | with tarfile.open(args.output_file_path, 'w:gz') as out: 17 | for member in tar.getmembers(): 18 | if member.name != 'config.json': 19 | out.addfile(member, tar.extractfile(member.name)) 20 | else: 21 | new_params = Params.from_file(args.config_file_path) 22 | serialized_params = json.dumps(new_params.as_ordered_dict(), indent=4).encode() 23 | bytes_io = BytesIO(serialized_params) 24 | member.size = len(serialized_params) 25 | out.addfile(tarinfo=member, fileobj=bytes_io) 26 | 27 | 28 | if __name__ == '__main__': 29 | argp = argparse.ArgumentParser() 30 | argp.add_argument('model_file_path', help='The path to the model.tar.gz with the config to replace') 31 | argp.add_argument('output_file_path', help='The path to the new model.tar.gz') 32 | argp.add_argument('config_file_path', help='The path to the new config file') 33 | args = argp.parse_args() 34 | main(args) 35 | --------------------------------------------------------------------------------