├── .gitignore
├── .gitmodules
├── LICENSE
├── Readme.md
├── data
├── cnn-dailymail
│ ├── Readme.md
│ └── setup.sh
├── deutsch2019
│ ├── Readme.md
│ └── setup.sh
├── gigaword
│ └── Readme.md
├── kedzie2018
│ └── cnn-dailymail
│ │ └── setup.sh
├── onmt
│ ├── Readme.md
│ ├── convert_to_jsonl.py
│ └── setup.sh
└── wikicite
│ ├── Readme.md
│ └── setup.sh
├── experiments
├── deutsch2019
│ ├── Readme.md
│ ├── abstractive-step
│ │ ├── Readme.md
│ │ ├── coverage
│ │ │ ├── .gitignore
│ │ │ ├── evaluate.sh
│ │ │ ├── model.jsonnet
│ │ │ ├── predict.sh
│ │ │ ├── run.sh
│ │ │ └── train.sh
│ │ └── pointer-generator
│ │ │ ├── .gitignore
│ │ │ ├── evaluate.sh
│ │ │ ├── model.jsonnet
│ │ │ ├── predict.sh
│ │ │ ├── run.sh
│ │ │ └── train.sh
│ ├── baselines
│ │ ├── lead
│ │ │ ├── .gitignore
│ │ │ └── run.sh
│ │ ├── open-ai
│ │ │ ├── .gitignore
│ │ │ ├── Readme.md
│ │ │ ├── run.sh
│ │ │ └── setup.sh
│ │ └── oracle
│ │ │ ├── .gitignore
│ │ │ └── run.sh
│ ├── demo.ipynb
│ └── extractive-step
│ │ ├── Readme.md
│ │ ├── bm25
│ │ ├── .gitignore
│ │ ├── calculate-df.sh
│ │ ├── evaluate.sh
│ │ └── predict.sh
│ │ ├── extractive-model
│ │ ├── .gitignore
│ │ ├── Readme.md
│ │ ├── evaluate.sh
│ │ ├── model.jsonnet
│ │ ├── predict.sh
│ │ ├── preprocess.sh
│ │ └── train.sh
│ │ ├── lead
│ │ ├── .gitignore
│ │ ├── Readme.md
│ │ ├── preprocess.sh
│ │ └── run.sh
│ │ ├── oracle
│ │ ├── .gitignore
│ │ ├── Readme.md
│ │ ├── preprocess.sh
│ │ └── run.sh
│ │ └── sumfocus
│ │ ├── .gitignore
│ │ ├── Readme.md
│ │ ├── analyze_results.py
│ │ ├── run-max-sents.sh
│ │ ├── run-max-words.sh
│ │ └── run-parameter-sweep.sh
├── kedzie2018
│ ├── Readme.md
│ └── cnn-dailymail
│ │ ├── extractive-model
│ │ ├── .gitignore
│ │ ├── evaluate.sh
│ │ ├── model.jsonnet
│ │ ├── predict.sh
│ │ └── train.sh
│ │ ├── lead
│ │ ├── .gitignore
│ │ └── run.sh
│ │ └── oracle
│ │ ├── .gitignore
│ │ └── run.sh
├── onmt
│ ├── Readme.md
│ ├── convert_to_jsonl.py
│ ├── demo.ipynb
│ ├── pointer-generator
│ │ ├── .gitignore
│ │ ├── Readme.md
│ │ ├── evaluate.sh
│ │ ├── model.jsonnet
│ │ ├── predict.sh
│ │ ├── replace-config.sh
│ │ ├── run.sh
│ │ └── train.sh
│ └── seq2seq
│ │ ├── .gitignore
│ │ ├── Readme.md
│ │ ├── evaluate.sh
│ │ ├── model.jsonnet
│ │ ├── predict.sh
│ │ ├── replace-config.sh
│ │ ├── run.sh
│ │ └── train.sh
└── wikicite
│ └── analysis
│ ├── document-distribution
│ ├── Readme.md
│ └── run.py
│ └── topic-distribution
│ ├── .gitignore
│ ├── Readme.md
│ └── run.py
├── external
├── ROUGE-1.5.5
│ ├── .gitignore
│ └── Readme.md
└── meteor
│ ├── .gitignore
│ ├── Readme.md
│ └── setup.sh
├── requirements.txt
├── runtime.txt
└── summarize
├── __init__.py
├── common
├── __init__.py
├── tempdir.py
├── testing.py
└── util.py
├── data
├── __init__.py
├── dataset_readers
│ ├── __init__.py
│ ├── cloze
│ │ ├── __init__.py
│ │ ├── abstractive.py
│ │ ├── extractive.py
│ │ └── pointer_generator.py
│ ├── sds
│ │ ├── __init__.py
│ │ ├── abstractive.py
│ │ ├── extractive.py
│ │ └── pointer_generator.py
│ └── util.py
├── dataset_setup
│ ├── __init__.py
│ ├── cnn_dailymail.py
│ ├── deutsch2019.py
│ ├── gigaword.py
│ ├── kedzie2018.py
│ ├── tokenize.py
│ ├── util.py
│ └── wikicite.py
├── dataset_stats
│ ├── __init__.py
│ └── sds.py
├── io
│ ├── __init__.py
│ ├── jsonl_reader.py
│ ├── jsonl_writer.py
│ └── util.py
└── paragraph_tokenizers
│ ├── __init__.py
│ ├── paragraph_tokenizer.py
│ └── paragraph_word_tokenizer.py
├── metrics
├── __init__.py
├── meteor.py
├── python_rouge.py
└── rouge.py
├── models
├── __init__.py
├── cloze
│ ├── __init__.py
│ ├── bm25
│ │ ├── __init__.py
│ │ ├── bm25.py
│ │ └── calculate_df.py
│ ├── extractive_baseline.py
│ ├── lead.py
│ ├── open_ai_language_model.py
│ ├── oracle.py
│ ├── pointer_generator.py
│ ├── seq2seq.py
│ └── sumfocus.py
└── sds
│ ├── __init__.py
│ ├── extractive_baseline.py
│ ├── lead.py
│ ├── oracle.py
│ ├── pointer_generator.py
│ └── seq2seq.py
├── modules
├── __init__.py
├── bridge.py
├── coverage_matrix_attention
│ ├── __init__.py
│ ├── coverage_matrix_attention.py
│ ├── matrix_attention_wrapper.py
│ └── mlp.py
├── generate_probability_functions
│ ├── __init__.py
│ ├── generate_probability_function.py
│ ├── onmt.py
│ └── see2017.py
├── matrix_attention
│ ├── __init__.py
│ └── mlp.py
├── rnns
│ ├── __init__.py
│ ├── gru.py
│ ├── lstm.py
│ ├── rnn.py
│ └── util.py
└── sentence_extractors
│ ├── __init__.py
│ ├── rnn.py
│ └── sentence_extractor.py
├── nn
├── __init__.py
├── beam_search
│ ├── __init__.py
│ ├── beam_search.py
│ ├── coverage_penalizers
│ │ ├── __init__.py
│ │ ├── coverage_penalizer.py
│ │ └── onmt.py
│ ├── length_penalizers
│ │ ├── __init__.py
│ │ ├── average.py
│ │ ├── length_penalizer.py
│ │ └── wu.py
│ └── relaxed.py
└── util.py
├── predictors
├── __init__.py
├── cloze
│ ├── __init__.py
│ ├── abstractive.py
│ └── extractive.py
└── sds
│ ├── __init__.py
│ ├── abstractive.py
│ └── extractive.py
├── tests
├── __init__.py
├── common
│ ├── __init__.py
│ └── tempdir_test.py
├── data
│ ├── __init__.py
│ ├── dataset_readers
│ │ ├── __init__.py
│ │ ├── cloze
│ │ │ ├── __init__.py
│ │ │ ├── abstractive_test.py
│ │ │ ├── extractive_test.py
│ │ │ └── pointer_generator_test.py
│ │ └── sds
│ │ │ ├── __init__.py
│ │ │ ├── abstractive_test.py
│ │ │ ├── extractive_test.py
│ │ │ └── pointer_generator_test.py
│ ├── dataset_setup
│ │ ├── __init__.py
│ │ └── tokenize_test.py
│ ├── io
│ │ ├── __init__.py
│ │ ├── jsonl_reader_test.py
│ │ ├── jsonl_writer_test.py
│ │ └── util_test.py
│ └── paragraph_tokenizers
│ │ ├── __init__.py
│ │ └── paragraph_word_tokenizer_test.py
├── fixtures
│ ├── configs
│ │ ├── cloze
│ │ │ ├── extractive-baseline.jsonnet
│ │ │ ├── pointer-generator.jsonnet
│ │ │ └── seq2seq.jsonnet
│ │ └── sds
│ │ │ ├── extractive-baseline.jsonnet
│ │ │ ├── pointer-generator.jsonnet
│ │ │ └── seq2seq.jsonnet
│ └── data
│ │ ├── chen2018
│ │ ├── Readme.md
│ │ ├── gold.jsonl
│ │ └── model.jsonl
│ │ ├── cloze.jsonl
│ │ ├── hong2014
│ │ ├── centroid.jsonl
│ │ ├── classy04.jsonl
│ │ ├── classy11.jsonl
│ │ ├── dpp.jsonl
│ │ ├── freq-sum.jsonl
│ │ ├── greedy-kl.jsonl
│ │ ├── icsi-summ.jsonl
│ │ ├── lexrank.jsonl
│ │ ├── occams-v.jsonl
│ │ ├── reg-sum.jsonl
│ │ ├── setup.py
│ │ ├── submodular.jsonl
│ │ └── ts-sum.jsonl
│ │ └── sds.jsonl
├── metrics
│ ├── __init__.py
│ ├── meteor_test.py
│ ├── python_rouge_test.py
│ └── rouge_test.py
├── models
│ ├── __init__.py
│ ├── cloze
│ │ ├── __init__.py
│ │ ├── bm25
│ │ │ ├── __init__.py
│ │ │ ├── bm25_test.py
│ │ │ └── calculate_df_test.py
│ │ ├── extractive_baseline_test.py
│ │ ├── lead_test.py
│ │ ├── open_ai_language_model_test.py
│ │ ├── pointer_generator_test.py
│ │ ├── seq2seq_test.py
│ │ └── sumfocus_test.py
│ └── sds
│ │ ├── __init__.py
│ │ ├── extractive_baseline_test.py
│ │ ├── lead_test.py
│ │ ├── pointer_generator_test.py
│ │ └── seq2seq_test.py
├── modules
│ ├── __init__.py
│ ├── bridge_test.py
│ ├── coverage_matrix_attention
│ │ ├── __init__.py
│ │ └── mlp_test.py
│ ├── rnns
│ │ ├── __init__.py
│ │ ├── gru_test.py
│ │ ├── lstm_test.py
│ │ ├── rnn_test.py
│ │ └── util.py
│ └── sentence_extractors
│ │ ├── __init__.py
│ │ └── rnn_test.py
├── nn
│ ├── __init__.py
│ ├── beam_search
│ │ ├── __init__.py
│ │ ├── beam_search_test.py
│ │ ├── coverage_penalizers
│ │ │ ├── __init__.py
│ │ │ └── onmt_test.py
│ │ ├── length_penalizers
│ │ │ ├── __init__.py
│ │ │ ├── average_test.py
│ │ │ └── wu_test.py
│ │ └── relaxed_test.py
│ └── util_test.py
└── training
│ ├── __init__.py
│ └── metrics
│ ├── __init__.py
│ ├── binary_f1_measure_test.py
│ └── python_rouge_metric_test.py
├── training
├── __init__.py
└── metrics
│ ├── __init__.py
│ ├── binary_f1_measure.py
│ ├── cross_entropy_metric.py
│ └── python_rouge_metric.py
└── utils
├── __init__.py
├── copy_jsonl_fields.py
├── extract_cloze_from_labels.py
├── extract_summary_from_labels.py
└── replace_config.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .pytest_cache
2 | __pycache__
3 | .DS_Store
4 | /data
5 | .ipynb_checkpoints
6 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/gpt-2"]
2 | path = external/gpt-2
3 | url = https://github.com/openai/gpt-2
4 |
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
1 | # Summarize
2 | Summarize is a PyTorch-based package for automatic summarization built on AllenNLP.
3 | It contains implementations of end-to-end extractive models and abstractive models, including the standard Seq2Seq and Pointer-Generator models.
4 |
5 | This repository also contains the code for the "Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization" paper.
6 | Please see `experiments/deutsch2019` for more details about how to run the models.
7 |
--------------------------------------------------------------------------------
/data/cnn-dailymail/setup.sh:
--------------------------------------------------------------------------------
# Runs the cnn_dailymail dataset setup module with data/cnn-dailymail as its
# argument, then runs the tokenize module (NLTK backend) on every split of the
# cnn, dailymail, and cnn-dailymail directories. `document summary` are passed
# through as positional arguments (presumably the fields to tokenize).
data_root=data/cnn-dailymail

python -m summarize.data.dataset_setup.cnn_dailymail ${data_root}

for split in train valid test; do
  for dataset in cnn dailymail cnn-dailymail; do
    corpus_dir=${data_root}/${dataset}
    python -m summarize.data.dataset_setup.tokenize \
      ${corpus_dir}/${split}.jsonl.gz \
      ${corpus_dir}/${split}.tokenized.jsonl.gz \
      document summary \
      --backend nltk
  done
done
11 |
--------------------------------------------------------------------------------
/data/deutsch2019/Readme.md:
--------------------------------------------------------------------------------
1 | This directory contains the script to preprocess the WikiCite dataset for "Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization."
2 |
3 | First, run the setup script under the `data/wikicite` directory.
4 | Then, run the `setup.sh` script to compute the ROUGE-based heuristic extractive labels for the dataset.
5 | The processing speed is somewhat slow, so it may take several hours to process the data.
6 | Alternatively, the preprocessed data can be downloaded here:
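7 | [train](https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/train.v1.1.jsonl.gz),
8 | [valid](https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/valid.v1.1.jsonl.gz),
9 | [test](https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/test.v1.1.jsonl.gz).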
10 |
--------------------------------------------------------------------------------
/data/deutsch2019/setup.sh:
--------------------------------------------------------------------------------
# Runs the deutsch2019 dataset setup module on each tokenized WikiCite split
# with 8 cores, writing data/deutsch2019/<split>.v1.1.jsonl.gz.
#
# The input path is the file that data/wikicite/setup.sh writes
# (data/wikicite/<split>.tokenized.jsonl.gz), so run that script first.
for split in train valid test; do
  python -m summarize.data.dataset_setup.deutsch2019 \
    "data/wikicite/${split}.tokenized.jsonl.gz" \
    "data/deutsch2019/${split}.v1.1.jsonl.gz" \
    --num-cores 8
done
7 |
--------------------------------------------------------------------------------
/data/gigaword/Readme.md:
--------------------------------------------------------------------------------
1 | # Gigaword
2 | ## Setup
3 | To set up the Gigaword corpus, run the following command:
4 | ```
5 | python -m summarize.data.dataset_setup.gigaword \
6 | data/gigaword
7 | ```
8 | The script downloads the data from https://github.com/harvardnlp/sent-summary, replaces the `UNK` token with the AllenNLP special token for out-of-vocabulary words, and saves the data in the jsonl format.
9 |
10 | There are 3,803,957 training, 189,651 validation, and 1,951 testing examples.
11 |
12 | This is the dataset which is used to train the [OpenNMT-py Gigaword summarization models](http://opennmt.net/Models-py/#summarization).
13 | I assume it is also the data used by [Rush et al. (2015)](https://www.aclweb.org/anthology/D15-1044), but the paper does not link to any dataset or code, nor does it specify the sizes of the dataset splits.
14 | The follow up work, [Ranzato et al. (2016)](https://arxiv.org/pdf/1511.06732.pdf), also uses Gigaword, but the dataset split sizes are very different (179,414 training, 22,568 validation, and 22,259 testing examples).
15 | The [corresponding repository](https://github.com/facebookarchive/MIXER) only has instructions and code for the machine translation experiments.
16 |
--------------------------------------------------------------------------------
/data/kedzie2018/cnn-dailymail/setup.sh:
--------------------------------------------------------------------------------
# Runs the kedzie2018 dataset setup module once per split, reading the
# tokenized CNN/DailyMail file from S3 and writing the result under
# data/kedzie2018/cnn-dailymail with 16 cores.
input_root=https://s3.amazonaws.com/danieldeutsch/summarize/data/cnn-dailymail/cnn-dailymail
output_root=data/kedzie2018/cnn-dailymail

for split in train valid test; do
  # NOTE(review): the positional `100` is interpreted by the setup module;
  # its meaning is not visible from this script -- confirm before changing.
  python -m summarize.data.dataset_setup.kedzie2018 \
    ${input_root}/${split}.tokenized.v1.0.jsonl.gz \
    ${output_root}/${split}.jsonl.gz \
    100 \
    --num-cores 16
done
8 |
--------------------------------------------------------------------------------
/data/onmt/Readme.md:
--------------------------------------------------------------------------------
1 | # OpenNMT CNN/DailyMail
2 | This dataset is the CNN/DailyMail dataset as prepared by the OpenNMT library.
3 | The preprocessed data can be downloaded here:
4 |
5 | - https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/train.v1.0.jsonl.gz
6 | - https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/valid.v1.0.jsonl.gz
7 | - https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/test.v1.0.jsonl.gz
8 |
--------------------------------------------------------------------------------
/data/onmt/convert_to_jsonl.py:
--------------------------------------------------------------------------------
1 | # Edit the system path so the summarize library can be imported
2 | import sys
3 | sys.path.append('.')
4 |
5 | import argparse
6 | import json
7 | import re
8 |
9 | from summarize.data.io import JsonlWriter
10 |
11 |
def main(args):
    """Convert parallel OpenNMT source/target text files into one jsonl file.

    ``args.src_tsv`` and ``args.tgt_tsv`` are read in lockstep, one example
    per line. For each pair, a ``{'document': [...], 'summary': [...]}``
    object is written to ``args.output_jsonl``: the document is the stripped
    source line and the summary is the list of tagged target sentences.
    Pairs whose source line is blank are skipped.
    """
    # The ".tgt.tagged" files wrap every summary sentence in "<t> ... </t>";
    # capture just the text between each pair of tags.
    sentence_pattern = re.compile(r'<t> (.+?) </t>')

    with JsonlWriter(args.output_jsonl) as out, \
            open(args.src_tsv, 'r') as f_src, \
            open(args.tgt_tsv, 'r') as f_tgt:
        for src, tgt in zip(f_src, f_tgt):
            source_text = src.strip()
            if not source_text:
                continue

            out.write({
                'document': [source_text],
                'summary': sentence_pattern.findall(tgt),
            })
25 |
26 |
if __name__ == '__main__':
    # Command-line entry point: three required positional paths, parsed and
    # handed straight to main().
    parser = argparse.ArgumentParser()
    for positional in ('src_tsv', 'tgt_tsv', 'output_jsonl'):
        parser.add_argument(positional)
    main(parser.parse_args())
34 |
--------------------------------------------------------------------------------
/data/onmt/setup.sh:
--------------------------------------------------------------------------------
# Downloads the OpenNMT CNN/DailyMail archive, unpacks it into data/onmt/onmt,
# and converts each split's source/tagged-target pair into a jsonl.gz file.
onmt_dir=data/onmt

wget https://s3.amazonaws.com/opennmt-models/Summary/cnndm.tar.gz -O "${onmt_dir}/cnndm.tar.gz"
# -p so that re-running the script does not error on the existing directory.
mkdir -p "${onmt_dir}/onmt"
tar -xzvf "${onmt_dir}/cnndm.tar.gz" -C "${onmt_dir}/onmt"

# Each entry is <name in the archive>:<name of the output split>; the archive
# calls the validation split "val" while the output file is named "valid".
for pair in train:train val:valid test:test; do
  archive_split="${pair%%:*}"
  output_split="${pair##*:}"
  python "${onmt_dir}/convert_to_jsonl.py" \
    "${onmt_dir}/onmt/${archive_split}.txt.src" \
    "${onmt_dir}/onmt/${archive_split}.txt.tgt.tagged" \
    "${onmt_dir}/${output_split}.jsonl.gz"
done
19 |
--------------------------------------------------------------------------------
/data/wikicite/Readme.md:
--------------------------------------------------------------------------------
1 | # WikiCite
2 | The WikiCite dataset is a collection of summary cloze instances collected from Wikipedia.
3 | For more details, please see https://github.com/danieldeutsch/wikicite.
4 |
5 | ## Setup
6 | The `setup.sh` script downloads the original dataset and tokenizes the text fields.
7 | The original dataset and tokenized versions can be downloaded here:
8 |
9 |
10 | | Corpus | Train | Valid | Test |
11 | |---|---|---|---|
12 | | Original | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/train.v1.1.jsonl.gz | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/valid.v1.1.jsonl.gz | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/test.v1.1.jsonl.gz |
13 | | Tokenized | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/train.tokenized.v1.1.jsonl.gz | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/valid.tokenized.v1.1.jsonl.gz | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/test.tokenized.v1.1.jsonl.gz |
14 |
35 | ## Citation
36 | If you use this dataset, please cite the following paper:
37 | ```
38 | @inproceedings{DeutschRo19,
39 | author = {Daniel Deutsch and Dan Roth},
40 | title = {{Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization}},
41 | booktitle = {Proc. of the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
42 | year = {2019},
43 | url = "https://cogcomp.seas.upenn.edu/papers/DeutschRo19.pdf",
44 | funding = {ARL},
45 | }
46 | ```
47 |
--------------------------------------------------------------------------------
/data/wikicite/setup.sh:
--------------------------------------------------------------------------------
# Runs the wikicite dataset setup module once per split, reading the original
# v1.1 file from S3 and writing data/wikicite/<split>.tokenized.jsonl.gz.
source_root=https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite
target_root=data/wikicite

for split in train valid test; do
  python -m summarize.data.dataset_setup.wikicite \
    ${source_root}/${split}.v1.1.jsonl.gz \
    ${target_root}/${split}.tokenized.jsonl.gz
done
6 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/Readme.md:
--------------------------------------------------------------------------------
1 | # Deutsch 2019
2 | This directory contains the experiments related to "Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization" by Deutsch and Roth (2019).
3 |
4 | ## Demo
5 | A demo of the final models (with the topics and context) can be viewed by clicking this badge:
6 |
7 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/danieldeutsch/summarize/21054f43de1b363aba1e1283d62736e5117877bf?filepath=experiments%2Fdeutsch2019%2Fdemo.ipynb)
8 |
9 | If you run the Jupyter Notebook on the MyBinder servers, the abstractive model takes around 30 to 60 seconds to produce the output.
10 |
11 | ## Instructions
12 | First, it may be necessary to check out [this commit](https://github.com/danieldeutsch/summarize/releases/tag/emnlp2019), since there may have been breaking changes to the code since the original models were trained.
13 |
14 | Then, setup the WikiCite dataset by running the setup script in `data/deutsch2019`.
15 |
16 | Each of the directories contains the scripts to run the different models from the paper.
17 | The `baselines` directory contains code for some baseline models, such as the lead, oracle, and language model baselines.
18 | The `extractive-step` directory contains the code for the extractive models and extractive preprocessing steps.
19 | The `abstractive-step` directory contains the code for training the abstractive models, both the base Pointer-Generator model and the fine-tuned model with the coverage loss.
20 | The directories contain documentation with extra information, results, and saved models.
21 |
22 | If you use any of the code or data from this experiment, please cite the following paper:
23 | ```
24 | @inproceedings{DeutschRo19,
25 | author = {Daniel Deutsch and Dan Roth},
26 | title = {{Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization}},
27 | booktitle = {Proc. of the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
28 | year = {2019},
29 | url = "https://cogcomp.seas.upenn.edu/papers/DeutschRo19.pdf",
30 | funding = {ARL},
31 | }
32 | ```
33 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/evaluate.sh:
--------------------------------------------------------------------------------
# Computes ROUGE for the predictions under output/ on the valid and test
# splits and writes one metrics file per split under results/.
#
# Usage: sh evaluate.sh <preprocessing-dataset> <use-context>
#   <preprocessing-dataset>  lead | oracle | extractive-model
#   <use-context>            "true" selects the "context" directories; any
#                            other value selects "no-context"

# Directory containing this script. "$0" (rather than BASH_SOURCE) keeps the
# script usable with any POSIX sh, which is how the usage text invokes it.
expt_dir="$( cd "$( dirname "$0" )" >/dev/null && pwd )"

if [ "$#" -ne 2 ]; then
  echo "Usage: sh evaluate.sh <preprocessing-dataset> <use-context>" >&2
  exit 1
fi

preprocessing_dataset=$1
use_context=$2

# Location of the gold data produced by the chosen extractive preprocessing.
case "${preprocessing_dataset}" in
  lead)
    preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
    ;;
  oracle)
    preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
    ;;
  extractive-model)
    preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
    ;;
  *)
    echo "Invalid preprocessing dataset: ${preprocessing_dataset}" >&2
    exit 1
    ;;
esac

if [ "${use_context}" = "true" ]; then
  context_dir="context"
else
  context_dir="no-context"
fi

output_dir="${expt_dir}/output/${preprocessing_dataset}/${context_dir}"
results_dir="${expt_dir}/results/${preprocessing_dataset}/${context_dir}"
mkdir -p "${results_dir}"

for split in valid test; do
  python -m summarize.metrics.rouge \
    "${preprocess_dir}/${split}.jsonl.gz" \
    "${output_dir}/${split}.jsonl" \
    --gold-summary-field-name cloze \
    --model-summary-field-name cloze \
    --add-gold-wrapping-list \
    --add-model-wrapping-list \
    --compute-rouge-l \
    --silent \
    --output-file "${results_dir}/${split}.metrics.json"
done
43 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/predict.sh:
--------------------------------------------------------------------------------
# Runs `allennlp predict` with the trained model on the valid and test splits,
# then runs `allennlp evaluate` on the test split.
#
# Usage: sh predict.sh <preprocessing-dataset> <use-context>
#   <preprocessing-dataset>  lead | oracle | extractive-model
#   <use-context>            "true" selects the "context" directories; any
#                            other value selects "no-context"

# Directory containing this script. "$0" (rather than BASH_SOURCE) keeps the
# script usable with any POSIX sh, which is how the usage text invokes it.
expt_dir="$( cd "$( dirname "$0" )" >/dev/null && pwd )"

if [ "$#" -ne 2 ]; then
  echo "Usage: sh predict.sh <preprocessing-dataset> <use-context>" >&2
  exit 1
fi

preprocessing_dataset=$1
use_context=$2

# Location of the input data produced by the chosen extractive preprocessing.
case "${preprocessing_dataset}" in
  lead)
    preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
    ;;
  oracle)
    preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
    ;;
  extractive-model)
    preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
    ;;
  *)
    echo "Invalid preprocessing dataset: ${preprocessing_dataset}" >&2
    exit 1
    ;;
esac

if [ "${use_context}" = "true" ]; then
  context_dir="context"
else
  context_dir="no-context"
fi

model_dir="${expt_dir}/model/${preprocessing_dataset}/${context_dir}"
model_file="${model_dir}/model.tar.gz"
output_dir="${expt_dir}/output/${preprocessing_dataset}/${context_dir}"
results_dir="${expt_dir}/results/${preprocessing_dataset}/${context_dir}"
mkdir -p "${output_dir}" "${results_dir}"

for split in valid test; do
  allennlp predict \
    --include-package summarize \
    --output-file "${output_dir}/${split}.jsonl" \
    --predictor cloze-abstractive-predictor \
    --silent \
    --use-dataset-reader \
    --cuda-device 0 \
    --batch-size 16 \
    "${model_file}" \
    "${preprocess_dir}/${split}.jsonl.gz"
done

# The overrides set the beam size to 1 and clear the validation iterator's
# instances_per_epoch setting for this evaluation run.
allennlp evaluate \
  --include-package summarize \
  --output-file "${results_dir}/test.evaluate.metrics.json" \
  --cuda-device 0 \
  --overrides '{"validation_iterator.instances_per_epoch": null, "model.beam_search.beam_size": 1}' \
  "${model_file}" \
  "${preprocess_dir}/test.jsonl.gz"
53 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/run.sh:
--------------------------------------------------------------------------------
# Runs the train, predict, and evaluate scripts in this directory in order,
# forwarding both arguments to each. Stops at the first stage that fails.
#
# Usage: sh run.sh <preprocessing-dataset> <use-context>

# Directory containing this script. "$0" (rather than BASH_SOURCE) keeps the
# script usable with any POSIX sh, which is how the usage text invokes it.
expt_dir="$( cd "$( dirname "$0" )" >/dev/null && pwd )"

if [ "$#" -ne 2 ]; then
  echo "Usage: sh run.sh <preprocessing-dataset> <use-context>" >&2
  exit 1
fi

# A later stage is meaningless without the previous stage's output, so
# propagate the failing stage's exit status instead of carrying on.
sh "${expt_dir}/train.sh" "$1" "$2" || exit $?
sh "${expt_dir}/predict.sh" "$1" "$2" || exit $?
sh "${expt_dir}/evaluate.sh" "$1" "$2" || exit $?
11 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/coverage/train.sh:
--------------------------------------------------------------------------------
# Trains the model described by model.jsonnet with `allennlp train`, using the
# pointer-generator model directory for the same setting as PRETRAINED_DIR.
#
# Usage: sh train.sh <preprocessing-dataset> <use-context>
#   <preprocessing-dataset>  lead | oracle | extractive-model
#   <use-context>            "true" selects the "context" directories; any
#                            other value selects "no-context"

# Directory containing this script. "$0" (rather than BASH_SOURCE) keeps the
# script usable with any POSIX sh, which is how the usage text invokes it.
expt_dir="$( cd "$( dirname "$0" )" >/dev/null && pwd )"

if [ "$#" -ne 2 ]; then
  echo "Usage: sh train.sh <preprocessing-dataset> <use-context>" >&2
  exit 1
fi

preprocessing_dataset=$1
use_context=$2

# Location of the training data produced by the chosen extractive preprocessing.
case "${preprocessing_dataset}" in
  lead)
    preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
    ;;
  oracle)
    preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
    ;;
  extractive-model)
    preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
    ;;
  *)
    echo "Invalid preprocessing dataset: ${preprocessing_dataset}" >&2
    exit 1
    ;;
esac

if [ "${use_context}" = "true" ]; then
  context_dir="context"
else
  context_dir="no-context"
fi

model_dir="${expt_dir}/model/${preprocessing_dataset}/${context_dir}"
pretrained_dir="${expt_dir}/../pointer-generator/model/${preprocessing_dataset}/${context_dir}"
model_config="${expt_dir}/model.jsonnet"

# Ask before deleting an existing model directory, and keep asking until the
# answer is recognisable. Give up if no answer can be read (e.g. stdin closed).
if [ -d "${model_dir}" ]; then
  while true; do
    printf 'remove directory %s? [y/n] ' "${model_dir}" >&2
    read yn || exit 1
    case $yn in
      [Yy]* ) rm -rf "${model_dir}"; break ;;
      [Nn]* ) break ;;
      * ) echo "Please answer yes or no." ;;
    esac
  done
fi

# Exported for the training run (presumably read by model.jsonnet -- confirm).
export DATA_DIR="${preprocess_dir}"
export USE_CONTEXT="${use_context}"
export PRETRAINED_DIR="${pretrained_dir}"
allennlp train \
  --include-package summarize \
  --serialization-dir "${model_dir}" \
  "${model_config}"
47 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/evaluate.sh:
--------------------------------------------------------------------------------
# Computes ROUGE for the predictions under output/ on the valid and test
# splits and writes one metrics file per split under results/.
#
# Usage: sh evaluate.sh <preprocessing-dataset> <use-context>
#   <preprocessing-dataset>  lead | oracle | extractive-model
#   <use-context>            "true" selects the "context" directories; any
#                            other value selects "no-context"

# Directory containing this script. "$0" (rather than BASH_SOURCE) keeps the
# script usable with any POSIX sh, which is how the usage text invokes it.
expt_dir="$( cd "$( dirname "$0" )" >/dev/null && pwd )"

if [ "$#" -ne 2 ]; then
  echo "Usage: sh evaluate.sh <preprocessing-dataset> <use-context>" >&2
  exit 1
fi

preprocessing_dataset=$1
use_context=$2

# Location of the gold data produced by the chosen extractive preprocessing.
case "${preprocessing_dataset}" in
  lead)
    preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
    ;;
  oracle)
    preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
    ;;
  extractive-model)
    preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
    ;;
  *)
    echo "Invalid preprocessing dataset: ${preprocessing_dataset}" >&2
    exit 1
    ;;
esac

if [ "${use_context}" = "true" ]; then
  context_dir="context"
else
  context_dir="no-context"
fi

output_dir="${expt_dir}/output/${preprocessing_dataset}/${context_dir}"
results_dir="${expt_dir}/results/${preprocessing_dataset}/${context_dir}"
mkdir -p "${results_dir}"

for split in valid test; do
  python -m summarize.metrics.rouge \
    "${preprocess_dir}/${split}.jsonl.gz" \
    "${output_dir}/${split}.jsonl" \
    --gold-summary-field-name cloze \
    --model-summary-field-name cloze \
    --add-gold-wrapping-list \
    --add-model-wrapping-list \
    --compute-rouge-l \
    --silent \
    --output-file "${results_dir}/${split}.metrics.json"
done
43 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/predict.sh:
--------------------------------------------------------------------------------
# Runs `allennlp predict` with the trained model on the valid and test splits
# and writes the predictions under output/.
#
# Usage: sh predict.sh <preprocessing-dataset> <use-context>
#   <preprocessing-dataset>  lead | oracle | extractive-model
#   <use-context>            "true" selects the "context" directories; any
#                            other value selects "no-context"

# Directory containing this script. "$0" (rather than BASH_SOURCE) keeps the
# script usable with any POSIX sh, which is how the usage text invokes it.
expt_dir="$( cd "$( dirname "$0" )" >/dev/null && pwd )"

if [ "$#" -ne 2 ]; then
  echo "Usage: sh predict.sh <preprocessing-dataset> <use-context>" >&2
  exit 1
fi

preprocessing_dataset=$1
use_context=$2

# Location of the input data produced by the chosen extractive preprocessing.
case "${preprocessing_dataset}" in
  lead)
    preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
    ;;
  oracle)
    preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
    ;;
  extractive-model)
    preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
    ;;
  *)
    echo "Invalid preprocessing dataset: ${preprocessing_dataset}" >&2
    exit 1
    ;;
esac

if [ "${use_context}" = "true" ]; then
  context_dir="context"
else
  context_dir="no-context"
fi

model_dir="${expt_dir}/model/${preprocessing_dataset}/${context_dir}"
model_file="${model_dir}/model.tar.gz"
output_dir="${expt_dir}/output/${preprocessing_dataset}/${context_dir}"
mkdir -p "${output_dir}"

for split in valid test; do
  allennlp predict \
    --include-package summarize \
    --output-file "${output_dir}/${split}.jsonl" \
    --predictor cloze-abstractive-predictor \
    --silent \
    --use-dataset-reader \
    --cuda-device 0 \
    --batch-size 16 \
    "${model_file}" \
    "${preprocess_dir}/${split}.jsonl.gz"
done
43 | done
44 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/run.sh:
--------------------------------------------------------------------------------
# Runs the train, predict, and evaluate scripts in this directory in order,
# forwarding both arguments to each. Stops at the first stage that fails.
#
# Usage: sh run.sh <preprocessing-dataset> <use-context>

# Directory containing this script. "$0" (rather than BASH_SOURCE) keeps the
# script usable with any POSIX sh, which is how the usage text invokes it.
expt_dir="$( cd "$( dirname "$0" )" >/dev/null && pwd )"

if [ "$#" -ne 2 ]; then
  echo "Usage: sh run.sh <preprocessing-dataset> <use-context>" >&2
  exit 1
fi

# A later stage is meaningless without the previous stage's output, so
# propagate the failing stage's exit status instead of carrying on.
sh "${expt_dir}/train.sh" "$1" "$2" || exit $?
sh "${expt_dir}/predict.sh" "$1" "$2" || exit $?
sh "${expt_dir}/evaluate.sh" "$1" "$2" || exit $?
11 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/abstractive-step/pointer-generator/train.sh:
--------------------------------------------------------------------------------
# Trains the model described by model.jsonnet with `allennlp train`.
#
# Usage: sh train.sh <preprocessing-dataset> <use-context>
#   <preprocessing-dataset>  lead | oracle | extractive-model
#   <use-context>            "true" selects the "context" directories; any
#                            other value selects "no-context"

# Directory containing this script. "$0" (rather than BASH_SOURCE) keeps the
# script usable with any POSIX sh, which is how the usage text invokes it.
expt_dir="$( cd "$( dirname "$0" )" >/dev/null && pwd )"

if [ "$#" -ne 2 ]; then
  echo "Usage: sh train.sh <preprocessing-dataset> <use-context>" >&2
  exit 1
fi

preprocessing_dataset=$1
use_context=$2

# Location of the training data produced by the chosen extractive preprocessing.
case "${preprocessing_dataset}" in
  lead)
    preprocess_dir="${expt_dir}/../../extractive-step/lead/preprocessed"
    ;;
  oracle)
    preprocess_dir="${expt_dir}/../../extractive-step/oracle/preprocessed"
    ;;
  extractive-model)
    preprocess_dir="${expt_dir}/../../extractive-step/extractive-model/preprocessed/topics/context"
    ;;
  *)
    echo "Invalid preprocessing dataset: ${preprocessing_dataset}" >&2
    exit 1
    ;;
esac

if [ "${use_context}" = "true" ]; then
  context_dir="context"
else
  context_dir="no-context"
fi

model_dir="${expt_dir}/model/${preprocessing_dataset}/${context_dir}"
model_config="${expt_dir}/model.jsonnet"

# Ask before deleting an existing model directory, and keep asking until the
# answer is recognisable. Give up if no answer can be read (e.g. stdin closed).
if [ -d "${model_dir}" ]; then
  while true; do
    printf 'remove directory %s? [y/n] ' "${model_dir}" >&2
    read yn || exit 1
    case $yn in
      [Yy]* ) rm -rf "${model_dir}"; break ;;
      [Nn]* ) break ;;
      * ) echo "Please answer yes or no." ;;
    esac
  done
fi

# Exported for the training run (presumably read by model.jsonnet -- confirm).
export DATA_DIR="${preprocess_dir}"
export USE_CONTEXT="${use_context}"
allennlp train \
  --include-package summarize \
  --serialization-dir "${model_dir}" \
  "${model_config}"
45 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/baselines/lead/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | results
3 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/baselines/lead/run.sh:
--------------------------------------------------------------------------------
1 | # Runs the lead cloze baseline (--max-sentences 1) on the valid and test
2 | # splits and scores each output with ROUGE.
3 | # Predictions go to output/, metrics to results/, both next to this script.
4 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
5 | output_dir="${expt_dir}/output"
6 | results_dir="${expt_dir}/results"
7 | mkdir -p "${output_dir}" "${results_dir}"
8 |
9 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
10 |
11 | for split in valid test; do
12 |   gold_file="${data_url}/${split}.v1.1.jsonl.gz"
13 |   model_file="${output_dir}/${split}.jsonl"
14 |
15 |   python -m summarize.models.cloze.lead \
16 |     "${gold_file}" \
17 |     "${model_file}" \
18 |     --max-sentences 1
19 |
20 |   python -m summarize.metrics.rouge \
21 |     "${gold_file}" \
22 |     "${model_file}" \
23 |     --gold-summary-field-name cloze \
24 |     --model-summary-field-name cloze \
25 |     --add-gold-wrapping-list \
26 |     --add-model-wrapping-list \
27 |     --compute-rouge-l \
28 |     --silent \
29 |     --output-file "${results_dir}/${split}.metrics.json"
30 | done
31 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/baselines/open-ai/.gitignore:
--------------------------------------------------------------------------------
1 | models
2 | output
3 | results
4 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/baselines/open-ai/Readme.md:
--------------------------------------------------------------------------------
1 | # OpenAI Language Model
2 | The OpenAI Language Model ([Radford et al., 2019](https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf)) serves as a baseline for the summary cloze task.
3 | The language model conditions on the context of the summary and generates the next sentence.
4 | The cited document is not used at all.
5 | The purpose of the experiment is to measure how well a system can do without access to the reference text.
6 |
7 | ## Setup
8 | Before using the OpenAI language model, you first need to download the model by running the setup script from the root of the repository:
9 | ```
10 | sh experiments/deutsch2019/baselines/open-ai/setup.sh
11 | ```
12 | For more documentation on the model and its parameters, see the official [Github repository](https://github.com/openai/gpt-2).
13 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/baselines/open-ai/run.sh:
--------------------------------------------------------------------------------
1 | # Runs the OpenAI language model baseline on the valid and test splits and
2 | # scores each output with ROUGE.
3 | # The model is loaded from models/<model> next to this script, so setup.sh
4 | # must have been run first.
5 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
6 | output_dir="${expt_dir}/output"
7 | results_dir="${expt_dir}/results"
8 | mkdir -p "${output_dir}" "${results_dir}"
9 |
10 | model="345M"
11 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
12 |
13 | for split in valid test; do
14 |   gold_file="${data_url}/${split}.v1.1.jsonl.gz"
15 |   model_file="${output_dir}/${split}.jsonl"
16 |
17 |   # NOTE(review): the trailing positional arguments (1 and 40) are passed
18 |   # through unchanged; see summarize.models.cloze.open_ai_language_model for
19 |   # their meaning.
20 |   python -m summarize.models.cloze.open_ai_language_model \
21 |     "${expt_dir}/models/${model}" \
22 |     "${gold_file}" \
23 |     "${model_file}" \
24 |     1 \
25 |     40
26 |
27 |   python -m summarize.metrics.rouge \
28 |     "${gold_file}" \
29 |     "${model_file}" \
30 |     --gold-summary-field-name cloze \
31 |     --model-summary-field-name cloze \
32 |     --add-gold-wrapping-list \
33 |     --add-model-wrapping-list \
34 |     --silent \
35 |     --output-file "${results_dir}/${split}.metrics.json"
36 | done
37 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/baselines/open-ai/setup.sh:
--------------------------------------------------------------------------------
1 | cwd=$(pwd)
2 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
3 |
4 | pushd ${expt_dir}
5 | python ${cwd}/external/gpt-2/download_model.py 345M
6 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/baselines/oracle/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | results
3 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/baselines/oracle/run.sh:
--------------------------------------------------------------------------------
1 | # Runs the oracle cloze baseline once per target metric (R1-F1, R2-F1, RL-F1)
2 | # on the valid and test splits and scores each output with ROUGE.
3 | # Predictions go to output/, metrics to results/, both next to this script.
4 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
5 | output_dir="${expt_dir}/output"
6 | results_dir="${expt_dir}/results"
7 | mkdir -p "${output_dir}" "${results_dir}"
8 |
9 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
10 |
11 | for split in valid test; do
12 |   gold_file="${data_url}/${split}.v1.1.jsonl.gz"
13 |
14 |   for metric in "R1-F1" "R2-F1" "RL-F1"; do
15 |     model_file="${output_dir}/${split}.${metric}.jsonl"
16 |
17 |     python -m summarize.models.cloze.oracle \
18 |       "${gold_file}" \
19 |       "${model_file}" \
20 |       "${metric}" \
21 |       --max-sentences 1 \
22 |       --cloze-only
23 |
24 |     python -m summarize.metrics.rouge \
25 |       "${gold_file}" \
26 |       "${model_file}" \
27 |       --gold-summary-field-name cloze \
28 |       --model-summary-field-name cloze \
29 |       --add-gold-wrapping-list \
30 |       --add-model-wrapping-list \
31 |       --compute-rouge-l \
32 |       --silent \
33 |       --output-file "${results_dir}/${split}.${metric}.metrics.json"
34 |   done
35 | done
36 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/Readme.md:
--------------------------------------------------------------------------------
1 | # Extractive Step
2 | This directory contains the scripts to train the extractive models.
3 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/bm25/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | results
3 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/bm25/calculate-df.sh:
--------------------------------------------------------------------------------
1 | # Runs summarize.models.cloze.bm25.calculate_df over the training split and
2 | # writes the result to output/df.jsonl.gz next to this script (the file that
3 | # predict.sh passes to the BM25 model).
4 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
5 | output_dir="${expt_dir}/output"
6 | mkdir -p "${output_dir}"
7 |
8 | python -m summarize.models.cloze.bm25.calculate_df \
9 |   https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/train.v1.1.jsonl.gz \
10 |   "${output_dir}/df.jsonl.gz"
11 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/bm25/evaluate.sh:
--------------------------------------------------------------------------------
1 | # Scores the BM25 predictions written by predict.sh with ROUGE, for both the
2 | # max-words and the max-sents outputs of the valid and test splits.
3 | # Metrics are written to results/ next to this script.
4 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
5 | output_dir="${expt_dir}/output"
6 | results_dir="${expt_dir}/results"
7 | mkdir -p "${results_dir}"
8 |
9 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
10 |
11 | for split in valid test; do
12 |   gold_file="${data_url}/${split}.v1.1.jsonl.gz"
13 |
14 |   # Word-budget output: no model wrapping list, evaluation capped at 200 words.
15 |   python -m summarize.metrics.rouge \
16 |     "${gold_file}" \
17 |     "${output_dir}/${split}.max-words.jsonl" \
18 |     --gold-summary-field-name cloze \
19 |     --model-summary-field-name cloze \
20 |     --add-gold-wrapping-list \
21 |     --compute-rouge-l \
22 |     --silent \
23 |     --max-words 200 \
24 |     --output-file "${results_dir}/${split}.max-words.metrics.json"
25 |
26 |   # Sentence-budget output (produced with --flatten): model wrapping list added.
27 |   python -m summarize.metrics.rouge \
28 |     "${gold_file}" \
29 |     "${output_dir}/${split}.max-sents.jsonl" \
30 |     --gold-summary-field-name cloze \
31 |     --model-summary-field-name cloze \
32 |     --add-gold-wrapping-list \
33 |     --add-model-wrapping-list \
34 |     --compute-rouge-l \
35 |     --silent \
36 |     --output-file "${results_dir}/${split}.max-sents.metrics.json"
37 | done
38 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/bm25/predict.sh:
--------------------------------------------------------------------------------
1 | # Runs the BM25 model on the valid and test splits twice: once with a word
2 | # budget and once with a sentence budget (flattened output).
3 | # Requires output/df.jsonl.gz, which calculate-df.sh writes.
4 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
5 | output_dir="${expt_dir}/output"
6 | mkdir -p "${output_dir}"
7 |
8 | max_words=200
9 | max_sents=1
10 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
11 | df_file="${output_dir}/df.jsonl.gz"
12 |
13 | for split in valid test; do
14 |   input_file="${data_url}/${split}.v1.1.jsonl.gz"
15 |
16 |   python -m summarize.models.cloze.bm25.bm25 \
17 |     "${input_file}" \
18 |     "${df_file}" \
19 |     "${output_dir}/${split}.max-words.jsonl" \
20 |     --max-words "${max_words}"
21 |
22 |   python -m summarize.models.cloze.bm25.bm25 \
23 |     "${input_file}" \
24 |     "${df_file}" \
25 |     "${output_dir}/${split}.max-sents.jsonl" \
26 |     --max-sentences "${max_sents}" \
27 |     --flatten
28 | done
29 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/extractive-model/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 | preprocessed
5 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/extractive-model/evaluate.sh:
--------------------------------------------------------------------------------
1 | # Scores the extractive model's predictions (written by predict.sh) with ROUGE
2 | # on the valid and test splits.
3 | #
4 | # Arguments:
5 | #   $1  "true" to evaluate the topics variant; any other value selects no-topics
6 | #   $2  "true" to evaluate the context variant; any other value selects no-context
7 | if [ "$#" -ne 2 ]; then
8 |   echo "Usage: sh evaluate.sh <use-topics> <use-context>" >&2
9 |   exit 1
10 | fi
11 |
12 | use_topics=$1
13 | use_context=$2
14 | if [ "${use_topics}" = "true" ]; then
15 |   topics_dir="topics"
16 | else
17 |   topics_dir="no-topics"
18 | fi
19 | if [ "${use_context}" = "true" ]; then
20 |   context_dir="context"
21 | else
22 |   context_dir="no-context"
23 | fi
24 |
25 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
26 | output_dir="${expt_dir}/output/${topics_dir}/${context_dir}"
27 | results_dir="${expt_dir}/results/${topics_dir}/${context_dir}"
28 | mkdir -p "${results_dir}"
29 |
30 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
31 |
32 | for split in valid test; do
33 |   gold_file="${data_url}/${split}.v1.1.jsonl.gz"
34 |
35 |   # Token-budget predictions, evaluated with a 200-word cap.
36 |   python -m summarize.metrics.rouge \
37 |     "${gold_file}" \
38 |     "${output_dir}/${split}.max-tokens.jsonl" \
39 |     --gold-summary-field-name cloze \
40 |     --model-summary-field-name cloze \
41 |     --add-gold-wrapping-list \
42 |     --compute-rouge-l \
43 |     --silent \
44 |     --max-words 200 \
45 |     --output-file "${results_dir}/${split}.max-tokens.metrics.json"
46 |
47 |   # Sentence-budget predictions, evaluated without a word cap.
48 |   python -m summarize.metrics.rouge \
49 |     "${gold_file}" \
50 |     "${output_dir}/${split}.max-sents.jsonl" \
51 |     --gold-summary-field-name cloze \
52 |     --model-summary-field-name cloze \
53 |     --add-gold-wrapping-list \
54 |     --compute-rouge-l \
55 |     --silent \
56 |     --output-file "${results_dir}/${split}.max-sents.metrics.json"
57 | done
58 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/extractive-model/predict.sh:
--------------------------------------------------------------------------------
1 | # Runs the trained extractive model on the valid and test splits, once with
2 | # the model's configured limits (max-tokens output) and once overridden to
3 | # select a single sentence (max-sents output).
4 | #
5 | # Arguments:
6 | #   $1  "true" to use the topics model; any other value selects no-topics
7 | #   $2  "true" to use the context model; any other value selects no-context
8 | if [ "$#" -ne 2 ]; then
9 |   echo "Usage: sh predict.sh <use-topics> <use-context>" >&2
10 |   exit 1
11 | fi
12 |
13 | use_topics=$1
14 | use_context=$2
15 | if [ "${use_topics}" = "true" ]; then
16 |   topics_dir="topics"
17 | else
18 |   topics_dir="no-topics"
19 | fi
20 | if [ "${use_context}" = "true" ]; then
21 |   context_dir="context"
22 | else
23 |   context_dir="no-context"
24 | fi
25 |
26 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
27 | model_file="${expt_dir}/model/${topics_dir}/${context_dir}/model.tar.gz"
28 | output_dir="${expt_dir}/output/${topics_dir}/${context_dir}"
29 | mkdir -p "${output_dir}"
30 |
31 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
32 |
33 | for split in valid test; do
34 |   input_file="${data_url}/${split}.v1.1.jsonl.gz"
35 |
36 |   # Only the reader's sentence limit is lifted; the model limits are untouched.
37 |   allennlp predict \
38 |     --include-package summarize \
39 |     --predictor cloze-extractive-predictor \
40 |     --output-file "${output_dir}/${split}.max-tokens.jsonl" \
41 |     --cuda-device 0 \
42 |     --batch-size 1 \
43 |     --silent \
44 |     --use-dataset-reader \
45 |     --overrides '{"dataset_reader.max_num_sentences": null}' \
46 |     "${model_file}" \
47 |     "${input_file}"
48 |
49 |   # Word limit disabled and the model restricted to one sentence.
50 |   allennlp predict \
51 |     --include-package summarize \
52 |     --predictor cloze-extractive-predictor \
53 |     --output-file "${output_dir}/${split}.max-sents.jsonl" \
54 |     --cuda-device 0 \
55 |     --batch-size 1 \
56 |     --silent \
57 |     --use-dataset-reader \
58 |     --overrides '{"dataset_reader.max_num_sentences": null, "model.max_words": null, "model.max_sents": 1}' \
59 |     "${model_file}" \
60 |     "${input_file}"
61 | done
62 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/extractive-model/preprocess.sh:
--------------------------------------------------------------------------------
1 | # Runs the trained extractive model over the train, valid and test splits and
2 | # writes one preprocessed file per split under
3 | # preprocessed/<topics|no-topics>/<context|no-context> next to this script.
4 | #
5 | # Arguments:
6 | #   $1  "true" to use the topics model; any other value selects no-topics
7 | #   $2  "true" to use the context model; any other value selects no-context
8 | if [ "$#" -ne 2 ]; then
9 |   echo "Usage: sh preprocess.sh <use-topics> <use-context>" >&2
10 |   exit 1
11 | fi
12 |
13 | use_topics=$1
14 | use_context=$2
15 | if [ "${use_topics}" = "true" ]; then
16 |   topics_dir="topics"
17 | else
18 |   topics_dir="no-topics"
19 | fi
20 | if [ "${use_context}" = "true" ]; then
21 |   context_dir="context"
22 | else
23 |   context_dir="no-context"
24 | fi
25 |
26 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
27 | model_file="${expt_dir}/model/${topics_dir}/${context_dir}/model.tar.gz"
28 | preprocess_dir="${expt_dir}/preprocessed/${topics_dir}/${context_dir}"
29 | mkdir -p "${preprocess_dir}"
30 |
31 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
32 |
33 | for split in train valid test; do
34 |   input_file="${data_url}/${split}.v1.1.jsonl.gz"
35 |   temp_file=$(mktemp)
36 |
37 |   allennlp predict \
38 |     --include-package summarize \
39 |     --predictor cloze-extractive-predictor \
40 |     --output-file "${temp_file}" \
41 |     --cuda-device 0 \
42 |     --batch-size 1 \
43 |     --silent \
44 |     --use-dataset-reader \
45 |     --overrides '{"dataset_reader.max_num_sentences": null}' \
46 |     "${model_file}" \
47 |     "${input_file}"
48 |
49 |   # Presumably copies the predicted `cloze` field into the `document` field of
50 |   # the original instances -- confirm against summarize.utils.copy_jsonl_fields.
51 |   python -m summarize.utils.copy_jsonl_fields \
52 |     "${temp_file}" \
53 |     "${input_file}" \
54 |     "${preprocess_dir}/${split}.jsonl.gz" \
55 |     --field-names cloze document
56 |
57 |   rm "${temp_file}"
58 | done
59 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/extractive-model/train.sh:
--------------------------------------------------------------------------------
1 | # Trains the extractive model with AllenNLP.
2 | #
3 | # Arguments:
4 | #   $1  "true" to train with topics; any other value trains without them
5 | #   $2  "true" to train with the context; any other value trains without it
6 | #
7 | # The model is serialized to model/<topics|no-topics>/<context|no-context>
8 | # next to this script.
9 | if [ "$#" -ne 2 ]; then
10 |   echo "Usage: sh train.sh <use-topics> <use-context>" >&2
11 |   exit 1
12 | fi
13 |
14 | use_topics=$1
15 | use_context=$2
16 | if [ "${use_topics}" = "true" ]; then
17 |   topics_dir="topics"
18 | else
19 |   topics_dir="no-topics"
20 | fi
21 | if [ "${use_context}" = "true" ]; then
22 |   context_dir="context"
23 | else
24 |   context_dir="no-context"
25 | fi
26 |
27 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
28 | model_dir="${expt_dir}/model/${topics_dir}/${context_dir}"
29 | model_config="${expt_dir}/model.jsonnet"
30 |
31 | if [ -d "${model_dir}" ]; then
32 |   # Keep asking until the answer is recognizable; give up if stdin is closed.
33 |   while read -r -p "remove directory ${model_dir}? [y/n] " yn; do
34 |     case "${yn}" in
35 |       [Yy]* ) rm -rf "${model_dir}"; break ;;
36 |       [Nn]* ) break ;;
37 |       * ) echo "Please answer yes or no." ;;
38 |     esac
39 |   done
40 | fi
41 |
42 | # Exported for the training run (presumably read by model.jsonnet -- confirm).
43 | export USE_TOPICS="${use_topics}"
44 | export USE_CONTEXT="${use_context}"
45 | allennlp train \
46 |   --include-package summarize \
47 |   --serialization-dir "${model_dir}" \
48 |   "${model_config}"
49 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/lead/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | preprocessed
3 | results
4 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/lead/Readme.md:
--------------------------------------------------------------------------------
1 | # Lead
2 | This directory contains the scripts to run the lead model preprocessing.
3 | The preprocessed data can be found here: [train](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/lead/preprocessed/train.jsonl.gz), [valid](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/lead/preprocessed/valid.jsonl.gz), [test](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/lead/preprocessed/test.jsonl.gz).
4 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/lead/preprocess.sh:
--------------------------------------------------------------------------------
1 | # Builds the lead-preprocessed dataset: runs the lead model (200-token budget,
2 | # sentences kept) on the `document` field of every split and writes one
3 | # preprocessed/<split>.jsonl.gz file next to this script.
4 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
5 | preprocess_dir="${expt_dir}/preprocessed"
6 | mkdir -p "${preprocess_dir}"
7 |
8 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
9 |
10 | for split in train valid test; do
11 |   input_file="${data_url}/${split}.v1.1.jsonl.gz"
12 |   temp_file=$(mktemp)
13 |
14 |   python -m summarize.models.cloze.lead \
15 |     "${input_file}" \
16 |     "${temp_file}" \
17 |     --max-tokens 200 \
18 |     --field-name document \
19 |     --keep-sentences
20 |
21 |   # Presumably replaces `document` in the original instances with the lead
22 |   # output -- confirm against summarize.utils.copy_jsonl_fields.
23 |   python -m summarize.utils.copy_jsonl_fields \
24 |     "${temp_file}" \
25 |     "${input_file}" \
26 |     "${preprocess_dir}/${split}.jsonl.gz" \
27 |     --field-names document document
28 |
29 |   rm "${temp_file}"
30 | done
31 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/lead/run.sh:
--------------------------------------------------------------------------------
1 | # Runs the lead model with a 200-token budget on the valid and test splits and
2 | # scores each output with ROUGE (evaluation capped at 200 words).
3 | # Predictions go to output/, metrics to results/, both next to this script.
4 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
5 | output_dir="${expt_dir}/output"
6 | results_dir="${expt_dir}/results"
7 | mkdir -p "${output_dir}" "${results_dir}"
8 |
9 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
10 |
11 | for split in valid test; do
12 |   gold_file="${data_url}/${split}.v1.1.jsonl.gz"
13 |   model_file="${output_dir}/${split}.jsonl"
14 |
15 |   python -m summarize.models.cloze.lead \
16 |     "${gold_file}" \
17 |     "${model_file}" \
18 |     --max-tokens 200
19 |
20 |   python -m summarize.metrics.rouge \
21 |     "${gold_file}" \
22 |     "${model_file}" \
23 |     --gold-summary-field-name cloze \
24 |     --model-summary-field-name cloze \
25 |     --add-gold-wrapping-list \
26 |     --add-model-wrapping-list \
27 |     --compute-rouge-l \
28 |     --silent \
29 |     --max-words 200 \
30 |     --output-file "${results_dir}/${split}.metrics.json"
31 | done
32 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/oracle/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | preprocessed
3 | results
4 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/oracle/Readme.md:
--------------------------------------------------------------------------------
1 | # Oracle
2 | This directory contains the scripts to run the oracle model preprocessing.
3 | The preprocessed data can be found here: [train](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/oracle/preprocessed/train.jsonl.gz), [valid](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/oracle/preprocessed/valid.jsonl.gz), [test](https://danieldeutsch.s3.amazonaws.com/summarize/experiments/deutsch2019/v1.1/extractive-step/oracle/preprocessed/test.jsonl.gz).
4 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/oracle/preprocess.sh:
--------------------------------------------------------------------------------
1 | # Builds the oracle-preprocessed dataset: runs
2 | # summarize.utils.extract_cloze_from_labels on the `document` field of every
3 | # split (sentences kept) and writes one preprocessed/<split>.jsonl.gz file
4 | # next to this script.
5 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
6 | preprocess_dir="${expt_dir}/preprocessed"
7 | mkdir -p "${preprocess_dir}"
8 |
9 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
10 |
11 | for split in train valid test; do
12 |   input_file="${data_url}/${split}.v1.1.jsonl.gz"
13 |   temp_file=$(mktemp)
14 |
15 |   python -m summarize.utils.extract_cloze_from_labels \
16 |     "${input_file}" \
17 |     "${temp_file}" \
18 |     --field-name document \
19 |     --keep-sentences
20 |
21 |   # Presumably replaces `document` in the original instances with the oracle
22 |   # output -- confirm against summarize.utils.copy_jsonl_fields.
23 |   python -m summarize.utils.copy_jsonl_fields \
24 |     "${temp_file}" \
25 |     "${input_file}" \
26 |     "${preprocess_dir}/${split}.jsonl.gz" \
27 |     --field-names document document
28 |
29 |   rm "${temp_file}"
30 | done
31 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/oracle/run.sh:
--------------------------------------------------------------------------------
1 | # Runs summarize.utils.extract_cloze_from_labels on the valid and test splits
2 | # and scores each output with ROUGE (evaluation capped at 200 words).
3 | # Predictions go to output/, metrics to results/, both next to this script.
4 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
5 | output_dir="${expt_dir}/output"
6 | results_dir="${expt_dir}/results"
7 | mkdir -p "${output_dir}" "${results_dir}"
8 |
9 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
10 |
11 | for split in valid test; do
12 |   gold_file="${data_url}/${split}.v1.1.jsonl.gz"
13 |   model_file="${output_dir}/${split}.jsonl"
14 |
15 |   python -m summarize.utils.extract_cloze_from_labels \
16 |     "${gold_file}" \
17 |     "${model_file}"
18 |
19 |   python -m summarize.metrics.rouge \
20 |     "${gold_file}" \
21 |     "${model_file}" \
22 |     --gold-summary-field-name cloze \
23 |     --model-summary-field-name cloze \
24 |     --add-gold-wrapping-list \
25 |     --add-model-wrapping-list \
26 |     --compute-rouge-l \
27 |     --silent \
28 |     --max-words 200 \
29 |     --output-file "${results_dir}/${split}.metrics.json"
30 | done
31 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/sumfocus/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | sweep
3 | logs
4 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/sumfocus/Readme.md:
--------------------------------------------------------------------------------
1 | # SumFocus
2 | An implementation of SumFocus from [Vanderwende et al. (2007)](https://www.cis.upenn.edu/~nenkova/papers/ipm.pdf).
3 | `run-parameter-sweep.sh` will run a parameter sweep to find the best settings of the unigram probability distribution smoothing parameter (`beta` in the code) and the interpolation parameters between the document, topic, and context (`topic_lambda` and `context_lambda` in the code) using the NLP Grid for parallelization.
4 | To analyze the results, run the python script `analyze_results.py` which will output what the best hyperparameter settings were for all variations of using and not using the topic and context.
5 |
6 | After the best hyperparameter settings are found, you can run the model on the test data to compute Rouge by running
7 | ```
8 | sh experiments/deutsch2019/extractive-step/sumfocus/run-max-words.sh \
9 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/test.v1.1.jsonl.gz \
10 | experiments/deutsch2019/extractive-step/sumfocus/output/test.max-words.jsonl \
11 | experiments/deutsch2019/extractive-step/sumfocus/output/test.max-words.metrics.jsonl \
12 | <beta> \
13 | <topic-lambda> \
14 | <context-lambda> \
15 | 200
16 |
17 | sh experiments/deutsch2019/extractive-step/sumfocus/run-max-sents.sh \
18 | https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019/test.v1.1.jsonl.gz \
19 | experiments/deutsch2019/extractive-step/sumfocus/output/test.max-sents.jsonl \
20 | experiments/deutsch2019/extractive-step/sumfocus/output/test.max-sents.metrics.jsonl \
21 | <beta> \
22 | <topic-lambda> \
23 | <context-lambda> \
24 | 1
25 | ```
26 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/sumfocus/run-max-sents.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #$ -cwd
3 | # Runs SumFocus with a sentence budget and scores the output with ROUGE.
4 | #
5 | # Arguments:
6 | #   $1  input (gold) file
7 | #   $2  output file for the model's predictions
8 | #   $3  output file for the ROUGE metrics
9 | #   $4  beta
10 | #   $5  topic lambda
11 | #   $6  context lambda
12 | #   $7  maximum number of sentences
13 | if [ "$#" -ne 7 ]; then
14 |   echo "Usage: sh run-max-sents.sh" >&2
15 |   echo "    <input-file> <output-file> <metrics-file> <beta> <topic-lambda> <context-lambda> <max-sents>" >&2
16 |   exit 1
17 | fi
18 |
19 | input_file=$1
20 | output_file=$2
21 | metrics_file=$3
22 | beta=$4
23 | topic_lambda=$5
24 | context_lambda=$6
25 | max_sents=$7
26 |
27 | mkdir -p "$(dirname "${output_file}")"
28 | python -m summarize.models.cloze.sumfocus \
29 |   "${input_file}" \
30 |   "${output_file}" \
31 |   "${beta}" \
32 |   "${topic_lambda}" \
33 |   "${context_lambda}" \
34 |   --max-sentences "${max_sents}"
35 |
36 | mkdir -p "$(dirname "${metrics_file}")"
37 | python -m summarize.metrics.rouge \
38 |   "${input_file}" \
39 |   "${output_file}" \
40 |   --gold-summary-field-name cloze \
41 |   --model-summary-field-name cloze \
42 |   --add-gold-wrapping-list \
43 |   --add-model-wrapping-list \
44 |   --compute-rouge-l \
45 |   --silent \
46 |   --output-file "${metrics_file}"
47 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/sumfocus/run-max-words.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #$ -cwd
3 | # Runs SumFocus with a word budget and scores the output with ROUGE.
4 | #
5 | # Arguments:
6 | #   $1  input (gold) file
7 | #   $2  output file for the model's predictions
8 | #   $3  output file for the ROUGE metrics
9 | #   $4  beta
10 | #   $5  topic lambda
11 | #   $6  context lambda
12 | #   $7  maximum number of words
13 | if [ "$#" -ne 7 ]; then
14 |   echo "Usage: sh run-max-words.sh" >&2
15 |   echo "    <input-file> <output-file> <metrics-file> <beta> <topic-lambda> <context-lambda> <max-words>" >&2
16 |   exit 1
17 | fi
18 |
19 | input_file=$1
20 | output_file=$2
21 | metrics_file=$3
22 | beta=$4
23 | topic_lambda=$5
24 | context_lambda=$6
25 | max_words=$7
26 |
27 | mkdir -p "$(dirname "${output_file}")"
28 | python -m summarize.models.cloze.sumfocus \
29 |   "${input_file}" \
30 |   "${output_file}" \
31 |   "${beta}" \
32 |   "${topic_lambda}" \
33 |   "${context_lambda}" \
34 |   --max-words "${max_words}"
35 |
36 | # NOTE(review): the ROUGE word cap is fixed at 200 rather than ${max_words};
37 | # the two agree only when the script is called with a 200-word budget.
38 | mkdir -p "$(dirname "${metrics_file}")"
39 | python -m summarize.metrics.rouge \
40 |   "${input_file}" \
41 |   "${output_file}" \
42 |   --gold-summary-field-name cloze \
43 |   --model-summary-field-name cloze \
44 |   --add-gold-wrapping-list \
45 |   --add-model-wrapping-list \
46 |   --compute-rouge-l \
47 |   --silent \
48 |   --max-words 200 \
49 |   --output-file "${metrics_file}"
50 |
--------------------------------------------------------------------------------
/experiments/deutsch2019/extractive-step/sumfocus/run-parameter-sweep.sh:
--------------------------------------------------------------------------------
1 | # Submits one qsub job per (beta, topic_lambda, context_lambda) combination and
2 | # per budget type (max-words and max-sents) on the validation split.
3 | # Model outputs and metrics go to sweep/, job stdout/stderr to logs/, both next
4 | # to this script.
5 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
6 | max_words_output_dir="${expt_dir}/sweep/max-words"
7 | max_sents_output_dir="${expt_dir}/sweep/max-sents"
8 | log_dir="${expt_dir}/logs"
9 | mkdir -p "${max_words_output_dir}" "${max_sents_output_dir}" "${log_dir}"
10 |
11 | max_words=200
12 | max_num_sents=1
13 | data_url="https://danieldeutsch.s3.amazonaws.com/summarize/data/deutsch2019"
14 |
15 | for beta in 0.1 0.5 1.0 2.0; do
16 |   for topic_lambda in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0; do
17 |     for context_lambda in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0; do
18 |       for split in valid; do
19 |         name="beta_${beta}.topic-lambda-${topic_lambda}.context-lambda-${context_lambda}"
20 |         gold_file="${data_url}/${split}.v1.1.jsonl.gz"
21 |
22 |         # Word-budget job.
23 |         model_file="${max_words_output_dir}/${split}.${name}.jsonl"
24 |         metrics_file="${max_words_output_dir}/${split}.${name}.metrics.json"
25 |         stdout="${log_dir}/${name}-words.stdout"
26 |         stderr="${log_dir}/${name}-words.stderr"
27 |         qsub -N "${name}" -o "${stdout}" -e "${stderr}" \
28 |           "${expt_dir}/run-max-words.sh" "${gold_file}" "${model_file}" "${metrics_file}" "${beta}" "${topic_lambda}" "${context_lambda}" "${max_words}"
29 |
30 |         # Sentence-budget job.
31 |         model_file="${max_sents_output_dir}/${split}.${name}.jsonl"
32 |         metrics_file="${max_sents_output_dir}/${split}.${name}.metrics.json"
33 |         stdout="${log_dir}/${name}-sents.stdout"
34 |         stderr="${log_dir}/${name}-sents.stderr"
35 |         qsub -N "${name}" -o "${stdout}" -e "${stderr}" \
36 |           "${expt_dir}/run-max-sents.sh" "${gold_file}" "${model_file}" "${metrics_file}" "${beta}" "${topic_lambda}" "${context_lambda}" "${max_num_sents}"
37 |       done
38 |     done
39 |   done
40 | done
41 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/Readme.md:
--------------------------------------------------------------------------------
1 | # Kedzie 2018
2 | This is a partial reimplementation of [Content Selection in Deep Learning Models of Summarization](https://arxiv.org/abs/1810.12343) by Kedzie et al. (2018).
3 |
4 | ## Instructions
5 | First, prepare the necessary data under `data/kedzie2018`.
6 | Then, each directory of the experiment corresponds to a different dataset and model with its own script to train, predict, and evaluate.
7 |
8 | ## Results
9 | Below are the reproduction results for the CNN/DailyMail dataset.
10 |
11 | | Extractor | Encoder | R2-Recall (Reported) | R2-Recall (Reproduced) | Saved Model |
12 | |-----------|---------|----------------------|------------------------|-------------|
13 | | Lead      | -       | 24.4                 | 24.4                   | -           |
14 | | RNN       | Avg     | 25.4                 | 25.5                   | Link        |
15 | | RNN       | RNN     | 25.4                 | 25.4                   | Link        |
16 | | RNN       | CNN     | 25.1                 | -                      | -           |
17 | | Oracle    | -       | 36.2                 | 37.3                   | -           |
18 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/evaluate.sh:
--------------------------------------------------------------------------------
1 | # Scores the extractive model's predictions (written by predict.sh) with ROUGE
2 | # on the valid and test splits of CNN/DailyMail.
3 | #
4 | # Arguments:
5 | #   $1  encoder name (selects output/<encoder>/<extractor>)
6 | #   $2  extractor name
7 | if [ "$#" -ne 2 ]; then
8 |   echo "Usage: sh evaluate.sh <encoder> <extractor>" >&2
9 |   exit 1
10 | fi
11 |
12 | encoder=$1
13 | extractor=$2
14 |
15 | expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
16 | output_dir="${expt_dir}/output/${encoder}/${extractor}"
17 | results_dir="${expt_dir}/results/${encoder}/${extractor}"
18 |
19 | mkdir -p "${results_dir}"
20 |
21 | data_url="https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail"
22 |
23 | for split in valid test; do
24 |   python -m summarize.metrics.rouge \
25 |     "${data_url}/${split}.v1.0.jsonl.gz" \
26 |     "${output_dir}/${split}.jsonl" \
27 |     --silent \
28 |     --max-ngram 2 \
29 |     --max-words 100 \
30 |     --remove-stopwords \
31 |     --output-file "${results_dir}/${split}.metrics.json"
32 | done
33 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/model.jsonnet:
--------------------------------------------------------------------------------
1 | // Sentence encoder variant, taken from the ENCODER external variable.
2 | // Supported values: "avg" and "rnn".
3 | local encoder = std.extVar("ENCODER");
4 |
5 | // The size of the decoder's input changes based on the encoder choice:
6 | // "avg" keeps the 200-d embedding size, "rnn" is a bidirectional GRU with
7 | // hidden size 200 (2 x 200 = 400).
8 | local decoder_input_size =
9 |   if encoder == "avg" then 200
10 |   else if encoder == "rnn" then 400
11 |   // Fail loudly instead of silently evaluating to null.
12 |   else error "Unsupported ENCODER '" + encoder + "': expected 'avg' or 'rnn'";
13 |
14 | {
15 |   "dataset_reader": {
16 |     "type": "sds-extractive",
17 |     "tokenizer": {
18 |       "type": "word",
19 |       "word_splitter": {
20 |         "type": "just_spaces"
21 |       }
22 |     },
23 |     "token_indexers": {
24 |       "tokens": {
25 |         "type": "single_id",
26 |         "lowercase_tokens": true
27 |       }
28 |     },
29 |     "max_num_sentences": 50
30 |   },
31 |   "vocabulary": {
32 |     "pretrained_files": {
33 |       "tokens": "(http://nlp.stanford.edu/data/glove.6B.zip)#glove.6B.200d.txt"
34 |     },
35 |     "only_include_pretrained_words": true
36 |   },
37 |   "train_data_path": "https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/train.v1.0.jsonl.gz",
38 |   "validation_data_path": "https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/valid.v1.0.jsonl.gz",
39 |   "model": {
40 |     "type": "sds-extractive-baseline",
41 |     "token_embedder": {
42 |       "token_embedders": {
43 |         "tokens": {
44 |           "type": "embedding",
45 |           "embedding_dim": 200,
46 |           "trainable": false,
47 |           "pretrained_file": "(http://nlp.stanford.edu/data/glove.6B.zip)#glove.6B.200d.txt",
48 |         }
49 |       }
50 |     },
51 |     "sentence_encoder":
52 |       if encoder == "avg" then {
53 |         "type": "boe",
54 |         "embedding_dim": 200,
55 |         "averaged": true
56 |       }
57 |       else if encoder == "rnn" then {
58 |         "type": "gru",
59 |         "input_size": 200,
60 |         "hidden_size": 200,
61 |         "bidirectional": true
62 |       }
63 |       // Same guard as decoder_input_size: never emit a null encoder.
64 |       else error "Unsupported ENCODER '" + encoder + "': expected 'avg' or 'rnn'",
65 |     "sentence_extractor": {
66 |       "type": "rnn",
67 |       "rnn": {
68 |         "type": "gru",
69 |         "input_size": decoder_input_size,
70 |         "hidden_size": 300,
71 |         "bidirectional": true,
72 |       },
73 |       "feed_forward": {
74 |         // 600 = 2 x 300, the output size of the bidirectional GRU above.
75 |         "input_dim": 600,
76 |         "num_layers": 2,
77 |         "hidden_dims": [100, 1],
78 |         "activations": ["relu", "linear"],
79 |         "dropout": [0.25, 0.0]
80 |       },
81 |       "dropout": 0.25
82 |     },
83 |     "max_words": 100,
84 |     "dropout": 0.25,
85 |     "metrics": [
86 |       {
87 |         "type": "python-rouge",
88 |         "ngram_orders": [2],
89 |         "max_words": 100,
90 |         "remove_stopwords": true
91 |       }
92 |     ]
93 |   },
94 |   "iterator": {
95 |     "type": "bucket",
96 |     "batch_size": 32,
97 |     "sorting_keys": [["document", "num_fields"]]
98 |   },
99 |   "validation_iterator": {
100 |     "type": "bucket",
101 |     "batch_size": 32,
102 |     "sorting_keys": [["document", "num_fields"]]
103 |   },
104 |   "trainer": {
105 |     "optimizer": {
106 |       "type": "adam",
107 |       "lr": 0.0001
108 |     },
109 |     "grad_norm": 5,
110 |     "num_epochs": 20,
111 |     "validation_metric": "+R2-R",
112 |     "cuda_device": 0
113 |   }
114 | }
115 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/predict.sh:
--------------------------------------------------------------------------------
# Runs the trained extractive model over the validation and test splits.
# NOTE: the script relies on BASH_SOURCE, so the shell that runs it must be bash.
if [ "$#" -ne 2 ]; then
    echo "Usage: sh predict.sh <encoder> <extractor>" >&2
    # Signal the misuse to the caller with a non-zero status
    exit 1
fi

encoder=$1
extractor=$2

expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
model_file="${expt_dir}/model/${encoder}/${extractor}/model.tar.gz"
output_dir="${expt_dir}/output/${encoder}/${extractor}"

mkdir -p "${output_dir}"

for split in valid test; do
    allennlp predict \
        --include-package summarize \
        --predictor sds-extractive-predictor \
        --output-file "${output_dir}/${split}.jsonl" \
        --cuda-device 0 \
        --batch-size 16 \
        --silent \
        --use-dataset-reader \
        "${model_file}" \
        "https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/${split}.v1.0.jsonl.gz"
done
27 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/extractive-model/train.sh:
--------------------------------------------------------------------------------
# Trains the extractive model for the given encoder/extractor combination.
# NOTE: the script relies on BASH_SOURCE, so the shell that runs it must be bash.
if [ "$#" -ne 2 ]; then
    echo "Usage: sh train.sh <encoder> <extractor>" >&2
    # Signal the misuse to the caller with a non-zero status
    exit 1
fi

encoder=$1
extractor=$2

expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
model_dir="${expt_dir}/model/${encoder}/${extractor}"
model_config="${expt_dir}/model.jsonnet"

# Offer to delete a previous run; the path is quoted so that a directory
# name with whitespace can never expand into several rm arguments
if [ -d "${model_dir}" ]; then
    read -p "remove directory ${model_dir}? [y/n] " yn
    case $yn in
        [Yy]* ) rm -rf "${model_dir}";;
        [Nn]* ) ;;
        * ) echo "Please answer yes or no.";;
    esac
fi

# Exported for the training process (presumably read by model.jsonnet as
# external variables -- confirm against the top of the config)
export ENCODER="${encoder}"
export EXTRACTOR="${extractor}"
allennlp train \
    --include-package summarize \
    --serialization-dir "${model_dir}" \
    "${model_config}"
28 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/lead/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | results
3 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/lead/run.sh:
--------------------------------------------------------------------------------
# Computes the lead baseline summaries and their ROUGE scores for the
# validation and test splits.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
output_dir="${expt_dir}/output"
results_dir="${expt_dir}/results"
mkdir -p "${output_dir}"
mkdir -p "${results_dir}"

for split in valid test; do
    # The same file is the model input and the ROUGE reference
    data_url="https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/${split}.v1.0.jsonl.gz"

    python -m summarize.models.sds.lead \
        "${data_url}" \
        "${output_dir}/${split}.jsonl" \
        --max-tokens 100

    python -m summarize.metrics.rouge \
        "${data_url}" \
        "${output_dir}/${split}.jsonl" \
        --silent \
        --max-ngram 2 \
        --remove-stopwords \
        --max-words 100 \
        --output-file "${results_dir}/${split}.metrics.json"
done
22 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/oracle/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | results
3 |
--------------------------------------------------------------------------------
/experiments/kedzie2018/cnn-dailymail/oracle/run.sh:
--------------------------------------------------------------------------------
# Builds the oracle summaries from the extractive labels and computes their
# ROUGE scores for the validation and test splits.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
output_dir="${expt_dir}/output"
results_dir="${expt_dir}/results"
mkdir -p "${output_dir}"
mkdir -p "${results_dir}"

for split in valid test; do
    # The same file is the label source and the ROUGE reference
    data_url="https://s3.amazonaws.com/danieldeutsch/summarize/data/kedzie2018/cnn-dailymail/${split}.v1.0.jsonl.gz"

    python -m summarize.utils.extract_summary_from_labels \
        "${data_url}" \
        "${output_dir}/${split}.jsonl"

    python -m summarize.metrics.rouge \
        "${data_url}" \
        "${output_dir}/${split}.jsonl" \
        --silent \
        --max-ngram 2 \
        --remove-stopwords \
        --max-words 100 \
        --output-file "${results_dir}/${split}.metrics.json"
done
21 |
--------------------------------------------------------------------------------
/experiments/onmt/Readme.md:
--------------------------------------------------------------------------------
1 | # OpenNMT Parity Experiment
2 | This experiment aims to compare the performance of the Summarize and OpenNMT models with as close to identical setups as possible to ensure parity between libraries.
3 | The tests train and evaluate the sequence-to-sequence and pointer-generator models which are based on RNNs.
4 | There is a directory for each model that includes more details and the specific commands to reproduce the results.
5 | The OpenNMT commands come from the [summarization example](http://opennmt.net/OpenNMT-py/Summarization.html).
6 |
7 | ## Demo
8 | The final trained model can be demoed using the MyBinder Jupyter Notebook by clicking on this badge:
9 |
10 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/danieldeutsch/summarize/574d0027262573291724c72641a3e4967e018030?filepath=experiments%2Fonmt%2Fdemo.ipynb)
11 |
12 | Please note that generating the summary can take about 1 minute on the MyBinder servers.
13 |
14 |
15 | ## OpenNMT Data Setup
16 | The preprocessing of the CNN/DailyMail dataset is common between both OpenNMT models.
17 | ```
18 | git clone https://github.com/OpenNMT/OpenNMT-py
19 | cd OpenNMT-py
20 | wget https://s3.amazonaws.com/opennmt-models/Summary/cnndm.tar.gz
21 | mkdir data/cnndm
22 | tar -xzvf cnndm.tar.gz -C data/cnndm
23 |
24 | python preprocess.py \
25 | -train_src data/cnndm/train.txt.src \
26 | -train_tgt data/cnndm/train.txt.tgt.tagged \
27 | -valid_src data/cnndm/val.txt.src \
28 | -valid_tgt data/cnndm/val.txt.tgt.tagged \
29 | -save_data data/cnndm/CNNDM \
30 | -src_seq_length 10000 \
31 | -tgt_seq_length 10000 \
32 | -src_seq_length_trunc 400 \
33 | -tgt_seq_length_trunc 100 \
34 | -dynamic_dict \
35 | -share_vocab \
36 | -shard_size 100000
37 | ```
38 |
--------------------------------------------------------------------------------
/experiments/onmt/convert_to_jsonl.py:
--------------------------------------------------------------------------------
1 | """
2 | Converts the output of the OpenNMT models to the jsonl format that
3 | is necessary for evaluation. Additionally, the script will remove the
4 | sentence delimiters from the output.
5 | """
6 | # Edit the system path so the summarize library can be imported
7 | import sys
8 | sys.path.append('.')
9 |
10 | import argparse
11 | import json
12 |
13 | from summarize.data.io import JsonlWriter
14 |
15 |
def main(args):
    """
    Converts the OpenNMT output into the jsonl format used for evaluation.

    Every line of the input file is treated as one summary. The ``<t>`` and
    ``</t>`` sentence delimiters are removed, runs of whitespace are collapsed
    to single spaces, and the result is written as ``{"summary": [text]}``.

    Parameters
    ----------
    args: ``argparse.Namespace``
        Must provide ``input_tsv`` (the OpenNMT output file) and
        ``output_jsonl`` (the file that will be written).
    """
    with JsonlWriter(args.output_jsonl) as out, open(args.input_tsv, 'r') as f:
        for line in f:
            # The delimiters must be spelled out; replacing the empty string
            # leaves the line untouched and the tags would reach the scorer
            line = line.replace('<t>', '').replace('</t>', '')
            # ``split`` without arguments also strips the ends of the line
            summary = ' '.join(line.split())
            out.write({'summary': [summary]})
25 |
26 |
if __name__ == '__main__':
    # Both paths are required positional arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('input_tsv', help='The output from the OpenNMT model')
    parser.add_argument('output_jsonl', help='The converted jsonl file')
    main(parser.parse_args())
33 |
--------------------------------------------------------------------------------
/experiments/onmt/pointer-generator/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 |
--------------------------------------------------------------------------------
/experiments/onmt/pointer-generator/evaluate.sh:
--------------------------------------------------------------------------------
# Scores every prediction file written by predict.sh with ROUGE.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
output_dir="${expt_dir}/output"
results_dir="${expt_dir}/results"

mkdir -p "${results_dir}"

for split in valid test; do
    for constraints in min-length repeated-trigrams length coverage; do
        python -m summarize.metrics.rouge \
            "https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz" \
            "${output_dir}/${split}.${constraints}.jsonl" \
            --silent \
            --compute-rouge-l \
            --output-file "${results_dir}/${split}.${constraints}.metrics.json"
    done
done
17 |
--------------------------------------------------------------------------------
/experiments/onmt/pointer-generator/predict.sh:
--------------------------------------------------------------------------------
# Generates summaries for the validation and test splits, adding the beam
# search constraints one at a time.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
model_file="${expt_dir}/model/model.tar.gz"
output_dir="${expt_dir}/output"

mkdir -p "${output_dir}"

# Runs the predictor once.
#   $1: the data split ("valid" or "test")
#   $2: the name of the constraint setup, used in the output file name
#   $3: (optional) the JSON given to --overrides; when it is missing or empty
#       the flag is omitted and the archived configuration is used unchanged
predict() {
    allennlp predict \
        --include-package summarize \
        --predictor sds-abstractive-predictor \
        --output-file "${output_dir}/$1.$2.jsonl" \
        --cuda-device 0 \
        --batch-size 16 \
        --silent \
        --use-dataset-reader \
        ${3:+--overrides "$3"} \
        "${model_file}" \
        "https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/$1.v1.0.jsonl.gz"
}

for split in valid test; do
    # add minimum length
    predict "${split}" min-length \
        '{"model.beam_search.disallow_repeated_ngrams": null, "model.beam_search.repeated_ngrams_exceptions": null, "model.beam_search.length_penalizer": null, "model.beam_search.coverage_penalizer": null}'

    # add disallow repeated trigrams
    predict "${split}" repeated-trigrams \
        '{"model.beam_search.length_penalizer": null, "model.beam_search.coverage_penalizer": null}'

    # add length penalizer
    predict "${split}" length \
        '{"model.beam_search.coverage_penalizer": null}'

    # add coverage penalizer
    predict "${split}" coverage
done
59 |
--------------------------------------------------------------------------------
/experiments/onmt/pointer-generator/replace-config.sh:
--------------------------------------------------------------------------------
# Replaces the configuration stored in the trained model archive with
# model.jsonnet. The archive is passed twice, so it is rewritten in place.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
model_config="${expt_dir}/model.jsonnet"
model_tar="${expt_dir}/model/model.tar.gz"

python -m summarize.utils.replace_config \
    "${model_tar}" \
    "${model_tar}" \
    "${model_config}"
9 |
--------------------------------------------------------------------------------
/experiments/onmt/pointer-generator/run.sh:
--------------------------------------------------------------------------------
# Runs the full experiment: train, replace the config, predict, evaluate.
# Stop at the first failing step so that later steps never run on a missing
# or stale model.
set -e

expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"

# The step scripts use BASH_SOURCE, so they are run with bash rather than sh
# (sh is not bash on every system)
bash "${expt_dir}/train.sh"
bash "${expt_dir}/replace-config.sh"
bash "${expt_dir}/predict.sh"
bash "${expt_dir}/evaluate.sh"
7 |
--------------------------------------------------------------------------------
/experiments/onmt/pointer-generator/train.sh:
--------------------------------------------------------------------------------
# Trains the pointer-generator model with beam search disabled.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
model_dir="${expt_dir}/model"
model_config="${expt_dir}/model.jsonnet"

# Offer to delete a previous run; the path is quoted so that a directory
# name with whitespace can never expand into several rm arguments
if [ -d "${model_dir}" ]; then
    read -p "remove directory ${model_dir}? [y/n] " yn
    case $yn in
        [Yy]* ) rm -rf "${model_dir}";;
        [Nn]* ) ;;
        * ) echo "Please answer yes or no.";;
    esac
fi

allennlp train \
    --include-package summarize \
    --serialization-dir "${model_dir}" \
    --overrides '{"model.run_beam_search": false}' \
    --file-friendly-logging \
    "${model_config}"
20 |
--------------------------------------------------------------------------------
/experiments/onmt/seq2seq/.gitignore:
--------------------------------------------------------------------------------
1 | model
2 | output
3 | results
4 |
--------------------------------------------------------------------------------
/experiments/onmt/seq2seq/evaluate.sh:
--------------------------------------------------------------------------------
# Scores every prediction file written by predict.sh with ROUGE.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
output_dir="${expt_dir}/output"
results_dir="${expt_dir}/results"

mkdir -p "${results_dir}"

for split in valid test; do
    for constraints in min-length repeated-trigrams length coverage; do
        python -m summarize.metrics.rouge \
            "https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/${split}.v1.0.jsonl.gz" \
            "${output_dir}/${split}.${constraints}.jsonl" \
            --silent \
            --compute-rouge-l \
            --output-file "${results_dir}/${split}.${constraints}.metrics.json"
    done
done
17 |
--------------------------------------------------------------------------------
/experiments/onmt/seq2seq/predict.sh:
--------------------------------------------------------------------------------
# Generates summaries for the validation and test splits, adding the beam
# search constraints one at a time.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
model_file="${expt_dir}/model/model.tar.gz"
output_dir="${expt_dir}/output"

mkdir -p "${output_dir}"

# Runs the predictor once.
#   $1: the data split ("valid" or "test")
#   $2: the name of the constraint setup, used in the output file name
#   $3: (optional) the JSON given to --overrides; when it is missing or empty
#       the flag is omitted and the archived configuration is used unchanged
predict() {
    allennlp predict \
        --include-package summarize \
        --predictor sds-abstractive-predictor \
        --output-file "${output_dir}/$1.$2.jsonl" \
        --cuda-device 0 \
        --batch-size 16 \
        --silent \
        --use-dataset-reader \
        ${3:+--overrides "$3"} \
        "${model_file}" \
        "https://danieldeutsch.s3.amazonaws.com/summarize/data/onmt/$1.v1.0.jsonl.gz"
}

for split in valid test; do
    # add minimum length
    predict "${split}" min-length \
        '{"model.beam_search.disallow_repeated_ngrams": null, "model.beam_search.repeated_ngrams_exceptions": null, "model.beam_search.length_penalizer": null, "model.beam_search.coverage_penalizer": null}'

    # add disallow repeated trigrams
    predict "${split}" repeated-trigrams \
        '{"model.beam_search.length_penalizer": null, "model.beam_search.coverage_penalizer": null}'

    # add length penalizer
    predict "${split}" length \
        '{"model.beam_search.coverage_penalizer": null}'

    # add coverage penalizer
    predict "${split}" coverage
done
59 |
--------------------------------------------------------------------------------
/experiments/onmt/seq2seq/replace-config.sh:
--------------------------------------------------------------------------------
# Replaces the configuration stored in the trained model archive with
# model.jsonnet. The archive is passed twice, so it is rewritten in place.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
model_config="${expt_dir}/model.jsonnet"
model_tar="${expt_dir}/model/model.tar.gz"

python -m summarize.utils.replace_config \
    "${model_tar}" \
    "${model_tar}" \
    "${model_config}"
9 |
--------------------------------------------------------------------------------
/experiments/onmt/seq2seq/run.sh:
--------------------------------------------------------------------------------
# Runs the full experiment: train, replace the config, predict, evaluate.
# Stop at the first failing step so that later steps never run on a missing
# or stale model.
set -e

expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"

# The step scripts use BASH_SOURCE, so they are run with bash rather than sh
# (sh is not bash on every system)
bash "${expt_dir}/train.sh"
bash "${expt_dir}/replace-config.sh"
bash "${expt_dir}/predict.sh"
bash "${expt_dir}/evaluate.sh"
7 |
--------------------------------------------------------------------------------
/experiments/onmt/seq2seq/train.sh:
--------------------------------------------------------------------------------
# Trains the sequence-to-sequence model with beam search disabled.
expt_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
model_dir="${expt_dir}/model"
model_config="${expt_dir}/model.jsonnet"

# Offer to delete a previous run; the path is quoted so that a directory
# name with whitespace can never expand into several rm arguments
if [ -d "${model_dir}" ]; then
    read -p "remove directory ${model_dir}? [y/n] " yn
    case $yn in
        [Yy]* ) rm -rf "${model_dir}";;
        [Nn]* ) ;;
        * ) echo "Please answer yes or no.";;
    esac
fi

allennlp train \
    --include-package summarize \
    --serialization-dir "${model_dir}" \
    --overrides '{"model.run_beam_search": false}' \
    "${model_config}"
19 |
--------------------------------------------------------------------------------
/experiments/wikicite/analysis/document-distribution/Readme.md:
--------------------------------------------------------------------------------
1 | This experiment calculates statistics about the reference documents.
2 | It can be run as follows:
3 | ```
4 | python experiments/wikicite/analysis/document-distribution/run.py \
5 | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/train.tokenized.v1.1.jsonl.gz
6 | ```
7 |
--------------------------------------------------------------------------------
/experiments/wikicite/analysis/document-distribution/run.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | import argparse
3 | import sys
4 | from collections import defaultdict, Counter
5 | from tqdm import tqdm
6 |
7 | sys.path.append('../summarize')
8 |
9 | from summarize.data.io import JsonlReader
10 |
11 |
def _percent(count, total):
    """Returns ``count`` as a percentage of ``total``, or 0.0 if ``total`` is 0."""
    return count / total * 100 if total else 0.0


def main(args):
    """
    Prints statistics about the reference documents of a WikiCite file.

    The statistics are the number of unique documents (identified by their
    ``canonical_url``), the number of instances with more than one document,
    the number of documents that occur more than once, and the number of
    documents that occur under more than one ``page_id``. An empty input
    reports 0.00% everywhere instead of failing on a division by zero.

    Parameters
    ----------
    args: ``argparse.Namespace``
        Must provide ``input_jsonl``, the file to analyze.
    """
    # The number of times each document appears
    document_to_num_occurrences = Counter()
    # The histogram of the document set sizes
    document_set_sizes = Counter()
    # The mapping from the document to the page ids
    document_to_page_ids = defaultdict(set)

    with JsonlReader(args.input_jsonl) as f:
        for instance in tqdm(f):
            page_id = instance['page_id']
            documents = instance['documents']
            document_set_sizes[len(documents)] += 1

            for document in documents:
                url = document['canonical_url']
                document_to_num_occurrences[url] += 1
                document_to_page_ids[url].add(page_id)

    # The histogram for the number of times a document appears
    num_occurrences_to_num_documents = Counter(document_to_num_occurrences.values())

    # The histogram for the number of pages a document appears
    num_pages_to_num_documents = Counter(
        len(page_ids) for page_ids in document_to_page_ids.values()
    )

    # Everything that is not in the "exactly one" bucket counts as "multiple"
    num_instances = sum(document_set_sizes.values())
    num_multidoc = num_instances - document_set_sizes[1]

    num_unique_documents = len(document_to_num_occurrences)
    num_documents_multiple_times = num_unique_documents - num_occurrences_to_num_documents[1]

    num_documents_multiple_pages = num_unique_documents - num_pages_to_num_documents[1]

    print(f'Total unique documents: {num_unique_documents}')
    print(f'Total multi-document: {num_multidoc} ({_percent(num_multidoc, num_instances):.2f}%)')
    print(f'Total documents appear more than once: {num_documents_multiple_times} ({_percent(num_documents_multiple_times, num_unique_documents):.2f}%)')
    print(f'Total documents that appear in more than one page: {num_documents_multiple_pages} ({_percent(num_documents_multiple_pages, num_unique_documents):.2f}%)')
53 |
54 |
if __name__ == '__main__':
    # The dataset path is the only (required) argument
    parser = argparse.ArgumentParser()
    parser.add_argument('input_jsonl', help='The WikiCite dataset to analyze')
    main(parser.parse_args())
60 |
--------------------------------------------------------------------------------
/experiments/wikicite/analysis/topic-distribution/.gitignore:
--------------------------------------------------------------------------------
1 | plots
2 |
--------------------------------------------------------------------------------
/experiments/wikicite/analysis/topic-distribution/Readme.md:
--------------------------------------------------------------------------------
1 | This experiment calculates statistics about the topic frequencies in the WikiCite dataset.
2 | It can be run as follows:
3 | ```
4 | python experiments/wikicite/analysis/topic-distribution/run.py \
5 | https://danieldeutsch.s3.amazonaws.com/summarize/data/wikicite/train.tokenized.v1.1.jsonl.gz \
6 | experiments/wikicite/analysis/topic-distribution/plots
7 | ```
8 |
--------------------------------------------------------------------------------
/experiments/wikicite/analysis/topic-distribution/run.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | import argparse
3 | import matplotlib
4 | matplotlib.use('Agg')
5 | import matplotlib.pyplot as plt
6 | import os
7 | import sys
8 | from collections import Counter
9 | from tqdm import tqdm
10 |
11 | sys.path.append('../summarize')
12 |
13 | from summarize.data.io import JsonlReader
14 |
15 |
def main(args):
    """
    Plots and prints statistics about the topic frequencies in a WikiCite file.

    Topics are the lowercased entries of each instance's ``headings``. Two
    plots are written to ``args.output_dir``: ``topic-distribution.png`` (the
    15 most frequent topics plus an "other" bucket) and
    ``frequency-histogram.png`` (how many topics occur 1, 2, ..., 9, or 10+
    times). The number of unique topics and the 50 entries at the tail of the
    frequency ranking are printed.

    Parameters
    ----------
    args: ``argparse.Namespace``
        Must provide ``input_jsonl`` (the file to analyze) and ``output_dir``
        (created if it does not exist).
    """
    os.makedirs(args.output_dir, exist_ok=True)

    topic_counts = Counter()
    with JsonlReader(args.input_jsonl) as f:
        for instance in tqdm(f):
            topic_counts.update(topic.lower() for topic in instance['headings'])

    top_k = topic_counts.most_common(15)
    topk_topics = [topic for topic, _ in top_k]
    topk_counts = [count for _, count in top_k]

    # Everything outside of the top k is collapsed into a single bar
    other_count = sum(topic_counts.values()) - sum(topk_counts)
    topk_topics.append('other')
    topk_counts.append(other_count)

    # The x-axis is in thousands of occurrences
    topk_counts = [count / 1000 for count in topk_counts]

    # ``plt.subplots`` already creates the figure. An additional
    # ``plt.figure()`` call would only leave an unused empty figure open.
    fig, ax = plt.subplots()
    # Reversed positions put the most frequent topic at the top
    x = list(reversed(range(len(topk_counts))))
    ax.barh(x, topk_counts)
    ax.set_yticks(x)
    ax.set_yticklabels(topk_topics)
    ax.set_xlabel('Thousands of Occurrences')
    ax.set_title('Topic Frequencies')
    fig.tight_layout()
    fig.savefig(f'{args.output_dir}/topic-distribution.png', dpi=1000)
    plt.close(fig)

    # Bucket i holds the topics that occur i + 1 times; the last one is "10+"
    count_histogram = [0] * 10
    for count in topic_counts.values():
        count_histogram[min(count, 10) - 1] += 1

    fig, ax = plt.subplots()
    x = list(range(len(count_histogram)))
    labels = list(range(1, len(count_histogram))) + ['10+']
    ax.bar(x, count_histogram)
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.set_xlabel('Number of Occurrences')
    ax.set_ylabel('Number of Topics')
    ax.set_title('Topic Frequency Histogram')
    fig.tight_layout()
    fig.savefig(f'{args.output_dir}/frequency-histogram.png', dpi=1000)
    plt.close(fig)

    print('Total unique topics: ', len(topic_counts))

    print('Sample unique topics')
    print('--------------------')
    # The tail of the ranking holds the least frequent topics
    for topic, _ in topic_counts.most_common()[-50:]:
        print(topic)
78 |
79 |
if __name__ == '__main__':
    # Both the dataset and the plot directory are required
    parser = argparse.ArgumentParser()
    parser.add_argument('input_jsonl', help='The WikiCite dataset to analyze')
    parser.add_argument('output_dir', help='The directory where the plot should be written')
    main(parser.parse_args())
86 |
--------------------------------------------------------------------------------
/external/ROUGE-1.5.5/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 |
--------------------------------------------------------------------------------
/external/ROUGE-1.5.5/Readme.md:
--------------------------------------------------------------------------------
1 | # ROUGE-1.5.5
2 | Unfortunately, due to licensing issues, we cannot release the original source code to compute ROUGE.
3 | If you have a copy, place the contents of the ROUGE-1.5.5 directory here.
4 |
--------------------------------------------------------------------------------
/external/meteor/.gitignore:
--------------------------------------------------------------------------------
1 | meteor-1.5
2 |
--------------------------------------------------------------------------------
/external/meteor/Readme.md:
--------------------------------------------------------------------------------
1 | # Meteor
2 | [Meteor](https://www.cs.cmu.edu/~alavie/METEOR/) is an evaluation metric for machine translation that is commonly used in summarization.
3 | To set up Meteor, run the `setup.sh` script from the root of the repository.
4 |
--------------------------------------------------------------------------------
/external/meteor/setup.sh:
--------------------------------------------------------------------------------
# Downloads and unpacks Meteor 1.5. Run from the root of the repository.
# Abort if the download fails so tar never runs on a missing or partial archive.
set -e

wget https://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz -O external/meteor/meteor-1.5.tar.gz
tar xzvf external/meteor/meteor-1.5.tar.gz -C external/meteor
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | allennlp==0.9.0
2 | enforce
3 |
--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.6
2 |
--------------------------------------------------------------------------------
/summarize/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/__init__.py
--------------------------------------------------------------------------------
/summarize/common/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.common.tempdir import TemporaryDirectory
2 |
--------------------------------------------------------------------------------
/summarize/common/tempdir.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import tempfile
3 | from typing import Optional
4 |
5 |
class TemporaryDirectory(object):
    """
    Creates a temporary directory that works with a context manager (the python
    ``with`` statement). This class was created because the user is responsible for
    deleting the directory created by ``tempfile.mkdtemp``. Instead, the context
    manager ensures the directory is deleted at the end.

    Example usage::

        with TemporaryDirectory() as temp_dir:
            with open(temp_dir + '/file.txt', 'w') as out:
                ...

    Parameters
    ----------
    root: ``str``, optional (default = ``None``)
        The root directory where the temporary directory should be created. If ``None``,
        the ``tempfile.mkdtemp`` default location is used.
    persist: ``bool``, optional (default = False)
        Indicates whether or not the directory should persist on disk after the
        context closes.

    Attributes
    ----------
    path: ``str`` or ``None``
        The path of the directory created by the most recent ``__enter__``, or
        ``None`` if the context has never been entered.
    """
    def __init__(self,
                 root: Optional[str] = None,
                 persist: bool = False) -> None:
        self.root = root
        self.persist = persist
        # Defined up front so the attribute exists before the context is entered
        self.path: Optional[str] = None

    def __enter__(self) -> str:
        """Creates the directory and returns its path."""
        self.path = tempfile.mkdtemp(dir=self.root)
        return self.path

    def __exit__(self, *args) -> None:
        """Deletes the directory and its contents unless ``persist`` is set."""
        if not self.persist and self.path is not None:
            shutil.rmtree(self.path)
41 |
--------------------------------------------------------------------------------
/summarize/common/testing.py:
--------------------------------------------------------------------------------
1 | FIXTURES_ROOT = 'summarize/tests/fixtures'
2 |
--------------------------------------------------------------------------------
/summarize/common/util.py:
--------------------------------------------------------------------------------
1 | # These symbols are used when beginning- and end-of-sentence tags are required
2 | # in addition to START_SYMBOL and END_SYMBOL, which mark the starting and
3 | # ending of full sequences.
4 | SENT_START_SYMBOL = '@sent_start@'
5 | SENT_END_SYMBOL = '@sent_end@'
6 |
7 | # This symbol represents the copy token in the Pointer-Generator model
8 | COPY_SYMBOL = '@copy@'
9 |
--------------------------------------------------------------------------------
/summarize/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/data/__init__.py
--------------------------------------------------------------------------------
/summarize/data/dataset_readers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/data/dataset_readers/__init__.py
--------------------------------------------------------------------------------
/summarize/data/dataset_readers/cloze/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.data.dataset_readers.cloze.abstractive import AbstractiveClozeDatasetReader
2 | from summarize.data.dataset_readers.cloze.extractive import ExtractiveClozeDatasetReader
3 | from summarize.data.dataset_readers.cloze.pointer_generator import PointerGeneratorClozeDatasetReader
4 |
--------------------------------------------------------------------------------
/summarize/data/dataset_readers/sds/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.data.dataset_readers.sds.abstractive import AbstractiveDatasetReader
2 | from summarize.data.dataset_readers.sds.extractive import ExtractiveDatasetReader
3 | from summarize.data.dataset_readers.sds.pointer_generator import PointerGeneratorDatasetReader
4 |
--------------------------------------------------------------------------------
/summarize/data/dataset_setup/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/data/dataset_setup/__init__.py
--------------------------------------------------------------------------------
/summarize/data/dataset_setup/deutsch2019.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from joblib import Parallel, delayed
3 | from tqdm import tqdm
4 | from typing import Dict, List
5 |
6 | from summarize.data.io import JsonlReader, JsonlWriter
7 | from summarize.metrics.python_rouge import PythonRouge
8 | from summarize.metrics.rouge import R1_RECALL
9 | from summarize.models.sds.oracle import get_greedy_oracle_summary
10 |
# Number of instances that ``main`` accumulates before handing them to
# ``_process_batch`` as a single batch of parallel jobs.
_BATCH_SIZE = 100
12 |
13 |
def _process_batch(parallel: Parallel,
                   batch: List[Dict[str, List[str]]],
                   python_rouge: PythonRouge,
                   out: JsonlWriter) -> None:
    """
    Computes the greedy oracle labels for a batch of cloze instances in parallel
    and writes one preprocessed instance per input instance to ``out``.

    Parameters
    ----------
    parallel: ``Parallel``
        The joblib worker pool which runs the oracle jobs.
    batch: ``List[Dict[str, List[str]]]``
        The instances to process. The ``documents``, ``cloze``, ``id``,
        ``page_title``, ``headings`` and ``left_context`` fields are read.
    python_rouge: ``PythonRouge``
        The ROUGE implementation passed to every oracle job.
    out: ``JsonlWriter``
        The writer which receives the output instances.
    """
    oracle = delayed(get_greedy_oracle_summary)

    flattened_documents = []
    jobs = []
    for instance in batch:
        # Collapse documents -> paragraphs -> sentences into one flat sentence list
        sentences = []
        for source in instance['documents']:
            for paragraph in source['paragraphs']:
                sentences.extend(paragraph)

        # The cloze is the only reference the oracle optimizes against
        jobs.append(oracle(sentences, [instance['cloze']], R1_RECALL,
                           use_porter_stemmer=True,
                           remove_stopwords=False,
                           python_rouge=python_rouge))
        flattened_documents.append(sentences)

    # Each result is a pair; only its second element (the labels) is kept
    for instance, sentences, (_, labels) in zip(batch, flattened_documents, parallel(jobs)):
        out.write({
            'id': instance['id'],
            'topics': [instance['page_title']] + instance['headings'],
            'document': sentences,
            'context': instance['left_context'],
            'cloze': instance['cloze'],
            'labels': labels
        })
49 |
50 |
def main(args):
    """
    Reads ``args.input_jsonl``, computes the oracle labels in batches of
    ``_BATCH_SIZE`` instances using ``args.num_cores`` workers, and writes the
    preprocessed instances to ``args.output_jsonl``.
    """
    python_rouge = PythonRouge()
    with JsonlWriter(args.output_jsonl) as out, \
            JsonlReader(args.input_jsonl) as f, \
            Parallel(n_jobs=args.num_cores) as parallel:
        pending = []
        for instance in tqdm(f):
            pending.append(instance)
            if len(pending) != _BATCH_SIZE:
                continue
            _process_batch(parallel, pending, python_rouge, out)
            pending = []

        # Flush whatever is left over from the last, partially filled batch
        if pending:
            _process_batch(parallel, pending, python_rouge, out)
65 |
66 |
if __name__ == '__main__':
    # Command line entry point: parse the arguments and run the preprocessing
    parser = argparse.ArgumentParser()
    parser.add_argument('input_jsonl', help='The input file to preprocess')
    parser.add_argument('output_jsonl', help='The output file')
    parser.add_argument('--num-cores', type=int, default=1, help='The number of cores to use')
    main(parser.parse_args())
74 |
--------------------------------------------------------------------------------
/summarize/data/dataset_setup/kedzie2018.py:
--------------------------------------------------------------------------------
1 | """
2 | Prepares the datasets to reproduce Kedzie 2018 by computing greedy oracle
3 | summaries by optimizing ROUGE-1 recall.
4 | """
5 | import argparse
6 | from joblib import Parallel, delayed
7 | from tqdm import tqdm
8 | from typing import Dict, List
9 |
10 | from summarize.data.io import JsonlReader, JsonlWriter
11 | from summarize.metrics.python_rouge import PythonRouge
12 | from summarize.metrics.rouge import R1_RECALL
13 | from summarize.models.sds.oracle import get_greedy_oracle_summary
14 |
# Number of instances that ``main`` accumulates before handing them to
# ``_process_batch`` as a single batch of parallel jobs.
_BATCH_SIZE = 100
16 |
17 |
def _process_batch(parallel: Parallel,
                   batch: List[Dict[str, List[str]]],
                   max_tokens: int,
                   python_rouge: PythonRouge,
                   out: JsonlWriter) -> None:
    """
    Computes the greedy oracle labels for a batch of instances in parallel,
    stores them on each instance under ``labels`` and writes the instance to ``out``.

    Parameters
    ----------
    parallel: ``Parallel``
        The joblib worker pool which runs the oracle jobs.
    batch: ``List[Dict[str, List[str]]]``
        The instances to process. The ``document`` and ``summary`` fields are read
        and the ``labels`` field is added in place.
    max_tokens: ``int``
        Forwarded to the oracle as its ``max_tokens`` argument.
    python_rouge: ``PythonRouge``
        The ROUGE implementation passed to every oracle job.
    out: ``JsonlWriter``
        The writer which receives the labeled instances.
    """
    oracle = delayed(get_greedy_oracle_summary)
    jobs = [
        oracle(instance['document'], instance['summary'],
               R1_RECALL,
               max_tokens=max_tokens,
               use_porter_stemmer=True,
               remove_stopwords=True,
               python_rouge=python_rouge)
        for instance in batch
    ]

    for instance, result in zip(batch, parallel(jobs)):
        # Each result is a pair; only its second element (the labels) is kept
        _, labels = result
        instance['labels'] = labels
        out.write(instance)
39 |
40 |
def main(args):
    """
    Reads ``args.input_jsonl``, computes the oracle labels in batches of
    ``_BATCH_SIZE`` instances using ``args.num_cores`` workers, and writes the
    labeled instances to ``args.output_jsonl``.
    """
    python_rouge = PythonRouge()
    with JsonlWriter(args.output_jsonl) as out, \
            JsonlReader(args.input_jsonl) as f, \
            Parallel(n_jobs=args.num_cores) as parallel:
        pending = []
        for instance in tqdm(f):
            pending.append(instance)
            if len(pending) != _BATCH_SIZE:
                continue
            _process_batch(parallel, pending, args.max_tokens, python_rouge, out)
            pending = []

        # Flush whatever is left over from the last, partially filled batch
        if pending:
            _process_batch(parallel, pending, args.max_tokens, python_rouge, out)
55 |
56 |
if __name__ == '__main__':
    # Command line entry point: parse the arguments and run the dataset setup
    parser = argparse.ArgumentParser()
    parser.add_argument('input_jsonl', help='The dataset to setup')
    parser.add_argument('output_jsonl', help='The output file')
    parser.add_argument('max_tokens', type=int, help='The maximum number of tokens to take in the greedy summary')
    parser.add_argument('--num-cores', type=int, default=1, help='The number of cores to use')
    main(parser.parse_args())
65 |
--------------------------------------------------------------------------------
/summarize/data/dataset_setup/tokenize.py:
--------------------------------------------------------------------------------
1 | """
2 | Tokenizes fields in a jsonl dataset file with either the English spacy tokenizer (the default) or the NLTK word tokenizer.
3 | """
4 | import argparse
5 | import nltk
6 | import spacy
7 | from tqdm import tqdm
8 | from typing import Callable, Iterable, T
9 |
10 | from summarize.data.io import JsonlReader, JsonlWriter
11 |
12 |
def tokenize(tokenize_func: Callable[[str], Iterable[T]], field):
    """
    Tokenizes text with the given tokenizer function. ``field`` is either a
    string or an arbitrarily nested list of strings; the result has the same
    nesting, with every string replaced by its tokens joined by single spaces.

    ``tokenize_func`` can be any callable which maps a string to an iterable of
    tokens that can be cast to strings, such as the ``nlp`` object from spacy
    or the ``word_tokenize`` function from nltk.

    For example::

        nlp = spacy.load('en')
        tokenize(nlp, "Hi, I'm Dan.")
        >>> "Hi , I 'm Dan ."
        tokenize(nlp, [['The first.', 'The second.'], 'The third.'])
        >>> [['The first .', 'The second .'], 'The third .']

        from nltk import word_tokenize
        tokenize(word_tokenize, 'This is the NLTK version.')
        >>> 'This is the NLTK version .'

    Parameters
    ----------
    tokenize_func: ``Callable[[str], Iterable[T]]``, required.
        The tokenization function, as described above.
    field: required.
        The string or nested list of strings to tokenize.

    Returns
    -------
    The tokenized text, with the same level of nesting as ``field``.

    Raises
    ------
    TypeError
        If ``field`` (or any nested item) is neither a string nor a list.
    """
    # Lists are handled recursively so that the nesting is preserved
    if isinstance(field, list):
        return [tokenize(tokenize_func, element) for element in field]
    if isinstance(field, str):
        return ' '.join(str(token) for token in tokenize_func(field))
    raise TypeError(f'Unknown ``field`` type {type(field)}')
52 |
53 |
def main(args):
    """
    Tokenizes the requested fields of every instance in ``args.input_file`` and
    writes the updated instances to ``args.output_file``.

    Parameters
    ----------
    args:
        The parsed command line arguments. ``backend`` selects the tokenizer
        (``'spacy'`` or ``'nltk'``), ``fields`` lists the instance fields to
        tokenize, and ``input_file``/``output_file`` are the jsonl paths.

    Raises
    ------
    ValueError
        If ``args.backend`` is not a supported backend.
    """
    if args.backend == 'spacy':
        # Only the tokenizer is needed, so the other pipeline components are disabled
        nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
    elif args.backend == 'nltk':
        nlp = nltk.word_tokenize
    else:
        # Fail before the output file is created instead of hitting an unbound
        # ``nlp`` after the output has already been truncated
        raise ValueError(f'Unknown tokenization backend: {args.backend!r}')

    with JsonlWriter(args.output_file) as out:
        with JsonlReader(args.input_file) as f:
            for instance in tqdm(f, desc=f'Tokenizing {args.input_file}'):
                for field in args.fields:
                    instance[field] = tokenize(nlp, instance[field])
                out.write(instance)
66 |
67 |
if __name__ == '__main__':
    # Command line entry point: parse the arguments and run the tokenization
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', help='The jsonl file with fields to tokenize')
    parser.add_argument('output_file', help='The output jsonl file with the tokenized data')
    parser.add_argument('fields', nargs='+')
    parser.add_argument('--backend', default='spacy', choices=['spacy', 'nltk'],
                        help='Indicates which library should be used for tokenization')
    main(parser.parse_args())
77 |
--------------------------------------------------------------------------------
/summarize/data/dataset_setup/wikicite.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocesses the original WikiCite dataset by tokenizing all of the text fields.
3 | """
4 | import argparse
5 | import spacy
6 | from tqdm import tqdm
7 |
8 | from summarize.data.dataset_setup.tokenize import tokenize
9 | from summarize.data.io import JsonlReader, JsonlWriter
10 |
11 |
def main(args):
    """
    Tokenizes every text field of the WikiCite instances in ``args.input_jsonl``
    with spacy and writes the result to ``args.output_jsonl``.

    The tokenized fields are the headings, each document's title (when it is
    non-empty) and paragraphs, and the left context, cloze and right context.
    """
    # Only the tokenizer is needed, so the other pipeline components are disabled
    nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

    def _tokenize(text):
        return tokenize(nlp, text)

    with JsonlWriter(args.output_jsonl) as out, JsonlReader(args.input_jsonl) as f:
        for instance in tqdm(f):
            instance['headings'] = [_tokenize(heading) for heading in instance['headings']]

            for document in instance['documents']:
                title = document['title']
                # Empty or missing-valued titles are left untouched
                if title:
                    document['title'] = _tokenize(title)
                document['paragraphs'] = _tokenize(document['paragraphs'])

            for key in ('left_context', 'cloze', 'right_context'):
                instance[key] = _tokenize(instance[key])
            out.write(instance)
28 |
29 |
if __name__ == '__main__':
    # Command line entry point: parse the arguments and run the preprocessing
    parser = argparse.ArgumentParser()
    parser.add_argument('input_jsonl', help='The input file to setup')
    parser.add_argument('output_jsonl', help='The output file')
    main(parser.parse_args())
36 |
--------------------------------------------------------------------------------
/summarize/data/dataset_stats/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/data/dataset_stats/__init__.py
--------------------------------------------------------------------------------
/summarize/data/io/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.data.io.jsonl_reader import JsonlReader
2 | from summarize.data.io.jsonl_writer import JsonlWriter
3 |
--------------------------------------------------------------------------------
/summarize/data/io/jsonl_reader.py:
--------------------------------------------------------------------------------
1 | import bz2
2 | import gzip
3 | import json
4 | from allennlp.common.file_utils import cached_path
5 | from typing import Any, List
6 |
7 | from summarize.data.io.util import is_gz_file
8 |
9 |
class JsonlReader(object):
    """
    The `JsonlReader` is a layer of abstraction around reading serialized
    objects from a jsonl file. The reader will automatically deserialize and return
    one object from each line in the file. The data in the file will be decoded
    from a binary file depending on the extension of the file name. Current
    supported binary formats are gzip (``.gz``) and bz2 (``.bz2``). For gzip only,
    this will also inspect the file to see if it's gzipped in addition to checking
    the extension.

    The contents are always decoded as UTF-8, for compressed and plain text
    files alike, so the result does not depend on the platform's locale.

    The class should be used the same way that a built-in file handler works::

        with JsonlReader('/path/to/file.jsonl.gz') as f:
            for data in f:
                ...

    The class uses the cached path functionality from AllenNLP, so it is also
    possible to pass a url to the constructor.

    Parameters
    ----------
    file_path: ``str``
        The path to the file where the data should be read.
    """
    def __init__(self, file_path: str) -> None:
        self.file_path = cached_path(file_path)

    def __enter__(self):
        """Opens the underlying file, picking the decompressor from the file type."""
        if self.file_path.endswith('.gz') or is_gz_file(self.file_path):
            self.file_handler = gzip.open(self.file_path, 'rb')
            self.binary = True
        elif self.file_path.endswith('.bz2'):
            self.file_handler = bz2.open(self.file_path, 'rb')
            self.binary = True
        else:
            # Without an explicit encoding ``open`` falls back to the locale's
            # preferred encoding, which would disagree with the UTF-8 decoding
            # used for the compressed formats
            self.file_handler = open(self.file_path, 'r', encoding='utf-8')
            self.binary = False
        return self

    def __iter__(self):
        return self

    def __next__(self) -> Any:
        """Deserializes and returns the object on the next line of the file."""
        # ``next`` raises ``StopIteration`` itself once the file is exhausted
        line = next(self.file_handler)
        if self.binary:
            line = line.decode('utf-8')
        return json.loads(line)

    def __exit__(self, *args):
        self.file_handler.close()

    def read(self) -> List[Any]:
        """Reads all of the instances into a list."""
        with self:
            return list(self)
67 |
--------------------------------------------------------------------------------
/summarize/data/io/jsonl_writer.py:
--------------------------------------------------------------------------------
1 | import bz2
2 | import gzip
3 | import json
4 | import os
5 | from typing import Any
6 |
7 |
class JsonlWriter(object):
    """
    The ``JsonlWriter`` hides the details of writing data to a jsonl file. Every
    object passed to ``write`` is serialized to a json string and written on its
    own line. Whether the file is written as plain text or as compressed bytes
    is decided by the file extension: gzip (``.gz``) and bz2 (``.bz2``) are
    written as bytes, every other extension as plain text. Missing parent
    directories of the output file are created when the writer is entered.

    The class should be used the same way that a built-in file handler works::

        with JsonlWriter('/path/to/file.jsonl.gz') as out:
            data = ...  # some data to serialize
            out.write(data)

    Parameters
    ----------
    file_path: ``str``
        The path to the file where the data should be written.
    """
    def __init__(self, file_path: str) -> None:
        self.file_path = file_path

    def __enter__(self):
        """Creates the parent directory if needed and opens the output file."""
        path = self.file_path
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        if path.endswith('.gz'):
            handler, binary = gzip.open(path, 'wb'), True
        elif path.endswith('.bz2'):
            handler, binary = bz2.open(path, 'wb'), True
        else:
            handler, binary = open(path, 'w'), False
        self.file_handler = handler
        self.binary = binary
        return self

    def write(self, object: Any) -> None:
        """
        Serializes the input object to a json string and writes it to the file
        followed by a newline.

        Parameters
        ----------
        object: ``Any``
            The object to write to the file.
        """
        line = json.dumps(object) + '\n'
        # Compressed files were opened in binary mode and need bytes
        self.file_handler.write(line.encode() if self.binary else line)

    def __exit__(self, *args):
        self.file_handler.close()
63 |
--------------------------------------------------------------------------------
/summarize/data/io/util.py:
--------------------------------------------------------------------------------
1 | import binascii
2 |
3 |
def is_gz_file(file_path: str):
    """
    Checks whether a file is gzipped by looking at its first two bytes, which
    are ``1f 8b`` for gzip data. The approach was taken from
    https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed

    Parameters
    ----------
    file_path: ``str``
        The path to the file to inspect.

    Returns
    -------
    True if it is gzipped, False otherwise.
    """
    with open(file_path, 'rb') as f:
        magic = f.read(2)
    return magic == b'\x1f\x8b'
15 |
--------------------------------------------------------------------------------
/summarize/data/paragraph_tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.data.paragraph_tokenizers.paragraph_tokenizer import ParagraphTokenizer
2 | from summarize.data.paragraph_tokenizers.paragraph_word_tokenizer import ParagraphWordTokenizer
3 |
--------------------------------------------------------------------------------
/summarize/data/paragraph_tokenizers/paragraph_tokenizer.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from allennlp.common import Registrable
4 | from allennlp.data.tokenizers.token import Token
5 |
6 |
class ParagraphTokenizer(Registrable):
    """
    A ``ParagraphTokenizer`` is a wrapper around an AllenNLP ``Tokenizer`` for tokenizing
    a list of strings into tokens. The primary use is for tokenizing a pre-sentence-split
    paragraph into a single list of tokens. Having this abstraction at the paragraph-level
    allows for additional functionality, like adding tokens in between the sentences.

    This base class only defines the interface; subclasses must override ``tokenize``.
    """
    def tokenize(self, texts: List[str]) -> List[Token]:
        """
        Actually implements splitting sentences into tokens.

        Parameters
        ----------
        texts : ``List[str]``
            The strings (for example, the sentences of one paragraph) to tokenize.

        Returns
        -------
        tokens : ``List[Token]``
            A single list with the tokens of all of the input strings.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError
23 |
--------------------------------------------------------------------------------
/summarize/data/paragraph_tokenizers/paragraph_word_tokenizer.py:
--------------------------------------------------------------------------------
1 | from allennlp.data.tokenizers import Token, WordTokenizer
2 | from allennlp.data.tokenizers.word_filter import WordFilter, PassThroughWordFilter
3 | from allennlp.data.tokenizers.word_splitter import WordSplitter
4 | from allennlp.data.tokenizers.word_stemmer import WordStemmer, PassThroughWordStemmer
5 | from overrides import overrides
6 | from typing import List
7 |
8 | from summarize.data.paragraph_tokenizers import ParagraphTokenizer
9 |
10 |
@ParagraphTokenizer.register('word')
class ParagraphWordTokenizer(ParagraphTokenizer):
    """
    A ``ParagraphWordTokenizer`` applies a ``WordTokenizer`` to every sentence of
    a paragraph and concatenates the results into one token list. Extra tokens
    can be placed before the first sentence, after the last sentence, and in
    between consecutive sentences.

    Parameters
    ----------
    word_splitter: ``WordSplitter``, optional (default = ``None``)
        See ``WordTokenizer``
    word_filter: ``WordFilter``, optional (default = ``PassThroughWordFilter()``)
        See ``WordTokenizer``
    word_stemmer: ``WordStemmer``, optional (default = ``PassThroughWordStemmer()``)
        See ``WordTokenizer``
    start_tokens: ``List[str]``, optional (default = ``[]``)
        The tokens to insert once at the start of the whole paragraph.
    end_tokens: ``List[str]``, optional (default = ``[]``)
        The tokens to insert once at the end of the whole paragraph.
    in_between_tokens: ``List[str]``, optional (default = ``[]``)
        The tokens to insert in between sentences.
    """
    def __init__(self,
                 word_splitter: WordSplitter = None,
                 word_filter: WordFilter = PassThroughWordFilter(),
                 word_stemmer: WordStemmer = PassThroughWordStemmer(),
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None,
                 in_between_tokens: List[str] = None):
        self.tokenizer = WordTokenizer(word_splitter=word_splitter,
                                       word_filter=word_filter,
                                       word_stemmer=word_stemmer)
        # The special tokens are handled here rather than by the ``WordTokenizer``
        # so that they are added per paragraph instead of per sentence
        self.start_tokens = [Token(text) for text in start_tokens or []]
        self.end_tokens = [Token(text) for text in end_tokens or []]
        self.in_between_tokens = [Token(text) for text in in_between_tokens or []]

    @overrides
    def tokenize(self, texts: List[str]) -> List[Token]:
        tokens = list(self.start_tokens)
        for index, text in enumerate(texts):
            # Separate this sentence from the previous one
            if index > 0:
                tokens.extend(self.in_between_tokens)
            tokens.extend(self.tokenizer.tokenize(text))
        tokens.extend(self.end_tokens)
        return tokens
65 |
--------------------------------------------------------------------------------
/summarize/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/metrics/__init__.py
--------------------------------------------------------------------------------
/summarize/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/models/__init__.py
--------------------------------------------------------------------------------
/summarize/models/cloze/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.models.cloze.extractive_baseline import ClozeExtractiveBaselineModel
2 | from summarize.models.cloze.pointer_generator import ClozePointerGeneratorModel
3 | from summarize.models.cloze.seq2seq import ClozeSeq2SeqModel
4 | from summarize.models.cloze.open_ai_language_model import OpenAILanguageModel
5 |
--------------------------------------------------------------------------------
/summarize/models/cloze/bm25/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/models/cloze/bm25/__init__.py
--------------------------------------------------------------------------------
/summarize/models/cloze/bm25/calculate_df.py:
--------------------------------------------------------------------------------
1 | """
2 | Computes the document-frequency term for calculating BM25. The model will consider
3 | the cloze context as the query and the reference document sentences as the
4 | documents that need to be ranked. Therefore, one sentence is a "document" in the
5 | BM25 equation, and thus the document frequencies should be based on the document
6 | sentences.
7 | """
8 | import argparse
9 | from collections import Counter
10 | from tqdm import tqdm
11 |
12 | from summarize.data.io import JsonlReader, JsonlWriter
13 |
14 |
def main(args):
    """
    Computes the BM25 corpus statistics for ``args.input_jsonl`` and writes them
    to ``args.output_jsonl``.

    Every sentence of every instance's ``document`` field counts as one BM25
    "document". Sentences are lowercased and split on whitespace. The first
    output line holds ``num_documents`` and ``average_document_length``; every
    following line holds one ``token`` together with its document frequency ``df``.
    """
    dfs = Counter()
    total_document_length = 0
    num_documents = 0

    with JsonlReader(args.input_jsonl) as f:
        for instance in tqdm(f, desc='Calculating document frequencies'):
            for sentence in instance['document']:
                tokens = sentence.lower().split()
                total_document_length += len(tokens)
                num_documents += 1
                # A token counts once per sentence regardless of how often it repeats
                dfs.update(set(tokens))

    # An input without any sentences would otherwise fail with a ZeroDivisionError
    if num_documents > 0:
        average_document_length = total_document_length / num_documents
    else:
        average_document_length = 0.0

    with JsonlWriter(args.output_jsonl) as out:
        out.write({'num_documents': num_documents, 'average_document_length': average_document_length})
        for token, df in dfs.items():
            out.write({'token': token, 'df': df})
35 |
36 |
if __name__ == '__main__':
    # Command line entry point: parse the arguments and compute the statistics
    parser = argparse.ArgumentParser()
    parser.add_argument('input_jsonl')
    parser.add_argument('output_jsonl')
    main(parser.parse_args())
43 |
--------------------------------------------------------------------------------
/summarize/models/cloze/lead.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from summarize.data.io import JsonlReader, JsonlWriter
4 | from summarize.models.sds.lead import get_lead_summary
5 |
6 |
def main(args):
    """
    Writes the lead summary of every instance's ``document`` in
    ``args.input_jsonl`` to ``args.output_jsonl`` under the ``args.field_name`` key.

    The lead is limited by ``args.max_sentences``, ``args.max_tokens`` and
    ``args.max_bytes``. Unless ``args.keep_sentences`` is set, the selected
    sentences are joined into a single space-separated string.
    """
    with JsonlWriter(args.output_jsonl) as out, JsonlReader(args.input_jsonl) as f:
        for instance in f:
            sentences = get_lead_summary(instance['document'],
                                         max_sentences=args.max_sentences,
                                         max_tokens=args.max_tokens,
                                         max_bytes=args.max_bytes)
            cloze = sentences if args.keep_sentences else ' '.join(sentences)
            out.write({args.field_name: cloze})
19 |
20 |
if __name__ == '__main__':
    # Command line entry point: parse the arguments and run the lead baseline
    parser = argparse.ArgumentParser()
    parser.add_argument('input_jsonl', help='The input documents')
    parser.add_argument('output_jsonl', help='The output file')
    parser.add_argument('--max-sentences', type=int, help='The number of sentences to take')
    parser.add_argument('--max-tokens', type=int, help='The number of tokens to take')
    parser.add_argument('--max-bytes', type=int, help='The number of bytes to take')
    parser.add_argument('--field-name', default='cloze', help='The name of the output field')
    parser.add_argument('--keep-sentences', action='store_true', help='Indicates if the output field should be left as sentences or flattened')
    main(parser.parse_args())
32 |
--------------------------------------------------------------------------------
/summarize/models/sds/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.models.sds.extractive_baseline import ExtractiveBaselineModel
2 | from summarize.models.sds.pointer_generator import PointerGeneratorModel
3 | from summarize.models.sds.seq2seq import Seq2SeqModel
4 |
--------------------------------------------------------------------------------
/summarize/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/modules/__init__.py
--------------------------------------------------------------------------------
/summarize/modules/coverage_matrix_attention/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.modules.coverage_matrix_attention.coverage_matrix_attention import CoverageMatrixAttention
2 | from summarize.modules.coverage_matrix_attention.matrix_attention_wrapper import MatrixAttentionWrapper
3 | from summarize.modules.coverage_matrix_attention.mlp import MLPCoverageAttention
4 |
--------------------------------------------------------------------------------
/summarize/modules/coverage_matrix_attention/coverage_matrix_attention.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from allennlp.common.registrable import Registrable
3 | from typing import Tuple
4 |
5 |
class CoverageMatrixAttention(torch.nn.Module, Registrable):
    """
    The ``CoverageMatrixAttention`` computes a matrix of attention probabilities
    between the encoder and decoder outputs. The attention function has access
    to the cumulative probabilities that the attention has assigned to each
    input token previously. In addition to the attention probabilities, the function
    should return the coverage vectors which were used to compute the distribution
    at each time step as well as the new coverage vector which takes into account
    the function's computation.

    The module must compute the probabilities instead of the raw scores (like
    the ``MatrixAttention`` module does) because the coverage vector contains
    the accumulated probabilities.

    This base class only defines the interface; subclasses must override ``forward``.
    """
    def forward(self,
                decoder_outputs: torch.Tensor,
                encoder_outputs: torch.Tensor,
                encoder_mask: torch.Tensor,
                coverage_vector: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Computes a matrix of attention probabilities and updates the coverage vector.

        Parameters
        ----------
        decoder_outputs: (batch_size, num_decoder_tokens, hidden_dim)
            The decoder's outputs.
        encoder_outputs: (batch_size, num_encoder_tokens, hidden_dim)
            The encoder's outputs.
        encoder_mask: (batch_size, num_encoder_tokens)
            The encoder token mask.
        coverage_vector: (batch_size, num_encoder_tokens)
            The cumulative attention probability assigned to each input token
            thus far.

        Returns
        -------
        torch.Tensor: (batch_size, num_decoder_tokens, num_encoder_tokens)
            The attention probabilities between each decoder and encoder hidden representations.
        torch.Tensor: (batch_size, num_decoder_tokens, num_encoder_tokens)
            The coverage vectors used to compute the corresponding attention probabilities.
        torch.Tensor: (batch_size, num_encoder_tokens)
            The latest coverage vector after computing the attention probabilities.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError
50 |
--------------------------------------------------------------------------------
/summarize/modules/coverage_matrix_attention/matrix_attention_wrapper.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from allennlp.modules.matrix_attention import MatrixAttention
3 | from allennlp.nn.util import masked_softmax
4 | from overrides import overrides
5 | from typing import Tuple
6 |
7 | from summarize.modules.coverage_matrix_attention import CoverageMatrixAttention
8 |
9 |
@CoverageMatrixAttention.register('matrix-attention')
class MatrixAttentionWrapper(CoverageMatrixAttention):
    """
    Adapts an AllenNLP ``MatrixAttention`` module to the ``CoverageMatrixAttention``
    interface so that attention functions which do not use coverage can be used
    wherever a ``CoverageMatrixAttention`` is expected. The incoming coverage
    vector is ignored (other than as a template for new tensors) and all-zero
    coverage tensors are returned.

    Parameters
    ----------
    matrix_attention: ``MatrixAttention``
        The underlying ``MatrixAttention`` to use.
    """
    def __init__(self, matrix_attention: MatrixAttention) -> None:
        super().__init__()
        self.matrix_attention = matrix_attention

    @overrides
    def forward(self,
                decoder_outputs: torch.Tensor,
                encoder_outputs: torch.Tensor,
                encoder_mask: torch.Tensor,
                coverage_vector: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # shape: (batch_size, num_summary_tokens, num_document_tokens)
        scores = self.matrix_attention(decoder_outputs, encoder_outputs)
        # shape: (batch_size, num_summary_tokens, num_document_tokens)
        probabilities = masked_softmax(scores, encoder_mask)

        # Coverage is not tracked by this module, so placeholder tensors of
        # zeros with the expected shapes are returned instead
        batch_size, _, num_document_tokens = scores.size()
        # shape: (batch_size, num_summary_tokens, num_document_tokens)
        dummy_coverage_vectors = coverage_vector.new_zeros(scores.size())
        # shape: (batch_size, num_document_tokens)
        dummy_coverage_vector = coverage_vector.new_zeros(batch_size, num_document_tokens)

        return probabilities, dummy_coverage_vectors, dummy_coverage_vector
45 |
--------------------------------------------------------------------------------
/summarize/modules/generate_probability_functions/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.modules.generate_probability_functions.generate_probability_function import GenerateProbabilityFunction
2 | from summarize.modules.generate_probability_functions.onmt import ONMTGenerateProbabilityFunction
3 | from summarize.modules.generate_probability_functions.see2017 import See2017GenerateProbabilityFunction
4 |
--------------------------------------------------------------------------------
/summarize/modules/generate_probability_functions/generate_probability_function.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from allennlp.common.registrable import Registrable
3 |
4 |
class GenerateProbabilityFunction(torch.nn.Module, Registrable):
    """
    The interface for modules which compute the probability of generating a
    token (as opposed to copying one). This base class only defines the
    interface; subclasses must override ``forward``.
    """
    def forward(self,
                input_embeddings: torch.Tensor,
                pre_attention_decoder_outputs: torch.Tensor,
                post_attention_decoder_outputs: torch.Tensor,
                attention_context: torch.Tensor) -> torch.Tensor:
        """
        Computes the probability of generating a token, the soft switch from
        See et al. (2017).

        Parameters
        ----------
        input_embeddings: (batch_size, num_summary_tokens, embedding_dim)
            The embeddings which are passed as input to the decoder.
        pre_attention_decoder_outputs: (batch_size, num_summary_tokens, hidden_dim)
            The direct output from the decoder, which does not include any attention.
        post_attention_decoder_outputs: (batch_size, num_summary_tokens, hidden_dim)
            The output of the decoder after attention has been included.
        attention_context: (batch_size, num_summary_tokens, encoder_hidden_dim)
            The attention context (the weighted average of the encoder hidden states
            based on the attention distribution)

        Returns
        -------
        (batch_size, num_summary_tokens):
            The generation probability.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError
33 |
--------------------------------------------------------------------------------
/summarize/modules/generate_probability_functions/onmt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from overrides import overrides
3 |
4 | from summarize.modules.generate_probability_functions import GenerateProbabilityFunction
5 |
6 |
@GenerateProbabilityFunction.register('onmt')
class ONMTGenerateProbabilityFunction(GenerateProbabilityFunction):
    """
    Generation probability in the style of the OpenNMT framework: a single
    linear layer followed by a sigmoid, applied only to the decoder hidden
    states after attention has been included.

    Parameters
    ----------
    decoder_dim: ``int``
        The size of the decoder's hidden state.
    """
    def __init__(self, decoder_dim: int) -> None:
        super().__init__()
        # Projects every decoder state down to one scalar score
        self.hidden_layer = torch.nn.Linear(decoder_dim, 1)

    @overrides
    def forward(self,
                input_embeddings: torch.Tensor,
                pre_attention_decoder_outputs: torch.Tensor,
                post_attention_decoder_outputs: torch.Tensor,
                attention_context: torch.Tensor) -> torch.Tensor:
        # Only the post-attention outputs are used; the other arguments are
        # accepted to satisfy the shared interface.
        # shape: (batch_size, num_summary_tokens, 1)
        scores = self.hidden_layer(post_attention_decoder_outputs)
        # shape: (batch_size, num_summary_tokens)
        scores = scores.squeeze(2)
        return torch.sigmoid(scores)
31 |
--------------------------------------------------------------------------------
/summarize/modules/generate_probability_functions/see2017.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from overrides import overrides
3 |
4 | from summarize.modules.generate_probability_functions import GenerateProbabilityFunction
5 |
6 |
@GenerateProbabilityFunction.register('see2017')
class See2017GenerateProbabilityFunction(GenerateProbabilityFunction):
    """
    Generation probability following See et al. (2017): the sigmoid of the sum
    of three linear scores, computed from the decoder input embedding, the
    decoder output before attention, and the attention context vector.

    Parameters
    ----------
    embedding_dim: ``int``
        The size of the input embeddings to the decoder
    encoder_dim: ``int``
        The size of the encoder's hidden state.
    decoder_dim: ``int``
        The size of the decoder's hidden state.
    """
    def __init__(self, embedding_dim: int, encoder_dim: int, decoder_dim: int) -> None:
        super().__init__()
        # One scalar-valued linear scorer per input source
        self.input_layer = torch.nn.Linear(embedding_dim, 1)
        self.hidden_layer = torch.nn.Linear(decoder_dim, 1)
        self.context_layer = torch.nn.Linear(encoder_dim, 1)

    @overrides
    def forward(self,
                input_embeddings: torch.Tensor,
                pre_attention_decoder_outputs: torch.Tensor,
                post_attention_decoder_outputs: torch.Tensor,
                attention_context: torch.Tensor) -> torch.Tensor:
        # Each score has shape: (batch_size, num_summary_tokens)
        input_score = self.input_layer(input_embeddings).squeeze(2)
        hidden_score = self.hidden_layer(pre_attention_decoder_outputs).squeeze(2)
        context_score = self.context_layer(attention_context).squeeze(2)

        # shape: (batch_size, num_summary_tokens)
        total_score = context_score + hidden_score + input_score
        probability = torch.sigmoid(total_score)

        # With very large or very small scores the sigmoid can saturate to
        # exactly 1.0 or 0.0, which breaks the log taken later on. Nudge every
        # probability toward 0.5 by a small constant: values >= 0.5 are
        # decreased and values < 0.5 are increased.
        # shape: (batch_size, num_summary_tokens)
        return torch.where(probability >= 0.5,
                           probability - 1e-3,
                           probability + 1e-3)
54 |
--------------------------------------------------------------------------------
/summarize/modules/matrix_attention/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.modules.matrix_attention.mlp import MLPAttention
2 |
--------------------------------------------------------------------------------
/summarize/modules/matrix_attention/mlp.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from allennlp.modules.matrix_attention import MatrixAttention
3 | from overrides import overrides
4 |
5 |
@MatrixAttention.register('mlp')
class MLPAttention(MatrixAttention):
    """
    The "concat" attention as described in the arXiv version of
    Luong et al. (2015) (https://arxiv.org/pdf/1508.04025.pdf). Note that the
    "concat" attention in the ACL Anthology version of the paper is different.

    Parameters
    ----------
    encoder_size: ``int``
        The size of the encoder hidden states.
    decoder_size: ``int``
        The size of the decoder hidden states.
    attention_size: ``int``
        The size of the intermediate attention hidden size.
    """
    def __init__(self,
                 encoder_size: int,
                 decoder_size: int,
                 attention_size: int) -> None:
        super().__init__()
        self.linear_context = torch.nn.Linear(encoder_size, attention_size, bias=False)
        self.linear_query = torch.nn.Linear(decoder_size, attention_size, bias=True)
        self.v = torch.nn.Linear(attention_size, 1, bias=False)

    @overrides
    def forward(self,
                decoder_outputs: torch.Tensor,
                encoder_outputs: torch.Tensor) -> torch.Tensor:
        """
        Scores every (summary token, document token) pair.

        Parameters
        ----------
        decoder_outputs: ``torch.Tensor``, ``(batch_size, num_summary_tokens, decoder_size)``
            The decoder outputs
        encoder_outputs: ``torch.Tensor``, ``(batch_size, num_document_tokens, encoder_size)``
            The encoder outputs

        Returns
        -------
        A ``(batch_size, num_summary_tokens, num_document_tokens)``-sized tensor with the
        unnormalized attention scores.
        """
        # Insert singleton axes so the two projections line up for broadcasting
        # shape: (batch_size, num_summary_tokens, 1, attention_size)
        query = self.linear_query(decoder_outputs.unsqueeze(2))
        # shape: (batch_size, 1, num_document_tokens, attention_size)
        context = self.linear_context(encoder_outputs.unsqueeze(1))

        # Broadcasting the addition pairs every summary token with every
        # document token, exactly as an explicit ``expand`` of both would.
        # shape: (batch_size, num_summary_tokens, num_document_tokens, attention_size)
        hidden = torch.tanh(query + context)

        # shape: (batch_size, num_summary_tokens, num_document_tokens)
        return self.v(hidden).squeeze(-1)
68 |
--------------------------------------------------------------------------------
/summarize/modules/rnns/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.modules.rnns.rnn import RNN
2 | from summarize.modules.rnns.lstm import LSTM
3 | from summarize.modules.rnns.gru import GRU
4 |
--------------------------------------------------------------------------------
/summarize/modules/rnns/gru.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from overrides import overrides
3 |
4 | from summarize.modules.rnns import RNN
5 |
6 |
@RNN.register('gru')
class GRU(RNN):
    """
    Wraps a batch-first ``torch.nn.GRU`` module.

    Parameters
    ----------
    input_size: ``int``, required
        The size of the input dimension.
    hidden_size: ``int``, required
        The size of the hidden dimension. If bidirectional, each direction will
        be this hidden size.
    num_layers: ``int``, optional (default = ``1``)
        The number of layers.
    bidirectional: ``bool``, optional (default = ``False``)
        Indicates if the RNN is bidirectional or not.
    dropout: ``float``, optional (default = ``0.0``)
        The dropout parameter in between RNN layers.
    """
    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 num_layers: int = 1,
                 bidirectional: bool = False,
                 dropout: float = 0.0) -> None:
        gru = torch.nn.GRU(input_size, hidden_size,
                           num_layers=num_layers,
                           dropout=dropout,
                           bidirectional=bidirectional,
                           batch_first=True)
        super().__init__(input_size, hidden_size, num_layers, bidirectional, gru)

    @overrides
    def has_memory(self) -> bool:
        return False

    @overrides
    def reshape_hidden_for_decoder(self, hidden: torch.Tensor) -> torch.Tensor:
        """
        Reshapes the final hidden state so that the two directions of a
        bidirectional GRU are concatenated into a single vector per layer.
        Only single-layer GRUs are supported.
        """
        # It is unclear how multiple layers should be handled. AllenNLP just
        # returns the last layer, but that may not be correct.
        if self.num_layers != 1:
            raise NotImplementedError

        batch_size = hidden.size(1)
        num_directions = 1
        if self.bidirectional:
            num_directions = 2

        # Split the leading dimension into separate layer and direction axes
        # shape: (num_layers, num_directions, batch_size, hidden_size)
        hidden = hidden.view(self.num_layers, num_directions, batch_size, self.hidden_size)

        if num_directions == 1:
            # Uni-directional: drop the extra singleton dimension
            # shape: (1, batch_size, hidden_size)
            return hidden.squeeze(0)

        # Bi-directional: join the forward and backward states feature-wise
        forward_hidden = hidden[:, 0, :, :]
        backward_hidden = hidden[:, 1, :, :]
        # shape: (num_layers, batch_size, hidden_size * 2)
        return torch.cat([forward_hidden, backward_hidden], dim=2)
68 |
--------------------------------------------------------------------------------
/summarize/modules/sentence_extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.modules.sentence_extractors.sentence_extractor import SentenceExtractor
2 | from summarize.modules.sentence_extractors.rnn import RNNSentenceExtractor
3 |
--------------------------------------------------------------------------------
/summarize/modules/sentence_extractors/rnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from allennlp.common.checks import ConfigurationError
3 | from allennlp.modules import FeedForward, Seq2SeqEncoder
4 | from overrides import overrides
5 |
6 | from summarize.modules.sentence_extractors import SentenceExtractor
7 |
8 |
@SentenceExtractor.register('rnn')
class RNNSentenceExtractor(SentenceExtractor):
    """
    Scores sentences for extraction by contextualizing the sentence
    representations with an RNN, applying dropout, and passing each resulting
    hidden state through a feed-forward layer.

    Parameters
    ----------
    rnn:
        The RNN to use (or any Seq2SeqEncoder)
    feed_forward:
        The feed-forward layer, which must have output dimension 1.
    dropout:
        The dropout to apply on the RNN hidden states.
    """
    def __init__(self,
                 rnn: Seq2SeqEncoder,
                 feed_forward: FeedForward,
                 dropout: float = 0.0) -> None:
        super().__init__()
        self.rnn = rnn
        self.feed_forward = feed_forward
        self.dropout = torch.nn.Dropout(dropout)

        # The RNN output feeds the feed-forward layer, which must emit one score
        rnn_output_dim = rnn.get_output_dim()
        if rnn_output_dim != feed_forward.get_input_dim():
            raise ConfigurationError('The RNN and feed-forward layers have incompatible dimensions')
        if feed_forward.get_output_dim() != 1:
            raise ConfigurationError('The feed-foward network must have output size 1')

    @overrides
    def forward(self,
                sentence_encodings: torch.Tensor,
                mask: torch.Tensor) -> torch.Tensor:
        # shape: (batch_size, num_sents, hidden_size)
        contextualized = self.dropout(self.rnn(sentence_encodings, mask))
        # shape: (batch_size, num_sents, 1)
        scores = self.feed_forward(contextualized)
        # shape: (batch_size, num_sents)
        return scores.squeeze(-1)
49 |
--------------------------------------------------------------------------------
/summarize/modules/sentence_extractors/sentence_extractor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from allennlp.common import Registrable
3 |
4 |
class SentenceExtractor(torch.nn.Module, Registrable):
    """
    Base class for modules which score the sentences of a document for
    extraction. This class only defines the interface; ``forward`` raises
    ``NotImplementedError`` and must be overridden by subclasses.
    """
    def forward(self,
                sentence_encodings: torch.Tensor,
                mask: torch.Tensor) -> torch.Tensor:
        """
        Calculates an extraction score for each sentence from the
        sentence encodings.

        Parameters
        ----------
        sentence_encodings: (batch_size, num_sents, hidden_dim)
            The encoding of each sentence
        mask: (batch_size, num_sents)
            The sentence mask

        Returns
        -------
        A (batch_size, num_sents) tensor with the raw extraction scores for each
        input sentence.

        Raises
        ------
        NotImplementedError
            Always; subclasses provide the actual computation.
        """
        raise NotImplementedError
26 |
--------------------------------------------------------------------------------
/summarize/nn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/nn/__init__.py
--------------------------------------------------------------------------------
/summarize/nn/beam_search/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.nn.beam_search.beam_search import BeamSearch
2 | from summarize.nn.beam_search.relaxed import RelaxedBeamSearch
3 |
--------------------------------------------------------------------------------
/summarize/nn/beam_search/coverage_penalizers/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.nn.beam_search.coverage_penalizers.coverage_penalizer import CoveragePenalizer
2 | from summarize.nn.beam_search.coverage_penalizers.onmt import ONMTCoveragePenalizer
3 |
--------------------------------------------------------------------------------
/summarize/nn/beam_search/coverage_penalizers/coverage_penalizer.py:
--------------------------------------------------------------------------------
1 | """
2 | ``CoveragePenalizer``s are used to rerank the output of beam search by adding
3 | a penalty to the score of each prediction at each step of decoding.
4 | """
5 | import torch
6 |
7 | from allennlp.common import Registrable
8 |
9 |
class CoveragePenalizer(Registrable):
    """
    Interface for coverage penalties. ``__call__`` raises
    ``NotImplementedError`` and must be overridden by subclasses.
    """
    def __call__(self, coverage: torch.Tensor) -> torch.Tensor:
        """
        Computes the factor that should be added to the log-probability of
        each output step.

        Parameters
        ----------
        coverage: ``torch.Tensor``, (..., num_document_tokens)
            A tensor that represents the accumulated attention probabilities
            assigned to each document token thus far in decoding. The tensor
            may have any number of leading dimensions.

        Returns
        -------
        ``torch.Tensor``:
            A tensor with the coverage penalties, with the same size as the
            leading dimensions of the coverage tensor.

        Raises
        ------
        NotImplementedError
            Always; subclasses provide the actual computation.
        """
        raise NotImplementedError
30 |
--------------------------------------------------------------------------------
/summarize/nn/beam_search/coverage_penalizers/onmt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from overrides import overrides
3 |
4 | from summarize.nn.beam_search.coverage_penalizers import CoveragePenalizer
5 |
6 |
7 | @CoveragePenalizer.register('onmt')
class ONMTCoveragePenalizer(CoveragePenalizer):
    """
    The "summary" coverage penalty from the OpenNMT machine translation
    library (https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/translate/penalties.py).
    The penalty here is added to the log-probabilities (as in Wu et al.)
    rather than subtracted (as in ONMT), so its sign is the opposite of the
    ONMT implementation.

    The penalty discourages the coverage from attending to any one token too often.

    Parameters
    ----------
    beta: ``float``
        The scaling factor.
    """
    def __init__(self, beta: float) -> None:
        self.beta = beta

    @overrides
    def __call__(self, coverage: torch.Tensor) -> torch.Tensor:
        # Raise every coverage value to at least 1.0, so only the attention
        # mass above 1.0 on a token makes the sum exceed the token count
        floored_total = coverage.clamp(min=1.0).sum(dim=-1)
        # Zero when no token exceeds 1.0 coverage, negative otherwise
        penalty = coverage.size(-1) - floored_total
        return self.beta * penalty
31 |
--------------------------------------------------------------------------------
/summarize/nn/beam_search/length_penalizers/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.nn.beam_search.length_penalizers.length_penalizer import LengthPenalizer
2 | from summarize.nn.beam_search.length_penalizers.average import AverageLengthPenalizer
3 | from summarize.nn.beam_search.length_penalizers.wu import WuLengthPenalizer
4 |
--------------------------------------------------------------------------------
/summarize/nn/beam_search/length_penalizers/average.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from overrides import overrides
3 |
4 | from summarize.nn.beam_search.length_penalizers import LengthPenalizer
5 |
6 |
7 | @LengthPenalizer.register('average')
class AverageLengthPenalizer(LengthPenalizer):
    """
    Uses the length of the sequence itself as the penalty, thus causing the
    score to be the average log-probability per token once the
    log-probability is divided by the penalty.
    """
    @overrides
    def __call__(self, length: torch.Tensor) -> torch.Tensor:
        # The penalty is simply the length, converted to floating point
        return length.float()
16 |
--------------------------------------------------------------------------------
/summarize/nn/beam_search/length_penalizers/length_penalizer.py:
--------------------------------------------------------------------------------
1 | """
2 | ``LengthPenalizer``s are used to rerank the output of beam search. After all
3 | the top-k hypotheses have been found, their log-probability scores are divided
4 | by a length penalty to adjust for different lengths.
5 | """
6 | import torch
7 |
8 | from allennlp.common import Registrable
9 |
10 |
class LengthPenalizer(Registrable):
    """
    Interface for length penalties. ``__call__`` raises
    ``NotImplementedError`` and must be overridden by subclasses.
    """
    def __call__(self, lengths: torch.Tensor) -> torch.Tensor:
        """
        Computes the factor that the log-probability of the output sequence
        should be divided by based on its length.

        Parameters
        ----------
        lengths: ``torch.Tensor``
            A tensor of the lengths, which can be any size.

        Returns
        -------
        ``torch.Tensor``:
            A tensor with the length penalties, the same size as the input tensor.

        Raises
        ------
        NotImplementedError
            Always; subclasses provide the actual computation.
        """
        raise NotImplementedError
28 |
--------------------------------------------------------------------------------
/summarize/nn/beam_search/length_penalizers/wu.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from overrides import overrides
4 |
5 | from summarize.nn.beam_search.length_penalizers import LengthPenalizer
6 |
7 |
8 | @LengthPenalizer.register('wu')
class WuLengthPenalizer(LengthPenalizer):
    """
    The length penalty from section 7 of Wu et al. (2016)
    (https://arxiv.org/pdf/1609.08144.pdf): ``(5 + length)^alpha / 6^alpha``.

    Parameters
    ----------
    alpha: ``float``
        The value of alpha in the length penalty.
    """
    def __init__(self, alpha: float) -> None:
        self.alpha = alpha

    @overrides
    def __call__(self, length: torch.Tensor) -> torch.Tensor:
        # (5 + |Y|)^alpha, computed element-wise on the lengths
        numerator = torch.pow(5.0 + length.float(), self.alpha)
        # (5 + 1)^alpha, a scalar normalizer
        denominator = np.power(6.0, self.alpha)
        return numerator / denominator
25 |
--------------------------------------------------------------------------------
/summarize/nn/util.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
def normalize_losses(losses: torch.Tensor,
                     losses_mask: torch.Tensor,
                     instance_normalization: str,
                     batch_normalization: str) -> torch.Tensor:
    """
    Normalizes the input losses based on the type of normalization specified
    by `instance_normalization` and `batch_normalization`.

    Parameters
    ----------
    losses: (batch_size, num_tokens)
        The loss per summary token.
    losses_mask: (batch_size, num_tokens)
        The mask which indicates which losses are valid.
    instance_normalization:
        The method of normalizing each item in the batch, either "sum" or "average",
        which will sum or average the losses per summary.
    batch_normalization:
        The method of normalizing the losses per summary, either "sum" or "average".
        After the loss for each instance is computed via the method specified
        by `instance_normalization`, the subsequent losses are either summed
        or averaged.

    Returns
    -------
    The normalized loss.

    Raises
    ------
    ValueError
        If `instance_normalization` or `batch_normalization` is not one of
        "sum" or "average".
    """
    # First, apply the loss mask to 0-out any invalid losses
    float_mask = losses_mask.float()
    losses = losses * float_mask

    if instance_normalization == 'sum':
        # shape: (batch_size,)
        loss_per_summary = losses.sum(dim=1)
    elif instance_normalization == 'average':
        # shape: (batch_size,)
        lengths = float_mask.sum(dim=1)
        # A fully-masked instance has length 0 and a summed loss of 0, which
        # would give 0 / 0 = NaN and poison the loss of the whole batch.
        # Dividing by 1 instead makes such an instance contribute a loss of 0.
        lengths = lengths.masked_fill(lengths == 0, 1.0)
        # shape: (batch_size,)
        loss_per_summary = losses.sum(dim=1) / lengths
    else:
        raise ValueError(f'Unknown type of instance normalization: {instance_normalization}')

    if batch_normalization == 'sum':
        loss = loss_per_summary.sum()
    elif batch_normalization == 'average':
        loss = loss_per_summary.mean()
    else:
        raise ValueError(f'Unknown type of batch normalization: {batch_normalization}')

    return loss
53 |
--------------------------------------------------------------------------------
/summarize/predictors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/predictors/__init__.py
--------------------------------------------------------------------------------
/summarize/predictors/cloze/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.predictors.cloze.abstractive import ClozeAbstractivePredictor
2 | from summarize.predictors.cloze.extractive import ClozeExtractivePredictor
3 |
--------------------------------------------------------------------------------
/summarize/predictors/cloze/abstractive.py:
--------------------------------------------------------------------------------
1 | import json
2 | from allennlp.common.util import JsonDict
3 | from allennlp.data import Instance
4 | from allennlp.service.predictors.predictor import Predictor
5 | from overrides import overrides
6 |
7 |
8 | @Predictor.register('cloze-abstractive-predictor')
class ClozeAbstractivePredictor(Predictor):
    """
    Predictor for abstractive cloze models. Instances are built from the
    ``document``, ``topics`` and ``context`` entries of the input JSON, and
    each output line contains only the predicted ``cloze``.
    """
    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        # Forward the three required entries straight to the dataset reader
        return self._dataset_reader.text_to_instance(document=json_dict['document'],
                                                     topics=json_dict['topics'],
                                                     context=json_dict['context'])

    @overrides
    def dump_line(self, outputs: JsonDict) -> str:
        # One JSON object per line, keeping only the generated cloze
        serialized = json.dumps({'cloze': outputs['cloze']})
        return f'{serialized}\n'
24 |
--------------------------------------------------------------------------------
/summarize/predictors/cloze/extractive.py:
--------------------------------------------------------------------------------
1 | import json
2 | from allennlp.common.util import JsonDict
3 | from allennlp.data import Instance
4 | from allennlp.service.predictors.predictor import Predictor
5 | from overrides import overrides
6 |
7 |
8 | @Predictor.register('cloze-extractive-predictor')
class ClozeExtractivePredictor(Predictor):
    """
    Predictor for extractive cloze models. Instances are built from the
    ``document``, ``topics`` and ``context`` entries of the input JSON, and
    each output line contains the document entries selected by the predicted
    indices.
    """
    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        # Forward the three required entries straight to the dataset reader
        return self._dataset_reader.text_to_instance(document=json_dict['document'],
                                                     topics=json_dict['topics'],
                                                     context=json_dict['context'])

    @overrides
    def dump_line(self, outputs: JsonDict) -> str:
        selected = outputs['predicted_indices']
        sentences = outputs['metadata']['document']
        # Map each predicted index back to the corresponding document entry
        serialized = json.dumps({'cloze': [sentences[i] for i in selected]})
        return f'{serialized}\n'
26 |
--------------------------------------------------------------------------------
/summarize/predictors/sds/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.predictors.sds.abstractive import AbstractivePredictor
2 | from summarize.predictors.sds.extractive import ExtractivePredictor
3 |
--------------------------------------------------------------------------------
/summarize/predictors/sds/abstractive.py:
--------------------------------------------------------------------------------
1 | import json
2 | from allennlp.common.util import JsonDict
3 | from allennlp.data import Instance
4 | from allennlp.service.predictors.predictor import Predictor
5 | from overrides import overrides
6 |
7 |
8 | @Predictor.register('sds-abstractive-predictor')
class AbstractivePredictor(Predictor):
    """
    Predictor for abstractive single-document summarization models. Instances
    are built from the ``document`` entry of the input JSON, and each output
    line contains the predicted summary wrapped in a one-element list.
    """
    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        return self._dataset_reader.text_to_instance(document=json_dict['document'])

    @overrides
    def dump_line(self, outputs: JsonDict) -> str:
        # The summary is emitted as a list with a single element
        serialized = json.dumps({'summary': [outputs['summary']]})
        return f'{serialized}\n'
20 |
--------------------------------------------------------------------------------
/summarize/predictors/sds/extractive.py:
--------------------------------------------------------------------------------
1 | import json
2 | from allennlp.common.util import JsonDict
3 | from allennlp.service.predictors.predictor import Predictor
4 | from overrides import overrides
5 |
6 |
7 | @Predictor.register('sds-extractive-predictor')
class ExtractivePredictor(Predictor):
    """
    Predictor for extractive single-document summarization models. Each output
    line contains the document entries selected by the predicted indices.
    """
    @overrides
    def dump_line(self, outputs: JsonDict) -> str:
        selected = outputs['predicted_indices']
        sentences = outputs['metadata']['document']
        # Map each predicted index back to the corresponding document entry
        serialized = json.dumps({'summary': [sentences[i] for i in selected]})
        return f'{serialized}\n'
16 |
--------------------------------------------------------------------------------
/summarize/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/common/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/common/tempdir_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import unittest
4 |
5 | from summarize.common import TemporaryDirectory
6 |
7 |
class TestTemporaryDirectory(unittest.TestCase):
    """Tests for the ``TemporaryDirectory`` context manager."""

    def test_temporary_directory(self):
        """The directory exists inside the context and is gone after it exits."""
        with TemporaryDirectory() as temp_dir:
            assert os.path.exists(temp_dir)
            assert os.path.isdir(temp_dir)
        assert not os.path.exists(temp_dir)

    def test_temporary_directory_root(self):
        """A directory created with ``root`` has a path that starts with that root."""
        # Create two temporary directories with one inside the other
        # to make sure it was created in the correct location
        with TemporaryDirectory() as root_temp_dir:
            with TemporaryDirectory(root=root_temp_dir) as temp_dir:
                assert os.path.exists(temp_dir)
                assert os.path.isdir(temp_dir)
                assert temp_dir.startswith(root_temp_dir)

    def test_temporary_directory_persist(self):
        """With ``persist=True`` the directory still exists after the context exits."""
        with TemporaryDirectory(persist=True) as temp_dir:
            assert os.path.exists(temp_dir)
            assert os.path.isdir(temp_dir)
        assert os.path.exists(temp_dir)
        # The persisted directory has to be cleaned up manually
        shutil.rmtree(temp_dir)
        assert not os.path.exists(temp_dir)
31 |
--------------------------------------------------------------------------------
/summarize/tests/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/data/dataset_readers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/dataset_readers/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/data/dataset_readers/cloze/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/dataset_readers/cloze/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/data/dataset_readers/cloze/abstractive_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from allennlp.data.tokenizers import WordTokenizer
3 | from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
4 |
5 | from summarize.common.testing import FIXTURES_ROOT
6 | from summarize.data.dataset_readers.cloze import AbstractiveClozeDatasetReader
7 | from summarize.data.paragraph_tokenizers import ParagraphWordTokenizer
8 |
9 |
class TestAbstractiveClozeDatasetReader(unittest.TestCase):
    """Tests for the ``AbstractiveClozeDatasetReader``."""

    def test_read_from_file(self):
        """
        Reads the cloze fixture file and checks the number of instances and
        the fields of the first instance against the expected tokens.
        """
        word_tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
        paragraph_tokenizer = ParagraphWordTokenizer(word_splitter=JustSpacesWordSplitter())
        reader = AbstractiveClozeDatasetReader(document_tokenizer=paragraph_tokenizer,
                                               topic_tokenizer=word_tokenizer,
                                               max_document_length=10,
                                               max_context_length=7,
                                               max_cloze_length=5)
        instances = list(reader.read(f'{FIXTURES_ROOT}/data/cloze.jsonl'))

        # Expected tokens of the first instance; the list lengths match the
        # maximum lengths given to the reader above
        instance0 = {
            'document': ['NEW', 'YORK', ',', 'Jan.', '8', ',', '2016', '/PRNewswire/', '--', 'Businessman'],
            'topics': [['Ken', 'Fields'], ['Politics']],
            'context': ['%', 'Renewable', 'Energy', 'in', '20', 'Years', '.'],
            'cloze': ['Picking', 'as', 'his', 'campaign', 'slogan']
        }

        assert len(instances) == 25
        fields = instances[0].fields
        assert [t.text for t in fields['document'].tokens] == instance0['document']
        assert len(fields['topics'].field_list) == len(instance0['topics'])
        for topic_field, topic in zip(fields['topics'].field_list, instance0['topics']):
            assert [t.text for t in topic_field.tokens] == topic
        assert [t.text for t in fields['context'].tokens] == instance0['context']
        assert [t.text for t in fields['cloze'].tokens] == instance0['cloze']
        # The metadata field must carry an entry for each of these keys
        metadata = fields['metadata']
        assert 'document' in metadata
        assert 'topics' in metadata
        assert 'context' in metadata
        assert 'cloze' in metadata
41 |
--------------------------------------------------------------------------------
/summarize/tests/data/dataset_readers/cloze/extractive_test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import unittest
3 | from allennlp.data.tokenizers import WordTokenizer
4 | from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
5 |
6 | from summarize.common.testing import FIXTURES_ROOT
7 | from summarize.data.dataset_readers.cloze import ExtractiveClozeDatasetReader
8 |
9 |
class TestExtractiveClozeDatasetReader(unittest.TestCase):
    """Tests for ``ExtractiveClozeDatasetReader`` against the cloze fixture file."""

    def test_read_from_file(self):
        """Read the fixture and compare the second instance with hand-written expectations."""
        tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
        reader = ExtractiveClozeDatasetReader(tokenizer=tokenizer, max_num_sentences=5,
                                              max_sentence_length=6, max_context_length=4)
        instances = list(reader.read(f'{FIXTURES_ROOT}/data/cloze.jsonl'))

        # Expected contents of ``instances[1]``: 5 sentences of at most 6 tokens
        # each and a 4-token context, matching the limits passed to the reader.
        instance1 = {
            'document': [
                ['Drew', 'Sheneman', 'has', 'been', 'the', 'editorial'],
                ['J.', ')'],
                ['since', '1998', '.'],
                ['With', 'exceptional', 'artistry', ',', 'his', 'cartoons'],
                ['Sheneman', 'began', 'cartooning', 'in', 'college', 'and']
            ],
            'topics': [['Drew', 'Sheneman']],
            'context': ['American', 'editorial', 'cartoonist', '.'],
            'labels': [1, 0, 1, 0, 1]
        }

        assert len(instances) == 25
        fields = instances[1].fields
        assert len(fields['document'].field_list) == 5
        for sentence, sentence_field in zip(instance1['document'], fields['document'].field_list):
            assert [t.text for t in sentence_field.tokens] == sentence
        assert len(fields['topics'].field_list) == 1
        for topic, topic_field in zip(instance1['topics'], fields['topics'].field_list):
            assert [t.text for t in topic_field.tokens] == topic
        # Read the tokens through ``.tokens`` like every other text-field
        # assertion in this test instead of iterating the field object itself.
        assert [t.text for t in fields['context'].tokens] == instance1['context']
        assert np.array_equal(fields['labels'].array, instance1['labels'])
        metadata = fields['metadata']
        assert 'document' in metadata
        assert 'topics' in metadata
        assert 'context' in metadata
        assert 'cloze' in metadata
45 |
--------------------------------------------------------------------------------
/summarize/tests/data/dataset_readers/sds/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/dataset_readers/sds/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/data/dataset_readers/sds/abstractive_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
3 |
4 | from summarize.common.testing import FIXTURES_ROOT
5 | from summarize.data.dataset_readers.sds import AbstractiveDatasetReader
6 | from summarize.data.paragraph_tokenizers import ParagraphWordTokenizer
7 |
8 |
class TestAbstractiveDatasetReader(unittest.TestCase):
    """Tests for ``AbstractiveDatasetReader`` against the SDS fixture file."""

    def test_read_from_file(self):
        """Read the fixture and compare the first instance with hand-written expectations."""
        reader = AbstractiveDatasetReader(
            document_tokenizer=ParagraphWordTokenizer(word_splitter=JustSpacesWordSplitter()),
            max_document_length=10,
            max_summary_length=5,
        )
        instances = list(reader.read(f'{FIXTURES_ROOT}/data/sds.jsonl'))

        # The expected token lists have exactly the configured maximum
        # lengths: 10 document tokens and 5 summary tokens.
        expected_document = ['Editor', "'s", 'note', ':', 'In', 'our', 'Behind', 'the', 'Scenes', 'series']
        expected_summary = ['Mentally', 'ill', 'inmates', 'in', 'Miami']

        assert len(instances) == 25
        fields = instances[0].fields
        document_tokens = [token.text for token in fields['document'].tokens]
        summary_tokens = [token.text for token in fields['summary'].tokens]
        assert document_tokens == expected_document
        assert summary_tokens == expected_summary

        # Check the lengths of the entries stored in the metadata.
        metadata = fields['metadata']
        for key, expected_length in (('document', 20), ('summary', 4)):
            assert key in metadata
            assert len(metadata[key]) == expected_length
29 |
--------------------------------------------------------------------------------
/summarize/tests/data/dataset_readers/sds/extractive_test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import unittest
3 | from allennlp.data.tokenizers import WordTokenizer
4 | from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
5 |
6 | from summarize.common.testing import FIXTURES_ROOT
7 | from summarize.data.dataset_readers.sds import ExtractiveDatasetReader
8 |
9 |
class TestExtractiveDatasetReader(unittest.TestCase):
    """Tests for ``ExtractiveDatasetReader`` against the SDS fixture file."""

    def test_read_from_file(self):
        """Read the fixture and compare the first instance with hand-written expectations."""
        reader = ExtractiveDatasetReader(
            tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
            max_num_sentences=5,
            max_sentence_length=6,
        )
        instances = list(reader.read(f'{FIXTURES_ROOT}/data/sds.jsonl'))

        # Five sentences of six tokens each, matching the limits given above.
        expected_sentences = [
            ['Editor', "'s", 'note', ':', 'In', 'our'],
            ['An', 'inmate', 'housed', 'on', 'the', '``'],
            ['MIAMI', ',', 'Florida', '(', 'CNN', ')'],
            ['Most', 'often', ',', 'they', 'face', 'drug'],
            ['So', ',', 'they', 'end', 'up', 'on'],
        ]

        assert len(instances) == 25
        fields = instances[0].fields

        sentence_fields = fields['document'].field_list
        assert len(sentence_fields) == 5
        for expected, sentence_field in zip(expected_sentences, sentence_fields):
            assert [token.text for token in sentence_field.tokens] == expected

        assert np.array_equal(fields['labels'].array, [0, 0, 1, 1, 0])

        # Check the lengths of the entries stored in the metadata.
        metadata = fields['metadata']
        for key, expected_length in (('document', 5), ('summary', 4)):
            assert key in metadata
            assert len(metadata[key]) == expected_length
37 |
--------------------------------------------------------------------------------
/summarize/tests/data/dataset_setup/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/dataset_setup/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/data/dataset_setup/tokenize_test.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | import unittest
3 | from nltk import word_tokenize
4 |
5 | from summarize.data.dataset_setup.tokenize import tokenize
6 |
7 |
class TestTokenize(unittest.TestCase):
    """Tests for ``tokenize`` with a spaCy pipeline and with NLTK's ``word_tokenize``."""

    def test_spacy_tokenize(self):
        """A plain string and a nested list of strings are both tokenized with spaCy."""
        nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
        cases = [
            ("Hi, I'm Dan.", "Hi , I 'm Dan ."),
            # Nested input: the expected output keeps the same nesting.
            ([['The first.', 'The second.'], 'The third.'],
             [['The first .', 'The second .'], 'The third .']),
        ]
        for field, expected in cases:
            assert expected == tokenize(nlp, field)

    def test_nltk_tokenize(self):
        """A plain string is tokenized with NLTK's ``word_tokenize``."""
        result = tokenize(word_tokenize, "Hi, I'm Dan.")
        assert "Hi , I 'm Dan ." == result
26 |
--------------------------------------------------------------------------------
/summarize/tests/data/io/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/io/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/data/io/jsonl_writer_test.py:
--------------------------------------------------------------------------------
1 | import bz2
2 | import gzip
3 | import json
4 | import tempfile
5 | import unittest
6 |
7 | from summarize.data.io import JsonlWriter
8 |
9 |
class TestJsonlWriter(unittest.TestCase):
    """Round-trip tests for ``JsonlWriter`` with plain, gzip and bz2 output files.

    The file type under test is selected through the temporary file's suffix.
    """

    def setUp(self):
        self.data = [
            {'a': 4, 'b': 'testing'},
            {'c': [1, 2, 3]}
        ]

    def _write_data(self, file_path):
        """Write every item of ``self.data`` to ``file_path`` using ``JsonlWriter``."""
        with JsonlWriter(file_path) as out:
            for item in self.data:
                out.write(item)

    def test_plain_file(self):
        # The ``with`` block closes (and thereby deletes) the temporary file
        # when the test ends instead of leaking the handle.
        with tempfile.NamedTemporaryFile(suffix='.jsonl') as temp_file:
            self._write_data(temp_file.name)

            # Load from file, ensure it is correct
            with open(temp_file.name, 'r') as f:
                actual_data = [json.loads(line) for line in f]
        self.assertEqual(self.data, actual_data)

    def test_gzip_file(self):
        with tempfile.NamedTemporaryFile(suffix='.jsonl.gz') as temp_file:
            self._write_data(temp_file.name)

            # Load from file, ensure it is correct
            with gzip.open(temp_file.name, 'rb') as f:
                actual_data = [json.loads(line.decode()) for line in f]
        self.assertEqual(self.data, actual_data)

    def test_bz2_file(self):
        with tempfile.NamedTemporaryFile(suffix='.jsonl.bz2') as temp_file:
            self._write_data(temp_file.name)

            # Load from file, ensure it is correct
            with bz2.open(temp_file.name, 'rb') as f:
                actual_data = [json.loads(line.decode()) for line in f]
        self.assertEqual(self.data, actual_data)
58 |
--------------------------------------------------------------------------------
/summarize/tests/data/io/util_test.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import tempfile
3 | import unittest
4 |
5 | from summarize.data.io.util import is_gz_file
6 |
7 |
class TestUtil(unittest.TestCase):
    """Tests for the helpers in ``summarize.data.io.util``."""

    def test_is_gz_file(self):
        """``is_gz_file`` is False for plain text and True once the file is gzipped."""
        with tempfile.NamedTemporaryFile() as temp:
            path = temp.name

            # First the path holds plain text.
            with open(path, 'w') as plain_out:
                plain_out.write('plain text')
            assert is_gz_file(path) is False

            # The same path is then overwritten with gzip-compressed content.
            with gzip.open(path, 'wb') as gz_out:
                gz_out.write(b'gzipped')
            assert is_gz_file(path) is True
20 |
--------------------------------------------------------------------------------
/summarize/tests/data/paragraph_tokenizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/data/paragraph_tokenizers/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/data/paragraph_tokenizers/paragraph_word_tokenizer_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from summarize.data.paragraph_tokenizers import ParagraphWordTokenizer
4 |
5 |
class TestParagraphWordTokenizer(unittest.TestCase):
    """Tests for ``ParagraphWordTokenizer``."""

    def test_in_between_tokens(self):
        """Tokenize three sentences with default settings and with start/end/in-between tokens."""
        texts = [
            'This is the first sentence.',
            'Followed by the second.',
            'And the third!'
        ]

        # Default construction: the sentences' tokens are simply concatenated.
        plain_tokens = ParagraphWordTokenizer().tokenize(texts)
        assert [
            'This', 'is', 'the', 'first', 'sentence', '.',
            'Followed', 'by', 'the', 'second', '.',
            'And', 'the', 'third', '!'
        ] == [str(token) for token in plain_tokens]

        # With special tokens: start/end wrap the whole output and the
        # in-between tokens appear between consecutive sentences.
        marked_tokenizer = ParagraphWordTokenizer(start_tokens=['@start@'],
                                                  end_tokens=['@end@'],
                                                  in_between_tokens=['', ''])
        marked_tokens = marked_tokenizer.tokenize(texts)
        assert [
            '@start@', 'This', 'is', 'the', 'first', 'sentence', '.', '', '',
            'Followed', 'by', 'the', 'second', '.', '', '',
            'And', 'the', 'third', '!', '@end@'
        ] == [str(token) for token in marked_tokens]
35 |
--------------------------------------------------------------------------------
/summarize/tests/fixtures/configs/cloze/extractive-baseline.jsonnet:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "type": "cloze-extractive",
4 | "max_num_sentences": 50,
5 | "max_sentence_length": 15,
6 | "max_context_length": 20,
7 | "tokenizer": {
8 | "type": "word",
9 | "word_splitter": {
10 | "type": "just_spaces"
11 | }
12 | },
13 | "token_indexers": {
14 | "tokens": {
15 | "type": "single_id",
16 | "lowercase_tokens": true
17 | }
18 | }
19 | },
20 | "train_data_path": "summarize/tests/fixtures/data/cloze.jsonl",
21 | "validation_data_path": "summarize/tests/fixtures/data/cloze.jsonl",
22 | "model": {
23 | "type": "cloze-extractive-baseline",
24 | "token_embedder": {
25 | "tokens": {
26 | "type": "embedding",
27 | "embedding_dim": 20
28 | }
29 | },
30 | "sentence_encoder": {
31 | "type": "lstm",
32 | "input_size": 20,
33 | "hidden_size": 20,
34 | "bidirectional": true
35 | },
36 | "sentence_extractor": {
37 | "type": "rnn",
38 | "rnn": {
39 | "type": "lstm",
40 | "input_size": 40,
41 | "hidden_size": 20,
42 | "bidirectional": true
43 | },
44 | "feed_forward": {
45 | "input_dim": 40,
46 | "hidden_dims": 1,
47 | "num_layers": 1,
48 | "activations": "linear"
49 | }
50 | },
51 | "topic_encoder": {
52 | "type": "lstm",
53 | "input_size": 20,
54 | "hidden_size": 20,
55 | "bidirectional": true
56 | },
57 | "topic_layer": {
58 | "input_dim": 40,
59 | "hidden_dims": 40,
60 | "num_layers": 1,
61 | "activations": "linear"
62 | },
63 | "context_encoder": {
64 | "type": "lstm",
65 | "input_size": 20,
66 | "hidden_size": 20,
67 | "bidirectional": true
68 | },
69 | "attention": {
70 | "type": "mlp",
71 | "encoder_size": 40,
72 | "decoder_size": 40,
73 | "attention_size": 40
74 | },
75 | "attention_layer": {
76 | "input_dim": 40 + 40,
77 | "hidden_dims": 40,
78 | "num_layers": 1,
79 | "activations": "linear"
80 | },
81 | "use_topics": true,
82 | "use_context": true,
83 | "max_words": 20,
84 | "metrics": [
85 | {
86 | "type": "python-rouge",
87 | "ngram_orders": [2]
88 | }
89 | ]
90 | },
91 | "iterator": {
92 | "type": "basic",
93 | "batch_size": 4,
94 | "instances_per_epoch": 2
95 | },
96 | "trainer": {
97 | "optimizer": "adam",
98 | "num_epochs": 5,
99 | "cuda_device": -1
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/summarize/tests/fixtures/configs/sds/extractive-baseline.jsonnet:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "type": "sds-extractive",
4 | "max_num_sentences": 50,
5 | "max_sentence_length": 15,
6 | "tokenizer": {
7 | "type": "word",
8 | "word_splitter": {
9 | "type": "just_spaces"
10 | }
11 | },
12 | "token_indexers": {
13 | "tokens": {
14 | "type": "single_id",
15 | "lowercase_tokens": true
16 | }
17 | }
18 | },
19 | "train_data_path": "summarize/tests/fixtures/data/sds.jsonl",
20 | "validation_data_path": "summarize/tests/fixtures/data/sds.jsonl",
21 | "model": {
22 | "type": "sds-extractive-baseline",
23 | "token_embedder": {
24 | "tokens": {
25 | "type": "embedding",
26 | "embedding_dim": 20
27 | }
28 | },
29 | "sentence_encoder": {
30 | "type": "lstm",
31 | "input_size": 20,
32 | "hidden_size": 20,
33 | "bidirectional": true
34 | },
35 | "sentence_extractor": {
36 | "type": "rnn",
37 | "rnn": {
38 | "type": "lstm",
39 | "input_size": 40,
40 | "hidden_size": 20,
41 | "bidirectional": true
42 | },
43 | "feed_forward": {
44 | "input_dim": 40,
45 | "hidden_dims": 1,
46 | "num_layers": 1,
47 | "activations": "linear"
48 | }
49 | },
50 | "max_words": 20,
51 | "metrics": [
52 | {
53 | "type": "python-rouge",
54 | "ngram_orders": [2]
55 | }
56 | ]
57 | },
58 | "iterator": {
59 | "type": "basic",
60 | "batch_size": 4,
61 | "instances_per_epoch": 2
62 | },
63 | "trainer": {
64 | "optimizer": "adam",
65 | "num_epochs": 5,
66 | "cuda_device": -1
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/summarize/tests/fixtures/data/chen2018/Readme.md:
--------------------------------------------------------------------------------
1 | The `gold.jsonl` and `model.jsonl` files contain 10 reference summaries and 10 system summaries, respectively, from "Fast Abstractive Summarization with Reinforce-Selected Sentence Rewriting" by Chen and Bansal (2018).
2 |
--------------------------------------------------------------------------------
/summarize/tests/fixtures/data/chen2018/gold.jsonl:
--------------------------------------------------------------------------------
1 | {"summary": ["marseille prosecutor says `` so far no videos were used in the crash investigation '' despite media reports .", "journalists at bild and paris match are `` very confident '' the video clip is real , an editor says .", "andreas lubitz had informed his lufthansa training school of an episode of severe depression , airline says ."]}
2 | {"summary": ["membership gives the icc jurisdiction over alleged crimes committed in palestinian territories since last june .", "israel and the united states opposed the move , which could open the door to war crimes investigations against israelis ."]}
3 | {"summary": ["college-bound basketball star asks girl with down syndrome to high school prom .", "pictures of the two during the `` prom-posal '' have gone viral ."]}
4 | {"summary": ["don mclean 's `` american pie '' lyrics auctioned for $ 1.2 million .", "the song is dense with symbolism ; mclean says lyrics , notes will reveal meaning .", "`` pie '' is mclean 's biggest hit , was no. 1 in 1972 ."]}
5 | {"summary": ["gov. mike pence is making the right call to fix indiana 's religious freedom law , which can be used for discrimination .", "mark goldfeder : indiana should aim to be a shining beacon of cooperation : the real `` crossroads of america ''"]}
6 | {"summary": ["cameron hooker had kidnapped young hitchhiker colleen stan in 1977 .", "over the next seven years victim was tortured and raped as his captive .", "hooker , now 61 , was sentenced to a 104-year prison term jail in 1985 .", "he applied for early parole but was told he 'd spent at least 15 years in jail ."]}
7 | {"summary": ["figures show that while millions still tune in they listen for shorter bursts .", "average listener spent ten hours a week tuning in last three months of 2014 .", "this was 14 % down on decade earlier , when people tuned in for 11.6 hours .", "the bbc trust has cleared the way for firms to buy their way into lifestyle programmes on the world news channel in a product placement experiment . for example , publishers could pay to have their books reviewed on talking books . the bbc trust will review the scheme in a year ."]}
8 | {"summary": ["s300 barely takes off before plunging back to the ground .", "minute-long clip shows people dashing for cover as rocket hits ground .", "mishap comes shortly after footage of crash killing missile engineers ."]}
9 | {"summary": ["david letterman made the joke while warming up his late show audience .", "college staffer asked what advice the ` scandal-scarred ' comic could give .", "the host told them ` treat a lady like a wh -- e , and a wh -- e like a lady '", "joke was met with stunned silence with some branding it ` disrespectful '"]}
10 | {"summary": ["winds swept the ocean foam off lashing waves before mixing it with sand from the shore line .", "the result was a bizarre and grotesque yellow , thick , jelly-like foam substance which coated the entire beach .", "it stretched more than 15 metres up avoca beach in the central coast and onto the pathways and shrubbery .", "sylvia freedman , who was holidaying there when the storm hit , captured the strange phenomenon on her camera ."]}
11 |
--------------------------------------------------------------------------------
/summarize/tests/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/metrics/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/metrics/meteor_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import unittest
4 |
5 | from summarize.common.testing import FIXTURES_ROOT
6 | from summarize.data.io import JsonlReader
7 | from summarize.metrics.meteor import DEFAULT_METEOR_JAR_PATH, run_meteor
8 |
9 |
@pytest.mark.skipif(not os.path.exists(DEFAULT_METEOR_JAR_PATH), reason='Meteor jar does not exist')
class TestMeteor(unittest.TestCase):
    """Tests for ``run_meteor``; skipped when the Meteor jar is not at its default path."""

    def test_meteor_runs(self):
        """Meteor returns a positive score for two small gold/model summary pairs."""
        references = [
            'This is the gold summary for the first instance.',
            'And this is for the second one.'
        ]
        candidates = [
            'This is the model output.',
            'And this is the one for the second document.'
        ]
        score = run_meteor(references, candidates)
        assert score > 0.0

    def test_chen2018(self):
        """
        Regression test for the Meteor score on the Chen 2018 data subset.

        Running Meteor on the full data (~11k examples) is too slow for a
        unit test. The full-data numbers were confirmed to match the paper,
        then the code was run on this subset only; the score asserted below
        is the subset result.
        """
        def load_summaries(name):
            # Each record's ``summary`` is a list of strings; join them into one string.
            records = JsonlReader(f'{FIXTURES_ROOT}/data/chen2018/{name}.jsonl').read()
            return [' '.join(record['summary']) for record in records]

        gold_summaries = load_summaries('gold')
        model_summaries = load_summaries('model')

        score = run_meteor(gold_summaries, model_summaries)
        assert abs(score - 18.28372) < 1e-5
42 |
--------------------------------------------------------------------------------
/summarize/tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/models/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/models/cloze/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/bm25/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/models/cloze/bm25/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/bm25/bm25_test.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import unittest
3 | from collections import namedtuple
4 |
5 | from summarize.data.io import JsonlReader
6 | from summarize.common.testing import FIXTURES_ROOT
7 | from summarize.models.cloze.bm25 import calculate_df, bm25
8 |
9 |
class TestBM25(unittest.TestCase):
    """Smoke test for the BM25 pipeline: document frequencies first, then BM25."""

    def test_bm25_runs(self):
        """Run ``calculate_df`` and ``bm25`` on the cloze fixture and check the output shape."""
        input_path = f'{FIXTURES_ROOT}/data/cloze.jsonl'

        # Lightweight stand-ins for the argument objects each ``main`` receives.
        DfArgs = namedtuple('Args', ['input_jsonl', 'output_jsonl'])
        BM25Args = namedtuple('Args', ['input_jsonl', 'df_jsonl', 'output_jsonl',
                                       'k', 'b', 'max_words', 'max_sentences', 'flatten'])

        with tempfile.NamedTemporaryFile(suffix='.jsonl') as df_file, \
                tempfile.NamedTemporaryFile(suffix='.jsonl') as bm25_file:
            # Step 1: write the document-frequency file that BM25 consumes.
            calculate_df.main(DfArgs(input_jsonl=input_path, output_jsonl=df_file.name))

            # Step 2: run BM25 using the document frequencies from step 1.
            bm25.main(BM25Args(input_jsonl=input_path,
                               df_jsonl=df_file.name,
                               output_jsonl=bm25_file.name,
                               k=1.2,
                               b=0.75,
                               max_words=None,
                               max_sentences=1,
                               flatten=True))

            instances = JsonlReader(bm25_file.name).read()
            assert len(instances) == 25
            for instance in instances:
                assert 'cloze' in instance
                assert isinstance(instance['cloze'], str)
29 |
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/bm25/calculate_df_test.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import unittest
3 | from collections import namedtuple
4 |
5 | from summarize.data.io import JsonlReader
6 | from summarize.common.testing import FIXTURES_ROOT
7 | from summarize.models.cloze.bm25 import calculate_df
8 |
9 |
class TestCalculateDF(unittest.TestCase):
    """Smoke test for the document-frequency calculation script."""

    def test_calculate_df_runs(self):
        """Run ``calculate_df`` on the cloze fixture and check the output layout."""
        Args = namedtuple('Args', ['input_jsonl', 'output_jsonl'])

        with tempfile.NamedTemporaryFile(suffix='.jsonl') as df_file:
            calculate_df.main(Args(input_jsonl=f'{FIXTURES_ROOT}/data/cloze.jsonl',
                                   output_jsonl=df_file.name))

            lines = JsonlReader(df_file.name).read()
            assert len(lines) > 0

            # The first line is a metadata record; every later line is a token count.
            metadata, token_counts = lines[0], lines[1:]
            assert 'num_documents' in metadata
            assert 'average_document_length' in metadata
            for count in token_counts:
                assert 'token' in count
                assert 'df' in count
25 |
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/extractive_baseline_test.py:
--------------------------------------------------------------------------------
1 | from allennlp.common.testing import ModelTestCase
2 |
3 | # Some imports necessary in order to register the dataset reader, model, and modules
4 | import summarize.data.dataset_readers.cloze
5 | import summarize.models.cloze
6 | import summarize.modules.matrix_attention
7 | import summarize.training.metrics
8 | from summarize.common.testing import FIXTURES_ROOT
9 |
10 |
class ExtractiveBaselineModelModelTest(ModelTestCase):
    """Model tests for the cloze extractive baseline, driven by fixture config and data."""

    def setUp(self):
        """Build the model from the fixture config and the cloze fixture data."""
        super().setUp()
        config_path = f'{FIXTURES_ROOT}/configs/cloze/extractive-baseline.jsonnet'
        data_path = f'{FIXTURES_ROOT}/data/cloze.jsonl'
        self.set_up_model(config_path, data_path)

    def test_cloze_extractive_baseline_can_train_save_and_load(self):
        """The model can be trained, saved and loaded from the fixture config."""
        self.ensure_model_can_train_save_and_load(self.param_file)

    def test_batch_predictions_are_consistent(self):
        """Delegate to ``ModelTestCase``'s batch-prediction consistency check."""
        self.ensure_batch_predictions_are_consistent()
22 |
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/lead_test.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import unittest
3 | from collections import namedtuple
4 |
5 | from summarize.data.io import JsonlReader
6 | from summarize.models.cloze import lead
7 | from summarize.common.testing import FIXTURES_ROOT
8 |
9 |
class TestClozeLead(unittest.TestCase):
    """Smoke test for the cloze lead baseline script."""

    def test_cloze_lead(self):
        """Run ``lead`` with ``keep_sentences`` on and off and check the output type."""
        Args = namedtuple('Args', ['input_jsonl', 'output_jsonl', 'max_sentences',
                                   'max_tokens', 'max_bytes', 'field_name', 'keep_sentences'])

        with tempfile.NamedTemporaryFile(suffix='.jsonl') as output_file:
            # keep_sentences=True must produce a list under 'cloze';
            # keep_sentences=False must produce a single string.
            for keep_sentences, expected_type in ((True, list), (False, str)):
                args = Args(input_jsonl=f'{FIXTURES_ROOT}/data/cloze.jsonl',
                            output_jsonl=output_file.name,
                            max_sentences=1,
                            max_tokens=None,
                            max_bytes=None,
                            field_name='cloze',
                            keep_sentences=keep_sentences)
                lead.main(args)

                instances = JsonlReader(output_file.name).read()
                assert len(instances) == 25
                assert all('cloze' in instance for instance in instances)
                assert all(isinstance(instance['cloze'], expected_type) for instance in instances)
32 |
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/open_ai_language_model_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import unittest
4 |
5 | from summarize.common.testing import FIXTURES_ROOT
6 | from summarize.data.io import JsonlReader
7 | from summarize.models.cloze import OpenAILanguageModel
8 |
9 | _MODEL_DIR = 'experiments/deutsch2019/baselines/open-ai/models/345M'
10 |
11 |
class TestOpenAILanguageModel(unittest.TestCase):
    """Smoke test for ``OpenAILanguageModel``; always skipped because it is too slow."""

    @pytest.mark.skip(reason='Too slow')
    @pytest.mark.skipif(not os.path.exists(_MODEL_DIR), reason='OpenAI Language Model does not exist')
    def test_open_ai_language_model(self):
        """
        Checks that the OpenAI language model loads successfully and
        can sample a sentence for one fixture instance.
        """
        sample_length, sampling_temperature, k = 100, 1.0, 20
        lm = OpenAILanguageModel(_MODEL_DIR, sample_length, sampling_temperature, k)

        # Sampling can be quite slow, so stop after the first instance
        with JsonlReader(f'{FIXTURES_ROOT}/data/cloze.jsonl') as reader:
            for instance in reader:
                prompt = ' '.join(instance['context'])
                assert lm.sample_next_sentence(prompt) is not None
                break
33 |
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/pointer_generator_test.py:
--------------------------------------------------------------------------------
1 | from allennlp.common.testing import ModelTestCase
2 |
3 | # Some imports necessary in order to register the dataset reader, model, and modules
4 | import summarize.data.dataset_readers.cloze
5 | import summarize.models.cloze
6 | import summarize.modules.matrix_attention
7 | import summarize.training.metrics
8 | from summarize.common.testing import FIXTURES_ROOT
9 |
10 |
class TestClozePointerGeneratorModel(ModelTestCase):
    """Model tests for the cloze pointer-generator, driven by fixture config and data."""

    def setUp(self):
        """Build the model from the fixture config and the cloze fixture data."""
        super().setUp()
        self.set_up_model(f'{FIXTURES_ROOT}/configs/cloze/pointer-generator.jsonnet',
                          f'{FIXTURES_ROOT}/data/cloze.jsonl')

    def test_cloze_pointer_generator_can_train_save_and_load(self):
        """The model can be trained, saved and loaded from the fixture config."""
        self.ensure_model_can_train_save_and_load(self.param_file)

    def test_batch_predictions_are_consistent(self):
        """Batch predictions are consistent, ignoring the ``log_probabilities`` output."""
        # The log-probabilities are often unstable.
        # Pass a list rather than a bare string: with a string, a membership
        # test is a substring match, so any output key that is a substring of
        # 'log_probabilities' would be ignored as well.
        self.ensure_batch_predictions_are_consistent(keys_to_ignore=['log_probabilities'])
23 |
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/seq2seq_test.py:
--------------------------------------------------------------------------------
1 | from allennlp.common.testing import ModelTestCase
2 |
3 | # Some imports necessary in order to register the dataset reader, model, and modules
4 | import summarize.data.dataset_readers.cloze
5 | import summarize.models.cloze
6 | import summarize.modules.matrix_attention
7 | import summarize.training.metrics
8 | from summarize.common.testing import FIXTURES_ROOT
9 |
10 |
class TestClozeSeq2SeqModel(ModelTestCase):
    """Model tests for the cloze seq2seq model, driven by fixture config and data."""

    def setUp(self):
        """Build the model from the fixture config and the cloze fixture data."""
        super().setUp()
        self.set_up_model(f'{FIXTURES_ROOT}/configs/cloze/seq2seq.jsonnet',
                          f'{FIXTURES_ROOT}/data/cloze.jsonl')

    def test_cloze_seq2seq_can_train_save_and_load(self):
        """The model can be trained, saved and loaded from the fixture config."""
        self.ensure_model_can_train_save_and_load(self.param_file)

    def test_batch_predictions_are_consistent(self):
        """Batch predictions are consistent, ignoring the ``log_probabilities`` output."""
        # The log-probabilities are often unstable.
        # Pass a list rather than a bare string: with a string, a membership
        # test is a substring match, so any output key that is a substring of
        # 'log_probabilities' would be ignored as well.
        self.ensure_batch_predictions_are_consistent(keys_to_ignore=['log_probabilities'])
23 |
--------------------------------------------------------------------------------
/summarize/tests/models/cloze/sumfocus_test.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import unittest
3 | from collections import namedtuple
4 |
5 | from summarize.data.io import JsonlReader
6 | from summarize.common.testing import FIXTURES_ROOT
7 | from summarize.models.cloze import sumfocus
8 |
9 |
class TestSumFocus(unittest.TestCase):
    """Smoke test for the SumFocus script."""

    def test_sumfocus_runs(self):
        """Run ``sumfocus`` on the cloze fixture and check the output shape."""
        Args = namedtuple('Args', ['input_jsonl', 'output_jsonl', 'beta',
                                   'topic_lambda', 'context_lambda',
                                   'max_words', 'max_sentences'])

        with tempfile.NamedTemporaryFile(suffix='.jsonl') as output_file:
            sumfocus.main(Args(input_jsonl=f'{FIXTURES_ROOT}/data/cloze.jsonl',
                               output_jsonl=output_file.name,
                               beta=0.5,
                               topic_lambda=0.2,
                               context_lambda=0.3,
                               max_words=200,
                               max_sentences=None))

            instances = JsonlReader(output_file.name).read()
            assert len(instances) == 25
            # Every output record must carry a string under 'cloze'.
            for instance in instances:
                assert 'cloze' in instance
                assert isinstance(instance['cloze'], str)
25 |
--------------------------------------------------------------------------------
/summarize/tests/models/sds/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/models/sds/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/models/sds/extractive_baseline_test.py:
--------------------------------------------------------------------------------
1 | from allennlp.common.testing import ModelTestCase
2 |
3 | # Some imports necessary in order to register the dataset reader, model, and modules
4 | import summarize.data.dataset_readers.sds
5 | import summarize.models.sds
6 | import summarize.modules.matrix_attention
7 | import summarize.training.metrics
8 | from summarize.common.testing import FIXTURES_ROOT
9 |
10 |
class ExtractiveBaselineModelModelTest(ModelTestCase):
    """Smoke tests for the SDS extractive baseline model from the fixture config."""

    def setUp(self):
        # Point the test case at the fixture configuration and data.
        super().setUp()
        self.set_up_model(f'{FIXTURES_ROOT}/configs/sds/extractive-baseline.jsonnet',
                          f'{FIXTURES_ROOT}/data/sds.jsonl')

    def test_sds_extractive_baseline_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)

    def test_batch_predictions_are_consistent(self):
        # The log-probabilities are often unstable, so that output is skipped.
        # ``keys_to_ignore`` is expected to be a collection of key names; a
        # bare string would turn the membership check into a substring test
        # and could skip unrelated keys such as ``'log'``.
        self.ensure_batch_predictions_are_consistent(keys_to_ignore=['log_probabilities'])
23 |
--------------------------------------------------------------------------------
/summarize/tests/models/sds/lead_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from summarize.models.sds.lead import get_lead_summary
4 |
5 |
class TestLeadSummary(unittest.TestCase):
    """Checks ``get_lead_summary`` under sentence, token and byte budgets."""

    def setUp(self):
        first = 'The first sentence .'
        second = 'Followed by the second .'
        third = 'Finally the third .'
        self.document = [first, second, third]

    def test_max_sentences(self):
        # (sentence budget, expected summary)
        cases = [
            (1, self.document[:1]),
            (2, self.document[:2]),
            (3, self.document),
            (4, self.document),
        ]
        for budget, expected in cases:
            assert expected == get_lead_summary(self.document, max_sentences=budget)

    def test_max_token(self):
        # (token budget, expected summary); the last sentence kept may be cut short
        cases = [
            (1, ['The']),
            (4, ['The first sentence .']),
            (5, ['The first sentence .', 'Followed']),
            (12, ['The first sentence .', 'Followed by the second .', 'Finally the third']),
            (13, ['The first sentence .', 'Followed by the second .', 'Finally the third .']),
            (14, ['The first sentence .', 'Followed by the second .', 'Finally the third .']),
        ]
        for budget, expected in cases:
            assert expected == get_lead_summary(self.document, max_tokens=budget)

    def test_max_bytes(self):
        # (byte budget, expected summary)
        cases = [
            (1, ['T']),
            (19, ['The first sentence']),
            (20, ['The first sentence .']),
            (21, ['The first sentence .']),
            (22, ['The first sentence .', 'F']),
            (64, ['The first sentence .', 'Followed by the second .', 'Finally the third']),
            (65, ['The first sentence .', 'Followed by the second .', 'Finally the third .']),
            (66, ['The first sentence .', 'Followed by the second .', 'Finally the third .']),
        ]
        for budget, expected in cases:
            assert expected == get_lead_summary(self.document, max_bytes=budget)

    def test_invalid_arguments(self):
        # Passing no limit, or more than one limit, is expected to raise.
        invalid_limits = [
            {},
            {'max_sentences': 1, 'max_tokens': 1},
            {'max_sentences': 1, 'max_bytes': 1},
            {'max_tokens': 1, 'max_bytes': 1},
            {'max_sentences': 1, 'max_tokens': 1, 'max_bytes': 1},
        ]
        for limits in invalid_limits:
            with self.assertRaises(Exception):
                get_lead_summary(self.document, **limits)
49 |
--------------------------------------------------------------------------------
/summarize/tests/models/sds/pointer_generator_test.py:
--------------------------------------------------------------------------------
1 | from allennlp.common.testing import ModelTestCase
2 |
3 | # Some imports necessary in order to register the dataset reader, model, and modules
4 | import summarize.data.dataset_readers.sds
5 | import summarize.models.sds
6 | import summarize.modules.matrix_attention
7 | import summarize.training.metrics
8 | from summarize.common.testing import FIXTURES_ROOT
9 |
10 |
class PointerGeneratorModelTest(ModelTestCase):
    """Smoke tests for the SDS pointer-generator model from the fixture config."""

    def setUp(self):
        # Point the test case at the fixture configuration and data.
        super().setUp()
        self.set_up_model(f'{FIXTURES_ROOT}/configs/sds/pointer-generator.jsonnet',
                          f'{FIXTURES_ROOT}/data/sds.jsonl')

    def test_sds_pointer_generator_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)

    def test_batch_predictions_are_consistent(self):
        # The log-probabilities are often unstable, so that output is skipped.
        # ``keys_to_ignore`` is expected to be a collection of key names; a
        # bare string would turn the membership check into a substring test
        # and could skip unrelated keys such as ``'log'``.
        self.ensure_batch_predictions_are_consistent(keys_to_ignore=['log_probabilities'])
22 |
--------------------------------------------------------------------------------
/summarize/tests/models/sds/seq2seq_test.py:
--------------------------------------------------------------------------------
1 | from allennlp.common.testing import ModelTestCase
2 |
3 | # Some imports necessary in order to register the dataset reader, model, and modules
4 | import summarize.data.dataset_readers.sds
5 | import summarize.models.sds
6 | import summarize.modules.matrix_attention
7 | import summarize.training.metrics
8 | from summarize.common.testing import FIXTURES_ROOT
9 |
10 |
class Seq2SeqModelTest(ModelTestCase):
    """Smoke tests for the SDS seq2seq model described by the fixture config."""

    def setUp(self):
        # Point the test case at the fixture configuration and data.
        super().setUp()
        self.set_up_model(f'{FIXTURES_ROOT}/configs/sds/seq2seq.jsonnet',
                          f'{FIXTURES_ROOT}/data/sds.jsonl')

    def test_sds_seq2seq_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)

    def test_batch_predictions_are_consistent(self):
        # The log-probabilities are often unstable, so that output is skipped.
        # ``keys_to_ignore`` is expected to be a collection of key names; a
        # bare string would turn the membership check into a substring test
        # and could skip unrelated keys such as ``'log'``.
        self.ensure_batch_predictions_are_consistent(keys_to_ignore=['log_probabilities'])
23 |
--------------------------------------------------------------------------------
/summarize/tests/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/modules/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/modules/coverage_matrix_attention/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/modules/coverage_matrix_attention/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/modules/coverage_matrix_attention/mlp_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 |
4 | from summarize.modules.coverage_matrix_attention import MLPCoverageAttention
5 |
6 |
class TestMLPCoverageAttention(unittest.TestCase):
    """Sanity checks on the outputs of ``MLPCoverageAttention``."""

    def test_mlp_coverage_attention(self):
        batch_size, num_encoder_tokens, num_decoder_tokens = 3, 7, 9
        encoder_dim, decoder_dim, attention_dim = 11, 13, 17

        encoder_outputs = torch.rand(batch_size, num_encoder_tokens, encoder_dim)
        # Each row masks out a different number of trailing encoder positions.
        encoder_mask = torch.tensor([
            [1, 1, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 0, 0],
            [1, 1, 1, 0, 0, 0, 0],
        ], dtype=torch.long)
        decoder_outputs = torch.rand(batch_size, num_decoder_tokens, decoder_dim)
        initial_coverage_vector = torch.zeros(batch_size, num_encoder_tokens)

        attention = MLPCoverageAttention(encoder_dim, decoder_dim, attention_dim)
        outputs = attention(decoder_outputs, encoder_outputs, encoder_mask, initial_coverage_vector)
        probabilities, coverage_vectors, coverage_vector = outputs

        # Specific values are too hard to test, so only sanity checks are run.
        per_step_shape = (batch_size, num_decoder_tokens, num_encoder_tokens)
        assert probabilities.size() == per_step_shape
        assert coverage_vectors.size() == per_step_shape
        assert coverage_vector.size() == (batch_size, num_encoder_tokens)

        # The first coverage vector must be the one that was passed in.
        first_coverage = coverage_vectors[:, 0]
        assert torch.equal(initial_coverage_vector, first_coverage)

        # The returned final coverage must match the probabilities summed over all steps.
        running_total = torch.cumsum(probabilities, dim=1)
        assert torch.isclose(running_total[:, -1], coverage_vector).all()

        # Probabilities must be positive exactly where the mask is on.
        expected_support = encoder_mask.unsqueeze(1).expand_as(probabilities)
        actual_support = (probabilities > 0).long()
        assert torch.equal(actual_support, expected_support)
43 |
--------------------------------------------------------------------------------
/summarize/tests/modules/rnns/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/modules/rnns/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/modules/rnns/gru_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 | from allennlp.nn.util import get_final_encoder_states
4 |
5 | from summarize.tests.modules.rnns import util
6 |
7 |
class TestGRU(unittest.TestCase):
    """Compares the GRU's reshaped hidden state to AllenNLP's final encoder states."""

    def test_gru_remap_hidden(self):
        bidirectional = True
        input_data, mask = util.get_random_inputs(batch_size=3, sequence_length=11, input_size=5)
        seq2seq_encoder, rnn = util.get_rnns(rnn_type='gru', input_size=5, hidden_size=7,
                                             num_layers=1, bidirectional=bidirectional)

        # Ensure the final encoder states are the same, with and without masking.
        # Each pair is (mask given to the RNNs, mask used to pick the final states).
        ones_mask = torch.ones(mask.size())
        for rnn_mask, state_mask in ((None, ones_mask), (mask, mask)):
            encoder_outputs = seq2seq_encoder(input_data, rnn_mask)
            expected_hidden = get_final_encoder_states(encoder_outputs, state_mask, bidirectional)
            _, raw_hidden = rnn(input_data, rnn_mask)
            actual_hidden = rnn.reshape_hidden_for_decoder(raw_hidden)
            difference = torch.abs(expected_hidden - actual_hidden)
            assert (difference < 1e-5).all()
33 |
--------------------------------------------------------------------------------
/summarize/tests/modules/rnns/lstm_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 | from allennlp.nn.util import get_final_encoder_states
4 |
5 | from summarize.tests.modules.rnns import util
6 |
7 |
class TestLSTM(unittest.TestCase):
    """Compares the LSTM's reshaped hidden state to AllenNLP's final encoder states."""

    def test_lstm_remap_hidden(self):
        bidirectional = True
        input_data, mask = util.get_random_inputs(batch_size=3, sequence_length=11, input_size=5)
        seq2seq_encoder, rnn = util.get_rnns(rnn_type='lstm', input_size=5, hidden_size=7,
                                             num_layers=1, bidirectional=bidirectional)

        # Ensure the final encoder states are the same, with and without masking.
        # Each pair is (mask given to the RNNs, mask used to pick the final states).
        ones_mask = torch.ones(mask.size())
        for rnn_mask, state_mask in ((None, ones_mask), (mask, mask)):
            encoder_outputs = seq2seq_encoder(input_data, rnn_mask)
            expected_hidden = get_final_encoder_states(encoder_outputs, state_mask, bidirectional)
            _, raw_hidden = rnn(input_data, rnn_mask)
            # The reshaped LSTM state is a pair; only its first element is compared.
            actual_hidden, _ = rnn.reshape_hidden_for_decoder(raw_hidden)
            difference = torch.abs(expected_hidden - actual_hidden)
            assert (difference < 1e-5).all()
35 |
--------------------------------------------------------------------------------
/summarize/tests/modules/rnns/rnn_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 |
4 | from summarize.tests.modules.rnns import util
5 |
6 |
class TestRNN(unittest.TestCase):
    """Checks that the project RNN wrapper reproduces the AllenNLP encoder outputs."""

    def test_rnn_seq2seq_encoder_are_identical(self):
        input_data, mask = util.get_random_inputs(batch_size=3, sequence_length=11, input_size=5)
        seq2seq_encoder, rnn = util.get_rnns(rnn_type='gru', input_size=5, hidden_size=7,
                                             num_layers=2, bidirectional=True)

        # First compare without any masking, then with the random mask.
        for current_mask in (None, mask):
            expected_outputs = seq2seq_encoder(input_data, current_mask)
            actual_outputs, _ = rnn(input_data, current_mask)
            assert torch.equal(expected_outputs, actual_outputs)

    def test_rnn_seq2seq_encoder_are_identical_for_loop(self):
        # Tests the Seq2SeqEncoder versus the RNN to make sure that stepping the
        # RNN one token at a time produces the same final outputs.
        sequence_length = 11
        input_data, mask = util.get_random_inputs(batch_size=3, sequence_length=sequence_length,
                                                  input_size=5)
        seq2seq_encoder, rnn = util.get_rnns(rnn_type='gru', input_size=5, hidden_size=7,
                                             num_layers=2, bidirectional=False)

        expected_outputs = seq2seq_encoder(input_data, None)

        step_outputs = []
        hidden = None
        for step in range(sequence_length):
            # Keep the time dimension so that each step is a length-1 sequence.
            input_step = input_data[:, step:step + 1, :]
            step_output, hidden = rnn(input_step, None, hidden)
            step_outputs.append(step_output)
        actual_outputs = torch.cat(step_outputs, dim=1)
        assert torch.equal(expected_outputs, actual_outputs)

    def test_no_mask_and_ones_mask_are_identical(self):
        # Tests to make sure the outputs are identical when using no mask (None)
        # versus a mask of just ones.
        batch_size, sequence_length = 30, 20
        hidden_size, num_layers = 7, 2

        input_data, _ = util.get_random_inputs(batch_size=batch_size,
                                               sequence_length=sequence_length,
                                               input_size=5)
        _, rnn = util.get_rnns(rnn_type='gru', input_size=5, hidden_size=hidden_size,
                               num_layers=num_layers, bidirectional=False)
        all_ones = torch.ones(input_data.size()[:-1])
        initial_hidden = torch.rand(num_layers, batch_size, hidden_size)

        no_mask_outputs, no_mask_hidden = rnn(input_data, None, initial_hidden)
        masked_outputs, masked_hidden = rnn(input_data, all_ones, initial_hidden)
        assert torch.equal(no_mask_outputs, masked_outputs)
        assert torch.equal(no_mask_hidden, masked_hidden)
71 |
--------------------------------------------------------------------------------
/summarize/tests/modules/rnns/util.py:
--------------------------------------------------------------------------------
1 | import random
2 | import torch
3 | from allennlp.modules import Seq2SeqEncoder
4 |
5 | from summarize.modules.rnns import GRU, LSTM
6 |
7 |
def get_random_inputs(batch_size: int, sequence_length: int, input_size: int):
    """
    Builds random RNN input data together with a random prefix mask.

    Returns a ``(batch_size, sequence_length, input_size)`` tensor of normally
    distributed values and a ``(batch_size, sequence_length)`` ``uint8`` mask.
    The first row of the mask is always fully on; every other row is on up to
    a random cutoff in ``[1, sequence_length]`` and off afterwards.
    """
    features = torch.randn(batch_size, sequence_length, input_size)
    padding_mask = torch.ones(batch_size, sequence_length, dtype=torch.uint8)
    # Row 0 is skipped to make sure one of the inputs is not masked at all
    row = 1
    while row < batch_size:
        cutoff = random.randint(1, sequence_length)
        padding_mask[row, cutoff:] = 0
        row += 1
    return features, padding_mask
19 |
20 |
def get_rnns(rnn_type: str, input_size: int, hidden_size: int, num_layers: int, bidirectional: bool):
    """
    Creates and returns an equivalent AllenNLP ``Seq2SeqEncoder`` and ``RNN`` RNNs.

    Both modules are built with the same hyperparameters, and every weight and
    bias of the ``Seq2SeqEncoder``'s underlying module is then copied into the
    project RNN so that the two produce comparable outputs.

    Parameters
    ----------
    rnn_type: ``'gru'`` or ``'lstm'``
    input_size, hidden_size: sizes passed to both modules
    num_layers: ``1`` or ``2``
    bidirectional: whether both modules also run in the reverse direction

    Returns
    -------
    The tuple ``(seq2seq_encoder, rnn)``.
    """
    assert num_layers in [1, 2]
    assert rnn_type in ['gru', 'lstm']
    seq2seq_encoder = Seq2SeqEncoder.by_name(rnn_type)(input_size=input_size, hidden_size=hidden_size,
                                                       num_layers=num_layers, bidirectional=bidirectional)
    rnn_class = GRU if rnn_type == 'gru' else LSTM
    rnn = rnn_class(input_size, hidden_size, num_layers, bidirectional)

    # Parameter names follow the ``<kind>_l<layer>[_reverse]`` pattern.
    directions = ['', '_reverse'] if bidirectional else ['']
    kinds = ['weight_ih', 'weight_hh', 'bias_ih', 'bias_hh']

    # The copy is an in-place write into parameters; doing it under
    # ``no_grad`` keeps autograd from tracking (or rejecting) the update.
    with torch.no_grad():
        for layer in range(num_layers):
            for direction in directions:
                for kind in kinds:
                    name = f'{kind}_l{layer}{direction}'
                    source = getattr(seq2seq_encoder._module, name)
                    getattr(rnn.rnn, name).copy_(source)

    return seq2seq_encoder, rnn
56 |
--------------------------------------------------------------------------------
/summarize/tests/modules/sentence_extractors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/modules/sentence_extractors/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/modules/sentence_extractors/rnn_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 | from allennlp.modules import FeedForward
4 | from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
5 | from allennlp.nn import Activation
6 | from torch.nn import GRU
7 |
8 | from summarize.modules.sentence_extractors import RNNSentenceExtractor
9 |
10 |
class RNNSentenceExtractorTest(unittest.TestCase):
    """Shape check for the scores produced by ``RNNSentenceExtractor``."""

    def test_rnn_sentence_extractor(self):
        # Hyperparameters
        batch_size, num_sents = 3, 5
        input_hidden_size, hidden_size = 7, 11

        # Set up a model: a bidirectional GRU followed by a two-layer scorer
        # whose last layer has a single output unit.
        rnn = PytorchSeq2SeqWrapper(GRU(input_size=input_hidden_size,
                                        hidden_size=hidden_size,
                                        bidirectional=True,
                                        batch_first=True))
        activations = [Activation.by_name('tanh')(), Activation.by_name('linear')()]
        feed_forward = FeedForward(input_dim=hidden_size * 2,
                                   num_layers=2,
                                   hidden_dims=[10, 1],
                                   activations=activations)
        extractor = RNNSentenceExtractor(rnn, feed_forward)

        # Set up some dummy data with nothing masked out
        sentence_encodings = torch.randn(batch_size, num_sents, input_hidden_size)
        mask = torch.ones(batch_size, num_sents)

        # Pass the data through and verify the size of the output
        extraction_scores = extractor(sentence_encodings, mask)
        expected_size = (batch_size, num_sents)
        assert extraction_scores.size() == expected_size
38 |
--------------------------------------------------------------------------------
/summarize/tests/nn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/nn/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/nn/beam_search/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/nn/beam_search/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/nn/beam_search/coverage_penalizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/nn/beam_search/coverage_penalizers/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/nn/beam_search/coverage_penalizers/onmt_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 |
4 | from summarize.nn.beam_search.coverage_penalizers import ONMTCoveragePenalizer
5 |
6 |
class TestAverageLengthPenalizer(unittest.TestCase):
    """Checks the penalties computed by ``ONMTCoveragePenalizer``."""
    # NOTE(review): the class name mentions the average length penalizer, but
    # the test exercises ``ONMTCoveragePenalizer`` - consider renaming.

    def test_onmt_coverage_penalizer(self):
        coverage = torch.tensor([[0.4, 1.2, 0.8], [1.5, 0.7, 0.0]], dtype=torch.float32)

        # A weight of zero must yield no penalty at all.
        zero_weight_penalties = ONMTCoveragePenalizer(0.0)(coverage)
        expected_penalties = torch.tensor([0.0, 0.0], dtype=torch.float32)
        assert torch.allclose(expected_penalties, zero_weight_penalties)

        # With weight 0.5 the expected values are -0.2 and -0.5, each scaled by the weight.
        half_weight_penalties = ONMTCoveragePenalizer(0.5)(coverage)
        expected_penalties = torch.tensor([-0.2 * 0.5, -0.5 * 0.5], dtype=torch.float32)
        assert torch.allclose(expected_penalties, half_weight_penalties, atol=1e-3)
20 |
--------------------------------------------------------------------------------
/summarize/tests/nn/beam_search/length_penalizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/nn/beam_search/length_penalizers/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/nn/beam_search/length_penalizers/average_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 |
4 | from summarize.nn.beam_search.length_penalizers import AverageLengthPenalizer
5 |
6 |
class TestAverageLengthPenalizer(unittest.TestCase):
    """Checks that ``AverageLengthPenalizer`` returns the lengths as floats."""

    def test_average_length_penalizer(self):
        lengths = torch.tensor([[1, 2], [3, 4]], dtype=torch.long)
        expected_penalties = lengths.float()

        actual_penalties = AverageLengthPenalizer()(lengths)
        assert torch.equal(expected_penalties, actual_penalties)
14 |
--------------------------------------------------------------------------------
/summarize/tests/nn/beam_search/length_penalizers/wu_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 |
4 | from summarize.nn.beam_search.length_penalizers import WuLengthPenalizer
5 |
6 |
class TestWuLengthPenalizer(unittest.TestCase):
    """Checks ``WuLengthPenalizer`` against expected values for two alphas."""

    def test_wu_length_penalizer(self):
        lengths = torch.tensor([[1, 2], [3, 4]], dtype=torch.long)

        # alpha = 0.0 is expected to give a penalty of one for every length.
        flat_penalties = WuLengthPenalizer(0.0)(lengths)
        expected_penalties = torch.tensor([[1.0, 1.0], [1.0, 1.0]], dtype=torch.float32)
        assert torch.allclose(expected_penalties, flat_penalties)

        # alpha = 0.5 is compared against reference values with a loose tolerance.
        scaled_penalties = WuLengthPenalizer(0.5)(lengths)
        expected_penalties = torch.tensor([[1.0, 1.0801], [1.1547, 1.2247]], dtype=torch.float32)
        assert torch.allclose(expected_penalties, scaled_penalties, atol=1e-3)
20 |
--------------------------------------------------------------------------------
/summarize/tests/nn/util_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 |
4 | from summarize.nn.util import normalize_losses
5 |
6 |
class TestUtil(unittest.TestCase):
    """Checks ``normalize_losses`` for every combination of normalization modes."""

    def test_normalize_losses(self):
        losses = torch.tensor([
            [1.0, 2.0, 3.0],
            [4.0, 5.0, 6.0],
        ], dtype=torch.float32)
        # The last entry of the second row is masked out.
        mask = torch.tensor([
            [1.0, 1.0, 1.0],
            [1.0, 1.0, 0.0],
        ], dtype=torch.float32)

        # (first mode, second mode, expected loss)
        cases = [
            ('sum', 'sum', 15.0),
            ('sum', 'average', 7.5),
            ('average', 'sum', 6.5),
            ('average', 'average', 3.25),
        ]
        for first_mode, second_mode, expected_loss in cases:
            actual_loss = normalize_losses(losses, mask, first_mode, second_mode)
            assert expected_loss == actual_loss.item()

        # An unrecognized mode in either position must raise.
        for first_mode, second_mode in (('unknown', 'sum'), ('sum', 'unknown')):
            with self.assertRaises(Exception):
                normalize_losses(losses, mask, first_mode, second_mode)
38 |
--------------------------------------------------------------------------------
/summarize/tests/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/training/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/training/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/tests/training/metrics/__init__.py
--------------------------------------------------------------------------------
/summarize/tests/training/metrics/binary_f1_measure_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import unittest
3 |
4 | from summarize.training.metrics import BinaryF1Measure
5 |
6 |
class BinaryF1MeasureTest(unittest.TestCase):
    """Checks precision, recall and F1 from ``BinaryF1Measure`` on a masked batch."""

    def test_binary_f1_measure(self):
        gold_labels = torch.tensor([
            [1, 1, 0],
            [0, 1, 1],
            [0, 0, 0],
        ], dtype=torch.long)
        model_labels = torch.tensor([
            [1, 0, 1],
            [1, 1, 1],
            [0, 1, 1],
        ], dtype=torch.long)
        # The last row is fully masked out and must not count.
        mask = torch.tensor([
            [1, 1, 1],
            [1, 1, 1],
            [0, 0, 0],
        ], dtype=torch.long)

        # In the unmasked rows: 5 predicted positives, 4 gold positives, 3 in common.
        expected_precision = 3 / 5
        expected_recall = 3 / 4
        numerator = 2 * (expected_precision * expected_recall)
        expected_f1 = numerator / (expected_precision + expected_recall)
        expected = {
            'precision': expected_precision,
            'recall': expected_recall,
            'f1': expected_f1,
        }

        metric = BinaryF1Measure()
        metric(gold_labels, model_labels, mask)
        actual_metrics = metric.get_metric()
        for name in ('precision', 'recall', 'f1'):
            self.assertAlmostEqual(actual_metrics[name], expected[name], delta=1e-5)
35 |
--------------------------------------------------------------------------------
/summarize/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/training/__init__.py
--------------------------------------------------------------------------------
/summarize/training/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from summarize.training.metrics.binary_f1_measure import BinaryF1Measure
2 | from summarize.training.metrics.python_rouge_metric import PythonRougeMetric
3 | from summarize.training.metrics.cross_entropy_metric import CrossEntropyMetric
4 |
--------------------------------------------------------------------------------
/summarize/training/metrics/binary_f1_measure.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from allennlp.training.metrics import F1Measure, Metric
3 | from overrides import overrides
4 | from typing import Dict, Optional
5 |
6 |
@Metric.register('binary-f1')
class BinaryF1Measure(F1Measure):
    """
    Computes the standard F1 metric from two binary tensors: the ground-truth
    labels and the labels predicted by the model. The underlying ``F1Measure``
    wants its predictions with an extra trailing dimension of size 2 that
    marks the chosen class, so the binary predictions are expanded into that
    form before being passed on.
    """
    def __init__(self) -> None:
        # The positive class is label 1.
        super().__init__(1)

    @overrides
    def __call__(self,
                 gold_labels: torch.Tensor,
                 model_labels: torch.Tensor,
                 mask: Optional[torch.Tensor] = None,
                 **kwargs):
        """
        Parameters
        ----------
        gold_labels: (batch_size, ...)
            The ground-truth binary labels
        model_labels: (batch_size, ...)
            The binary model predictions
        mask: (batch_size, ...)
            The mask
        """
        # Expand the predictions to (..., 2), with a 1 at the predicted class index.
        one_hot_shape = (*model_labels.size(), 2)
        one_hot_predictions = model_labels.new_zeros(one_hot_shape)
        class_indices = model_labels.unsqueeze(-1)
        one_hot_predictions.scatter_(-1, class_indices, 1)
        super().__call__(one_hot_predictions, gold_labels, mask)

    @overrides
    def get_metric(self, reset: bool = False) -> Dict[str, float]:
        """Returns the parent's (precision, recall, f1) triple as a named dictionary."""
        precision, recall, f1_measure = super().get_metric(reset)
        metrics = {}
        metrics['precision'] = precision
        metrics['recall'] = recall
        metrics['f1'] = f1_measure
        return metrics
48 |
--------------------------------------------------------------------------------
/summarize/training/metrics/cross_entropy_metric.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from allennlp.training.metrics import Metric
3 | from overrides import overrides
4 | from typing import Dict
5 |
6 |
@Metric.register('cross-entropy')
class CrossEntropyMetric(Metric):
    """
    Accumulates a total loss and a total token count, and reports the
    per-token cross-entropy together with the corresponding perplexity.
    """
    def __init__(self) -> None:
        self.total_loss = 0
        self.total_num_tokens = 0

    @overrides
    def __call__(self, loss: float, num_tokens: int) -> None:
        """Adds ``loss`` and ``num_tokens`` to the running totals."""
        self.total_loss += loss
        self.total_num_tokens += num_tokens

    @overrides
    def get_metric(self, reset: bool = False) -> Dict[str, float]:
        """
        Returns ``{'cross-entropy': ..., 'perplexity': ...}`` for everything
        accumulated so far, optionally clearing the totals afterwards.

        When no tokens have been accumulated the cross-entropy is reported as
        ``0.0`` (so the perplexity is ``exp(0) == 1.0``) rather than dividing
        by zero.
        """
        if self.total_num_tokens == 0:
            cross_entropy = 0.0
        else:
            cross_entropy = self.total_loss / self.total_num_tokens
        perplexity = np.exp(cross_entropy)
        if reset:
            self.reset()
        return {
            'cross-entropy': cross_entropy,
            'perplexity': perplexity
        }

    @overrides
    def reset(self) -> None:
        """Clears the accumulated loss and token count."""
        self.total_loss = 0
        self.total_num_tokens = 0
29 |
--------------------------------------------------------------------------------
/summarize/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danieldeutsch/summarize/f36a86d58f381ff1f607f356dad3d6ef7b0e0224/summarize/utils/__init__.py
--------------------------------------------------------------------------------
/summarize/utils/copy_jsonl_fields.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from summarize.data.io import JsonlReader, JsonlWriter
4 |
5 |
def main(args):
    """
    Copies fields from the instances of one jsonl file into the instances of
    another and writes the updated instances to a new file.

    The source and target files are read in lockstep; for every
    ``(source_field, target_field)`` pair in ``args.field_names`` the value
    of ``source_field`` in the source instance is stored under
    ``target_field`` in the target instance. Iteration uses ``zip``, so it
    stops at the end of the shorter file.

    Parameters
    ----------
    args:
        The parsed command line arguments with the attributes
        ``source_jsonl``, ``target_jsonl``, ``output_jsonl`` and
        ``field_names``.
    """
    # ``--field-names`` uses ``action='append'`` without a default, so it is
    # ``None`` when the flag is omitted. Treat that as "no fields to copy"
    # instead of failing with a ``TypeError`` while iterating over ``None``.
    field_pairs = args.field_names or []
    with JsonlWriter(args.output_jsonl) as out, \
            JsonlReader(args.source_jsonl) as source, \
            JsonlReader(args.target_jsonl) as target:
        for source_instance, target_instance in zip(source, target):
            for source_field, target_field in field_pairs:
                target_instance[target_field] = source_instance[source_field]
            out.write(target_instance)
14 |
15 |
if __name__ == '__main__':
    # Command line entry point: three positional file paths plus a
    # repeatable ``--field-names SOURCE TARGET`` option.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'source_jsonl',
        help='The file with the desired field')
    parser.add_argument(
        'target_jsonl',
        help='The destination file')
    parser.add_argument(
        'output_jsonl',
        help='The file with the target data and copied source field')
    parser.add_argument(
        '--field-names',
        nargs=2,
        action='append',
        help='The names of the source and target fields')
    main(parser.parse_args())
25 |
--------------------------------------------------------------------------------
/summarize/utils/extract_cloze_from_labels.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from summarize.data.io import JsonlReader, JsonlWriter
4 |
5 |
def main(args):
    """
    Builds a cloze for every instance of the input jsonl file by selecting
    the entries of ``instance['document']`` at the indices listed in
    ``instance['labels']``, and writes it to the output file under the key
    ``args.field_name``.

    Unless ``args.keep_sentences`` is set, the selected entries are joined
    with single spaces into one string; otherwise they are written as a list.
    """
    with JsonlWriter(args.output_jsonl) as out, JsonlReader(args.input_jsonl) as f:
        for instance in f:
            sentences = instance['document']
            selected = []
            for index in instance['labels']:
                selected.append(sentences[index])
            if args.keep_sentences:
                cloze = selected
            else:
                cloze = ' '.join(selected)
            out.write({args.field_name: cloze})
16 |
17 |
if __name__ == '__main__':
    # Command line entry point: input/output paths plus options controlling
    # the output field name and whether the sentences are flattened.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_jsonl',
        help='The input file with the labeled summaries.')
    parser.add_argument(
        'output_jsonl',
        help='The output file')
    parser.add_argument(
        '--field-name',
        default='cloze',
        help='The name of the output field')
    parser.add_argument(
        '--keep-sentences',
        action='store_true',
        help='Indicates if the output field should be left as sentences or flattened')
    main(parser.parse_args())
26 |
--------------------------------------------------------------------------------
/summarize/utils/extract_summary_from_labels.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from summarize.data.io import JsonlReader, JsonlWriter
4 |
5 |
def main(args):
    """
    Builds a summary for every instance of the input jsonl file by selecting
    the entries of ``instance['document']`` at the indices listed in
    ``instance['labels']``, and writes it to the output file under the key
    ``'summary'``.
    """
    with JsonlWriter(args.output_jsonl) as out, JsonlReader(args.input_jsonl) as f:
        for instance in f:
            sentences = instance['document']
            selected = []
            for index in instance['labels']:
                selected.append(sentences[index])
            out.write({'summary': selected})
14 |
15 |
if __name__ == '__main__':
    # Command line entry point: the input and output jsonl paths.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_jsonl',
        help='The input file with the labeled summaries.')
    parser.add_argument(
        'output_jsonl',
        help='The output file')
    main(parser.parse_args())
22 |
--------------------------------------------------------------------------------
/summarize/utils/replace_config.py:
--------------------------------------------------------------------------------
1 | """
2 | Replaces the model configuration in a model.tar.gz file with a new one. The
3 | new configuration can be a jsonnet file that will be evaluated into a json.
4 | """
5 | import argparse
6 | import tarfile
7 | import json
8 | from io import BytesIO
9 |
10 | from allennlp.common.params import Params
11 |
12 |
def main(args):
    """
    Copies the archive at ``args.model_file_path`` to
    ``args.output_file_path``, replacing the contents of its ``config.json``
    member with the configuration loaded from ``args.config_file_path``.
    All other members are copied unchanged.
    """
    # Read the whole archive into memory and close the file right away
    # (the previous bare ``open(...).read()`` leaked the file handle).
    # NOTE(review): buffering the input presumably also allows the output
    # path to be the same as the input path -- confirm before changing.
    with open(args.model_file_path, 'rb') as model_file:
        tar_bytes = model_file.read()

    with tarfile.open(fileobj=BytesIO(tar_bytes), mode='r:gz') as tar:
        with tarfile.open(args.output_file_path, 'w:gz') as out:
            for member in tar.getmembers():
                if member.name != 'config.json':
                    # Pass the ``TarInfo`` itself rather than looking the
                    # member up again by name.
                    out.addfile(member, tar.extractfile(member))
                    continue

                new_params = Params.from_file(args.config_file_path)
                serialized_params = json.dumps(new_params.as_ordered_dict(), indent=4).encode()
                # The member header must carry the size of the new payload.
                member.size = len(serialized_params)
                out.addfile(tarinfo=member, fileobj=BytesIO(serialized_params))
26 |
27 |
if __name__ == '__main__':
    # Command line entry point: the archive to read, the archive to write
    # and the replacement configuration file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'model_file_path',
        help='The path to the model.tar.gz with the config to replace')
    parser.add_argument(
        'output_file_path',
        help='The path to the new model.tar.gz')
    parser.add_argument(
        'config_file_path',
        help='The path to the new config file')
    main(parser.parse_args())
35 |
--------------------------------------------------------------------------------