├── .github └── workflows │ └── publish.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Readme.md ├── experiments └── generation │ ├── Readme.md │ ├── model.jsonnet │ ├── preprocess.py │ ├── setup.sh │ └── train.sh ├── qaeval ├── __init__.py ├── answer_selection.py ├── answering │ ├── __init__.py │ ├── model.py │ └── utils.py ├── generation │ ├── __init__.py │ ├── dataset_reader.py │ ├── model.py │ ├── predictor.py │ └── util.py ├── metric.py ├── scoring │ ├── __init__.py │ ├── lerc │ │ ├── __init__.py │ │ ├── lerc_dataset_reader.py │ │ ├── lerc_model.py │ │ ├── lerc_predictor.py │ │ └── pretrain_model.py │ └── scorers │ │ ├── __init__.py │ │ ├── em.py │ │ ├── f1.py │ │ ├── is_answered.py │ │ ├── lerc.py │ │ ├── meta.py │ │ └── scorer.py ├── tests │ ├── __init__.py │ ├── answer_selection_test.py │ ├── answering │ │ ├── __init__.py │ │ ├── model_test.py │ │ └── utils_test.py │ ├── fixtures │ │ └── multiling2011.jsonl │ ├── generation │ │ ├── __init__.py │ │ └── model_test.py │ ├── metric_test.py │ └── scoring │ │ ├── __init__.py │ │ └── scorers │ │ ├── __init__.py │ │ ├── em_test.py │ │ ├── f1_test.py │ │ ├── is_answered_test.py │ │ ├── lerc_test.py │ │ ├── meta_test.py │ │ └── scorer_test.py └── version.py ├── requirements.txt └── setup.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Publish 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | 
pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## Unreleased 8 | 9 | ## [v0.1.0](https://github.com/danieldeutsch/qaeval/releases/tag/0.1.0) - 2021-08-02 10 | ### Added 11 | - Added an end-to-end implementation of the metric 12 | 13 | ## [v0.0.9](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.9) - 2021-06-23 14 | ### Fixed 15 | - Added `edlib` to `setup.py` and `requirements.txt`. 
16 | 17 | ## [v0.0.8](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.8) - 2021-06-15 18 | ### Added 19 | - Added trying to fix the predicted character offsets using an alignment algorithm 20 | - Added an option to return the QA result as a dict 21 | 22 | ### Changed 23 | - Refactored the `Scorer` code to be cleaner 24 | 25 | ### Fixed 26 | - Specifying the spacy version to 2.2.4 in `setup.py` 27 | 28 | ## [v0.0.7](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.7) - 2021-05-07 29 | ### Added 30 | - Added returning approximate character offsets in the context for the QA model's prediction 31 | 32 | ## [v0.0.6](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.6) - 2021-05-06 33 | ### Added 34 | - Added "silent" options for the question generation and answering models 35 | 36 | ## [v0.0.5](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.5) - 2021-01-02 37 | ### Added 38 | - Added scoring predictions with [LERC](https://arxiv.org/abs/2010.03636) 39 | 40 | ### Changed 41 | - Changed the scoring interface with a breaking change 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # QAEval 2 | This repository contains the code for the QAEval metric from [Towards Question-Answering as an Automatic Metric for Evaluating the Content Quality of a Summary](http://arxiv.org/abs/2010.00490). 3 | We have included here only the minimal amount of code to run the metric, and it does not include the code to run the experiments in the paper. 4 | 5 | The easiest way to run the metric end-to-end is to use the wrapper implementation included in [SacreROUGE](https://github.com/danieldeutsch/sacrerouge/blob/master/doc/metrics/qaeval.md). 6 | 7 | The pretrained question generation model can be downloaded [here](https://drive.google.com/file/d/1vVhRgLtsQDAOmxYhY5PMPnxxHUyCOdQU/view?usp=sharing) and the pretrained question answering model can be downloaded [here](https://drive.google.com/file/d/1q2Z3FPP9AYNz0RJKHMlaweNhmLQoyPA8/view?usp=sharing). 
8 | 9 | ## Known Differences from Paper 10 | There are several known differences between the implementation here and the one we used for the experiments in the paper. 11 | 12 | - For the paper, we used a string equals and ROUGE-1 F1 with stemming to calculate the EM and F1 scores between the QA model's predicted answer and the ground-truth answer. 13 | This implementation uses the SQuAD EM/F1 implementations from the Transformers library. 14 | We made this decision to not create a dependency on ROUGE. 15 | 16 | - The AllenNLP version used here is 1.1.0. 17 | For the paper it was 1.0.0rc3. 18 | The 1.0.0rc3 version requires Transformers 2.8.0. 19 | After upgrading the AllenNLP version, we can now use Transformers 3.0.2, but this made the question-generation model used for the paper incompatible, so it had to be retrained. 20 | The retraining scripts are [here](experiments/generation/Readme.md). 21 | The required changes to the code for this were to pass `use_cache=False` to the BART call. 22 | 23 | ## Citation 24 | ``` 25 | @misc{deutsch2020questionanswering, 26 | title={Towards Question-Answering as an Automatic Metric for Evaluating the Content Quality of a Summary}, 27 | author={Daniel Deutsch and Tania Bedrax-Weiss and Dan Roth}, 28 | year={2020}, 29 | eprint={2010.00490}, 30 | archivePrefix={arXiv}, 31 | primaryClass={cs.CL} 32 | } 33 | ``` 34 | -------------------------------------------------------------------------------- /experiments/generation/Readme.md: -------------------------------------------------------------------------------- 1 | This directory contains the code to train the question-generation model. 2 | The original model used for the paper experiments could not be directly loaded by the code after updating the AllenNLP and Transformers packages, so we have retrained the model here. 
3 | 4 | To retrain the model, run: 5 | ``` 6 | sh experiments/generation/setup.sh 7 | sh experiments/generation/train.sh 8 | ``` -------------------------------------------------------------------------------- /experiments/generation/model.jsonnet: -------------------------------------------------------------------------------- 1 | local bert_model = "facebook/bart-large"; 2 | 3 | { 4 | "dataset_reader": { 5 | "type": "question_generation", 6 | "model_name": bert_model, 7 | "lazy": false 8 | }, 9 | "train_data_path": "experiments/generation/data/train.jsonl", 10 | "validation_data_path": "experiments/generation/data/valid.jsonl", 11 | "model": { 12 | "type": "question_generation", 13 | "model_name": bert_model, 14 | }, 15 | "data_loader": { 16 | "batch_sampler": { 17 | "type": "bucket", 18 | "batch_size": 16 19 | }, 20 | }, 21 | "trainer": { 22 | "checkpointer": { 23 | "num_serialized_models_to_keep": 1 24 | }, 25 | "num_epochs": 1, 26 | "cuda_device": 0, 27 | "grad_norm": 1, 28 | "optimizer": { 29 | "type": "huggingface_adamw", 30 | "lr": 3e-5, 31 | "betas": [0.9, 0.999], 32 | "eps": 1e-8, 33 | "correct_bias": true 34 | } 35 | }, 36 | "random_seed": 4, 37 | "numpy_seed": 5, 38 | "pytorch_seed": 6 39 | } -------------------------------------------------------------------------------- /experiments/generation/preprocess.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import sys 4 | 5 | 6 | def main(): 7 | with open(sys.argv[2], 'w') as out: 8 | with open(sys.argv[1], 'r') as f: 9 | reader = csv.reader(f, delimiter='\t') 10 | for i, row in enumerate(reader): 11 | if i == 0: 12 | continue 13 | 14 | question = row[2] 15 | answer = row[3] 16 | context = row[4] 17 | 18 | try: 19 | answer_start = context.lower().index(answer.lower()) 20 | answer_end = answer_start + len(answer) 21 | 22 | out.write(json.dumps({ 23 | 'context': context, 24 | 'answer': answer, 25 | 'answer_start': answer_start, 26 | 
'answer_end': answer_end, 27 | 'question': question 28 | }) + '\n') 29 | 30 | except ValueError: 31 | pass 32 | 33 | 34 | if __name__ == '__main__': 35 | main() -------------------------------------------------------------------------------- /experiments/generation/setup.sh: -------------------------------------------------------------------------------- 1 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 2 | 3 | # Download the data from the CodaLab worksheet 4 | mkdir -p ${DIR}/data 5 | wget https://worksheets.codalab.org/rest/bundles/0x36403eba6daf46acbc5729cb1680a001/contents/blob/ -O ${DIR}/data/train.tsv 6 | wget https://worksheets.codalab.org/rest/bundles/0x303c441ba0d04062a293a4b83c86af77/contents/blob/ -O ${DIR}/data/dev.tsv 7 | wget https://worksheets.codalab.org/rest/bundles/0x2a42519198824d9bbb60bbba0fe629b6/contents/blob/ -O ${DIR}/data/combined_neg_pos.tsv 8 | 9 | # Reformat 10 | python ${DIR}/preprocess.py ${DIR}/data/train.tsv ${DIR}/data/train.jsonl 11 | python ${DIR}/preprocess.py ${DIR}/data/dev.tsv ${DIR}/data/valid.jsonl 12 | python ${DIR}/preprocess.py ${DIR}/data/combined_neg_pos.tsv ${DIR}/data/combined_neg_pos.jsonl 13 | -------------------------------------------------------------------------------- /experiments/generation/train.sh: -------------------------------------------------------------------------------- 1 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 2 | 3 | rm -r ${DIR}/model 4 | 5 | allennlp train \ 6 | --include-package qaeval \ 7 | -s ${DIR}/model \ 8 | ${DIR}/model.jsonnet -------------------------------------------------------------------------------- /qaeval/__init__.py: -------------------------------------------------------------------------------- 1 | from qaeval.answer_selection import AnswerSelector 2 | from qaeval.answering.model import QuestionAnsweringModel 3 | from qaeval.generation.model import QuestionGenerationModel 4 | from qaeval.metric import QAEval 5 | from 
qaeval.version import VERSION as __version__ 6 | 7 | FIXTURES_ROOT = 'qaeval/tests/fixtures' 8 | -------------------------------------------------------------------------------- /qaeval/answer_selection.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from collections import namedtuple 3 | from spacy.tokens import Span 4 | from typing import List 5 | 6 | NP_CHUNKS_STRATEGY = 'np-chunks' 7 | MAX_NP_STRATEGY = 'max-np' 8 | NER_STRATEGY = 'ner' 9 | ALL_STRATEGY = 'all' 10 | STRATEGIES = [NP_CHUNKS_STRATEGY, MAX_NP_STRATEGY, NER_STRATEGY, ALL_STRATEGY] 11 | 12 | AnswerOffsets = namedtuple('Answer', ['start', 'end', 'sent_start', 'sent_end', 'text']) 13 | 14 | 15 | class AnswerSelector(object): 16 | def __init__(self, strategy: str): 17 | if strategy not in STRATEGIES: 18 | raise Exception(f'Unknown strategy: {strategy}') 19 | self.strategy = strategy 20 | self.nlp = spacy.load('en_core_web_sm') 21 | 22 | def _get_np_chunks_answers(self, sentence: Span) -> List[AnswerOffsets]: 23 | chunks = [] 24 | for chunk in sentence.noun_chunks: 25 | chunks.append(AnswerOffsets(chunk.start_char, chunk.end_char, sentence.start_char, sentence.end_char, str(chunk))) 26 | return chunks 27 | 28 | def _get_max_np_answers(self, sentence: Span) -> List[AnswerOffsets]: 29 | root = sentence.root 30 | nodes = [root] 31 | nps = [] 32 | 33 | while len(nodes) > 0: 34 | node = nodes.pop() 35 | 36 | # If the node is a noun, collect all of the tokens 37 | # which are descendants of this node 38 | recurse = True 39 | if node.pos_ in ['NOUN', 'PROPN']: 40 | min_index = node.i 41 | max_index = node.i 42 | stack = [node] 43 | while len(stack) > 0: 44 | current = stack.pop() 45 | min_index = min(min_index, current.i) 46 | max_index = max(max_index, current.i) 47 | for child in current.children: 48 | stack.append(child) 49 | 50 | sent_start_index = sentence[0].i 51 | 52 | # Because of parsing issues, we only take NPs if they are shorter than a given 
length 53 | num_tokens = max_index - min_index + 1 54 | if num_tokens <= 7: 55 | recurse = False 56 | span = sentence[min_index - sent_start_index:max_index + 1 - sent_start_index] 57 | nps.append(AnswerOffsets(span.start_char, span.end_char, sentence.start_char, sentence.end_char, str(span))) 58 | 59 | if recurse: 60 | # Otherwise, process all of this node's children 61 | for child in node.children: 62 | nodes.append(child) 63 | 64 | # Sort in order of appearance 65 | nps.sort(key=lambda offsets: offsets.start) 66 | return nps 67 | 68 | def _get_ner_answers(self, sentence: Span) -> List[AnswerOffsets]: 69 | ners = [] 70 | for entity in sentence.ents: 71 | if entity.label_ in ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'EVENT', 'WORK_OF_ART']: 72 | ners.append(AnswerOffsets(entity.start_char, entity.end_char, sentence.start_char, sentence.end_char, str(entity))) 73 | return ners 74 | 75 | def _get_all_answers(self, sentence: Span) -> List[AnswerOffsets]: 76 | answers = set() 77 | answers |= set(self._get_np_chunks_answers(sentence)) 78 | answers |= set(self._get_max_np_answers(sentence)) 79 | answers |= set(self._get_ner_answers(sentence)) 80 | 81 | # Sort in order of appearance 82 | answers = sorted(answers, key=lambda answer: (answer.start, answer.end)) 83 | return answers 84 | 85 | def select(self, text: str) -> List[AnswerOffsets]: 86 | """ 87 | Selects a list of noun phrases from the input `text. 
Each returned `AnswerOffsets` has: 88 | - `start`: the character index in `text` where the noun phrase starts 89 | - `end`: the *exclusive* character index in `text` where the noun phrase ends 90 | - `sent_start`: the character index in `text` where the sentence that this noun phrase 91 | is in starts 92 | - `sent_end`: the *exclusive* character index in `text` where the sentence that this 93 | noun phrase is in ends 94 | - `text`: the noun phrase as a string 95 | """ 96 | doc = self.nlp(text) 97 | answers = [] 98 | for sent in doc.sents: 99 | if self.strategy == NP_CHUNKS_STRATEGY: 100 | answers.extend(self._get_np_chunks_answers(sent)) 101 | elif self.strategy == MAX_NP_STRATEGY: 102 | answers.extend(self._get_max_np_answers(sent)) 103 | elif self.strategy == NER_STRATEGY: 104 | answers.extend(self._get_ner_answers(sent)) 105 | elif self.strategy == ALL_STRATEGY: 106 | answers.extend(self._get_all_answers(sent)) 107 | else: 108 | raise Exception(f'Unknown strategy: {self.strategy}') 109 | return answers 110 | 111 | def select_all(self, text_list: List[str]) -> List[List[AnswerOffsets]]: 112 | return [self.select(text) for text in text_list] -------------------------------------------------------------------------------- /qaeval/answering/__init__.py: -------------------------------------------------------------------------------- 1 | from qaeval.answering.model import QuestionAnsweringModel -------------------------------------------------------------------------------- /qaeval/answering/model.py: -------------------------------------------------------------------------------- 1 | # This file was edited from the run_squad.py file in the experiment repository 2 | import torch 3 | from torch.utils.data import DataLoader, SequentialSampler 4 | from tqdm import tqdm 5 | 6 | from transformers import ( 7 | AutoConfig, 8 | AutoModelForQuestionAnswering, 9 | AutoTokenizer, 10 | squad_convert_examples_to_features, 11 | ) 12 | from transformers.data.processors.squad import 
SquadResult, SquadExample 13 | 14 | from typing import Dict, List, Tuple, Union 15 | 16 | from qaeval.answering.utils import compute_predictions_logits_with_null, fix_answer_span, SpanFixError 17 | 18 | Prediction = Union[ 19 | Tuple[str, float, float], 20 | Tuple[str, float, float, Tuple[int, int]], 21 | Dict[str, Union[str, float, Tuple[int, int]]], 22 | ] 23 | 24 | 25 | class QuestionAnsweringModel(object): 26 | def __init__(self, 27 | model_dir: str, 28 | cuda_device: int = 0, 29 | batch_size: int = 8, 30 | silent: bool = True) -> None: 31 | self.config = AutoConfig.from_pretrained(model_dir) 32 | self.tokenizer = AutoTokenizer.from_pretrained(model_dir, do_lower_case=True) 33 | self.model = AutoModelForQuestionAnswering.from_pretrained(model_dir, config=self.config) 34 | if cuda_device >= 0: 35 | self.model.to(cuda_device) 36 | 37 | self.model_type = 'electra' 38 | self.cuda_device = cuda_device 39 | self.batch_size = batch_size 40 | self.max_seq_length = 384 41 | self.doc_stride = 128 42 | self.silent = silent 43 | 44 | def _to_list(self, tensor): 45 | return tensor.detach().cpu().tolist() 46 | 47 | def _try_fixing_offsets( 48 | self, 49 | contexts: List[str], 50 | predictions: Dict[str, str], 51 | offsets_dict: Dict[str, Tuple[int, int]], 52 | ) -> Dict[str, Tuple[int, int]]: 53 | """ 54 | Tries to fix the potentially noisy character offsets of the predictions in the `contexts`. 55 | The input and output end indices are exclusive. 
    # NOTE(review): the `def` line of this method fell outside the visible chunk;
    # the signature below is reconstructed from its only call site in `answer_all`
    # (`self._try_fixing_offsets(contexts, predictions, offsets)`) — confirm against the original.
    def _try_fixing_offsets(
        self,
        contexts: List[str],
        predictions: Dict[str, str],
        offsets_dict: Dict[str, Tuple[int, int]],
    ) -> Dict[str, Tuple[int, int]]:
        """
        Tries to repair each prediction's character offsets via `fix_answer_span` so that
        `context[start:end] == prediction`. Entries that cannot be fixed keep their original offsets.
        """
        new_offsets = {}

        for i, context in enumerate(contexts):
            index = str(i)

            prediction = predictions[index]
            pred_start, pred_end = offsets_dict[index]
            if context is None or prediction is None or pred_start is None or pred_end is None:
                # Nothing to fix (null prediction or missing offsets); pass the offsets through unchanged
                new_offsets[index] = (pred_start, pred_end)
            else:
                span = context[pred_start:pred_end]
                if span != prediction:
                    try:
                        pred_start, pred_end = fix_answer_span(prediction, span, pred_start, pred_end)
                    except SpanFixError:
                        # Best-effort: if the alignment fails, keep the original (noisy) offsets
                        pass
                new_offsets[index] = (pred_start, pred_end)
        return new_offsets

    def answer(
        self,
        question: str,
        context: str,
        return_offsets: bool = False,
        try_fixing_offsets: bool = True,
        return_dict: bool = False,
    ) -> Prediction:
        """
        Returns a tuple of (prediction, probability, null_probability). If `return_offsets = True`, the tuple
        will include rough character offsets of where the prediction is in the context. Because the tokenizer that
        the QA model uses does not support returning the character offsets from the BERT tokenization, we cannot
        directly provide exactly where the answer came from. However, the offsets should be pretty close to the
        prediction, and the prediction should be a substring of the offsets (modulo whitespace). If
        `return_offsets` and `try_fixing_offsets` are `True`, we will try to fix the character offsets via
        an alignment. See below.

        The `SquadExample` class maintains a list of whitespace separated tokens `doc_tokens` and a mapping
        from the context string characters to the token indices `char_to_word_offset`. Whitespace
        is included in the previous token. The `squad_convert_example_to_features` method takes each of these
        tokens and breaks it into the subtokens with the transformers tokenizer, which are passed into the model.
        It also keeps a mapping from the subtokens to the `doc_tokens` called `tok_to_orig_index`. The QA model
        predicts a span in the subtokens. In the `_get_char_offsets` method, we use these data structures to map
        from the subtoken span to character offsets. However, we cannot separate subtokens, so they are merged together.
        See the below example

            context: " My name is Dan!"
            doc_tokens: [My, name, is, Dan!]
            char_to_word_offset: [-1, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
            subtokens: [My, name, is, Dan, ##!]
            tok_to_orig_index: [0, 1, 2, 3, 3]

            prediction: "name is Dan"
            prediction subtokens: [name, is, Dan]
            prediction in doc_tokens: [name, is, Dan!]
            prediction in context: "name is Dan!"

        The prediction includes the extra whitespace between "is" and "Dan" as well as the "!"

        If `try_fixing_offsets=True`, we will try to fix the character offsets to be correct based on an alignment
        algorithm. We use the `edlib` python package to create a character alignment between the actual prediction
        string and the span given by the original offsets. We then update the offsets based on the alignment. If
        this procedure fails, the original offsets will be returned.
        """
        # Single-instance convenience wrapper around `answer_all`
        return self.answer_all(
            [(question, context)], return_offsets=return_offsets,
            try_fixing_offsets=try_fixing_offsets, return_dicts=return_dict
        )[0]

    def answer_all(
        self,
        input_data: List[Tuple[str, str]],
        return_offsets: bool = False,
        try_fixing_offsets: bool = True,
        return_dicts: bool = False,
    ) -> List[Prediction]:
        """
        Runs extractive QA for a batch of (question, context) pairs. See `answer()` for the meaning of the
        return values and the offset semantics. Results are returned in the same order as `input_data`.
        """
        # Convert all of the instances to squad examples. The list index doubles as the qas_id so
        # predictions can be re-aligned with the inputs afterwards.
        examples = []
        for i, (question, context) in enumerate(input_data):
            examples.append(SquadExample(
                qas_id=str(i),
                question_text=question,
                context_text=context,
                answer_text=None,
                start_position_character=None,
                title=None,
                is_impossible=True,
                answers=[]
            ))

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            doc_stride=self.doc_stride,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=1,
            tqdm_enabled=not self.silent
        )

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.batch_size)

        self.model.eval()
        all_results = []
        generator = eval_dataloader
        if not self.silent:
            generator = tqdm(generator, desc='Evaluating')

        for batch in generator:
            if self.cuda_device >= 0:
                batch = tuple(t.to(self.cuda_device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                feature_indices = batch[3]
                outputs = self.model(**inputs)

            # Collect per-feature start/end logits as SquadResults keyed by unique_id
            for i, feature_index in enumerate(feature_indices):
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [self._to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        # Positional args: n_best_size=20, max_answer_length=30, do_lower_case=True,
        # verbose_logging=False, version_2_with_negative=True
        model_predictions = compute_predictions_logits_with_null(
            self.tokenizer,
            examples,
            features,
            all_results,
            20,
            30,
            True,
            False,
            True,
            return_offsets=return_offsets
        )

        if return_offsets:
            predictions, prediction_probs, no_answer_probs, offsets = model_predictions
            if try_fixing_offsets:
                contexts = [context for _, context in input_data]
                offsets = self._try_fixing_offsets(contexts, predictions, offsets)
        else:
            predictions, prediction_probs, no_answer_probs = model_predictions

        # Re-assemble the per-input results; dictionaries are keyed by the string qas_id
        results = []
        for i in range(len(input_data)):
            i = str(i)
            r = (predictions[i], prediction_probs[i], no_answer_probs[i])
            if return_dicts:
                r = {
                    'prediction': r[0],
                    'probability': r[1],
                    'null_probability': r[2],
                }

            if return_offsets:
                if return_dicts:
                    r['start'] = offsets[i][0]
                    r['end'] = offsets[i][1]
                else:
                    r = r + (offsets[i],)
            results.append(r)
        return results


# ---------------------------------------------------------------------------
# qaeval/answering/utils.py
# ---------------------------------------------------------------------------

import collections
import edlib
from typing import Tuple

from transformers.data.metrics.squad_metrics import (
    get_final_text,
    _get_best_indexes,
    _compute_softmax,
)


def _is_whitespace(c):
    # Mirrors the whitespace test used by the transformers SQuAD preprocessing
    # (0x202F is a narrow no-break space)
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False


def _get_char_offsets(example, pred_start, pred_end):
    """
    Maps a predicted (start, end) span over `example.doc_tokens` to character offsets in
    `example.context_text`. The returned end index will be exclusive.
    """
    if pred_start is None or pred_end is None:
        # This could happen if there's an edge case with no valid predictions. See where the prediction is "empty"
        return None, None

    # First/last character index of each whitespace-delimited token
    token_to_char_start = {}
    token_to_char_end = {}
    for char_index, token_index in enumerate(example.char_to_word_offset):
        if token_index not in token_to_char_start:
            token_to_char_start[token_index] = char_index
        token_to_char_end[token_index] = char_index

    # Any whitespace after the token is included in that token. Find the last non-whitespace character
    for token_index, end in token_to_char_end.items():
        if token_index == -1:
            # Whitespace at the beginning is mapped to token -1. We don't care about it
            continue
        while _is_whitespace(example.context_text[end]):
            end -= 1
            if end < 0:
                break
        if end < 0:
            raise Exception(f'Token end is less than 0.')
        token_to_char_end[token_index] = end + 1  # exclusive
    return token_to_char_start[pred_start], token_to_char_end[pred_end]
def compute_predictions_logits_with_null(
    tokenizer,
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    do_lower_case,
    verbose_logging,
    version_2_with_negative,
    return_offsets = False
):
    """
    Variant of transformers' `compute_predictions_logits` that, for `version_2_with_negative`,
    always predicts the best NON-null text and additionally returns the probability of that
    prediction and of the null answer (and, optionally, character offsets of the prediction).

    Returns `(all_predictions, all_probs, null_scores)` — all OrderedDicts keyed by `qas_id` —
    plus `offsets` when `return_offsets` is True.
    """
    # Group the sliding-window features by their source example
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
    )

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()
    all_probs = collections.OrderedDict()
    null_scores = collections.OrderedDict()
    offsets = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if version_2_with_negative:
                feature_null_score = result.start_logits[0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index],
                        )
                    )
        if version_2_with_negative:
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit,
                )
            )
        prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)

        # doc_start/doc_end (token indices into example.doc_tokens) are carried along so the
        # character offsets of the final prediction can be recovered later
        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit", "doc_start", "doc_end"]
        )

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index: (pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start: (orig_doc_end + 1)]

                tok_text = tokenizer.convert_tokens_to_string(tok_tokens)

                # tok_text = " ".join(tok_tokens)
                #
                # # De-tokenize WordPieces that have been split off.
                # tok_text = tok_text.replace(" ##", "")
                # tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                orig_doc_start = None
                orig_doc_end = None
                seen_predictions[final_text] = True

            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
                                          doc_start=orig_doc_start, doc_end=orig_doc_end))
        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit,
                                              doc_start=None, doc_end=None))

            # In very rare edge cases we could only have single null prediction.
            # So we just create a nonce prediction in this case to avoid failure.
            if len(nbest) == 1:
                nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0, doc_start=None,
                                                 doc_end=None))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0, doc_start=None,
                                          doc_end=None))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        best_non_null_entry_index = None
        for i, entry in enumerate(nbest):
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry
                    best_non_null_entry_index = i

        probs = _compute_softmax(total_scores)

        nbest_json = []
        null_prob = None
        best_prob = None
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            if entry.text == '':
                null_prob = probs[i]
            if i == best_non_null_entry_index:
                best_prob = probs[i]
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not version_2_with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # Always predict the best non-null text (the null probability is reported separately
            # instead of applying a score-difference threshold as the stock HF implementation does)
            all_predictions[example.qas_id] = best_non_null_entry.text
            all_probs[example.qas_id] = best_prob
            null_scores[example.qas_id] = null_prob
            offsets[example.qas_id] = _get_char_offsets(example, best_non_null_entry.doc_start,
                                                        best_non_null_entry.doc_end)

            # # predict "" iff the null score - the score of best non-null > threshold
            # score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
            # scores_diff_json[example.qas_id] = score_diff
            # if score_diff > null_score_diff_threshold:
            #     all_predictions[example.qas_id] = ""
            # else:
            #     all_predictions[example.qas_id] = best_non_null_entry.text
        all_nbest_json[example.qas_id] = nbest_json

    output = (all_predictions, all_probs, null_scores)
    if return_offsets:
        output = output + (offsets,)
    return output


class SpanFixError(Exception):
    # Raised by fix_answer_span when the prediction cannot be aligned to the document span
    pass


def fix_answer_span(prediction: str, document_span: str, start: int, end: int) -> Tuple[int, int]:
    """
    Tries to fix the answer span of the prediction, which may include some extra whitespace or special characters.

    # Parameters
    - `prediction`: the string output by the QA model
    - `document_span`: the span in the text given by the maybe noisy offsets. See `QuestionAnsweringModel.answer()`
      documentation for more information
    - `start`: the start character offset of `document_span` in the original text
    - `end`: the *exclusive* end character offset of the `document_span` in the original text

    # Returns
    The `start` and *exclusive* `end` character offsets of fixed character offsets of `prediction` in the
    original text.

    # Raises
    `SpanFixError` if the prediction is longer than the span or the alignment is ambiguous.
    """

    if len(prediction) > len(document_span):
        raise SpanFixError(f'Unexpected lengths: "{prediction}", "{document_span}"')

    # 'HW' (infix) mode aligns `prediction` anywhere inside `document_span`;
    # `locations` holds (start, inclusive-end) pairs of the best alignment(s)
    alignment = edlib.align(prediction, document_span, mode='HW', task='path')
    locations = alignment['locations']
    if len(locations) != 1:
        raise SpanFixError(f'Unable to compute span: "{prediction}", "{document_span}"')
    align_start, align_end = locations[0]

    # Shift the original offsets to the aligned sub-span; edlib's end is inclusive, ours exclusive
    start += align_start
    end -= len(document_span) - align_end
    end += 1
    return start, end
# ---------------------------------------------------------------------------
# qaeval/generation/dataset_reader.py
# ---------------------------------------------------------------------------

import json
from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import MetadataField, TextField
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from overrides import overrides
from typing import Any, Dict, Iterable, Optional

from qaeval.generation.util import SPAN_START_TOKEN, SPAN_END_TOKEN


@DatasetReader.register('question_generation')
class QuestionGenerationDatasetReader(DatasetReader):
    """
    Reads JSONL instances for question generation. Each line must contain `context`,
    `answer_start`, and `answer_end` (exclusive), and may contain `question` and `metadata`.
    The answer span is marked in the context with special start/end tokens before tokenization.
    """

    def __init__(self,
                 model_name: str,
                 lazy: bool = False):
        super().__init__(lazy=lazy)
        self.tokenizer = PretrainedTransformerTokenizer(model_name)
        self.token_indexers = {'tokens': PretrainedTransformerIndexer(model_name, namespace='tokens')}

        # Add the tokens which will mark the answer span
        self.tokenizer.tokenizer.add_tokens([SPAN_START_TOKEN, SPAN_END_TOKEN])

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line)
                context = data['context']
                start = data['answer_start']
                end = data['answer_end']
                # `question` is optional (absent at prediction time)
                question = data.pop('question', None)
                metadata = data.pop('metadata', {})
                yield self.text_to_instance(context, start, end, question, metadata)

    def _insert_span_symbols(self, context: str, start: int, end: int) -> str:
        # Wrap the answer span with the special marker tokens (extra spaces keep the
        # markers as separate tokens)
        return f'{context[:start]}{SPAN_START_TOKEN} {context[start:end]} {SPAN_END_TOKEN}{context[end:]}'

    @overrides
    def text_to_instance(self,
                         context: str,
                         start: int,
                         end: int,
                         question: Optional[str] = None,
                         metadata: Dict[str, Any] = None) -> Instance:
        """
        Creates an Instance. `start` and `end` should be the character offsets in `context` of the answer.
        `end` should be exclusive.
        """
        fields = {}
        metadata = metadata or {}

        answer = context[start:end]
        marked_context = self._insert_span_symbols(context, start, end)
        source_tokens = self.tokenizer.tokenize(marked_context)
        fields['source_tokens'] = TextField(source_tokens, self.token_indexers)
        metadata['answer'] = answer
        metadata['answer_start'] = start
        metadata['answer_end'] = end
        metadata['context'] = context
        metadata['marked_context'] = marked_context
        metadata['source_tokens'] = source_tokens

        if question is not None:
            # Only present when training/evaluating with gold questions
            target_tokens = self.tokenizer.tokenize(question)
            fields['target_tokens'] = TextField(target_tokens, self.token_indexers)
            metadata['question'] = question
            metadata['target_tokens'] = target_tokens

        fields['metadata'] = MetadataField(metadata)
        return Instance(fields)


# ---------------------------------------------------------------------------
# qaeval/generation/model.py
# ---------------------------------------------------------------------------

# Implementation largely based on https://github.com/allenai/allennlp-models/pull/35/

import math
import torch
import torch.nn.functional as F
from allennlp.data import Vocabulary
from allennlp.data.fields.text_field import TextFieldTensors
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.models import Model
from allennlp.nn.beam_search import BeamSearch
from allennlp.nn.util import sequence_cross_entropy_with_logits
from allennlp.predictors import Predictor
from overrides import overrides
from transformers import BartForConditionalGeneration
from tqdm import tqdm
from typing import Any, Dict, List, Tuple

# Dataset reader and predictor imports are necessary to find the classes when the
# predictor is loaded from the archive
from qaeval.generation.dataset_reader import QuestionGenerationDatasetReader
from qaeval.generation.predictor import QuestionGenerationPredictor
from qaeval.generation.util import ALL_SPECIAL_TOKENS


class QuestionGenerationModel(object):
    """
    Thin inference wrapper around an archived AllenNLP question-generation predictor.
    Inputs are (text, answer_start, answer_end) triples; outputs are generated questions.
    """

    def __init__(self,
                 model_path: str,
                 cuda_device: int = 0,
                 batch_size: int = 8,
                 silent: bool = True):
        self.predictor = Predictor.from_path(model_path, predictor_name='question_generation', cuda_device=cuda_device)
        self.batch_size = batch_size
        self.silent = silent

    def generate(self, text: str, start: int, end: int) -> str:
        # Single-instance convenience wrapper around `generate_all`
        return self.generate_all([(text, start, end)])[0]

    def generate_all(self, inputs: List[Tuple[str, int, int]]) -> List[str]:
        """
        Generates a question for each input. The input is a list of tuples of the text and start and ending character
        offsets of the answer. The ending character offset should be exclusive.
        """
        input_jsons = []
        for text, start, end in inputs:
            input_jsons.append({
                'text': text,
                'start': start,
                'end': end
            })
        outputs = []
        num_batches = int(math.ceil(len(input_jsons) / self.batch_size))

        generator = range(0, len(input_jsons), self.batch_size)
        if not self.silent:
            generator = tqdm(generator, total=num_batches, desc='Generating questions')

        for i in generator:
            batch = input_jsons[i:i + self.batch_size]
            outputs.extend(self.predictor.predict_batch_json(batch))
        assert len(input_jsons) == len(outputs)
        return [output['predicted_question'] for output in outputs]


@Model.register('question_generation')
class _QuestionGenerationModel(Model):
    """
    BART-based sequence-to-sequence model for question generation. Training computes a
    label-smoothed cross-entropy loss; inference runs AllenNLP beam search over BART's decoder.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 model_name: str,
                 max_decoding_steps: int = 100,
                 beam_size: int = 4) -> None:
        super().__init__(vocab)
        self.bart = BartForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = PretrainedTransformerTokenizer(model_name)

        # Increase the size of Bart's vocabulary to account for the new special
        # tokens that were added. Method found from https://github.com/huggingface/transformers/issues/3446
        # comment on June 12.
        vocab_size = self.bart.config.vocab_size
        self.bart.resize_token_embeddings(vocab_size + len(ALL_SPECIAL_TOKENS))

        self._start_id = self.bart.config.bos_token_id
        self._end_id = self.bart.config.eos_token_id
        self._pad_id = self.bart.config.pad_token_id

        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(
            self._end_id, max_steps=max_decoding_steps, beam_size=beam_size or 1
        )

    @overrides
    def forward(self,
                source_tokens: TextFieldTensors,
                metadata: List[Dict[str, Any]],
                target_tokens: TextFieldTensors = None) -> Dict[str, Any]:
        source_ids = source_tokens['tokens']['token_ids']
        source_mask = source_tokens['tokens']['mask']

        output_dict = {'metadata': metadata}
        if target_tokens is not None:
            # Calculate loss (teacher forcing: decoder sees the target shifted right by one)
            target_ids = target_tokens['tokens']['token_ids']
            target_mask = target_tokens['tokens']['mask']

            logits = self.bart(
                input_ids=source_ids,
                attention_mask=source_mask,
                decoder_input_ids=target_ids[:, :-1].contiguous(),
                decoder_attention_mask=target_mask[:, :-1].contiguous(),
                use_cache=False
            )[0]

            # The BART paper mentions label smoothing of 0.1 for sequence generation tasks
            loss = sequence_cross_entropy_with_logits(
                logits,
                target_ids[:, 1:].contiguous(),
                target_mask[:, 1:].contiguous(),
                label_smoothing=0.1,
                average='token'
            )
            output_dict['loss'] = loss

        if not self.training:
            # Run inference: This differs from the original code which
            # includes the decoder_start_id
            initial_decoder_ids = torch.tensor(
                [[self._start_id]],
                dtype=source_ids.dtype,
                device=source_ids.device,
            ).repeat(source_ids.shape[0], 1)

            # NOTE(review): 'inital_state' is a (pre-existing) misspelling of 'initial_state';
            # left unchanged here since this is a documentation-only pass
            inital_state = {
                "input_ids": source_ids,
                "input_mask": source_mask,
                "encoder_states": None,
            }
            beam_result = self._beam_search.search(
                initial_decoder_ids, inital_state, self.take_step
            )

            # Keep only the highest-scoring beam per instance
            predictions = beam_result[0]
            max_pred_indices = (
                beam_result[1].argmax(dim=-1).view(-1, 1, 1).expand(-1, -1, predictions.shape[-1])
            )
            predictions = predictions.gather(dim=1, index=max_pred_indices).squeeze(dim=1)

            output_dict["predicted_ids"] = predictions
            output_dict["log_probabilities"] = (
                beam_result[1].gather(dim=-1, index=max_pred_indices[..., 0]).squeeze(dim=-1)
            )

            self.make_output_human_readable(output_dict)

        return output_dict

    @staticmethod
    def _decoder_cache_to_dict(decoder_cache):
        # Flattens BART's nested decoder cache (list of per-layer dicts of dicts) into a flat
        # dict keyed by (layer, attention_name, tensor_name) so it can live in the beam-search state
        cache_dict = {}
        for layer_index, layer_cache in enumerate(decoder_cache):
            for attention_name, attention_cache in layer_cache.items():
                for tensor_name, cache_value in attention_cache.items():
                    key = (layer_index, attention_name, tensor_name)
                    cache_dict[key] = cache_value
        return cache_dict

    @staticmethod
    def _dict_to_decoder_cache(cache_dict):
        # Inverse of `_decoder_cache_to_dict`: rebuilds the nested per-layer cache structure
        decoder_cache = []
        for key, cache_value in cache_dict.items():
            # Split key and extract index and dict keys
            layer_idx, attention_name, tensor_name = key
            # Extend decoder_cache to fit layer_idx + 1 layers
            decoder_cache = decoder_cache + [{} for _ in range(layer_idx + 1 - len(decoder_cache))]
            cache = decoder_cache[layer_idx]
            if attention_name not in cache:
                cache[attention_name] = {}
            assert tensor_name not in cache[attention_name]
            cache[attention_name][tensor_name] = cache_value
        return decoder_cache

    def take_step(
        self, last_predictions: torch.Tensor, state: Dict[str, torch.Tensor], step: int
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """
        One step of AllenNLP beam search: given the last predicted token ids and the carried
        state (encoder outputs + flattened decoder cache), returns next-token log-probabilities
        and the updated state.
        """
        if len(last_predictions.shape) == 1:
            last_predictions = last_predictions.unsqueeze(-1)

        # Only the last predictions are needed for the decoder, but we need to pad the decoder ids
        # to not mess up the positional embeddings in the decoder.
        padding_size = 0
        if step > 0:
            padding_size = step + 1
            padding = torch.full(
                (last_predictions.shape[0], padding_size),
                self._pad_id,
                dtype=last_predictions.dtype,
                device=last_predictions.device,
            )
            last_predictions = torch.cat([padding, last_predictions], dim=-1)

        decoder_cache = None
        decoder_cache_dict = {
            k: (state[k].contiguous() if state[k] is not None else None)
            for k in state
            if k not in {"input_ids", "input_mask", "encoder_states"}
        }
        if len(decoder_cache_dict) != 0:
            decoder_cache = self._dict_to_decoder_cache(decoder_cache_dict)

        log_probabilities = None
        for i in range(padding_size, last_predictions.shape[1]):
            encoder_outputs = (
                (state["encoder_states"],) if state["encoder_states"] is not None else None
            )
            outputs = self.bart(
                input_ids=state["input_ids"],
                attention_mask=state["input_mask"],
                encoder_outputs=encoder_outputs,
                decoder_input_ids=last_predictions[:, : i + 1],
                decoder_cached_states=decoder_cache,
                generation_mode=True,
                use_cache=True,
            )

            decoder_log_probabilities = F.log_softmax(outputs[0][:, 0], dim=-1)

            if log_probabilities is None:
                log_probabilities = decoder_log_probabilities
            else:
                # Accumulate the log-probability of the path taken so far
                idx = last_predictions[:, i].view(-1, 1)
                log_probabilities = decoder_log_probabilities + log_probabilities.gather(
                    dim=-1, index=idx
                )

            decoder_cache = outputs[1][1]

            state["encoder_states"] = outputs[2]

        if decoder_cache is not None:
            decoder_cache_dict = self._decoder_cache_to_dict(decoder_cache)
            state.update(decoder_cache_dict)

        return log_probabilities, state

    @overrides
    def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, Any]:
        """
        # Parameters
        output_dict : `Dict[str, torch.Tensor]`
            A dictionary containing a batch of predictions with key `predicted_ids`. The tensor should have
            shape `(batch_size, max_sequence_length)`
        # Returns
        Dict[str, Any]
            Original `output_dict` with an additional `predicted_question` key that maps to a list of
            decoded question strings.
        """
        predicted_ids = output_dict["predicted_ids"]
        predictions = []
        for i in range(predicted_ids.shape[0]):
            token_ids = predicted_ids[i].tolist()
            # Strip trailing end-of-sequence tokens before decoding
            while len(token_ids) > 0 and token_ids[-1] == self._end_id:
                token_ids.pop()
            predictions.append(self.tokenizer.tokenizer.decode(token_ids).strip())
        output_dict["predicted_question"] = predictions

        return output_dict


# ---------------------------------------------------------------------------
# qaeval/generation/predictor.py
# ---------------------------------------------------------------------------

import json
from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors import Predictor
from overrides import overrides


@Predictor.register('question_generation')
class QuestionGenerationPredictor(Predictor):
    """Predictor that turns {'text', 'start', 'end'} JSON into question-generation Instances."""

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        return self._dataset_reader.text_to_instance(context=json_dict['text'],
                                                     start=json_dict['start'],
                                                     end=json_dict['end'])

    @overrides
    def dump_line(self, outputs: JsonDict) -> str:
        # NOTE(review): relies on 'input_dict' being present in the instance metadata;
        # the dataset reader shown here does not set that key — confirm against the caller
        input_dict = outputs['metadata']['input_dict']
        input_dict['question'] = outputs['predicted_question']
        return json.dumps(input_dict) + '\n'
-------------------------------------------------------------------------------- /qaeval/generation/util.py: -------------------------------------------------------------------------------- 1 | SPAN_START_TOKEN = '' 2 | SPAN_END_TOKEN = '' 3 | 4 | ALL_SPECIAL_TOKENS = [SPAN_START_TOKEN, SPAN_END_TOKEN] -------------------------------------------------------------------------------- /qaeval/metric.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | from tqdm import tqdm 4 | from typing import Any, Dict, List, Tuple, Union 5 | 6 | from qaeval import AnswerSelector, QuestionAnsweringModel, QuestionGenerationModel 7 | from qaeval.answer_selection import NP_CHUNKS_STRATEGY 8 | from qaeval.scoring.scorers import ( 9 | ExactMatchScorer, 10 | F1Scorer, 11 | IsAnsweredScorer, 12 | LERCScorer, 13 | MetaScorer, 14 | ) 15 | 16 | MetricsDict = Dict[str, float] 17 | SummaryType = Union[str, List[str]] 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class QAEval(object): 23 | def __init__( 24 | self, 25 | generation_model_path: str, 26 | answering_model_dir: str, 27 | answer_selection_strategy: str = NP_CHUNKS_STRATEGY, 28 | cuda_device: int = 0, 29 | generation_batch_size: int = 8, 30 | answering_batch_size: int = 8, 31 | use_lerc: bool = False, 32 | lerc_model_path: str = None, 33 | lerc_pretrained_model_path: str = None, 34 | lerc_batch_size: int = 8, 35 | verbose: bool = False, 36 | ) -> None: 37 | self.answer_selector = AnswerSelector(answer_selection_strategy) 38 | self.question_generator = QuestionGenerationModel( 39 | generation_model_path, 40 | cuda_device=cuda_device, 41 | batch_size=generation_batch_size, 42 | silent=not verbose, 43 | ) 44 | self.question_answerer = QuestionAnsweringModel( 45 | answering_model_dir, 46 | cuda_device=cuda_device, 47 | batch_size=answering_batch_size, 48 | silent=not verbose, 49 | ) 50 | self.verbose = verbose 51 | 52 | scorers = [IsAnsweredScorer(), 
ExactMatchScorer(), F1Scorer()] 53 | if use_lerc: 54 | if lerc_model_path is None or lerc_pretrained_model_path is None: 55 | raise Exception( 56 | f"If `use_lerc` is `True`, `lerc_model_path` and `lerc_pretrained_model_path` must not be `None`" 57 | ) 58 | scorers.append( 59 | LERCScorer( 60 | lerc_model_path, 61 | lerc_pretrained_model_path, 62 | cuda_device, 63 | lerc_batch_size, 64 | ) 65 | ) 66 | self.scorer = MetaScorer(scorers) 67 | 68 | def _flatten_summaries(self, summaries: List[SummaryType]) -> List[str]: 69 | flat_summaries = [] 70 | for summary in summaries: 71 | if isinstance(summary, list): 72 | summary = " ".join(summary) 73 | flat_summaries.append(summary) 74 | return flat_summaries 75 | 76 | def _flatten_references_list( 77 | self, references_list: List[List[SummaryType]] 78 | ) -> List[List[str]]: 79 | # Flattens all of the summaries so they are `str` instead of `List[str]` 80 | flat_references_list = [] 81 | for references in references_list: 82 | flat_references_list.append([]) 83 | for reference in references: 84 | if isinstance(reference, list): 85 | reference = " ".join(reference) 86 | flat_references_list[-1].append(reference) 87 | return flat_references_list 88 | 89 | def _get_empty_summary_mask( 90 | self, summaries: List[str], references_list: List[List[str]] 91 | ) -> Tuple[List[str], List[List[str]], List[bool]]: 92 | # This will identify any summaries that have empty text. 
The output will be the list of non-empty summaries 93 | # with their corresponding references plus a list of booleans that is parallel will the input `summaries` 94 | # which mark whether or not they are empty 95 | is_empty_list = [] 96 | non_empty_summaries = [] 97 | non_empty_references_list = [] 98 | 99 | for summary, references in zip(summaries, references_list): 100 | if len(summary.strip()) > 0: 101 | is_empty_list.append(False) 102 | non_empty_summaries.append(summary) 103 | non_empty_references_list.append(references) 104 | else: 105 | is_empty_list.append(True) 106 | return non_empty_summaries, non_empty_references_list, is_empty_list 107 | 108 | def _get_question_id( 109 | self, instance_index: int, reference_index: int, start: int, end: int 110 | ) -> str: 111 | m = hashlib.md5() 112 | m.update(str(instance_index).encode()) 113 | m.update(str(reference_index).encode()) 114 | m.update(str(start).encode()) 115 | m.update(str(end).encode()) 116 | return m.hexdigest() 117 | 118 | def _generate_qa_pairs( 119 | self, references_list: List[List[str]] 120 | ) -> List[List[List[Dict[str, Any]]]]: 121 | # This will generate the question-answer pairs for each reference. Since references may be repeated, 122 | # we first deduplicate the references to minimize the expensive work. 
123 | # 124 | # `reference_to_index` keeps track of where each of the unique references are in `distinct_references_list` 125 | reference_to_index = {} 126 | distinct_references_list = [] 127 | 128 | # Maps from (i, j) to the index in the `distinct_references_list` 129 | mapping = {} 130 | for i, references in enumerate(references_list): 131 | for j, reference in enumerate(references): 132 | if reference not in reference_to_index: 133 | reference_to_index[reference] = len(distinct_references_list) 134 | distinct_references_list.append(reference) 135 | mapping[(i, j)] = reference_to_index[reference] 136 | 137 | # Select the answers 138 | logger.info( 139 | f"Selecting answers from {len(distinct_references_list)} distinct summaries" 140 | ) 141 | answers_list = self.answer_selector.select_all(distinct_references_list) 142 | num_answers = sum(len(answers) for answers in answers_list) 143 | logger.info(f"Selected {num_answers} answers in total") 144 | 145 | # Generate the questions 146 | generation_inputs = [] 147 | for reference, answers in zip(distinct_references_list, answers_list): 148 | for answer in answers: 149 | sentence = reference[answer.sent_start : answer.sent_end] 150 | start = answer.start - answer.sent_start 151 | end = answer.end - answer.sent_start 152 | generation_inputs.append((sentence, start, end)) 153 | 154 | logger.info(f"Generating questions for {len(generation_inputs)} answers") 155 | question_list = self.question_generator.generate_all(generation_inputs) 156 | logger.info("Finished generating questions") 157 | 158 | # Remap the questions to align with the answers 159 | index = 0 160 | remapped_questions = [] 161 | for i, answers in enumerate(answers_list): 162 | remapped_questions.append([]) 163 | for _ in answers: 164 | remapped_questions[-1].append(question_list[index]) 165 | index += 1 166 | assert len(remapped_questions[i]) == len(answers_list[i]) 167 | assert len(remapped_questions) == len(answers_list) 168 | 169 | # Remap output to align 
with the inputs 170 | # qa_pairs_lists[summary_index][reference_index] = [(q, a)] 171 | qa_pairs_lists = [] 172 | for i, references in enumerate(references_list): 173 | qa_pairs_lists.append([]) 174 | for j, reference in enumerate(references): 175 | index = mapping[(i, j)] 176 | qa_pairs_lists[-1].append([]) 177 | for question, answer in zip( 178 | remapped_questions[index], answers_list[index] 179 | ): 180 | question_id = self._get_question_id(i, j, answer.start, answer.end) 181 | qa_pairs_lists[-1][-1].append( 182 | { 183 | "question_id": question_id, 184 | "question": question, 185 | "answer": answer.text, 186 | "sent_start": answer.sent_start, 187 | "sent_end": answer.sent_end, 188 | "answer_start": answer.start, 189 | "answer_end": answer.end, 190 | } 191 | ) 192 | return qa_pairs_lists 193 | 194 | def _get_prediction_id(self, prediction_index: int): 195 | m = hashlib.md5() 196 | m.update(str(prediction_index).encode()) 197 | return m.hexdigest() 198 | 199 | def _answer_questions( 200 | self, summaries: List[str], qa_pairs_lists: List[List[List[Dict[str, Any]]]] 201 | ) -> List[List[List[Dict[str, Any]]]]: 202 | # Answers all of the questions. Some of the (question, context) pairs may be duplicates, for instance because 203 | # of jackknifing. It will be a lot faster to deduplicate them first. 
204 | # 205 | # `qa_inputs` will contain the unique inputs, `context_to_input_index` maps from the (question, context) pair 206 | # to its index in `qa_inputs`, and `mapping` will map from the i-th summary, j-th reference, and k-th question 207 | # to the index of the corresponding data in `qa_inputs` 208 | qa_inputs = [] 209 | context_to_input_index = {} 210 | mapping = {} 211 | 212 | for i, (summary, qa_pairs_list) in enumerate(zip(summaries, qa_pairs_lists)): 213 | for j, qa_pairs in enumerate(qa_pairs_list): 214 | for k, qa in enumerate(qa_pairs): 215 | question = qa["question"] 216 | key = (question, summary) 217 | if key not in context_to_input_index: 218 | context_to_input_index[key] = len(qa_inputs) 219 | qa_inputs.append(key) 220 | mapping[(i, j, k)] = context_to_input_index[key] 221 | 222 | logger.info(f"Answering {len(qa_inputs)} distinct (question, context) pairs") 223 | predictions = self.question_answerer.answer_all(qa_inputs, return_offsets=True) 224 | logger.info("Finished answering questions") 225 | 226 | # Remap from the distinct answers back to the original QA lists 227 | predictions_lists = [] 228 | for i, (summary, qa_pairs_list) in enumerate(zip(summaries, qa_pairs_lists)): 229 | predictions_lists.append([]) 230 | for j, qa_pairs in enumerate(qa_pairs_list): 231 | predictions_lists[-1].append([]) 232 | for k, qa in enumerate(qa_pairs): 233 | index = mapping[(i, j, k)] 234 | prediction, probability, null_probability, offsets = predictions[ 235 | index 236 | ] 237 | predictions_lists[-1][-1].append( 238 | { 239 | "prediction_id": self._get_prediction_id(index), 240 | "prediction": prediction, 241 | "probability": probability, 242 | "null_probability": null_probability, 243 | "start": offsets[0], 244 | "end": offsets[1], 245 | } 246 | ) 247 | return predictions_lists 248 | 249 | def _score_predictions( 250 | self, 251 | summaries: List[str], 252 | qa_pairs_lists: List[List[List[Dict[str, Any]]]], 253 | predictions_lists: List[List[List[Dict[str, 
Any]]]], 254 | ) -> Tuple[List[MetricsDict], List[List[List[Dict[str, float]]]]]: 255 | logger.info("Scoring predictions") 256 | metrics_list = [] 257 | scores_list = [] 258 | 259 | generator = tqdm( 260 | zip(summaries, qa_pairs_lists, predictions_lists), 261 | total=len(summaries), 262 | disable=not self.verbose, 263 | ) 264 | for summary, qa_pairs_list, predictions_list in generator: 265 | # This is for 1 (summary, references) pair 266 | input_questions_list = [] 267 | input_answers_list = [] 268 | input_predictions_list = [] 269 | input_probabilities_list = [] 270 | input_null_probabilities_list = [] 271 | for qa_pairs, predictions in zip(qa_pairs_list, predictions_list): 272 | # This is the set of QA pairs for 1 reference 273 | input_questions_list.append([]) 274 | input_answers_list.append([]) 275 | input_predictions_list.append([]) 276 | input_probabilities_list.append([]) 277 | input_null_probabilities_list.append([]) 278 | for qa, prediction in zip(qa_pairs, predictions): 279 | input_questions_list[-1].append(qa["question"]) 280 | input_answers_list[-1].append(qa["answer"]) 281 | input_predictions_list[-1].append(prediction["prediction"]) 282 | input_probabilities_list[-1].append(prediction["probability"]) 283 | input_null_probabilities_list[-1].append( 284 | prediction["null_probability"] 285 | ) 286 | 287 | metrics, scores = self.scorer.score_multi_ref( 288 | summary, 289 | input_questions_list, 290 | input_answers_list, 291 | input_predictions_list, 292 | input_probabilities_list, 293 | input_null_probabilities_list, 294 | ) 295 | metrics = {"qa-eval": metrics} 296 | metrics_list.append(metrics) 297 | scores_list.append(scores) 298 | 299 | logger.info("Finished scoring predictions") 300 | return metrics_list, scores_list 301 | 302 | def _combine_outputs( 303 | self, 304 | metrics_list: List[MetricsDict], 305 | qa_pairs_lists: List[List[List[Dict[str, Any]]]], 306 | predictions_lists: List[List[List[Dict[str, Any]]]], 307 | scores_lists: 
List[List[List[Dict[str, float]]]], 308 | ) -> List[List[List[Dict[str, Any]]]]: 309 | # This method will combine the metrics and QA pair metadata together into a tuple so they can 310 | # both be returned together 311 | combined = [] 312 | for metrics, qa_pairs_list, predictions_list, scores_list in zip( 313 | metrics_list, qa_pairs_lists, predictions_lists, scores_lists 314 | ): 315 | # This is for 1 (summary, reference) pair 316 | combined.append((metrics, [])) 317 | for qa_pairs, predictions, scores in zip( 318 | qa_pairs_list, predictions_list, scores_list 319 | ): 320 | # This is for 1 reference 321 | combined[-1][1].append([]) 322 | for qa, prediction, score in zip(qa_pairs, predictions, scores): 323 | prediction = dict(**prediction) 324 | for key in self.scorer.keys(): 325 | prediction[key] = score[key] 326 | combined[-1][1][-1].append( 327 | {"question": qa, "prediction": prediction} 328 | ) 329 | return combined 330 | 331 | def _insert_empty_outputs( 332 | self, 333 | metrics_list: List[MetricsDict], 334 | is_empty_list: List[bool], 335 | include_qa_list: bool, 336 | ) -> List[Any]: 337 | full_metrics_list = [] 338 | index = 0 339 | for is_empty in is_empty_list: 340 | if is_empty: 341 | empty_metrics = {"qa-eval": self.scorer.default_scores()} 342 | if include_qa_list: 343 | full_metrics_list.append((empty_metrics, [])) 344 | else: 345 | full_metrics_list.append(empty_metrics) 346 | else: 347 | full_metrics_list.append(metrics_list[index]) 348 | index += 1 349 | return full_metrics_list 350 | 351 | def score_batch( 352 | self, 353 | summaries: List[SummaryType], 354 | references_list: List[List[SummaryType]], 355 | return_qa_pairs: bool = False, 356 | ) -> List[List[MetricsDict]]: 357 | summaries = self._flatten_summaries(summaries) 358 | references_list = self._flatten_references_list(references_list) 359 | 360 | # Remove any input summaries that are empty. 
They mess up the processing otherwise 361 | ( 362 | summaries, 363 | references_list, 364 | is_empty_list, 365 | ) = self._get_empty_summary_mask(summaries, references_list) 366 | 367 | qa_pairs_lists = self._generate_qa_pairs(references_list) 368 | predictions_lists = self._answer_questions(summaries, qa_pairs_lists) 369 | metrics_list, scores_lists = self._score_predictions( 370 | summaries, qa_pairs_lists, predictions_lists 371 | ) 372 | 373 | if return_qa_pairs: 374 | output = self._combine_outputs( 375 | metrics_list, qa_pairs_lists, predictions_lists, scores_lists 376 | ) 377 | else: 378 | output = metrics_list 379 | output = self._insert_empty_outputs(output, is_empty_list, return_qa_pairs) 380 | return output 381 | -------------------------------------------------------------------------------- /qaeval/scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/scoring/__init__.py -------------------------------------------------------------------------------- /qaeval/scoring/lerc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/scoring/lerc/__init__.py -------------------------------------------------------------------------------- /qaeval/scoring/lerc/lerc_dataset_reader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import numpy as np 4 | from overrides import overrides 5 | from transformers import BertTokenizer 6 | 7 | from allennlp.data.dataset_readers.dataset_reader import DatasetReader 8 | from allennlp.data.fields import ArrayField 9 | from allennlp.data.fields.metadata_field import MetadataField 10 | from allennlp.data.instance import Instance 11 | 12 | logger = 
@DatasetReader.register("lerc")
class LERCDatasetReader(DatasetReader):
    """
    Reads the MOCHA dataset and converts each (context, question, reference,
    candidate, score) instance into a BERT regression input for LERC.
    """

    def __init__(
        self,
        bert_model: str = 'bert-base-uncased',
        max_length: int = 512,
        holdout_sets: list = None,
        augment: bool = True,
        lazy: bool = False
    ) -> None:
        """
        Args:
            bert_model: name of the BERT tokenizer/model to use.
            max_length: maximum BERT input length; the context is truncated
                to respect it.
            holdout_sets: constituent MOCHA datasets to exclude from reading.
                May be a single name or a list of names.
            augment: if `True`, adds identity and flipped training instances.
            lazy: passed through to the AllenNLP `DatasetReader`.
        """
        super().__init__(lazy)
        self.max_length = max_length
        # Fix: the original default was a shared mutable `[]`; use `None` as
        # the sentinel so each reader gets its own list.
        if holdout_sets is None:
            holdout_sets = []
        self.holdout_sets = holdout_sets if isinstance(holdout_sets, list) else [holdout_sets]
        self.augment = augment
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)

    @overrides
    def _read(self, file_path: str):
        """Yields `Instance`s from a MOCHA JSON file, with optional augmentation."""
        lines = []
        mocha_dataset = json.load(open(file_path))

        # Check that if we specified datasets to hold out, that they are
        # indeed in the MOCHA dataset.
        for constituent_dataset in self.holdout_sets:
            assert constituent_dataset in mocha_dataset.keys()

        # Iterate through the constituent datasets, loading the MOCHA instances
        for constituent_dataset in mocha_dataset:
            seen_questions = set()
            if constituent_dataset in self.holdout_sets:
                continue

            for line in mocha_dataset[constituent_dataset].values():
                # Append the current instance
                lines.append({
                    'context': line['context'],
                    'question': line['question'],
                    'reference': line['reference'],
                    'candidate': line['candidate'],
                    'score': line['score'],
                })

                # Do a little data augmentation if the flag is set.
                if self.augment:
                    # Identity augmentation with the reference
                    # If this is the first time we have seen the question,
                    # create an identity instance.
                    if line['question'] not in seen_questions:
                        lines.append({
                            'context': line['context'],
                            'question': line['question'],
                            'reference': line['reference'],
                            'candidate': line['reference'],
                            'score': 5,
                        })
                        seen_questions.add(line['question'])

                # Augmentations via flipping reference and candidate
                # If the current line has a perfect score, flip the
                # reference and candidate
                if self.augment and line['score'] == 5:
                    lines.append({
                        'context': line['context'],
                        'question': line['question'],
                        'reference': line['candidate'],
                        'candidate': line['reference'],
                        'score': 5,
                    })

        # Create instances
        for line in lines:
            yield self.text_to_instance(**line)

    @overrides
    def text_to_instance(
        self, context, question, reference, candidate, score=None
    ) -> Instance:
        """
        Tokenizes one instance and packs it into BERT input fields.

        Args:
            context, question, reference, candidate: raw text fields.
            score: the gold judgment score; omitted at prediction time.
        """
        context_tokens = self.tokenizer.tokenize(context)
        question_tokens = self.tokenizer.tokenize(question)
        reference_tokens = self.tokenizer.tokenize(reference)
        candidate_tokens = self.tokenizer.tokenize(candidate)

        # Truncates the context if the BERT input would be too long
        context_tokens = self.truncate_context(
            context_tokens, question_tokens, reference_tokens, candidate_tokens
        )

        # Creates the BERT input (input IDs, segment IDs, and attention mask)
        input_ids, token_type_ids, attention_mask = self.create_input(
            context_tokens, question_tokens, reference_tokens, candidate_tokens
        )

        fields = {
            'input_ids': ArrayField(np.array(input_ids), dtype=np.int64,
                                    padding_value=self.tokenizer.pad_token_id),
            'token_type_ids': ArrayField(np.array(token_type_ids),
                                         dtype=np.int64),
            'attention_mask': ArrayField(np.array(attention_mask),
                                         dtype=np.int64),
            'metadata': MetadataField({
                'context': context,
                'context_tokens': context_tokens,
                'question': question,
                'question_tokens': question_tokens,
                'reference': reference,
                'reference_tokens': reference_tokens,
                'candidate': candidate,
                'candidate_tokens': candidate_tokens,
            })
        }
        # Fix: use an explicit `None` check so a legitimate score of 0 is
        # still attached (the original truthiness test dropped it).
        if score is not None:
            fields['score'] = ArrayField(np.array(score))

        return Instance(fields)

    def truncate_context(self, context, question, reference, candidate):
        """ Calculates if the current input would be over `self.max_length`
            and if so, truncates the context so that the input would be at
            `self.max_length`.
        """
        num_added_tokens = self.tokenizer.num_special_tokens_to_add(pair=True) + 2
        current_length = len(context) + len(question) + len(reference) + \
            len(candidate) + num_added_tokens

        if current_length > self.max_length:
            # `difference` is negative, so this drops tokens from the end
            difference = self.max_length - current_length
            context = context[:difference]

        return context

    def create_input(self, context, question, reference, candidate):
        """Builds the BERT input ids, segment ids, and attention mask."""
        # `input_tokens`: `[CLS] cont [SEP] ques [SEP] ref [SEP] cand [SEP]`
        cls = [self.tokenizer.cls_token]
        sep = [self.tokenizer.sep_token]
        input_tokens = cls + context + sep + question + sep + reference + sep + candidate + sep
        input_ids = self.tokenizer.convert_tokens_to_ids(input_tokens)

        # `token_type_ids`: is 0 for `[CLS] cont [SEP] ques [SEP]` and
        # 1 for `ref [SEP] cand [SEP]`
        token_type_ids = [0] * (len(context) + len(question) + 3) + \
            [1] * (len(reference) + len(candidate) + 2)

        # `attention_mask` is 1's for all positions which aren't padding
        attention_mask = [1] * len(input_ids)

        assert len(input_ids) == len(token_type_ids) == len(attention_mask)

        return input_ids, token_type_ids, attention_mask
@Model.register("lerc")
class LERC(Model):
    """
    BERT-based regression model that predicts a LERC quality score for a
    (context, question, reference, candidate) input from the [CLS] embedding.
    """

    @property
    def embedding_dim(self):
        # Width of the BERT word embeddings, used to size the score head
        return self.bert.embeddings.word_embeddings.embedding_dim

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Returns the current value of every tracked metric."""
        results = {}
        for name, metric in self.metrics.items():
            results[name] = metric.get_metric(reset)
        return results

    def __init__(
        self,
        bert_model: str = 'bert-base-uncased',
        pretrained_archive_path: str = None,
        vocab=Vocabulary(),
        initializer=InitializerApplicator()
    ) -> None:
        """
        Args:
            bert_model: HuggingFace model name used when no archive is given.
            pretrained_archive_path: optional AllenNLP archive whose BERT
                weights are reused instead of the stock checkpoint.
            vocab: AllenNLP vocabulary.
            initializer: applied to the model parameters after construction.
        """
        super().__init__(vocab)
        if pretrained_archive_path:
            logger.info('Loading pretrained: %s', pretrained_archive_path)
            self.bert = load_archive(pretrained_archive_path).model.bert
        else:
            self.bert = BertModel.from_pretrained(bert_model)

        # Single linear head mapping the [CLS] embedding to a scalar score
        self.score_layer = torch.nn.Linear(self.embedding_dim, 1)
        self.metrics = {'pearson': PearsonCorrelation()}
        self.loss = torch.nn.MSELoss()
        initializer(self)

    @overrides
    def forward(
        self,
        input_ids: torch.Tensor,
        token_type_ids: torch.Tensor,
        attention_mask: torch.Tensor = None,
        score: torch.Tensor = None,
        metadata: Dict = None
    ) -> Dict:
        """
        Runs BERT and predicts a score; when a gold `score` is given, also
        computes the MSE loss and updates the Pearson correlation metric.
        """
        sequence_output, _ = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )
        # The score is regressed from the [CLS] position only
        pred_score = self.score_layer(sequence_output[:, 0, :]).squeeze(-1)

        outputs = {'pred_score': pred_score, 'metadata': metadata}

        if score is not None:
            gold = score.float()
            self.metrics['pearson'](pred_score, gold)
            outputs['loss'] = self.loss(pred_score, gold)
            outputs['score'] = gold

        return outputs
@Model.register("pretrain-lerc")
class PretrainLERC(Model):
    """
    BERT-based 3-way classifier used to pretrain the encoder that is later
    reused by the LERC regression model.
    """

    @property
    def embedding_dim(self):
        # Width of the BERT word embeddings, used to size the label head
        return self.bert.embeddings.word_embeddings.embedding_dim

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Returns the current value of every tracked metric."""
        results = {}
        for name, metric in self.metrics.items():
            results[name] = metric.get_metric(reset)
        return results

    def __init__(
        self,
        bert_model: str = 'bert-base-uncased',
        vocab=Vocabulary(),
        initializer=InitializerApplicator()
    ) -> None:
        """
        Args:
            bert_model: HuggingFace model name for the encoder.
            vocab: AllenNLP vocabulary.
            initializer: applied to the model parameters after construction.
        """
        super().__init__(vocab)
        self.bert = BertModel.from_pretrained(bert_model)
        # Linear head mapping the [CLS] embedding to 3 class logits
        self.label_layer = torch.nn.Linear(self.embedding_dim, 3)
        self.metrics = {'accuracy': CategoricalAccuracy()}
        self.loss = torch.nn.CrossEntropyLoss()
        initializer(self)

    @overrides
    def forward(
        self,
        input_ids: torch.Tensor,
        token_type_ids: torch.Tensor,
        attention_mask: torch.Tensor = None,
        label: torch.Tensor = None,
        metadata: Dict = None
    ) -> Dict:
        """
        Classifies the input from the [CLS] embedding; when a gold `label` is
        given, also computes cross-entropy loss and updates accuracy.
        """
        # sequence_output: [batch_size, seq_len, embedding_dim]
        sequence_output, _ = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )

        # logits: [batch_size, 3], taken from the [CLS] position
        logits = self.label_layer(sequence_output[:, 0, :])

        outputs = {
            'logits': logits,
            'class_probabilties': torch.nn.functional.softmax(logits, dim=-1),
            'pred_label': torch.max(logits, dim=-1)[1],
            'metadata': metadata
        }

        if label is not None:
            gold = label.long()
            self.metrics['accuracy'](logits, gold)
            outputs['loss'] = self.loss(logits, gold)
            outputs['label'] = gold

        return outputs
class ExactMatchScorer(Scorer):
    """
    Scores each QA prediction with SQuAD exact match against the reference
    answer; unanswerable predictions (null more likely) score 0.
    """

    def keys(self) -> Set[str]:
        return {'em'}

    def _score_single_ref(
        self,
        context: str,
        questions: List[str],
        answers: List[str],
        predictions: List[str],
        probabilities: List[float],
        null_probabilities: List[float]
    ) -> List[Dict[str, float]]:
        """Returns one `{'em': ...}` dict per question."""
        def _em(answer, prediction, prob, null_prob):
            # A missing prediction or a more-likely null answer counts as 0
            if prediction is None or null_prob >= prob:
                return 0.0
            return compute_exact(answer, prediction)

        return [
            {'em': _em(answer, prediction, prob, null_prob)}
            for prediction, answer, prob, null_prob in zip(
                predictions, answers, probabilities, null_probabilities
            )
        ]
class IsAnsweredScorer(Scorer):
    """
    Scores each question 1.0 if the QA model judged it answerable (answer
    probability strictly above the null probability), otherwise 0.0.
    """

    def keys(self) -> Set[str]:
        return {'is_answered'}

    def _score_single_ref(
        self,
        context: str,
        questions: List[str],
        answers: List[str],
        predictions: List[str],
        probabilities: List[float],
        null_probabilities: List[float]
    ) -> List[Dict[str, float]]:
        """Returns one `{'is_answered': ...}` dict per question."""
        return [
            {'is_answered': 1.0 if prob > null_prob else 0.0}
            for prob, null_prob in zip(probabilities, null_probabilities)
        ]
class LERCScorer(Scorer):
    """
    Scores QA predictions with a trained LERC model. Questions the QA model
    judged unanswerable keep a score of 0.0; the rest receive the model's
    learned quality score.
    """

    def __init__(self, model_path: str, pretrained_path: str, cuda_device: int, batch_size: int = 8) -> None:
        """
        Args:
            model_path: path to the trained LERC archive.
            pretrained_path: path to the pretrained archive injected into the
                model config via `overrides`.
            cuda_device: device id for the predictor.
            batch_size: mini-batch size used during scoring.
        """
        # Local import: only this optional scorer needs json
        import json

        # Fix: build the overrides blob with json.dumps so that quotes or
        # backslashes in `pretrained_path` are escaped correctly; the previous
        # naive string concatenation produced invalid JSON for such paths.
        overrides = json.dumps({'model.pretrained_archive_path': pretrained_path})
        archive = load_archive(model_path, cuda_device=cuda_device, overrides=overrides)
        self.predictor = LERCPredictor.from_archive(archive, predictor_name='lerc')
        self.batch_size = batch_size

    def keys(self) -> Set[str]:
        return {'lerc'}

    def _score_single_ref(
        self,
        context: str,
        questions: List[str],
        answers: List[str],
        predictions: List[str],
        probabilities: List[float],
        null_probabilities: List[float]
    ) -> List[Dict[str, float]]:
        """Returns one `{'lerc': ...}` dict per question."""
        # Only send answerable questions to the (expensive) LERC model;
        # `indices` remembers where each scored question belongs.
        input_dicts = []
        indices = []
        for i, (answer, question, prediction, probability, null_probability) in enumerate(
            zip(answers, questions, predictions, probabilities, null_probabilities)
        ):
            if probability > null_probability:
                input_dicts.append({
                    'context': context,
                    'question': question,
                    'reference': answer,
                    'candidate': prediction
                })
                indices.append(i)

        # Run the predictor in mini-batches
        output_dicts = []
        for i in range(0, len(input_dicts), self.batch_size):
            batch = input_dicts[i:i + self.batch_size]
            output_dicts.extend(self.predictor.predict_batch_json(batch))
        assert len(output_dicts) == len(input_dicts)

        # Unanswerable questions keep the default 0.0 score
        scores = [0.0] * len(questions)
        for i, output_dict in zip(indices, output_dicts):
            scores[i] = output_dict['pred_score']
        return [{'lerc': s} for s in scores]
class MetaScorer(Scorer):
    """
    Runs several scorers over the same inputs and merges their per-question
    score dicts into one dict per question.
    """

    def __init__(self, scorers: List['Scorer']) -> None:
        """
        Args:
            scorers: the scorers whose outputs should be combined.
        """
        self.scorers = scorers

    def _merge_dicts(self, dicts: List[Dict[str, float]]) -> Dict[str, float]:
        """Merges the dicts left-to-right into a single dict."""
        merged = {}
        for other in dicts:
            merged.update(other)
        return merged

    def keys(self) -> Set[str]:
        """The union of every wrapped scorer's keys."""
        all_keys = set()
        for scorer in self.scorers:
            all_keys.update(scorer.keys())
        return all_keys

    def _score_single_ref(
        self,
        context: str,
        questions: List[str],
        answers: List[str],
        predictions: List[str],
        probabilities: List[float],
        null_probabilities: List[float]
    ) -> List[Dict[str, float]]:
        """Returns one merged score dict per question."""
        # per_scorer[s][q] is scorer s's score dict for question q
        per_scorer = [
            scorer.score_single_ref(
                context,
                questions,
                answers,
                predictions,
                probabilities,
                null_probabilities
            )[1]
            for scorer in self.scorers
        ]

        return [
            self._merge_dicts([scores[i] for scores in per_scorer])
            for i in range(len(questions))
        ]
float]]: 40 | raise NotImplementedError 41 | 42 | def score_multi_ref( 43 | self, 44 | context: str, 45 | questions_list: List[List[str]], 46 | answers_list: List[List[str]], 47 | predictions_list: List[List[str]], 48 | probabilities_list: List[List[float]], 49 | null_probabilities_list: List[List[float]] 50 | ) -> Tuple[Dict[str, float], List[List[Dict[str, float]]]]: 51 | # The aggregated per-reference scores 52 | reference_scores_list = [] 53 | # The scores for each individual question. [i][j] will be the scores from 54 | # reference i and question j 55 | question_scores_list = [] 56 | 57 | for i in range(len(questions_list)): 58 | reference_scores, question_scores = self.score_single_ref( 59 | context, 60 | questions_list[i], 61 | answers_list[i], 62 | predictions_list[i], 63 | probabilities_list[i], 64 | null_probabilities_list[i] 65 | ) 66 | reference_scores_list.append(reference_scores) 67 | question_scores_list.append(question_scores) 68 | 69 | instance_scores = self.aggregate_scores(reference_scores_list) 70 | return instance_scores, question_scores_list 71 | 72 | def _ensure_expected_keys(self, expected_keys: Set[str], scores_dicts: List[Dict[str, float]]) -> None: 73 | for scores in scores_dicts: 74 | if expected_keys != scores.keys(): 75 | raise Exception(f'Unequal keys: {expected_keys}; {scores.keys()}') 76 | 77 | def aggregate_scores(self, scores_dicts: List[Dict[str, float]]) -> Dict[str, float]: 78 | if len(scores_dicts) == 0: 79 | return self.default_scores() 80 | 81 | expected_keys = self.keys() 82 | self._ensure_expected_keys(expected_keys, scores_dicts) 83 | sums = {key: 0.0 for key in expected_keys} 84 | for scores in scores_dicts: 85 | for key in expected_keys: 86 | sums[key] += scores[key] 87 | 88 | averages = {key: sums[key] / len(scores_dicts) for key in expected_keys} 89 | return averages -------------------------------------------------------------------------------- /qaeval/tests/__init__.py: 
import unittest

from qaeval.answer_selection import AnswerSelector, NP_CHUNKS_STRATEGY, \
    MAX_NP_STRATEGY, NER_STRATEGY, ALL_STRATEGY, STRATEGIES, AnswerOffsets

# Two-sentence fixture shared by every selection test below. All expected
# offsets are character positions into this exact string.
_TEXT = 'Several churches in Baghdad have been attacked. More attacks have been in Mosul.'


class TestAnswerSelector(unittest.TestCase):
    def _select(self, strategy):
        # Build a selector for `strategy` and run it over the shared fixture.
        return AnswerSelector(strategy).select(_TEXT)

    def test_constructor(self):
        """Every registered strategy constructs; an unknown name raises."""
        for strategy in STRATEGIES:
            AnswerSelector(strategy)
        with self.assertRaises(Exception):
            AnswerSelector('missing')

    def test_np_chunks(self):
        """NP-chunk strategy selects each noun phrase."""
        expected = [
            AnswerOffsets(0, 16, 0, 47, 'Several churches'),
            AnswerOffsets(20, 27, 0, 47, 'Baghdad'),
            AnswerOffsets(48, 60, 48, 80, 'More attacks'),
            AnswerOffsets(74, 79, 48, 80, 'Mosul'),
        ]
        assert self._select(NP_CHUNKS_STRATEGY) == expected

    def test_max_np(self):
        """Maximal-NP strategy keeps the widest noun phrase per span."""
        expected = [
            AnswerOffsets(0, 27, 0, 47, 'Several churches in Baghdad'),
            AnswerOffsets(48, 60, 48, 80, 'More attacks'),
            AnswerOffsets(74, 79, 48, 80, 'Mosul'),
        ]
        assert self._select(MAX_NP_STRATEGY) == expected

    def test_ner(self):
        """NER strategy selects only the named entities."""
        expected = [
            AnswerOffsets(20, 27, 0, 47, 'Baghdad'),
            AnswerOffsets(74, 79, 48, 80, 'Mosul'),
        ]
        assert self._select(NER_STRATEGY) == expected

    def test_all(self):
        """ALL strategy combines the other strategies' selections."""
        expected = [
            AnswerOffsets(0, 16, 0, 47, 'Several churches'),
            AnswerOffsets(0, 27, 0, 47, 'Several churches in Baghdad'),
            AnswerOffsets(20, 27, 0, 47, 'Baghdad'),
            AnswerOffsets(48, 60, 48, 80, 'More attacks'),
            AnswerOffsets(74, 79, 48, 80, 'Mosul'),
        ]
        assert self._select(ALL_STRATEGY) == expected
# Methods of TestQuestionAnsweringModel (qaeval/tests/answering/model_test.py).
# The class header, setUpClass (which loads the QA model into cls.model), and
# test_answering sit on earlier dump lines.

# Shared fixture: each method below asks the same question about the same
# news passage.
_QUESTION = 'Who does the A380 super - jumbo passenger jet surpass and break their monopoly?'
_CONTEXT = "The superjumbo Airbus A380 , the world 's largest commercial airliner , took off Wednesday into cloudy skies over southwestern France for its second test flight . The European aircraft maker , based in the French city of Toulouse , said the second flight -- which came exactly a week after the A380 's highly anticipated maiden voyage -- would last about four hours . As opposed to the international media hype that surrounded last week 's flight , with hundreds of journalists on site to capture the historic moment , Airbus chose to conduct Wednesday 's test more discreetly ."


def test_answering_with_offsets(self):
    """With `return_offsets=True` the answer's (start, end) offsets are returned."""
    answer, probability, null_probability, offsets = self.model.answer(
        _QUESTION, _CONTEXT, return_offsets=True
    )

    assert answer == 'the world \'s largest'
    assert probability == pytest.approx(0.00428164186632745, abs=1e-5)
    assert null_probability == pytest.approx(0.9895479613676263, abs=1e-5)
    assert offsets == (29, 49)


def test_answering_with_fixing_offsets(self):
    """Offsets are realigned to the exact answer span unless fixing is disabled."""
    question = 'What is my name?'
    context = 'My name is Dan!'

    # With fixing disabled, the raw model offsets overshoot the answer.
    answer, probability, null_probability, offsets = self.model.answer(
        question, context, return_offsets=True, try_fixing_offsets=False
    )
    start, end = offsets
    assert answer == 'Dan'
    assert context[start:end] == 'Dan!'

    # `try_fixing_offsets=True` by default, so the span is trimmed to the answer.
    answer, probability, null_probability, offsets = self.model.answer(
        question, context, return_offsets=True
    )
    start, end = offsets
    assert answer == 'Dan'
    assert context[start:end] == 'Dan'


def test_return_dict(self):
    """`return_dict=True` packs the same values into a dictionary."""
    result = self.model.answer(
        _QUESTION, _CONTEXT, return_offsets=True, return_dict=True
    )

    assert result['prediction'] == 'the world \'s largest'
    assert result['probability'] == pytest.approx(0.00428164186632745, abs=1e-5)
    assert result['null_probability'] == pytest.approx(0.9895479613676263, abs=1e-5)
    assert result['start'] == 29
    assert result['end'] == 49
# Method of TestUtils (qaeval/tests/answering/utils_test.py); the rest of the
# class lies on earlier dump lines.
def test_fix_answer_span_unicode(self):
    """Offsets are realigned correctly when the document span contains
    non-ASCII characters (here, an ellipsis)."""
    candidate = 'track and field, swimming, diving'
    span = 'that…track and field, swimming, diving,'
    start, end = fix_answer_span(candidate, span, 0, 40)
    assert (start, end) == (5, 39)
In an event titled Romanian Artists in Support of Asia, organised by The Reporter Foundation of Romania, artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts for the tsunami victims.", "Saturday, March 26, 2005 Up to four times as many women as men died in the December 26 Indian Ocean Tsunami, figures published by Oxfam International today reveal.", "Without the new law, contributors would have waited until 2006 and their 2005 tax returns to be able to write off their charitable donations. The law is intended to promote donating towards the tsunami relief effort.", "While complaints about the 'miserly' generosity of the Bush Administration have surfaced in recent days, donations and actions at the grassroots level have quietly illustrated the concern and sympathy felt by ordinary Americans."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Dec. 2004 Sumatra earthquake was the longest-lasting ever recorded and large enough to vibrate the whole planet!", "A week after tsunami waves scoured the coasts of multiple countries in southern Asia, the UN says the final death toll could climb beyond 150,000 and may never be known, while it is estimated that 5 million people lack food, water or basic sanitation necessary for survival.", "It should also be noted that women killed outnumber men.", "The world's wealthiest nations have begun pouring funding into the Earthquake/Tsunami damaged region.", "Japanese Prime Minister announced a half-billion dollar donation, China has promised $60.5 million, while Norway increased its funding donation to $180 million.", "The Bush administration has pledged $350 million.", "Moreover, Indiana University's Center on Philanthropy is estimating approximately $322 million in goods and cash have been donated by private U.S. 
citizens and corporations.", "The Romanian Government pledged 150,000 euro, while the Romanian public raised 395,000 euro in a telethon for the tsunami victims.", "In addition, Romanian artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts.", "Despite the encouraging promises, the UN warns that logistics of securing the funds, purchasing supplies and shipping them to stricken regions will take time.", "Meanwhile, deaths due to dehydration, disease and starvation will continue to climb.", "In this entire situation, the municipal council of Saskatchewan town in Canada says it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief and now asks for its money back!"]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["A week after the earthquake in Southern Asia, the confirmed death count is over 120.000.", "Indonesia's Aceh province alone counts 80.000 deaths.", "Sri Lanka announced approximately 28.500, India more than 7.700 and as many as 10.500 foreign citizens are reported missing.", "As the count continues, a gender imbalance was noticed in deaths by the tsunami, where four times more women than men were killed.", "This phenomenon is due to the greater physical strength of men and different occupations at the time of the disaster.", "Because of this phenomenon, many incidents of violence against women have been reported, and it is estimated that they will continue for the time being, therefore the people in charge seek measures to address the problem.", "The earthquake in Southern Asia was the longest in duration earthquake ever recorded.", "It lasted between 500 and 600 seconds and released an amount of energy that equals a 100 gigaton bomb, while the gash that was created in the bottom of the sea was measured nearly 800 miles deep.", "The U.S.A. has committed to offer 350 million dollars to the affected areas, while the U.S. 
government signed into law on direct tax relief to individuals who have contributed with donations to disaster relief.", "The amount of donations comes up to 322 million dollars.", "Japan also offers 500 million dollars.", "The Romanians raised 395,000 euros in a telethon, while on the other hand a town in Canada demands refund for 10.000 dollars by the Red Cross due to mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["U.S. citizens donating in 2005 to help tsunami victims wrote off their donations on their 2004 tax returns.", "In order to help the earthquake's and tsunami's victims, the world's wealthiest nations poured funding into the damaged region.", "However, the UN Office for the Coordination of Humanitarian Affairs in Indonesia chief, Michael Elmquist, warned that logistics of securing the funds, purchasing supplies and shipping them to stricken regions is a long lasting procedure.", "Meanwhile, the death toll is increased due to dehydration, disease, and starvation.", "The offered assistance by the U.S. 
government was considered limited and prompted complaints.", "At the same time, the ordinary Americans' donations were touching.", "The scientists determined that the Sumatra's earthquake was the largest ever recorded.", "People from around the world raised money for the victims as the Romanians who organised telemarathon and other charitable events.", "On the other hand, a small Canadian village calls back from the Red Cross the $ 10,000 which had donated as the election day for approving the donation some council members were absent.", "The Red Cross has already deposited the cheque, but has indicated it will return the money.", "Only in Indonesia, the death toll reaches 80000.", "In southern Asia about 1 million people are homeless and humanitarian agencies estimate that 5 million people need relief.", "The World Health Organization makes hard efforts to improve the conditions.", "The vast majority of the victims were women.", "This fact is justified to some extent but requires protection for the female sex."]}]} 2 | {"instance_id": "M000", "summarizer_id": "2", "summarizer_type": "peer", "summary": {"summarizer_id": "2", "summarizer_type": "peer", "text": ["\"Confirmed death toll passed 121,000, and 5 million people were homeless.\"", "United Nations--\"UN Emergency Relief Coordinator and Under-Secretary-General for Humanitarian Affairs Jan Egeland said the final death toll could climb beyond 150,000.\"", "January 16, the Romanian public raised the equivalent of 395,000 euro in a telethon, while the Romanian Government pledged 150,000 euro for the relief effort.", "In an event titled Romanian Artists in Support of Asia, organised by The Reporter Foundation of Romania, artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts for the tsunami victims.", "Several prominent Romanian artists and celebrities will gather at the Radio Hall in Bucharest on Sunday, January 23, to raise money for the victims 
of the catastrophic Indian Ocean tsunami of December 26, 2004.", "Near the epicenter of the earthquakes and tsunami, Indonesia's Aceh province alone may have as many as 80,000 death.", "In Sri Lanka, more than 1,600 kilometres from the epicenter, the ground moved nearly 10 centimetres.", "Meadow Lake, Saskatchewan--The municipal council of a small town in Canada's Prairies has said it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief.", "Two Romanian tourists have been declared missing in the tsunami.", "After the U.S. increased it's funding donation to 350 million USD, Japanese Prime Minister Junichiro Koizumi announced a half-billion dollar donation on Saturday, Jan. 1.", "An AP/ISOS poll has found three in ten U.S. citizens have donated to Tsunami Aid organizations."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Dec. 2004 Sumatra earthquake was the longest-lasting ever recorded and large enough to vibrate the whole planet!", "A week after tsunami waves scoured the coasts of multiple countries in southern Asia, the UN says the final death toll could climb beyond 150,000 and may never be known, while it is estimated that 5 million people lack food, water or basic sanitation necessary for survival.", "It should also be noted that women killed outnumber men.", "The world's wealthiest nations have begun pouring funding into the Earthquake/Tsunami damaged region.", "Japanese Prime Minister announced a half-billion dollar donation, China has promised $60.5 million, while Norway increased its funding donation to $180 million.", "The Bush administration has pledged $350 million.", "Moreover, Indiana University's Center on Philanthropy is estimating approximately $322 million in goods and cash have been donated by private U.S. 
citizens and corporations.", "The Romanian Government pledged 150,000 euro, while the Romanian public raised 395,000 euro in a telethon for the tsunami victims.", "In addition, Romanian artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts.", "Despite the encouraging promises, the UN warns that logistics of securing the funds, purchasing supplies and shipping them to stricken regions will take time.", "Meanwhile, deaths due to dehydration, disease and starvation will continue to climb.", "In this entire situation, the municipal council of Saskatchewan town in Canada says it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief and now asks for its money back!"]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["A week after the earthquake in Southern Asia, the confirmed death count is over 120.000.", "Indonesia's Aceh province alone counts 80.000 deaths.", "Sri Lanka announced approximately 28.500, India more than 7.700 and as many as 10.500 foreign citizens are reported missing.", "As the count continues, a gender imbalance was noticed in deaths by the tsunami, where four times more women than men were killed.", "This phenomenon is due to the greater physical strength of men and different occupations at the time of the disaster.", "Because of this phenomenon, many incidents of violence against women have been reported, and it is estimated that they will continue for the time being, therefore the people in charge seek measures to address the problem.", "The earthquake in Southern Asia was the longest in duration earthquake ever recorded.", "It lasted between 500 and 600 seconds and released an amount of energy that equals a 100 gigaton bomb, while the gash that was created in the bottom of the sea was measured nearly 800 miles deep.", "The U.S.A. has committed to offer 350 million dollars to the affected areas, while the U.S. 
government signed into law on direct tax relief to individuals who have contributed with donations to disaster relief.", "The amount of donations comes up to 322 million dollars.", "Japan also offers 500 million dollars.", "The Romanians raised 395,000 euros in a telethon, while on the other hand a town in Canada demands refund for 10.000 dollars by the Red Cross due to mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["U.S. citizens donating in 2005 to help tsunami victims wrote off their donations on their 2004 tax returns.", "In order to help the earthquake's and tsunami's victims, the world's wealthiest nations poured funding into the damaged region.", "However, the UN Office for the Coordination of Humanitarian Affairs in Indonesia chief, Michael Elmquist, warned that logistics of securing the funds, purchasing supplies and shipping them to stricken regions is a long lasting procedure.", "Meanwhile, the death toll is increased due to dehydration, disease, and starvation.", "The offered assistance by the U.S. 
government was considered limited and prompted complaints.", "At the same time, the ordinary Americans' donations were touching.", "The scientists determined that the Sumatra's earthquake was the largest ever recorded.", "People from around the world raised money for the victims as the Romanians who organised telemarathon and other charitable events.", "On the other hand, a small Canadian village calls back from the Red Cross the $ 10,000 which had donated as the election day for approving the donation some council members were absent.", "The Red Cross has already deposited the cheque, but has indicated it will return the money.", "Only in Indonesia, the death toll reaches 80000.", "In southern Asia about 1 million people are homeless and humanitarian agencies estimate that 5 million people need relief.", "The World Health Organization makes hard efforts to improve the conditions.", "The vast majority of the victims were women.", "This fact is justified to some extent but requires protection for the female sex."]}]} 3 | {"instance_id": "M000", "summarizer_id": "A", "summarizer_type": "reference", "summary": {"summarizer_id": "A", "summarizer_type": "reference", "text": ["Dec. 
2004 Sumatra earthquake was the longest-lasting ever recorded and large enough to vibrate the whole planet!", "A week after tsunami waves scoured the coasts of multiple countries in southern Asia, the UN says the final death toll could climb beyond 150,000 and may never be known, while it is estimated that 5 million people lack food, water or basic sanitation necessary for survival.", "It should also be noted that women killed outnumber men.", "The world's wealthiest nations have begun pouring funding into the Earthquake/Tsunami damaged region.", "Japanese Prime Minister announced a half-billion dollar donation, China has promised $60.5 million, while Norway increased its funding donation to $180 million.", "The Bush administration has pledged $350 million.", "Moreover, Indiana University's Center on Philanthropy is estimating approximately $322 million in goods and cash have been donated by private U.S. citizens and corporations.", "The Romanian Government pledged 150,000 euro, while the Romanian public raised 395,000 euro in a telethon for the tsunami victims.", "In addition, Romanian artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts.", "Despite the encouraging promises, the UN warns that logistics of securing the funds, purchasing supplies and shipping them to stricken regions will take time.", "Meanwhile, deaths due to dehydration, disease and starvation will continue to climb.", "In this entire situation, the municipal council of Saskatchewan town in Canada says it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief and now asks for its money back!"]}, "references": [{"summarizer_id": "B", "summarizer_type": "reference", "text": ["A week after the earthquake in Southern Asia, the confirmed death count is over 120.000.", "Indonesia's Aceh province alone counts 80.000 deaths.", "Sri Lanka announced approximately 28.500, India more than 7.700 and as many as 10.500 foreign 
citizens are reported missing.", "As the count continues, a gender imbalance was noticed in deaths by the tsunami, where four times more women than men were killed.", "This phenomenon is due to the greater physical strength of men and different occupations at the time of the disaster.", "Because of this phenomenon, many incidents of violence against women have been reported, and it is estimated that they will continue for the time being, therefore the people in charge seek measures to address the problem.", "The earthquake in Southern Asia was the longest in duration earthquake ever recorded.", "It lasted between 500 and 600 seconds and released an amount of energy that equals a 100 gigaton bomb, while the gash that was created in the bottom of the sea was measured nearly 800 miles deep.", "The U.S.A. has committed to offer 350 million dollars to the affected areas, while the U.S. government signed into law on direct tax relief to individuals who have contributed with donations to disaster relief.", "The amount of donations comes up to 322 million dollars.", "Japan also offers 500 million dollars.", "The Romanians raised 395,000 euros in a telethon, while on the other hand a town in Canada demands refund for 10.000 dollars by the Red Cross due to mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["U.S. citizens donating in 2005 to help tsunami victims wrote off their donations on their 2004 tax returns.", "In order to help the earthquake's and tsunami's victims, the world's wealthiest nations poured funding into the damaged region.", "However, the UN Office for the Coordination of Humanitarian Affairs in Indonesia chief, Michael Elmquist, warned that logistics of securing the funds, purchasing supplies and shipping them to stricken regions is a long lasting procedure.", "Meanwhile, the death toll is increased due to dehydration, disease, and starvation.", "The offered assistance by the U.S. 
government was considered limited and prompted complaints.", "At the same time, the ordinary Americans' donations were touching.", "The scientists determined that the Sumatra's earthquake was the largest ever recorded.", "People from around the world raised money for the victims as the Romanians who organised telemarathon and other charitable events.", "On the other hand, a small Canadian village calls back from the Red Cross the $ 10,000 which had donated as the election day for approving the donation some council members were absent.", "The Red Cross has already deposited the cheque, but has indicated it will return the money.", "Only in Indonesia, the death toll reaches 80000.", "In southern Asia about 1 million people are homeless and humanitarian agencies estimate that 5 million people need relief.", "The World Health Organization makes hard efforts to improve the conditions.", "The vast majority of the victims were women.", "This fact is justified to some extent but requires protection for the female sex."]}]} 4 | {"instance_id": "M000", "summarizer_id": "B", "summarizer_type": "reference", "summary": {"summarizer_id": "B", "summarizer_type": "reference", "text": ["A week after the earthquake in Southern Asia, the confirmed death count is over 120.000.", "Indonesia's Aceh province alone counts 80.000 deaths.", "Sri Lanka announced approximately 28.500, India more than 7.700 and as many as 10.500 foreign citizens are reported missing.", "As the count continues, a gender imbalance was noticed in deaths by the tsunami, where four times more women than men were killed.", "This phenomenon is due to the greater physical strength of men and different occupations at the time of the disaster.", "Because of this phenomenon, many incidents of violence against women have been reported, and it is estimated that they will continue for the time being, therefore the people in charge seek measures to address the problem.", "The earthquake in Southern Asia was the longest in 
duration earthquake ever recorded.", "It lasted between 500 and 600 seconds and released an amount of energy that equals a 100 gigaton bomb, while the gash that was created in the bottom of the sea was measured nearly 800 miles deep.", "The U.S.A. has committed to offer 350 million dollars to the affected areas, while the U.S. government signed into law on direct tax relief to individuals who have contributed with donations to disaster relief.", "The amount of donations comes up to 322 million dollars.", "Japan also offers 500 million dollars.", "The Romanians raised 395,000 euros in a telethon, while on the other hand a town in Canada demands refund for 10.000 dollars by the Red Cross due to mistake."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Dec. 2004 Sumatra earthquake was the longest-lasting ever recorded and large enough to vibrate the whole planet!", "A week after tsunami waves scoured the coasts of multiple countries in southern Asia, the UN says the final death toll could climb beyond 150,000 and may never be known, while it is estimated that 5 million people lack food, water or basic sanitation necessary for survival.", "It should also be noted that women killed outnumber men.", "The world's wealthiest nations have begun pouring funding into the Earthquake/Tsunami damaged region.", "Japanese Prime Minister announced a half-billion dollar donation, China has promised $60.5 million, while Norway increased its funding donation to $180 million.", "The Bush administration has pledged $350 million.", "Moreover, Indiana University's Center on Philanthropy is estimating approximately $322 million in goods and cash have been donated by private U.S. 
citizens and corporations.", "The Romanian Government pledged 150,000 euro, while the Romanian public raised 395,000 euro in a telethon for the tsunami victims.", "In addition, Romanian artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts.", "Despite the encouraging promises, the UN warns that logistics of securing the funds, purchasing supplies and shipping them to stricken regions will take time.", "Meanwhile, deaths due to dehydration, disease and starvation will continue to climb.", "In this entire situation, the municipal council of Saskatchewan town in Canada says it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief and now asks for its money back!"]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["U.S. citizens donating in 2005 to help tsunami victims wrote off their donations on their 2004 tax returns.", "In order to help the earthquake's and tsunami's victims, the world's wealthiest nations poured funding into the damaged region.", "However, the UN Office for the Coordination of Humanitarian Affairs in Indonesia chief, Michael Elmquist, warned that logistics of securing the funds, purchasing supplies and shipping them to stricken regions is a long lasting procedure.", "Meanwhile, the death toll is increased due to dehydration, disease, and starvation.", "The offered assistance by the U.S. 
government was considered limited and prompted complaints.", "At the same time, the ordinary Americans' donations were touching.", "The scientists determined that the Sumatra's earthquake was the largest ever recorded.", "People from around the world raised money for the victims as the Romanians who organised telemarathon and other charitable events.", "On the other hand, a small Canadian village calls back from the Red Cross the $ 10,000 which had donated as the election day for approving the donation some council members were absent.", "The Red Cross has already deposited the cheque, but has indicated it will return the money.", "Only in Indonesia, the death toll reaches 80000.", "In southern Asia about 1 million people are homeless and humanitarian agencies estimate that 5 million people need relief.", "The World Health Organization makes hard efforts to improve the conditions.", "The vast majority of the victims were women.", "This fact is justified to some extent but requires protection for the female sex."]}]} 5 | {"instance_id": "M001", "summarizer_id": "1", "summarizer_type": "peer", "summary": {"summarizer_id": "1", "summarizer_type": "peer", "text": ["Survivors of the London Bombings have urged the British public to write to their MPs, and set up an online petition calling", "As part of the formal investigation into the attacks, detectives studied thousands of hours of CCTV footage. The images show three of the bombers entering Luton station, before travelling to King's Cross station where they are also pictured.", "The July 7 bombings were a series of coordinated bombings which struck London on the morning of July 7, 2005. 52 people died and approximately 700 were injured as a result of the bombings.", "West Yorkshire Police searched six houses in Leeds today in connection with the London bombings. Houses were searched in the Burley, Beeston and Holbeck areas. Further properties were searched in Dewsbury, about eight miles from the city centre. 
The raids began at 6:30am BST this morning after warrants were issued under the Terrorism Act 2000. Police are still in attendance at one property, and are describing the searches as significant, and \"intelligence led\".", "A Canadian man, Momin Khawaja, was arrested in 2004 and has been held in a Canadian jail ever since. He is accused of being a co-conspirator with five British plotters for allegedly developing bomb detonators. Khawaja is considered an un-indicted co-conspirator with relation to the British case. Khawaja is the first person to be held under the 2001 Anti-Terrorism Act under Canada's Criminal Code, put in place by Canada's then Liberal government. Khawaja has been charged with seven offences under the new laws."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Tributes paid to the victims of the July 7 2005 London bombings Four years after the 7/7 London bombings, when 52 people were killed and about 700 were injured by a series of suicide bombings on transport networks, a 1.4 tonne stainless steel plaque with the names of the victims has been unveiled in Hyde Park.", "52 stainless steel columns standing 3.5m tall were also inaugurated.", "All four suicide bombers were UK residents.", "Al Qaeda has claimed responsibility for the attacks, but police are unsure of its exact role in the attack.", "Survivors of the bombings launched a campaign for public inquiry, which was rejected by the British government as too expensive and time-consuming.", "The government's attitude infuriated survivors and relatives of the dead.", "According to Scotland Yard, the suicide bombers, whose practice shows 'terrorist methodology', had rehearsed their plan nine days earlier.", "Police were able to trace their movements after recovering tickets and receipts from houses connected to the bombers which indicated the route of their trip.", "Three years after the attacks, footage of the bombers, taken by a CCTV camera nine days before the 
bombings, has been shown in court as part of the trials of three suspects alleged to be involved in the preparation of the bombings.", "Meanwhile, five men, all British nationals, were found guilty for their part in an unsuccessful plot to carry out fertiliser bombings in the UK.", "As it was revealed, some of them had met with two of the London subway bombers."]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["After the London bombings in July 2005, the British Police searched six houses in Leeds.", "It was confirmed that the explosives used, came from the building region.", "At the same time, they announced that all bombers were British citizens and that the surveillance cameras taped them on the day of the bombings near the attacked areas.", "In order to trace the suspects, two pictures of Hassib Hussain were published and a line was established, so that eye witnesses could testify information related to him.", "During September 2005, Scotland Yard revealed that the bombers had rehearsed the attacks nine days earlier, according to statements and CCTV tapes.", "Also, during the same period, the suspect for the attacks Hussain Osman was extradited to the UK by the Italian authorities.", "On December 2005, the survivors of the attacks participated in the promotion campaign for public inquiry into the terrorist's attacks, trying to push the government to give up the solution of \u0093narration of events by eye witnesses\u0094.", "A year after the attacks, Londoners honored the victims.", "Officials laid flowers at the location of the explosions and the prime minister made statements.", "On May 2007 five suspects were found guilty for their part in an unsuccessful plot to carry out bombings in the UK.", "While a year after, in the trial of the suspects for the attacks of 2005, the jury was shown footage of their moves.", "Finally, two years after the attacks, 52 stainless steel columns were placed in memory of the victims."]}, {"summarizer_id": "C", 
"summarizer_type": "reference", "text": ["London Metropolitan Police searched six houses in Leeds in order to find elements about the terrorist attacks on july 7.", "The raids were dangerous and long lasting.", "The suspects of the bombings in London are all British nationals.", "Four of them alleged suicide bombers.", "The Metropolitan Police released two photographs of one of the London bombers and made a request for any additional information about his whearabouts in order to succeed in arresting him.", "Another bomb suspect, Hussain Osman, who was accused of planting the failed bomb at the Underground station, arrived in London and had been arrested following his extradition from Italy.", "Survivors of the London Bombings set up an online petition calling for an independent Public Inquiry into the attacks.", "The British government rejected calls for a Public Inquiry.", "But survivors argue that a comprehensive investigation could teach valuable lessons which may help reduce the likelihood of future attacks, and improve the response capabilities of the emergency services.", "One year after the terrorist attacks, the Londoners honor the memory of victims.", "Simultaneously, a video links the attacks with Al-Qaeda.", "In 2007, five men were found guilty of plotting to cause an explosion and have been jailed with life sentences.", "Some of the fertiliser bomb conspirators had met with two of the London subway bombers.", "A court in the United Kingdom has been shown footage of the bombers that attacked London.", "Four years later, London remembers the vistims of terrorism and the police goes on investigating the case, thoroughly."]}]} 6 | {"instance_id": "M001", "summarizer_id": "2", "summarizer_type": "peer", "summary": {"summarizer_id": "2", "summarizer_type": "peer", "text": ["Profiles of the suspects in the July 7, 2005 bombings in London have been released.", "The July 7 bombings were a series of coordinated bombings which struck London on the morning of July 
7, 2005. 52 people died and approximately 700 were injured as a result of the bombings.", "The Duchess of Cornwall left a floral tribute for the families of the victims.", "\"The families will be campaigning for there to be a full public inquiry.\"", "Houses were searched in the Burley, Beeston and Holbeck areas.", "Last month a video showing Mohammad Siddique Khan saying goodbye to his child was shown in the court.", "Khawaja is considered an un-indicted co-conspirator with relation to the British case.", "West Yorkshire Police searched six houses in Leeds today in connection with the London bombings.", "The men arrive at King's Cross at 8:55 a.m. and are seen at Baker Steet at midday.", "The images show three of the bombers entering Luton station, before travelling to King's Cross station where they are pictured.", "The first photograph is a passport photo of Hasib Hussain and the second shows is a crop of a CCTV image from Luton station at 7:20 on the day of the attack.", "Initially thought to have been the suicide bomber on the Piccadilly Line train.", "52 people were killed and hundreds more injured on July 7th 2005 when four suicide bombers blew themselves up on three separate London Underground trains and a public bus.", "\"A narrative of events will not satisfy anybody.\""]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Tributes paid to the victims of the July 7 2005 London bombings Four years after the 7/7 London bombings, when 52 people were killed and about 700 were injured by a series of suicide bombings on transport networks, a 1.4 tonne stainless steel plaque with the names of the victims has been unveiled in Hyde Park.", "52 stainless steel columns standing 3.5m tall were also inaugurated.", "All four suicide bombers were UK residents.", "Al Qaeda has claimed responsibility for the attacks, but police are unsure of its exact role in the attack.", "Survivors of the bombings launched a campaign for public inquiry, 
which was rejected by the British government as too expensive and time-consuming.", "The government's attitude infuriated survivors and relatives of the dead.", "According to Scotland Yard, the suicide bombers, whose practice shows 'terrorist methodology', had rehearsed their plan nine days earlier.", "Police were able to trace their movements after recovering tickets and receipts from houses connected to the bombers which indicated the route of their trip.", "Three years after the attacks, footage of the bombers, taken by a CCTV camera nine days before the bombings, has been shown in court as part of the trials of three suspects alleged to be involved in the preparation of the bombings.", "Meanwhile, five men, all British nationals, were found guilty for their part in an unsuccessful plot to carry out fertiliser bombings in the UK.", "As it was revealed, some of them had met with two of the London subway bombers."]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["After the London bombings in July 2005, the British Police searched six houses in Leeds.", "It was confirmed that the explosives used, came from the building region.", "At the same time, they announced that all bombers were British citizens and that the surveillance cameras taped them on the day of the bombings near the attacked areas.", "In order to trace the suspects, two pictures of Hassib Hussain were published and a line was established, so that eye witnesses could testify information related to him.", "During September 2005, Scotland Yard revealed that the bombers had rehearsed the attacks nine days earlier, according to statements and CCTV tapes.", "Also, during the same period, the suspect for the attacks Hussain Osman was extradited to the UK by the Italian authorities.", "On December 2005, the survivors of the attacks participated in the promotion campaign for public inquiry into the terrorist's attacks, trying to push the government to give up the solution of \u0093narration 
of events by eye witnesses\u0094.", "A year after the attacks, Londoners honored the victims.", "Officials laid flowers at the location of the explosions and the prime minister made statements.", "On May 2007 five suspects were found guilty for their part in an unsuccessful plot to carry out bombings in the UK.", "While a year after, in the trial of the suspects for the attacks of 2005, the jury was shown footage of their moves.", "Finally, two years after the attacks, 52 stainless steel columns were placed in memory of the victims."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["London Metropolitan Police searched six houses in Leeds in order to find elements about the terrorist attacks on july 7.", "The raids were dangerous and long lasting.", "The suspects of the bombings in London are all British nationals.", "Four of them alleged suicide bombers.", "The Metropolitan Police released two photographs of one of the London bombers and made a request for any additional information about his whearabouts in order to succeed in arresting him.", "Another bomb suspect, Hussain Osman, who was accused of planting the failed bomb at the Underground station, arrived in London and had been arrested following his extradition from Italy.", "Survivors of the London Bombings set up an online petition calling for an independent Public Inquiry into the attacks.", "The British government rejected calls for a Public Inquiry.", "But survivors argue that a comprehensive investigation could teach valuable lessons which may help reduce the likelihood of future attacks, and improve the response capabilities of the emergency services.", "One year after the terrorist attacks, the Londoners honor the memory of victims.", "Simultaneously, a video links the attacks with Al-Qaeda.", "In 2007, five men were found guilty of plotting to cause an explosion and have been jailed with life sentences.", "Some of the fertiliser bomb conspirators had met with two of the London subway 
bombers.", "A court in the United Kingdom has been shown footage of the bombers that attacked London.", "Four years later, London remembers the vistims of terrorism and the police goes on investigating the case, thoroughly."]}]} 7 | {"instance_id": "M001", "summarizer_id": "A", "summarizer_type": "reference", "summary": {"summarizer_id": "A", "summarizer_type": "reference", "text": ["Tributes paid to the victims of the July 7 2005 London bombings Four years after the 7/7 London bombings, when 52 people were killed and about 700 were injured by a series of suicide bombings on transport networks, a 1.4 tonne stainless steel plaque with the names of the victims has been unveiled in Hyde Park.", "52 stainless steel columns standing 3.5m tall were also inaugurated.", "All four suicide bombers were UK residents.", "Al Qaeda has claimed responsibility for the attacks, but police are unsure of its exact role in the attack.", "Survivors of the bombings launched a campaign for public inquiry, which was rejected by the British government as too expensive and time-consuming.", "The government's attitude infuriated survivors and relatives of the dead.", "According to Scotland Yard, the suicide bombers, whose practice shows 'terrorist methodology', had rehearsed their plan nine days earlier.", "Police were able to trace their movements after recovering tickets and receipts from houses connected to the bombers which indicated the route of their trip.", "Three years after the attacks, footage of the bombers, taken by a CCTV camera nine days before the bombings, has been shown in court as part of the trials of three suspects alleged to be involved in the preparation of the bombings.", "Meanwhile, five men, all British nationals, were found guilty for their part in an unsuccessful plot to carry out fertiliser bombings in the UK.", "As it was revealed, some of them had met with two of the London subway bombers."]}, "references": [{"summarizer_id": "B", "summarizer_type": "reference", 
"text": ["After the London bombings in July 2005, the British Police searched six houses in Leeds.", "It was confirmed that the explosives used, came from the building region.", "At the same time, they announced that all bombers were British citizens and that the surveillance cameras taped them on the day of the bombings near the attacked areas.", "In order to trace the suspects, two pictures of Hassib Hussain were published and a line was established, so that eye witnesses could testify information related to him.", "During September 2005, Scotland Yard revealed that the bombers had rehearsed the attacks nine days earlier, according to statements and CCTV tapes.", "Also, during the same period, the suspect for the attacks Hussain Osman was extradited to the UK by the Italian authorities.", "On December 2005, the survivors of the attacks participated in the promotion campaign for public inquiry into the terrorist's attacks, trying to push the government to give up the solution of \u0093narration of events by eye witnesses\u0094.", "A year after the attacks, Londoners honored the victims.", "Officials laid flowers at the location of the explosions and the prime minister made statements.", "On May 2007 five suspects were found guilty for their part in an unsuccessful plot to carry out bombings in the UK.", "While a year after, in the trial of the suspects for the attacks of 2005, the jury was shown footage of their moves.", "Finally, two years after the attacks, 52 stainless steel columns were placed in memory of the victims."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["London Metropolitan Police searched six houses in Leeds in order to find elements about the terrorist attacks on july 7.", "The raids were dangerous and long lasting.", "The suspects of the bombings in London are all British nationals.", "Four of them alleged suicide bombers.", "The Metropolitan Police released two photographs of one of the London bombers and made a request for 
any additional information about his whearabouts in order to succeed in arresting him.", "Another bomb suspect, Hussain Osman, who was accused of planting the failed bomb at the Underground station, arrived in London and had been arrested following his extradition from Italy.", "Survivors of the London Bombings set up an online petition calling for an independent Public Inquiry into the attacks.", "The British government rejected calls for a Public Inquiry.", "But survivors argue that a comprehensive investigation could teach valuable lessons which may help reduce the likelihood of future attacks, and improve the response capabilities of the emergency services.", "One year after the terrorist attacks, the Londoners honor the memory of victims.", "Simultaneously, a video links the attacks with Al-Qaeda.", "In 2007, five men were found guilty of plotting to cause an explosion and have been jailed with life sentences.", "Some of the fertiliser bomb conspirators had met with two of the London subway bombers.", "A court in the United Kingdom has been shown footage of the bombers that attacked London.", "Four years later, London remembers the vistims of terrorism and the police goes on investigating the case, thoroughly."]}]} 8 | {"instance_id": "M001", "summarizer_id": "B", "summarizer_type": "reference", "summary": {"summarizer_id": "B", "summarizer_type": "reference", "text": ["After the London bombings in July 2005, the British Police searched six houses in Leeds.", "It was confirmed that the explosives used, came from the building region.", "At the same time, they announced that all bombers were British citizens and that the surveillance cameras taped them on the day of the bombings near the attacked areas.", "In order to trace the suspects, two pictures of Hassib Hussain were published and a line was established, so that eye witnesses could testify information related to him.", "During September 2005, Scotland Yard revealed that the bombers had rehearsed the 
attacks nine days earlier, according to statements and CCTV tapes.", "Also, during the same period, the suspect for the attacks Hussain Osman was extradited to the UK by the Italian authorities.", "On December 2005, the survivors of the attacks participated in the promotion campaign for public inquiry into the terrorist's attacks, trying to push the government to give up the solution of \u0093narration of events by eye witnesses\u0094.", "A year after the attacks, Londoners honored the victims.", "Officials laid flowers at the location of the explosions and the prime minister made statements.", "On May 2007 five suspects were found guilty for their part in an unsuccessful plot to carry out bombings in the UK.", "While a year after, in the trial of the suspects for the attacks of 2005, the jury was shown footage of their moves.", "Finally, two years after the attacks, 52 stainless steel columns were placed in memory of the victims."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Tributes paid to the victims of the July 7 2005 London bombings Four years after the 7/7 London bombings, when 52 people were killed and about 700 were injured by a series of suicide bombings on transport networks, a 1.4 tonne stainless steel plaque with the names of the victims has been unveiled in Hyde Park.", "52 stainless steel columns standing 3.5m tall were also inaugurated.", "All four suicide bombers were UK residents.", "Al Qaeda has claimed responsibility for the attacks, but police are unsure of its exact role in the attack.", "Survivors of the bombings launched a campaign for public inquiry, which was rejected by the British government as too expensive and time-consuming.", "The government's attitude infuriated survivors and relatives of the dead.", "According to Scotland Yard, the suicide bombers, whose practice shows 'terrorist methodology', had rehearsed their plan nine days earlier.", "Police were able to trace their movements after 
recovering tickets and receipts from houses connected to the bombers which indicated the route of their trip.", "Three years after the attacks, footage of the bombers, taken by a CCTV camera nine days before the bombings, has been shown in court as part of the trials of three suspects alleged to be involved in the preparation of the bombings.", "Meanwhile, five men, all British nationals, were found guilty for their part in an unsuccessful plot to carry out fertiliser bombings in the UK.", "As it was revealed, some of them had met with two of the London subway bombers."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["London Metropolitan Police searched six houses in Leeds in order to find elements about the terrorist attacks on july 7.", "The raids were dangerous and long lasting.", "The suspects of the bombings in London are all British nationals.", "Four of them alleged suicide bombers.", "The Metropolitan Police released two photographs of one of the London bombers and made a request for any additional information about his whearabouts in order to succeed in arresting him.", "Another bomb suspect, Hussain Osman, who was accused of planting the failed bomb at the Underground station, arrived in London and had been arrested following his extradition from Italy.", "Survivors of the London Bombings set up an online petition calling for an independent Public Inquiry into the attacks.", "The British government rejected calls for a Public Inquiry.", "But survivors argue that a comprehensive investigation could teach valuable lessons which may help reduce the likelihood of future attacks, and improve the response capabilities of the emergency services.", "One year after the terrorist attacks, the Londoners honor the memory of victims.", "Simultaneously, a video links the attacks with Al-Qaeda.", "In 2007, five men were found guilty of plotting to cause an explosion and have been jailed with life sentences.", "Some of the fertiliser bomb conspirators 
had met with two of the London subway bombers.", "A court in the United Kingdom has been shown footage of the bombers that attacked London.", "Four years later, London remembers the vistims of terrorism and the police goes on investigating the case, thoroughly."]}]} 9 | {"instance_id": "M002", "summarizer_id": "1", "summarizer_type": "peer", "summary": {"summarizer_id": "1", "summarizer_type": "peer", "text": ["The United Kingdom has frozen all bilateral business deals with Iran until all 15 British sailors and marines, who were detained by Iranian forces on March 23 are released.", "Iranian President Mahmoud Ahmadinejad on Wednesday announced that he would free the fifteen British captured navy personnel as a \"gift to the British people.\"", "Iran stated Tuesday that the sailors and marines are being treated \"humanely\" and are in \"good health.\"", "On March 23, the fifteen sailors and marines from the frigate HMS Cornwall had been inspecting a ship, in what the UK identified as Iraqi waters, when they were surrounded by Iranian gunboats and taken into custody. 
Iran claims the UK forces were in Iranian waters, and are still detaining the fifteen.", "The European Union has released a statement calling for the release of all 15 British sailors and marines being detained in Iran and that \"appropriate measures\" will be taken if Iran refuses to release them.", "Iran's National Security Council has announced that it will \"suspend\" the releasing of 15 British sailors and marines detained by Iranian forces on March 23."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Two years after the seizure of Royal Navy personnel by Iran, two inquiries, that examined the British Ministry's of Defence\u0092 handling, identified \u0093weaknesses in training, communications and the handling of intelligence\u0094 as well as \"collective failure of judgement\".", "The fifteen sailors and marines, from the frigate HMS Cornwall, were captured by Iranian border guards on March 23 in the Persian Gulf, while they were inspecting, in accordance with UN Security Council Resolution 1723, a ship believed to be smuggling cars into Iraq.", "The UK insisted they were operating in Iraqi waters, while Iran claimed they entered illegally into Iran's territorial waters and that they could face charges of espionage.", "If those charges were brought against them, the result would be heavy punishment by current Iranian law.", "On 28 March, British Prime Minister froze all bilateral business deals with Iran.", "The next day, Iran announced that it will \"suspend\" the releasing of 15 British personnel, due to the political ballyhoo by London.", "The EU called the Iranian seizure a \"clear breach\" of international law.", "Meanwhile, footage of all 15 British personnel had been broadcast on Iranian TV, with one of the sailors saying that the soldiers were in Iranian waters at the time of their detainment.", "The British government claimed that the confessions were extracted under duress.", "Few days later, Iranian President 
announced that he would free them as a \"gift to the British people\".", "The fifteen British navy personnel landed at Heathrow on 5 April, after thirteen days of captivity."]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["In March 2007 a British frigate with 15 Navy personnel, including a woman, have been captured by Iranian authorities, while they were investigating a ship suspected of smuggling cars in Iraq , on charges that they entered illegally into Iran's territorial waters.", "The detainees were taken in Tehran and unofficial information indicated that if they charge them with espionage, the result would be heavy punishment.", "Britain reacted immediately.", "\"COBRA emergency committee\" was activated.", "Blair expressed his disappointment and he demanded their immediate release.", "He also stated that he wishes a peaceful solution for the issue.", "U.S.A and E.U expressed their support and their claim for immediate release.", "The coordinates of Iranians about the location of the ship wasn\u0092t true according to Britain and threatened to cease all business deals with Iran.", "The next days and although originally Iranians stated that they would release the female prisoner, they changed terms due to the hot negotiations.", "The Iranians, however, stated that the prisoners were in good health and their detention was decent.", "After 13 days of captivity Ahmadinejad announced that they would be released as a \"gift to the British people.\"", "After a ceremony they returned to London.", "One detainee said that during their captivity they suffered mental stress and that they have admitted the category in order to deter further tensions.", "Finally, two surveys conducted after a month concluded that the rapture was the result of unfortunate accumulation of factors rather than human mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["Fifteen British Royal Navy personnel was been captured by Iranian authorities at 
gunpoint in the Persian Gulf off the Iraqi coast.", "According to Britain, a ship that was believed to be smuggling cars into Iraq, was been checked.", "In accordance with Iran, a British ship was approaching an Iranian site, which formerly belonged to Iraq.", "The sailors had been arrested for further investigation.", "The staff admitted the violation.", "The sailors would be held until five Iranian guards' had been released, who had been arrested in Iraq.", "Britain, supported by the USA and the Europe, required the crew's return and triggered the COBRA.", "The Iranians had threatened of reprisals if they kidnapped members of the Iranian Revolutionary Guard.", "The British crew may be accused of being spies and punished exemplarily.", "Britain had frozen every business arrangement with Iran and had presented evidence that the sailors were in Iraq.", "Footage of the British crew had been broadcasted on Iranian TV, there a sailor, with a black \"head scarf\", admitted that the soldiers were in Iranian waters at the time of their detainment.", "According to the Iranian Foreign Minister, the female sailor would be released immediately.", "Britain denied the possibility of release.", "Iran suspended the release of the British, due to the non-negotiable British stance.", "On April 4, Iranian President freed the crew.", "According to Britain the British confessions were extracted under pressure.", "The crew returned after 13 days of captivity.", "The crew described their capture and detention by Iran."]}]} 10 | {"instance_id": "M002", "summarizer_id": "2", "summarizer_type": "peer", "summary": {"summarizer_id": "2", "summarizer_type": "peer", "text": ["The sailors and marines, from the frigate HMS Cornwall, had been inspecting, in accordance with UN Security Council Resolution 1723, a ship that was believed to be smuggling cars into Iraq, though it was subsequently cleared after inspection when Iranian gunboats surrounded the sailors and arrested them at gunpoint.", 
"The sailors and marines were captured by Iranian border guards on March 23 in the Persian Gulf near the Shatt al Arab waterway.", "The United Kingdom has frozen all bilateral business deals with Iran until all 15 British sailors and marines.", "The United States called for the immediate release of the sailors.", "After the UK queried the statement by General Alireza Afshar, the Iranian government gave a revised position for the incident, now placing it inside Iranian territorial waters.", "Iran claims the UK forces were in Iranian waters, and are still detaining the fifteen.", "The fifteen British captured navy personnel detained by Iran would be released, saying they have been pardoned as a gift to the British people.", "Iranian media said that the British sailors 'shouted for joy' at the news.", "The Australian reported that an internet website \"run by associates of Mahmoud Ahmadinejad\" states that the 15 British sailors who were arrested by Iranian Revolutionary Guards could face charges of espionage.", "Ahmadinejad met with the detainees shortly after a press conference where he announced that the release will be immediate, and that they will be taken to the airport.", "The Royal Navy insists that they were operating in Iraqi waters."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Two years after the seizure of Royal Navy personnel by Iran, two inquiries, that examined the British Ministry's of Defence\u0092 handling, identified \u0093weaknesses in training, communications and the handling of intelligence\u0094 as well as \"collective failure of judgement\".", "The fifteen sailors and marines, from the frigate HMS Cornwall, were captured by Iranian border guards on March 23 in the Persian Gulf, while they were inspecting, in accordance with UN Security Council Resolution 1723, a ship believed to be smuggling cars into Iraq.", "The UK insisted they were operating in Iraqi waters, while Iran claimed they entered illegally 
into Iran's territorial waters and that they could face charges of espionage.", "If those charges were brought against them, the result would be heavy punishment by current Iranian law.", "On 28 March, British Prime Minister froze all bilateral business deals with Iran.", "The next day, Iran announced that it will \"suspend\" the releasing of 15 British personnel, due to the political ballyhoo by London.", "The EU called the Iranian seizure a \"clear breach\" of international law.", "Meanwhile, footage of all 15 British personnel had been broadcast on Iranian TV, with one of the sailors saying that the soldiers were in Iranian waters at the time of their detainment.", "The British government claimed that the confessions were extracted under duress.", "Few days later, Iranian President announced that he would free them as a \"gift to the British people\".", "The fifteen British navy personnel landed at Heathrow on 5 April, after thirteen days of captivity."]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["In March 2007 a British frigate with 15 Navy personnel, including a woman, have been captured by Iranian authorities, while they were investigating a ship suspected of smuggling cars in Iraq , on charges that they entered illegally into Iran's territorial waters.", "The detainees were taken in Tehran and unofficial information indicated that if they charge them with espionage, the result would be heavy punishment.", "Britain reacted immediately.", "\"COBRA emergency committee\" was activated.", "Blair expressed his disappointment and he demanded their immediate release.", "He also stated that he wishes a peaceful solution for the issue.", "U.S.A and E.U expressed their support and their claim for immediate release.", "The coordinates of Iranians about the location of the ship wasn\u0092t true according to Britain and threatened to cease all business deals with Iran.", "The next days and although originally Iranians stated that they would release 
the female prisoner, they changed terms due to the hot negotiations.", "The Iranians, however, stated that the prisoners were in good health and their detention was decent.", "After 13 days of captivity Ahmadinejad announced that they would be released as a \"gift to the British people.\"", "After a ceremony they returned to London.", "One detainee said that during their captivity they suffered mental stress and that they have admitted the category in order to deter further tensions.", "Finally, two surveys conducted after a month concluded that the rapture was the result of unfortunate accumulation of factors rather than human mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["Fifteen British Royal Navy personnel was been captured by Iranian authorities at gunpoint in the Persian Gulf off the Iraqi coast.", "According to Britain, a ship that was believed to be smuggling cars into Iraq, was been checked.", "In accordance with Iran, a British ship was approaching an Iranian site, which formerly belonged to Iraq.", "The sailors had been arrested for further investigation.", "The staff admitted the violation.", "The sailors would be held until five Iranian guards' had been released, who had been arrested in Iraq.", "Britain, supported by the USA and the Europe, required the crew's return and triggered the COBRA.", "The Iranians had threatened of reprisals if they kidnapped members of the Iranian Revolutionary Guard.", "The British crew may be accused of being spies and punished exemplarily.", "Britain had frozen every business arrangement with Iran and had presented evidence that the sailors were in Iraq.", "Footage of the British crew had been broadcasted on Iranian TV, there a sailor, with a black \"head scarf\", admitted that the soldiers were in Iranian waters at the time of their detainment.", "According to the Iranian Foreign Minister, the female sailor would be released immediately.", "Britain denied the possibility of release.", 
"Iran suspended the release of the British, due to the non-negotiable British stance.", "On April 4, Iranian President freed the crew.", "According to Britain the British confessions were extracted under pressure.", "The crew returned after 13 days of captivity.", "The crew described their capture and detention by Iran."]}]} 11 | {"instance_id": "M002", "summarizer_id": "A", "summarizer_type": "reference", "summary": {"summarizer_id": "A", "summarizer_type": "reference", "text": ["Two years after the seizure of Royal Navy personnel by Iran, two inquiries, that examined the British Ministry's of Defence\u0092 handling, identified \u0093weaknesses in training, communications and the handling of intelligence\u0094 as well as \"collective failure of judgement\".", "The fifteen sailors and marines, from the frigate HMS Cornwall, were captured by Iranian border guards on March 23 in the Persian Gulf, while they were inspecting, in accordance with UN Security Council Resolution 1723, a ship believed to be smuggling cars into Iraq.", "The UK insisted they were operating in Iraqi waters, while Iran claimed they entered illegally into Iran's territorial waters and that they could face charges of espionage.", "If those charges were brought against them, the result would be heavy punishment by current Iranian law.", "On 28 March, British Prime Minister froze all bilateral business deals with Iran.", "The next day, Iran announced that it will \"suspend\" the releasing of 15 British personnel, due to the political ballyhoo by London.", "The EU called the Iranian seizure a \"clear breach\" of international law.", "Meanwhile, footage of all 15 British personnel had been broadcast on Iranian TV, with one of the sailors saying that the soldiers were in Iranian waters at the time of their detainment.", "The British government claimed that the confessions were extracted under duress.", "Few days later, Iranian President announced that he would free them as a \"gift to the British 
people\".", "The fifteen British navy personnel landed at Heathrow on 5 April, after thirteen days of captivity."]}, "references": [{"summarizer_id": "B", "summarizer_type": "reference", "text": ["In March 2007 a British frigate with 15 Navy personnel, including a woman, have been captured by Iranian authorities, while they were investigating a ship suspected of smuggling cars in Iraq , on charges that they entered illegally into Iran's territorial waters.", "The detainees were taken in Tehran and unofficial information indicated that if they charge them with espionage, the result would be heavy punishment.", "Britain reacted immediately.", "\"COBRA emergency committee\" was activated.", "Blair expressed his disappointment and he demanded their immediate release.", "He also stated that he wishes a peaceful solution for the issue.", "U.S.A and E.U expressed their support and their claim for immediate release.", "The coordinates of Iranians about the location of the ship wasn\u0092t true according to Britain and threatened to cease all business deals with Iran.", "The next days and although originally Iranians stated that they would release the female prisoner, they changed terms due to the hot negotiations.", "The Iranians, however, stated that the prisoners were in good health and their detention was decent.", "After 13 days of captivity Ahmadinejad announced that they would be released as a \"gift to the British people.\"", "After a ceremony they returned to London.", "One detainee said that during their captivity they suffered mental stress and that they have admitted the category in order to deter further tensions.", "Finally, two surveys conducted after a month concluded that the rapture was the result of unfortunate accumulation of factors rather than human mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["Fifteen British Royal Navy personnel was been captured by Iranian authorities at gunpoint in the Persian Gulf off the Iraqi 
coast.", "According to Britain, a ship that was believed to be smuggling cars into Iraq, was been checked.", "In accordance with Iran, a British ship was approaching an Iranian site, which formerly belonged to Iraq.", "The sailors had been arrested for further investigation.", "The staff admitted the violation.", "The sailors would be held until five Iranian guards' had been released, who had been arrested in Iraq.", "Britain, supported by the USA and the Europe, required the crew's return and triggered the COBRA.", "The Iranians had threatened of reprisals if they kidnapped members of the Iranian Revolutionary Guard.", "The British crew may be accused of being spies and punished exemplarily.", "Britain had frozen every business arrangement with Iran and had presented evidence that the sailors were in Iraq.", "Footage of the British crew had been broadcasted on Iranian TV, there a sailor, with a black \"head scarf\", admitted that the soldiers were in Iranian waters at the time of their detainment.", "According to the Iranian Foreign Minister, the female sailor would be released immediately.", "Britain denied the possibility of release.", "Iran suspended the release of the British, due to the non-negotiable British stance.", "On April 4, Iranian President freed the crew.", "According to Britain the British confessions were extracted under pressure.", "The crew returned after 13 days of captivity.", "The crew described their capture and detention by Iran."]}]} 12 | {"instance_id": "M002", "summarizer_id": "B", "summarizer_type": "reference", "summary": {"summarizer_id": "B", "summarizer_type": "reference", "text": ["In March 2007 a British frigate with 15 Navy personnel, including a woman, have been captured by Iranian authorities, while they were investigating a ship suspected of smuggling cars in Iraq , on charges that they entered illegally into Iran's territorial waters.", "The detainees were taken in Tehran and unofficial information indicated that if they 
charge them with espionage, the result would be heavy punishment.", "Britain reacted immediately.", "\"COBRA emergency committee\" was activated.", "Blair expressed his disappointment and he demanded their immediate release.", "He also stated that he wishes a peaceful solution for the issue.", "U.S.A and E.U expressed their support and their claim for immediate release.", "The coordinates of Iranians about the location of the ship wasn\u0092t true according to Britain and threatened to cease all business deals with Iran.", "The next days and although originally Iranians stated that they would release the female prisoner, they changed terms due to the hot negotiations.", "The Iranians, however, stated that the prisoners were in good health and their detention was decent.", "After 13 days of captivity Ahmadinejad announced that they would be released as a \"gift to the British people.\"", "After a ceremony they returned to London.", "One detainee said that during their captivity they suffered mental stress and that they have admitted the category in order to deter further tensions.", "Finally, two surveys conducted after a month concluded that the rapture was the result of unfortunate accumulation of factors rather than human mistake."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Two years after the seizure of Royal Navy personnel by Iran, two inquiries, that examined the British Ministry's of Defence\u0092 handling, identified \u0093weaknesses in training, communications and the handling of intelligence\u0094 as well as \"collective failure of judgement\".", "The fifteen sailors and marines, from the frigate HMS Cornwall, were captured by Iranian border guards on March 23 in the Persian Gulf, while they were inspecting, in accordance with UN Security Council Resolution 1723, a ship believed to be smuggling cars into Iraq.", "The UK insisted they were operating in Iraqi waters, while Iran claimed they entered illegally into 
Iran's territorial waters and that they could face charges of espionage.", "If those charges were brought against them, the result would be heavy punishment by current Iranian law.", "On 28 March, British Prime Minister froze all bilateral business deals with Iran.", "The next day, Iran announced that it will \"suspend\" the releasing of 15 British personnel, due to the political ballyhoo by London.", "The EU called the Iranian seizure a \"clear breach\" of international law.", "Meanwhile, footage of all 15 British personnel had been broadcast on Iranian TV, with one of the sailors saying that the soldiers were in Iranian waters at the time of their detainment.", "The British government claimed that the confessions were extracted under duress.", "Few days later, Iranian President announced that he would free them as a \"gift to the British people\".", "The fifteen British navy personnel landed at Heathrow on 5 April, after thirteen days of captivity."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["Fifteen British Royal Navy personnel was been captured by Iranian authorities at gunpoint in the Persian Gulf off the Iraqi coast.", "According to Britain, a ship that was believed to be smuggling cars into Iraq, was been checked.", "In accordance with Iran, a British ship was approaching an Iranian site, which formerly belonged to Iraq.", "The sailors had been arrested for further investigation.", "The staff admitted the violation.", "The sailors would be held until five Iranian guards' had been released, who had been arrested in Iraq.", "Britain, supported by the USA and the Europe, required the crew's return and triggered the COBRA.", "The Iranians had threatened of reprisals if they kidnapped members of the Iranian Revolutionary Guard.", "The British crew may be accused of being spies and punished exemplarily.", "Britain had frozen every business arrangement with Iran and had presented evidence that the sailors were in Iraq.", "Footage of the 
British crew had been broadcasted on Iranian TV, there a sailor, with a black \"head scarf\", admitted that the soldiers were in Iranian waters at the time of their detainment.", "According to the Iranian Foreign Minister, the female sailor would be released immediately.", "Britain denied the possibility of release.", "Iran suspended the release of the British, due to the non-negotiable British stance.", "On April 4, Iranian President freed the crew.", "According to Britain the British confessions were extracted under pressure.", "The crew returned after 13 days of captivity.", "The crew described their capture and detention by Iran."]}]} -------------------------------------------------------------------------------- /qaeval/tests/generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/tests/generation/__init__.py -------------------------------------------------------------------------------- /qaeval/tests/generation/model_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import unittest 4 | 5 | from qaeval.generation.model import QuestionGenerationModel 6 | 7 | 8 | @pytest.mark.skipif('GENERATION_MODEL' not in os.environ, reason='Generation model environment variable not set') 9 | class TestGenerationModel(unittest.TestCase): 10 | def test_generation(self): 11 | model = QuestionGenerationModel(os.environ['GENERATION_MODEL']) 12 | 13 | # "The superjumbo Airbus A380" 14 | question = model.generate('The superjumbo Airbus A380 , the world \'s largest commercial airliner , took off Wednesday into cloudy skies over southwestern France for its second test flight .', 15 | 0, 26) 16 | assert question == 'What world\'s largest commercial airliner took off Wednesday into cloudy skies over southwestern France for its second test flight?' 
17 | 18 | # "the world 's largest commercial airliner" 19 | question = model.generate( 20 | 'The superjumbo Airbus A380 , the world \'s largest commercial airliner , took off Wednesday into cloudy skies over southwestern France for its second test flight .', 21 | 29, 69) 22 | assert question == 'What superjumbo Airbus A380 took off Wednesday into cloudy skies over southwestern France for its second test flight?' 23 | -------------------------------------------------------------------------------- /qaeval/tests/metric_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pytest 4 | import unittest 5 | from typing import List 6 | 7 | from qaeval import QAEval, FIXTURES_ROOT 8 | 9 | 10 | @pytest.mark.skipif( 11 | "GENERATION_MODEL" not in os.environ, 12 | reason="`GENERATION_MODEL` environment variable not set", 13 | ) 14 | @pytest.mark.skipif( 15 | "ANSWERING_MODEL" not in os.environ, 16 | reason="`ANSWERING_MODEL` environment variable not set", 17 | ) 18 | class TestQAEval(unittest.TestCase): 19 | def setUp(self) -> None: 20 | self.summaries = [] 21 | self.references_list = [] 22 | with open(f"{FIXTURES_ROOT}/multiling2011.jsonl", "r") as f: 23 | for line in f: 24 | data = json.loads(line) 25 | summary = data["summary"]["text"] 26 | references = [reference["text"] for reference in data["references"]] 27 | self.summaries.append(summary) 28 | self.references_list.append(references) 29 | 30 | def _check_output(self, metric: QAEval, expected_output: List) -> None: 31 | actual_output = metric.score_batch(self.summaries, self.references_list) 32 | assert len(expected_output) == len(actual_output) 33 | for expected, actual in zip(expected_output, actual_output): 34 | assert len(expected) == len(actual) == 1 35 | expected = expected["qa-eval"] 36 | actual = actual["qa-eval"] 37 | assert len(expected) == len(actual) 38 | for metric in expected.keys(): 39 | assert expected[metric] == 
pytest.approx(actual[metric], abs=1e-5) 40 | 41 | def test_qaeval(self): 42 | # This is a regression test, not necessarily a test for correctness 43 | metric = QAEval( 44 | generation_model_path=os.environ["GENERATION_MODEL"], 45 | answering_model_dir=os.environ["ANSWERING_MODEL"], 46 | ) 47 | expected_output = [ 48 | { 49 | "qa-eval": { 50 | "is_answered": 0.2171952736318408, 51 | "em": 0.03078358208955224, 52 | "f1": 0.05688114487088367, 53 | } 54 | }, 55 | { 56 | "qa-eval": { 57 | "is_answered": 0.2706778606965174, 58 | "em": 0.08286691542288557, 59 | "f1": 0.11367400349443259, 60 | } 61 | }, 62 | { 63 | "qa-eval": { 64 | "is_answered": 0.4552238805970149, 65 | "em": 0.05223880597014925, 66 | "f1": 0.10360696517412935, 67 | } 68 | }, 69 | { 70 | "qa-eval": { 71 | "is_answered": 0.2671408582089552, 72 | "em": 0.04582555970149253, 73 | "f1": 0.05402803689883914, 74 | } 75 | }, 76 | { 77 | "qa-eval": { 78 | "is_answered": 0.17126063232225966, 79 | "em": 0.025276841598459315, 80 | "f1": 0.04173576561636263, 81 | } 82 | }, 83 | { 84 | "qa-eval": { 85 | "is_answered": 0.3291829383548209, 86 | "em": 0.029159756771697066, 87 | "f1": 0.0543755246092705, 88 | } 89 | }, 90 | { 91 | "qa-eval": { 92 | "is_answered": 0.34836235489220563, 93 | "em": 0.05223880597014925, 94 | "f1": 0.09381412591922542, 95 | } 96 | }, 97 | { 98 | "qa-eval": { 99 | "is_answered": 0.4337987481945113, 100 | "em": 0.04537794896485315, 101 | "f1": 0.12145356515842792, 102 | } 103 | }, 104 | { 105 | "qa-eval": { 106 | "is_answered": 0.44427039821776665, 107 | "em": 0.06434837092731831, 108 | "f1": 0.10272833079850623, 109 | } 110 | }, 111 | { 112 | "qa-eval": { 113 | "is_answered": 0.40391255917571706, 114 | "em": 0.09642160957950431, 115 | "f1": 0.13482779720666102, 116 | } 117 | }, 118 | { 119 | "qa-eval": { 120 | "is_answered": 0.5345864661654135, 121 | "em": 0.12349624060150374, 122 | "f1": 0.16393273976257167, 123 | } 124 | }, 125 | { 126 | "qa-eval": { 127 | "is_answered": 0.5204365079365079, 
128 | "em": 0.12678571428571428, 129 | "f1": 0.16151234567901235, 130 | } 131 | }, 132 | ] 133 | self._check_output(metric, expected_output) 134 | 135 | @pytest.mark.skipif( 136 | "LERC_MODEL" not in os.environ, 137 | reason="`LERC_MODEL` environment variable not set", 138 | ) 139 | @pytest.mark.skipif( 140 | "LERC_PRETRAINED_MODEL" not in os.environ, 141 | reason="`LERC_PRETRAINED_MODEL` environment variable not set", 142 | ) 143 | def test_qaeval_with_lerc(self): 144 | # This is a regression test, not necessarily a test for correctness 145 | metric = QAEval( 146 | generation_model_path=os.environ["GENERATION_MODEL"], 147 | answering_model_dir=os.environ["ANSWERING_MODEL"], 148 | use_lerc=True, 149 | lerc_model_path=os.environ["LERC_MODEL"], 150 | lerc_pretrained_model_path=os.environ["LERC_PRETRAINED_MODEL"], 151 | ) 152 | expected_output = [ 153 | { 154 | "qa-eval": { 155 | "is_answered": 0.2171952736318408, 156 | "em": 0.03078358208955224, 157 | "f1": 0.05688114487088367, 158 | "lerc": 0.5280342313984585, 159 | } 160 | }, 161 | { 162 | "qa-eval": { 163 | "is_answered": 0.2706778606965174, 164 | "em": 0.08286691542288557, 165 | "f1": 0.11367400349443259, 166 | "lerc": 0.8588525844061404, 167 | } 168 | }, 169 | { 170 | "qa-eval": { 171 | "is_answered": 0.4552238805970149, 172 | "em": 0.05223880597014925, 173 | "f1": 0.10360696517412935, 174 | "lerc": 1.2307390170310861, 175 | } 176 | }, 177 | { 178 | "qa-eval": { 179 | "is_answered": 0.2671408582089552, 180 | "em": 0.04582555970149253, 181 | "f1": 0.05402803689883914, 182 | "lerc": 0.6782244059549116, 183 | } 184 | }, 185 | { 186 | "qa-eval": { 187 | "is_answered": 0.17126063232225966, 188 | "em": 0.025276841598459315, 189 | "f1": 0.04173576561636263, 190 | "lerc": 0.40871678001285994, 191 | } 192 | }, 193 | { 194 | "qa-eval": { 195 | "is_answered": 0.3291829383548209, 196 | "em": 0.029159756771697066, 197 | "f1": 0.0543755246092705, 198 | "lerc": 0.6477515654560587, 199 | } 200 | }, 201 | { 202 | "qa-eval": { 
203 | "is_answered": 0.34836235489220563, 204 | "em": 0.05223880597014925, 205 | "f1": 0.09381412591922542, 206 | "lerc": 0.947292007320556, 207 | } 208 | }, 209 | { 210 | "qa-eval": { 211 | "is_answered": 0.4337987481945113, 212 | "em": 0.04537794896485315, 213 | "f1": 0.12145356515842792, 214 | "lerc": 1.2629075305115793, 215 | } 216 | }, 217 | { 218 | "qa-eval": { 219 | "is_answered": 0.44427039821776665, 220 | "em": 0.06434837092731831, 221 | "f1": 0.10272833079850623, 222 | "lerc": 1.1977039740821571, 223 | } 224 | }, 225 | { 226 | "qa-eval": { 227 | "is_answered": 0.40391255917571706, 228 | "em": 0.09642160957950431, 229 | "f1": 0.13482779720666102, 230 | "lerc": 1.2360802221434326, 231 | } 232 | }, 233 | { 234 | "qa-eval": { 235 | "is_answered": 0.5345864661654135, 236 | "em": 0.12349624060150374, 237 | "f1": 0.16393273976257167, 238 | "lerc": 1.5575424717221045, 239 | } 240 | }, 241 | { 242 | "qa-eval": { 243 | "is_answered": 0.5204365079365079, 244 | "em": 0.12678571428571428, 245 | "f1": 0.16151234567901235, 246 | "lerc": 1.4713040575976408, 247 | } 248 | }, 249 | ] 250 | self._check_output(metric, expected_output) 251 | 252 | @pytest.mark.skipif( 253 | "LERC_MODEL" not in os.environ, 254 | reason="`LERC_MODEL` environment variable not set", 255 | ) 256 | @pytest.mark.skipif( 257 | "LERC_PRETRAINED_MODEL" not in os.environ, 258 | reason="`LERC_PRETRAINED_MODEL` environment variable not set", 259 | ) 260 | def test_return_qa_pairs(self): 261 | metric = QAEval( 262 | generation_model_path=os.environ["GENERATION_MODEL"], 263 | answering_model_dir=os.environ["ANSWERING_MODEL"], 264 | use_lerc=True, 265 | lerc_model_path=os.environ["LERC_MODEL"], 266 | lerc_pretrained_model_path=os.environ["LERC_PRETRAINED_MODEL"], 267 | ) 268 | 269 | summaries = [ 270 | "Dan walked to the bakery this morning.", 271 | "He bought some scones today", 272 | ] 273 | references_list = [ 274 | ["Dan went to buy scones earlier this morning."], 275 | ["Dan went to buy scones earlier 
this morning."], 276 | ] 277 | 278 | results_list = metric.score_batch(summaries, references_list, return_qa_pairs=True) 279 | assert len(results_list) == 2 280 | metrics, qa_pairs_list = results_list[0] 281 | assert metrics["qa-eval"]["is_answered"] == 1.0 282 | assert metrics["qa-eval"]["em"] == 0.5 283 | assert metrics["qa-eval"]["f1"] == 0.5 284 | self.assertAlmostEqual(metrics["qa-eval"]["lerc"], 3.171376943588257, places=4) 285 | assert len(qa_pairs_list) == 1 286 | qa_pairs = qa_pairs_list[0] 287 | assert len(qa_pairs) == 2 288 | assert ( 289 | qa_pairs[0]["question"]["question"] 290 | == "Who went to buy scones earlier this morning?" 291 | ) 292 | assert qa_pairs[0]["prediction"]["prediction"] == "Dan" 293 | assert qa_pairs[0]["prediction"]["start"] == 0 294 | assert qa_pairs[0]["prediction"]["end"] == 3 295 | assert qa_pairs[0]["prediction"]["is_answered"] == 1.0 296 | assert qa_pairs[0]["prediction"]["em"] == 1.0 297 | assert qa_pairs[0]["prediction"]["f1"] == 1.0 298 | self.assertAlmostEqual( 299 | qa_pairs[0]["prediction"]["lerc"], 5.035197734832764, places=4 300 | ) 301 | assert ( 302 | qa_pairs[1]["question"]["question"] 303 | == "What did Dan go to buy earlier this morning?" 
304 | ) 305 | assert qa_pairs[1]["prediction"]["prediction"] == "bakery" 306 | assert qa_pairs[1]["prediction"]["start"] == 18 307 | assert qa_pairs[1]["prediction"]["end"] == 24 308 | assert qa_pairs[1]["prediction"]["is_answered"] == 1.0 309 | assert qa_pairs[1]["prediction"]["em"] == 0.0 310 | assert qa_pairs[1]["prediction"]["f1"] == 0.0 311 | self.assertAlmostEqual( 312 | qa_pairs[1]["prediction"]["lerc"], 1.30755615234375, places=4 313 | ) 314 | 315 | metrics, qa_pairs_list = results_list[1] 316 | assert metrics["qa-eval"]["is_answered"] == 0.5 317 | assert metrics["qa-eval"]["em"] == 0.5 318 | assert metrics["qa-eval"]["f1"] == 0.5 319 | self.assertAlmostEqual(metrics["qa-eval"]["lerc"], 2.492440700531006, places=4) 320 | assert len(qa_pairs_list) == 1 321 | qa_pairs = qa_pairs_list[0] 322 | assert len(qa_pairs) == 2 323 | assert ( 324 | qa_pairs[0]["question"]["question"] 325 | == "Who went to buy scones earlier this morning?" 326 | ) 327 | assert qa_pairs[0]["prediction"]["prediction"] == "He" 328 | assert qa_pairs[0]["prediction"]["start"] == 0 329 | assert qa_pairs[0]["prediction"]["end"] == 2 330 | assert qa_pairs[0]["prediction"]["is_answered"] == 0.0 331 | assert qa_pairs[0]["prediction"]["em"] == 0.0 332 | assert qa_pairs[0]["prediction"]["f1"] == 0.0 333 | assert qa_pairs[0]["prediction"]["lerc"] == 0.0 334 | assert ( 335 | qa_pairs[1]["question"]["question"] 336 | == "What did Dan go to buy earlier this morning?" 
337 | ) 338 | assert qa_pairs[1]["prediction"]["prediction"] == "scones" 339 | assert qa_pairs[1]["prediction"]["start"] == 15 340 | assert qa_pairs[1]["prediction"]["end"] == 21 341 | assert qa_pairs[1]["prediction"]["is_answered"] == 1.0 342 | assert qa_pairs[1]["prediction"]["em"] == 1.0 343 | assert qa_pairs[1]["prediction"]["f1"] == 1.0 344 | self.assertAlmostEqual( 345 | qa_pairs[1]["prediction"]["lerc"], 4.984881401062012, places=4 346 | ) 347 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/tests/scoring/__init__.py -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/tests/scoring/scorers/__init__.py -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/em_test.py: -------------------------------------------------------------------------------- 1 | from qaeval.scoring.scorers import ExactMatchScorer 2 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 3 | 4 | 5 | class TestExactMatchScorer(TestScorer): 6 | @classmethod 7 | def setUpClass(cls) -> None: 8 | cls.scorer = ExactMatchScorer() 9 | 10 | def test_keys(self): 11 | assert self.scorer.keys() == {'em'} 12 | 13 | def test_default_scores(self): 14 | assert self.scorer.default_scores() == {'em': 0.0} 15 | 16 | def test_is_answered(self): 17 | # the transformer library accepts "a jogger" and "the jogger" for exact match 18 | self.assert_expected_output( 19 | self.scorer, 20 | {'em': (1 / 3 + 1 / 1) / 2}, 21 | [{'em': 1 / 3}, {'em': 1 / 
1}], 22 | [[{'em': 0.0}, {'em': 1.0}, {'em': 0.0}], [{'em': 1.0}]] 23 | ) 24 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/f1_test.py: -------------------------------------------------------------------------------- 1 | from qaeval.scoring.scorers import F1Scorer 2 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 3 | 4 | 5 | class TestF1Scorer(TestScorer): 6 | @classmethod 7 | def setUpClass(cls) -> None: 8 | cls.scorer = F1Scorer() 9 | 10 | def test_keys(self): 11 | assert self.scorer.keys() == {'f1'} 12 | 13 | def test_default_scores(self): 14 | assert self.scorer.default_scores() == {'f1': 0.0} 15 | 16 | def test_is_answered(self): 17 | self.assert_expected_output( 18 | self.scorer, 19 | {'f1': (1 / 3 + 1 / 1) / 2}, 20 | [{'f1': 1 / 3}, {'f1': 1 / 1}], 21 | [[{'f1': 0.0}, {'f1': 1.0}, {'f1': 0.0}], [{'f1': 1.0}]] 22 | ) 23 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/is_answered_test.py: -------------------------------------------------------------------------------- 1 | from qaeval.scoring.scorers.is_answered import IsAnsweredScorer 2 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 3 | 4 | 5 | class TestIsAnsweredScorer(TestScorer): 6 | @classmethod 7 | def setUpClass(cls) -> None: 8 | cls.scorer = IsAnsweredScorer() 9 | 10 | def test_keys(self): 11 | assert self.scorer.keys() == {'is_answered'} 12 | 13 | def test_default_scores(self): 14 | assert self.scorer.default_scores() == {'is_answered': 0.0} 15 | 16 | def test_is_answered(self): 17 | self.assert_expected_output( 18 | self.scorer, 19 | {'is_answered': (2 / 3 + 1 / 1) / 2}, 20 | [{'is_answered': 2 / 3}, {'is_answered': 1 / 1}], 21 | [[{'is_answered': 1.0}, {'is_answered': 1.0}, {'is_answered': 0.0}], [{'is_answered': 1.0}]] 22 | ) 23 | -------------------------------------------------------------------------------- 
/qaeval/tests/scoring/scorers/lerc_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from qaeval.scoring.scorers import LERCScorer 5 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 6 | 7 | 8 | @pytest.mark.skipif('LERC_MODEL' not in os.environ or 'LERC_PRETRAINED' not in os.environ, reason='LERC environment variables not set') 9 | class TestLERCScorer(TestScorer): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | cls.scorer = LERCScorer( 13 | model_path=os.environ['LERC_MODEL'], 14 | pretrained_path=os.environ['LERC_PRETRAINED'], 15 | cuda_device=0 16 | ) 17 | 18 | def test_keys(self): 19 | assert self.scorer.keys() == {'lerc'} 20 | 21 | def test_default_scores(self): 22 | assert self.scorer.default_scores() == {'lerc': 0.0} 23 | 24 | def test_is_answered(self): 25 | self.assert_expected_output( 26 | # This is a regression test. It does not ensure these numbers are correct 27 | self.scorer, 28 | {'lerc': (2.5152266025543213 + 4.940724849700928) / 2}, 29 | [{'lerc': 2.5152266025543213}, {'lerc': 4.940724849700928}], 30 | [[{'lerc': 2.5210483074188232}, {'lerc': 5.024631500244141}, {'lerc': 0.0}], [{'lerc': 4.940724849700928}]] 31 | ) 32 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/meta_test.py: -------------------------------------------------------------------------------- 1 | from qaeval.scoring.scorers import IsAnsweredScorer, F1Scorer, MetaScorer 2 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 3 | 4 | 5 | class TestMetaScorer(TestScorer): 6 | @classmethod 7 | def setUpClass(cls) -> None: 8 | cls.scorer = MetaScorer([ 9 | IsAnsweredScorer(), F1Scorer(), 10 | ]) 11 | 12 | def test_keys(self): 13 | assert self.scorer.keys() == {'is_answered', 'f1'} 14 | 15 | def test_default_scores(self): 16 | assert self.scorer.default_scores() == {'is_answered': 0.0, 'f1': 0.0} 17 | 18 | 
def test_is_answered(self): 19 | self.assert_expected_output( 20 | self.scorer, 21 | {'is_answered': (2 / 3 + 1 / 1) / 2, 'f1': (1 / 3 + 1 / 1) / 2}, 22 | [{'is_answered': 2 / 3, 'f1': 1 / 3}, {'is_answered': 1 / 1, 'f1': 1 / 1}], 23 | [ 24 | [{'is_answered': 1.0, 'f1': 0.0}, {'is_answered': 1.0, 'f1': 1.0}, {'is_answered': 0.0, 'f1': 0.0}], 25 | [{'is_answered': 1.0, 'f1': 1.0}] 26 | ] 27 | ) 28 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/scorer_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from typing import Dict, List 3 | 4 | from qaeval.scoring.scorers import Scorer 5 | 6 | SUMMARY = '(CNN)Singer-songwriter David Crosby hit a jogger with his car Sunday evening, a spokesman said. The accident happened in Santa Ynez, California, near where Crosby lives. Crosby was driving at approximately 50 mph when he struck the jogger, according to California Highway Patrol Spokesman Don Clotworthy. The posted speed limit was 55. The jogger suffered multiple fractures, and was airlifted to a hospital in Santa Barbara, Clotworthy said.' 7 | REFERENCE = 'Accident happens in Santa Ynez, California, near where Crosby lives . The jogger suffered multiple fractures; his injuries are not believed to be life-threatening .' 
8 | 9 | QUESTIONS = [ 10 | [ 11 | 'What happens in Santa Ynez, California, near where Crosby lives?', 12 | 'Where in California does accident happen near where Crosby lives?', 13 | 'What did the jogger suffer multiple fractures for that are not believed to be life-threatening?', 14 | ], 15 | [ 16 | 'Who suffered multiple fractures?', 17 | ] 18 | ] 19 | ANSWERS = [ 20 | ['Accident', 'Santa Ynez', 'injuries'], 21 | ['The jogger'], 22 | ] 23 | ANSWER_OFFSETS = [ 24 | [(0, 8), (20, 30), (115, 122)], 25 | [(70, 80)], 26 | ] 27 | PREDICTIONS = [ 28 | ['hit a jogger', 'Santa Ynez', 'hit'], 29 | ['a jogger'], 30 | ] 31 | PREDICTION_OFFSETS = [ 32 | [(36, 48), (121, 131), (36, 39)], 33 | [(40, 48)], 34 | ] 35 | PROBABILITIES = [ 36 | [0.8, 0.6, 0.3], 37 | [0.5] 38 | ] 39 | NULL_PROBABILITIES = [ 40 | [0.3, 0.2, 0.6], 41 | [0.1] 42 | ] 43 | 44 | 45 | class TestScorer(unittest.TestCase): 46 | def assert_expected_output( 47 | self, 48 | scorer: Scorer, 49 | instance_scores: Dict[str, float], 50 | reference_scores_list: List[Dict[str, float]], 51 | question_scores_lists: List[List[Dict[str, float]]], 52 | ) -> None: 53 | for i in range(len(QUESTIONS)): 54 | actual_reference_scores, actual_question_scores_list = scorer.score_single_ref( 55 | SUMMARY, 56 | QUESTIONS[i], 57 | ANSWERS[i], 58 | PREDICTIONS[i], 59 | PROBABILITIES[i], 60 | NULL_PROBABILITIES[i] 61 | ) 62 | for key in scorer.keys(): 63 | self.assertAlmostEqual(reference_scores_list[i][key], actual_reference_scores[key], places=4) 64 | for expected, actual in zip(question_scores_lists[i], actual_question_scores_list): 65 | self.assertAlmostEqual(expected[key], actual[key], places=4) 66 | 67 | actual_instance_scores, actual_question_scores_lists = scorer.score_multi_ref( 68 | SUMMARY, 69 | QUESTIONS, 70 | ANSWERS, 71 | PREDICTIONS, 72 | PROBABILITIES, 73 | NULL_PROBABILITIES 74 | ) 75 | 76 | for key in scorer.keys(): 77 | self.assertAlmostEqual(instance_scores[key], actual_instance_scores[key], places=4) 78 | for 
expected_list, actual_list in zip(question_scores_lists, actual_question_scores_lists): 79 | for expected, actual in zip(expected_list, actual_list): 80 | self.assertAlmostEqual(expected[key], actual[key], places=4) 81 | -------------------------------------------------------------------------------- /qaeval/version.py: -------------------------------------------------------------------------------- 1 | _MAJOR = '0' 2 | _MINOR = '1' 3 | _PATCH = '0' 4 | 5 | VERSION = f'{_MAJOR}.{_MINOR}.{_PATCH}' 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | allennlp==1.1.0 2 | click==7.1.2 3 | edlib 4 | spacy==2.2.4 5 | torch==1.6.0 6 | transformers==3.0.2 7 | urllib3>=1.25.10 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # version.py defines the VERSION variable. 4 | # We use exec here so we don't import qaeval whilst setting up. 5 | VERSION = {} 6 | with open('qaeval/version.py', 'r') as version_file: 7 | exec(version_file.read(), VERSION) 8 | 9 | setuptools.setup( 10 | name='qaeval', 11 | version=VERSION['VERSION'], 12 | author='Daniel Deutsch', 13 | description='A package for evaluating the content of summaries through question-answering', 14 | url='https://github.com/danieldeutsch/qaeval', 15 | packages=setuptools.find_packages(), 16 | python_requires='>=3.6', 17 | install_requires=[ 18 | 'allennlp==1.1.0', 19 | 'click==7.1.2', 20 | 'edlib', 21 | 'spacy==2.2.4', 22 | 'torch==1.6.0', 23 | 'transformers==3.0.2', 24 | 'urllib3>=1.25.10' 25 | ] 26 | ) --------------------------------------------------------------------------------