├── .github └── workflows │ └── publish.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Readme.md ├── experiments └── generation │ ├── Readme.md │ ├── model.jsonnet │ ├── preprocess.py │ ├── setup.sh │ └── train.sh ├── qaeval ├── __init__.py ├── answer_selection.py ├── answering │ ├── __init__.py │ ├── model.py │ └── utils.py ├── generation │ ├── __init__.py │ ├── dataset_reader.py │ ├── model.py │ ├── predictor.py │ └── util.py ├── metric.py ├── scoring │ ├── __init__.py │ ├── lerc │ │ ├── __init__.py │ │ ├── lerc_dataset_reader.py │ │ ├── lerc_model.py │ │ ├── lerc_predictor.py │ │ └── pretrain_model.py │ └── scorers │ │ ├── __init__.py │ │ ├── em.py │ │ ├── f1.py │ │ ├── is_answered.py │ │ ├── lerc.py │ │ ├── meta.py │ │ └── scorer.py ├── tests │ ├── __init__.py │ ├── answer_selection_test.py │ ├── answering │ │ ├── __init__.py │ │ ├── model_test.py │ │ └── utils_test.py │ ├── fixtures │ │ └── multiling2011.jsonl │ ├── generation │ │ ├── __init__.py │ │ └── model_test.py │ ├── metric_test.py │ └── scoring │ │ ├── __init__.py │ │ └── scorers │ │ ├── __init__.py │ │ ├── em_test.py │ │ ├── f1_test.py │ │ ├── is_answered_test.py │ │ ├── lerc_test.py │ │ ├── meta_test.py │ │ └── scorer_test.py └── version.py ├── requirements.txt └── setup.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Publish 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | 
pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .idea -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## Unreleased 8 | 9 | ## [v0.1.0](https://github.com/danieldeutsch/qaeval/releases/tag/0.1.0) - 2021-08-02 10 | ### Added 11 | - Added an end-to-end implementation of the metric 12 | 13 | ## [v0.0.9](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.9) - 2021-06-23 14 | ### Fixed 15 | - Added `edlib` to `setup.py` and `requirements.txt`. 
16 | 17 | ## [v0.0.8](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.8) - 2021-06-15 18 | ### Added 19 | - Added trying to fix the predicted character offsets using an alignment algorithm 20 | - Added an option to return the QA result as a dict 21 | 22 | ### Changed 23 | - Refactored the `Scorer` code to be cleaner 24 | 25 | ### Fixed 26 | - Specifying the spacy version to 2.2.4 in `setup.py` 27 | 28 | ## [v0.0.7](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.7) - 2021-05-07 29 | ### Added 30 | - Added returning approximate character offsets in the context for the QA model's prediction 31 | 32 | ## [v0.0.6](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.6) - 2021-05-06 33 | ### Added 34 | - Added "silent" options for the question generation and answering models 35 | 36 | ## [v0.0.5](https://github.com/danieldeutsch/qaeval/releases/tag/0.0.5) - 2021-01-02 37 | ### Added 38 | - Added scoring predictions with [LERC](https://arxiv.org/abs/2010.03636) 39 | 40 | ### Changed 41 | - Changed the scoring interface with a breaking change 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # QAEval 2 | This repository contains the code for the QAEval metric from [Towards Question-Answering as an Automatic Metric for Evaluating the Content Quality of a Summary](http://arxiv.org/abs/2010.00490). 3 | We have included here only the minimal amount of code to run the metric, and it does not include the code to run the experiments in the paper. 4 | 5 | The easiest way to run the metric end-to-end is to use the wrapper implementation included in [SacreROUGE](https://github.com/danieldeutsch/sacrerouge/blob/master/doc/metrics/qaeval.md). 6 | 7 | The pretrained question generation model can be downloaded [here](https://drive.google.com/file/d/1vVhRgLtsQDAOmxYhY5PMPnxxHUyCOdQU/view?usp=sharing) and the pretrained question answering model can be downloaded [here](https://drive.google.com/file/d/1q2Z3FPP9AYNz0RJKHMlaweNhmLQoyPA8/view?usp=sharing). 
8 | 9 | ## Known Differences from Paper 10 | There are several known differences between the implementation here and the one we used for the experiments in the paper. 11 | 12 | - For the paper, we used a string equals and ROUGE-1 F1 with stemming to calculate the EM and F1 scores between the QA model's predicted answer and the ground-truth answer. 13 | This implementation uses the SQuAD EM/F1 implementations from the Transformers library. 14 | We made this decision to not create a dependency on ROUGE. 15 | 16 | - The AllenNLP version used here is 1.1.0. 17 | For the paper it was 1.0.0rc3. 18 | The 1.0.0rc3 version requires Transformers 2.8.0. 19 | After upgrading the AllenNLP version, we can now use Transformers 3.0.2, but this made the question-generation model used for the paper incompatible, so it had to be retrained. 20 | The retraining scripts are [here](experiments/generation/Readme.md). 21 | The required changes to the code for this were to pass `use_cache=False` to the BART call. 22 | 23 | ## Citation 24 | ``` 25 | @misc{deutsch2020questionanswering, 26 | title={Towards Question-Answering as an Automatic Metric for Evaluating the Content Quality of a Summary}, 27 | author={Daniel Deutsch and Tania Bedrax-Weiss and Dan Roth}, 28 | year={2020}, 29 | eprint={2010.00490}, 30 | archivePrefix={arXiv}, 31 | primaryClass={cs.CL} 32 | } 33 | ``` 34 | -------------------------------------------------------------------------------- /experiments/generation/Readme.md: -------------------------------------------------------------------------------- 1 | This directory contains the code to train the question-generation model. 2 | The original model used for the paper experiments could not be directly loaded by the code after updating the AllenNLP and Transformers packages, so we have retrained the model here. 
3 | 4 | To retrain the model, run: 5 | ``` 6 | sh experiments/generation/setup.sh 7 | sh experiments/generation/train.sh 8 | ``` -------------------------------------------------------------------------------- /experiments/generation/model.jsonnet: -------------------------------------------------------------------------------- 1 | local bert_model = "facebook/bart-large"; 2 | 3 | { 4 | "dataset_reader": { 5 | "type": "question_generation", 6 | "model_name": bert_model, 7 | "lazy": false 8 | }, 9 | "train_data_path": "experiments/generation/data/train.jsonl", 10 | "validation_data_path": "experiments/generation/data/valid.jsonl", 11 | "model": { 12 | "type": "question_generation", 13 | "model_name": bert_model, 14 | }, 15 | "data_loader": { 16 | "batch_sampler": { 17 | "type": "bucket", 18 | "batch_size": 16 19 | }, 20 | }, 21 | "trainer": { 22 | "checkpointer": { 23 | "num_serialized_models_to_keep": 1 24 | }, 25 | "num_epochs": 1, 26 | "cuda_device": 0, 27 | "grad_norm": 1, 28 | "optimizer": { 29 | "type": "huggingface_adamw", 30 | "lr": 3e-5, 31 | "betas": [0.9, 0.999], 32 | "eps": 1e-8, 33 | "correct_bias": true 34 | } 35 | }, 36 | "random_seed": 4, 37 | "numpy_seed": 5, 38 | "pytorch_seed": 6 39 | } -------------------------------------------------------------------------------- /experiments/generation/preprocess.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import sys 4 | 5 | 6 | def main(): 7 | with open(sys.argv[2], 'w') as out: 8 | with open(sys.argv[1], 'r') as f: 9 | reader = csv.reader(f, delimiter='\t') 10 | for i, row in enumerate(reader): 11 | if i == 0: 12 | continue 13 | 14 | question = row[2] 15 | answer = row[3] 16 | context = row[4] 17 | 18 | try: 19 | answer_start = context.lower().index(answer.lower()) 20 | answer_end = answer_start + len(answer) 21 | 22 | out.write(json.dumps({ 23 | 'context': context, 24 | 'answer': answer, 25 | 'answer_start': answer_start, 26 | 
'answer_end': answer_end, 27 | 'question': question 28 | }) + '\n') 29 | 30 | except ValueError: 31 | pass 32 | 33 | 34 | if __name__ == '__main__': 35 | main() -------------------------------------------------------------------------------- /experiments/generation/setup.sh: -------------------------------------------------------------------------------- 1 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 2 | 3 | # Download the data from the CodaLab worksheet 4 | mkdir -p ${DIR}/data 5 | wget https://worksheets.codalab.org/rest/bundles/0x36403eba6daf46acbc5729cb1680a001/contents/blob/ -O ${DIR}/data/train.tsv 6 | wget https://worksheets.codalab.org/rest/bundles/0x303c441ba0d04062a293a4b83c86af77/contents/blob/ -O ${DIR}/data/dev.tsv 7 | wget https://worksheets.codalab.org/rest/bundles/0x2a42519198824d9bbb60bbba0fe629b6/contents/blob/ -O ${DIR}/data/combined_neg_pos.tsv 8 | 9 | # Reformat 10 | python ${DIR}/preprocess.py ${DIR}/data/train.tsv ${DIR}/data/train.jsonl 11 | python ${DIR}/preprocess.py ${DIR}/data/dev.tsv ${DIR}/data/valid.jsonl 12 | python ${DIR}/preprocess.py ${DIR}/data/combined_neg_pos.tsv ${DIR}/data/combined_neg_pos.jsonl 13 | -------------------------------------------------------------------------------- /experiments/generation/train.sh: -------------------------------------------------------------------------------- 1 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 2 | 3 | rm -r ${DIR}/model 4 | 5 | allennlp train \ 6 | --include-package qaeval \ 7 | -s ${DIR}/model \ 8 | ${DIR}/model.jsonnet -------------------------------------------------------------------------------- /qaeval/__init__.py: -------------------------------------------------------------------------------- 1 | from qaeval.answer_selection import AnswerSelector 2 | from qaeval.answering.model import QuestionAnsweringModel 3 | from qaeval.generation.model import QuestionGenerationModel 4 | from qaeval.metric import QAEval 5 | from 
qaeval.version import VERSION as __version__ 6 | 7 | FIXTURES_ROOT = 'qaeval/tests/fixtures' 8 | -------------------------------------------------------------------------------- /qaeval/answer_selection.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from collections import namedtuple 3 | from spacy.tokens import Span 4 | from typing import List 5 | 6 | NP_CHUNKS_STRATEGY = 'np-chunks' 7 | MAX_NP_STRATEGY = 'max-np' 8 | NER_STRATEGY = 'ner' 9 | ALL_STRATEGY = 'all' 10 | STRATEGIES = [NP_CHUNKS_STRATEGY, MAX_NP_STRATEGY, NER_STRATEGY, ALL_STRATEGY] 11 | 12 | AnswerOffsets = namedtuple('Answer', ['start', 'end', 'sent_start', 'sent_end', 'text']) 13 | 14 | 15 | class AnswerSelector(object): 16 | def __init__(self, strategy: str): 17 | if strategy not in STRATEGIES: 18 | raise Exception(f'Unknown strategy: {strategy}') 19 | self.strategy = strategy 20 | self.nlp = spacy.load('en_core_web_sm') 21 | 22 | def _get_np_chunks_answers(self, sentence: Span) -> List[AnswerOffsets]: 23 | chunks = [] 24 | for chunk in sentence.noun_chunks: 25 | chunks.append(AnswerOffsets(chunk.start_char, chunk.end_char, sentence.start_char, sentence.end_char, str(chunk))) 26 | return chunks 27 | 28 | def _get_max_np_answers(self, sentence: Span) -> List[AnswerOffsets]: 29 | root = sentence.root 30 | nodes = [root] 31 | nps = [] 32 | 33 | while len(nodes) > 0: 34 | node = nodes.pop() 35 | 36 | # If the node is a noun, collect all of the tokens 37 | # which are descendants of this node 38 | recurse = True 39 | if node.pos_ in ['NOUN', 'PROPN']: 40 | min_index = node.i 41 | max_index = node.i 42 | stack = [node] 43 | while len(stack) > 0: 44 | current = stack.pop() 45 | min_index = min(min_index, current.i) 46 | max_index = max(max_index, current.i) 47 | for child in current.children: 48 | stack.append(child) 49 | 50 | sent_start_index = sentence[0].i 51 | 52 | # Because of parsing issues, we only take NPs if they are shorter than a given 
length 53 | num_tokens = max_index - min_index + 1 54 | if num_tokens <= 7: 55 | recurse = False 56 | span = sentence[min_index - sent_start_index:max_index + 1 - sent_start_index] 57 | nps.append(AnswerOffsets(span.start_char, span.end_char, sentence.start_char, sentence.end_char, str(span))) 58 | 59 | if recurse: 60 | # Otherwise, process all of this node's children 61 | for child in node.children: 62 | nodes.append(child) 63 | 64 | # Sort in order of appearance 65 | nps.sort(key=lambda offsets: offsets.start) 66 | return nps 67 | 68 | def _get_ner_answers(self, sentence: Span) -> List[AnswerOffsets]: 69 | ners = [] 70 | for entity in sentence.ents: 71 | if entity.label_ in ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'EVENT', 'WORK_OF_ART']: 72 | ners.append(AnswerOffsets(entity.start_char, entity.end_char, sentence.start_char, sentence.end_char, str(entity))) 73 | return ners 74 | 75 | def _get_all_answers(self, sentence: Span) -> List[AnswerOffsets]: 76 | answers = set() 77 | answers |= set(self._get_np_chunks_answers(sentence)) 78 | answers |= set(self._get_max_np_answers(sentence)) 79 | answers |= set(self._get_ner_answers(sentence)) 80 | 81 | # Sort in order of appearance 82 | answers = sorted(answers, key=lambda answer: (answer.start, answer.end)) 83 | return answers 84 | 85 | def select(self, text: str) -> List[AnswerOffsets]: 86 | """ 87 | Selects a list of noun phrases from the input `text. 
Each returned `AnswerOffsets` has: 88 | - `start`: the character index in `text` where the noun phrase starts 89 | - `end`: the *exclusive* character index in `text` where the noun phrase ends 90 | - `sent_start`: the character index in `text` where the sentence that this noun phrase 91 | is in starts 92 | - `sent_end`: the *exclusive* character index in `text` where the sentence that this 93 | noun phrase is in ends 94 | - `text`: the noun phrase as a string 95 | """ 96 | doc = self.nlp(text) 97 | answers = [] 98 | for sent in doc.sents: 99 | if self.strategy == NP_CHUNKS_STRATEGY: 100 | answers.extend(self._get_np_chunks_answers(sent)) 101 | elif self.strategy == MAX_NP_STRATEGY: 102 | answers.extend(self._get_max_np_answers(sent)) 103 | elif self.strategy == NER_STRATEGY: 104 | answers.extend(self._get_ner_answers(sent)) 105 | elif self.strategy == ALL_STRATEGY: 106 | answers.extend(self._get_all_answers(sent)) 107 | else: 108 | raise Exception(f'Unknown strategy: {self.strategy}') 109 | return answers 110 | 111 | def select_all(self, text_list: List[str]) -> List[List[AnswerOffsets]]: 112 | return [self.select(text) for text in text_list] -------------------------------------------------------------------------------- /qaeval/answering/__init__.py: -------------------------------------------------------------------------------- 1 | from qaeval.answering.model import QuestionAnsweringModel -------------------------------------------------------------------------------- /qaeval/answering/model.py: -------------------------------------------------------------------------------- 1 | # This file was edited from the run_squad.py file in the experiment repository 2 | import torch 3 | from torch.utils.data import DataLoader, SequentialSampler 4 | from tqdm import tqdm 5 | 6 | from transformers import ( 7 | AutoConfig, 8 | AutoModelForQuestionAnswering, 9 | AutoTokenizer, 10 | squad_convert_examples_to_features, 11 | ) 12 | from transformers.data.processors.squad import 
SquadResult, SquadExample 13 | 14 | from typing import Dict, List, Tuple, Union 15 | 16 | from qaeval.answering.utils import compute_predictions_logits_with_null, fix_answer_span, SpanFixError 17 | 18 | Prediction = Union[ 19 | Tuple[str, float, float], 20 | Tuple[str, float, float, Tuple[int, int]], 21 | Dict[str, Union[str, float, Tuple[int, int]]], 22 | ] 23 | 24 | 25 | class QuestionAnsweringModel(object): 26 | def __init__(self, 27 | model_dir: str, 28 | cuda_device: int = 0, 29 | batch_size: int = 8, 30 | silent: bool = True) -> None: 31 | self.config = AutoConfig.from_pretrained(model_dir) 32 | self.tokenizer = AutoTokenizer.from_pretrained(model_dir, do_lower_case=True) 33 | self.model = AutoModelForQuestionAnswering.from_pretrained(model_dir, config=self.config) 34 | if cuda_device >= 0: 35 | self.model.to(cuda_device) 36 | 37 | self.model_type = 'electra' 38 | self.cuda_device = cuda_device 39 | self.batch_size = batch_size 40 | self.max_seq_length = 384 41 | self.doc_stride = 128 42 | self.silent = silent 43 | 44 | def _to_list(self, tensor): 45 | return tensor.detach().cpu().tolist() 46 | 47 | def _try_fixing_offsets( 48 | self, 49 | contexts: List[str], 50 | predictions: Dict[str, str], 51 | offsets_dict: Dict[str, Tuple[int, int]], 52 | ) -> Dict[str, Tuple[int, int]]: 53 | """ 54 | Tries to fix the potentially noisy character offsets of the predictions in the `contexts`. 55 | The input and output end indices are exclusive. 
    # NOTE(review): the `def` line of this method fell outside the visible chunk;
    # the signature below is reconstructed from its only call site in `answer_all`
    # (`self._try_fixing_offsets(contexts, predictions, offsets)`) — confirm against the original.
    def _try_fixing_offsets(
        self,
        contexts: List[str],
        predictions: Dict[str, str],
        offsets_dict: Dict[str, Tuple[int, int]],
    ) -> Dict[str, Tuple[int, int]]:
        """
        Tries to repair each prediction's character offsets via `fix_answer_span` so that
        `context[start:end] == prediction`. Entries that cannot be fixed keep their original offsets.
        """
        new_offsets = {}

        for i, context in enumerate(contexts):
            index = str(i)

            prediction = predictions[index]
            pred_start, pred_end = offsets_dict[index]
            if context is None or prediction is None or pred_start is None or pred_end is None:
                # Nothing to fix (null prediction or missing offsets); pass the offsets through unchanged
                new_offsets[index] = (pred_start, pred_end)
            else:
                span = context[pred_start:pred_end]
                if span != prediction:
                    try:
                        pred_start, pred_end = fix_answer_span(prediction, span, pred_start, pred_end)
                    except SpanFixError:
                        # Best-effort: if the alignment fails, keep the original (noisy) offsets
                        pass
                new_offsets[index] = (pred_start, pred_end)
        return new_offsets

    def answer(
        self,
        question: str,
        context: str,
        return_offsets: bool = False,
        try_fixing_offsets: bool = True,
        return_dict: bool = False,
    ) -> Prediction:
        """
        Returns a tuple of (prediction, probability, null_probability). If `return_offsets = True`, the tuple
        will include rough character offsets of where the prediction is in the context. Because the tokenizer that
        the QA model uses does not support returning the character offsets from the BERT tokenization, we cannot
        directly provide exactly where the answer came from. However, the offsets should be pretty close to the
        prediction, and the prediction should be a substring of the offsets (modulo whitespace). If
        `return_offsets` and `try_fixing_offsets` are `True`, we will try to fix the character offsets via
        an alignment. See below.

        The `SquadExample` class maintains a list of whitespace separated tokens `doc_tokens` and a mapping
        from the context string characters to the token indices `char_to_word_offset`. Whitespace
        is included in the previous token. The `squad_convert_example_to_features` method takes each of these
        tokens and breaks it into the subtokens with the transformers tokenizer, which are passed into the model.
        It also keeps a mapping from the subtokens to the `doc_tokens` called `tok_to_orig_index`. The QA model
        predicts a span in the subtokens. In the `_get_char_offsets` method, we use these data structures to map
        from the subtoken span to character offsets. However, we cannot separate subtokens, so they are merged together.
        See the below example

            context: " My name is Dan!"
            doc_tokens: [My, name, is, Dan!]
            char_to_word_offset: [-1, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
            subtokens: [My, name, is, Dan, ##!]
            tok_to_orig_index: [0, 1, 2, 3, 3]

            prediction: "name is Dan"
            prediction subtokens: [name, is, Dan]
            prediction in doc_tokens: [name, is, Dan!]
            prediction in context: "name is Dan!"

        The prediction includes the extra whitespace between "is" and "Dan" as well as the "!"

        If `try_fixing_offsets=True`, we will try to fix the character offsets to be correct based on an alignment
        algorithm. We use the `edlib` python package to create a character alignment between the actual prediction
        string and the span given by the original offsets. We then update the offsets based on the alignment. If
        this procedure fails, the original offsets will be returned.
        """
        # Single-instance convenience wrapper around `answer_all`
        return self.answer_all(
            [(question, context)], return_offsets=return_offsets,
            try_fixing_offsets=try_fixing_offsets, return_dicts=return_dict
        )[0]

    def answer_all(
        self,
        input_data: List[Tuple[str, str]],
        return_offsets: bool = False,
        try_fixing_offsets: bool = True,
        return_dicts: bool = False,
    ) -> List[Prediction]:
        """
        Runs extractive QA for a batch of (question, context) pairs. See `answer()` for the meaning of the
        return values and the offset semantics. Results are returned in the same order as `input_data`.
        """
        # Convert all of the instances to squad examples. The list index doubles as the qas_id so
        # predictions can be re-aligned with the inputs afterwards.
        examples = []
        for i, (question, context) in enumerate(input_data):
            examples.append(SquadExample(
                qas_id=str(i),
                question_text=question,
                context_text=context,
                answer_text=None,
                start_position_character=None,
                title=None,
                is_impossible=True,
                answers=[]
            ))

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            doc_stride=self.doc_stride,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=1,
            tqdm_enabled=not self.silent
        )

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.batch_size)

        self.model.eval()
        all_results = []
        generator = eval_dataloader
        if not self.silent:
            generator = tqdm(generator, desc='Evaluating')

        for batch in generator:
            if self.cuda_device >= 0:
                batch = tuple(t.to(self.cuda_device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                feature_indices = batch[3]
                outputs = self.model(**inputs)

            # Collect per-feature start/end logits as SquadResults keyed by unique_id
            for i, feature_index in enumerate(feature_indices):
                eval_feature = features[feature_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [self._to_list(output[i]) for output in outputs]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        # Positional args: n_best_size=20, max_answer_length=30, do_lower_case=True,
        # verbose_logging=False, version_2_with_negative=True
        model_predictions = compute_predictions_logits_with_null(
            self.tokenizer,
            examples,
            features,
            all_results,
            20,
            30,
            True,
            False,
            True,
            return_offsets=return_offsets
        )

        if return_offsets:
            predictions, prediction_probs, no_answer_probs, offsets = model_predictions
            if try_fixing_offsets:
                contexts = [context for _, context in input_data]
                offsets = self._try_fixing_offsets(contexts, predictions, offsets)
        else:
            predictions, prediction_probs, no_answer_probs = model_predictions

        # Re-assemble the per-input results; dictionaries are keyed by the string qas_id
        results = []
        for i in range(len(input_data)):
            i = str(i)
            r = (predictions[i], prediction_probs[i], no_answer_probs[i])
            if return_dicts:
                r = {
                    'prediction': r[0],
                    'probability': r[1],
                    'null_probability': r[2],
                }

            if return_offsets:
                if return_dicts:
                    r['start'] = offsets[i][0]
                    r['end'] = offsets[i][1]
                else:
                    r = r + (offsets[i],)
            results.append(r)
        return results


# ---------------------------------------------------------------------------
# qaeval/answering/utils.py
# ---------------------------------------------------------------------------

import collections
import edlib
from typing import Tuple

from transformers.data.metrics.squad_metrics import (
    get_final_text,
    _get_best_indexes,
    _compute_softmax,
)


def _is_whitespace(c):
    # Mirrors the whitespace test used by the transformers SQuAD preprocessing
    # (0x202F is a narrow no-break space)
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False


def _get_char_offsets(example, pred_start, pred_end):
    """
    Maps a predicted (start, end) span over `example.doc_tokens` to character offsets in
    `example.context_text`. The returned end index will be exclusive.
    """
    if pred_start is None or pred_end is None:
        # This could happen if there's an edge case with no valid predictions. See where the prediction is "empty"
        return None, None

    # First/last character index of each whitespace-delimited token
    token_to_char_start = {}
    token_to_char_end = {}
    for char_index, token_index in enumerate(example.char_to_word_offset):
        if token_index not in token_to_char_start:
            token_to_char_start[token_index] = char_index
        token_to_char_end[token_index] = char_index

    # Any whitespace after the token is included in that token. Find the last non-whitespace character
    for token_index, end in token_to_char_end.items():
        if token_index == -1:
            # Whitespace at the beginning is mapped to token -1. We don't care about it
            continue
        while _is_whitespace(example.context_text[end]):
            end -= 1
            if end < 0:
                break
        if end < 0:
            raise Exception(f'Token end is less than 0.')
        token_to_char_end[token_index] = end + 1  # exclusive
    return token_to_char_start[pred_start], token_to_char_end[pred_end]
def compute_predictions_logits_with_null(
    tokenizer,
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    do_lower_case,
    verbose_logging,
    version_2_with_negative,
    return_offsets = False
):
    """
    Variant of transformers' `compute_predictions_logits` that, for `version_2_with_negative`,
    always predicts the best NON-null text and additionally returns the probability of that
    prediction and of the null answer (and, optionally, character offsets of the prediction).

    Returns `(all_predictions, all_probs, null_scores)` — all OrderedDicts keyed by `qas_id` —
    plus `offsets` when `return_offsets` is True.
    """
    # Group the sliding-window features by their source example
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
    )

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()
    all_probs = collections.OrderedDict()
    null_scores = collections.OrderedDict()
    offsets = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if version_2_with_negative:
                feature_null_score = result.start_logits[0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index],
                        )
                    )
        if version_2_with_negative:
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit,
                )
            )
        prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)

        # doc_start/doc_end (token indices into example.doc_tokens) are carried along so the
        # character offsets of the final prediction can be recovered later
        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit", "doc_start", "doc_end"]
        )

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index: (pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start: (orig_doc_end + 1)]

                tok_text = tokenizer.convert_tokens_to_string(tok_tokens)

                # tok_text = " ".join(tok_tokens)
                #
                # # De-tokenize WordPieces that have been split off.
                # tok_text = tok_text.replace(" ##", "")
                # tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                orig_doc_start = None
                orig_doc_end = None
                seen_predictions[final_text] = True

            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
                                          doc_start=orig_doc_start, doc_end=orig_doc_end))
        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit,
                                              doc_start=None, doc_end=None))

            # In very rare edge cases we could only have single null prediction.
            # So we just create a nonce prediction in this case to avoid failure.
            if len(nbest) == 1:
                nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0, doc_start=None,
                                                 doc_end=None))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0, doc_start=None,
                                          doc_end=None))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        best_non_null_entry_index = None
        for i, entry in enumerate(nbest):
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry
                    best_non_null_entry_index = i

        probs = _compute_softmax(total_scores)

        nbest_json = []
        null_prob = None
        best_prob = None
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            if entry.text == '':
                null_prob = probs[i]
            if i == best_non_null_entry_index:
                best_prob = probs[i]
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not version_2_with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # Always predict the best non-null text (the null probability is reported separately
            # instead of applying a score-difference threshold as the stock HF implementation does)
            all_predictions[example.qas_id] = best_non_null_entry.text
            all_probs[example.qas_id] = best_prob
            null_scores[example.qas_id] = null_prob
            offsets[example.qas_id] = _get_char_offsets(example, best_non_null_entry.doc_start,
                                                        best_non_null_entry.doc_end)

            # # predict "" iff the null score - the score of best non-null > threshold
            # score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
            # scores_diff_json[example.qas_id] = score_diff
            # if score_diff > null_score_diff_threshold:
            #     all_predictions[example.qas_id] = ""
            # else:
            #     all_predictions[example.qas_id] = best_non_null_entry.text
        all_nbest_json[example.qas_id] = nbest_json

    output = (all_predictions, all_probs, null_scores)
    if return_offsets:
        output = output + (offsets,)
    return output


class SpanFixError(Exception):
    # Raised by fix_answer_span when the prediction cannot be aligned to the document span
    pass


def fix_answer_span(prediction: str, document_span: str, start: int, end: int) -> Tuple[int, int]:
    """
    Tries to fix the answer span of the prediction, which may include some extra whitespace or special characters.

    # Parameters
    - `prediction`: the string output by the QA model
    - `document_span`: the span in the text given by the maybe noisy offsets. See `QuestionAnsweringModel.answer()`
      documentation for more information
    - `start`: the start character offset of `document_span` in the original text
    - `end`: the *exclusive* end character offset of the `document_span` in the original text

    # Returns
    The `start` and *exclusive* `end` character offsets of fixed character offsets of `prediction` in the
    original text.

    # Raises
    `SpanFixError` if the prediction is longer than the span or the alignment is ambiguous.
    """

    if len(prediction) > len(document_span):
        raise SpanFixError(f'Unexpected lengths: "{prediction}", "{document_span}"')

    # 'HW' (infix) mode aligns `prediction` anywhere inside `document_span`;
    # `locations` holds (start, inclusive-end) pairs of the best alignment(s)
    alignment = edlib.align(prediction, document_span, mode='HW', task='path')
    locations = alignment['locations']
    if len(locations) != 1:
        raise SpanFixError(f'Unable to compute span: "{prediction}", "{document_span}"')
    align_start, align_end = locations[0]

    # Shift the original offsets to the aligned sub-span; edlib's end is inclusive, ours exclusive
    start += align_start
    end -= len(document_span) - align_end
    end += 1
    return start, end
# ---------------------------------------------------------------------------
# qaeval/generation/dataset_reader.py
# ---------------------------------------------------------------------------

import json
from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import MetadataField, TextField
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from overrides import overrides
from typing import Any, Dict, Iterable, Optional

from qaeval.generation.util import SPAN_START_TOKEN, SPAN_END_TOKEN


@DatasetReader.register('question_generation')
class QuestionGenerationDatasetReader(DatasetReader):
    """
    Reads JSONL instances for question generation. Each line must contain `context`,
    `answer_start`, and `answer_end` (exclusive), and may contain `question` and `metadata`.
    The answer span is marked in the context with special start/end tokens before tokenization.
    """

    def __init__(self,
                 model_name: str,
                 lazy: bool = False):
        super().__init__(lazy=lazy)
        self.tokenizer = PretrainedTransformerTokenizer(model_name)
        self.token_indexers = {'tokens': PretrainedTransformerIndexer(model_name, namespace='tokens')}

        # Add the tokens which will mark the answer span
        self.tokenizer.tokenizer.add_tokens([SPAN_START_TOKEN, SPAN_END_TOKEN])

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line)
                context = data['context']
                start = data['answer_start']
                end = data['answer_end']
                # `question` is optional (absent at prediction time)
                question = data.pop('question', None)
                metadata = data.pop('metadata', {})
                yield self.text_to_instance(context, start, end, question, metadata)

    def _insert_span_symbols(self, context: str, start: int, end: int) -> str:
        # Wrap the answer span with the special marker tokens (extra spaces keep the
        # markers as separate tokens)
        return f'{context[:start]}{SPAN_START_TOKEN} {context[start:end]} {SPAN_END_TOKEN}{context[end:]}'

    @overrides
    def text_to_instance(self,
                         context: str,
                         start: int,
                         end: int,
                         question: Optional[str] = None,
                         metadata: Dict[str, Any] = None) -> Instance:
        """
        Creates an Instance. `start` and `end` should be the character offsets in `context` of the answer.
        `end` should be exclusive.
        """
        fields = {}
        metadata = metadata or {}

        answer = context[start:end]
        marked_context = self._insert_span_symbols(context, start, end)
        source_tokens = self.tokenizer.tokenize(marked_context)
        fields['source_tokens'] = TextField(source_tokens, self.token_indexers)
        metadata['answer'] = answer
        metadata['answer_start'] = start
        metadata['answer_end'] = end
        metadata['context'] = context
        metadata['marked_context'] = marked_context
        metadata['source_tokens'] = source_tokens

        if question is not None:
            # Only present when training/evaluating with gold questions
            target_tokens = self.tokenizer.tokenize(question)
            fields['target_tokens'] = TextField(target_tokens, self.token_indexers)
            metadata['question'] = question
            metadata['target_tokens'] = target_tokens

        fields['metadata'] = MetadataField(metadata)
        return Instance(fields)


# ---------------------------------------------------------------------------
# qaeval/generation/model.py
# ---------------------------------------------------------------------------

# Implementation largely based on https://github.com/allenai/allennlp-models/pull/35/

import math
import torch
import torch.nn.functional as F
from allennlp.data import Vocabulary
from allennlp.data.fields.text_field import TextFieldTensors
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.models import Model
from allennlp.nn.beam_search import BeamSearch
from allennlp.nn.util import sequence_cross_entropy_with_logits
from allennlp.predictors import Predictor
from overrides import overrides
from transformers import BartForConditionalGeneration
from tqdm import tqdm
from typing import Any, Dict, List, Tuple

# Dataset reader and predictor imports are necessary to find the classes when the
# predictor is loaded from the archive
from qaeval.generation.dataset_reader import QuestionGenerationDatasetReader
from qaeval.generation.predictor import QuestionGenerationPredictor
from qaeval.generation.util import ALL_SPECIAL_TOKENS


class QuestionGenerationModel(object):
    """
    Thin inference wrapper around an archived AllenNLP question-generation predictor.
    Inputs are (text, answer_start, answer_end) triples; outputs are generated questions.
    """

    def __init__(self,
                 model_path: str,
                 cuda_device: int = 0,
                 batch_size: int = 8,
                 silent: bool = True):
        self.predictor = Predictor.from_path(model_path, predictor_name='question_generation', cuda_device=cuda_device)
        self.batch_size = batch_size
        self.silent = silent

    def generate(self, text: str, start: int, end: int) -> str:
        # Single-instance convenience wrapper around `generate_all`
        return self.generate_all([(text, start, end)])[0]

    def generate_all(self, inputs: List[Tuple[str, int, int]]) -> List[str]:
        """
        Generates a question for each input. The input is a list of tuples of the text and start and ending character
        offsets of the answer. The ending character offset should be exclusive.
        """
        input_jsons = []
        for text, start, end in inputs:
            input_jsons.append({
                'text': text,
                'start': start,
                'end': end
            })
        outputs = []
        num_batches = int(math.ceil(len(input_jsons) / self.batch_size))

        generator = range(0, len(input_jsons), self.batch_size)
        if not self.silent:
            generator = tqdm(generator, total=num_batches, desc='Generating questions')

        for i in generator:
            batch = input_jsons[i:i + self.batch_size]
            outputs.extend(self.predictor.predict_batch_json(batch))
        assert len(input_jsons) == len(outputs)
        return [output['predicted_question'] for output in outputs]


@Model.register('question_generation')
class _QuestionGenerationModel(Model):
    """
    BART-based sequence-to-sequence model for question generation. Training computes a
    label-smoothed cross-entropy loss; inference runs AllenNLP beam search over BART's decoder.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 model_name: str,
                 max_decoding_steps: int = 100,
                 beam_size: int = 4) -> None:
        super().__init__(vocab)
        self.bart = BartForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = PretrainedTransformerTokenizer(model_name)

        # Increase the size of Bart's vocabulary to account for the new special
        # tokens that were added. Method found from https://github.com/huggingface/transformers/issues/3446
        # comment on June 12.
        vocab_size = self.bart.config.vocab_size
        self.bart.resize_token_embeddings(vocab_size + len(ALL_SPECIAL_TOKENS))

        self._start_id = self.bart.config.bos_token_id
        self._end_id = self.bart.config.eos_token_id
        self._pad_id = self.bart.config.pad_token_id

        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(
            self._end_id, max_steps=max_decoding_steps, beam_size=beam_size or 1
        )

    @overrides
    def forward(self,
                source_tokens: TextFieldTensors,
                metadata: List[Dict[str, Any]],
                target_tokens: TextFieldTensors = None) -> Dict[str, Any]:
        source_ids = source_tokens['tokens']['token_ids']
        source_mask = source_tokens['tokens']['mask']

        output_dict = {'metadata': metadata}
        if target_tokens is not None:
            # Calculate loss (teacher forcing: decoder sees the target shifted right by one)
            target_ids = target_tokens['tokens']['token_ids']
            target_mask = target_tokens['tokens']['mask']

            logits = self.bart(
                input_ids=source_ids,
                attention_mask=source_mask,
                decoder_input_ids=target_ids[:, :-1].contiguous(),
                decoder_attention_mask=target_mask[:, :-1].contiguous(),
                use_cache=False
            )[0]

            # The BART paper mentions label smoothing of 0.1 for sequence generation tasks
            loss = sequence_cross_entropy_with_logits(
                logits,
                target_ids[:, 1:].contiguous(),
                target_mask[:, 1:].contiguous(),
                label_smoothing=0.1,
                average='token'
            )
            output_dict['loss'] = loss

        if not self.training:
            # Run inference: This differs from the original code which
            # includes the decoder_start_id
            initial_decoder_ids = torch.tensor(
                [[self._start_id]],
                dtype=source_ids.dtype,
                device=source_ids.device,
            ).repeat(source_ids.shape[0], 1)

            # NOTE(review): 'inital_state' is a (pre-existing) misspelling of 'initial_state';
            # left unchanged here since this is a documentation-only pass
            inital_state = {
                "input_ids": source_ids,
                "input_mask": source_mask,
                "encoder_states": None,
            }
            beam_result = self._beam_search.search(
                initial_decoder_ids, inital_state, self.take_step
            )

            # Keep only the highest-scoring beam per instance
            predictions = beam_result[0]
            max_pred_indices = (
                beam_result[1].argmax(dim=-1).view(-1, 1, 1).expand(-1, -1, predictions.shape[-1])
            )
            predictions = predictions.gather(dim=1, index=max_pred_indices).squeeze(dim=1)

            output_dict["predicted_ids"] = predictions
            output_dict["log_probabilities"] = (
                beam_result[1].gather(dim=-1, index=max_pred_indices[..., 0]).squeeze(dim=-1)
            )

            self.make_output_human_readable(output_dict)

        return output_dict

    @staticmethod
    def _decoder_cache_to_dict(decoder_cache):
        # Flattens BART's nested decoder cache (list of per-layer dicts of dicts) into a flat
        # dict keyed by (layer, attention_name, tensor_name) so it can live in the beam-search state
        cache_dict = {}
        for layer_index, layer_cache in enumerate(decoder_cache):
            for attention_name, attention_cache in layer_cache.items():
                for tensor_name, cache_value in attention_cache.items():
                    key = (layer_index, attention_name, tensor_name)
                    cache_dict[key] = cache_value
        return cache_dict

    @staticmethod
    def _dict_to_decoder_cache(cache_dict):
        # Inverse of `_decoder_cache_to_dict`: rebuilds the nested per-layer cache structure
        decoder_cache = []
        for key, cache_value in cache_dict.items():
            # Split key and extract index and dict keys
            layer_idx, attention_name, tensor_name = key
            # Extend decoder_cache to fit layer_idx + 1 layers
            decoder_cache = decoder_cache + [{} for _ in range(layer_idx + 1 - len(decoder_cache))]
            cache = decoder_cache[layer_idx]
            if attention_name not in cache:
                cache[attention_name] = {}
            assert tensor_name not in cache[attention_name]
            cache[attention_name][tensor_name] = cache_value
        return decoder_cache

    def take_step(
        self, last_predictions: torch.Tensor, state: Dict[str, torch.Tensor], step: int
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """
        One step of AllenNLP beam search: given the last predicted token ids and the carried
        state (encoder outputs + flattened decoder cache), returns next-token log-probabilities
        and the updated state.
        """
        if len(last_predictions.shape) == 1:
            last_predictions = last_predictions.unsqueeze(-1)

        # Only the last predictions are needed for the decoder, but we need to pad the decoder ids
        # to not mess up the positional embeddings in the decoder.
        padding_size = 0
        if step > 0:
            padding_size = step + 1
            padding = torch.full(
                (last_predictions.shape[0], padding_size),
                self._pad_id,
                dtype=last_predictions.dtype,
                device=last_predictions.device,
            )
            last_predictions = torch.cat([padding, last_predictions], dim=-1)

        decoder_cache = None
        decoder_cache_dict = {
            k: (state[k].contiguous() if state[k] is not None else None)
            for k in state
            if k not in {"input_ids", "input_mask", "encoder_states"}
        }
        if len(decoder_cache_dict) != 0:
            decoder_cache = self._dict_to_decoder_cache(decoder_cache_dict)

        log_probabilities = None
        for i in range(padding_size, last_predictions.shape[1]):
            encoder_outputs = (
                (state["encoder_states"],) if state["encoder_states"] is not None else None
            )
            outputs = self.bart(
                input_ids=state["input_ids"],
                attention_mask=state["input_mask"],
                encoder_outputs=encoder_outputs,
                decoder_input_ids=last_predictions[:, : i + 1],
                decoder_cached_states=decoder_cache,
                generation_mode=True,
                use_cache=True,
            )

            decoder_log_probabilities = F.log_softmax(outputs[0][:, 0], dim=-1)

            if log_probabilities is None:
                log_probabilities = decoder_log_probabilities
            else:
                # Accumulate the log-probability of the path taken so far
                idx = last_predictions[:, i].view(-1, 1)
                log_probabilities = decoder_log_probabilities + log_probabilities.gather(
                    dim=-1, index=idx
                )

            decoder_cache = outputs[1][1]

            state["encoder_states"] = outputs[2]

        if decoder_cache is not None:
            decoder_cache_dict = self._decoder_cache_to_dict(decoder_cache)
            state.update(decoder_cache_dict)

        return log_probabilities, state

    @overrides
    def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, Any]:
        """
        # Parameters
        output_dict : `Dict[str, torch.Tensor]`
            A dictionary containing a batch of predictions with key `predicted_ids`. The tensor should have
            shape `(batch_size, max_sequence_length)`
        # Returns
        Dict[str, Any]
            Original `output_dict` with an additional `predicted_question` key that maps to a list of
            decoded question strings.
        """
        predicted_ids = output_dict["predicted_ids"]
        predictions = []
        for i in range(predicted_ids.shape[0]):
            token_ids = predicted_ids[i].tolist()
            # Strip trailing end-of-sequence tokens before decoding
            while len(token_ids) > 0 and token_ids[-1] == self._end_id:
                token_ids.pop()
            predictions.append(self.tokenizer.tokenizer.decode(token_ids).strip())
        output_dict["predicted_question"] = predictions

        return output_dict


# ---------------------------------------------------------------------------
# qaeval/generation/predictor.py
# ---------------------------------------------------------------------------

import json
from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors import Predictor
from overrides import overrides


@Predictor.register('question_generation')
class QuestionGenerationPredictor(Predictor):
    """Predictor that turns {'text', 'start', 'end'} JSON into question-generation Instances."""

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        return self._dataset_reader.text_to_instance(context=json_dict['text'],
                                                     start=json_dict['start'],
                                                     end=json_dict['end'])

    @overrides
    def dump_line(self, outputs: JsonDict) -> str:
        # NOTE(review): relies on 'input_dict' being present in the instance metadata;
        # the dataset reader shown here does not set that key — confirm against the caller
        input_dict = outputs['metadata']['input_dict']
        input_dict['question'] = outputs['predicted_question']
        return json.dumps(input_dict) + '\n'
-------------------------------------------------------------------------------- /qaeval/generation/util.py: -------------------------------------------------------------------------------- 1 | SPAN_START_TOKEN = '' 2 | SPAN_END_TOKEN = '' 3 | 4 | ALL_SPECIAL_TOKENS = [SPAN_START_TOKEN, SPAN_END_TOKEN] -------------------------------------------------------------------------------- /qaeval/metric.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | from tqdm import tqdm 4 | from typing import Any, Dict, List, Tuple, Union 5 | 6 | from qaeval import AnswerSelector, QuestionAnsweringModel, QuestionGenerationModel 7 | from qaeval.answer_selection import NP_CHUNKS_STRATEGY 8 | from qaeval.scoring.scorers import ( 9 | ExactMatchScorer, 10 | F1Scorer, 11 | IsAnsweredScorer, 12 | LERCScorer, 13 | MetaScorer, 14 | ) 15 | 16 | MetricsDict = Dict[str, float] 17 | SummaryType = Union[str, List[str]] 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class QAEval(object): 23 | def __init__( 24 | self, 25 | generation_model_path: str, 26 | answering_model_dir: str, 27 | answer_selection_strategy: str = NP_CHUNKS_STRATEGY, 28 | cuda_device: int = 0, 29 | generation_batch_size: int = 8, 30 | answering_batch_size: int = 8, 31 | use_lerc: bool = False, 32 | lerc_model_path: str = None, 33 | lerc_pretrained_model_path: str = None, 34 | lerc_batch_size: int = 8, 35 | verbose: bool = False, 36 | ) -> None: 37 | self.answer_selector = AnswerSelector(answer_selection_strategy) 38 | self.question_generator = QuestionGenerationModel( 39 | generation_model_path, 40 | cuda_device=cuda_device, 41 | batch_size=generation_batch_size, 42 | silent=not verbose, 43 | ) 44 | self.question_answerer = QuestionAnsweringModel( 45 | answering_model_dir, 46 | cuda_device=cuda_device, 47 | batch_size=answering_batch_size, 48 | silent=not verbose, 49 | ) 50 | self.verbose = verbose 51 | 52 | scorers = [IsAnsweredScorer(), 
ExactMatchScorer(), F1Scorer()] 53 | if use_lerc: 54 | if lerc_model_path is None or lerc_pretrained_model_path is None: 55 | raise Exception( 56 | f"If `use_lerc` is `True`, `lerc_model_path` and `lerc_pretrained_model_path` must not be `None`" 57 | ) 58 | scorers.append( 59 | LERCScorer( 60 | lerc_model_path, 61 | lerc_pretrained_model_path, 62 | cuda_device, 63 | lerc_batch_size, 64 | ) 65 | ) 66 | self.scorer = MetaScorer(scorers) 67 | 68 | def _flatten_summaries(self, summaries: List[SummaryType]) -> List[str]: 69 | flat_summaries = [] 70 | for summary in summaries: 71 | if isinstance(summary, list): 72 | summary = " ".join(summary) 73 | flat_summaries.append(summary) 74 | return flat_summaries 75 | 76 | def _flatten_references_list( 77 | self, references_list: List[List[SummaryType]] 78 | ) -> List[List[str]]: 79 | # Flattens all of the summaries so they are `str` instead of `List[str]` 80 | flat_references_list = [] 81 | for references in references_list: 82 | flat_references_list.append([]) 83 | for reference in references: 84 | if isinstance(reference, list): 85 | reference = " ".join(reference) 86 | flat_references_list[-1].append(reference) 87 | return flat_references_list 88 | 89 | def _get_empty_summary_mask( 90 | self, summaries: List[str], references_list: List[List[str]] 91 | ) -> Tuple[List[str], List[List[str]], List[bool]]: 92 | # This will identify any summaries that have empty text. 
The output will be the list of non-empty summaries 93 | # with their corresponding references plus a list of booleans that is parallel will the input `summaries` 94 | # which mark whether or not they are empty 95 | is_empty_list = [] 96 | non_empty_summaries = [] 97 | non_empty_references_list = [] 98 | 99 | for summary, references in zip(summaries, references_list): 100 | if len(summary.strip()) > 0: 101 | is_empty_list.append(False) 102 | non_empty_summaries.append(summary) 103 | non_empty_references_list.append(references) 104 | else: 105 | is_empty_list.append(True) 106 | return non_empty_summaries, non_empty_references_list, is_empty_list 107 | 108 | def _get_question_id( 109 | self, instance_index: int, reference_index: int, start: int, end: int 110 | ) -> str: 111 | m = hashlib.md5() 112 | m.update(str(instance_index).encode()) 113 | m.update(str(reference_index).encode()) 114 | m.update(str(start).encode()) 115 | m.update(str(end).encode()) 116 | return m.hexdigest() 117 | 118 | def _generate_qa_pairs( 119 | self, references_list: List[List[str]] 120 | ) -> List[List[List[Dict[str, Any]]]]: 121 | # This will generate the question-answer pairs for each reference. Since references may be repeated, 122 | # we first deduplicate the references to minimize the expensive work. 
123 | # 124 | # `reference_to_index` keeps track of where each of the unique references are in `distinct_references_list` 125 | reference_to_index = {} 126 | distinct_references_list = [] 127 | 128 | # Maps from (i, j) to the index in the `distinct_references_list` 129 | mapping = {} 130 | for i, references in enumerate(references_list): 131 | for j, reference in enumerate(references): 132 | if reference not in reference_to_index: 133 | reference_to_index[reference] = len(distinct_references_list) 134 | distinct_references_list.append(reference) 135 | mapping[(i, j)] = reference_to_index[reference] 136 | 137 | # Select the answers 138 | logger.info( 139 | f"Selecting answers from {len(distinct_references_list)} distinct summaries" 140 | ) 141 | answers_list = self.answer_selector.select_all(distinct_references_list) 142 | num_answers = sum(len(answers) for answers in answers_list) 143 | logger.info(f"Selected {num_answers} answers in total") 144 | 145 | # Generate the questions 146 | generation_inputs = [] 147 | for reference, answers in zip(distinct_references_list, answers_list): 148 | for answer in answers: 149 | sentence = reference[answer.sent_start : answer.sent_end] 150 | start = answer.start - answer.sent_start 151 | end = answer.end - answer.sent_start 152 | generation_inputs.append((sentence, start, end)) 153 | 154 | logger.info(f"Generating questions for {len(generation_inputs)} answers") 155 | question_list = self.question_generator.generate_all(generation_inputs) 156 | logger.info("Finished generating questions") 157 | 158 | # Remap the questions to align with the answers 159 | index = 0 160 | remapped_questions = [] 161 | for i, answers in enumerate(answers_list): 162 | remapped_questions.append([]) 163 | for _ in answers: 164 | remapped_questions[-1].append(question_list[index]) 165 | index += 1 166 | assert len(remapped_questions[i]) == len(answers_list[i]) 167 | assert len(remapped_questions) == len(answers_list) 168 | 169 | # Remap output to align 
with the inputs 170 | # qa_pairs_lists[summary_index][reference_index] = [(q, a)] 171 | qa_pairs_lists = [] 172 | for i, references in enumerate(references_list): 173 | qa_pairs_lists.append([]) 174 | for j, reference in enumerate(references): 175 | index = mapping[(i, j)] 176 | qa_pairs_lists[-1].append([]) 177 | for question, answer in zip( 178 | remapped_questions[index], answers_list[index] 179 | ): 180 | question_id = self._get_question_id(i, j, answer.start, answer.end) 181 | qa_pairs_lists[-1][-1].append( 182 | { 183 | "question_id": question_id, 184 | "question": question, 185 | "answer": answer.text, 186 | "sent_start": answer.sent_start, 187 | "sent_end": answer.sent_end, 188 | "answer_start": answer.start, 189 | "answer_end": answer.end, 190 | } 191 | ) 192 | return qa_pairs_lists 193 | 194 | def _get_prediction_id(self, prediction_index: int): 195 | m = hashlib.md5() 196 | m.update(str(prediction_index).encode()) 197 | return m.hexdigest() 198 | 199 | def _answer_questions( 200 | self, summaries: List[str], qa_pairs_lists: List[List[List[Dict[str, Any]]]] 201 | ) -> List[List[List[Dict[str, Any]]]]: 202 | # Answers all of the questions. Some of the (question, context) pairs may be duplicates, for instance because 203 | # of jackknifing. It will be a lot faster to deduplicate them first. 
204 | # 205 | # `qa_inputs` will contain the unique inputs, `context_to_input_index` maps from the (question, context) pair 206 | # to its index in `qa_inputs`, and `mapping` will map from the i-th summary, j-th reference, and k-th question 207 | # to the index of the corresponding data in `qa_inputs` 208 | qa_inputs = [] 209 | context_to_input_index = {} 210 | mapping = {} 211 | 212 | for i, (summary, qa_pairs_list) in enumerate(zip(summaries, qa_pairs_lists)): 213 | for j, qa_pairs in enumerate(qa_pairs_list): 214 | for k, qa in enumerate(qa_pairs): 215 | question = qa["question"] 216 | key = (question, summary) 217 | if key not in context_to_input_index: 218 | context_to_input_index[key] = len(qa_inputs) 219 | qa_inputs.append(key) 220 | mapping[(i, j, k)] = context_to_input_index[key] 221 | 222 | logger.info(f"Answering {len(qa_inputs)} distinct (question, context) pairs") 223 | predictions = self.question_answerer.answer_all(qa_inputs, return_offsets=True) 224 | logger.info("Finished answering questions") 225 | 226 | # Remap from the distinct answers back to the original QA lists 227 | predictions_lists = [] 228 | for i, (summary, qa_pairs_list) in enumerate(zip(summaries, qa_pairs_lists)): 229 | predictions_lists.append([]) 230 | for j, qa_pairs in enumerate(qa_pairs_list): 231 | predictions_lists[-1].append([]) 232 | for k, qa in enumerate(qa_pairs): 233 | index = mapping[(i, j, k)] 234 | prediction, probability, null_probability, offsets = predictions[ 235 | index 236 | ] 237 | predictions_lists[-1][-1].append( 238 | { 239 | "prediction_id": self._get_prediction_id(index), 240 | "prediction": prediction, 241 | "probability": probability, 242 | "null_probability": null_probability, 243 | "start": offsets[0], 244 | "end": offsets[1], 245 | } 246 | ) 247 | return predictions_lists 248 | 249 | def _score_predictions( 250 | self, 251 | summaries: List[str], 252 | qa_pairs_lists: List[List[List[Dict[str, Any]]]], 253 | predictions_lists: List[List[List[Dict[str, 
Any]]]], 254 | ) -> Tuple[List[MetricsDict], List[List[List[Dict[str, float]]]]]: 255 | logger.info("Scoring predictions") 256 | metrics_list = [] 257 | scores_list = [] 258 | 259 | generator = tqdm( 260 | zip(summaries, qa_pairs_lists, predictions_lists), 261 | total=len(summaries), 262 | disable=not self.verbose, 263 | ) 264 | for summary, qa_pairs_list, predictions_list in generator: 265 | # This is for 1 (summary, references) pair 266 | input_questions_list = [] 267 | input_answers_list = [] 268 | input_predictions_list = [] 269 | input_probabilities_list = [] 270 | input_null_probabilities_list = [] 271 | for qa_pairs, predictions in zip(qa_pairs_list, predictions_list): 272 | # This is the set of QA pairs for 1 reference 273 | input_questions_list.append([]) 274 | input_answers_list.append([]) 275 | input_predictions_list.append([]) 276 | input_probabilities_list.append([]) 277 | input_null_probabilities_list.append([]) 278 | for qa, prediction in zip(qa_pairs, predictions): 279 | input_questions_list[-1].append(qa["question"]) 280 | input_answers_list[-1].append(qa["answer"]) 281 | input_predictions_list[-1].append(prediction["prediction"]) 282 | input_probabilities_list[-1].append(prediction["probability"]) 283 | input_null_probabilities_list[-1].append( 284 | prediction["null_probability"] 285 | ) 286 | 287 | metrics, scores = self.scorer.score_multi_ref( 288 | summary, 289 | input_questions_list, 290 | input_answers_list, 291 | input_predictions_list, 292 | input_probabilities_list, 293 | input_null_probabilities_list, 294 | ) 295 | metrics = {"qa-eval": metrics} 296 | metrics_list.append(metrics) 297 | scores_list.append(scores) 298 | 299 | logger.info("Finished scoring predictions") 300 | return metrics_list, scores_list 301 | 302 | def _combine_outputs( 303 | self, 304 | metrics_list: List[MetricsDict], 305 | qa_pairs_lists: List[List[List[Dict[str, Any]]]], 306 | predictions_lists: List[List[List[Dict[str, Any]]]], 307 | scores_lists: 
List[List[List[Dict[str, float]]]], 308 | ) -> List[List[List[Dict[str, Any]]]]: 309 | # This method will combine the metrics and QA pair metadata together into a tuple so they can 310 | # both be returned together 311 | combined = [] 312 | for metrics, qa_pairs_list, predictions_list, scores_list in zip( 313 | metrics_list, qa_pairs_lists, predictions_lists, scores_lists 314 | ): 315 | # This is for 1 (summary, reference) pair 316 | combined.append((metrics, [])) 317 | for qa_pairs, predictions, scores in zip( 318 | qa_pairs_list, predictions_list, scores_list 319 | ): 320 | # This is for 1 reference 321 | combined[-1][1].append([]) 322 | for qa, prediction, score in zip(qa_pairs, predictions, scores): 323 | prediction = dict(**prediction) 324 | for key in self.scorer.keys(): 325 | prediction[key] = score[key] 326 | combined[-1][1][-1].append( 327 | {"question": qa, "prediction": prediction} 328 | ) 329 | return combined 330 | 331 | def _insert_empty_outputs( 332 | self, 333 | metrics_list: List[MetricsDict], 334 | is_empty_list: List[bool], 335 | include_qa_list: bool, 336 | ) -> List[Any]: 337 | full_metrics_list = [] 338 | index = 0 339 | for is_empty in is_empty_list: 340 | if is_empty: 341 | empty_metrics = {"qa-eval": self.scorer.default_scores()} 342 | if include_qa_list: 343 | full_metrics_list.append((empty_metrics, [])) 344 | else: 345 | full_metrics_list.append(empty_metrics) 346 | else: 347 | full_metrics_list.append(metrics_list[index]) 348 | index += 1 349 | return full_metrics_list 350 | 351 | def score_batch( 352 | self, 353 | summaries: List[SummaryType], 354 | references_list: List[List[SummaryType]], 355 | return_qa_pairs: bool = False, 356 | ) -> List[List[MetricsDict]]: 357 | summaries = self._flatten_summaries(summaries) 358 | references_list = self._flatten_references_list(references_list) 359 | 360 | # Remove any input summaries that are empty. 
They mess up the processing otherwise 361 | ( 362 | summaries, 363 | references_list, 364 | is_empty_list, 365 | ) = self._get_empty_summary_mask(summaries, references_list) 366 | 367 | qa_pairs_lists = self._generate_qa_pairs(references_list) 368 | predictions_lists = self._answer_questions(summaries, qa_pairs_lists) 369 | metrics_list, scores_lists = self._score_predictions( 370 | summaries, qa_pairs_lists, predictions_lists 371 | ) 372 | 373 | if return_qa_pairs: 374 | output = self._combine_outputs( 375 | metrics_list, qa_pairs_lists, predictions_lists, scores_lists 376 | ) 377 | else: 378 | output = metrics_list 379 | output = self._insert_empty_outputs(output, is_empty_list, return_qa_pairs) 380 | return output 381 | -------------------------------------------------------------------------------- /qaeval/scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/scoring/__init__.py -------------------------------------------------------------------------------- /qaeval/scoring/lerc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/scoring/lerc/__init__.py -------------------------------------------------------------------------------- /qaeval/scoring/lerc/lerc_dataset_reader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import numpy as np 4 | from overrides import overrides 5 | from transformers import BertTokenizer 6 | 7 | from allennlp.data.dataset_readers.dataset_reader import DatasetReader 8 | from allennlp.data.fields import ArrayField 9 | from allennlp.data.fields.metadata_field import MetadataField 10 | from allennlp.data.instance import Instance 11 | 12 | logger = 
@DatasetReader.register("lerc")
class LERCDatasetReader(DatasetReader):
    """
    Reads the MOCHA dataset and converts each (context, question, reference,
    candidate, score) instance into a BERT regression input for LERC.
    """

    def __init__(
        self,
        bert_model: str = 'bert-base-uncased',
        max_length: int = 512,
        holdout_sets: list = None,
        augment: bool = True,
        lazy: bool = False
    ) -> None:
        """
        Args:
            bert_model: name of the BERT tokenizer/model to use.
            max_length: maximum BERT input length; the context is truncated
                to respect it.
            holdout_sets: constituent MOCHA datasets to exclude from reading.
                May be a single name or a list of names.
            augment: if `True`, adds identity and flipped training instances.
            lazy: passed through to the AllenNLP `DatasetReader`.
        """
        super().__init__(lazy)
        self.max_length = max_length
        # Fix: the original default was a shared mutable `[]`; use `None` as
        # the sentinel so each reader gets its own list.
        if holdout_sets is None:
            holdout_sets = []
        self.holdout_sets = holdout_sets if isinstance(holdout_sets, list) else [holdout_sets]
        self.augment = augment
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)

    @overrides
    def _read(self, file_path: str):
        """Yields `Instance`s from a MOCHA JSON file, with optional augmentation."""
        lines = []
        mocha_dataset = json.load(open(file_path))

        # Check that if we specified datasets to hold out, that they are
        # indeed in the MOCHA dataset.
        for constituent_dataset in self.holdout_sets:
            assert constituent_dataset in mocha_dataset.keys()

        # Iterate through the constituent datasets, loading the MOCHA instances
        for constituent_dataset in mocha_dataset:
            seen_questions = set()
            if constituent_dataset in self.holdout_sets:
                continue

            for line in mocha_dataset[constituent_dataset].values():
                # Append the current instance
                lines.append({
                    'context': line['context'],
                    'question': line['question'],
                    'reference': line['reference'],
                    'candidate': line['candidate'],
                    'score': line['score'],
                })

                # Do a little data augmentation if the flag is set.
                if self.augment:
                    # Identity augmentation with the reference
                    # If this is the first time we have seen the question,
                    # create an identity instance.
                    if line['question'] not in seen_questions:
                        lines.append({
                            'context': line['context'],
                            'question': line['question'],
                            'reference': line['reference'],
                            'candidate': line['reference'],
                            'score': 5,
                        })
                        seen_questions.add(line['question'])

                # Augmentations via flipping reference and candidate
                # If the current line has a perfect score, flip the
                # reference and candidate
                if self.augment and line['score'] == 5:
                    lines.append({
                        'context': line['context'],
                        'question': line['question'],
                        'reference': line['candidate'],
                        'candidate': line['reference'],
                        'score': 5,
                    })

        # Create instances
        for line in lines:
            yield self.text_to_instance(**line)

    @overrides
    def text_to_instance(
        self, context, question, reference, candidate, score=None
    ) -> Instance:
        """
        Tokenizes one instance and packs it into BERT input fields.

        Args:
            context, question, reference, candidate: raw text fields.
            score: the gold judgment score; omitted at prediction time.
        """
        context_tokens = self.tokenizer.tokenize(context)
        question_tokens = self.tokenizer.tokenize(question)
        reference_tokens = self.tokenizer.tokenize(reference)
        candidate_tokens = self.tokenizer.tokenize(candidate)

        # Truncates the context if the BERT input would be too long
        context_tokens = self.truncate_context(
            context_tokens, question_tokens, reference_tokens, candidate_tokens
        )

        # Creates the BERT input (input IDs, segment IDs, and attention mask)
        input_ids, token_type_ids, attention_mask = self.create_input(
            context_tokens, question_tokens, reference_tokens, candidate_tokens
        )

        fields = {
            'input_ids': ArrayField(np.array(input_ids), dtype=np.int64,
                                    padding_value=self.tokenizer.pad_token_id),
            'token_type_ids': ArrayField(np.array(token_type_ids),
                                         dtype=np.int64),
            'attention_mask': ArrayField(np.array(attention_mask),
                                         dtype=np.int64),
            'metadata': MetadataField({
                'context': context,
                'context_tokens': context_tokens,
                'question': question,
                'question_tokens': question_tokens,
                'reference': reference,
                'reference_tokens': reference_tokens,
                'candidate': candidate,
                'candidate_tokens': candidate_tokens,
            })
        }
        # Fix: use an explicit `None` check so a legitimate score of 0 is
        # still attached (the original truthiness test dropped it).
        if score is not None:
            fields['score'] = ArrayField(np.array(score))

        return Instance(fields)

    def truncate_context(self, context, question, reference, candidate):
        """ Calculates if the current input would be over `self.max_length`
            and if so, truncates the context so that the input would be at
            `self.max_length`.
        """
        num_added_tokens = self.tokenizer.num_special_tokens_to_add(pair=True) + 2
        current_length = len(context) + len(question) + len(reference) + \
            len(candidate) + num_added_tokens

        if current_length > self.max_length:
            # `difference` is negative, so this drops tokens from the end
            difference = self.max_length - current_length
            context = context[:difference]

        return context

    def create_input(self, context, question, reference, candidate):
        """Builds the BERT input ids, segment ids, and attention mask."""
        # `input_tokens`: `[CLS] cont [SEP] ques [SEP] ref [SEP] cand [SEP]`
        cls = [self.tokenizer.cls_token]
        sep = [self.tokenizer.sep_token]
        input_tokens = cls + context + sep + question + sep + reference + sep + candidate + sep
        input_ids = self.tokenizer.convert_tokens_to_ids(input_tokens)

        # `token_type_ids`: is 0 for `[CLS] cont [SEP] ques [SEP]` and
        # 1 for `ref [SEP] cand [SEP]`
        token_type_ids = [0] * (len(context) + len(question) + 3) + \
            [1] * (len(reference) + len(candidate) + 2)

        # `attention_mask` is 1's for all positions which aren't padding
        attention_mask = [1] * len(input_ids)

        assert len(input_ids) == len(token_type_ids) == len(attention_mask)

        return input_ids, token_type_ids, attention_mask
@Model.register("lerc")
class LERC(Model):
    """
    BERT-based regression model that predicts a LERC quality score for a
    (context, question, reference, candidate) input from the [CLS] embedding.
    """

    @property
    def embedding_dim(self):
        # Width of the BERT word embeddings, used to size the score head
        return self.bert.embeddings.word_embeddings.embedding_dim

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Returns the current value of every tracked metric."""
        results = {}
        for name, metric in self.metrics.items():
            results[name] = metric.get_metric(reset)
        return results

    def __init__(
        self,
        bert_model: str = 'bert-base-uncased',
        pretrained_archive_path: str = None,
        vocab=Vocabulary(),
        initializer=InitializerApplicator()
    ) -> None:
        """
        Args:
            bert_model: HuggingFace model name used when no archive is given.
            pretrained_archive_path: optional AllenNLP archive whose BERT
                weights are reused instead of the stock checkpoint.
            vocab: AllenNLP vocabulary.
            initializer: applied to the model parameters after construction.
        """
        super().__init__(vocab)
        if pretrained_archive_path:
            logger.info('Loading pretrained: %s', pretrained_archive_path)
            self.bert = load_archive(pretrained_archive_path).model.bert
        else:
            self.bert = BertModel.from_pretrained(bert_model)

        # Single linear head mapping the [CLS] embedding to a scalar score
        self.score_layer = torch.nn.Linear(self.embedding_dim, 1)
        self.metrics = {'pearson': PearsonCorrelation()}
        self.loss = torch.nn.MSELoss()
        initializer(self)

    @overrides
    def forward(
        self,
        input_ids: torch.Tensor,
        token_type_ids: torch.Tensor,
        attention_mask: torch.Tensor = None,
        score: torch.Tensor = None,
        metadata: Dict = None
    ) -> Dict:
        """
        Runs BERT and predicts a score; when a gold `score` is given, also
        computes the MSE loss and updates the Pearson correlation metric.
        """
        sequence_output, _ = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )
        # The score is regressed from the [CLS] position only
        pred_score = self.score_layer(sequence_output[:, 0, :]).squeeze(-1)

        outputs = {'pred_score': pred_score, 'metadata': metadata}

        if score is not None:
            gold = score.float()
            self.metrics['pearson'](pred_score, gold)
            outputs['loss'] = self.loss(pred_score, gold)
            outputs['score'] = gold

        return outputs
@Model.register("pretrain-lerc")
class PretrainLERC(Model):
    """
    BERT-based 3-way classifier used to pretrain the encoder that is later
    reused by the LERC regression model.
    """

    @property
    def embedding_dim(self):
        # Width of the BERT word embeddings, used to size the label head
        return self.bert.embeddings.word_embeddings.embedding_dim

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Returns the current value of every tracked metric."""
        results = {}
        for name, metric in self.metrics.items():
            results[name] = metric.get_metric(reset)
        return results

    def __init__(
        self,
        bert_model: str = 'bert-base-uncased',
        vocab=Vocabulary(),
        initializer=InitializerApplicator()
    ) -> None:
        """
        Args:
            bert_model: HuggingFace model name for the encoder.
            vocab: AllenNLP vocabulary.
            initializer: applied to the model parameters after construction.
        """
        super().__init__(vocab)
        self.bert = BertModel.from_pretrained(bert_model)
        # Linear head mapping the [CLS] embedding to 3 class logits
        self.label_layer = torch.nn.Linear(self.embedding_dim, 3)
        self.metrics = {'accuracy': CategoricalAccuracy()}
        self.loss = torch.nn.CrossEntropyLoss()
        initializer(self)

    @overrides
    def forward(
        self,
        input_ids: torch.Tensor,
        token_type_ids: torch.Tensor,
        attention_mask: torch.Tensor = None,
        label: torch.Tensor = None,
        metadata: Dict = None
    ) -> Dict:
        """
        Classifies the input from the [CLS] embedding; when a gold `label` is
        given, also computes cross-entropy loss and updates accuracy.
        """
        # sequence_output: [batch_size, seq_len, embedding_dim]
        sequence_output, _ = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )

        # logits: [batch_size, 3], taken from the [CLS] position
        logits = self.label_layer(sequence_output[:, 0, :])

        outputs = {
            'logits': logits,
            'class_probabilties': torch.nn.functional.softmax(logits, dim=-1),
            'pred_label': torch.max(logits, dim=-1)[1],
            'metadata': metadata
        }

        if label is not None:
            gold = label.long()
            self.metrics['accuracy'](logits, gold)
            outputs['loss'] = self.loss(logits, gold)
            outputs['label'] = gold

        return outputs
class ExactMatchScorer(Scorer):
    """
    Scores each QA prediction with SQuAD exact match against the reference
    answer; unanswerable predictions (null more likely) score 0.
    """

    def keys(self) -> Set[str]:
        return {'em'}

    def _score_single_ref(
        self,
        context: str,
        questions: List[str],
        answers: List[str],
        predictions: List[str],
        probabilities: List[float],
        null_probabilities: List[float]
    ) -> List[Dict[str, float]]:
        """Returns one `{'em': ...}` dict per question."""
        def _em(answer, prediction, prob, null_prob):
            # A missing prediction or a more-likely null answer counts as 0
            if prediction is None or null_prob >= prob:
                return 0.0
            return compute_exact(answer, prediction)

        return [
            {'em': _em(answer, prediction, prob, null_prob)}
            for prediction, answer, prob, null_prob in zip(
                predictions, answers, probabilities, null_probabilities
            )
        ]
class IsAnsweredScorer(Scorer):
    """
    Scores each question 1.0 if the QA model judged it answerable (answer
    probability strictly above the null probability), otherwise 0.0.
    """

    def keys(self) -> Set[str]:
        return {'is_answered'}

    def _score_single_ref(
        self,
        context: str,
        questions: List[str],
        answers: List[str],
        predictions: List[str],
        probabilities: List[float],
        null_probabilities: List[float]
    ) -> List[Dict[str, float]]:
        """Returns one `{'is_answered': ...}` dict per question."""
        return [
            {'is_answered': 1.0 if prob > null_prob else 0.0}
            for prob, null_prob in zip(probabilities, null_probabilities)
        ]
class LERCScorer(Scorer):
    """
    Scores QA predictions with a trained LERC model. Questions the QA model
    judged unanswerable keep a score of 0.0; the rest receive the model's
    learned quality score.
    """

    def __init__(self, model_path: str, pretrained_path: str, cuda_device: int, batch_size: int = 8) -> None:
        """
        Args:
            model_path: path to the trained LERC archive.
            pretrained_path: path to the pretrained archive injected into the
                model config via `overrides`.
            cuda_device: device id for the predictor.
            batch_size: mini-batch size used during scoring.
        """
        # Local import: only this optional scorer needs json
        import json

        # Fix: build the overrides blob with json.dumps so that quotes or
        # backslashes in `pretrained_path` are escaped correctly; the previous
        # naive string concatenation produced invalid JSON for such paths.
        overrides = json.dumps({'model.pretrained_archive_path': pretrained_path})
        archive = load_archive(model_path, cuda_device=cuda_device, overrides=overrides)
        self.predictor = LERCPredictor.from_archive(archive, predictor_name='lerc')
        self.batch_size = batch_size

    def keys(self) -> Set[str]:
        return {'lerc'}

    def _score_single_ref(
        self,
        context: str,
        questions: List[str],
        answers: List[str],
        predictions: List[str],
        probabilities: List[float],
        null_probabilities: List[float]
    ) -> List[Dict[str, float]]:
        """Returns one `{'lerc': ...}` dict per question."""
        # Only send answerable questions to the (expensive) LERC model;
        # `indices` remembers where each scored question belongs.
        input_dicts = []
        indices = []
        for i, (answer, question, prediction, probability, null_probability) in enumerate(
            zip(answers, questions, predictions, probabilities, null_probabilities)
        ):
            if probability > null_probability:
                input_dicts.append({
                    'context': context,
                    'question': question,
                    'reference': answer,
                    'candidate': prediction
                })
                indices.append(i)

        # Run the predictor in mini-batches
        output_dicts = []
        for i in range(0, len(input_dicts), self.batch_size):
            batch = input_dicts[i:i + self.batch_size]
            output_dicts.extend(self.predictor.predict_batch_json(batch))
        assert len(output_dicts) == len(input_dicts)

        # Unanswerable questions keep the default 0.0 score
        scores = [0.0] * len(questions)
        for i, output_dict in zip(indices, output_dicts):
            scores[i] = output_dict['pred_score']
        return [{'lerc': s} for s in scores]
class MetaScorer(Scorer):
    """
    Runs several scorers over the same inputs and merges their per-question
    score dicts into one dict per question.
    """

    def __init__(self, scorers: List['Scorer']) -> None:
        """
        Args:
            scorers: the scorers whose outputs should be combined.
        """
        self.scorers = scorers

    def _merge_dicts(self, dicts: List[Dict[str, float]]) -> Dict[str, float]:
        """Merges the dicts left-to-right into a single dict."""
        merged = {}
        for other in dicts:
            merged.update(other)
        return merged

    def keys(self) -> Set[str]:
        """The union of every wrapped scorer's keys."""
        all_keys = set()
        for scorer in self.scorers:
            all_keys.update(scorer.keys())
        return all_keys

    def _score_single_ref(
        self,
        context: str,
        questions: List[str],
        answers: List[str],
        predictions: List[str],
        probabilities: List[float],
        null_probabilities: List[float]
    ) -> List[Dict[str, float]]:
        """Returns one merged score dict per question."""
        # per_scorer[s][q] is scorer s's score dict for question q
        per_scorer = [
            scorer.score_single_ref(
                context,
                questions,
                answers,
                predictions,
                probabilities,
                null_probabilities
            )[1]
            for scorer in self.scorers
        ]

        return [
            self._merge_dicts([scores[i] for scores in per_scorer])
            for i in range(len(questions))
        ]
float]]: 40 | raise NotImplementedError 41 | 42 | def score_multi_ref( 43 | self, 44 | context: str, 45 | questions_list: List[List[str]], 46 | answers_list: List[List[str]], 47 | predictions_list: List[List[str]], 48 | probabilities_list: List[List[float]], 49 | null_probabilities_list: List[List[float]] 50 | ) -> Tuple[Dict[str, float], List[List[Dict[str, float]]]]: 51 | # The aggregated per-reference scores 52 | reference_scores_list = [] 53 | # The scores for each individual question. [i][j] will be the scores from 54 | # reference i and question j 55 | question_scores_list = [] 56 | 57 | for i in range(len(questions_list)): 58 | reference_scores, question_scores = self.score_single_ref( 59 | context, 60 | questions_list[i], 61 | answers_list[i], 62 | predictions_list[i], 63 | probabilities_list[i], 64 | null_probabilities_list[i] 65 | ) 66 | reference_scores_list.append(reference_scores) 67 | question_scores_list.append(question_scores) 68 | 69 | instance_scores = self.aggregate_scores(reference_scores_list) 70 | return instance_scores, question_scores_list 71 | 72 | def _ensure_expected_keys(self, expected_keys: Set[str], scores_dicts: List[Dict[str, float]]) -> None: 73 | for scores in scores_dicts: 74 | if expected_keys != scores.keys(): 75 | raise Exception(f'Unequal keys: {expected_keys}; {scores.keys()}') 76 | 77 | def aggregate_scores(self, scores_dicts: List[Dict[str, float]]) -> Dict[str, float]: 78 | if len(scores_dicts) == 0: 79 | return self.default_scores() 80 | 81 | expected_keys = self.keys() 82 | self._ensure_expected_keys(expected_keys, scores_dicts) 83 | sums = {key: 0.0 for key in expected_keys} 84 | for scores in scores_dicts: 85 | for key in expected_keys: 86 | sums[key] += scores[key] 87 | 88 | averages = {key: sums[key] / len(scores_dicts) for key in expected_keys} 89 | return averages -------------------------------------------------------------------------------- /qaeval/tests/__init__.py: 
import unittest

from qaeval.answer_selection import AnswerSelector, NP_CHUNKS_STRATEGY, \
    MAX_NP_STRATEGY, NER_STRATEGY, ALL_STRATEGY, STRATEGIES, AnswerOffsets

# Two-sentence fixture shared by every selection test below. All expected
# offsets are character positions into this exact string.
_TEXT = 'Several churches in Baghdad have been attacked. More attacks have been in Mosul.'


class TestAnswerSelector(unittest.TestCase):
    def _select(self, strategy):
        # Build a selector for `strategy` and run it over the shared fixture.
        return AnswerSelector(strategy).select(_TEXT)

    def test_constructor(self):
        """Every registered strategy constructs; an unknown name raises."""
        for strategy in STRATEGIES:
            AnswerSelector(strategy)
        with self.assertRaises(Exception):
            AnswerSelector('missing')

    def test_np_chunks(self):
        """NP-chunk strategy selects each noun phrase."""
        expected = [
            AnswerOffsets(0, 16, 0, 47, 'Several churches'),
            AnswerOffsets(20, 27, 0, 47, 'Baghdad'),
            AnswerOffsets(48, 60, 48, 80, 'More attacks'),
            AnswerOffsets(74, 79, 48, 80, 'Mosul'),
        ]
        assert self._select(NP_CHUNKS_STRATEGY) == expected

    def test_max_np(self):
        """Maximal-NP strategy keeps the widest noun phrase per span."""
        expected = [
            AnswerOffsets(0, 27, 0, 47, 'Several churches in Baghdad'),
            AnswerOffsets(48, 60, 48, 80, 'More attacks'),
            AnswerOffsets(74, 79, 48, 80, 'Mosul'),
        ]
        assert self._select(MAX_NP_STRATEGY) == expected

    def test_ner(self):
        """NER strategy selects only the named entities."""
        expected = [
            AnswerOffsets(20, 27, 0, 47, 'Baghdad'),
            AnswerOffsets(74, 79, 48, 80, 'Mosul'),
        ]
        assert self._select(NER_STRATEGY) == expected

    def test_all(self):
        """ALL strategy combines the other strategies' selections."""
        expected = [
            AnswerOffsets(0, 16, 0, 47, 'Several churches'),
            AnswerOffsets(0, 27, 0, 47, 'Several churches in Baghdad'),
            AnswerOffsets(20, 27, 0, 47, 'Baghdad'),
            AnswerOffsets(48, 60, 48, 80, 'More attacks'),
            AnswerOffsets(74, 79, 48, 80, 'Mosul'),
        ]
        assert self._select(ALL_STRATEGY) == expected
# Methods of TestQuestionAnsweringModel (qaeval/tests/answering/model_test.py).
# The class header, setUpClass (which loads the QA model into cls.model), and
# test_answering sit on earlier dump lines.

# Shared fixture: each method below asks the same question about the same
# news passage.
_QUESTION = 'Who does the A380 super - jumbo passenger jet surpass and break their monopoly?'
_CONTEXT = "The superjumbo Airbus A380 , the world 's largest commercial airliner , took off Wednesday into cloudy skies over southwestern France for its second test flight . The European aircraft maker , based in the French city of Toulouse , said the second flight -- which came exactly a week after the A380 's highly anticipated maiden voyage -- would last about four hours . As opposed to the international media hype that surrounded last week 's flight , with hundreds of journalists on site to capture the historic moment , Airbus chose to conduct Wednesday 's test more discreetly ."


def test_answering_with_offsets(self):
    """With `return_offsets=True` the answer's (start, end) offsets are returned."""
    answer, probability, null_probability, offsets = self.model.answer(
        _QUESTION, _CONTEXT, return_offsets=True
    )

    assert answer == 'the world \'s largest'
    assert probability == pytest.approx(0.00428164186632745, abs=1e-5)
    assert null_probability == pytest.approx(0.9895479613676263, abs=1e-5)
    assert offsets == (29, 49)


def test_answering_with_fixing_offsets(self):
    """Offsets are realigned to the exact answer span unless fixing is disabled."""
    question = 'What is my name?'
    context = 'My name is Dan!'

    # With fixing disabled, the raw model offsets overshoot the answer.
    answer, probability, null_probability, offsets = self.model.answer(
        question, context, return_offsets=True, try_fixing_offsets=False
    )
    start, end = offsets
    assert answer == 'Dan'
    assert context[start:end] == 'Dan!'

    # `try_fixing_offsets=True` by default, so the span is trimmed to the answer.
    answer, probability, null_probability, offsets = self.model.answer(
        question, context, return_offsets=True
    )
    start, end = offsets
    assert answer == 'Dan'
    assert context[start:end] == 'Dan'


def test_return_dict(self):
    """`return_dict=True` packs the same values into a dictionary."""
    result = self.model.answer(
        _QUESTION, _CONTEXT, return_offsets=True, return_dict=True
    )

    assert result['prediction'] == 'the world \'s largest'
    assert result['probability'] == pytest.approx(0.00428164186632745, abs=1e-5)
    assert result['null_probability'] == pytest.approx(0.9895479613676263, abs=1e-5)
    assert result['start'] == 29
    assert result['end'] == 49
# Method of TestUtils (qaeval/tests/answering/utils_test.py); the rest of the
# class lies on earlier dump lines.
def test_fix_answer_span_unicode(self):
    """Offsets are realigned correctly when the document span contains
    non-ASCII characters (here, an ellipsis)."""
    candidate = 'track and field, swimming, diving'
    span = 'that…track and field, swimming, diving,'
    start, end = fix_answer_span(candidate, span, 0, 40)
    assert (start, end) == (5, 39)
In an event titled Romanian Artists in Support of Asia, organised by The Reporter Foundation of Romania, artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts for the tsunami victims.", "Saturday, March 26, 2005 Up to four times as many women as men died in the December 26 Indian Ocean Tsunami, figures published by Oxfam International today reveal.", "Without the new law, contributors would have waited until 2006 and their 2005 tax returns to be able to write off their charitable donations. The law is intended to promote donating towards the tsunami relief effort.", "While complaints about the 'miserly' generosity of the Bush Administration have surfaced in recent days, donations and actions at the grassroots level have quietly illustrated the concern and sympathy felt by ordinary Americans."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Dec. 2004 Sumatra earthquake was the longest-lasting ever recorded and large enough to vibrate the whole planet!", "A week after tsunami waves scoured the coasts of multiple countries in southern Asia, the UN says the final death toll could climb beyond 150,000 and may never be known, while it is estimated that 5 million people lack food, water or basic sanitation necessary for survival.", "It should also be noted that women killed outnumber men.", "The world's wealthiest nations have begun pouring funding into the Earthquake/Tsunami damaged region.", "Japanese Prime Minister announced a half-billion dollar donation, China has promised $60.5 million, while Norway increased its funding donation to $180 million.", "The Bush administration has pledged $350 million.", "Moreover, Indiana University's Center on Philanthropy is estimating approximately $322 million in goods and cash have been donated by private U.S. 
citizens and corporations.", "The Romanian Government pledged 150,000 euro, while the Romanian public raised 395,000 euro in a telethon for the tsunami victims.", "In addition, Romanian artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts.", "Despite the encouraging promises, the UN warns that logistics of securing the funds, purchasing supplies and shipping them to stricken regions will take time.", "Meanwhile, deaths due to dehydration, disease and starvation will continue to climb.", "In this entire situation, the municipal council of Saskatchewan town in Canada says it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief and now asks for its money back!"]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["A week after the earthquake in Southern Asia, the confirmed death count is over 120.000.", "Indonesia's Aceh province alone counts 80.000 deaths.", "Sri Lanka announced approximately 28.500, India more than 7.700 and as many as 10.500 foreign citizens are reported missing.", "As the count continues, a gender imbalance was noticed in deaths by the tsunami, where four times more women than men were killed.", "This phenomenon is due to the greater physical strength of men and different occupations at the time of the disaster.", "Because of this phenomenon, many incidents of violence against women have been reported, and it is estimated that they will continue for the time being, therefore the people in charge seek measures to address the problem.", "The earthquake in Southern Asia was the longest in duration earthquake ever recorded.", "It lasted between 500 and 600 seconds and released an amount of energy that equals a 100 gigaton bomb, while the gash that was created in the bottom of the sea was measured nearly 800 miles deep.", "The U.S.A. has committed to offer 350 million dollars to the affected areas, while the U.S. 
government signed into law on direct tax relief to individuals who have contributed with donations to disaster relief.", "The amount of donations comes up to 322 million dollars.", "Japan also offers 500 million dollars.", "The Romanians raised 395,000 euros in a telethon, while on the other hand a town in Canada demands refund for 10.000 dollars by the Red Cross due to mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["U.S. citizens donating in 2005 to help tsunami victims wrote off their donations on their 2004 tax returns.", "In order to help the earthquake's and tsunami's victims, the world's wealthiest nations poured funding into the damaged region.", "However, the UN Office for the Coordination of Humanitarian Affairs in Indonesia chief, Michael Elmquist, warned that logistics of securing the funds, purchasing supplies and shipping them to stricken regions is a long lasting procedure.", "Meanwhile, the death toll is increased due to dehydration, disease, and starvation.", "The offered assistance by the U.S. 
government was considered limited and prompted complaints.", "At the same time, the ordinary Americans' donations were touching.", "The scientists determined that the Sumatra's earthquake was the largest ever recorded.", "People from around the world raised money for the victims as the Romanians who organised telemarathon and other charitable events.", "On the other hand, a small Canadian village calls back from the Red Cross the $ 10,000 which had donated as the election day for approving the donation some council members were absent.", "The Red Cross has already deposited the cheque, but has indicated it will return the money.", "Only in Indonesia, the death toll reaches 80000.", "In southern Asia about 1 million people are homeless and humanitarian agencies estimate that 5 million people need relief.", "The World Health Organization makes hard efforts to improve the conditions.", "The vast majority of the victims were women.", "This fact is justified to some extent but requires protection for the female sex."]}]} 2 | {"instance_id": "M000", "summarizer_id": "2", "summarizer_type": "peer", "summary": {"summarizer_id": "2", "summarizer_type": "peer", "text": ["\"Confirmed death toll passed 121,000, and 5 million people were homeless.\"", "United Nations--\"UN Emergency Relief Coordinator and Under-Secretary-General for Humanitarian Affairs Jan Egeland said the final death toll could climb beyond 150,000.\"", "January 16, the Romanian public raised the equivalent of 395,000 euro in a telethon, while the Romanian Government pledged 150,000 euro for the relief effort.", "In an event titled Romanian Artists in Support of Asia, organised by The Reporter Foundation of Romania, artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts for the tsunami victims.", "Several prominent Romanian artists and celebrities will gather at the Radio Hall in Bucharest on Sunday, January 23, to raise money for the victims 
of the catastrophic Indian Ocean tsunami of December 26, 2004.", "Near the epicenter of the earthquakes and tsunami, Indonesia's Aceh province alone may have as many as 80,000 death.", "In Sri Lanka, more than 1,600 kilometres from the epicenter, the ground moved nearly 10 centimetres.", "Meadow Lake, Saskatchewan--The municipal council of a small town in Canada's Prairies has said it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief.", "Two Romanian tourists have been declared missing in the tsunami.", "After the U.S. increased it's funding donation to 350 million USD, Japanese Prime Minister Junichiro Koizumi announced a half-billion dollar donation on Saturday, Jan. 1.", "An AP/ISOS poll has found three in ten U.S. citizens have donated to Tsunami Aid organizations."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Dec. 2004 Sumatra earthquake was the longest-lasting ever recorded and large enough to vibrate the whole planet!", "A week after tsunami waves scoured the coasts of multiple countries in southern Asia, the UN says the final death toll could climb beyond 150,000 and may never be known, while it is estimated that 5 million people lack food, water or basic sanitation necessary for survival.", "It should also be noted that women killed outnumber men.", "The world's wealthiest nations have begun pouring funding into the Earthquake/Tsunami damaged region.", "Japanese Prime Minister announced a half-billion dollar donation, China has promised $60.5 million, while Norway increased its funding donation to $180 million.", "The Bush administration has pledged $350 million.", "Moreover, Indiana University's Center on Philanthropy is estimating approximately $322 million in goods and cash have been donated by private U.S. 
citizens and corporations.", "The Romanian Government pledged 150,000 euro, while the Romanian public raised 395,000 euro in a telethon for the tsunami victims.", "In addition, Romanian artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts.", "Despite the encouraging promises, the UN warns that logistics of securing the funds, purchasing supplies and shipping them to stricken regions will take time.", "Meanwhile, deaths due to dehydration, disease and starvation will continue to climb.", "In this entire situation, the municipal council of Saskatchewan town in Canada says it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief and now asks for its money back!"]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["A week after the earthquake in Southern Asia, the confirmed death count is over 120.000.", "Indonesia's Aceh province alone counts 80.000 deaths.", "Sri Lanka announced approximately 28.500, India more than 7.700 and as many as 10.500 foreign citizens are reported missing.", "As the count continues, a gender imbalance was noticed in deaths by the tsunami, where four times more women than men were killed.", "This phenomenon is due to the greater physical strength of men and different occupations at the time of the disaster.", "Because of this phenomenon, many incidents of violence against women have been reported, and it is estimated that they will continue for the time being, therefore the people in charge seek measures to address the problem.", "The earthquake in Southern Asia was the longest in duration earthquake ever recorded.", "It lasted between 500 and 600 seconds and released an amount of energy that equals a 100 gigaton bomb, while the gash that was created in the bottom of the sea was measured nearly 800 miles deep.", "The U.S.A. has committed to offer 350 million dollars to the affected areas, while the U.S. 
government signed into law on direct tax relief to individuals who have contributed with donations to disaster relief.", "The amount of donations comes up to 322 million dollars.", "Japan also offers 500 million dollars.", "The Romanians raised 395,000 euros in a telethon, while on the other hand a town in Canada demands refund for 10.000 dollars by the Red Cross due to mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["U.S. citizens donating in 2005 to help tsunami victims wrote off their donations on their 2004 tax returns.", "In order to help the earthquake's and tsunami's victims, the world's wealthiest nations poured funding into the damaged region.", "However, the UN Office for the Coordination of Humanitarian Affairs in Indonesia chief, Michael Elmquist, warned that logistics of securing the funds, purchasing supplies and shipping them to stricken regions is a long lasting procedure.", "Meanwhile, the death toll is increased due to dehydration, disease, and starvation.", "The offered assistance by the U.S. 
government was considered limited and prompted complaints.", "At the same time, the ordinary Americans' donations were touching.", "The scientists determined that the Sumatra's earthquake was the largest ever recorded.", "People from around the world raised money for the victims as the Romanians who organised telemarathon and other charitable events.", "On the other hand, a small Canadian village calls back from the Red Cross the $ 10,000 which had donated as the election day for approving the donation some council members were absent.", "The Red Cross has already deposited the cheque, but has indicated it will return the money.", "Only in Indonesia, the death toll reaches 80000.", "In southern Asia about 1 million people are homeless and humanitarian agencies estimate that 5 million people need relief.", "The World Health Organization makes hard efforts to improve the conditions.", "The vast majority of the victims were women.", "This fact is justified to some extent but requires protection for the female sex."]}]} 3 | {"instance_id": "M000", "summarizer_id": "A", "summarizer_type": "reference", "summary": {"summarizer_id": "A", "summarizer_type": "reference", "text": ["Dec. 
2004 Sumatra earthquake was the longest-lasting ever recorded and large enough to vibrate the whole planet!", "A week after tsunami waves scoured the coasts of multiple countries in southern Asia, the UN says the final death toll could climb beyond 150,000 and may never be known, while it is estimated that 5 million people lack food, water or basic sanitation necessary for survival.", "It should also be noted that women killed outnumber men.", "The world's wealthiest nations have begun pouring funding into the Earthquake/Tsunami damaged region.", "Japanese Prime Minister announced a half-billion dollar donation, China has promised $60.5 million, while Norway increased its funding donation to $180 million.", "The Bush administration has pledged $350 million.", "Moreover, Indiana University's Center on Philanthropy is estimating approximately $322 million in goods and cash have been donated by private U.S. citizens and corporations.", "The Romanian Government pledged 150,000 euro, while the Romanian public raised 395,000 euro in a telethon for the tsunami victims.", "In addition, Romanian artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts.", "Despite the encouraging promises, the UN warns that logistics of securing the funds, purchasing supplies and shipping them to stricken regions will take time.", "Meanwhile, deaths due to dehydration, disease and starvation will continue to climb.", "In this entire situation, the municipal council of Saskatchewan town in Canada says it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief and now asks for its money back!"]}, "references": [{"summarizer_id": "B", "summarizer_type": "reference", "text": ["A week after the earthquake in Southern Asia, the confirmed death count is over 120.000.", "Indonesia's Aceh province alone counts 80.000 deaths.", "Sri Lanka announced approximately 28.500, India more than 7.700 and as many as 10.500 foreign 
citizens are reported missing.", "As the count continues, a gender imbalance was noticed in deaths by the tsunami, where four times more women than men were killed.", "This phenomenon is due to the greater physical strength of men and different occupations at the time of the disaster.", "Because of this phenomenon, many incidents of violence against women have been reported, and it is estimated that they will continue for the time being, therefore the people in charge seek measures to address the problem.", "The earthquake in Southern Asia was the longest in duration earthquake ever recorded.", "It lasted between 500 and 600 seconds and released an amount of energy that equals a 100 gigaton bomb, while the gash that was created in the bottom of the sea was measured nearly 800 miles deep.", "The U.S.A. has committed to offer 350 million dollars to the affected areas, while the U.S. government signed into law on direct tax relief to individuals who have contributed with donations to disaster relief.", "The amount of donations comes up to 322 million dollars.", "Japan also offers 500 million dollars.", "The Romanians raised 395,000 euros in a telethon, while on the other hand a town in Canada demands refund for 10.000 dollars by the Red Cross due to mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["U.S. citizens donating in 2005 to help tsunami victims wrote off their donations on their 2004 tax returns.", "In order to help the earthquake's and tsunami's victims, the world's wealthiest nations poured funding into the damaged region.", "However, the UN Office for the Coordination of Humanitarian Affairs in Indonesia chief, Michael Elmquist, warned that logistics of securing the funds, purchasing supplies and shipping them to stricken regions is a long lasting procedure.", "Meanwhile, the death toll is increased due to dehydration, disease, and starvation.", "The offered assistance by the U.S. 
government was considered limited and prompted complaints.", "At the same time, the ordinary Americans' donations were touching.", "The scientists determined that the Sumatra's earthquake was the largest ever recorded.", "People from around the world raised money for the victims as the Romanians who organised telemarathon and other charitable events.", "On the other hand, a small Canadian village calls back from the Red Cross the $ 10,000 which had donated as the election day for approving the donation some council members were absent.", "The Red Cross has already deposited the cheque, but has indicated it will return the money.", "Only in Indonesia, the death toll reaches 80000.", "In southern Asia about 1 million people are homeless and humanitarian agencies estimate that 5 million people need relief.", "The World Health Organization makes hard efforts to improve the conditions.", "The vast majority of the victims were women.", "This fact is justified to some extent but requires protection for the female sex."]}]} 4 | {"instance_id": "M000", "summarizer_id": "B", "summarizer_type": "reference", "summary": {"summarizer_id": "B", "summarizer_type": "reference", "text": ["A week after the earthquake in Southern Asia, the confirmed death count is over 120.000.", "Indonesia's Aceh province alone counts 80.000 deaths.", "Sri Lanka announced approximately 28.500, India more than 7.700 and as many as 10.500 foreign citizens are reported missing.", "As the count continues, a gender imbalance was noticed in deaths by the tsunami, where four times more women than men were killed.", "This phenomenon is due to the greater physical strength of men and different occupations at the time of the disaster.", "Because of this phenomenon, many incidents of violence against women have been reported, and it is estimated that they will continue for the time being, therefore the people in charge seek measures to address the problem.", "The earthquake in Southern Asia was the longest in 
duration earthquake ever recorded.", "It lasted between 500 and 600 seconds and released an amount of energy that equals a 100 gigaton bomb, while the gash that was created in the bottom of the sea was measured nearly 800 miles deep.", "The U.S.A. has committed to offer 350 million dollars to the affected areas, while the U.S. government signed into law on direct tax relief to individuals who have contributed with donations to disaster relief.", "The amount of donations comes up to 322 million dollars.", "Japan also offers 500 million dollars.", "The Romanians raised 395,000 euros in a telethon, while on the other hand a town in Canada demands refund for 10.000 dollars by the Red Cross due to mistake."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Dec. 2004 Sumatra earthquake was the longest-lasting ever recorded and large enough to vibrate the whole planet!", "A week after tsunami waves scoured the coasts of multiple countries in southern Asia, the UN says the final death toll could climb beyond 150,000 and may never be known, while it is estimated that 5 million people lack food, water or basic sanitation necessary for survival.", "It should also be noted that women killed outnumber men.", "The world's wealthiest nations have begun pouring funding into the Earthquake/Tsunami damaged region.", "Japanese Prime Minister announced a half-billion dollar donation, China has promised $60.5 million, while Norway increased its funding donation to $180 million.", "The Bush administration has pledged $350 million.", "Moreover, Indiana University's Center on Philanthropy is estimating approximately $322 million in goods and cash have been donated by private U.S. 
citizens and corporations.", "The Romanian Government pledged 150,000 euro, while the Romanian public raised 395,000 euro in a telethon for the tsunami victims.", "In addition, Romanian artists will auction off their works, as well as personal objects, with all proceeds being donated to the relief efforts.", "Despite the encouraging promises, the UN warns that logistics of securing the funds, purchasing supplies and shipping them to stricken regions will take time.", "Meanwhile, deaths due to dehydration, disease and starvation will continue to climb.", "In this entire situation, the municipal council of Saskatchewan town in Canada says it \"accidentally\" donated $10,000 to the Red Cross for tsunami relief and now asks for its money back!"]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["U.S. citizens donating in 2005 to help tsunami victims wrote off their donations on their 2004 tax returns.", "In order to help the earthquake's and tsunami's victims, the world's wealthiest nations poured funding into the damaged region.", "However, the UN Office for the Coordination of Humanitarian Affairs in Indonesia chief, Michael Elmquist, warned that logistics of securing the funds, purchasing supplies and shipping them to stricken regions is a long lasting procedure.", "Meanwhile, the death toll is increased due to dehydration, disease, and starvation.", "The offered assistance by the U.S. 
government was considered limited and prompted complaints.", "At the same time, the ordinary Americans' donations were touching.", "The scientists determined that the Sumatra's earthquake was the largest ever recorded.", "People from around the world raised money for the victims as the Romanians who organised telemarathon and other charitable events.", "On the other hand, a small Canadian village calls back from the Red Cross the $ 10,000 which had donated as the election day for approving the donation some council members were absent.", "The Red Cross has already deposited the cheque, but has indicated it will return the money.", "Only in Indonesia, the death toll reaches 80000.", "In southern Asia about 1 million people are homeless and humanitarian agencies estimate that 5 million people need relief.", "The World Health Organization makes hard efforts to improve the conditions.", "The vast majority of the victims were women.", "This fact is justified to some extent but requires protection for the female sex."]}]} 5 | {"instance_id": "M001", "summarizer_id": "1", "summarizer_type": "peer", "summary": {"summarizer_id": "1", "summarizer_type": "peer", "text": ["Survivors of the London Bombings have urged the British public to write to their MPs, and set up an online petition calling", "As part of the formal investigation into the attacks, detectives studied thousands of hours of CCTV footage. The images show three of the bombers entering Luton station, before travelling to King's Cross station where they are also pictured.", "The July 7 bombings were a series of coordinated bombings which struck London on the morning of July 7, 2005. 52 people died and approximately 700 were injured as a result of the bombings.", "West Yorkshire Police searched six houses in Leeds today in connection with the London bombings. Houses were searched in the Burley, Beeston and Holbeck areas. Further properties were searched in Dewsbury, about eight miles from the city centre. 
The raids began at 6:30am BST this morning after warrants were issued under the Terrorism Act 2000. Police are still in attendance at one property, and are describing the searches as significant, and \"intelligence led\".", "A Canadian man, Momin Khawaja, was arrested in 2004 and has been held in a Canadian jail ever since. He is accused of being a co-conspirator with five British plotters for allegedly developing bomb detonators. Khawaja is considered an un-indicted co-conspirator with relation to the British case. Khawaja is the first person to be held under the 2001 Anti-Terrorism Act under Canada's Criminal Code, put in place by Canada's then Liberal government. Khawaja has been charged with seven offences under the new laws."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Tributes paid to the victims of the July 7 2005 London bombings Four years after the 7/7 London bombings, when 52 people were killed and about 700 were injured by a series of suicide bombings on transport networks, a 1.4 tonne stainless steel plaque with the names of the victims has been unveiled in Hyde Park.", "52 stainless steel columns standing 3.5m tall were also inaugurated.", "All four suicide bombers were UK residents.", "Al Qaeda has claimed responsibility for the attacks, but police are unsure of its exact role in the attack.", "Survivors of the bombings launched a campaign for public inquiry, which was rejected by the British government as too expensive and time-consuming.", "The government's attitude infuriated survivors and relatives of the dead.", "According to Scotland Yard, the suicide bombers, whose practice shows 'terrorist methodology', had rehearsed their plan nine days earlier.", "Police were able to trace their movements after recovering tickets and receipts from houses connected to the bombers which indicated the route of their trip.", "Three years after the attacks, footage of the bombers, taken by a CCTV camera nine days before the 
bombings, has been shown in court as part of the trials of three suspects alleged to be involved in the preparation of the bombings.", "Meanwhile, five men, all British nationals, were found guilty for their part in an unsuccessful plot to carry out fertiliser bombings in the UK.", "As it was revealed, some of them had met with two of the London subway bombers."]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["After the London bombings in July 2005, the British Police searched six houses in Leeds.", "It was confirmed that the explosives used, came from the building region.", "At the same time, they announced that all bombers were British citizens and that the surveillance cameras taped them on the day of the bombings near the attacked areas.", "In order to trace the suspects, two pictures of Hassib Hussain were published and a line was established, so that eye witnesses could testify information related to him.", "During September 2005, Scotland Yard revealed that the bombers had rehearsed the attacks nine days earlier, according to statements and CCTV tapes.", "Also, during the same period, the suspect for the attacks Hussain Osman was extradited to the UK by the Italian authorities.", "On December 2005, the survivors of the attacks participated in the promotion campaign for public inquiry into the terrorist's attacks, trying to push the government to give up the solution of \u0093narration of events by eye witnesses\u0094.", "A year after the attacks, Londoners honored the victims.", "Officials laid flowers at the location of the explosions and the prime minister made statements.", "On May 2007 five suspects were found guilty for their part in an unsuccessful plot to carry out bombings in the UK.", "While a year after, in the trial of the suspects for the attacks of 2005, the jury was shown footage of their moves.", "Finally, two years after the attacks, 52 stainless steel columns were placed in memory of the victims."]}, {"summarizer_id": "C", 
"summarizer_type": "reference", "text": ["London Metropolitan Police searched six houses in Leeds in order to find elements about the terrorist attacks on july 7.", "The raids were dangerous and long lasting.", "The suspects of the bombings in London are all British nationals.", "Four of them alleged suicide bombers.", "The Metropolitan Police released two photographs of one of the London bombers and made a request for any additional information about his whearabouts in order to succeed in arresting him.", "Another bomb suspect, Hussain Osman, who was accused of planting the failed bomb at the Underground station, arrived in London and had been arrested following his extradition from Italy.", "Survivors of the London Bombings set up an online petition calling for an independent Public Inquiry into the attacks.", "The British government rejected calls for a Public Inquiry.", "But survivors argue that a comprehensive investigation could teach valuable lessons which may help reduce the likelihood of future attacks, and improve the response capabilities of the emergency services.", "One year after the terrorist attacks, the Londoners honor the memory of victims.", "Simultaneously, a video links the attacks with Al-Qaeda.", "In 2007, five men were found guilty of plotting to cause an explosion and have been jailed with life sentences.", "Some of the fertiliser bomb conspirators had met with two of the London subway bombers.", "A court in the United Kingdom has been shown footage of the bombers that attacked London.", "Four years later, London remembers the vistims of terrorism and the police goes on investigating the case, thoroughly."]}]} 6 | {"instance_id": "M001", "summarizer_id": "2", "summarizer_type": "peer", "summary": {"summarizer_id": "2", "summarizer_type": "peer", "text": ["Profiles of the suspects in the July 7, 2005 bombings in London have been released.", "The July 7 bombings were a series of coordinated bombings which struck London on the morning of July 
7, 2005. 52 people died and approximately 700 were injured as a result of the bombings.", "The Duchess of Cornwall left a floral tribute for the families of the victims.", "\"The families will be campaigning for there to be a full public inquiry.\"", "Houses were searched in the Burley, Beeston and Holbeck areas.", "Last month a video showing Mohammad Siddique Khan saying goodbye to his child was shown in the court.", "Khawaja is considered an un-indicted co-conspirator with relation to the British case.", "West Yorkshire Police searched six houses in Leeds today in connection with the London bombings.", "The men arrive at King's Cross at 8:55 a.m. and are seen at Baker Steet at midday.", "The images show three of the bombers entering Luton station, before travelling to King's Cross station where they are pictured.", "The first photograph is a passport photo of Hasib Hussain and the second shows is a crop of a CCTV image from Luton station at 7:20 on the day of the attack.", "Initially thought to have been the suicide bomber on the Piccadilly Line train.", "52 people were killed and hundreds more injured on July 7th 2005 when four suicide bombers blew themselves up on three separate London Underground trains and a public bus.", "\"A narrative of events will not satisfy anybody.\""]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Tributes paid to the victims of the July 7 2005 London bombings Four years after the 7/7 London bombings, when 52 people were killed and about 700 were injured by a series of suicide bombings on transport networks, a 1.4 tonne stainless steel plaque with the names of the victims has been unveiled in Hyde Park.", "52 stainless steel columns standing 3.5m tall were also inaugurated.", "All four suicide bombers were UK residents.", "Al Qaeda has claimed responsibility for the attacks, but police are unsure of its exact role in the attack.", "Survivors of the bombings launched a campaign for public inquiry, 
which was rejected by the British government as too expensive and time-consuming.", "The government's attitude infuriated survivors and relatives of the dead.", "According to Scotland Yard, the suicide bombers, whose practice shows 'terrorist methodology', had rehearsed their plan nine days earlier.", "Police were able to trace their movements after recovering tickets and receipts from houses connected to the bombers which indicated the route of their trip.", "Three years after the attacks, footage of the bombers, taken by a CCTV camera nine days before the bombings, has been shown in court as part of the trials of three suspects alleged to be involved in the preparation of the bombings.", "Meanwhile, five men, all British nationals, were found guilty for their part in an unsuccessful plot to carry out fertiliser bombings in the UK.", "As it was revealed, some of them had met with two of the London subway bombers."]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["After the London bombings in July 2005, the British Police searched six houses in Leeds.", "It was confirmed that the explosives used, came from the building region.", "At the same time, they announced that all bombers were British citizens and that the surveillance cameras taped them on the day of the bombings near the attacked areas.", "In order to trace the suspects, two pictures of Hassib Hussain were published and a line was established, so that eye witnesses could testify information related to him.", "During September 2005, Scotland Yard revealed that the bombers had rehearsed the attacks nine days earlier, according to statements and CCTV tapes.", "Also, during the same period, the suspect for the attacks Hussain Osman was extradited to the UK by the Italian authorities.", "On December 2005, the survivors of the attacks participated in the promotion campaign for public inquiry into the terrorist's attacks, trying to push the government to give up the solution of \u0093narration 
of events by eye witnesses\u0094.", "A year after the attacks, Londoners honored the victims.", "Officials laid flowers at the location of the explosions and the prime minister made statements.", "On May 2007 five suspects were found guilty for their part in an unsuccessful plot to carry out bombings in the UK.", "While a year after, in the trial of the suspects for the attacks of 2005, the jury was shown footage of their moves.", "Finally, two years after the attacks, 52 stainless steel columns were placed in memory of the victims."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["London Metropolitan Police searched six houses in Leeds in order to find elements about the terrorist attacks on july 7.", "The raids were dangerous and long lasting.", "The suspects of the bombings in London are all British nationals.", "Four of them alleged suicide bombers.", "The Metropolitan Police released two photographs of one of the London bombers and made a request for any additional information about his whearabouts in order to succeed in arresting him.", "Another bomb suspect, Hussain Osman, who was accused of planting the failed bomb at the Underground station, arrived in London and had been arrested following his extradition from Italy.", "Survivors of the London Bombings set up an online petition calling for an independent Public Inquiry into the attacks.", "The British government rejected calls for a Public Inquiry.", "But survivors argue that a comprehensive investigation could teach valuable lessons which may help reduce the likelihood of future attacks, and improve the response capabilities of the emergency services.", "One year after the terrorist attacks, the Londoners honor the memory of victims.", "Simultaneously, a video links the attacks with Al-Qaeda.", "In 2007, five men were found guilty of plotting to cause an explosion and have been jailed with life sentences.", "Some of the fertiliser bomb conspirators had met with two of the London subway 
bombers.", "A court in the United Kingdom has been shown footage of the bombers that attacked London.", "Four years later, London remembers the vistims of terrorism and the police goes on investigating the case, thoroughly."]}]} 7 | {"instance_id": "M001", "summarizer_id": "A", "summarizer_type": "reference", "summary": {"summarizer_id": "A", "summarizer_type": "reference", "text": ["Tributes paid to the victims of the July 7 2005 London bombings Four years after the 7/7 London bombings, when 52 people were killed and about 700 were injured by a series of suicide bombings on transport networks, a 1.4 tonne stainless steel plaque with the names of the victims has been unveiled in Hyde Park.", "52 stainless steel columns standing 3.5m tall were also inaugurated.", "All four suicide bombers were UK residents.", "Al Qaeda has claimed responsibility for the attacks, but police are unsure of its exact role in the attack.", "Survivors of the bombings launched a campaign for public inquiry, which was rejected by the British government as too expensive and time-consuming.", "The government's attitude infuriated survivors and relatives of the dead.", "According to Scotland Yard, the suicide bombers, whose practice shows 'terrorist methodology', had rehearsed their plan nine days earlier.", "Police were able to trace their movements after recovering tickets and receipts from houses connected to the bombers which indicated the route of their trip.", "Three years after the attacks, footage of the bombers, taken by a CCTV camera nine days before the bombings, has been shown in court as part of the trials of three suspects alleged to be involved in the preparation of the bombings.", "Meanwhile, five men, all British nationals, were found guilty for their part in an unsuccessful plot to carry out fertiliser bombings in the UK.", "As it was revealed, some of them had met with two of the London subway bombers."]}, "references": [{"summarizer_id": "B", "summarizer_type": "reference", 
"text": ["After the London bombings in July 2005, the British Police searched six houses in Leeds.", "It was confirmed that the explosives used, came from the building region.", "At the same time, they announced that all bombers were British citizens and that the surveillance cameras taped them on the day of the bombings near the attacked areas.", "In order to trace the suspects, two pictures of Hassib Hussain were published and a line was established, so that eye witnesses could testify information related to him.", "During September 2005, Scotland Yard revealed that the bombers had rehearsed the attacks nine days earlier, according to statements and CCTV tapes.", "Also, during the same period, the suspect for the attacks Hussain Osman was extradited to the UK by the Italian authorities.", "On December 2005, the survivors of the attacks participated in the promotion campaign for public inquiry into the terrorist's attacks, trying to push the government to give up the solution of \u0093narration of events by eye witnesses\u0094.", "A year after the attacks, Londoners honored the victims.", "Officials laid flowers at the location of the explosions and the prime minister made statements.", "On May 2007 five suspects were found guilty for their part in an unsuccessful plot to carry out bombings in the UK.", "While a year after, in the trial of the suspects for the attacks of 2005, the jury was shown footage of their moves.", "Finally, two years after the attacks, 52 stainless steel columns were placed in memory of the victims."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["London Metropolitan Police searched six houses in Leeds in order to find elements about the terrorist attacks on july 7.", "The raids were dangerous and long lasting.", "The suspects of the bombings in London are all British nationals.", "Four of them alleged suicide bombers.", "The Metropolitan Police released two photographs of one of the London bombers and made a request for 
any additional information about his whearabouts in order to succeed in arresting him.", "Another bomb suspect, Hussain Osman, who was accused of planting the failed bomb at the Underground station, arrived in London and had been arrested following his extradition from Italy.", "Survivors of the London Bombings set up an online petition calling for an independent Public Inquiry into the attacks.", "The British government rejected calls for a Public Inquiry.", "But survivors argue that a comprehensive investigation could teach valuable lessons which may help reduce the likelihood of future attacks, and improve the response capabilities of the emergency services.", "One year after the terrorist attacks, the Londoners honor the memory of victims.", "Simultaneously, a video links the attacks with Al-Qaeda.", "In 2007, five men were found guilty of plotting to cause an explosion and have been jailed with life sentences.", "Some of the fertiliser bomb conspirators had met with two of the London subway bombers.", "A court in the United Kingdom has been shown footage of the bombers that attacked London.", "Four years later, London remembers the vistims of terrorism and the police goes on investigating the case, thoroughly."]}]} 8 | {"instance_id": "M001", "summarizer_id": "B", "summarizer_type": "reference", "summary": {"summarizer_id": "B", "summarizer_type": "reference", "text": ["After the London bombings in July 2005, the British Police searched six houses in Leeds.", "It was confirmed that the explosives used, came from the building region.", "At the same time, they announced that all bombers were British citizens and that the surveillance cameras taped them on the day of the bombings near the attacked areas.", "In order to trace the suspects, two pictures of Hassib Hussain were published and a line was established, so that eye witnesses could testify information related to him.", "During September 2005, Scotland Yard revealed that the bombers had rehearsed the 
attacks nine days earlier, according to statements and CCTV tapes.", "Also, during the same period, the suspect for the attacks Hussain Osman was extradited to the UK by the Italian authorities.", "On December 2005, the survivors of the attacks participated in the promotion campaign for public inquiry into the terrorist's attacks, trying to push the government to give up the solution of \u0093narration of events by eye witnesses\u0094.", "A year after the attacks, Londoners honored the victims.", "Officials laid flowers at the location of the explosions and the prime minister made statements.", "On May 2007 five suspects were found guilty for their part in an unsuccessful plot to carry out bombings in the UK.", "While a year after, in the trial of the suspects for the attacks of 2005, the jury was shown footage of their moves.", "Finally, two years after the attacks, 52 stainless steel columns were placed in memory of the victims."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Tributes paid to the victims of the July 7 2005 London bombings Four years after the 7/7 London bombings, when 52 people were killed and about 700 were injured by a series of suicide bombings on transport networks, a 1.4 tonne stainless steel plaque with the names of the victims has been unveiled in Hyde Park.", "52 stainless steel columns standing 3.5m tall were also inaugurated.", "All four suicide bombers were UK residents.", "Al Qaeda has claimed responsibility for the attacks, but police are unsure of its exact role in the attack.", "Survivors of the bombings launched a campaign for public inquiry, which was rejected by the British government as too expensive and time-consuming.", "The government's attitude infuriated survivors and relatives of the dead.", "According to Scotland Yard, the suicide bombers, whose practice shows 'terrorist methodology', had rehearsed their plan nine days earlier.", "Police were able to trace their movements after 
recovering tickets and receipts from houses connected to the bombers which indicated the route of their trip.", "Three years after the attacks, footage of the bombers, taken by a CCTV camera nine days before the bombings, has been shown in court as part of the trials of three suspects alleged to be involved in the preparation of the bombings.", "Meanwhile, five men, all British nationals, were found guilty for their part in an unsuccessful plot to carry out fertiliser bombings in the UK.", "As it was revealed, some of them had met with two of the London subway bombers."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["London Metropolitan Police searched six houses in Leeds in order to find elements about the terrorist attacks on july 7.", "The raids were dangerous and long lasting.", "The suspects of the bombings in London are all British nationals.", "Four of them alleged suicide bombers.", "The Metropolitan Police released two photographs of one of the London bombers and made a request for any additional information about his whearabouts in order to succeed in arresting him.", "Another bomb suspect, Hussain Osman, who was accused of planting the failed bomb at the Underground station, arrived in London and had been arrested following his extradition from Italy.", "Survivors of the London Bombings set up an online petition calling for an independent Public Inquiry into the attacks.", "The British government rejected calls for a Public Inquiry.", "But survivors argue that a comprehensive investigation could teach valuable lessons which may help reduce the likelihood of future attacks, and improve the response capabilities of the emergency services.", "One year after the terrorist attacks, the Londoners honor the memory of victims.", "Simultaneously, a video links the attacks with Al-Qaeda.", "In 2007, five men were found guilty of plotting to cause an explosion and have been jailed with life sentences.", "Some of the fertiliser bomb conspirators 
had met with two of the London subway bombers.", "A court in the United Kingdom has been shown footage of the bombers that attacked London.", "Four years later, London remembers the vistims of terrorism and the police goes on investigating the case, thoroughly."]}]} 9 | {"instance_id": "M002", "summarizer_id": "1", "summarizer_type": "peer", "summary": {"summarizer_id": "1", "summarizer_type": "peer", "text": ["The United Kingdom has frozen all bilateral business deals with Iran until all 15 British sailors and marines, who were detained by Iranian forces on March 23 are released.", "Iranian President Mahmoud Ahmadinejad on Wednesday announced that he would free the fifteen British captured navy personnel as a \"gift to the British people.\"", "Iran stated Tuesday that the sailors and marines are being treated \"humanely\" and are in \"good health.\"", "On March 23, the fifteen sailors and marines from the frigate HMS Cornwall had been inspecting a ship, in what the UK identified as Iraqi waters, when they were surrounded by Iranian gunboats and taken into custody. 
Iran claims the UK forces were in Iranian waters, and are still detaining the fifteen.", "The European Union has released a statement calling for the release of all 15 British sailors and marines being detained in Iran and that \"appropriate measures\" will be taken if Iran refuses to release them.", "Iran's National Security Council has announced that it will \"suspend\" the releasing of 15 British sailors and marines detained by Iranian forces on March 23."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Two years after the seizure of Royal Navy personnel by Iran, two inquiries, that examined the British Ministry's of Defence\u0092 handling, identified \u0093weaknesses in training, communications and the handling of intelligence\u0094 as well as \"collective failure of judgement\".", "The fifteen sailors and marines, from the frigate HMS Cornwall, were captured by Iranian border guards on March 23 in the Persian Gulf, while they were inspecting, in accordance with UN Security Council Resolution 1723, a ship believed to be smuggling cars into Iraq.", "The UK insisted they were operating in Iraqi waters, while Iran claimed they entered illegally into Iran's territorial waters and that they could face charges of espionage.", "If those charges were brought against them, the result would be heavy punishment by current Iranian law.", "On 28 March, British Prime Minister froze all bilateral business deals with Iran.", "The next day, Iran announced that it will \"suspend\" the releasing of 15 British personnel, due to the political ballyhoo by London.", "The EU called the Iranian seizure a \"clear breach\" of international law.", "Meanwhile, footage of all 15 British personnel had been broadcast on Iranian TV, with one of the sailors saying that the soldiers were in Iranian waters at the time of their detainment.", "The British government claimed that the confessions were extracted under duress.", "Few days later, Iranian President 
announced that he would free them as a \"gift to the British people\".", "The fifteen British navy personnel landed at Heathrow on 5 April, after thirteen days of captivity."]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["In March 2007 a British frigate with 15 Navy personnel, including a woman, have been captured by Iranian authorities, while they were investigating a ship suspected of smuggling cars in Iraq , on charges that they entered illegally into Iran's territorial waters.", "The detainees were taken in Tehran and unofficial information indicated that if they charge them with espionage, the result would be heavy punishment.", "Britain reacted immediately.", "\"COBRA emergency committee\" was activated.", "Blair expressed his disappointment and he demanded their immediate release.", "He also stated that he wishes a peaceful solution for the issue.", "U.S.A and E.U expressed their support and their claim for immediate release.", "The coordinates of Iranians about the location of the ship wasn\u0092t true according to Britain and threatened to cease all business deals with Iran.", "The next days and although originally Iranians stated that they would release the female prisoner, they changed terms due to the hot negotiations.", "The Iranians, however, stated that the prisoners were in good health and their detention was decent.", "After 13 days of captivity Ahmadinejad announced that they would be released as a \"gift to the British people.\"", "After a ceremony they returned to London.", "One detainee said that during their captivity they suffered mental stress and that they have admitted the category in order to deter further tensions.", "Finally, two surveys conducted after a month concluded that the rapture was the result of unfortunate accumulation of factors rather than human mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["Fifteen British Royal Navy personnel was been captured by Iranian authorities at 
gunpoint in the Persian Gulf off the Iraqi coast.", "According to Britain, a ship that was believed to be smuggling cars into Iraq, was been checked.", "In accordance with Iran, a British ship was approaching an Iranian site, which formerly belonged to Iraq.", "The sailors had been arrested for further investigation.", "The staff admitted the violation.", "The sailors would be held until five Iranian guards' had been released, who had been arrested in Iraq.", "Britain, supported by the USA and the Europe, required the crew's return and triggered the COBRA.", "The Iranians had threatened of reprisals if they kidnapped members of the Iranian Revolutionary Guard.", "The British crew may be accused of being spies and punished exemplarily.", "Britain had frozen every business arrangement with Iran and had presented evidence that the sailors were in Iraq.", "Footage of the British crew had been broadcasted on Iranian TV, there a sailor, with a black \"head scarf\", admitted that the soldiers were in Iranian waters at the time of their detainment.", "According to the Iranian Foreign Minister, the female sailor would be released immediately.", "Britain denied the possibility of release.", "Iran suspended the release of the British, due to the non-negotiable British stance.", "On April 4, Iranian President freed the crew.", "According to Britain the British confessions were extracted under pressure.", "The crew returned after 13 days of captivity.", "The crew described their capture and detention by Iran."]}]} 10 | {"instance_id": "M002", "summarizer_id": "2", "summarizer_type": "peer", "summary": {"summarizer_id": "2", "summarizer_type": "peer", "text": ["The sailors and marines, from the frigate HMS Cornwall, had been inspecting, in accordance with UN Security Council Resolution 1723, a ship that was believed to be smuggling cars into Iraq, though it was subsequently cleared after inspection when Iranian gunboats surrounded the sailors and arrested them at gunpoint.", 
"The sailors and marines were captured by Iranian border guards on March 23 in the Persian Gulf near the Shatt al Arab waterway.", "The United Kingdom has frozen all bilateral business deals with Iran until all 15 British sailors and marines.", "The United States called for the immediate release of the sailors.", "After the UK queried the statement by General Alireza Afshar, the Iranian government gave a revised position for the incident, now placing it inside Iranian territorial waters.", "Iran claims the UK forces were in Iranian waters, and are still detaining the fifteen.", "The fifteen British captured navy personnel detained by Iran would be released, saying they have been pardoned as a gift to the British people.", "Iranian media said that the British sailors 'shouted for joy' at the news.", "The Australian reported that an internet website \"run by associates of Mahmoud Ahmadinejad\" states that the 15 British sailors who were arrested by Iranian Revolutionary Guards could face charges of espionage.", "Ahmadinejad met with the detainees shortly after a press conference where he announced that the release will be immediate, and that they will be taken to the airport.", "The Royal Navy insists that they were operating in Iraqi waters."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Two years after the seizure of Royal Navy personnel by Iran, two inquiries, that examined the British Ministry's of Defence\u0092 handling, identified \u0093weaknesses in training, communications and the handling of intelligence\u0094 as well as \"collective failure of judgement\".", "The fifteen sailors and marines, from the frigate HMS Cornwall, were captured by Iranian border guards on March 23 in the Persian Gulf, while they were inspecting, in accordance with UN Security Council Resolution 1723, a ship believed to be smuggling cars into Iraq.", "The UK insisted they were operating in Iraqi waters, while Iran claimed they entered illegally 
into Iran's territorial waters and that they could face charges of espionage.", "If those charges were brought against them, the result would be heavy punishment by current Iranian law.", "On 28 March, British Prime Minister froze all bilateral business deals with Iran.", "The next day, Iran announced that it will \"suspend\" the releasing of 15 British personnel, due to the political ballyhoo by London.", "The EU called the Iranian seizure a \"clear breach\" of international law.", "Meanwhile, footage of all 15 British personnel had been broadcast on Iranian TV, with one of the sailors saying that the soldiers were in Iranian waters at the time of their detainment.", "The British government claimed that the confessions were extracted under duress.", "Few days later, Iranian President announced that he would free them as a \"gift to the British people\".", "The fifteen British navy personnel landed at Heathrow on 5 April, after thirteen days of captivity."]}, {"summarizer_id": "B", "summarizer_type": "reference", "text": ["In March 2007 a British frigate with 15 Navy personnel, including a woman, have been captured by Iranian authorities, while they were investigating a ship suspected of smuggling cars in Iraq , on charges that they entered illegally into Iran's territorial waters.", "The detainees were taken in Tehran and unofficial information indicated that if they charge them with espionage, the result would be heavy punishment.", "Britain reacted immediately.", "\"COBRA emergency committee\" was activated.", "Blair expressed his disappointment and he demanded their immediate release.", "He also stated that he wishes a peaceful solution for the issue.", "U.S.A and E.U expressed their support and their claim for immediate release.", "The coordinates of Iranians about the location of the ship wasn\u0092t true according to Britain and threatened to cease all business deals with Iran.", "The next days and although originally Iranians stated that they would release 
the female prisoner, they changed terms due to the hot negotiations.", "The Iranians, however, stated that the prisoners were in good health and their detention was decent.", "After 13 days of captivity Ahmadinejad announced that they would be released as a \"gift to the British people.\"", "After a ceremony they returned to London.", "One detainee said that during their captivity they suffered mental stress and that they have admitted the category in order to deter further tensions.", "Finally, two surveys conducted after a month concluded that the rapture was the result of unfortunate accumulation of factors rather than human mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["Fifteen British Royal Navy personnel was been captured by Iranian authorities at gunpoint in the Persian Gulf off the Iraqi coast.", "According to Britain, a ship that was believed to be smuggling cars into Iraq, was been checked.", "In accordance with Iran, a British ship was approaching an Iranian site, which formerly belonged to Iraq.", "The sailors had been arrested for further investigation.", "The staff admitted the violation.", "The sailors would be held until five Iranian guards' had been released, who had been arrested in Iraq.", "Britain, supported by the USA and the Europe, required the crew's return and triggered the COBRA.", "The Iranians had threatened of reprisals if they kidnapped members of the Iranian Revolutionary Guard.", "The British crew may be accused of being spies and punished exemplarily.", "Britain had frozen every business arrangement with Iran and had presented evidence that the sailors were in Iraq.", "Footage of the British crew had been broadcasted on Iranian TV, there a sailor, with a black \"head scarf\", admitted that the soldiers were in Iranian waters at the time of their detainment.", "According to the Iranian Foreign Minister, the female sailor would be released immediately.", "Britain denied the possibility of release.", 
"Iran suspended the release of the British, due to the non-negotiable British stance.", "On April 4, Iranian President freed the crew.", "According to Britain the British confessions were extracted under pressure.", "The crew returned after 13 days of captivity.", "The crew described their capture and detention by Iran."]}]} 11 | {"instance_id": "M002", "summarizer_id": "A", "summarizer_type": "reference", "summary": {"summarizer_id": "A", "summarizer_type": "reference", "text": ["Two years after the seizure of Royal Navy personnel by Iran, two inquiries, that examined the British Ministry's of Defence\u0092 handling, identified \u0093weaknesses in training, communications and the handling of intelligence\u0094 as well as \"collective failure of judgement\".", "The fifteen sailors and marines, from the frigate HMS Cornwall, were captured by Iranian border guards on March 23 in the Persian Gulf, while they were inspecting, in accordance with UN Security Council Resolution 1723, a ship believed to be smuggling cars into Iraq.", "The UK insisted they were operating in Iraqi waters, while Iran claimed they entered illegally into Iran's territorial waters and that they could face charges of espionage.", "If those charges were brought against them, the result would be heavy punishment by current Iranian law.", "On 28 March, British Prime Minister froze all bilateral business deals with Iran.", "The next day, Iran announced that it will \"suspend\" the releasing of 15 British personnel, due to the political ballyhoo by London.", "The EU called the Iranian seizure a \"clear breach\" of international law.", "Meanwhile, footage of all 15 British personnel had been broadcast on Iranian TV, with one of the sailors saying that the soldiers were in Iranian waters at the time of their detainment.", "The British government claimed that the confessions were extracted under duress.", "Few days later, Iranian President announced that he would free them as a \"gift to the British 
people\".", "The fifteen British navy personnel landed at Heathrow on 5 April, after thirteen days of captivity."]}, "references": [{"summarizer_id": "B", "summarizer_type": "reference", "text": ["In March 2007 a British frigate with 15 Navy personnel, including a woman, have been captured by Iranian authorities, while they were investigating a ship suspected of smuggling cars in Iraq , on charges that they entered illegally into Iran's territorial waters.", "The detainees were taken in Tehran and unofficial information indicated that if they charge them with espionage, the result would be heavy punishment.", "Britain reacted immediately.", "\"COBRA emergency committee\" was activated.", "Blair expressed his disappointment and he demanded their immediate release.", "He also stated that he wishes a peaceful solution for the issue.", "U.S.A and E.U expressed their support and their claim for immediate release.", "The coordinates of Iranians about the location of the ship wasn\u0092t true according to Britain and threatened to cease all business deals with Iran.", "The next days and although originally Iranians stated that they would release the female prisoner, they changed terms due to the hot negotiations.", "The Iranians, however, stated that the prisoners were in good health and their detention was decent.", "After 13 days of captivity Ahmadinejad announced that they would be released as a \"gift to the British people.\"", "After a ceremony they returned to London.", "One detainee said that during their captivity they suffered mental stress and that they have admitted the category in order to deter further tensions.", "Finally, two surveys conducted after a month concluded that the rapture was the result of unfortunate accumulation of factors rather than human mistake."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["Fifteen British Royal Navy personnel was been captured by Iranian authorities at gunpoint in the Persian Gulf off the Iraqi 
coast.", "According to Britain, a ship that was believed to be smuggling cars into Iraq, was been checked.", "In accordance with Iran, a British ship was approaching an Iranian site, which formerly belonged to Iraq.", "The sailors had been arrested for further investigation.", "The staff admitted the violation.", "The sailors would be held until five Iranian guards' had been released, who had been arrested in Iraq.", "Britain, supported by the USA and the Europe, required the crew's return and triggered the COBRA.", "The Iranians had threatened of reprisals if they kidnapped members of the Iranian Revolutionary Guard.", "The British crew may be accused of being spies and punished exemplarily.", "Britain had frozen every business arrangement with Iran and had presented evidence that the sailors were in Iraq.", "Footage of the British crew had been broadcasted on Iranian TV, there a sailor, with a black \"head scarf\", admitted that the soldiers were in Iranian waters at the time of their detainment.", "According to the Iranian Foreign Minister, the female sailor would be released immediately.", "Britain denied the possibility of release.", "Iran suspended the release of the British, due to the non-negotiable British stance.", "On April 4, Iranian President freed the crew.", "According to Britain the British confessions were extracted under pressure.", "The crew returned after 13 days of captivity.", "The crew described their capture and detention by Iran."]}]} 12 | {"instance_id": "M002", "summarizer_id": "B", "summarizer_type": "reference", "summary": {"summarizer_id": "B", "summarizer_type": "reference", "text": ["In March 2007 a British frigate with 15 Navy personnel, including a woman, have been captured by Iranian authorities, while they were investigating a ship suspected of smuggling cars in Iraq , on charges that they entered illegally into Iran's territorial waters.", "The detainees were taken in Tehran and unofficial information indicated that if they 
charge them with espionage, the result would be heavy punishment.", "Britain reacted immediately.", "\"COBRA emergency committee\" was activated.", "Blair expressed his disappointment and he demanded their immediate release.", "He also stated that he wishes a peaceful solution for the issue.", "U.S.A and E.U expressed their support and their claim for immediate release.", "The coordinates of Iranians about the location of the ship wasn\u0092t true according to Britain and threatened to cease all business deals with Iran.", "The next days and although originally Iranians stated that they would release the female prisoner, they changed terms due to the hot negotiations.", "The Iranians, however, stated that the prisoners were in good health and their detention was decent.", "After 13 days of captivity Ahmadinejad announced that they would be released as a \"gift to the British people.\"", "After a ceremony they returned to London.", "One detainee said that during their captivity they suffered mental stress and that they have admitted the category in order to deter further tensions.", "Finally, two surveys conducted after a month concluded that the rapture was the result of unfortunate accumulation of factors rather than human mistake."]}, "references": [{"summarizer_id": "A", "summarizer_type": "reference", "text": ["Two years after the seizure of Royal Navy personnel by Iran, two inquiries, that examined the British Ministry's of Defence\u0092 handling, identified \u0093weaknesses in training, communications and the handling of intelligence\u0094 as well as \"collective failure of judgement\".", "The fifteen sailors and marines, from the frigate HMS Cornwall, were captured by Iranian border guards on March 23 in the Persian Gulf, while they were inspecting, in accordance with UN Security Council Resolution 1723, a ship believed to be smuggling cars into Iraq.", "The UK insisted they were operating in Iraqi waters, while Iran claimed they entered illegally into 
Iran's territorial waters and that they could face charges of espionage.", "If those charges were brought against them, the result would be heavy punishment by current Iranian law.", "On 28 March, British Prime Minister froze all bilateral business deals with Iran.", "The next day, Iran announced that it will \"suspend\" the releasing of 15 British personnel, due to the political ballyhoo by London.", "The EU called the Iranian seizure a \"clear breach\" of international law.", "Meanwhile, footage of all 15 British personnel had been broadcast on Iranian TV, with one of the sailors saying that the soldiers were in Iranian waters at the time of their detainment.", "The British government claimed that the confessions were extracted under duress.", "Few days later, Iranian President announced that he would free them as a \"gift to the British people\".", "The fifteen British navy personnel landed at Heathrow on 5 April, after thirteen days of captivity."]}, {"summarizer_id": "C", "summarizer_type": "reference", "text": ["Fifteen British Royal Navy personnel was been captured by Iranian authorities at gunpoint in the Persian Gulf off the Iraqi coast.", "According to Britain, a ship that was believed to be smuggling cars into Iraq, was been checked.", "In accordance with Iran, a British ship was approaching an Iranian site, which formerly belonged to Iraq.", "The sailors had been arrested for further investigation.", "The staff admitted the violation.", "The sailors would be held until five Iranian guards' had been released, who had been arrested in Iraq.", "Britain, supported by the USA and the Europe, required the crew's return and triggered the COBRA.", "The Iranians had threatened of reprisals if they kidnapped members of the Iranian Revolutionary Guard.", "The British crew may be accused of being spies and punished exemplarily.", "Britain had frozen every business arrangement with Iran and had presented evidence that the sailors were in Iraq.", "Footage of the 
British crew had been broadcasted on Iranian TV, there a sailor, with a black \"head scarf\", admitted that the soldiers were in Iranian waters at the time of their detainment.", "According to the Iranian Foreign Minister, the female sailor would be released immediately.", "Britain denied the possibility of release.", "Iran suspended the release of the British, due to the non-negotiable British stance.", "On April 4, Iranian President freed the crew.", "According to Britain the British confessions were extracted under pressure.", "The crew returned after 13 days of captivity.", "The crew described their capture and detention by Iran."]}]} -------------------------------------------------------------------------------- /qaeval/tests/generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/tests/generation/__init__.py -------------------------------------------------------------------------------- /qaeval/tests/generation/model_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import unittest 4 | 5 | from qaeval.generation.model import QuestionGenerationModel 6 | 7 | 8 | @pytest.mark.skipif('GENERATION_MODEL' not in os.environ, reason='Generation model environment variable not set') 9 | class TestGenerationModel(unittest.TestCase): 10 | def test_generation(self): 11 | model = QuestionGenerationModel(os.environ['GENERATION_MODEL']) 12 | 13 | # "The superjumbo Airbus A380" 14 | question = model.generate('The superjumbo Airbus A380 , the world \'s largest commercial airliner , took off Wednesday into cloudy skies over southwestern France for its second test flight .', 15 | 0, 26) 16 | assert question == 'What world\'s largest commercial airliner took off Wednesday into cloudy skies over southwestern France for its second test flight?' 
17 | 18 | # "the world 's largest commercial airliner" 19 | question = model.generate( 20 | 'The superjumbo Airbus A380 , the world \'s largest commercial airliner , took off Wednesday into cloudy skies over southwestern France for its second test flight .', 21 | 29, 69) 22 | assert question == 'What superjumbo Airbus A380 took off Wednesday into cloudy skies over southwestern France for its second test flight?' 23 | -------------------------------------------------------------------------------- /qaeval/tests/metric_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pytest 4 | import unittest 5 | from typing import List 6 | 7 | from qaeval import QAEval, FIXTURES_ROOT 8 | 9 | 10 | @pytest.mark.skipif( 11 | "GENERATION_MODEL" not in os.environ, 12 | reason="`GENERATION_MODEL` environment variable not set", 13 | ) 14 | @pytest.mark.skipif( 15 | "ANSWERING_MODEL" not in os.environ, 16 | reason="`ANSWERING_MODEL` environment variable not set", 17 | ) 18 | class TestQAEval(unittest.TestCase): 19 | def setUp(self) -> None: 20 | self.summaries = [] 21 | self.references_list = [] 22 | with open(f"{FIXTURES_ROOT}/multiling2011.jsonl", "r") as f: 23 | for line in f: 24 | data = json.loads(line) 25 | summary = data["summary"]["text"] 26 | references = [reference["text"] for reference in data["references"]] 27 | self.summaries.append(summary) 28 | self.references_list.append(references) 29 | 30 | def _check_output(self, metric: QAEval, expected_output: List) -> None: 31 | actual_output = metric.score_batch(self.summaries, self.references_list) 32 | assert len(expected_output) == len(actual_output) 33 | for expected, actual in zip(expected_output, actual_output): 34 | assert len(expected) == len(actual) == 1 35 | expected = expected["qa-eval"] 36 | actual = actual["qa-eval"] 37 | assert len(expected) == len(actual) 38 | for metric in expected.keys(): 39 | assert expected[metric] == 
pytest.approx(actual[metric], abs=1e-5) 40 | 41 | def test_qaeval(self): 42 | # This is a regression test, not necessarily a test for correctness 43 | metric = QAEval( 44 | generation_model_path=os.environ["GENERATION_MODEL"], 45 | answering_model_dir=os.environ["ANSWERING_MODEL"], 46 | ) 47 | expected_output = [ 48 | { 49 | "qa-eval": { 50 | "is_answered": 0.2171952736318408, 51 | "em": 0.03078358208955224, 52 | "f1": 0.05688114487088367, 53 | } 54 | }, 55 | { 56 | "qa-eval": { 57 | "is_answered": 0.2706778606965174, 58 | "em": 0.08286691542288557, 59 | "f1": 0.11367400349443259, 60 | } 61 | }, 62 | { 63 | "qa-eval": { 64 | "is_answered": 0.4552238805970149, 65 | "em": 0.05223880597014925, 66 | "f1": 0.10360696517412935, 67 | } 68 | }, 69 | { 70 | "qa-eval": { 71 | "is_answered": 0.2671408582089552, 72 | "em": 0.04582555970149253, 73 | "f1": 0.05402803689883914, 74 | } 75 | }, 76 | { 77 | "qa-eval": { 78 | "is_answered": 0.17126063232225966, 79 | "em": 0.025276841598459315, 80 | "f1": 0.04173576561636263, 81 | } 82 | }, 83 | { 84 | "qa-eval": { 85 | "is_answered": 0.3291829383548209, 86 | "em": 0.029159756771697066, 87 | "f1": 0.0543755246092705, 88 | } 89 | }, 90 | { 91 | "qa-eval": { 92 | "is_answered": 0.34836235489220563, 93 | "em": 0.05223880597014925, 94 | "f1": 0.09381412591922542, 95 | } 96 | }, 97 | { 98 | "qa-eval": { 99 | "is_answered": 0.4337987481945113, 100 | "em": 0.04537794896485315, 101 | "f1": 0.12145356515842792, 102 | } 103 | }, 104 | { 105 | "qa-eval": { 106 | "is_answered": 0.44427039821776665, 107 | "em": 0.06434837092731831, 108 | "f1": 0.10272833079850623, 109 | } 110 | }, 111 | { 112 | "qa-eval": { 113 | "is_answered": 0.40391255917571706, 114 | "em": 0.09642160957950431, 115 | "f1": 0.13482779720666102, 116 | } 117 | }, 118 | { 119 | "qa-eval": { 120 | "is_answered": 0.5345864661654135, 121 | "em": 0.12349624060150374, 122 | "f1": 0.16393273976257167, 123 | } 124 | }, 125 | { 126 | "qa-eval": { 127 | "is_answered": 0.5204365079365079, 
128 | "em": 0.12678571428571428, 129 | "f1": 0.16151234567901235, 130 | } 131 | }, 132 | ] 133 | self._check_output(metric, expected_output) 134 | 135 | @pytest.mark.skipif( 136 | "LERC_MODEL" not in os.environ, 137 | reason="`LERC_MODEL` environment variable not set", 138 | ) 139 | @pytest.mark.skipif( 140 | "LERC_PRETRAINED_MODEL" not in os.environ, 141 | reason="`LERC_PRETRAINED_MODEL` environment variable not set", 142 | ) 143 | def test_qaeval_with_lerc(self): 144 | # This is a regression test, not necessarily a test for correctness 145 | metric = QAEval( 146 | generation_model_path=os.environ["GENERATION_MODEL"], 147 | answering_model_dir=os.environ["ANSWERING_MODEL"], 148 | use_lerc=True, 149 | lerc_model_path=os.environ["LERC_MODEL"], 150 | lerc_pretrained_model_path=os.environ["LERC_PRETRAINED_MODEL"], 151 | ) 152 | expected_output = [ 153 | { 154 | "qa-eval": { 155 | "is_answered": 0.2171952736318408, 156 | "em": 0.03078358208955224, 157 | "f1": 0.05688114487088367, 158 | "lerc": 0.5280342313984585, 159 | } 160 | }, 161 | { 162 | "qa-eval": { 163 | "is_answered": 0.2706778606965174, 164 | "em": 0.08286691542288557, 165 | "f1": 0.11367400349443259, 166 | "lerc": 0.8588525844061404, 167 | } 168 | }, 169 | { 170 | "qa-eval": { 171 | "is_answered": 0.4552238805970149, 172 | "em": 0.05223880597014925, 173 | "f1": 0.10360696517412935, 174 | "lerc": 1.2307390170310861, 175 | } 176 | }, 177 | { 178 | "qa-eval": { 179 | "is_answered": 0.2671408582089552, 180 | "em": 0.04582555970149253, 181 | "f1": 0.05402803689883914, 182 | "lerc": 0.6782244059549116, 183 | } 184 | }, 185 | { 186 | "qa-eval": { 187 | "is_answered": 0.17126063232225966, 188 | "em": 0.025276841598459315, 189 | "f1": 0.04173576561636263, 190 | "lerc": 0.40871678001285994, 191 | } 192 | }, 193 | { 194 | "qa-eval": { 195 | "is_answered": 0.3291829383548209, 196 | "em": 0.029159756771697066, 197 | "f1": 0.0543755246092705, 198 | "lerc": 0.6477515654560587, 199 | } 200 | }, 201 | { 202 | "qa-eval": { 
203 | "is_answered": 0.34836235489220563, 204 | "em": 0.05223880597014925, 205 | "f1": 0.09381412591922542, 206 | "lerc": 0.947292007320556, 207 | } 208 | }, 209 | { 210 | "qa-eval": { 211 | "is_answered": 0.4337987481945113, 212 | "em": 0.04537794896485315, 213 | "f1": 0.12145356515842792, 214 | "lerc": 1.2629075305115793, 215 | } 216 | }, 217 | { 218 | "qa-eval": { 219 | "is_answered": 0.44427039821776665, 220 | "em": 0.06434837092731831, 221 | "f1": 0.10272833079850623, 222 | "lerc": 1.1977039740821571, 223 | } 224 | }, 225 | { 226 | "qa-eval": { 227 | "is_answered": 0.40391255917571706, 228 | "em": 0.09642160957950431, 229 | "f1": 0.13482779720666102, 230 | "lerc": 1.2360802221434326, 231 | } 232 | }, 233 | { 234 | "qa-eval": { 235 | "is_answered": 0.5345864661654135, 236 | "em": 0.12349624060150374, 237 | "f1": 0.16393273976257167, 238 | "lerc": 1.5575424717221045, 239 | } 240 | }, 241 | { 242 | "qa-eval": { 243 | "is_answered": 0.5204365079365079, 244 | "em": 0.12678571428571428, 245 | "f1": 0.16151234567901235, 246 | "lerc": 1.4713040575976408, 247 | } 248 | }, 249 | ] 250 | self._check_output(metric, expected_output) 251 | 252 | @pytest.mark.skipif( 253 | "LERC_MODEL" not in os.environ, 254 | reason="`LERC_MODEL` environment variable not set", 255 | ) 256 | @pytest.mark.skipif( 257 | "LERC_PRETRAINED_MODEL" not in os.environ, 258 | reason="`LERC_PRETRAINED_MODEL` environment variable not set", 259 | ) 260 | def test_return_qa_pairs(self): 261 | metric = QAEval( 262 | generation_model_path=os.environ["GENERATION_MODEL"], 263 | answering_model_dir=os.environ["ANSWERING_MODEL"], 264 | use_lerc=True, 265 | lerc_model_path=os.environ["LERC_MODEL"], 266 | lerc_pretrained_model_path=os.environ["LERC_PRETRAINED_MODEL"], 267 | ) 268 | 269 | summaries = [ 270 | "Dan walked to the bakery this morning.", 271 | "He bought some scones today", 272 | ] 273 | references_list = [ 274 | ["Dan went to buy scones earlier this morning."], 275 | ["Dan went to buy scones earlier 
this morning."], 276 | ] 277 | 278 | results_list = metric.score_batch(summaries, references_list, return_qa_pairs=True) 279 | assert len(results_list) == 2 280 | metrics, qa_pairs_list = results_list[0] 281 | assert metrics["qa-eval"]["is_answered"] == 1.0 282 | assert metrics["qa-eval"]["em"] == 0.5 283 | assert metrics["qa-eval"]["f1"] == 0.5 284 | self.assertAlmostEqual(metrics["qa-eval"]["lerc"], 3.171376943588257, places=4) 285 | assert len(qa_pairs_list) == 1 286 | qa_pairs = qa_pairs_list[0] 287 | assert len(qa_pairs) == 2 288 | assert ( 289 | qa_pairs[0]["question"]["question"] 290 | == "Who went to buy scones earlier this morning?" 291 | ) 292 | assert qa_pairs[0]["prediction"]["prediction"] == "Dan" 293 | assert qa_pairs[0]["prediction"]["start"] == 0 294 | assert qa_pairs[0]["prediction"]["end"] == 3 295 | assert qa_pairs[0]["prediction"]["is_answered"] == 1.0 296 | assert qa_pairs[0]["prediction"]["em"] == 1.0 297 | assert qa_pairs[0]["prediction"]["f1"] == 1.0 298 | self.assertAlmostEqual( 299 | qa_pairs[0]["prediction"]["lerc"], 5.035197734832764, places=4 300 | ) 301 | assert ( 302 | qa_pairs[1]["question"]["question"] 303 | == "What did Dan go to buy earlier this morning?" 
304 | ) 305 | assert qa_pairs[1]["prediction"]["prediction"] == "bakery" 306 | assert qa_pairs[1]["prediction"]["start"] == 18 307 | assert qa_pairs[1]["prediction"]["end"] == 24 308 | assert qa_pairs[1]["prediction"]["is_answered"] == 1.0 309 | assert qa_pairs[1]["prediction"]["em"] == 0.0 310 | assert qa_pairs[1]["prediction"]["f1"] == 0.0 311 | self.assertAlmostEqual( 312 | qa_pairs[1]["prediction"]["lerc"], 1.30755615234375, places=4 313 | ) 314 | 315 | metrics, qa_pairs_list = results_list[1] 316 | assert metrics["qa-eval"]["is_answered"] == 0.5 317 | assert metrics["qa-eval"]["em"] == 0.5 318 | assert metrics["qa-eval"]["f1"] == 0.5 319 | self.assertAlmostEqual(metrics["qa-eval"]["lerc"], 2.492440700531006, places=4) 320 | assert len(qa_pairs_list) == 1 321 | qa_pairs = qa_pairs_list[0] 322 | assert len(qa_pairs) == 2 323 | assert ( 324 | qa_pairs[0]["question"]["question"] 325 | == "Who went to buy scones earlier this morning?" 326 | ) 327 | assert qa_pairs[0]["prediction"]["prediction"] == "He" 328 | assert qa_pairs[0]["prediction"]["start"] == 0 329 | assert qa_pairs[0]["prediction"]["end"] == 2 330 | assert qa_pairs[0]["prediction"]["is_answered"] == 0.0 331 | assert qa_pairs[0]["prediction"]["em"] == 0.0 332 | assert qa_pairs[0]["prediction"]["f1"] == 0.0 333 | assert qa_pairs[0]["prediction"]["lerc"] == 0.0 334 | assert ( 335 | qa_pairs[1]["question"]["question"] 336 | == "What did Dan go to buy earlier this morning?" 
337 | ) 338 | assert qa_pairs[1]["prediction"]["prediction"] == "scones" 339 | assert qa_pairs[1]["prediction"]["start"] == 15 340 | assert qa_pairs[1]["prediction"]["end"] == 21 341 | assert qa_pairs[1]["prediction"]["is_answered"] == 1.0 342 | assert qa_pairs[1]["prediction"]["em"] == 1.0 343 | assert qa_pairs[1]["prediction"]["f1"] == 1.0 344 | self.assertAlmostEqual( 345 | qa_pairs[1]["prediction"]["lerc"], 4.984881401062012, places=4 346 | ) 347 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/tests/scoring/__init__.py -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danieldeutsch/qaeval/dd7273183dd1b2c9995115310ef041daa953ca81/qaeval/tests/scoring/scorers/__init__.py -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/em_test.py: -------------------------------------------------------------------------------- 1 | from qaeval.scoring.scorers import ExactMatchScorer 2 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 3 | 4 | 5 | class TestExactMatchScorer(TestScorer): 6 | @classmethod 7 | def setUpClass(cls) -> None: 8 | cls.scorer = ExactMatchScorer() 9 | 10 | def test_keys(self): 11 | assert self.scorer.keys() == {'em'} 12 | 13 | def test_default_scores(self): 14 | assert self.scorer.default_scores() == {'em': 0.0} 15 | 16 | def test_is_answered(self): 17 | # the transformer library accepts "a jogger" and "the jogger" for exact match 18 | self.assert_expected_output( 19 | self.scorer, 20 | {'em': (1 / 3 + 1 / 1) / 2}, 21 | [{'em': 1 / 3}, {'em': 1 / 
1}], 22 | [[{'em': 0.0}, {'em': 1.0}, {'em': 0.0}], [{'em': 1.0}]] 23 | ) 24 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/f1_test.py: -------------------------------------------------------------------------------- 1 | from qaeval.scoring.scorers import F1Scorer 2 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 3 | 4 | 5 | class TestF1Scorer(TestScorer): 6 | @classmethod 7 | def setUpClass(cls) -> None: 8 | cls.scorer = F1Scorer() 9 | 10 | def test_keys(self): 11 | assert self.scorer.keys() == {'f1'} 12 | 13 | def test_default_scores(self): 14 | assert self.scorer.default_scores() == {'f1': 0.0} 15 | 16 | def test_is_answered(self): 17 | self.assert_expected_output( 18 | self.scorer, 19 | {'f1': (1 / 3 + 1 / 1) / 2}, 20 | [{'f1': 1 / 3}, {'f1': 1 / 1}], 21 | [[{'f1': 0.0}, {'f1': 1.0}, {'f1': 0.0}], [{'f1': 1.0}]] 22 | ) 23 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/is_answered_test.py: -------------------------------------------------------------------------------- 1 | from qaeval.scoring.scorers.is_answered import IsAnsweredScorer 2 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 3 | 4 | 5 | class TestIsAnsweredScorer(TestScorer): 6 | @classmethod 7 | def setUpClass(cls) -> None: 8 | cls.scorer = IsAnsweredScorer() 9 | 10 | def test_keys(self): 11 | assert self.scorer.keys() == {'is_answered'} 12 | 13 | def test_default_scores(self): 14 | assert self.scorer.default_scores() == {'is_answered': 0.0} 15 | 16 | def test_is_answered(self): 17 | self.assert_expected_output( 18 | self.scorer, 19 | {'is_answered': (2 / 3 + 1 / 1) / 2}, 20 | [{'is_answered': 2 / 3}, {'is_answered': 1 / 1}], 21 | [[{'is_answered': 1.0}, {'is_answered': 1.0}, {'is_answered': 0.0}], [{'is_answered': 1.0}]] 22 | ) 23 | -------------------------------------------------------------------------------- 
/qaeval/tests/scoring/scorers/lerc_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from qaeval.scoring.scorers import LERCScorer 5 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 6 | 7 | 8 | @pytest.mark.skipif('LERC_MODEL' not in os.environ or 'LERC_PRETRAINED' not in os.environ, reason='LERC environment variables not set') 9 | class TestLERCScorer(TestScorer): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | cls.scorer = LERCScorer( 13 | model_path=os.environ['LERC_MODEL'], 14 | pretrained_path=os.environ['LERC_PRETRAINED'], 15 | cuda_device=0 16 | ) 17 | 18 | def test_keys(self): 19 | assert self.scorer.keys() == {'lerc'} 20 | 21 | def test_default_scores(self): 22 | assert self.scorer.default_scores() == {'lerc': 0.0} 23 | 24 | def test_is_answered(self): 25 | self.assert_expected_output( 26 | # This is a regression test. It does not ensure these numbers are correct 27 | self.scorer, 28 | {'lerc': (2.5152266025543213 + 4.940724849700928) / 2}, 29 | [{'lerc': 2.5152266025543213}, {'lerc': 4.940724849700928}], 30 | [[{'lerc': 2.5210483074188232}, {'lerc': 5.024631500244141}, {'lerc': 0.0}], [{'lerc': 4.940724849700928}]] 31 | ) 32 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/meta_test.py: -------------------------------------------------------------------------------- 1 | from qaeval.scoring.scorers import IsAnsweredScorer, F1Scorer, MetaScorer 2 | from qaeval.tests.scoring.scorers.scorer_test import TestScorer 3 | 4 | 5 | class TestMetaScorer(TestScorer): 6 | @classmethod 7 | def setUpClass(cls) -> None: 8 | cls.scorer = MetaScorer([ 9 | IsAnsweredScorer(), F1Scorer(), 10 | ]) 11 | 12 | def test_keys(self): 13 | assert self.scorer.keys() == {'is_answered', 'f1'} 14 | 15 | def test_default_scores(self): 16 | assert self.scorer.default_scores() == {'is_answered': 0.0, 'f1': 0.0} 17 | 18 | 
def test_is_answered(self): 19 | self.assert_expected_output( 20 | self.scorer, 21 | {'is_answered': (2 / 3 + 1 / 1) / 2, 'f1': (1 / 3 + 1 / 1) / 2}, 22 | [{'is_answered': 2 / 3, 'f1': 1 / 3}, {'is_answered': 1 / 1, 'f1': 1 / 1}], 23 | [ 24 | [{'is_answered': 1.0, 'f1': 0.0}, {'is_answered': 1.0, 'f1': 1.0}, {'is_answered': 0.0, 'f1': 0.0}], 25 | [{'is_answered': 1.0, 'f1': 1.0}] 26 | ] 27 | ) 28 | -------------------------------------------------------------------------------- /qaeval/tests/scoring/scorers/scorer_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from typing import Dict, List 3 | 4 | from qaeval.scoring.scorers import Scorer 5 | 6 | SUMMARY = '(CNN)Singer-songwriter David Crosby hit a jogger with his car Sunday evening, a spokesman said. The accident happened in Santa Ynez, California, near where Crosby lives. Crosby was driving at approximately 50 mph when he struck the jogger, according to California Highway Patrol Spokesman Don Clotworthy. The posted speed limit was 55. The jogger suffered multiple fractures, and was airlifted to a hospital in Santa Barbara, Clotworthy said.' 7 | REFERENCE = 'Accident happens in Santa Ynez, California, near where Crosby lives . The jogger suffered multiple fractures; his injuries are not believed to be life-threatening .' 
8 | 9 | QUESTIONS = [ 10 | [ 11 | 'What happens in Santa Ynez, California, near where Crosby lives?', 12 | 'Where in California does accident happen near where Crosby lives?', 13 | 'What did the jogger suffer multiple fractures for that are not believed to be life-threatening?', 14 | ], 15 | [ 16 | 'Who suffered multiple fractures?', 17 | ] 18 | ] 19 | ANSWERS = [ 20 | ['Accident', 'Santa Ynez', 'injuries'], 21 | ['The jogger'], 22 | ] 23 | ANSWER_OFFSETS = [ 24 | [(0, 8), (20, 30), (115, 122)], 25 | [(70, 80)], 26 | ] 27 | PREDICTIONS = [ 28 | ['hit a jogger', 'Santa Ynez', 'hit'], 29 | ['a jogger'], 30 | ] 31 | PREDICTION_OFFSETS = [ 32 | [(36, 48), (121, 131), (36, 39)], 33 | [(40, 48)], 34 | ] 35 | PROBABILITIES = [ 36 | [0.8, 0.6, 0.3], 37 | [0.5] 38 | ] 39 | NULL_PROBABILITIES = [ 40 | [0.3, 0.2, 0.6], 41 | [0.1] 42 | ] 43 | 44 | 45 | class TestScorer(unittest.TestCase): 46 | def assert_expected_output( 47 | self, 48 | scorer: Scorer, 49 | instance_scores: Dict[str, float], 50 | reference_scores_list: List[Dict[str, float]], 51 | question_scores_lists: List[List[Dict[str, float]]], 52 | ) -> None: 53 | for i in range(len(QUESTIONS)): 54 | actual_reference_scores, actual_question_scores_list = scorer.score_single_ref( 55 | SUMMARY, 56 | QUESTIONS[i], 57 | ANSWERS[i], 58 | PREDICTIONS[i], 59 | PROBABILITIES[i], 60 | NULL_PROBABILITIES[i] 61 | ) 62 | for key in scorer.keys(): 63 | self.assertAlmostEqual(reference_scores_list[i][key], actual_reference_scores[key], places=4) 64 | for expected, actual in zip(question_scores_lists[i], actual_question_scores_list): 65 | self.assertAlmostEqual(expected[key], actual[key], places=4) 66 | 67 | actual_instance_scores, actual_question_scores_lists = scorer.score_multi_ref( 68 | SUMMARY, 69 | QUESTIONS, 70 | ANSWERS, 71 | PREDICTIONS, 72 | PROBABILITIES, 73 | NULL_PROBABILITIES 74 | ) 75 | 76 | for key in scorer.keys(): 77 | self.assertAlmostEqual(instance_scores[key], actual_instance_scores[key], places=4) 78 | for 
expected_list, actual_list in zip(question_scores_lists, actual_question_scores_lists): 79 | for expected, actual in zip(expected_list, actual_list): 80 | self.assertAlmostEqual(expected[key], actual[key], places=4) 81 | -------------------------------------------------------------------------------- /qaeval/version.py: -------------------------------------------------------------------------------- 1 | _MAJOR = '0' 2 | _MINOR = '1' 3 | _PATCH = '0' 4 | 5 | VERSION = f'{_MAJOR}.{_MINOR}.{_PATCH}' 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | allennlp==1.1.0 2 | click==7.1.2 3 | edlib 4 | spacy==2.2.4 5 | torch==1.6.0 6 | transformers==3.0.2 7 | urllib3>=1.25.10 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # version.py defines the VERSION variable. 4 | # We use exec here so we don't import qaeval whilst setting up. 5 | VERSION = {} 6 | with open('qaeval/version.py', 'r') as version_file: 7 | exec(version_file.read(), VERSION) 8 | 9 | setuptools.setup( 10 | name='qaeval', 11 | version=VERSION['VERSION'], 12 | author='Daniel Deutsch', 13 | description='A package for evaluating the content of summaries through question-answering', 14 | url='https://github.com/danieldeutsch/qaeval', 15 | packages=setuptools.find_packages(), 16 | python_requires='>=3.6', 17 | install_requires=[ 18 | 'allennlp==1.1.0', 19 | 'click==7.1.2', 20 | 'edlib', 21 | 'spacy==2.2.4', 22 | 'torch==1.6.0', 23 | 'transformers==3.0.2', 24 | 'urllib3>=1.25.10' 25 | ] 26 | ) --------------------------------------------------------------------------------