├── .gitignore
├── ACKNOWLEDGEMENTS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── collection
│   ├── README.md
│   ├── download_commoncrawl_passages.py
│   ├── download_wayback_passages.py
│   └── paragraph_chunker.py
├── dataset
│   └── qrecc_data.zip
├── requirements.txt
└── utils
    ├── evaluate_qa.py
    ├── evaluate_retrieval.py
    └── span_heuristic.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
dataset/__MACOSX/
dataset/qrecc_test.json
dataset/qrecc_train.json
.DS_Store

--------------------------------------------------------------------------------
/ACKNOWLEDGEMENTS:
--------------------------------------------------------------------------------
Acknowledgements

Portions of ml-qrecc may utilize the following copyrighted
material, the use of which is hereby acknowledged.

_____________________

Leonard Richardson and contributors (Beautiful Soup)
The MIT License

Copyright (c) 2004-2020 Leonard Richardson and contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

NLTK Authors (NLTK)
Copyright (C) 2001-2020 NLTK Project

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Pyserini Authors (Pyserini)
Copyright 2019-2020, The Pyserini Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

warcio Authors (warcio)
Copyright 2017-2020, The warcio Authors.
63 | 64 | Licensed under the Apache License, Version 2.0 (the "License"); 65 | you may not use this file except in compliance with the License. 66 | You may obtain a copy of the License at 67 | 68 | http://www.apache.org/licenses/LICENSE-2.0 69 | 70 | Unless required by applicable law or agreed to in writing, software 71 | distributed under the License is distributed on an "AS IS" BASIS, 72 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 73 | See the License for the specific language governing permissions and 74 | limitations under the License. 75 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 
## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4,
available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html)

--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contribution Guide

Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository.

While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged.

## Before you get started

By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE).

We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md).

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------


Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. 
If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. 
Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Open-Domain Question Answering Goes Conversational via Question Rewriting

[**Tasks**](#task-description) | [**Dataset**](#dataset) | [**Evaluation**](#evaluation) |
[**Paper**](https://arxiv.org/abs/2010.04898) |
[**Citation**](#citation) | [**License**](#license)

We introduce QReCC (**Q**uestion **Re**writing in **C**onversational **C**ontext), an end-to-end open-domain question answering dataset comprising 14K conversations with 81K question-answer pairs.
The goal of this dataset is to provide a challenging benchmark for end-to-end conversational question answering that includes the individual subtasks of question rewriting, passage retrieval and reading comprehension.
Please refer to our paper [Open-Domain Question Answering Goes Conversational via Question Rewriting](https://arxiv.org/abs/2010.04898) for details.

## Task Description

The task in QReCC is to find answers to conversational questions within a collection of 10M web pages split into 54M passages.
Answers to questions in the same conversation may be distributed across several web pages.

## Dataset

QReCC contains 14K conversations with 81K question-answer pairs.
We build QReCC on questions from [TREC CAsT](https://github.com/daltonj/treccastweb/tree/master/2019/data), [QuAC](https://quac.ai) and [Google Natural Questions](https://github.com/google-research-datasets/natural-questions).
While the TREC CAsT and QuAC datasets contain multi-turn conversations, Natural Questions is not a conversational dataset.
We used questions in the NQ dataset as prompts to create conversations, explicitly balancing the types of context-dependent questions, such as anaphora (co-references) and ellipsis.

For each query we collect query rewrites by resolving references; the resulting query rewrite is a context-independent version of the original (context-dependent) question.
The rewritten query is then used with a search engine to answer the question. Each query is also annotated with an answer and a link to the web page that was used to produce the answer.

Each turn in the dataset contains a `Conversation_no` (shared by all turns of the conversation), a `Turn_no` (unique within the conversation), the original `Question`, the `Context`, the `Rewrite`, the `Answer` with its `Answer_URL`, and the `Conversation_source`.

```json
{
  "Context": [
    "What are the pros and cons of electric cars?",
    "Some pros are: They're easier on the environment. Electricity is cheaper than gasoline. Maintenance is less frequent and less expensive. They're very quiet. You'll get tax credits. They can shorten your commute time. Some cons are: Most EVs have pretty short ranges. Recharging can take a while."
  ],
  "Question": "Tell me more about Tesla",
  "Rewrite": "Tell me more about Tesla the car company.",
  "Answer": "Tesla Inc. is an American automotive and energy company based in Palo Alto, California. The company specializes in electric car manufacturing and, through its SolarCity subsidiary, solar panel manufacturing.",
  "Answer_URL": "https://en.wikipedia.org/wiki/Tesla,_Inc.",
  "Conversation_no": 74,
  "Turn_no": 2,
  "Conversation_source": "trec"
}
```

## Evaluation

### Evaluate performance on Retrieval Question Answering task

To evaluate retrieval QA, use [evaluate_retrieval.py](https://github.com/apple/ml-qrecc/blob/main/utils/evaluate_retrieval.py)

### Evaluate performance on Extractive Question Answering task

To evaluate extractive QA, use [evaluate_qa.py](https://github.com/apple/ml-qrecc/blob/main/utils/evaluate_qa.py)

A minimal usage sketch of the underlying QA metrics is included at the end of this README.

## Citation

Please cite the following if you found the QReCC dataset, our [paper](https://arxiv.org/abs/2010.04898), or these resources useful.

```bibtex
@article{qrecc,
  title={Open-Domain Question Answering Goes Conversational via Question Rewriting},
  author={Anantha, Raviteja and Vakulenko, Svitlana and Tu, Zhucheng and Longpre, Shayne and Pulman, Stephen and Chappidi, Srinivas},
  journal={Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  year={2021}
}
```

## License

The code in this repository is licensed according to the [LICENSE](LICENSE) file.

The QReCC dataset is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License.
To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/.

## Contact Us

To contact us, feel free to email the authors of the paper or create an issue in this repository.
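As referenced in the Evaluation section, the snippet below is a minimal sketch of how the token-level QA metrics in `utils/evaluate_qa.py` can be called directly. The gold/predicted strings are made-up examples, and the sketch assumes the `utils` directory is on your `PYTHONPATH` (for instance, by running it from within `utils`).

```python
# Minimal sketch (not part of the official evaluation pipeline): computing the
# SQuAD-style Exact Match and token-level F1 metrics from utils/evaluate_qa.py.
# Assumes the utils/ directory is on PYTHONPATH (e.g. run from within utils/).
from evaluate_qa import compute_exact, compute_f1

gold = "Tesla Inc. is an American automotive and energy company based in Palo Alto, California."
pred = "Tesla is an American automotive and energy company."

print(compute_exact(gold, pred))         # 0 -- the normalized strings are not identical
print(round(compute_f1(gold, pred), 2))  # token-overlap F1 between prediction and gold answer
```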
--------------------------------------------------------------------------------
/collection/README.md:
--------------------------------------------------------------------------------
# Building the Collection

This directory contains the scripts and instructions for downloading, processing, and building the collection for our baseline.

The collection consists of webpages from the Common Crawl and the Wayback Machine.
**Please note that the Common Crawl collection is [quite large](https://commoncrawl.org/2019/11/november-2019-crawl-archive-now-available/) (on the order of tens of TBs), so please check the data cap of your internet plan to make sure you stay within the limit.**

To download pages from the Common Crawl, run the following command.
This saves webpage documents in [JSON lines](https://jsonlines.org) (.jsonl) format into the `collection/commoncrawl` subdirectory.
For us this took slightly over a day to run.

```bash
time python download_commoncrawl_passages.py --output-directory collection/commoncrawl --workers 8
```

To download pages from the Wayback Machine, run the following command after you've extracted the dataset.
This saves webpage documents in .jsonl format into the `collection/wayback` subdirectory.
For us this took 9 hours to run.

```bash
time python download_wayback_passages.py --inputs '../dataset/*.json' --output-directory collection/wayback --workers 4
```

Next, we segmented the webpage documents into smaller passages.
This step is quick and took several minutes for us.

```bash
time python paragraph_chunker.py --input-directory collection --output-directory collection-paragraph --workers 8
```

Finally, we indexed the passages using [Pyserini](https://github.com/castorini/pyserini/), a Python wrapper around [Anserini](http://anserini.io/), an information retrieval toolkit built on Lucene.
Java (JDK) is needed as a prerequisite.
After installing Pyserini, we used the following command to build the index.
For us this took less than 2 hours.

```bash
time python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
    -threads 76 -input collection-paragraph \
    -index index-paragraph -storePositions -storeDocvectors -storeRaw
```

--------------------------------------------------------------------------------
/collection/download_commoncrawl_passages.py:
--------------------------------------------------------------------------------
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2020 Apple Inc. All Rights Reserved.
#

"""
This script creates a corpus of documents from the November 2019 Common Crawl archive.
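Example invocation (the command given in collection/README.md):

    python download_commoncrawl_passages.py --output-directory collection/commoncrawl --workers 8

Intermediate index files, filter lists, and downloaded WET files are cached in the
working directory and removed at the end of a successful run.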
8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from collections import defaultdict 12 | import gzip 13 | import json 14 | import logging 15 | from multiprocessing import Pool 16 | import os 17 | from pathlib import Path 18 | import re 19 | import shutil 20 | import tempfile 21 | from typing import Dict, List, Tuple 22 | import urllib.request 23 | 24 | from warcio.archiveiterator import ArchiveIterator 25 | 26 | index_files_root = Path('index-files') 27 | filter_lists_root = Path('filter-lists') 28 | sampled_filter_lists_root = Path('filter-lists-sampled') 29 | wet_files_cache = Path('wet-files') 30 | 31 | 32 | def get_cc_index_paths() -> List[str]: 33 | """Get a list of paths for Common Crawl URL index files.""" 34 | index_paths = [] 35 | with tempfile.NamedTemporaryFile() as temp_f: 36 | urllib.request.urlretrieve( 37 | 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2019-47/cc-index.paths.gz', 38 | temp_f.name, 39 | ) 40 | with gzip.open(temp_f.name, 'rb') as f: 41 | for line in f: 42 | line = line.decode('utf-8').rstrip() 43 | if line.endswith('.gz'): 44 | index_paths.append(f'https://data.commoncrawl.org/{line}') 45 | 46 | return index_paths 47 | 48 | 49 | def get_cc_wet_paths() -> Dict[str, str]: 50 | """Get a dict of WET file name to WET URL.""" 51 | wet_urls = {} 52 | with tempfile.NamedTemporaryFile() as temp_f: 53 | urllib.request.urlretrieve( 54 | 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2019-47/wet.paths.gz', 55 | temp_f.name, 56 | ) 57 | with gzip.open(temp_f.name, 'rb') as f: 58 | for line in f: 59 | line = line.decode('utf-8').rstrip() 60 | filename = line.split('/')[-1] 61 | wet_urls[filename] = f'https://data.commoncrawl.org/{line}' 62 | 63 | return wet_urls 64 | 65 | 66 | def process_cc_index(index_url: str) -> Dict[str, List[str]]: 67 | """Return a map of WET file to list of URLs it contains.""" 68 | # Download index file 69 | filename = index_url.split('/')[-1] 70 | index_files_root.mkdir(exist_ok=True) 71 | if not (index_files_root / filename).exists(): 72 | urllib.request.urlretrieve(index_url, index_files_root / filename) 73 | 74 | # Parse index file 75 | wet_to_urls = defaultdict(list) 76 | cc_index_line_pattern = re.compile(r'^[\S]+ \d+ (.*)$') 77 | with gzip.open(index_files_root / filename, 'rb') as f: 78 | for line in f: 79 | line = line.decode('utf-8').rstrip() 80 | match = cc_index_line_pattern.match(line) 81 | if match: 82 | url_metadata = json.loads(match.group(1)) 83 | if ( 84 | url_metadata['status'] == '200' 85 | and url_metadata.get('languages') == 'eng' 86 | and url_metadata['mime'] == 'text/html' 87 | ): 88 | wet_filename = url_metadata['filename'].split('/')[-1] 89 | wet_to_urls[wet_filename].append(url_metadata['url']) 90 | else: 91 | logging.error(f'Line in index file cannot be matched by regex: {line}') 92 | 93 | return wet_to_urls 94 | 95 | 96 | def sort_and_sample_filter_list(filter_list_path: Path) -> None: 97 | """Sort and sample URLs in a filter list.""" 98 | urls = [] 99 | with open(filter_list_path) as f: 100 | for line in f: 101 | urls.append(line.rstrip()) 102 | 103 | urls.sort() 104 | 105 | with open(sampled_filter_lists_root / filter_list_path.name, 'w') as f: 106 | for i, url in enumerate(urls): 107 | if i % 100 == 0: 108 | f.write(url + '\n') 109 | 110 | 111 | def sample_filter_lists() -> None: 112 | """Sample filter lists.""" 113 | filter_lists = list(filter_lists_root.iterdir()) 114 | sampled_filter_lists_root.mkdir(exist_ok=True) 115 | 116 | with Pool() as p: 117 | p.map(sort_and_sample_filter_list, filter_lists) 
118 | 119 | 120 | def process_wet_file(tup: Tuple[Path, str, str, Path],) -> None: 121 | """Download WET file and extract webpages from WARC whose URL is in the filter list.""" 122 | filter_list, wet_name, wet_url, commoncrawl_docs_root = tup 123 | accepted_urls = set() 124 | with open(filter_list) as f: 125 | for line in f: 126 | accepted_urls.add(line.rstrip()) 127 | 128 | attempt = 0 129 | while attempt < 3: 130 | try: 131 | urllib.request.urlretrieve(wet_url, wet_files_cache / wet_name) 132 | break 133 | except Exception: 134 | logging.exception(f'Error while downloading {wet_url}') 135 | attempt += 1 136 | 137 | if not (wet_files_cache / wet_name).exists(): 138 | logging.error( 139 | f'Failed to download {wet_url} after 3 attempts. Ignoring file...' 140 | ) 141 | return 142 | 143 | with gzip.open(wet_files_cache / wet_name, 'rb') as stream, open( 144 | commoncrawl_docs_root / f'{wet_name}.jsonl', 'w' 145 | ) as f: 146 | for record in ArchiveIterator(stream): 147 | if record.rec_type == 'conversion': 148 | url = record.rec_headers.get_header('WARC-Target-URI') 149 | if url not in accepted_urls: 150 | continue 151 | 152 | contents = record.content_stream().read().decode('utf-8') 153 | if contents.startswith('404 Not Found'): 154 | continue 155 | 156 | output_dict = {'id': url, 'contents': contents} 157 | 158 | f.write(json.dumps(output_dict) + '\n') 159 | 160 | os.remove(wet_files_cache / wet_name) 161 | 162 | 163 | def get_docs_from_wet_files(parallelism, commoncrawl_docs_root: Path) -> None: 164 | """Download WET files and extract webpages whose URLs is in the filter list.""" 165 | wet_files_cache.mkdir(exist_ok=True) 166 | commoncrawl_docs_root.mkdir(exist_ok=True, parents=True) 167 | 168 | filter_lists = list(sampled_filter_lists_root.iterdir()) 169 | 170 | # Download WET file paths 171 | wet_paths = get_cc_wet_paths() 172 | wet_names = [] 173 | resolved_wet_paths = [] 174 | for filter_list in filter_lists: 175 | wet_filename = str(filter_list.name).replace('.warc.gz.txt', '.warc.wet.gz') 176 | wet_names.append(wet_filename) 177 | resolved_wet_paths.append(wet_paths[wet_filename]) 178 | 179 | with Pool(parallelism) as p: 180 | for i, _ in enumerate( 181 | p.imap_unordered( 182 | process_wet_file, 183 | zip( 184 | filter_lists, 185 | wet_names, 186 | resolved_wet_paths, 187 | [commoncrawl_docs_root for _ in range(len(filter_lists))], 188 | ), 189 | ) 190 | ): 191 | if (i + 1) % 50 == 0: 192 | logging.info(f'Processed {i + 1} / {len(filter_lists)} WET files...') 193 | 194 | 195 | def main(parallelism: int, commoncrawl_docs_root: Path): 196 | cc_index_paths = get_cc_index_paths() 197 | 198 | # Construct filter lists 199 | if filter_lists_root.exists(): 200 | shutil.rmtree(filter_lists_root) 201 | filter_lists_root.mkdir(exist_ok=True) 202 | 203 | for i in range(0, len(cc_index_paths), parallelism): 204 | with Pool(parallelism) as p: 205 | logging.info( 206 | f'Processing Common Crawl index {i+1}-{min(i + parallelism, len(cc_index_paths))} / {len(cc_index_paths)}...' 
207 | ) 208 | partial_filter_lists = p.map( 209 | process_cc_index, cc_index_paths[i : i + parallelism] 210 | ) 211 | for partial_filter_list in partial_filter_lists: 212 | for wet_filename, urls in partial_filter_list.items(): 213 | with open(filter_lists_root / f'{wet_filename}.txt', 'a') as f: 214 | for url in urls: 215 | f.writelines(url + '\n') 216 | 217 | # Create sampled filter lists 218 | logging.info('Sorting and sampling filter lists...') 219 | sample_filter_lists() 220 | 221 | # Download WET files and filter records 222 | logging.info('Processing WET files...') 223 | get_docs_from_wet_files(parallelism, commoncrawl_docs_root) 224 | 225 | # Remove temporary files 226 | logging.info('Done processing WET files, removing temporary directories...') 227 | shutil.rmtree(index_files_root) 228 | shutil.rmtree(filter_lists_root) 229 | shutil.rmtree(sampled_filter_lists_root) 230 | shutil.rmtree(wet_files_cache) 231 | 232 | 233 | if __name__ == '__main__': 234 | parser = ArgumentParser( 235 | description='Creates a corpus of documents from the November 2019 Common Crawl archive' 236 | ) 237 | parser.add_argument( 238 | '--output-directory', 239 | default='docs/common-crawl', 240 | help='Path to directory containing document output, defaults to docs/common-crawl', 241 | ) 242 | parser.add_argument( 243 | '--workers', 244 | default=8, 245 | type=int, 246 | help='Number of workers for downloading in parallel', 247 | ) 248 | args = parser.parse_args() 249 | 250 | logging.basicConfig(level=logging.INFO) 251 | 252 | commoncrawl_docs_root = Path(args.output_directory) 253 | 254 | main(args.workers, commoncrawl_docs_root) 255 | -------------------------------------------------------------------------------- /collection/download_wayback_passages.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2020 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | """ 7 | This script downloads webpages in the conversation contexts from the Wayback Machine. 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | import glob 12 | import json 13 | import logging 14 | import multiprocessing 15 | from pathlib import Path 16 | import random 17 | import re 18 | import requests 19 | from requests.exceptions import HTTPError 20 | import shutil 21 | import time 22 | import urllib.parse 23 | import uuid 24 | 25 | from bs4 import BeautifulSoup 26 | import pandas as pd 27 | 28 | 29 | wayback_prefix = re.compile(r'^https:\/\/web\.archive\.org\/web') 30 | replace_pattern = re.compile(r'(web\.archive\.org\/web\/\d+)') 31 | blacklist = [ 32 | '[document]', 33 | 'noscript', 34 | 'header', 35 | 'html', 36 | 'meta', 37 | 'head', 38 | 'input', 39 | 'script', 40 | 'style', 41 | # there may be more elements we don't want 42 | ] 43 | 44 | 45 | def download_with_retry(url: str, max_retries: int = 10) -> requests.Response: 46 | """Download a URL with exponential backoff, until max_retries is reached.""" 47 | retry_num = 0 48 | while True: 49 | try: 50 | response = requests.get(url) 51 | response.raise_for_status() 52 | return response 53 | except HTTPError as e: 54 | status_code = e.response.status_code 55 | if not (status_code == 429 or status_code >= 500): 56 | # This is not an error we should retry on 57 | raise e 58 | 59 | if retry_num > max_retries: 60 | logging.error( 61 | f'Failed to perform GET request on {url} after {max_retries} retries.' 
62 | ) 63 | raise e 64 | 65 | if status_code == 429: 66 | time.sleep(5 + 2 ** retry_num + random.randint(0, 1000) / 1000) 67 | else: 68 | time.sleep(2 ** retry_num + random.randint(0, 1000) / 1000) 69 | retry_num += 1 70 | 71 | 72 | def extract_text(html_text: str) -> str: 73 | """Extracts text from an HTML document.""" 74 | soup = BeautifulSoup(html_text, 'html.parser') 75 | text = soup.find_all(text=True) 76 | output = '' 77 | for t in text: 78 | if t.parent.name not in blacklist: 79 | output += f'{t} ' 80 | 81 | return output 82 | 83 | 84 | def download_link(tup): 85 | link = tup[0] 86 | output_path = tup[1] 87 | num_workers = tup[2] 88 | page_id = str(uuid.uuid4()) 89 | url_no_header = None 90 | 91 | try: 92 | # Find the Wayback Machine link 93 | if not wayback_prefix.match(link): 94 | link_encoded = urllib.parse.quote(link) 95 | 96 | available, availability_attempt = False, 0 97 | # Sometimes the API returns HTTP success code 200, but archived snapshots shows page is unavailable 98 | # when it actually is. Give it a total of three tries. 99 | while not available and availability_attempt < 3: 100 | response = download_with_retry( 101 | f'http://archive.org/wayback/available?url={link_encoded}×tamp=20191127' 102 | ) 103 | json_response = response.json() 104 | available = 'closest' in json_response['archived_snapshots'] 105 | availability_attempt += 1 106 | 107 | if not available: 108 | logging.warning( 109 | f'Not available on Wayback Machine: {link}, HTTP code {response.status_code}, {json_response}' 110 | ) 111 | return {'link': link, 'page_id': page_id, 'available': False} 112 | 113 | url = json_response['archived_snapshots']['closest']['url'] 114 | else: 115 | url = link 116 | 117 | match = replace_pattern.search(url) 118 | assert match 119 | url_no_header = replace_pattern.sub(f'{match.group(1)}id_', url) 120 | 121 | response = download_with_retry(url_no_header) 122 | html_page = response.text 123 | parsed_text = extract_text(html_page) 124 | 125 | proc = multiprocessing.current_process() 126 | pid_mod = str(proc.pid % num_workers) 127 | 128 | (output_path / pid_mod).mkdir(parents=True, exist_ok=True) 129 | 130 | with open(output_path / pid_mod / page_id, 'w') as f: 131 | doc = { 132 | 'id': url_no_header, 133 | 'contents': parsed_text, 134 | } 135 | f.write(json.dumps(doc) + '\n') 136 | 137 | return { 138 | 'link': link, 139 | 'page_id': page_id, 140 | 'available': True, 141 | 'status_code': response.status_code, 142 | 'wayback_url': url_no_header, 143 | } 144 | except HTTPError as http_err: 145 | logging.warning(f'HTTP error occurred: {http_err} for {link}') 146 | return { 147 | 'link': link, 148 | 'page_id': page_id, 149 | 'available': False, 150 | 'status_code': http_err.response.status_code if http_err.response else None, 151 | 'wayback_url': url_no_header, 152 | } 153 | except UnicodeDecodeError as e: 154 | logging.warning(f'Unicode decode error occurred: {e} for {link}') 155 | return { 156 | 'link': link, 157 | 'page_id': page_id, 158 | 'available': False, 159 | 'status_code': response.status_code, 160 | 'wayback_url': url_no_header, 161 | } 162 | except Exception as e: 163 | logging.warning(f'Exception occurred: {e} for {link}') 164 | return { 165 | 'link': link, 166 | 'page_id': page_id, 167 | 'available': False, 168 | 'status_code': None, 169 | 'wayback_url': url_no_header, 170 | } 171 | 172 | 173 | def crawl_wayback_machine( 174 | inputs_globbing_pattern: str, output_dir: str, num_workers: int 175 | ) -> None: 176 | links = set() 177 | for dataset in 
glob.glob(inputs_globbing_pattern): 178 | with open(dataset) as f: 179 | data = json.load(f) 180 | for conversation_turn in data: 181 | if conversation_turn['Answer_URL'] == '': 182 | continue 183 | 184 | for url in conversation_turn['Answer_URL'].split(' '): 185 | if url.endswith('.pdf'): 186 | continue 187 | 188 | anchor_sign_pos = url.find('#') 189 | if anchor_sign_pos != -1: 190 | url = url.split('#')[0] 191 | 192 | links.add(url) 193 | links = list(links) 194 | 195 | output_path = Path(output_dir) 196 | output_path.mkdir(parents=True, exist_ok=True) 197 | 198 | records = [] 199 | with multiprocessing.Pool(num_workers) as p: 200 | for i, result in enumerate( 201 | p.imap_unordered( 202 | download_link, 203 | [(l, output_path, num_workers) for l in links], 204 | chunksize=16, 205 | ) 206 | ): 207 | records.append(result) 208 | if (i + 1) % 10 == 0: 209 | logging.info(f'Processed {i + 1} / {len(links)} links...') 210 | 211 | # Combine small files together into larger files 212 | for worker_output_dir in output_path.iterdir(): 213 | if worker_output_dir.is_dir(): 214 | with open(output_path / f'{worker_output_dir.name}.jsonl', 'w') as outfile: 215 | for single_doc_file in worker_output_dir.iterdir(): 216 | with open(single_doc_file) as infile: 217 | outfile.write(infile.read()) 218 | 219 | shutil.rmtree(worker_output_dir) 220 | 221 | df = pd.DataFrame.from_records(records) 222 | df.to_csv(output_path / 'summary.tsv', index=False, sep='\t') 223 | 224 | 225 | if __name__ == '__main__': 226 | parser = ArgumentParser(description='Crawl pages from Wayback Machine') 227 | parser.add_argument( 228 | '--inputs', required=True, help='Globbing pattern for train and test JSON files' 229 | ) 230 | parser.add_argument( 231 | '--output-directory', 232 | required=True, 233 | help='Path to directory containing crawled output', 234 | ) 235 | parser.add_argument( 236 | '--workers', 237 | default=4, 238 | type=int, 239 | help='Number of workers for downloading in parallel', 240 | ) 241 | args = parser.parse_args() 242 | 243 | logging.basicConfig(level=logging.INFO) 244 | crawl_wayback_machine(args.inputs, args.output_directory, args.workers) 245 | -------------------------------------------------------------------------------- /collection/paragraph_chunker.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2020 Apple Inc. All Rights Reserved. 
4 | # 5 | 6 | """For a directory of nested JSON lines files, where each line is a document, chunk each document into many passages.""" 7 | 8 | from argparse import ArgumentParser 9 | import json 10 | import logging 11 | import multiprocessing 12 | from pathlib import Path 13 | from typing import List, Tuple 14 | 15 | MIN_PASSAGE_TOKENS = 220 16 | 17 | 18 | def chunk_doc(content: str) -> List[str]: 19 | """Given a document, return a list of passages of no fewer than MIN_PASSAGE_TOKENS tokens / passage until EOF.""" 20 | passages = [] 21 | passage_tokens = [] 22 | lines = content.split('\n') 23 | for line in lines: 24 | line = line.rstrip() 25 | 26 | if '===' in line: 27 | continue 28 | if len(line) == 0: 29 | continue 30 | 31 | tokens = line.split() 32 | passage_tokens.extend(tokens) 33 | 34 | if len(passage_tokens) > MIN_PASSAGE_TOKENS: 35 | passages.append(' '.join(passage_tokens)) 36 | passage_tokens = [] 37 | 38 | passages.append(' '.join(passage_tokens)) 39 | return passages 40 | 41 | 42 | def process_file(tup: Tuple[str, str, Path]) -> None: 43 | """Chunk all documents in a single file.""" 44 | input_directory, output_directory, input_file = tup 45 | output_file = str(input_file).replace(input_directory, output_directory) 46 | output_path = Path(output_file) 47 | output_path.parent.mkdir(parents=True, exist_ok=True) 48 | 49 | with open(input_file) as f1, open(output_path, 'w') as f2: 50 | for jsonl in f1: 51 | doc = json.loads(jsonl) 52 | passages = chunk_doc(doc['contents']) 53 | 54 | for i, passage in enumerate(passages): 55 | paragraph = {'id': f"{doc['id']}_p{i}", 'contents': passage} 56 | 57 | f2.write(json.dumps(paragraph) + '\n') 58 | 59 | 60 | def chunk_documents(input_directory: str, output_directory: str, workers: int) -> None: 61 | """Iterate .jsonl files in input_directory and output .jsonl files in output_directory where each doc is chunked.""" 62 | input_directory_path = Path(input_directory) 63 | 64 | jsonl_files = list(input_directory_path.glob('**/*.jsonl')) 65 | 66 | with multiprocessing.Pool(workers) as p: 67 | for i, _ in enumerate( 68 | p.imap_unordered( 69 | process_file, 70 | [(input_directory, output_directory, f) for f in jsonl_files], 71 | chunksize=16, 72 | ) 73 | ): 74 | if (i + 1) % 100 == 0: 75 | logging.info(f'Processed {i + 1} / {len(jsonl_files)} files...') 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = ArgumentParser( 80 | description='Chunk documents in .jsonl files into many passages.' 
81 | ) 82 | parser.add_argument( 83 | '--input-directory', 84 | required=True, 85 | help='Directory containing .jsonl files to chunk', 86 | ) 87 | parser.add_argument( 88 | '--output-directory', 89 | required=True, 90 | help='Directory to store .jsonl files containing document passages', 91 | ) 92 | parser.add_argument( 93 | '--workers', 94 | default=8, 95 | type=int, 96 | help='Number of workers for downloading in parallel', 97 | ) 98 | args = parser.parse_args() 99 | 100 | logging.basicConfig(level=logging.INFO) 101 | 102 | chunk_documents(args.input_directory, args.output_directory, args.workers) 103 | -------------------------------------------------------------------------------- /dataset/qrecc_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-qrecc/cf44d03cb6676f7414471cec509d4a6c6858b0d3/dataset/qrecc_data.zip -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.9.3 2 | nltk==3.5 3 | pyserini==0.10.0.1 4 | warcio==1.7.4 5 | -------------------------------------------------------------------------------- /utils/evaluate_qa.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2020 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | """ 7 | Functions for computing QA evaluation metrics. 8 | 9 | We adapt the functions from the official SQuAD (Rajpurkar et al. '18) evaluation script: 10 | https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ 11 | """ 12 | 13 | import collections 14 | import re 15 | import string 16 | from typing import List 17 | 18 | 19 | def normalize_answer(s: str) -> str: 20 | """Lower text and remove punctuation, articles and extra whitespace.""" 21 | 22 | def remove_articles(text): 23 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) 24 | return re.sub(regex, ' ', text) 25 | 26 | def white_space_fix(text): 27 | return ' '.join(text.split()) 28 | 29 | def remove_punc(text): 30 | exclude = set(string.punctuation) 31 | return ''.join(ch for ch in text if ch not in exclude) 32 | 33 | def lower(text): 34 | return text.lower() 35 | 36 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 37 | 38 | 39 | def get_tokens(s: str) -> List[str]: 40 | """Normalize string and split string into tokens.""" 41 | if not s: 42 | return [] 43 | return normalize_answer(s).split() 44 | 45 | 46 | def compute_exact(a_gold: str, a_pred: str) -> int: 47 | """Compute the Exact Match score.""" 48 | return int(normalize_answer(a_gold) == normalize_answer(a_pred)) 49 | 50 | 51 | def compute_f1_from_tokens(gold_toks: List[str], pred_toks: List[str]) -> float: 52 | """Compute the F1 score from tokenized gold answer and prediction.""" 53 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks) 54 | num_same = sum(common.values()) 55 | 56 | if len(gold_toks) == 0 or len(pred_toks) == 0: 57 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise 58 | return int(gold_toks == pred_toks) 59 | 60 | if num_same == 0: 61 | return 0 62 | 63 | precision = 1.0 * num_same / len(pred_toks) 64 | recall = 1.0 * num_same / len(gold_toks) 65 | f1 = (2 * precision * recall) / (precision + recall) 66 | return f1 67 | 68 | 69 | def compute_f1(a_gold: str, a_pred: str) -> float: 70 | """Compute the F1 
score.""" 71 | gold_toks = get_tokens(a_gold) 72 | pred_toks = get_tokens(a_pred) 73 | return compute_f1_from_tokens(gold_toks, pred_toks) 74 | -------------------------------------------------------------------------------- /utils/evaluate_retrieval.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2021 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | from argparse import ArgumentParser 7 | import json 8 | import logging 9 | from multiprocessing import Pool 10 | from pathlib import Path 11 | from typing import Dict, List, Tuple 12 | 13 | from evaluate_qa import compute_exact, compute_f1 14 | from span_heuristic import find_closest_span_match 15 | 16 | """ 17 | Functions for evaluating passage retrieval. 18 | 19 | This is used to compute MRR (mean reciprocal rank), Recall@10, and Recall@100 in Table 5 of the paper. 20 | """ 21 | 22 | 23 | RELEVANCE_THRESHOLD = 0.8 24 | 25 | 26 | def compute_f1_for_retrieved_passage(line: str) -> dict: 27 | """ 28 | Given a serialized JSON line, with fields 'content' and 'answer', find the closest span matching answer, 29 | update the deserialized dict with the span and F1 score, and return the dict. 30 | """ 31 | data = json.loads(line) 32 | content, answer = data['content'], data['answer'] 33 | 34 | # If there is no answer, although the closest extractive answer is '', in the MRR and recall@k functions below 35 | # we do not count any passage for these questions as relevant. 36 | if len(answer) < 1: 37 | data['heuristic_answer'] = '' 38 | data['f1'] = compute_f1(answer, '') 39 | return data 40 | 41 | best_span, best_f1 = find_closest_span_match(content, answer) 42 | 43 | data['heuristic_answer'] = best_span 44 | data['f1'] = best_f1 45 | 46 | return data 47 | 48 | 49 | def compute_mean_reciprocal_rank( 50 | question_id_to_docs: Dict[str, List[dict]], relevance_threshold: float 51 | ) -> float: 52 | """Given a dictionary mapping a question id to a list of docs, find the mean reciprocal rank.""" 53 | recip_rank_sum = 0 54 | for qid, docs in question_id_to_docs.items(): 55 | top_rank = float('inf') 56 | for doc in docs: 57 | if len(doc['answer']) > 0 and doc['f1'] >= relevance_threshold: 58 | top_rank = min(top_rank, doc['rank']) 59 | 60 | recip_rank = 1 / top_rank if top_rank != float('inf') else 0 61 | recip_rank_sum += recip_rank 62 | 63 | return recip_rank_sum / len(question_id_to_docs) 64 | 65 | 66 | def compute_recall_at_k( 67 | question_id_to_docs: Dict[str, List[dict]], k: int, relevance_threshold: float 68 | ) -> float: 69 | """ 70 | Given a dictionary mapping a question id to a list of docs, find the recall@k. 71 | 72 | We define recall@k = 1.0 if any document in the top-k is relevant, and 0 otherwise. 
73 | """ 74 | relevant_doc_found_total = 0 75 | for qid, docs in question_id_to_docs.items(): 76 | relevant_doc_found = 0 77 | for doc in docs: 78 | if len(doc['answer']) > 0 and doc['f1'] >= relevance_threshold and doc['rank'] <= k: 79 | relevant_doc_found = 1 80 | break 81 | 82 | relevant_doc_found_total += relevant_doc_found 83 | 84 | return relevant_doc_found_total / len(question_id_to_docs) 85 | 86 | 87 | def compute_extractive_upper_bounds( 88 | question_id_to_docs: Dict[str, List[dict]], temp_files_directory: Path 89 | ) -> Tuple[float, float]: 90 | """Given a dictionary mapping a question id to a list of docs, find the extractive upper bounds of (EM, F1).""" 91 | total_em, total_f1 = 0, 0.0 92 | with open(temp_files_directory / 'retrieved-passages-relevant-f1.jsonl', 'w') as outfile: 93 | for qid, docs in question_id_to_docs.items(): 94 | best_em, best_f1 = 0, 0.0 95 | best_doc = docs[0] 96 | for doc in docs: 97 | em = compute_exact(doc['answer'], doc['heuristic_answer']) 98 | f1 = compute_f1(doc['answer'], doc['heuristic_answer']) 99 | if f1 > best_f1: 100 | best_doc = doc 101 | best_em = max(best_em, em) 102 | best_f1 = max(best_f1, f1) 103 | if best_em == 1 and best_f1 == 1.0: 104 | break 105 | 106 | total_em += best_em 107 | total_f1 += best_f1 108 | 109 | outfile.write(json.dumps(best_doc) + '\n') 110 | 111 | return ( 112 | total_em / len(question_id_to_docs), 113 | total_f1 / len(question_id_to_docs), 114 | ) 115 | 116 | 117 | def get_unique_relevant_docs_count( 118 | question_id_to_docs: Dict[str, List[dict]], relevance_threshold: float 119 | ) -> float: 120 | """Given a dictionary mapping a question id to a list of docs, find the number of unique relevant docs.""" 121 | unique_relevant_docs = set() 122 | for qid, docs in question_id_to_docs.items(): 123 | for doc in docs: 124 | if len(doc['answer']) > 0 and doc['f1'] >= relevance_threshold: 125 | unique_relevant_docs.add(doc['docid']) 126 | 127 | return len(unique_relevant_docs) 128 | 129 | 130 | def get_average_relevant_docs_per_question( 131 | question_id_to_docs: Dict[str, List[dict]], relevance_threshold: float 132 | ) -> float: 133 | """Given a dictionary mapping a question id to a list of docs, find the average number of relevant docs per question.""" 134 | relevant_docs = 0 135 | for qid, docs in question_id_to_docs.items(): 136 | for doc in docs: 137 | if len(doc['answer']) > 0 and doc['f1'] >= relevance_threshold: 138 | relevant_docs += 1 139 | 140 | return relevant_docs / len(question_id_to_docs) 141 | 142 | 143 | def main(retrieved_passages_pattern: str, temp_files_directory: str, workers: int): 144 | retrieved_passages_files = Path().glob(retrieved_passages_pattern) 145 | temp_files_directory = Path(temp_files_directory) 146 | temp_files_directory.mkdir(exist_ok=True, parents=True) 147 | 148 | question_id_to_docs = {} 149 | 150 | for retrieved_passages_file in retrieved_passages_files: 151 | with open(retrieved_passages_file) as infile: 152 | with Pool(workers) as p: 153 | for i, passage_results in enumerate( 154 | p.imap(compute_f1_for_retrieved_passage, infile) 155 | ): 156 | if (i + 1) % 5000 == 0: 157 | logging.info( 158 | f'Processing {retrieved_passages_file.name}, {i + 1} lines done...' 
159 | ) 160 | 161 | qid = f"{passage_results['Conversation-ID']}_{passage_results['Turn-ID']}" 162 | if qid not in question_id_to_docs: 163 | question_id_to_docs[qid] = [] 164 | 165 | question_id_to_docs[qid].append( 166 | { 167 | 'Conversation-ID': passage_results['Conversation-ID'], 168 | 'Turn-ID': passage_results['Turn-ID'], 169 | 'docid': passage_results['docid'], 170 | 'content': passage_results['content'], 171 | 'rank': passage_results['rank'], 172 | 'answer': passage_results['answer'], 173 | 'heuristic_answer': passage_results['heuristic_answer'], 174 | 'f1': passage_results['f1'], 175 | } 176 | ) 177 | 178 | print('Final metrics:') 179 | unique_relevant_docs = get_unique_relevant_docs_count(question_id_to_docs, RELEVANCE_THRESHOLD) 180 | unique_docs_perfect_f1 = get_unique_relevant_docs_count(question_id_to_docs, 1.0) 181 | avg_relevant_docs_per_question = get_average_relevant_docs_per_question( 182 | question_id_to_docs, 1.0 183 | ) 184 | 185 | print(f'Total number of unique queries: {len(question_id_to_docs)}') 186 | print(f'Total number of unique relevant docs: {unique_relevant_docs}') 187 | print(f'Total number of unique docs with F1=1.0: {unique_docs_perfect_f1}') 188 | print(f'Average number of relevant docs per query: {avg_relevant_docs_per_question}') 189 | 190 | mrr = compute_mean_reciprocal_rank(question_id_to_docs, RELEVANCE_THRESHOLD) 191 | recall_at_10 = compute_recall_at_k(question_id_to_docs, 10, RELEVANCE_THRESHOLD) 192 | recall_at_100 = compute_recall_at_k(question_id_to_docs, 100, RELEVANCE_THRESHOLD) 193 | print(f'Mean Reciprocal Rank (MRR): {mrr:.4f}') 194 | print(f'Recall@10: {recall_at_10 * 100:.2f}%') 195 | print(f'Recall@100: {recall_at_100 * 100:.2f}%') 196 | 197 | em_upper_bound, f1_upper_bound = compute_extractive_upper_bounds( 198 | question_id_to_docs, temp_files_directory 199 | ) 200 | print(f'Extractive Upper Bound for EM (100 point scale): {em_upper_bound * 100:.2f}') 201 | print(f'Extractive Upper Bound for F1 (100 point scale): {f1_upper_bound * 100:.2f}') 202 | 203 | 204 | if __name__ == '__main__': 205 | parser = ArgumentParser(description='Passage retrieval evaluation') 206 | parser.add_argument( 207 | '--retrieved-passages-pattern', 208 | required=True, 209 | help="""A globbing pattern to select .jsonl files containing retrieved passages. 210 | Each json line should contain the fields 'Conversation-ID', 'Turn-ID', 'docid', 'content', 'answer', 'rank'. 211 | 'answer' is the gold answer given in the QReCC dataset and rank is the rank of the document starting from 1.""", 212 | ) 213 | parser.add_argument( 214 | '--temp-files-directory', 215 | default='/tmp/qrecc-retrieval-eval', 216 | help='Directory to store temporary files containing F1 scores, which can be used for debugging and analysis', 217 | ) 218 | parser.add_argument( 219 | '--workers', default=8, type=int, help='Number of workers for parallel processing', 220 | ) 221 | args = parser.parse_args() 222 | 223 | logging.basicConfig(level=logging.INFO) 224 | 225 | main(args.retrieved_passages_pattern, args.temp_files_directory, args.workers) 226 | -------------------------------------------------------------------------------- /utils/span_heuristic.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2020 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | """ 7 | Heuristic for finding a span in some passage that's close to the golden span. 
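The heuristic works in two stages: first, a sliding window of n-grams over the passage is
compared against the gold answer with difflib's SequenceMatcher to find an approximately
matching region; then every sub-span of that region is scored with token-level F1
(see evaluate_qa.py) and the best-scoring sub-span is returned together with its F1.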
8 | """ 9 | 10 | from difflib import SequenceMatcher as SM 11 | import re 12 | import string 13 | from typing import List, Tuple 14 | 15 | from nltk.util import ngrams 16 | 17 | from evaluate_qa import compute_f1, compute_f1_from_tokens, get_tokens, normalize_answer 18 | 19 | 20 | ARTICLES_RE = re.compile(r'\b(a|an|the)\b', re.UNICODE) 21 | EXCLUDED_PUNCTS = set(string.punctuation) 22 | 23 | 24 | def _find_approximate_matching_sequence(context: str, target: str) -> Tuple[str, float]: 25 | """Find some substring in the context which closely matches the target, returning this substring with a score.""" 26 | if target in context: 27 | return target, 1.0 28 | 29 | target_length = len(target.split()) 30 | max_sim_val = 0 31 | max_sim_string = '' 32 | seq_matcher = SM() 33 | seq_matcher.set_seq2(target) 34 | for ngram in ngrams(context.split(), target_length + int(0.05 * target_length)): 35 | candidate_ngram = ' '.join(ngram) 36 | seq_matcher.set_seq1(candidate_ngram) 37 | similarity = seq_matcher.quick_ratio() 38 | if similarity > max_sim_val: 39 | max_sim_val = similarity 40 | max_sim_string = candidate_ngram 41 | if similarity == 1.0: 42 | # early exiting 43 | break 44 | 45 | return max_sim_string, max_sim_val 46 | 47 | 48 | def _normalize_tokens(tokens: List[str], keep_empty_str=True) -> List[str]: 49 | """ 50 | Normalize individual tokens. 51 | 52 | If keep_empty_str is True, this keeps the overall number of tokens the same. 53 | A particular token could be normalized to an empty string. 54 | """ 55 | normalized_tokens = [] 56 | for token in tokens: 57 | token = token.lower() 58 | token = ''.join(ch for ch in token if ch not in EXCLUDED_PUNCTS) 59 | token = re.sub(ARTICLES_RE, '', token) 60 | if keep_empty_str or len(token): 61 | normalized_tokens.append(token) 62 | 63 | return normalized_tokens 64 | 65 | 66 | def find_closest_span_match(passage: str, gold_answer: str) -> Tuple[str, float]: 67 | """Heuristic for finding the closest span in a passage relative to some golden answer based on F1 score.""" 68 | closest_encompassing_span, closest_encompassing_span_score = _find_approximate_matching_sequence(passage, gold_answer) 69 | closest_encompassing_span_tok = closest_encompassing_span.split() 70 | gold_answer_tok = gold_answer.split() 71 | closest_encompassing_span_tok_normalized = _normalize_tokens(closest_encompassing_span_tok) 72 | gold_answer_tok_normalized = _normalize_tokens(gold_answer_tok, keep_empty_str=False) 73 | 74 | best_span, best_score, best_i, best_j = '', 0, None, None 75 | for i in range(0, len(closest_encompassing_span_tok_normalized)): 76 | for j in range(i + 1, len(closest_encompassing_span_tok_normalized) + 1): 77 | score = compute_f1_from_tokens( 78 | gold_answer_tok_normalized, 79 | [t for t in closest_encompassing_span_tok_normalized[i:j] if len(t)], 80 | ) 81 | if score > best_score: 82 | best_score = score 83 | best_i, best_j = i, j 84 | 85 | best_span = ' '.join(closest_encompassing_span_tok[best_i:best_j]) 86 | best_f1 = compute_f1(gold_answer, best_span) 87 | return best_span, best_f1 88 | --------------------------------------------------------------------------------