├── .gitignore
├── ACKNOWLEDGEMENTS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── collection
│   ├── README.md
│   ├── download_commoncrawl_passages.py
│   ├── download_wayback_passages.py
│   └── paragraph_chunker.py
├── dataset
│   └── qrecc_data.zip
├── requirements.txt
└── utils
    ├── evaluate_qa.py
    ├── evaluate_retrieval.py
    └── span_heuristic.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
dataset/__MACOSX/
dataset/qrecc_test.json
dataset/qrecc_train.json
.DS_Store

--------------------------------------------------------------------------------
/ACKNOWLEDGEMENTS:
--------------------------------------------------------------------------------
Acknowledgements

Portions of ml-qrecc may utilize the following copyrighted
material, the use of which is hereby acknowledged.

_____________________

Leonard Richardson and contributors (Beautiful Soup)
The MIT License

Copyright (c) 2004-2020 Leonard Richardson and contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

NLTK Authors (NLTK)
Copyright (C) 2001-2020 NLTK Project

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Pyserini Authors (Pyserini)
Copyright 2019-2020, The Pyserini Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

warcio Authors (warcio)
Copyright 2017-2020, The warcio Authors.
63 | 64 | Licensed under the Apache License, Version 2.0 (the "License"); 65 | you may not use this file except in compliance with the License. 66 | You may obtain a copy of the License at 67 | 68 | http://www.apache.org/licenses/LICENSE-2.0 69 | 70 | Unless required by applicable law or agreed to in writing, software 71 | distributed under the License is distributed on an "AS IS" BASIS, 72 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 73 | See the License for the specific language governing permissions and 74 | limitations under the License. 75 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 
## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4,
available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html)

--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contribution Guide

Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository.

While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged.

## Before you get started

By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE).

We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md).

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------


Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. 
If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. 
Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Open-Domain Question Answering Goes Conversational via Question Rewriting

[**Tasks**](#task-description) | [**Dataset**](#dataset) | [**Evaluation**](#evaluation) |
[**Paper**](https://arxiv.org/abs/2010.04898) |
[**Citation**](#citation) | [**License**](#license)

We introduce QReCC (**Q**uestion **Re**writing in **C**onversational **C**ontext), an end-to-end open-domain question answering dataset comprising 14K conversations with 81K question-answer pairs.
The goal of this dataset is to provide a challenging benchmark for end-to-end conversational question answering that includes the individual subtasks of question rewriting, passage retrieval and reading comprehension.
Please refer to our paper [Open-Domain Question Answering Goes Conversational via Question Rewriting](https://arxiv.org/abs/2010.04898) for details.

## Task Description

The task in QReCC is to find answers to conversational questions within a collection of 10M web pages split into 54M passages.
Answers to questions in the same conversation may be distributed across several web pages.

## Dataset

QReCC contains 14K conversations with 81K question-answer pairs.
We build QReCC on questions from [TREC CAsT](https://github.com/daltonj/treccastweb/tree/master/2019/data), [QuAC](https://quac.ai) and [Google Natural Questions](https://github.com/google-research-datasets/natural-questions).
While the TREC CAsT and QuAC datasets contain multi-turn conversations, Natural Questions is not a conversational dataset.
We used questions in the NQ dataset as prompts to create conversations, explicitly balancing the types of context-dependent questions, such as anaphora (co-references) and ellipsis.

For each query we collect query rewrites by resolving references; the resulting query rewrite is a context-independent version of the original (context-dependent) question.
The rewritten query is then used with a search engine to answer the question. Each query is also annotated with an answer and a link to the web page that was used to produce the answer.

Each turn in the dataset contains a `Conversation_no` (shared by all turns of the conversation), a `Turn_no` (unique within the conversation), the original `Question`, the `Context`, the `Rewrite`, the `Answer` with its `Answer_URL`, and the `Conversation_source`.

```json
{
  "Context": [
    "What are the pros and cons of electric cars?",
    "Some pros are: They're easier on the environment. Electricity is cheaper than gasoline. Maintenance is less frequent and less expensive. They're very quiet. You'll get tax credits. They can shorten your commute time. Some cons are: Most EVs have pretty short ranges. Recharging can take a while."
  ],
  "Question": "Tell me more about Tesla",
  "Rewrite": "Tell me more about Tesla the car company.",
  "Answer": "Tesla Inc. is an American automotive and energy company based in Palo Alto, California. The company specializes in electric car manufacturing and, through its SolarCity subsidiary, solar panel manufacturing.",
  "Answer_URL": "https://en.wikipedia.org/wiki/Tesla,_Inc.",
  "Conversation_no": 74,
  "Turn_no": 2,
  "Conversation_source": "trec"
}
```

## Evaluation

### Evaluate performance on Retrieval Question Answering task

To evaluate retrieval QA, use [evaluate_retrieval.py](https://github.com/apple/ml-qrecc/blob/main/utils/evaluate_retrieval.py)

### Evaluate performance on Extractive Question Answering task

To evaluate extractive QA, use [evaluate_qa.py](https://github.com/apple/ml-qrecc/blob/main/utils/evaluate_qa.py)

A minimal usage sketch of the underlying QA metrics is included at the end of this README.

## Citation

Please cite the following if you found the QReCC dataset, our [paper](https://arxiv.org/abs/2010.04898), or these resources useful.

```bibtex
@article{qrecc,
  title={Open-Domain Question Answering Goes Conversational via Question Rewriting},
  author={Anantha, Raviteja and Vakulenko, Svitlana and Tu, Zhucheng and Longpre, Shayne and Pulman, Stephen and Chappidi, Srinivas},
  journal={Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  year={2021}
}
```

## License

The code in this repository is licensed according to the [LICENSE](LICENSE) file.

The QReCC dataset is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License.
To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/.

## Contact Us

To contact us, feel free to email the authors of the paper or create an issue in this repository.
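As referenced in the Evaluation section, the snippet below is a minimal sketch of how the token-level QA metrics in `utils/evaluate_qa.py` can be called directly. The gold/predicted strings are made-up examples, and the sketch assumes the `utils` directory is on your `PYTHONPATH` (for instance, by running it from within `utils`).

```python
# Minimal sketch (not part of the official evaluation pipeline): computing the
# SQuAD-style Exact Match and token-level F1 metrics from utils/evaluate_qa.py.
# Assumes the utils/ directory is on PYTHONPATH (e.g. run from within utils/).
from evaluate_qa import compute_exact, compute_f1

gold = "Tesla Inc. is an American automotive and energy company based in Palo Alto, California."
pred = "Tesla is an American automotive and energy company."

print(compute_exact(gold, pred))         # 0 -- the normalized strings are not identical
print(round(compute_f1(gold, pred), 2))  # token-overlap F1 between prediction and gold answer
```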
--------------------------------------------------------------------------------
/collection/README.md:
--------------------------------------------------------------------------------
# Building the Collection

This directory contains the scripts and instructions for downloading, processing, and building the collection for our baseline.

The collection consists of webpages from the Common Crawl and the Wayback Machine.
**Please note that the Common Crawl collection is [quite large](https://commoncrawl.org/2019/11/november-2019-crawl-archive-now-available/) (on the order of tens of TBs), so please check the data cap of your internet plan to make sure you stay within the limit.**

To download pages from the Common Crawl, run the following command.
This saves webpage documents in [JSON lines](https://jsonlines.org) (.jsonl) format into the `collection/commoncrawl` subdirectory.
For us this took slightly over a day to run.

```bash
time python download_commoncrawl_passages.py --output-directory collection/commoncrawl --workers 8
```

To download pages from the Wayback Machine, run the following command after you've extracted the dataset.
This saves webpage documents in .jsonl format into the `collection/wayback` subdirectory.
For us this took 9 hours to run.

```bash
time python download_wayback_passages.py --inputs '../dataset/*.json' --output-directory collection/wayback --workers 4
```

Next, we segmented the webpage documents into smaller passages.
This step is quick and took several minutes for us.

```bash
time python paragraph_chunker.py --input-directory collection --output-directory collection-paragraph --workers 8
```

Finally, we indexed the passages using [Pyserini](https://github.com/castorini/pyserini/), a Python wrapper around [Anserini](http://anserini.io/), an information retrieval toolkit built on Lucene.
Java (JDK) is needed as a prerequisite.
After installing Pyserini, we used the following command to build the index.
For us this took less than 2 hours.

```bash
time python -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
    -threads 76 -input collection-paragraph \
    -index index-paragraph -storePositions -storeDocvectors -storeRaw
```

--------------------------------------------------------------------------------
/collection/download_commoncrawl_passages.py:
--------------------------------------------------------------------------------
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2020 Apple Inc. All Rights Reserved.
#

"""
This script creates a corpus of documents from the November 2019 Common Crawl archive.
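Example invocation (the command given in collection/README.md):

    python download_commoncrawl_passages.py --output-directory collection/commoncrawl --workers 8

Intermediate index files, filter lists, and downloaded WET files are cached in the
working directory and removed at the end of a successful run.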
8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from collections import defaultdict 12 | import gzip 13 | import json 14 | import logging 15 | from multiprocessing import Pool 16 | import os 17 | from pathlib import Path 18 | import re 19 | import shutil 20 | import tempfile 21 | from typing import Dict, List, Tuple 22 | import urllib.request 23 | 24 | from warcio.archiveiterator import ArchiveIterator 25 | 26 | index_files_root = Path('index-files') 27 | filter_lists_root = Path('filter-lists') 28 | sampled_filter_lists_root = Path('filter-lists-sampled') 29 | wet_files_cache = Path('wet-files') 30 | 31 | 32 | def get_cc_index_paths() -> List[str]: 33 | """Get a list of paths for Common Crawl URL index files.""" 34 | index_paths = [] 35 | with tempfile.NamedTemporaryFile() as temp_f: 36 | urllib.request.urlretrieve( 37 | 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2019-47/cc-index.paths.gz', 38 | temp_f.name, 39 | ) 40 | with gzip.open(temp_f.name, 'rb') as f: 41 | for line in f: 42 | line = line.decode('utf-8').rstrip() 43 | if line.endswith('.gz'): 44 | index_paths.append(f'https://data.commoncrawl.org/{line}') 45 | 46 | return index_paths 47 | 48 | 49 | def get_cc_wet_paths() -> Dict[str, str]: 50 | """Get a dict of WET file name to WET URL.""" 51 | wet_urls = {} 52 | with tempfile.NamedTemporaryFile() as temp_f: 53 | urllib.request.urlretrieve( 54 | 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2019-47/wet.paths.gz', 55 | temp_f.name, 56 | ) 57 | with gzip.open(temp_f.name, 'rb') as f: 58 | for line in f: 59 | line = line.decode('utf-8').rstrip() 60 | filename = line.split('/')[-1] 61 | wet_urls[filename] = f'https://data.commoncrawl.org/{line}' 62 | 63 | return wet_urls 64 | 65 | 66 | def process_cc_index(index_url: str) -> Dict[str, List[str]]: 67 | """Return a map of WET file to list of URLs it contains.""" 68 | # Download index file 69 | filename = index_url.split('/')[-1] 70 | index_files_root.mkdir(exist_ok=True) 71 | if not (index_files_root / filename).exists(): 72 | urllib.request.urlretrieve(index_url, index_files_root / filename) 73 | 74 | # Parse index file 75 | wet_to_urls = defaultdict(list) 76 | cc_index_line_pattern = re.compile(r'^[\S]+ \d+ (.*)$') 77 | with gzip.open(index_files_root / filename, 'rb') as f: 78 | for line in f: 79 | line = line.decode('utf-8').rstrip() 80 | match = cc_index_line_pattern.match(line) 81 | if match: 82 | url_metadata = json.loads(match.group(1)) 83 | if ( 84 | url_metadata['status'] == '200' 85 | and url_metadata.get('languages') == 'eng' 86 | and url_metadata['mime'] == 'text/html' 87 | ): 88 | wet_filename = url_metadata['filename'].split('/')[-1] 89 | wet_to_urls[wet_filename].append(url_metadata['url']) 90 | else: 91 | logging.error(f'Line in index file cannot be matched by regex: {line}') 92 | 93 | return wet_to_urls 94 | 95 | 96 | def sort_and_sample_filter_list(filter_list_path: Path) -> None: 97 | """Sort and sample URLs in a filter list.""" 98 | urls = [] 99 | with open(filter_list_path) as f: 100 | for line in f: 101 | urls.append(line.rstrip()) 102 | 103 | urls.sort() 104 | 105 | with open(sampled_filter_lists_root / filter_list_path.name, 'w') as f: 106 | for i, url in enumerate(urls): 107 | if i % 100 == 0: 108 | f.write(url + '\n') 109 | 110 | 111 | def sample_filter_lists() -> None: 112 | """Sample filter lists.""" 113 | filter_lists = list(filter_lists_root.iterdir()) 114 | sampled_filter_lists_root.mkdir(exist_ok=True) 115 | 116 | with Pool() as p: 117 | p.map(sort_and_sample_filter_list, filter_lists) 
118 | 119 | 120 | def process_wet_file(tup: Tuple[Path, str, str, Path],) -> None: 121 | """Download WET file and extract webpages from WARC whose URL is in the filter list.""" 122 | filter_list, wet_name, wet_url, commoncrawl_docs_root = tup 123 | accepted_urls = set() 124 | with open(filter_list) as f: 125 | for line in f: 126 | accepted_urls.add(line.rstrip()) 127 | 128 | attempt = 0 129 | while attempt < 3: 130 | try: 131 | urllib.request.urlretrieve(wet_url, wet_files_cache / wet_name) 132 | break 133 | except Exception: 134 | logging.exception(f'Error while downloading {wet_url}') 135 | attempt += 1 136 | 137 | if not (wet_files_cache / wet_name).exists(): 138 | logging.error( 139 | f'Failed to download {wet_url} after 3 attempts. Ignoring file...' 140 | ) 141 | return 142 | 143 | with gzip.open(wet_files_cache / wet_name, 'rb') as stream, open( 144 | commoncrawl_docs_root / f'{wet_name}.jsonl', 'w' 145 | ) as f: 146 | for record in ArchiveIterator(stream): 147 | if record.rec_type == 'conversion': 148 | url = record.rec_headers.get_header('WARC-Target-URI') 149 | if url not in accepted_urls: 150 | continue 151 | 152 | contents = record.content_stream().read().decode('utf-8') 153 | if contents.startswith('404 Not Found'): 154 | continue 155 | 156 | output_dict = {'id': url, 'contents': contents} 157 | 158 | f.write(json.dumps(output_dict) + '\n') 159 | 160 | os.remove(wet_files_cache / wet_name) 161 | 162 | 163 | def get_docs_from_wet_files(parallelism, commoncrawl_docs_root: Path) -> None: 164 | """Download WET files and extract webpages whose URLs is in the filter list.""" 165 | wet_files_cache.mkdir(exist_ok=True) 166 | commoncrawl_docs_root.mkdir(exist_ok=True, parents=True) 167 | 168 | filter_lists = list(sampled_filter_lists_root.iterdir()) 169 | 170 | # Download WET file paths 171 | wet_paths = get_cc_wet_paths() 172 | wet_names = [] 173 | resolved_wet_paths = [] 174 | for filter_list in filter_lists: 175 | wet_filename = str(filter_list.name).replace('.warc.gz.txt', '.warc.wet.gz') 176 | wet_names.append(wet_filename) 177 | resolved_wet_paths.append(wet_paths[wet_filename]) 178 | 179 | with Pool(parallelism) as p: 180 | for i, _ in enumerate( 181 | p.imap_unordered( 182 | process_wet_file, 183 | zip( 184 | filter_lists, 185 | wet_names, 186 | resolved_wet_paths, 187 | [commoncrawl_docs_root for _ in range(len(filter_lists))], 188 | ), 189 | ) 190 | ): 191 | if (i + 1) % 50 == 0: 192 | logging.info(f'Processed {i + 1} / {len(filter_lists)} WET files...') 193 | 194 | 195 | def main(parallelism: int, commoncrawl_docs_root: Path): 196 | cc_index_paths = get_cc_index_paths() 197 | 198 | # Construct filter lists 199 | if filter_lists_root.exists(): 200 | shutil.rmtree(filter_lists_root) 201 | filter_lists_root.mkdir(exist_ok=True) 202 | 203 | for i in range(0, len(cc_index_paths), parallelism): 204 | with Pool(parallelism) as p: 205 | logging.info( 206 | f'Processing Common Crawl index {i+1}-{min(i + parallelism, len(cc_index_paths))} / {len(cc_index_paths)}...' 
207 | ) 208 | partial_filter_lists = p.map( 209 | process_cc_index, cc_index_paths[i : i + parallelism] 210 | ) 211 | for partial_filter_list in partial_filter_lists: 212 | for wet_filename, urls in partial_filter_list.items(): 213 | with open(filter_lists_root / f'{wet_filename}.txt', 'a') as f: 214 | for url in urls: 215 | f.writelines(url + '\n') 216 | 217 | # Create sampled filter lists 218 | logging.info('Sorting and sampling filter lists...') 219 | sample_filter_lists() 220 | 221 | # Download WET files and filter records 222 | logging.info('Processing WET files...') 223 | get_docs_from_wet_files(parallelism, commoncrawl_docs_root) 224 | 225 | # Remove temporary files 226 | logging.info('Done processing WET files, removing temporary directories...') 227 | shutil.rmtree(index_files_root) 228 | shutil.rmtree(filter_lists_root) 229 | shutil.rmtree(sampled_filter_lists_root) 230 | shutil.rmtree(wet_files_cache) 231 | 232 | 233 | if __name__ == '__main__': 234 | parser = ArgumentParser( 235 | description='Creates a corpus of documents from the November 2019 Common Crawl archive' 236 | ) 237 | parser.add_argument( 238 | '--output-directory', 239 | default='docs/common-crawl', 240 | help='Path to directory containing document output, defaults to docs/common-crawl', 241 | ) 242 | parser.add_argument( 243 | '--workers', 244 | default=8, 245 | type=int, 246 | help='Number of workers for downloading in parallel', 247 | ) 248 | args = parser.parse_args() 249 | 250 | logging.basicConfig(level=logging.INFO) 251 | 252 | commoncrawl_docs_root = Path(args.output_directory) 253 | 254 | main(args.workers, commoncrawl_docs_root) 255 | -------------------------------------------------------------------------------- /collection/download_wayback_passages.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2020 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | """ 7 | This script downloads webpages in the conversation contexts from the Wayback Machine. 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | import glob 12 | import json 13 | import logging 14 | import multiprocessing 15 | from pathlib import Path 16 | import random 17 | import re 18 | import requests 19 | from requests.exceptions import HTTPError 20 | import shutil 21 | import time 22 | import urllib.parse 23 | import uuid 24 | 25 | from bs4 import BeautifulSoup 26 | import pandas as pd 27 | 28 | 29 | wayback_prefix = re.compile(r'^https:\/\/web\.archive\.org\/web') 30 | replace_pattern = re.compile(r'(web\.archive\.org\/web\/\d+)') 31 | blacklist = [ 32 | '[document]', 33 | 'noscript', 34 | 'header', 35 | 'html', 36 | 'meta', 37 | 'head', 38 | 'input', 39 | 'script', 40 | 'style', 41 | # there may be more elements we don't want 42 | ] 43 | 44 | 45 | def download_with_retry(url: str, max_retries: int = 10) -> requests.Response: 46 | """Download a URL with exponential backoff, until max_retries is reached.""" 47 | retry_num = 0 48 | while True: 49 | try: 50 | response = requests.get(url) 51 | response.raise_for_status() 52 | return response 53 | except HTTPError as e: 54 | status_code = e.response.status_code 55 | if not (status_code == 429 or status_code >= 500): 56 | # This is not an error we should retry on 57 | raise e 58 | 59 | if retry_num > max_retries: 60 | logging.error( 61 | f'Failed to perform GET request on {url} after {max_retries} retries.' 
62 | ) 63 | raise e 64 | 65 | if status_code == 429: 66 | time.sleep(5 + 2 ** retry_num + random.randint(0, 1000) / 1000) 67 | else: 68 | time.sleep(2 ** retry_num + random.randint(0, 1000) / 1000) 69 | retry_num += 1 70 | 71 | 72 | def extract_text(html_text: str) -> str: 73 | """Extracts text from an HTML document.""" 74 | soup = BeautifulSoup(html_text, 'html.parser') 75 | text = soup.find_all(text=True) 76 | output = '' 77 | for t in text: 78 | if t.parent.name not in blacklist: 79 | output += f'{t} ' 80 | 81 | return output 82 | 83 | 84 | def download_link(tup): 85 | link = tup[0] 86 | output_path = tup[1] 87 | num_workers = tup[2] 88 | page_id = str(uuid.uuid4()) 89 | url_no_header = None 90 | 91 | try: 92 | # Find the Wayback Machine link 93 | if not wayback_prefix.match(link): 94 | link_encoded = urllib.parse.quote(link) 95 | 96 | available, availability_attempt = False, 0 97 | # Sometimes the API returns HTTP success code 200, but archived snapshots shows page is unavailable 98 | # when it actually is. Give it a total of three tries. 99 | while not available and availability_attempt < 3: 100 | response = download_with_retry( 101 | f'http://archive.org/wayback/available?url={link_encoded}×tamp=20191127' 102 | ) 103 | json_response = response.json() 104 | available = 'closest' in json_response['archived_snapshots'] 105 | availability_attempt += 1 106 | 107 | if not available: 108 | logging.warning( 109 | f'Not available on Wayback Machine: {link}, HTTP code {response.status_code}, {json_response}' 110 | ) 111 | return {'link': link, 'page_id': page_id, 'available': False} 112 | 113 | url = json_response['archived_snapshots']['closest']['url'] 114 | else: 115 | url = link 116 | 117 | match = replace_pattern.search(url) 118 | assert match 119 | url_no_header = replace_pattern.sub(f'{match.group(1)}id_', url) 120 | 121 | response = download_with_retry(url_no_header) 122 | html_page = response.text 123 | parsed_text = extract_text(html_page) 124 | 125 | proc = multiprocessing.current_process() 126 | pid_mod = str(proc.pid % num_workers) 127 | 128 | (output_path / pid_mod).mkdir(parents=True, exist_ok=True) 129 | 130 | with open(output_path / pid_mod / page_id, 'w') as f: 131 | doc = { 132 | 'id': url_no_header, 133 | 'contents': parsed_text, 134 | } 135 | f.write(json.dumps(doc) + '\n') 136 | 137 | return { 138 | 'link': link, 139 | 'page_id': page_id, 140 | 'available': True, 141 | 'status_code': response.status_code, 142 | 'wayback_url': url_no_header, 143 | } 144 | except HTTPError as http_err: 145 | logging.warning(f'HTTP error occurred: {http_err} for {link}') 146 | return { 147 | 'link': link, 148 | 'page_id': page_id, 149 | 'available': False, 150 | 'status_code': http_err.response.status_code if http_err.response else None, 151 | 'wayback_url': url_no_header, 152 | } 153 | except UnicodeDecodeError as e: 154 | logging.warning(f'Unicode decode error occurred: {e} for {link}') 155 | return { 156 | 'link': link, 157 | 'page_id': page_id, 158 | 'available': False, 159 | 'status_code': response.status_code, 160 | 'wayback_url': url_no_header, 161 | } 162 | except Exception as e: 163 | logging.warning(f'Exception occurred: {e} for {link}') 164 | return { 165 | 'link': link, 166 | 'page_id': page_id, 167 | 'available': False, 168 | 'status_code': None, 169 | 'wayback_url': url_no_header, 170 | } 171 | 172 | 173 | def crawl_wayback_machine( 174 | inputs_globbing_pattern: str, output_dir: str, num_workers: int 175 | ) -> None: 176 | links = set() 177 | for dataset in 
glob.glob(inputs_globbing_pattern): 178 | with open(dataset) as f: 179 | data = json.load(f) 180 | for conversation_turn in data: 181 | if conversation_turn['Answer_URL'] == '': 182 | continue 183 | 184 | for url in conversation_turn['Answer_URL'].split(' '): 185 | if url.endswith('.pdf'): 186 | continue 187 | 188 | anchor_sign_pos = url.find('#') 189 | if anchor_sign_pos != -1: 190 | url = url.split('#')[0] 191 | 192 | links.add(url) 193 | links = list(links) 194 | 195 | output_path = Path(output_dir) 196 | output_path.mkdir(parents=True, exist_ok=True) 197 | 198 | records = [] 199 | with multiprocessing.Pool(num_workers) as p: 200 | for i, result in enumerate( 201 | p.imap_unordered( 202 | download_link, 203 | [(l, output_path, num_workers) for l in links], 204 | chunksize=16, 205 | ) 206 | ): 207 | records.append(result) 208 | if (i + 1) % 10 == 0: 209 | logging.info(f'Processed {i + 1} / {len(links)} links...') 210 | 211 | # Combine small files together into larger files 212 | for worker_output_dir in output_path.iterdir(): 213 | if worker_output_dir.is_dir(): 214 | with open(output_path / f'{worker_output_dir.name}.jsonl', 'w') as outfile: 215 | for single_doc_file in worker_output_dir.iterdir(): 216 | with open(single_doc_file) as infile: 217 | outfile.write(infile.read()) 218 | 219 | shutil.rmtree(worker_output_dir) 220 | 221 | df = pd.DataFrame.from_records(records) 222 | df.to_csv(output_path / 'summary.tsv', index=False, sep='\t') 223 | 224 | 225 | if __name__ == '__main__': 226 | parser = ArgumentParser(description='Crawl pages from Wayback Machine') 227 | parser.add_argument( 228 | '--inputs', required=True, help='Globbing pattern for train and test JSON files' 229 | ) 230 | parser.add_argument( 231 | '--output-directory', 232 | required=True, 233 | help='Path to directory containing crawled output', 234 | ) 235 | parser.add_argument( 236 | '--workers', 237 | default=4, 238 | type=int, 239 | help='Number of workers for downloading in parallel', 240 | ) 241 | args = parser.parse_args() 242 | 243 | logging.basicConfig(level=logging.INFO) 244 | crawl_wayback_machine(args.inputs, args.output_directory, args.workers) 245 | -------------------------------------------------------------------------------- /collection/paragraph_chunker.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2020 Apple Inc. All Rights Reserved. 
4 | # 5 | 6 | """For a directory of nested JSON lines files, where each line is a document, chunk each document into many passages.""" 7 | 8 | from argparse import ArgumentParser 9 | import json 10 | import logging 11 | import multiprocessing 12 | from pathlib import Path 13 | from typing import List, Tuple 14 | 15 | MIN_PASSAGE_TOKENS = 220 16 | 17 | 18 | def chunk_doc(content: str) -> List[str]: 19 | """Given a document, return a list of passages of no fewer than MIN_PASSAGE_TOKENS tokens / passage until EOF.""" 20 | passages = [] 21 | passage_tokens = [] 22 | lines = content.split('\n') 23 | for line in lines: 24 | line = line.rstrip() 25 | 26 | if '===' in line: 27 | continue 28 | if len(line) == 0: 29 | continue 30 | 31 | tokens = line.split() 32 | passage_tokens.extend(tokens) 33 | 34 | if len(passage_tokens) > MIN_PASSAGE_TOKENS: 35 | passages.append(' '.join(passage_tokens)) 36 | passage_tokens = [] 37 | 38 | passages.append(' '.join(passage_tokens)) 39 | return passages 40 | 41 | 42 | def process_file(tup: Tuple[str, str, Path]) -> None: 43 | """Chunk all documents in a single file.""" 44 | input_directory, output_directory, input_file = tup 45 | output_file = str(input_file).replace(input_directory, output_directory) 46 | output_path = Path(output_file) 47 | output_path.parent.mkdir(parents=True, exist_ok=True) 48 | 49 | with open(input_file) as f1, open(output_path, 'w') as f2: 50 | for jsonl in f1: 51 | doc = json.loads(jsonl) 52 | passages = chunk_doc(doc['contents']) 53 | 54 | for i, passage in enumerate(passages): 55 | paragraph = {'id': f"{doc['id']}_p{i}", 'contents': passage} 56 | 57 | f2.write(json.dumps(paragraph) + '\n') 58 | 59 | 60 | def chunk_documents(input_directory: str, output_directory: str, workers: int) -> None: 61 | """Iterate .jsonl files in input_directory and output .jsonl files in output_directory where each doc is chunked.""" 62 | input_directory_path = Path(input_directory) 63 | 64 | jsonl_files = list(input_directory_path.glob('**/*.jsonl')) 65 | 66 | with multiprocessing.Pool(workers) as p: 67 | for i, _ in enumerate( 68 | p.imap_unordered( 69 | process_file, 70 | [(input_directory, output_directory, f) for f in jsonl_files], 71 | chunksize=16, 72 | ) 73 | ): 74 | if (i + 1) % 100 == 0: 75 | logging.info(f'Processed {i + 1} / {len(jsonl_files)} files...') 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = ArgumentParser( 80 | description='Chunk documents in .jsonl files into many passages.' 
81 | ) 82 | parser.add_argument( 83 | '--input-directory', 84 | required=True, 85 | help='Directory containing .jsonl files to chunk', 86 | ) 87 | parser.add_argument( 88 | '--output-directory', 89 | required=True, 90 | help='Directory to store .jsonl files containing document passages', 91 | ) 92 | parser.add_argument( 93 | '--workers', 94 | default=8, 95 | type=int, 96 | help='Number of workers for downloading in parallel', 97 | ) 98 | args = parser.parse_args() 99 | 100 | logging.basicConfig(level=logging.INFO) 101 | 102 | chunk_documents(args.input_directory, args.output_directory, args.workers) 103 | -------------------------------------------------------------------------------- /dataset/qrecc_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-qrecc/cf44d03cb6676f7414471cec509d4a6c6858b0d3/dataset/qrecc_data.zip -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.9.3 2 | nltk==3.5 3 | pyserini==0.10.0.1 4 | warcio==1.7.4 5 | -------------------------------------------------------------------------------- /utils/evaluate_qa.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2020 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | """ 7 | Functions for computing QA evaluation metrics. 8 | 9 | We adapt the functions from the official SQuAD (Rajpurkar et al. '18) evaluation script: 10 | https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ 11 | """ 12 | 13 | import collections 14 | import re 15 | import string 16 | from typing import List 17 | 18 | 19 | def normalize_answer(s: str) -> str: 20 | """Lower text and remove punctuation, articles and extra whitespace.""" 21 | 22 | def remove_articles(text): 23 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) 24 | return re.sub(regex, ' ', text) 25 | 26 | def white_space_fix(text): 27 | return ' '.join(text.split()) 28 | 29 | def remove_punc(text): 30 | exclude = set(string.punctuation) 31 | return ''.join(ch for ch in text if ch not in exclude) 32 | 33 | def lower(text): 34 | return text.lower() 35 | 36 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 37 | 38 | 39 | def get_tokens(s: str) -> List[str]: 40 | """Normalize string and split string into tokens.""" 41 | if not s: 42 | return [] 43 | return normalize_answer(s).split() 44 | 45 | 46 | def compute_exact(a_gold: str, a_pred: str) -> int: 47 | """Compute the Exact Match score.""" 48 | return int(normalize_answer(a_gold) == normalize_answer(a_pred)) 49 | 50 | 51 | def compute_f1_from_tokens(gold_toks: List[str], pred_toks: List[str]) -> float: 52 | """Compute the F1 score from tokenized gold answer and prediction.""" 53 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks) 54 | num_same = sum(common.values()) 55 | 56 | if len(gold_toks) == 0 or len(pred_toks) == 0: 57 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise 58 | return int(gold_toks == pred_toks) 59 | 60 | if num_same == 0: 61 | return 0 62 | 63 | precision = 1.0 * num_same / len(pred_toks) 64 | recall = 1.0 * num_same / len(gold_toks) 65 | f1 = (2 * precision * recall) / (precision + recall) 66 | return f1 67 | 68 | 69 | def compute_f1(a_gold: str, a_pred: str) -> float: 70 | """Compute the F1 
score.""" 71 | gold_toks = get_tokens(a_gold) 72 | pred_toks = get_tokens(a_pred) 73 | return compute_f1_from_tokens(gold_toks, pred_toks) 74 | -------------------------------------------------------------------------------- /utils/evaluate_retrieval.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2021 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | from argparse import ArgumentParser 7 | import json 8 | import logging 9 | from multiprocessing import Pool 10 | from pathlib import Path 11 | from typing import Dict, List, Tuple 12 | 13 | from evaluate_qa import compute_exact, compute_f1 14 | from span_heuristic import find_closest_span_match 15 | 16 | """ 17 | Functions for evaluating passage retrieval. 18 | 19 | This is used to compute MRR (mean reciprocal rank), Recall@10, and Recall@100 in Table 5 of the paper. 20 | """ 21 | 22 | 23 | RELEVANCE_THRESHOLD = 0.8 24 | 25 | 26 | def compute_f1_for_retrieved_passage(line: str) -> dict: 27 | """ 28 | Given a serialized JSON line, with fields 'content' and 'answer', find the closest span matching answer, 29 | update the deserialized dict with the span and F1 score, and return the dict. 30 | """ 31 | data = json.loads(line) 32 | content, answer = data['content'], data['answer'] 33 | 34 | # If there is no answer, although the closest extractive answer is '', in the MRR and recall@k functions below 35 | # we do not count any passage for these questions as relevant. 36 | if len(answer) < 1: 37 | data['heuristic_answer'] = '' 38 | data['f1'] = compute_f1(answer, '') 39 | return data 40 | 41 | best_span, best_f1 = find_closest_span_match(content, answer) 42 | 43 | data['heuristic_answer'] = best_span 44 | data['f1'] = best_f1 45 | 46 | return data 47 | 48 | 49 | def compute_mean_reciprocal_rank( 50 | question_id_to_docs: Dict[str, List[dict]], relevance_threshold: float 51 | ) -> float: 52 | """Given a dictionary mapping a question id to a list of docs, find the mean reciprocal rank.""" 53 | recip_rank_sum = 0 54 | for qid, docs in question_id_to_docs.items(): 55 | top_rank = float('inf') 56 | for doc in docs: 57 | if len(doc['answer']) > 0 and doc['f1'] >= relevance_threshold: 58 | top_rank = min(top_rank, doc['rank']) 59 | 60 | recip_rank = 1 / top_rank if top_rank != float('inf') else 0 61 | recip_rank_sum += recip_rank 62 | 63 | return recip_rank_sum / len(question_id_to_docs) 64 | 65 | 66 | def compute_recall_at_k( 67 | question_id_to_docs: Dict[str, List[dict]], k: int, relevance_threshold: float 68 | ) -> float: 69 | """ 70 | Given a dictionary mapping a question id to a list of docs, find the recall@k. 71 | 72 | We define recall@k = 1.0 if any document in the top-k is relevant, and 0 otherwise. 
73 | """ 74 | relevant_doc_found_total = 0 75 | for qid, docs in question_id_to_docs.items(): 76 | relevant_doc_found = 0 77 | for doc in docs: 78 | if len(doc['answer']) > 0 and doc['f1'] >= relevance_threshold and doc['rank'] <= k: 79 | relevant_doc_found = 1 80 | break 81 | 82 | relevant_doc_found_total += relevant_doc_found 83 | 84 | return relevant_doc_found_total / len(question_id_to_docs) 85 | 86 | 87 | def compute_extractive_upper_bounds( 88 | question_id_to_docs: Dict[str, List[dict]], temp_files_directory: Path 89 | ) -> Tuple[float, float]: 90 | """Given a dictionary mapping a question id to a list of docs, find the extractive upper bounds of (EM, F1).""" 91 | total_em, total_f1 = 0, 0.0 92 | with open(temp_files_directory / 'retrieved-passages-relevant-f1.jsonl', 'w') as outfile: 93 | for qid, docs in question_id_to_docs.items(): 94 | best_em, best_f1 = 0, 0.0 95 | best_doc = docs[0] 96 | for doc in docs: 97 | em = compute_exact(doc['answer'], doc['heuristic_answer']) 98 | f1 = compute_f1(doc['answer'], doc['heuristic_answer']) 99 | if f1 > best_f1: 100 | best_doc = doc 101 | best_em = max(best_em, em) 102 | best_f1 = max(best_f1, f1) 103 | if best_em == 1 and best_f1 == 1.0: 104 | break 105 | 106 | total_em += best_em 107 | total_f1 += best_f1 108 | 109 | outfile.write(json.dumps(best_doc) + '\n') 110 | 111 | return ( 112 | total_em / len(question_id_to_docs), 113 | total_f1 / len(question_id_to_docs), 114 | ) 115 | 116 | 117 | def get_unique_relevant_docs_count( 118 | question_id_to_docs: Dict[str, List[dict]], relevance_threshold: float 119 | ) -> float: 120 | """Given a dictionary mapping a question id to a list of docs, find the number of unique relevant docs.""" 121 | unique_relevant_docs = set() 122 | for qid, docs in question_id_to_docs.items(): 123 | for doc in docs: 124 | if len(doc['answer']) > 0 and doc['f1'] >= relevance_threshold: 125 | unique_relevant_docs.add(doc['docid']) 126 | 127 | return len(unique_relevant_docs) 128 | 129 | 130 | def get_average_relevant_docs_per_question( 131 | question_id_to_docs: Dict[str, List[dict]], relevance_threshold: float 132 | ) -> float: 133 | """Given a dictionary mapping a question id to a list of docs, find the average number of relevant docs per question.""" 134 | relevant_docs = 0 135 | for qid, docs in question_id_to_docs.items(): 136 | for doc in docs: 137 | if len(doc['answer']) > 0 and doc['f1'] >= relevance_threshold: 138 | relevant_docs += 1 139 | 140 | return relevant_docs / len(question_id_to_docs) 141 | 142 | 143 | def main(retrieved_passages_pattern: str, temp_files_directory: str, workers: int): 144 | retrieved_passages_files = Path().glob(retrieved_passages_pattern) 145 | temp_files_directory = Path(temp_files_directory) 146 | temp_files_directory.mkdir(exist_ok=True, parents=True) 147 | 148 | question_id_to_docs = {} 149 | 150 | for retrieved_passages_file in retrieved_passages_files: 151 | with open(retrieved_passages_file) as infile: 152 | with Pool(workers) as p: 153 | for i, passage_results in enumerate( 154 | p.imap(compute_f1_for_retrieved_passage, infile) 155 | ): 156 | if (i + 1) % 5000 == 0: 157 | logging.info( 158 | f'Processing {retrieved_passages_file.name}, {i + 1} lines done...' 
159 | ) 160 | 161 | qid = f"{passage_results['Conversation-ID']}_{passage_results['Turn-ID']}" 162 | if qid not in question_id_to_docs: 163 | question_id_to_docs[qid] = [] 164 | 165 | question_id_to_docs[qid].append( 166 | { 167 | 'Conversation-ID': passage_results['Conversation-ID'], 168 | 'Turn-ID': passage_results['Turn-ID'], 169 | 'docid': passage_results['docid'], 170 | 'content': passage_results['content'], 171 | 'rank': passage_results['rank'], 172 | 'answer': passage_results['answer'], 173 | 'heuristic_answer': passage_results['heuristic_answer'], 174 | 'f1': passage_results['f1'], 175 | } 176 | ) 177 | 178 | print('Final metrics:') 179 | unique_relevant_docs = get_unique_relevant_docs_count(question_id_to_docs, RELEVANCE_THRESHOLD) 180 | unique_docs_perfect_f1 = get_unique_relevant_docs_count(question_id_to_docs, 1.0) 181 | avg_relevant_docs_per_question = get_average_relevant_docs_per_question( 182 | question_id_to_docs, 1.0 183 | ) 184 | 185 | print(f'Total number of unique queries: {len(question_id_to_docs)}') 186 | print(f'Total number of unique relevant docs: {unique_relevant_docs}') 187 | print(f'Total number of unique docs with F1=1.0: {unique_docs_perfect_f1}') 188 | print(f'Average number of relevant docs per query: {avg_relevant_docs_per_question}') 189 | 190 | mrr = compute_mean_reciprocal_rank(question_id_to_docs, RELEVANCE_THRESHOLD) 191 | recall_at_10 = compute_recall_at_k(question_id_to_docs, 10, RELEVANCE_THRESHOLD) 192 | recall_at_100 = compute_recall_at_k(question_id_to_docs, 100, RELEVANCE_THRESHOLD) 193 | print(f'Mean Reciprocal Rank (MRR): {mrr:.4f}') 194 | print(f'Recall@10: {recall_at_10 * 100:.2f}%') 195 | print(f'Recall@100: {recall_at_100 * 100:.2f}%') 196 | 197 | em_upper_bound, f1_upper_bound = compute_extractive_upper_bounds( 198 | question_id_to_docs, temp_files_directory 199 | ) 200 | print(f'Extractive Upper Bound for EM (100 point scale): {em_upper_bound * 100:.2f}') 201 | print(f'Extractive Upper Bound for F1 (100 point scale): {f1_upper_bound * 100:.2f}') 202 | 203 | 204 | if __name__ == '__main__': 205 | parser = ArgumentParser(description='Passage retrieval evaluation') 206 | parser.add_argument( 207 | '--retrieved-passages-pattern', 208 | required=True, 209 | help="""A globbing pattern to select .jsonl files containing retrieved passages. 210 | Each json line should contain the fields 'Conversation-ID', 'Turn-ID', 'docid', 'content', 'answer', 'rank'. 211 | 'answer' is the gold answer given in the QReCC dataset and rank is the rank of the document starting from 1.""", 212 | ) 213 | parser.add_argument( 214 | '--temp-files-directory', 215 | default='/tmp/qrecc-retrieval-eval', 216 | help='Directory to store temporary files containing F1 scores, which can be used for debugging and analysis', 217 | ) 218 | parser.add_argument( 219 | '--workers', default=8, type=int, help='Number of workers for parallel processing', 220 | ) 221 | args = parser.parse_args() 222 | 223 | logging.basicConfig(level=logging.INFO) 224 | 225 | main(args.retrieved_passages_pattern, args.temp_files_directory, args.workers) 226 | -------------------------------------------------------------------------------- /utils/span_heuristic.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2020 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | """ 7 | Heuristic for finding a span in some passage that's close to the golden span. 
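The heuristic works in two stages: first, a sliding window of n-grams over the passage is
compared against the gold answer with difflib's SequenceMatcher to find an approximately
matching region; then every sub-span of that region is scored with token-level F1
(see evaluate_qa.py) and the best-scoring sub-span is returned together with its F1.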
8 | """ 9 | 10 | from difflib import SequenceMatcher as SM 11 | import re 12 | import string 13 | from typing import List, Tuple 14 | 15 | from nltk.util import ngrams 16 | 17 | from evaluate_qa import compute_f1, compute_f1_from_tokens, get_tokens, normalize_answer 18 | 19 | 20 | ARTICLES_RE = re.compile(r'\b(a|an|the)\b', re.UNICODE) 21 | EXCLUDED_PUNCTS = set(string.punctuation) 22 | 23 | 24 | def _find_approximate_matching_sequence(context: str, target: str) -> Tuple[str, float]: 25 | """Find some substring in the context which closely matches the target, returning this substring with a score.""" 26 | if target in context: 27 | return target, 1.0 28 | 29 | target_length = len(target.split()) 30 | max_sim_val = 0 31 | max_sim_string = '' 32 | seq_matcher = SM() 33 | seq_matcher.set_seq2(target) 34 | for ngram in ngrams(context.split(), target_length + int(0.05 * target_length)): 35 | candidate_ngram = ' '.join(ngram) 36 | seq_matcher.set_seq1(candidate_ngram) 37 | similarity = seq_matcher.quick_ratio() 38 | if similarity > max_sim_val: 39 | max_sim_val = similarity 40 | max_sim_string = candidate_ngram 41 | if similarity == 1.0: 42 | # early exiting 43 | break 44 | 45 | return max_sim_string, max_sim_val 46 | 47 | 48 | def _normalize_tokens(tokens: List[str], keep_empty_str=True) -> List[str]: 49 | """ 50 | Normalize individual tokens. 51 | 52 | If keep_empty_str is True, this keeps the overall number of tokens the same. 53 | A particular token could be normalized to an empty string. 54 | """ 55 | normalized_tokens = [] 56 | for token in tokens: 57 | token = token.lower() 58 | token = ''.join(ch for ch in token if ch not in EXCLUDED_PUNCTS) 59 | token = re.sub(ARTICLES_RE, '', token) 60 | if keep_empty_str or len(token): 61 | normalized_tokens.append(token) 62 | 63 | return normalized_tokens 64 | 65 | 66 | def find_closest_span_match(passage: str, gold_answer: str) -> Tuple[str, float]: 67 | """Heuristic for finding the closest span in a passage relative to some golden answer based on F1 score.""" 68 | closest_encompassing_span, closest_encompassing_span_score = _find_approximate_matching_sequence(passage, gold_answer) 69 | closest_encompassing_span_tok = closest_encompassing_span.split() 70 | gold_answer_tok = gold_answer.split() 71 | closest_encompassing_span_tok_normalized = _normalize_tokens(closest_encompassing_span_tok) 72 | gold_answer_tok_normalized = _normalize_tokens(gold_answer_tok, keep_empty_str=False) 73 | 74 | best_span, best_score, best_i, best_j = '', 0, None, None 75 | for i in range(0, len(closest_encompassing_span_tok_normalized)): 76 | for j in range(i + 1, len(closest_encompassing_span_tok_normalized) + 1): 77 | score = compute_f1_from_tokens( 78 | gold_answer_tok_normalized, 79 | [t for t in closest_encompassing_span_tok_normalized[i:j] if len(t)], 80 | ) 81 | if score > best_score: 82 | best_score = score 83 | best_i, best_j = i, j 84 | 85 | best_span = ' '.join(closest_encompassing_span_tok[best_i:best_j]) 86 | best_f1 = compute_f1(gold_answer, best_span) 87 | return best_span, best_f1 88 | --------------------------------------------------------------------------------