├── .gitignore ├── LICENSE ├── MODEL_LICENSE ├── README.md ├── README_zh.md ├── arguments.py ├── assets ├── cases │ ├── 0.png │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ └── 9.png └── main_process.png ├── cli_demo.py ├── data ├── nq_open.jsonl ├── trivia_qa.jsonl └── web_questions.jsonl ├── download.py ├── evaluate.py ├── evaluate ├── __init__.py ├── eval.py └── triviaqa.py ├── model ├── __init__.py ├── modeling_webglm.py ├── retriever │ ├── __init__.py │ ├── extracting │ │ ├── __init__.py │ │ ├── extracting_by_bs4.py │ │ └── html2text.py │ ├── fetching │ │ ├── __init__.py │ │ └── playwright_based_crawl_new.py │ ├── filtering │ │ ├── __init__.py │ │ └── contriver.py │ └── searching │ │ ├── __init__.py │ │ ├── bing_search.py │ │ ├── searcher.py │ │ └── serpapi.py ├── stopwords │ ├── english │ └── explaination └── utils.py ├── requirements.txt ├── scripts ├── nq_open.sh ├── triviaqa.sh └── web_questions.sh ├── train_retriever.py └── web_demo.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | webglm_data 3 | error.html 4 | TODO.md 5 | *.ipynb 6 | download/ 7 | %* 8 | retriever_runs -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright Hanyu Lai, Hao Yu, Xiao Liu 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /MODEL_LICENSE: -------------------------------------------------------------------------------- 1 | 1. Definitions 2 | 3 | “Licensor” means the WebGLM Team that distributes its Software. 4 | 5 | “Software” means the WebGLM model parameters and data made available under this license. 6 | 7 | 2. License Grant 8 | 9 | Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes. 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 12 | 13 | 3. Restriction 14 | 15 | You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes. 
16 | 17 | You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings. 18 | 19 | 4. Disclaimer 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | 23 | 5. Limitation of Liability 24 | 25 | EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 26 | 27 | 6. Dispute Resolution 28 | 29 | This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. 30 | 31 | Note that the license is subject to update to a more comprehensive version. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
📃 Paper (KDD'23) • 🌐 中文 README • 🤗 HF Repo [WebGLM-10B] [WebGLM-2B] • 📚 Dataset [WebGLM-QA]
4 | 5 | This is the official implementation of WebGLM. If you find our open-sourced efforts useful, please 🌟 the repo to encourage our further development! 6 | 7 | 8 | 9 | **[Please click to watch the demo!]** 10 | 11 | [Watch the demo on YouTube](https://www.youtube.com/watch?v=ohjrlYCLLEU) 12 | 13 | _Read this in [中文](README_zh.md)._ 14 | 15 | ## Update 16 | **[2023/06/25]** Release [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), an updated version of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) which introduces several new features: 17 | 18 | 1. **Stronger Performance**: we have fully upgraded the ChatGLM2-6B. It uses the hybrid objective function of [GLM](https://github.com/THUDM/GLM), and has undergone pre-training with 1.4T bilingual tokens and human preference alignment training. The [evaluation results](README.md#evaluation-results) show that, compared to the first-generation model, ChatGLM2-6B has achieved substantial improvements in performance on datasets like MMLU (+23%), CEval (+33%), GSM8K (+571%), BBH (+60%), showing strong competitiveness among models of the same size. 19 | 2. **Longer Context**: Based on [FlashAttention](https://github.com/HazyResearch/flash-attention) technique, we have extended the context length of the base model from 2K in ChatGLM-6B to 32K, and trained with a context length of 8K during the dialogue alignment, allowing for more rounds of dialogue. However, the current version of ChatGLM2-6B has limited understanding of single-round ultra-long documents, which we will focus on optimizing in future iterations. 20 | 3. **More Efficient Inference**: Based on [Multi-Query Attention](http://arxiv.org/abs/1911.02150) technique, ChatGLM2-6B has more efficient inference speed and lower GPU memory usage: under the official implementation, the inference speed has increased by 42% compared to the first generation; under INT4 quantization, the dialogue length supported by 6G GPU memory has increased from 1K to 8K. 
21 | 22 | For more details, please refer to [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B). 23 | 24 | 25 | 26 | - [Overview](#overview) 27 | - [Features](#features) 28 | - [News](#news) 29 | - [Preparation](#preparation) 30 | - [Prepare Code and Environments](#prepare-code-and-environments) 31 | - [Prepare SerpAPI Key](#prepare-serpapi-key) 32 | - [Prepare Retriever Checkpoint](#prepare-retriever-checkpoint) 33 | - [Try WebGLM](#try-webglm) 34 | - [Export Environment Variables](#export-environment-variables) 35 | - [Run as Command Line Interface](#run-as-command-line-interface) 36 | - [Run as Web Service](#run-as-web-service) 37 | - [Train WebGLM](#train-webglm) 38 | - [Train Generator](#train-generator) 39 | - [Prepare Data](#prepare-data-webglm-qa) 40 | - [Training](#training) 41 | - [Train Retriever](#train-retriever) 42 | - [Prepare Data](#prepare-data) 43 | - [Training](#training-1) 44 | - [Evaluation](#evaluation) 45 | - [Real Application Cases](#real-application-cases) 46 | - [Citation](#citation) 47 | 48 | # Overview 49 | 50 |  51 | 52 | WebGLM aspires to provide an efficient and cost-effective web-enhanced question-answering system using the 10-billion-parameter General Language Model (GLM). It aims to improve real-world application deployment by integrating web search and retrieval capabilities into the pre-trained language model. 53 | 54 | ## Features 55 | 56 | - **LLM-augmented Retriever**: Enhances the retrieval of relevant web content to better aid in answering questions accurately. 57 | - **Bootstrapped Generator**: Generates human-like responses to questions, leveraging the power of the GLM to provide refined answers. 58 | - **Human Preference-aware Scorer**: Estimates the quality of generated responses by prioritizing human preferences, ensuring the system produces useful and engaging content. 59 | 60 | # News 61 | 62 | - **[2023-06-24]** We support searching via [Bing](https://www.bing.com/) now! 
63 | - **[2023-06-14]** We release our code and the [paper](https://arxiv.org/pdf/2306.07906.pdf) of WebGLM! 64 | 65 | # Preparation 66 | 67 | ## Prepare Code and Environments 68 | 69 | Clone this repo and install the Python requirements. 70 | 71 | ```bash 72 | pip install -r requirements.txt 73 | ``` 74 | 75 | Install Node.js. 76 | 77 | ```bash 78 | apt install nodejs # If you use Ubuntu 79 | ``` 80 | 81 | Install playwright dependencies. 82 | 83 | ```bash 84 | playwright install 85 | ``` 86 | 87 | If browser environments are not installed on your host, you need to install them. Don't worry: if that is the case, playwright will give you instructions the first time you run it. 88 | 89 | ## Prepare SerpAPI Key 90 | 91 | In the search process, we use SerpAPI to get search results. You need to get a SerpAPI key from [here](https://serpapi.com/). 92 | 93 | Then, set the environment variable `SERPAPI_KEY` to your key. 94 | 95 | ```bash 96 | export SERPAPI_KEY="YOUR KEY" 97 | ``` 98 | 99 | Alternatively, you can use Bing search with a local browser environment (playwright). You can add `--searcher bing` to the start command to use Bing search. (See [Run as Command Line Interface](#run-as-command-line-interface) and [Run as Web Service](#run-as-web-service)) 100 | 101 | ## Prepare Retriever Checkpoint 102 | 103 | Download the checkpoint from [ModelScope](https://www.modelscope.cn/models/shawliu9/webglm-contriever) by running the command line below. 104 | 105 | You can manually specify the path to save the checkpoint by `--save SAVE_PATH`. 106 | 107 | ```bash 108 | python download.py retriever-pretrained-checkpoint 109 | ``` 110 | 111 | # Try WebGLM 112 | 113 | Before you run the code, make sure that your device has enough free space. 114 | 115 | ## Export Environment Variables 116 | 117 | Export the environment variable `WEBGLM_RETRIEVER_CKPT` to the path of the retriever checkpoint. If you have downloaded the retriever checkpoint to the default path, you can simply run the command line below. 
118 | 119 | ```bash 120 | export WEBGLM_RETRIEVER_CKPT=./download/retriever-pretrained-checkpoint 121 | ``` 122 | 123 | ## Run as Command Line Interface 124 | 125 | You can try WebGLM-2B model by: 126 | 127 | ```bash 128 | python cli_demo.py -w THUDM/WebGLM-2B 129 | ``` 130 | 131 | Or directly for WebGLM-10B model: 132 | 133 | ```bash 134 | python cli_demo.py 135 | ``` 136 | 137 | If you want to use Bing search instead of SerpAPI, you can add `--searcher bing` to the command line, for example: 138 | 139 | ```bash 140 | python cli_demo.py -w THUDM/WebGLM-2B --searcher bing 141 | ``` 142 | 143 | ## Run as Web Service 144 | 145 | Run `web_demo.py` with the same arguments as `cli_demo.py` to start a web service. 146 | For example, you can try WebGLM-2B model with Bing search by: 147 | 148 | ```bash 149 | python web_demo.py -w THUDM/WebGLM-2B --searcher bing 150 | ``` 151 | 152 | # Train WebGLM 153 | 154 | ## Train Generator 155 | 156 | ### Prepare Data (WebGLM-QA) 157 | 158 | Download the training data (WebGLM-QA) on [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/d/d290dcfc92e342f9a017/) by running the command line below. 159 | 160 | ```bash 161 | python download.py generator-training-data 162 | ``` 163 | 164 | It will automatically download all the data and preprocess them into the seq2seq form that can be used immediately in `./download`. 165 | 166 | ### Training 167 | 168 | Please refer to [GLM repo](https://github.com/THUDM/GLM#train-with-your-own-data) for seq2seq training. 169 | 170 | ## Train Retriever 171 | 172 | ### Prepare Data 173 | 174 | Download the training data on [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/d/3927b67a834c475288e2/) by running the command line below. 175 | 176 | ```bash 177 | python download.py retriever-training-data 178 | ``` 179 | 180 | ### Training 181 | 182 | Run the following command line to train the retriever. If you have downloaded the retriever training data in the default path, you can simply run the command line below. 
183 | 184 | ```bash 185 | python train_retriever.py --train_data_dir ./download/retriever-training-data 186 | ``` 187 | 188 | # Evaluation 189 | 190 | You can reproduce our results on TriviaQA, WebQuestions and NQ Open. Taking TriviaQA as an example, you can simply run the command line below: 191 | 192 | ```bash 193 | bash scripts/triviaqa.sh 194 | ``` 195 | 196 | and start running the experiment. 197 | 198 | # Real Application Cases 199 | 200 | [Here](assets/cases) you can see some examples of WebGLM real application scenarios. 201 | 202 |📃 论文 (KDD 2023) 4 | 5 | 本项目为 WebGLM 的官方实现。 6 | 7 | https://github.com/THUDM/WebGLM/assets/129033897/d2e1dd35-6340-4175-ac2d-fd585daa17cf 8 | 9 | _Read this in [English](README.md)._ 10 | 11 | 12 | 13 | - [概述](#概述) 14 | - [特点](#特点) 15 | - [开发准备](#开发准备) 16 | - [准备代码和环境](#准备代码和环境) 17 | - [准备 SerpAPI 密钥](#准备serpapi密钥) 18 | - [下载检索器权重](#下载检索器权重) 19 | - [尝试 WebGLM](#尝试webglm) 20 | - [导出环境变量](#导出环境变量) 21 | - [以命令行界面运行](#以命令行界面运行) 22 | - [以 Web 服务形式运行](#以Web服务形式运行) 23 | - [训练 WebGLM](#训练webglm) 24 | - [训练生成器](#训练生成器) 25 | - [准备数据](#准备数据) 26 | - [训练](#训练) 27 | - [训练检索器](#训练检索器) 28 | - [准备数据](#准备数据-1) 29 | - [训练](#训练-1) 30 | - [评测](#评测) 31 | - [实际应用案例](#实际应用案例) 32 | - [引用](#引用) 33 | 34 | # 概述 35 | 36 |  37 | 38 | WebGLM 旨在使用 100 亿参数的通用语言模型(GLM)提供一种高效且低成本的网络增强问答系统。它旨在通过将网络搜索和召回功能集成到预训练的语言模型中以进行实际应用的部署。 39 | 40 | ## 特点 41 | 42 | - **大模型增强检索器**:增强了相关网络内容的检索能力,以更好地准确回答问题。 43 | - **自举生成器**:利用 GLM 的能力为问题生成回复,提供详细的答案。 44 | - **基于人类偏好的打分器**:通过优先考虑人类偏好来评估生成回复的质量,确保系统能够产生有用和吸引人的内容。 45 | 46 | # 开发准备 47 | 48 | ## 准备代码和环境 49 | 50 | 克隆此仓库,并安装所需第三方库 51 | 52 | ```bash 53 | pip install -r requirements.txt 54 | ``` 55 | 56 | 安装 Nodejs。 57 | 58 | ```bash 59 | apt install nodejs # 如果你使用Ubuntu 60 | ``` 61 | 62 | 安装 playwright 依赖项。 63 | 64 | ```bash 65 | playwright install 66 | ``` 67 | 68 | 如果你的主机中没有安装浏览器环境,则需要安装。不用担心,如果是这种情况,playwright 会在首次执行时出现说明。 69 | 70 | ## 准备 SerpAPI 密钥 71 | 72 | 在搜索过程中,我们使用 SerpAPI 获取搜索结果。你需要从[这里](https://serpapi.com/)获取 SerpAPI 密钥。 73 | 74 
| 然后将环境变量`SERPAPI_KEY`设置为你的密钥。 75 | 76 | 或者,你可以通过 playwright 使用 Bing。你可以在 WebGLM 的启动命令行中添加 `--searcher bing` 以使用 Bing 搜索。 77 | 78 | ```bash 79 | export SERPAPI_KEY="YOUR KEY" 80 | ``` 81 | 82 | ## 下载检索器权重 83 | 84 | 通过运行以下命令从[清华云](https://cloud.tsinghua.edu.cn/d/54056861b2f34bbfb3f9/)下载检索器的权重。 85 | 86 | 你可以通过 `--save SAVE_PATH` 手动指定检索器权重的保存路径。 87 | 88 | ```bash 89 | python download.py retriever-pretrained-checkpoint 90 | ``` 91 | 92 | # 尝试 WebGLM 93 | 94 | 在运行代码之前,请确保你的设备空间足够。 95 | 96 | ## 导出环境变量 97 | 98 | 将环境变量`WEBGLM_RETRIEVER_CKPT`设定为检索器权重的路径。如果你已将检索器权重下载到默认路径,可以直接运行以下命令行。 99 | 100 | ```bash 101 | export WEBGLM_RETRIEVER_CKPT=./download/retriever-pretrained-checkpoint 102 | ``` 103 | 104 | ## 以命令行界面运行 105 | 106 | 你可以尝试 WebGLM-2B 模型: 107 | 108 | ```bash 109 | python cli_demo.py -w THUDM/WebGLM-2B 110 | ``` 111 | 112 | 或直接尝试 WebGLM-10B 模型: 113 | 114 | ```bash 115 | python cli_demo.py 116 | ``` 117 | 118 | 如果你想使用 Bing 搜索而不是 SerpAPI,可以在命令行中添加 `--searcher bing`,例如: 119 | 120 | ```bash 121 | python cli_demo.py -w THUDM/WebGLM-2B --searcher bing 122 | ``` 123 | 124 | ## 以 Web 服务形式运行 125 | 126 | 使用与 `cli_demo.py` 相同的参数运行 `web_demo.py`。例如,你可以通过 Bing 搜索使用 WebGLM-2B 模型: 127 | 128 | ```bash 129 | python web_demo.py -w THUDM/WebGLM-2B --searcher bing 130 | ``` 131 | 132 | # 训练 WebGLM 133 | 134 | ## 训练生成器 135 | 136 | ### 准备数据 137 | 138 | 运行下面的命令行从[清华云](https://cloud.tsinghua.edu.cn/d/ae204894f2e842f19a3f/)下载训练数据。 139 | 140 | ```bash 141 | python download.py generator-training-data 142 | ``` 143 | 144 | 它将自动下载所有数据,并将它们预处理成可以立即在`./download`中使用的 seq2seq 格式。 145 | 146 | ### 训练 147 | 148 | 请参考[GLM 仓库](https://github.com/THUDM/GLM#train-with-your-own-data)进行 seq2seq 训练。 149 | 150 | ## 训练检索器 151 | 152 | ### 准备数据 153 | 154 | 通过运行以下命令行,从[清华云](https://cloud.tsinghua.edu.cn/d/fa5e6eb1afac4f08a4c6/)下载训练数据。 155 | 156 | ```bash 157 | python download.py retriever-training-data 158 | ``` 159 | 160 | ### 训练 161 | 162 | 运行以下命令行来训练检索器。如果你已经在默认路径下载了检索器训练数据,可以直接运行以下命令行。 163 | 164 | ```bash 165 | 
python train_retriever.py --train_data_dir ./download/retriever-training-data 166 | ``` 167 | 168 | # 评测 169 | 170 | 你可以在 TriviaQA、WebQuestions 和 NQ Open 上重现我们的结果。以 TriviaQA 为例,可以运行以下命令行: 171 | 172 | ```bash 173 | bash scripts/triviaqa.sh 174 | ``` 175 | 176 | 并开始进行评测。 177 | 178 | # 真实应用案例 179 | 180 | 您可以在[这里](assets/cases)查看一些 WebGLM 实际应用场景的示例。 181 | 182 | # 引用 183 | 184 | 如果您针对您的研究使用了这个代码,请引用我们的论文。 185 | 186 | ``` 187 | @misc{liu2023webglm, 188 | title={WebGLM: Towards An Efficient Web-Enhanced Question Answering System with Human Preferences}, 189 | author={Xiao Liu and Hanyu Lai and Hao Yu and Yifan Xu and Aohan Zeng and Zhengxiao Du and Peng Zhang and Yuxiao Dong and Jie Tang}, 190 | year={2023}, 191 | eprint={2306.07906}, 192 | archivePrefix={arXiv}, 193 | primaryClass={cs.CL} 194 | } 195 | ``` 196 | 197 | > 该仓库已进行简化以便于部署。 198 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def add_model_config_args(parser): 4 | """Model arguments""" 5 | parser.add_argument("-w", "--webglm_ckpt_path", type=str, default=None, help="path to the webglm checkpoint, default to $WEBGLM_CKPT or THUDM/WebGLM") 6 | 7 | parser.add_argument("-r", "--retriever_ckpt_path", type=str, default=None, help="path to the retriever checkpoint, default to $WEBGLM_RETRIEVER_CKPT") 8 | 9 | parser.add_argument("-d", "--device", type=str, default="cuda", help="device to run the model, default to cuda") 10 | 11 | parser.add_argument("-b", "--filter_max_batch_size", type=int, default=50, help="max batch size for the retriever, default to 50") 12 | 13 | parser.add_argument("-s", "--serpapi_key", type=str, default=None, help="serpapi key for the searcher, default to $SERPAPI_KEY") 14 | parser.add_argument("--searcher", type=str, default="serpapi", help="searcher to use (serpapi or bing), default to serpapi") 15 | 16 | return parser 17 | 18 | def 
add_evaluation_args(parser): 19 | """Evaluation arguments""" 20 | parser.add_argument("-t", "--task", type=str, default=None, help="evaluate task, choose from nq_open, web_questions, triviaqa") 21 | 22 | parser.add_argument("-p", "--evaluate_task_data_path", type=str, default=None, help="data path of the evaluate task") 23 | 24 | return parser 25 | 26 | def get_args(args_list=None, parser=None): 27 | """Parse all the args.""" 28 | if parser is None: 29 | parser = argparse.ArgumentParser(description='webglm') 30 | else: 31 | assert isinstance(parser, argparse.ArgumentParser) 32 | 33 | parser = add_model_config_args(parser) 34 | parser = add_evaluation_args(parser) 35 | 36 | return parser.parse_args() -------------------------------------------------------------------------------- /assets/cases/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/0.png -------------------------------------------------------------------------------- /assets/cases/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/1.png -------------------------------------------------------------------------------- /assets/cases/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/2.png -------------------------------------------------------------------------------- /assets/cases/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/3.png -------------------------------------------------------------------------------- /assets/cases/4.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/4.png -------------------------------------------------------------------------------- /assets/cases/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/5.png -------------------------------------------------------------------------------- /assets/cases/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/6.png -------------------------------------------------------------------------------- /assets/cases/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/7.png -------------------------------------------------------------------------------- /assets/cases/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/8.png -------------------------------------------------------------------------------- /assets/cases/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/9.png -------------------------------------------------------------------------------- /assets/main_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/main_process.png 
"""Interactive command-line demo: ask questions and print cited answers."""
from model import load_model, citation_correction
import argparse
from arguments import add_model_config_args


def main():
    """Load the model from CLI options and run the question/answer loop.

    The loop ends on an empty line, on the literal input ``quit``, or when
    stdin is closed / the user presses Ctrl-C at the prompt.
    """
    arg = argparse.ArgumentParser()
    add_model_config_args(arg)
    args = arg.parse_args()

    webglm = load_model(args)

    while True:
        try:
            question = input("[Enter to Exit] >>> ")
        except (EOFError, KeyboardInterrupt):
            # Closed/piped stdin or Ctrl-C at the prompt is a normal way to
            # leave an interactive session; exit without a traceback.
            print()
            break

        question = question.strip()
        if not question or question == "quit":
            break

        # stream_query yields partial result dicts; accumulate them so the
        # references are still available when the answer is printed.
        final_results = {}
        for results in webglm.stream_query(question):
            final_results.update(results)
            if "references" in results:
                for ix, ref in enumerate(results["references"], start=1):
                    print("Reference [%d](%s): %s" % (ix, ref['url'], ref['text']))
            if "answer" in results:
                reference_texts = [ref['text'] for ref in final_results["references"]]
                print("\n%s\n" % citation_correction(results["answer"], reference_texts))


if __name__ == '__main__':
    main()
{"question": "when did frank sinatra first sing new york new york", "answer": ["1980", "1979"]} 9 | {"question": "who made the poppies at tower of london", "answer": ["Paul Cummins", "Tom Piper", "stage designer Tom Piper"]} 10 | {"question": "what album is help by the beatles on", "answer": ["Help!"]} 11 | {"question": "when did taylor swift's first album release", "answer": ["October 24, 2006", "2005"]} 12 | {"question": "architectural elements forming rib vaults eg wells cathedral", "answer": ["an armature of piped masonry", "barrel vaults", "two to three barrel vaults"]} 13 | {"question": "when did the rational dress society begin to work", "answer": ["1881"]} 14 | {"question": "when did the study of media effects begin", "answer": ["1919", "1975"]} 15 | {"question": "what is the order of the netflix marvel shows", "answer": ["Marvel's Iron Fist", "Marvel's Daredevil", "Marvel's The Punisher", "Marvel's Jessica Jones", "Marvel's The Defenders", "Marvel's Luke Cage"]} 16 | {"question": "when was the south asian association for regional co-operation (saarc) formed", "answer": ["December 1985", "8 December 1985"]} 17 | {"question": "who sang i ran all the way home", "answer": ["The Impalas"]} 18 | {"question": "what are the active materials of a lead acid battery", "answer": ["Lead", "sulfuric acid", "Lead and lead dioxide", "lead dioxide"]} 19 | {"question": "last episode of what happens to my family", "answer": ["53"]} 20 | {"question": "who is the first indian woman to be canonized as a saint", "answer": ["Saint Alphonsa"]} 21 | {"question": "who was the french chef given credit for developing the classic kitchen brigade", "answer": ["Georges Auguste Escoffier"]} 22 | {"question": "who wins the next iron chef super chefs", "answer": ["Zakarian", "Geoffrey Zakarian"]} 23 | {"question": "what is the population of st petersburg fl", "answer": ["260,999", "257,083"]} 24 | {"question": "yeh hai mohabbatein serial star cast real name", "answer": ["Divyanka Tripathi 
and Karan Patel"]} 25 | {"question": "what episode does goku give up against cell", "answer": ["165", "180"]} 26 | {"question": "where did the battle of corinth take place", "answer": ["in Corinth, Mississippi", "Corinth, Mississippi"]} 27 | {"question": "what is the angle of the tower of pisa", "answer": ["about 3.99 degrees", "3.99 degrees", "at about 3.99 degrees"]} 28 | {"question": "who plays the dad in pretty in pink", "answer": ["Harry Dean Stanton"]} 29 | {"question": "who turns into a bear in the hobbit", "answer": ["Beorn"]} 30 | {"question": "who won last year's ncaa women's basketball", "answer": ["South Carolina"]} 31 | {"question": "who has the most followers in the world on instagram", "answer": ["Instagram's own account", "Instagram"]} 32 | {"question": "who wrote knock knock knocking on heavens door", "answer": ["Bob Dylan"]} 33 | {"question": "who did bette midler portray in the rose", "answer": ["Mary Rose Foster"]} 34 | {"question": "when is the last time the vikings were in the nfc championship", "answer": ["1976", "2017/18"]} 35 | {"question": "what is the name of the hyena in lion king", "answer": ["Banzai", "Shenzi", "Ed"]} 36 | {"question": "who sang the song good morning good morning", "answer": ["Gene Kelly", "Donald O'Connor", "Judy Garland", "Debbie Reynolds", "Mickey Rooney"]} 37 | {"question": "who is the lead singer of depeche mode", "answer": ["David Gahan"]} 38 | {"question": "where does the path train stop in newark", "answer": ["Newark Penn Station"]} 39 | {"question": "when is the met office leaving the bbc", "answer": ["31 March 2018"]} 40 | {"question": "when does the miz and maryse show start", "answer": ["2018"]} 41 | {"question": "how many seasons of vampire diaries r there", "answer": ["eight", "8"]} 42 | {"question": "who played the twins in darling buds of may", "answer": ["Christina Giles", "Katherine Giles"]} 43 | {"question": "coldplay song i will try to fix you", "answer": ["\"Fix You\""]} 44 | {"question": "what is 
upstream project in oil and gas", "answer": ["drilling exploratory wells"]} 45 | {"question": "where does no game no life anime end", "answer": ["the Elkia Federation", "the sixth volume"]} 46 | {"question": "when did ole miss beat alabama in football", "answer": ["October 3, 1970", "September 11, 1976", "October 13, 2001", "October 27, 1894", "October 18, 2003", "November 5, 1910", "October 8, 1988", "October 4, 2014", "September 19, 2015", "October 5, 1968"]} 47 | {"question": "how many games in a row have the uconn women's basketball team won", "answer": ["111 straight wins", "111", "90"]} 48 | {"question": "what's the population of prince edward island", "answer": ["142,907 residents", "142,907"]} 49 | {"question": "where do you get a cashiers check from", "answer": ["a bank", "bank"]} 50 | {"question": "rizal finished all the chapters of the novel noli me tangere in", "answer": ["December 1886", "Spanish"]} 51 | {"question": "how much money did it cost to make gta v", "answer": ["137"]} 52 | {"question": "who does stefan marry in the vampire diaries", "answer": ["Caroline Forbes"]} 53 | {"question": "who won the award for best goalkeeper in football world cup 2006", "answer": ["Gianluigi Buffon"]} 54 | {"question": "who has been appointed as the election commissioner of india", "answer": ["Om Prakash Rawat"]} 55 | {"question": "when is season 3 of grace and frankie being released", "answer": ["March 24, 2017"]} 56 | {"question": "who plays the robot on the orville show", "answer": ["Mark Jackson"]} 57 | {"question": "when was the latest version of chrome released", "answer": ["2018-01-22"]} 58 | {"question": "who plays timon in lion king on broadway", "answer": ["Max Casella"]} 59 | {"question": "where do the sharks play in san jose", "answer": ["the SAP Center", "SAP Center", "SAP Center at San Jose"]} 60 | {"question": "who was the famous scientist that ran the research lab moseley went to in manchester", "answer": ["Sir Ernest Rutherford"]} 61 | 
{"question": "what grade was arnold from hey arnold in", "answer": ["fourth"]} 62 | {"question": "who sings every light in the house is on", "answer": ["Trace Adkins"]} 63 | {"question": "what are the ranks in the us navy", "answer": ["E-8s senior chief petty officer", "E-9s master chief petty officer"]} 64 | {"question": "who controlled the house and the senate in 2012", "answer": ["Republican", "Democratic"]} 65 | {"question": "who plays auggie in the movie the wonder", "answer": ["Jacob Tremblay"]} 66 | {"question": "who is the king and queen of the netherlands", "answer": ["Queen Máxima of the Netherlands", "King Willem-Alexander"]} 67 | {"question": "how many breeds of pigs are there in the uk", "answer": ["---"]} 68 | {"question": "who does demetrius love in a midsummer night dream", "answer": ["Helena", "Hermia"]} 69 | {"question": "where is arachidonic acid found in the body", "answer": ["brain", "muscles", "liver"]} 70 | {"question": "who wrote he ain't heavy he's my brother lyrics", "answer": ["Bobby Scott", "Bob Russell"]} 71 | {"question": "when was the last time the jets won a playoff game", "answer": ["2010"]} 72 | {"question": "who was kat slater's sisters in eastenders", "answer": ["Zoe", "Little Mo", "Lynne"]} 73 | {"question": "what was tom hanks character name in castaway", "answer": ["Chuck Noland"]} 74 | {"question": "how many seasons of rules of engagement is there", "answer": ["7", "seven"]} 75 | {"question": "the outer layer of the skin that contains no blood or nerve supply is the", "answer": ["epidermis"]} 76 | {"question": "points on a sphere or angles in a circle are measured in units called", "answer": ["radians"]} 77 | {"question": "when did they stop cigarette advertising on television", "answer": ["January 2, 1971", "1970"]} 78 | {"question": "when is the publishers clearing house sweepstakes drawing", "answer": ["just after the Super Bowl"]} 79 | {"question": "where is the capital city of alabama located", "answer": ["Montgomery"]} 
80 | {"question": "epidemiologists attempt to explain the link between health and variables such as", "answer": ["biological agents", "disease conditions in defined populations", "smoking", "stress", "chemicals", "alcohol"]} 81 | {"question": "what year is it for the jewish calendar", "answer": ["AM 5778", "5778"]} 82 | {"question": "batman the enemy with episode 5 release date", "answer": ["March 27, 2018"]} 83 | {"question": "who plays gram on the young and the restless", "answer": ["Max Shippee"]} 84 | {"question": "when was the term social justice first used", "answer": ["the 1840s", "1840s"]} 85 | {"question": "how much for a passport in the philippines", "answer": ["$60 abroad", "₱950"]} 86 | {"question": "who plays joker in batman the dark knight", "answer": ["Ledger"]} 87 | {"question": "when was the minimum wage established in the united states", "answer": ["1938", "1933", "1912"]} 88 | {"question": "where was the first session of the assam association held in1905", "answer": ["Guwahati"]} 89 | {"question": "who plays the saint of killers on preacher", "answer": ["Graham McTavish"]} 90 | {"question": "when did skiing halfpipe become an olympic event", "answer": ["2014"]} 91 | {"question": "derek and meredith get back together season 3", "answer": ["Staring at the Sun"]} 92 | {"question": "who played the mom on what's eating gilbert grape", "answer": ["Darlene Cates"]} 93 | {"question": "name two fibres which are made of proteins", "answer": ["feathers", "hair", "wool", "fur", "silk"]} 94 | {"question": "what year does the quiet man take place", "answer": ["the 1920s", "In the 1920s"]} 95 | {"question": "when did mcdonald's sell 1 million burgers", "answer": ["By 1965"]} 96 | {"question": "who won battle of the sexes tennis game", "answer": ["Billie Jean King"]} 97 | {"question": "who won the battle of the first battle of bull run", "answer": ["Confederate victory", "Confederate forces", "Confederate"]} 98 | {"question": "swan lake the sleeping beauty and 
the nutcracker are three famous ballets by", "answer": ["Pyotr Ilyich Tchaikovsky"]} 99 | {"question": "when does the new gotham season come out", "answer": ["September 21, 2017", "September 21, 2017"]} 100 | {"question": "where do they put the tomb vampires in order to burn them during founders day", "answer": ["the Gilbert building"]} 101 | {"question": "what is the meaning of the dragon boat festival", "answer": ["commemorating fealty and filial piety"]} 102 | {"question": "who played ice queen in chronicles of narnia", "answer": ["Tilda Swinton", "Laura Brent"]} 103 | {"question": "who made the song falling in love with you", "answer": ["Hugo Peretti", "George David Weiss", "Elvis Presley", "Luigi Creatore"]} 104 | {"question": "who plays artemisia in 300 rise of an empire", "answer": ["Caitlin Carmichael", "Eva Green", "Jade Chynoweth"]} 105 | {"question": "which is produced in plants of narora kakrapar tarapur", "answer": ["Atomic Power"]} 106 | {"question": "who is the team that beat the eagles this season", "answer": ["Dallas Cowboys", "Seattle Seahawks", "Kansas City Chiefs"]} 107 | {"question": "who played amy grant i i can only imagine", "answer": ["Nicole DuPort"]} 108 | {"question": "when was the first australian prime minister elected", "answer": ["Sir Edmund Barton", "1901"]} 109 | {"question": "what type of political system does el salvador have", "answer": ["a presidential representative democratic republic", "\"flawed democracy\"", "presidential representative democratic republic"]} 110 | {"question": "what is the current population of bora bora", "answer": ["10,605"]} 111 | {"question": "what year did the us hockey team won the olympics", "answer": ["1960", "1960 and 1980", "1980"]} 112 | {"question": "what is the oath that new citizens take", "answer": ["United States Oath of Allegiance"]} 113 | {"question": "who plays the dragon queen from game of thrones", "answer": ["Emilia Clarke"]} 114 | {"question": "element named after fictional planet 
from which superman came", "answer": ["Kryptonite"]} 115 | {"question": "who is playing the halftime show at super bowl 2016", "answer": ["Beyoncé", "Coldplay", "Bruno Mars"]} 116 | {"question": "who had a baby at 100 in the bible", "answer": ["Sarah", "Abraham"]} 117 | {"question": "who plays matthew on anne with an e", "answer": ["R. H. Thomson"]} 118 | {"question": "when did the united states host the world cup", "answer": ["1994"]} 119 | {"question": "when did mcgee became a regular on ncis", "answer": ["in season two", "season two"]} 120 | {"question": "when do primary ossification centers appear in an embryo", "answer": ["prenatal development"]} 121 | {"question": "what was the real name of saudi arabia", "answer": ["the Saudi Arab kingdom"]} 122 | {"question": "who won college basketball player of the year", "answer": ["A'ja Wilson", "Jalen Brunson"]} 123 | {"question": "when did lionel messi play his first game for barcelona", "answer": ["2002", "2001", "October 2004"]} 124 | {"question": "the group that officially elects the president of the united states is called", "answer": ["the U.S. Electoral College", "U.S. 
Electoral College"]} 125 | {"question": "where are alkali metals located on the periodic table", "answer": ["in the s-block", "group 1"]} 126 | {"question": "who did the broncos beat in the super bowl", "answer": ["Carolina Panthers", "Atlanta Falcons", "Green Bay Packers"]} 127 | {"question": "what type of government did the ming dynasty have", "answer": ["imperial rule"]} 128 | {"question": "what was the final episode of quantum leap", "answer": ["\"Mirror Image\""]} 129 | {"question": "who won the super heavyweight gold medal at the 2000 olympics", "answer": ["Audley Harrison"]} 130 | {"question": "where is lord's prayer found in bible", "answer": ["in the Gospel of Luke"]} 131 | {"question": "how many languages in harry potter translated into", "answer": ["over 74", "over 74 languages"]} 132 | {"question": "who owns the crown plaza hotel in chicago illinois", "answer": ["InterContinental Hotels Group"]} 133 | {"question": "who became the king of ayodhya after ram", "answer": ["Kusha"]} 134 | {"question": "who is under the mask of darth vader", "answer": ["Anakin Skywalker"]} 135 | {"question": "other than water what else has hydrogen bonds", "answer": ["inorganic molecules such as water"]} 136 | {"question": "who plays dusty in the movie pure country", "answer": ["George Strait"]} 137 | {"question": "what is the pirates of the caribbean in order", "answer": ["On Stranger Tides", "At World's End", "Dead Men Tell No Tales", "Dead Man's Chest"]} 138 | {"question": "who plays the dad in nanny mcphee and the big bang", "answer": ["Ewan McGregor"]} 139 | {"question": "when did the golden state warriors win the finals", "answer": ["1947", "1975", "1956", "2015", "2017"]} 140 | {"question": "what engine is in a holden v8 supercar", "answer": ["V8-engine"]} 141 | {"question": "what's the dog's name on tom and jerry", "answer": ["Spike"]} 142 | {"question": "who is edmund on days of our lives", "answer": ["Adam Caine"]} 143 | {"question": "who is opening for little mix 
glory days tour", "answer": ["Sheppard", "Zoe Badwi", "Louisa Johnson", "Bronnie", "The Vamps", "Ella Eyre", "Conor Maynard"]} 144 | {"question": "who starred in an officer and a gentleman", "answer": ["Richard Gere", "David Keith", "Louis Gossett Jr.", "Debra Winger", "Phillip J. Salmon"]} 145 | {"question": "who is the actor that plays dr. sean murphy", "answer": ["Freddie Highmore"]} 146 | {"question": "what is the ethnic background of the shib sibs", "answer": ["Japanese"]} 147 | {"question": "what is the name of the first earthquake early warning system", "answer": ["1991", "The Mexican Seismic Alert System"]} 148 | {"question": "most passing yards in nfl history in a game", "answer": ["Norm Van Brocklin", "554"]} 149 | {"question": "who has won the eurovision song contest the most times", "answer": ["Ireland's Johnny Logan", "Ireland"]} 150 | {"question": "when was united nations convention on the rights of the child created", "answer": ["20 November 1989"]} 151 | {"question": "who sings the song it ain't me", "answer": ["Selena Gomez", "American singer Selena Gomez"]} 152 | {"question": "when was the biltmore house opened to the public", "answer": ["March 1930"]} 153 | {"question": "why was hong kong important to the british empire", "answer": ["a centre for international trade"]} 154 | {"question": "what is katie running from in safe haven", "answer": ["her abusive husband"]} 155 | {"question": "how tall is the actor who plays hagrid in harry potter", "answer": ["6ft 1in"]} 156 | {"question": "who proved that cells come from other cells", "answer": ["Matthias Schleiden", "Robert Hooke", "Robert Remak", "Theodor Schwann", "Rudolf Virchow"]} 157 | {"question": "where did the butchers in the slaughterhouse cases live", "answer": ["New Orleans"]} 158 | {"question": "who plays captain phasma in star wars the force awakens", "answer": ["Gwendoline Christie"]} 159 | {"question": "who sang picking up pebbles and throwing them into the sea", "answer": ["Matt 
Flinders"]} 160 | {"question": "how many episodes of corrie has there been", "answer": ["9,436"]} 161 | {"question": "who is the first president to be impeached", "answer": ["Andrew Johnson", "Johnson"]} 162 | {"question": "what is billy last name in where the red fern grows", "answer": ["Colman", "Billy Colman"]} 163 | {"question": "nuclear power plant that blew up in russia", "answer": ["Chernobyl Nuclear Power Plant", "Chernobyl", "the Chernobyl Nuclear Power Plant"]} 164 | {"question": "where does a roadrunner live in the desert", "answer": ["mountainous shrubland", "arid lowland"]} 165 | {"question": "how many pieces in a terry's chocolate orange", "answer": ["six", "20"]} 166 | {"question": "who did puerto rico belong to before the u.s", "answer": ["Taíno", "indigenous Taíno people", "Spain"]} 167 | {"question": "who made the most free throws in nba history", "answer": ["Karl Malone"]} 168 | {"question": "criminal minds what episode does jj find out she pregnant", "answer": ["in April 2011", "The Crossing"]} 169 | {"question": "who made delhi as capital for the first time", "answer": ["the Pandavas"]} 170 | {"question": "when does the champions league quarter finals start", "answer": ["16 March 2018", "3–4 April 2018"]} 171 | {"question": "who developed the concept of total quality management", "answer": ["W. 
Edwards Deming"]} 172 | {"question": "who plays at the prudential center in newark", "answer": ["Seton Hall Pirates", "New Jersey Devils"]} 173 | {"question": "when do new episodes of riverdale season 2 come out", "answer": ["February 7, 2018", "April 18, 2018", "October 11, 2017", "TBA", "January 31, 2018"]} 174 | {"question": "who sings the song i'll never forget you", "answer": ["Mariah Carey", "Zara Larsson and MNEK", "Noisettes"]} 175 | {"question": "who did america declare war on in ww1", "answer": ["Austria-Hungary"]} 176 | {"question": "where does hydrogen peroxide come from in the body", "answer": ["nearly all living cells"]} 177 | {"question": "what is the name for the ch3coo- ion", "answer": ["polyatomic anion"]} 178 | {"question": "who wrote old flames cant hold a candle to you", "answer": ["Pebe Sebert", "Pebe Sebert and Hugh Moffatt", "Patricia Rose Sebert", "Hugh Moffatt"]} 179 | {"question": "how many wars held between india and pakistan", "answer": ["four"]} 180 | {"question": "where is gall bladder situated in human body", "answer": ["beneath the liver"]} 181 | {"question": "nba record for most double doubles in a season", "answer": ["Tim Duncan"]} 182 | {"question": "where is simple squamous epithelium found in the body", "answer": ["alveoli", "outer layer of skin", "capillaries", "glomeruli"]} 183 | {"question": "who hit the first home run in the houston astrodome", "answer": ["Mickey Mantle"]} 184 | {"question": "what season does bart bass die in gossip girl", "answer": [")"]} 185 | {"question": "where does tropic of cancer pass in india", "answer": ["Chhattisgarh", "West Bengal", "Gujarat", "State of Tripura", "Jharkhand", "State of Mizoram", "Madhya Pradesh", "Rajasthan"]} 186 | {"question": "who played bat masterson in the tv series", "answer": ["Gene Barry"]} 187 | {"question": "who was originally cast to play indiana jones", "answer": ["Tom Selleck"]} 188 | {"question": "what song is played while raising the american flag", "answer": 
["Reveille", "\"Reveille\""]} 189 | {"question": "when did the the regulatory reform (fire safety) order 2005 first come into effect", "answer": ["1 October 2006"]} 190 | {"question": "how many paintings of sunflowers did van gogh paint", "answer": ["two"]} 191 | {"question": "what category was hurricane charley when it hit florida", "answer": ["4", "Category 4", "Category 4"]} 192 | {"question": "who is president of india in present time", "answer": ["Ram Nath Kovind"]} 193 | {"question": "when were the winnie the pooh books written", "answer": ["1924", "1926", "1927", "1928"]} 194 | {"question": "when was the debating club established in almora", "answer": ["1871 A.D.", "1871"]} 195 | {"question": "number 4 in roman numerals on clock faces", "answer": ["IV"]} 196 | {"question": "which country has won maximum number of gold medal in asian game 2014", "answer": ["China"]} 197 | {"question": "when does the last episode of adventure time air", "answer": ["TBA"]} 198 | {"question": "where did the dewey decimal system come from", "answer": ["Melvil Dewey"]} 199 | {"question": "where does the formation of atp take place", "answer": ["plasma membrane in bacteria", "inner mitochondrial membrane", "thylakoid membrane", "mitochondrial membrane in eukaryotes"]} 200 | {"question": "who has won the most college football national champions", "answer": ["Princeton"]} 201 | {"question": "when did amnesia the dark descent come out", "answer": ["8 September 2010"]} 202 | {"question": "where was the first colony in north america located", "answer": ["Virginia"]} 203 | {"question": "who did the minnesota vikings lose to in the super bowl", "answer": ["Pittsburgh Steelers", "Oakland Raiders", "Miami Dolphins", "Kansas City Chiefs"]} 204 | {"question": "when did the movie napoleon dynamite come out", "answer": ["June 11, 2004", "2004"]} 205 | {"question": "what is the hot coffee mod in san andreas", "answer": ["a normally inaccessible mini-game"]} 206 | {"question": "who wrote cant get 
you out of my head lyrics", "answer": ["Cathy Dennis and Rob Davis", "Rob Davis", "Cathy Dennis"]} 207 | {"question": "where does a brisket come from on a cow", "answer": ["the breast or lower chest", "breast or lower chest"]} 208 | {"question": "where did the last name wallace come from", "answer": ["a Scottish surname"]} 209 | {"question": "who are the cast members of ncis new orleans", "answer": ["Zoe McLellan", "Lucas Black", "Daryl \"Chill\" Mitchell", "Shalita Grant", "Rob Kerkovich", "Vanessa Ferlito", "Daryl Mitchell", "Scott Bakula", "CCH Pounder"]} 210 | {"question": "what type of database is library literature and information science", "answer": ["bibliographic database", "bibliographic"]} 211 | {"question": "who holds the world record for the most world records", "answer": ["Ashrita Furman"]} 212 | {"question": "who played solomon in little house on the prairie", "answer": ["Todd Bridges"]} 213 | {"question": "who did america declare war on in ww1", "answer": ["Austria-Hungary"]} 214 | {"question": "jonny cash one piece at a time car", "answer": ["Cadillac"]} 215 | {"question": "who developed the central processing unit (cpu)", "answer": ["John von Neumann"]} 216 | {"question": "when did marathon change its name to snickers", "answer": ["19 July 1990"]} 217 | {"question": "who sang let me tell you about the birds and the bees", "answer": ["Jewel Akens"]} 218 | {"question": "how old is the actress who plays phyllis on y&r", "answer": ["Gina Tognoni", "age 44"]} 219 | {"question": "when does the miz and maryse show start", "answer": ["2018"]} 220 | {"question": "who is the owner of reading football club", "answer": ["Xiu Li Dai", "Dai Xiuli", "Dai Yongge", "Yongge Dai"]} 221 | {"question": "when was the last episode of vampire diaries aired", "answer": ["March 10, 2017", "March 10, 2017"]} 222 | {"question": "when did the eagles play in the superbowl", "answer": ["February 6, 2005"]} 223 | {"question": "who does eric end up with in that 70s show", 
"answer": ["Donna"]} 224 | {"question": "who ran the fastest 40 yard dash in the nfl", "answer": ["Jakeem Grant", "John Ross"]} 225 | {"question": "who opens the church of the holy sepulchre", "answer": ["the Sunni Muslim family", "the Nusaybah family"]} 226 | {"question": "when does the miz and maryse show start", "answer": ["2018"]} 227 | {"question": "where does the last name galvez come from", "answer": ["Spanish surname", "Spanish"]} 228 | {"question": "who sang rip it up and start again", "answer": ["Scottish post-punk band Orange Juice", "Orange Juice"]} 229 | {"question": "when was the young and the restless first aired", "answer": ["March 26, 1973"]} 230 | {"question": "who is given credit for the gnu initiative", "answer": ["Richard Stallman"]} 231 | {"question": "what was the initial effect of the transition from command to market economies in eastern europe", "answer": ["Inequality of opportunity"]} 232 | {"question": "which state is located in the centre of india", "answer": ["Chhattisgarh", "Madhya Pradesh"]} 233 | {"question": "when did they replace lead with graphite in pencils", "answer": ["never contained the element lead"]} 234 | {"question": "when was the taming ofthe shrew first performed", "answer": ["prior to June 1592"]} 235 | {"question": "who wrote lyrics for phantom of the opera", "answer": ["Charles Hart", "Charles Hart and Richard Stilgoe", "Richard Stilgoe"]} 236 | {"question": "where is the tibia and fibula bone located", "answer": ["leg"]} 237 | {"question": "how many gold medals did australia win in the 2000 olympics", "answer": ["16"]} 238 | {"question": "who played gino in a place to call home", "answer": ["Aldo Mignone"]} 239 | {"question": "who is the minister of local government in zimbabwe", "answer": ["Hon July Moyo"]} 240 | {"question": "when was harry potter and the philosophers stone published", "answer": ["in 1997", "1997"]} 241 | {"question": "who are the co hosts on the real", "answer": ["Tamar Braxton", "Loni Love", 
"Adrienne Houghton", "Tamera Mowry-Housley", "Jeannie Mai"]} 242 | {"question": "how many episodes are there in dragon ball z", "answer": ["291 episodes", "291"]} 243 | {"question": "who has the power (judicial) to make decisions in courts of law", "answer": ["judges"]} 244 | {"question": "where was the killing of a sacred deer filmed", "answer": ["Cincinnati"]} 245 | {"question": "when do the new episodes of supernatural start", "answer": ["May 10, 2018"]} 246 | {"question": "what type of economic system was utilized in the soviet union", "answer": ["communism", "state ownership"]} 247 | {"question": "when did the east india company take control of india", "answer": ["in 1757", "1757", "1799", "1612"]} 248 | {"question": "when was the $1 000 bill discontinued", "answer": ["1969", "December 27, 1945", "July 14, 1969"]} 249 | {"question": "what are the three fifty shades of grey books", "answer": ["Fifty Shades of Grey", "Fifty Shades Darker", "Fifty Shades Freed"]} 250 | {"question": "who plays alec ramsay in the black stallion", "answer": ["Kelly Reno"]} 251 | {"question": "when was the last time the ducks won the stanley cup", "answer": ["(2006–07)", "2006–07"]} 252 | {"question": "product-market fit means being in a good market with a product that can satisfy that market", "answer": ["Mark Andreessen"]} 253 | {"question": "who lasted the longest in the royal rumble", "answer": ["Rey Mysterio"]} 254 | {"question": "who won the mens single ice skating 2018", "answer": ["Yuzuru Hanyu", "Javier Fernández", "Shoma Uno"]} 255 | {"question": "who played alex cross in along came a spider", "answer": ["Morgan Freeman"]} 256 | {"question": "the cast of don't tell mom the babysitter's dead", "answer": ["Kimmy Robertson", "Jeff Bollow", "John Getz", "Keith Coogan", "Joanna Cassidy", "Concetta Tomei", "Robert Hy Gorman", "David Duchovny", "Jayne Brook", "Eda Reiss Merin", "Christopher Pettiet", "Dan Castellaneta (voice)", "Josh Charles", "Danielle Harris", "Michael Kopelow", 
"Christina Applegate"]} 257 | {"question": "i was a great islamic scholar and mathematician who died in 1131 ce", "answer": ["Omar Khayyam"]} 258 | {"question": "how many seasons of the bastard executioner are there", "answer": ["one", "one season"]} 259 | {"question": "where did the butchers in the slaughterhouse cases live", "answer": ["New Orleans"]} 260 | {"question": "is parallax more pronounced with nearby stars or with distant stars", "answer": ["nearby objects", "nearby"]} 261 | {"question": "is a network connection device that can build tables that identify addresses on each network", "answer": ["routing table", "a router"]} 262 | {"question": "who sings don't take your guns to town", "answer": ["U2", "Johnny Cash"]} 263 | {"question": "who was the great wall of china built to defend against", "answer": ["nomads from Inner Asia", "nomads from Inner Asia."]} 264 | {"question": "when did canada get rid of the death penalty", "answer": ["July 14, 1976", "1976", "1998"]} 265 | {"question": "types of skiing in the winter olympics 2018", "answer": ["Slalom", "Downhill", "Super-G", "Giant slalom", "Slalom – (SC)"]} 266 | {"question": "what is final season of game of thrones", "answer": ["The eighth", "eighth", "the eighth season"]} 267 | {"question": "name the four major layers of the earth in order", "answer": ["a liquid outer core", "an outer silicate solid crust", "a highly viscous mantle", "a solid inner core"]} 268 | {"question": "who wrote it's a long long way to pasadena", "answer": ["John Young", "Harry Vanda", "David Hemmings", "George Young"]} 269 | {"question": "what is the maximum data rate for the 802.11a standard select one", "answer": ["54 Mbit/s"]} 270 | {"question": "how many gold medals did australia win in the 2000 olympics", "answer": ["16"]} 271 | {"question": "where does dividends go on cash flow statement", "answer": ["the financing activities section"]} 272 | {"question": "when did the first ice age come out", "answer": ["2002", "March 15, 
2002"]} 273 | {"question": "who is the designer in devil wears prada", "answer": ["Valentino Garavani"]} 274 | {"question": "who captained the first european ship to sail around the tip of africa", "answer": ["Bartolomeu Dias"]} 275 | {"question": "what is the baby elephants name in jungle book", "answer": ["Hathi Jr."]} 276 | {"question": "where is the left anterior descending artery located", "answer": ["the left coronary artery"]} 277 | {"question": "when was corporal punishment banned in south africa", "answer": ["1997"]} 278 | {"question": "under article 1 what is the minimum age required to serve in the house of representatives", "answer": ["25", "25 years old"]} 279 | {"question": "when was the last year the eagles went to the superbowl", "answer": ["following the 2017 season", "2017"]} 280 | {"question": "who has won 2017 women's singles korean open series badminton championship", "answer": ["P. V. Sindhu"]} 281 | {"question": "who does bryce dallas howard play in the grinch", "answer": ["Surprised Who"]} 282 | {"question": "which country is the last member of saarc", "answer": ["Afghanistan"]} 283 | {"question": "who played the colorado kid in rio bravo", "answer": ["Ricky Nelson"]} 284 | {"question": "who played tom in four weddings and a funeral", "answer": ["James Fleet"]} 285 | {"question": "where is the citrus bowl held this year", "answer": ["Camping World Stadium"]} 286 | {"question": "what age do you need to be to buy a bb gun", "answer": ["18"]} 287 | {"question": "who played the mad hatter in the batman tv show", "answer": ["Roddy McDowall", "David Wayne", "Benedict Samuel"]} 288 | {"question": "is it marley and me or marley and i", "answer": ["Marley & Me"]} 289 | {"question": "where does route 66 start on the west coast", "answer": ["in Santa Monica"]} 290 | {"question": "the oligodynamic effect is a phenomenon that describes", "answer": ["a biocidal effect of metals"]} 291 | {"question": "who plays heather in beauty and the beast", "answer": 
["Nicole Gale Anderson"]} 292 | {"question": "what age do you need to be to buy a bb gun", "answer": ["18"]} 293 | {"question": "who is the girl in green day 21 guns", "answer": ["Lisa Stelly"]} 294 | {"question": "what is the meaning of the harp in ireland", "answer": ["the arms of Ireland"]} 295 | {"question": "who does the head of the fbi report to", "answer": ["the Director of National Intelligence", "the Attorney General"]} 296 | {"question": "what is the first book of percy jackson", "answer": ["The Lightning Thief"]} 297 | {"question": "if a piece of music is perceived to have changed key then we say the piece has", "answer": ["transposed", "transposition"]} 298 | {"question": "when do you celebrate birthday if born on feb 29", "answer": ["February 29", "February 28", "March 1"]} 299 | {"question": "when did the eagles win last super bowl", "answer": ["2017"]} 300 | {"question": "who is tinker air force base named after", "answer": ["Major General Clarence L. Tinker"]} 301 | {"question": "who had created the second bank of the united states", "answer": ["President James Madison", "James Madison"]} 302 | {"question": "when did gaurdians of the galaxy 2 come out", "answer": ["2017", "May 5, 2017"]} 303 | {"question": "what size engine does a 2005 honda civic have", "answer": ["1169 cc"]} 304 | {"question": "kings and queens of england in the 1900s", "answer": ["George V", "George VI", "Edward VIII", "Elizabeth II", "Edward VII"]} 305 | {"question": "when was rosencrantz and guildenstern are dead written", "answer": ["1966"]} 306 | {"question": "which country has the most coastline in the world", "answer": ["Canada"]} 307 | {"question": "who was the king of england in 1756", "answer": ["George II"]} 308 | {"question": "what are the colors of the netherlands flag", "answer": ["blue", "white", "red"]} 309 | {"question": "when was the first nuclear power plant opened", "answer": ["December 20, 1951", "June 27, 1954", "On June 27, 1954", "the USSR", "December 2, 
1942"]} 310 | {"question": "what is the function of a political action committee (pac)", "answer": ["pools campaign contributions from members"]} 311 | {"question": "who sang the song i wanna be sedated", "answer": ["the Ramones"]} 312 | {"question": "what proposition made the insurance commissioner an elected position", "answer": ["Proposition 103", "Proposition 103 in 1988"]} 313 | {"question": "who are nominated for president of india 2017", "answer": ["Meira Kumar", "Ram Nath Kovind"]} 314 | {"question": "who wrote and performed i can only imagine", "answer": ["Bart Millard", "Christian rock band MercyMe", "MercyMe"]} 315 | {"question": "location of the ten commandments in the bible", "answer": ["Exodus", "Deuteronomy"]} 316 | {"question": "who sings she's like the wind lyrics", "answer": ["Wendy Fraser", "Patrick Swayze"]} 317 | {"question": "when did the us not go to the olympics", "answer": ["The 1980 Summer Olympics", "1980"]} 318 | {"question": "when does body temperature tend to be lowest", "answer": ["11 p.m. 
to 3 a.m."]} 319 | {"question": "who will win 2018 election in sri lanka", "answer": ["Sri Lanka Podujana Peramuna"]} 320 | {"question": "where does the white witch live in narnia", "answer": ["her castle"]} 321 | {"question": "what is the oldest street in the philippines", "answer": ["Cebu City", "Colon Street"]} 322 | {"question": "how many countries does cadbury sell its products", "answer": ["more than 50 countries worldwide", "more than 50"]} 323 | {"question": "who was the viceroy when the simon commission visited india", "answer": ["Lord Irwin"]} 324 | {"question": "what languages are spoken in india the most", "answer": ["Bengali", "Telugu", "Marathi", "Hindi", "English"]} 325 | {"question": "who plays hannibal in silence of the lambs", "answer": ["Anthony Hopkins"]} 326 | {"question": "when did the bill of rights come out", "answer": ["1689", "16 December 1689"]} 327 | {"question": "when does season 5 of the blacklist resume", "answer": ["January 31, 2018"]} 328 | {"question": "where did the allies go after north africa", "answer": ["Italy", "the Italian Campaign"]} 329 | {"question": "when did toyota start making cars in the us", "answer": ["by the early 1980s", "1984"]} 330 | {"question": "when did the royal proclamation of 1763 end", "answer": ["the American Revolutionary War", "with the American Revolutionary War", "1783"]} 331 | {"question": "when did the movie varsity blues come out", "answer": ["1999", "January 15, 1999"]} 332 | {"question": "who played shmuel in the boy in the striped pyjamas", "answer": ["Jack Scanlon"]} 333 | {"question": "what is an example of a tricyclic antidepressant", "answer": ["Amineptine"]} 334 | {"question": "where is creatine phosphate found in the body", "answer": ["brain", "pancreas", "skeletal muscle and the brain", "muscle cells", "heart"]} 335 | {"question": "who appoints the chair of the federal reserve system", "answer": ["President of the United States"]} 336 | {"question": "locations for the film an englishman 
who went up a hill", "answer": ["Llanrhaeadr-ym-Mochnant", "Llansilin in Powys"]} 337 | {"question": "who sings sugar sugar you are my candy girl", "answer": ["the Archies"]} 338 | {"question": "how many seasons of the rugrats are there", "answer": ["9 seasons", "9"]} 339 | {"question": "who played zoe hart on hart of dixie", "answer": ["Rachel Sarah Bilson"]} 340 | {"question": "where is the highest level of fluoride stored in the teeth", "answer": ["surface of the enamel"]} 341 | {"question": "who plays noah newman on the young and the restless", "answer": ["Robert Gillespie Adamson IV"]} 342 | {"question": "when was the canadian pacific railway started and finished", "answer": ["between 1881 and 1885"]} 343 | {"question": "how many super bowl games has the patriots played in", "answer": ["10", "ten", "ten times"]} 344 | {"question": "who has the most catches in nfl history", "answer": ["Jerry Rice"]} 345 | {"question": "when did the sims 4 toddlers come out", "answer": ["January 2017", "January 12, 2017", "the January 2017 patch"]} 346 | {"question": "when was the last time the military drafted", "answer": ["1973", "1972", "December 1972"]} 347 | {"question": "where does new york drinking water come from", "answer": ["the eastern Catskill Mountains"]} 348 | {"question": "when does the day of the dead end", "answer": ["November 2"]} 349 | {"question": "what is the name of the main artery which takes blood from the heart to the body", "answer": ["The aorta", "aorta"]} 350 | {"question": "who sings the theme song for the tv show cops", "answer": ["Inner Circle", "Jamaican reggae band Inner Circle"]} 351 | {"question": "what is the minimum wage in france per hour", "answer": ["11.16", "€9.88 per hour."]} 352 | {"question": "when is the fourth movie of the divergent series coming out", "answer": ["never made"]} 353 | {"question": "when did the word of wisdom become mandatory", "answer": ["February 1834"]} 354 | {"question": "the chinese dragons are protectors of how 
many seas diggy", "answer": ["Four Seas", "Four"]} 355 | {"question": "hazels boyfriend in the fault in our stars", "answer": ["Augustus Waters"]} 356 | {"question": "who was the ruler of england in 1616", "answer": ["James I"]} 357 | {"question": "when does sam realize he is jim in ghost whisperer", "answer": ["Leap of Faith"]} 358 | {"question": "what are the parts of a domain name called", "answer": ["subdomain", "top-level domain", "hostname"]} 359 | {"question": "where was percy jackson and the olympians filmed", "answer": ["Vancouver", "Mission, British Columbia"]} 360 | {"question": "who plays general hux in the last jedi", "answer": ["Domhnall Gleeson"]} 361 | {"question": "who started ww2 and how did it start", "answer": ["Nazi Germany"]} 362 | {"question": "when did the royal mint move to wales", "answer": ["1968", "the 1960s.", "17 December 1968", "the 1960s"]} 363 | {"question": "what does istj mean in a personality test", "answer": ["Extroverted Thinking (Te)", "Extroverted Intuition (Ne)", "Introverted Sensing (Si)", "Introverted Feeling (Fi)"]} 364 | {"question": "who won the champions league final in 2016", "answer": ["Real Madrid"]} 365 | {"question": "when was the last time new zealand had an earthquake", "answer": ["11 Jul 2017"]} 366 | {"question": "who performed the first c section in 1794", "answer": ["Dr. Jesse Bennett"]} 367 | {"question": "who has won the most games in nfl 2017", "answer": ["Dallas Cowboys"]} 368 | {"question": "who has the most gold medals in the winter olympics of all time", "answer": ["Norway"]} 369 | {"question": "what caused the breakup of the democratic republican party", "answer": ["the disputed 1824 presidential election"]} 370 | {"question": "who voices randy in f is for family", "answer": ["T.J. 
Miller"]} 371 | {"question": "when does the dlc for rainbow six siege come out", "answer": ["January 2018"]} 372 | {"question": "who are the australia's got talent judges", "answer": ["Kelly Osbourne", "Ian Dickson", "Ian \"Dicko\" Dickson", "Eddie Perfect", "Sophie Monk"]} 373 | {"question": "what does hp mean in war and order", "answer": ["hit points or health points"]} 374 | {"question": "who is the biggest selling female group of all time", "answer": ["Spice Girls"]} 375 | {"question": "an object that moves around an external axis is said to be", "answer": ["revolution or orbital revolution", "orbit"]} 376 | {"question": "when did seattle slew win the triple crown", "answer": ["1977", "in 1977"]} 377 | {"question": "bible verse taking the lord's name in vain", "answer": ["Exodus 20:7"]} 378 | {"question": "who wrote papa got a brand new bag", "answer": ["James Brown"]} 379 | {"question": "when did the united states host the world cup", "answer": ["1994"]} 380 | {"question": "abbreviated name of the highest peak in tasmania", "answer": ["Mount Ossa"]} 381 | {"question": "where was the salvation army's christmas collection kettle first introduced", "answer": ["San Francisco", "in San Francisco"]} 382 | {"question": "when was the first election held in india", "answer": ["1951–52"]} 383 | {"question": "when did the sat become out of 1600", "answer": ["March 2016", "2014", "2016"]} 384 | {"question": "where is fe best absorbed in the body", "answer": ["in the duodenum", "the duodenum"]} 385 | {"question": "who sings too much time on my hands lyrics", "answer": ["Tommy Shaw"]} 386 | {"question": "who does the voice of nala in the lion king", "answer": ["Niketa Calame", "Sally Dworsky", "Moira Kelly", "Laura Williams"]} 387 | {"question": "when did seat belts become law in ontario", "answer": ["January 1, 1976"]} 388 | {"question": "all the motor neurons that control the skeletal muscles are", "answer": ["efferent nerves", "Somatic motor neurons", "Somatic"]} 389 | 
{"question": "who is the first wife on sister wives", "answer": ["Meri"]} 390 | {"question": "who played the elephant man in the film", "answer": ["John Hurt"]} 391 | {"question": "when was coffee first made into a drink", "answer": ["15th century", "the 15th century"]} 392 | {"question": "when was the last time oklahoma won a national championship in football", "answer": ["2003", "2000"]} 393 | {"question": "who is jared on the bold and the beautiful", "answer": ["Andrew Collins"]} 394 | {"question": "who plays chummy's mother in call the midwife", "answer": ["Cheryl Campbell"]} 395 | {"question": "when's the last time army won the army navy game", "answer": ["2017", "Army"]} 396 | {"question": "when did the nba create the 3 point line", "answer": ["the 1979–80 season", "1979–80 season"]} 397 | {"question": "what is the share of agriculture in indian economy", "answer": ["17.32%", "23%"]} 398 | {"question": "chief ministers of tamil nadu mentioned on wikipedia", "answer": ["Ramakrishna Ranga Rao", "Tanguturi Prakasam", "P. Subbarayan", "M. G. Ramachandran", "Janaki Ramachandran", "P. T. Rajan", "J. Jayalalithaa", "M. Karunanidhi", "A. Subbarayalu Reddiar", "C. N. Annadurai", "P. S. Kumaraswamy Raja", "K. Palaniswami", "O. Panneerselvam", "V.R. Nedunchezhiyan", "Raja of Panagal", "O. P. Ramaswamy Reddiyar", "Kurma Venkata Reddy Naidu", "B. Munuswamy Naidu", "C. 
Rajagopalachari"]} 399 | {"question": "who plays unis in she's the man", "answer": ["Emily Perkins"]} 400 | {"question": "when did the united states start using the death penalty", "answer": ["the beginning", "1608"]} 401 | -------------------------------------------------------------------------------- /data/web_questions.jsonl: -------------------------------------------------------------------------------- 1 | {"question": "who did benjamin franklin get married to?", "answer": ["Deborah Read"]} 2 | {"question": "what is the currency in egypt 2012?", "answer": ["Egyptian pound"]} 3 | {"question": "what language turkey people speak?", "answer": ["Turkish Language"]} 4 | {"question": "what does jamaican people speak?", "answer": ["Jamaican Creole English Language", "Jamaican English"]} 5 | {"question": "what language do people from thailand speak?", "answer": ["Mon Language", "Lao Language", "Khmer language", "Hmong language", "Thai Language", "Cham language", "Mlabri language", "Malay, Pattani Language", "Nyaw Language", "Saek language"]} 6 | {"question": "what else did ben franklin invent?", "answer": ["Lightning rod", "Franklin stove", "Bifocals", "Glass harmonica"]} 7 | {"question": "which country does greenland belong to?", "answer": ["Denmark"]} 8 | {"question": "who did michael j fox marry?", "answer": ["Tracy Pollan"]} 9 | {"question": "where did eleanor roosevelt die?", "answer": ["New York City"]} 10 | {"question": "what airport is near arlington tx?", "answer": ["Arlington Municipal Airport"]} 11 | {"question": "which country was justin bieber born in?", "answer": ["Canada"]} 12 | {"question": "who is the minority leader of the house of representatives now?", "answer": ["Nancy Pelosi"]} 13 | {"question": "where did clay matthews go to school?", "answer": ["Agoura High School", "University of Southern California"]} 14 | {"question": "what to do today in atlanta with kids?", "answer": ["Atlanta History Center", "Atlanta Cyclorama & Civil War Museum", 
"Atlanta Ballet", "Fernbank Museum of Natural History", "Woodruff Arts Center", "Zoo Atlanta", "Atlanta Symphony Orchestra", "Centennial Olympic Park", "Martin Luther King, Jr., National Historic Site", "Fernbank Science Center"]} 15 | {"question": "what team did ronaldo play for in 2003?", "answer": ["Real Madrid C.F."]} 16 | {"question": "what is the currency of puerto rico called?", "answer": ["United States dollar"]} 17 | {"question": "what tv shows did shawnee smith play in?", "answer": ["Anger Management", "The Tom Show", "Scream Queens", "Brand New Life", "30 Days of Night: Dust to Dust", "Arsenio", "Becker", "The Stand", "All is Forgiven"]} 18 | {"question": "what language does cuba speak?", "answer": ["Spanish Language"]} 19 | {"question": "where is mount st helens volcano?", "answer": ["Skamania County"]} 20 | {"question": "what school did karl benz go to?", "answer": ["Karlsruhe Institute of Technology", "University of Karlsruhe"]} 21 | {"question": "where are samsung based?", "answer": ["Seoul"]} 22 | {"question": "where did joe flacco attend college?", "answer": ["University of Delaware"]} 23 | {"question": "who did jackie robinson first play for?", "answer": ["Montreal Royals", "Kansas City Monarchs", "Los Angeles Bulldogs", "Brooklyn Dodgers", "UCLA Bruins football"]} 24 | {"question": "what are the religions practiced in indonesia?", "answer": ["Protestantism", "Hinduism", "Catholicism", "Islam"]} 25 | {"question": "what country did buddha come from?", "answer": ["India"]} 26 | {"question": "where george lopez was born?", "answer": ["Mission Hills"]} 27 | {"question": "where are yamaha outboard motors manufactured?", "answer": ["Shizuoka Prefecture"]} 28 | {"question": "where is the carpathian mountain range located?", "answer": ["Ukraine", "Europe", "Romania", "Czech Republic", "Poland", "Serbia", "Slovakia", "Hungary"]} 29 | {"question": "who is emma stone father?", "answer": ["Jeff Stone"]} 30 | {"question": "what is the oregon ducks 2012 
football schedule?", "answer": ["University of Oregon"]} 31 | {"question": "which airport to fly into rome?", "answer": ["Ciampino – G.B. Pastine International Airport", "Roma Termini railway station", "Civitavecchia Ferry Terminal", "Leonardo da Vinci–Fiumicino Airport"]} 32 | {"question": "who plays juni cortez?", "answer": ["Daryl Sabara"]} 33 | {"question": "what school did ben roethlisberger go to?", "answer": ["Miami University"]} 34 | {"question": "what type of government does germany have now?", "answer": ["Constitutional republic", "Multi-party system", "Federal republic", "Democracy", "Parliamentary republic"]} 35 | {"question": "what language do british speak?", "answer": ["Scottish Gaelic language", "Scots Language", "Cornish Language", "Irish", "English Language", "Welsh Language", "Guernésiais", "Ulster Scots dialects", "Jèrriais", "Manx Language"]} 36 | {"question": "what super bowl did peyton manning win?", "answer": ["2006 NFL season"]} 37 | {"question": "who did tim tebow play college football for?", "answer": ["University of Florida"]} 38 | {"question": "where does the zambezi river originate?", "answer": ["Tanzania"]} 39 | {"question": "when did annie open?", "answer": ["Annie (1977 original Broadway cast)"]} 40 | {"question": "who did tim tebow play college football for?", "answer": ["University of Florida"]} 41 | {"question": "who inspired obama?", "answer": ["Saul Alinsky", "Nipsey Russell"]} 42 | {"question": "who is gimli's father in the hobbit?", "answer": ["Gloin"]} 43 | {"question": "who was the leader of the us during wwii?", "answer": ["Gerald Ford"]} 44 | {"question": "what shows are shot in new york?", "answer": ["Flight of the Conchords", "The Stand"]} 45 | {"question": "who plays kenneth?", "answer": ["Jack McBrayer"]} 46 | {"question": "where did dolly parton grow up?", "answer": ["Tennessee"]} 47 | {"question": "what type of government does iraq have now?", "answer": ["Parliamentary system", "Federation", "Republic", "Federal 
republic", "Parliamentary republic"]} 48 | {"question": "who has played lex luthor?", "answer": ["Anthony LaPaglia", "Kevin Spacey", "Clancy Brown", "James Marsters", "Gene Hackman"]} 49 | {"question": "who did scarlett johansson date?", "answer": ["Justin Timberlake", "Josh Hartnett", "Benicio del Toro", "Jared Leto", "Derek Jeter"]} 50 | {"question": "who did cam newton sign with?", "answer": ["Carolina Panthers"]} 51 | {"question": "who did mozart write his four horn concertos for?", "answer": ["wolfgang amadeus mozart used story by pierre beaumarchais"]} 52 | {"question": "where did mitt romney's parents come from?", "answer": ["Bloomfield Hills"]} 53 | {"question": "what county is brentwood tennessee in?", "answer": ["Williamson County"]} 54 | {"question": "what timezone is utah in?", "answer": ["Mountain Time Zone"]} 55 | {"question": "who developed the tcp ip reference model?", "answer": ["Robert E. Kahn", "Vint Cerf"]} 56 | {"question": "where did jovan belcher kill himself?", "answer": ["Kansas City"]} 57 | {"question": "what is there to do for fun in kansas city?", "answer": ["Kemper Arena", "Starlight Theatre", "Kauffman Stadium", "Municipal Stadium", "Ward Parkway Center", "Arrowhead Stadium", "Blue Ridge Mall", "Blue Ridge Crossing", "Crown Center", "TWA Corporate Headquarters' Building"]} 58 | {"question": "what team does jordan own?", "answer": ["Jordan national football team"]} 59 | {"question": "what the zip code for seattle washington?", "answer": ["98109", "98108", "98105", "98104", "98107", "98106", "98101", "98103", "98102", "98117"]} 60 | {"question": "what time in hilo hawaii?", "answer": ["Hawaii–Aleutian Time Zone"]} 61 | {"question": "where is perpignan located?", "answer": ["France"]} 62 | {"question": "who is the state governor of tennessee?", "answer": ["Bill Haslam"]} 63 | {"question": "what things did martin luther king do?", "answer": ["Civil rights movement", "Civil disobedience", "Nonviolence"]} 64 | {"question": "what is the 
australian dollar called?", "answer": ["Australian dollar"]} 65 | {"question": "what movies did ron howard director?", "answer": ["How the Grinch Stole Christmas!"]} 66 | {"question": "what are the three official languages of belgium?", "answer": ["French Language", "German Language", "Dutch Language"]} 67 | {"question": "what are the major languages spoken in greece?", "answer": ["Albanian language", "Greek Language"]} 68 | {"question": "what type of cancer did eva peron have?", "answer": ["Cervical cancer"]} 69 | {"question": "what currency does russia use 2012?", "answer": ["Russian ruble"]} 70 | {"question": "what did the scientist chadwick discovered?", "answer": ["Neutron"]} 71 | {"question": "who plays london tipton in suite life on deck?", "answer": ["Brenda Song"]} 72 | {"question": "what are the school colors for harvard university?", "answer": ["Crimson"]} 73 | {"question": "who does lee clark manager?", "answer": ["Birmingham City F.C."]} 74 | {"question": "where did george w bush live as a child?", "answer": ["New Haven"]} 75 | {"question": "where was rihanna born and raised?", "answer": ["Saint Michael Parish", "Barbados"]} 76 | {"question": "who was vp for lincoln?", "answer": ["Andrew Johnson", "Hannibal Hamlin"]} 77 | {"question": "who was the italian leader in ww1?", "answer": ["Benito Mussolini"]} 78 | {"question": "what year was george w bush elected?", "answer": ["George W. 
Bush presidential campaign, 2000"]} 79 | {"question": "what are abraham sons names?", "answer": ["Zimran", "Ishbak", "Midian", "Shuah", "Ishmael", "Jokshan", "Isaac", "Medan"]} 80 | {"question": "what did queen victoria say about the suffragettes?", "answer": ["I am every day more convinced that we women, if we are to be good women, feminine and amiable and domestic, are not fitted to reign; at least it is they that drive themselves to the work which it entails."]} 81 | {"question": "what airport do you fly into to get to destin fl?", "answer": ["Northwest Florida Regional Airport", "Destin–Fort Walton Beach Airport"]} 82 | {"question": "what is the currency used in italy?", "answer": ["Euro"]} 83 | {"question": "where is the ufc headquarters?", "answer": ["Las Vegas"]} 84 | {"question": "what is the song anna kendrick sings in pitch perfect?", "answer": ["Cups"]} 85 | {"question": "what places in japan were bombed?", "answer": ["Hiroshima Prefecture"]} 86 | {"question": "what language do chinese people write in?", "answer": ["Traditional Chinese characters", "Chinese", "Simplified Chinese character", "'Phags-pa script", "Nüshu script", "Chinese characters"]} 87 | {"question": "what state is washington d.c. 
located?", "answer": ["Washington", "Washington, D.C."]} 88 | {"question": "what artistic movement did henri matisse belong to?", "answer": ["Fauvism", "Impressionism", "Neo-impressionism", "Modernism"]} 89 | {"question": "who does peyton manning play football for?", "answer": ["Denver Broncos"]} 90 | {"question": "what region of the world is egypt associated with?", "answer": ["Middle East"]} 91 | {"question": "where was the city of david?", "answer": ["Bethlehem"]} 92 | {"question": "what are the sights to see in madrid?", "answer": ["Paseo del Prado", "Thyssen-Bornemisza Museum", "Almudena Cathedral", "Plaza de Cibeles", "Puerta del Sol", "Royal Palace of Madrid", "Museo de Lazaro Galdiano", "Gran Vía", "Museo Nacional Centro de Arte Reina Sofía", "Plaza Mayor, Madrid"]} 93 | {"question": "who is khloe kardashian's husband?", "answer": ["Lamar Odom"]} 94 | {"question": "who played on the jeffersons?", "answer": ["Isabel Sanford", "Marla Gibbs", "Sherman Hemsley"]} 95 | {"question": "what kind of money should i take to costa rica?", "answer": ["Costa Rican colón"]} 96 | {"question": "what state does romney live in?", "answer": ["Massachusetts"]} 97 | {"question": "what time zone am i in california?", "answer": ["Pacific Time Zone", "UTC-8"]} 98 | {"question": "where does archbishop desmond tutu live?", "answer": ["South Africa"]} 99 | {"question": "what is new york city airport?", "answer": ["Flushing Airport", "Mitchel Air Force Base", "Downtown Manhattan Heliport", "LaGuardia Airport", "John F. Kennedy International Airport", "New York Skyports Inc. 
Seaplane Base", "East 34th Street Heliport"]} 100 | {"question": "what did anton van leeuwenhoek contribute to our knowledge of cells?", "answer": ["microscope first used by anton van leeuwenhoek"]} 101 | {"question": "what kind of monarchy does japan have?", "answer": ["Constitutional monarchy"]} 102 | {"question": "who rules denmark right now?", "answer": ["Helle Thorning-Schmidt"]} 103 | {"question": "who plays bilbo baggins in the hobbit?", "answer": ["Norman Bird", "Martin Freeman", "Ian Holm"]} 104 | {"question": "what did fred durst do?", "answer": ["Musician"]} 105 | {"question": "who will play mr gray in the film?", "answer": ["Karen Mulder"]} 106 | {"question": "where to exchange euros in new york city?", "answer": ["John F. Kennedy International Airport"]} 107 | {"question": "what kind of cancer did carl wilson have?", "answer": ["Lung cancer"]} 108 | {"question": "what to see near grand canyon?", "answer": ["Grand Canyon National Park Superintendent's Residence", "Grand Canyon South Rim Ranger's Dormitory", "Grand Canyon Village Historic District", "Grand Canyon North Rim Headquarters", "Grandview Mine", "Grand Canyon Water Reclamation Plant", "Buckey O'Neill Cabin", "El Tovar Hotel", "Grand Canyon Depot"]} 109 | {"question": "what type of government does usa follow?", "answer": ["Federal republic"]} 110 | {"question": "where is the chernobyl nuclear power plant?", "answer": ["Prypiat", "Ukrainian SSR", "Chernobyl Nuclear Power Plant", "Chernobyl"]} 111 | {"question": "what did the ancient romans speak?", "answer": ["Latin Language"]} 112 | {"question": "where was the temple of karnak built?", "answer": ["Egypt", "Luxor Governorate"]} 113 | {"question": "who is sir francis bacon?", "answer": ["Philosopher"]} 114 | {"question": "what to do in richardson dallas?", "answer": ["Wizard's Sports Cafe"]} 115 | {"question": "where did bristol palin go to school?", "answer": ["Wasilla High School", "West Anchorage High School", "Juneau-Douglas High School"]} 116 
| {"question": "what was the title of the book charles darwin wrote?", "answer": ["The Structure and Distribution of Coral Reefs", "On evolution", "A student's introduction to Charles Darwin", "Climbing Plants", "The Expression of the Emotions in Man and Animals", "The origin of species : complete and fully illustrated", "The Origin of Species", "The Life of Erasmus Darwin", "The Autobiography of Charles Darwin", "The Descent of Man, and Selection in Relation to Sex"]} 117 | {"question": "where did francisco coronado come from?", "answer": ["Salamanca"]} 118 | {"question": "who is eli whitney and what did he invent?", "answer": ["Cotton gin"]} 119 | {"question": "what type of books did agatha christie wrote?", "answer": ["Crime writer"]} 120 | {"question": "what type of government does the us follow?", "answer": ["Presidential system", "Federal republic", "Representative democracy", "Two-party system", "Constitutional republic", "Republic"]} 121 | {"question": "what is my timezone in louisiana?", "answer": ["Central Time Zone", "UTC−06:00"]} 122 | {"question": "what type of government does australia have?", "answer": ["Parliamentary system", "Federation", "Constitutional monarchy"]} 123 | {"question": "where was country singer george jones born?", "answer": ["Saratoga"]} 124 | {"question": "who is the next governor of indiana?", "answer": ["Mitch Daniels"]} 125 | {"question": "who is willow smith mom name?", "answer": ["Jada Pinkett Smith"]} 126 | {"question": "what part did winona ryder play in star trek?", "answer": ["Amanda Grayson"]} 127 | {"question": "where is tyrese gibson from?", "answer": ["Watts"]} 128 | {"question": "what did stephen hawking study?", "answer": ["Physics"]} 129 | {"question": "where does bradley walsh live?", "answer": ["England"]} 130 | {"question": "what county is frederick md in?", "answer": ["Frederick County"]} 131 | {"question": "where did drew brees go to college wikianswers?", "answer": ["Purdue University"]} 132 | {"question": 
"where was benjamin franklin educated?", "answer": ["Boston Latin School"]} 133 | {"question": "where english is spoken?", "answer": ["Canada", "Australia", "South Africa", "Zambia", "United Kingdom", "Zimbabwe", "Uganda", "New Zealand", "Turks and Caicos Islands", "Tanzania"]} 134 | {"question": "who did carlos boozer play for?", "answer": ["Utah Jazz", "Cleveland Cavaliers"]} 135 | {"question": "what did president carter do in office?", "answer": ["Social development", "Human rights", "Economic development"]} 136 | {"question": "who founded the pittsburgh steelers in 1933?", "answer": ["Rooney family"]} 137 | {"question": "what did randy savage died of?", "answer": ["Myocardial infarction", "Traffic collision"]} 138 | {"question": "what type of government system does italy have?", "answer": ["Constitutional republic", "Parliamentary republic", "Unitary state"]} 139 | {"question": "what time zone is anaheim california?", "answer": ["Pacific Time Zone"]} 140 | {"question": "what two continents is turkey on?", "answer": ["Europe", "Eurasia", "Asia"]} 141 | {"question": "what year did the orioles go to the world series?", "answer": ["1983 World Series", "1966 World Series", "1970 World Series"]} 142 | {"question": "what is the nigeria time?", "answer": ["West Africa Time", "UTC+01:00"]} 143 | {"question": "what type of currency do they use in england?", "answer": ["UK £"]} 144 | {"question": "what type of government does france use?", "answer": ["Semi-presidential system", "Constitutional republic", "Unitary state"]} 145 | {"question": "what is serbian language called?", "answer": ["Serbian language"]} 146 | {"question": "who is the senior senator of louisiana?", "answer": ["Mary Landrieu"]} 147 | {"question": "who plays donna noble?", "answer": ["Catherine Tate"]} 148 | {"question": "who did vasco de gama explore for?", "answer": ["Portugal"]} 149 | {"question": "who wrote the jana gana mana?", "answer": ["Ram Singh Thakur", "Rabindranath Tagore"]} 150 | 
{"question": "what county is kansas city kansas?", "answer": ["Wyandotte County"]} 151 | {"question": "who was richard nixon married to?", "answer": ["Pat Nixon"]} 152 | {"question": "what countries are part of the uk?", "answer": ["Scotland", "England", "Wales", "Northern Ireland"]} 153 | {"question": "what killed john bonham?", "answer": ["Inhalation of vomit"]} 154 | {"question": "what instruments did louis armstrong play?", "answer": ["trumpet", "Cornet"]} 155 | {"question": "where are the gobi desert located on a map?", "answer": ["Mongolia"]} 156 | {"question": "what country did buddha come from?", "answer": ["India"]} 157 | {"question": "what are the names of the city states in ancient greece?", "answer": ["Athens"]} 158 | {"question": "what type of cancer did gilda radner die of?", "answer": ["Ovarian cancer"]} 159 | {"question": "what do people in australia speak?", "answer": ["Lojban", "Esperanto Language", "English Language"]} 160 | {"question": "what were amelia earhart's achievements?", "answer": ["Writer", "Pilot"]} 161 | {"question": "what kind of guitar did george harrison use?", "answer": ["Fender Stratocaster", "Rickenbacker 360/12"]} 162 | {"question": "what position did vince lombardi play in college?", "answer": ["Right Guard"]} 163 | {"question": "where was martin luther king jr raised?", "answer": ["Atlanta"]} 164 | {"question": "what countries share a land border with indonesia?", "answer": ["Australia", "East Malaysia"]} 165 | {"question": "who is hammurabi and what did he do?", "answer": ["Monarch"]} 166 | {"question": "what movies has john williams score?", "answer": ["A.I. Artificial Intelligence", "Always", "Catch Me If You Can", "1941", "Daddy-O", "Amistad", "Close Encounters of the Third Kind", "E.T. 
the Extra-Terrestrial", "Empire of the Sun", "Earthquake"]} 167 | {"question": "what was robert burns famous for?", "answer": ["Poet"]} 168 | {"question": "what national team does cristiano ronaldo play for?", "answer": ["Portugal national football team"]} 169 | {"question": "who did armie hammer play in the social network?", "answer": ["Jesse Eisenberg"]} 170 | {"question": "what state does romney live in?", "answer": ["Massachusetts"]} 171 | {"question": "what countries have spanish as the national language?", "answer": ["Spain"]} 172 | {"question": "who did gerald ford select as his vice president when he became president?", "answer": ["Nelson Rockefeller"]} 173 | {"question": "who fought in the gulf war 1991?", "answer": ["Saudi Arabia", "Australia", "United States of America", "France", "United Kingdom", "Argentina", "Iraq"]} 174 | {"question": "who plays captain kirk in star trek?", "answer": ["William Shatner"]} 175 | {"question": "what type of artist is henri matisse?", "answer": ["Sculpture", "Printmaking", "Collage", "Painting", "Drawing"]} 176 | {"question": "what are republicans views on health care?", "answer": ["20003"]} 177 | {"question": "what was the first book charles dickens wrote?", "answer": ["Oliver Twist"]} 178 | {"question": "when did the wright brothers created their first plane?", "answer": ["1900 Wright Glider"]} 179 | {"question": "when was the last time the toronto maple leafs were in the stanley cup finals?", "answer": ["1967 Stanley Cup Finals"]} 180 | {"question": "where was elvis costello born?", "answer": ["Paddington"]} 181 | {"question": "what was thomas jefferson role in the declaration of independence?", "answer": ["Writer"]} 182 | {"question": "when did conflict start in ireland?", "answer": ["Viking invasion of Ireland"]} 183 | {"question": "what do they call money in japan?", "answer": ["Japanese yen"]} 184 | {"question": "where is jamarcus russell from?", "answer": ["Mobile"]} 185 | {"question": "where is the seychelles on 
world map?", "answer": ["Africa"]} 186 | {"question": "what makes elvis presley famous?", "answer": ["Singer"]} 187 | {"question": "what language does cuba speak?", "answer": ["Spanish Language"]} 188 | {"question": "what type of economy exists in china?", "answer": ["Socialist state"]} 189 | {"question": "what town was martin luther king assassinated in?", "answer": ["Memphis"]} 190 | {"question": "what books did agatha christie wrote?", "answer": ["And Then There Were None", "Le Grand alibi", "Ten Little Indians", "Appointment with Death", "Desyat Negrityat", "The Man in the Brown Suit", "Witness for the Prosecution"]} 191 | {"question": "what time does american horror story air?", "answer": ["Tom Selleck"]} 192 | {"question": "what team is hank baskett on 2010?", "answer": ["Philadelphia Eagles"]} 193 | {"question": "where was george washington carver from?", "answer": ["Diamond"]} 194 | {"question": "what county is st paul va in?", "answer": ["United States of America", "Wise County", "Russell County", "Virginia"]} 195 | {"question": "where did rudolf virchow conduct his research?", "answer": ["Humboldt University of Berlin", "University of Würzburg"]} 196 | {"question": "when does jewish new year start?", "answer": ["Yiddish Language"]} 197 | {"question": "what disease does robin roberts have?", "answer": ["Breast cancer"]} 198 | {"question": "who played obi wan in episode 2?", "answer": ["Ewan McGregor"]} 199 | {"question": "what type of government does the us follow?", "answer": ["Presidential system", "Federal republic", "Representative democracy", "Two-party system", "Constitutional republic", "Republic"]} 200 | {"question": "what does joey jordison play in slipknot?", "answer": ["Drums"]} 201 | {"question": "what is the capital of modern egypt?", "answer": ["Cairo"]} 202 | {"question": "who fought the battle of gettysburg?", "answer": ["Confederate States of America", "United States of America"]} 203 | {"question": "what art movement did leonardo da vinci 
belong to?", "answer": ["High Renaissance"]} 204 | {"question": "where obama went to school?", "answer": ["Occidental College", "Harvard Law School", "Noelani Elementary School", "Punahou School", "State Elementary School Menteng 01", "St. Francis of Assisi Catholic School", "Columbia University"]} 205 | {"question": "where did the iroquois indians come from?", "answer": ["Québec"]} 206 | {"question": "where is mission san buenaventura located?", "answer": ["Ventura County"]} 207 | {"question": "who played jacob black in twilight?", "answer": ["Taylor Lautner"]} 208 | {"question": "what kind of government is sweden?", "answer": ["Representative democracy", "Unitary state", "Parliamentary system", "Constitutional monarchy", "Hereditary monarchy", "Multi-party system"]} 209 | {"question": "who owns the portland press herald?", "answer": ["Blethen Maine Newspapers, Inc."]} 210 | {"question": "who plays ken barlow in coronation street?", "answer": ["Tony Warren"]} 211 | {"question": "who did the voice of darth vader in episode 3?", "answer": ["Hayden Christensen"]} 212 | {"question": "who plays the voice of brian on family guy?", "answer": ["Seth MacFarlane"]} 213 | {"question": "who was the first president of the afl?", "answer": ["Bud Adams", "Lamar Hunt"]} 214 | {"question": "where to get a marriage license in long island?", "answer": ["United States District Court for the Eastern District of New York"]} 215 | {"question": "what is the currency used in italy?", "answer": ["Euro"]} 216 | {"question": "which dawkins book to read first?", "answer": ["The Selfish Gene"]} 217 | {"question": "who is the coach of the sf giants?", "answer": ["Tim Flannery"]} 218 | {"question": "what produce does florida export?", "answer": ["Orange juice"]} 219 | {"question": "what state is the steelers from?", "answer": ["Pittsburgh"]} 220 | {"question": "where does delaware river start?", "answer": ["West Branch Delaware River", "Mount Jefferson"]} 221 | {"question": "when do world war ii 
end?", "answer": ["1942"]} 222 | {"question": "who is jimmy savile?", "answer": ["Presenter"]} 223 | {"question": "what is the national flower of hawaii?", "answer": ["Hawaiian hibiscus"]} 224 | {"question": "what countries does the panama canal go through?", "answer": ["Panama Canal Zone"]} 225 | {"question": "who was esther's husband?", "answer": ["Susa"]} 226 | {"question": "what did john irving wrote?", "answer": ["Trying to Save Piggy Sneed", "The Fourth Hand", "The Cider House Rules", "The 158-Pound Marriage", "The World According to Garp", "The Hotel New Hampshire", "A Widow for One Year", "A Prayer for Owen Meany", "The Water-Method Man", "Until I Find You"]} 227 | {"question": "what type of government was formed when italy unified?", "answer": ["Parliamentary republic"]} 228 | {"question": "what is the name of the san francisco newspaper?", "answer": ["The San Francisco Examiner", "California Star", "San Francisco Bay Guardian", "San Francisco Business Times", "San Francisco Bay Times", "San Francisco Chronicle", "Bay Area Reporter", "Sing Tao Daily", "AsianWeek", "San Francisco Call"]} 229 | {"question": "who did kim richards marry?", "answer": ["Greg Davis", "John Jackson", "G. Monty Brinson"]} 230 | {"question": "who plays blaine in batman?", "answer": ["Him/Herself"]} 231 | {"question": "what language does australians speak?", "answer": ["Greek Language", "English Language", "Italian Language", "Chinese language"]} 232 | {"question": "who did annie oakley married?", "answer": ["Frank E. 
Butler"]} 233 | {"question": "what university did romney graduated from?", "answer": ["Stanford University", "Cranbrook Schools", "Harvard Law School", "Harvard Business School", "Harvard University", "Brigham Young University"]} 234 | {"question": "what are the landlocked countries in latin america?", "answer": ["Bolivia", "Honduras", "Cuba", "El Salvador", "Guatemala", "Costa Rica", "Cuauhtémoc, D.F.", "Great Pyramid of Tenochtitlán", "Paraguay", "Belize"]} 235 | {"question": "what college did magic johnson play for?", "answer": ["Michigan State University"]} 236 | {"question": "where does the zambezi river start?", "answer": ["Mwinilunga"]} 237 | {"question": "who is shakira married to?", "answer": ["Gerard Piqué"]} 238 | {"question": "what has ian somerhalder acted in?", "answer": ["Smallville", "The Vampire Diaries", "Lost", "Tell Me You Love Me", "Fearless", "Young Americans"]} 239 | {"question": "where is the time zone line in south dakota?", "answer": ["Mountain Time Zone", "Central Time Zone", "UTC−07:00", "UTC−06:00"]} 240 | {"question": "who plays bilbo baggins in the hobbit?", "answer": ["Norman Bird", "Martin Freeman", "Ian Holm"]} 241 | {"question": "what time is it in texas houston right now?", "answer": ["Central Time Zone"]} 242 | {"question": "who is mary mcleod bethune for kids?", "answer": ["Educator"]} 243 | {"question": "what year does hitler die?", "answer": ["Hitler and His Generals: Military Conferences 1942-1945"]} 244 | {"question": "what countries do the united nations help?", "answer": ["Afghanistan", "Albania", "Angola", "Algeria", "Andorra", "Austria", "Australia", "Antigua and Barbuda", "Armenia", "Argentina"]} 245 | {"question": "when did charles goodyear invented rubber?", "answer": ["During the early 1830's he began inventing, filing six patents between 1830 and 1834, and during this period became interested in rubber, which he tried - unsuccessfully - to use in some of his mechanical inventions."]} 246 | {"question": "what places 
make up new england?", "answer": ["Maine"]} 247 | {"question": "what team does colin kaepernick play for?", "answer": ["San Francisco 49ers"]} 248 | {"question": "what did the islamic people believe in?", "answer": ["Zakāt", "Salah", "Hajj", "Islamic dietary laws", "Jihad", "Sawm", "Halal food", "Shahada", "Adab"]} 249 | {"question": "where to go fishing in roanoke va?", "answer": ["Rainbow Bluff Expedition"]} 250 | {"question": "who does peyton manning play football for?", "answer": ["Denver Broncos"]} 251 | {"question": "what is the state flower of arizona?", "answer": ["Saguaro"]} 252 | {"question": "where the missouri river ends?", "answer": ["Mississippi River"]} 253 | {"question": "what did shakespeare become famous for?", "answer": ["Poet", "Playwright", "Dramatist", "Lyricist", "Author"]} 254 | {"question": "where was st. lucy born?", "answer": ["Syracuse"]} 255 | {"question": "who is jamie little engaged to?", "answer": ["Cody Selman"]} 256 | {"question": "what super bowl did peyton manning win?", "answer": ["2006 NFL season"]} 257 | {"question": "which continental congress approve the declaration of independence?", "answer": ["Second Continental Congress"]} 258 | {"question": "what do christians believe about heaven hell and purgatory?", "answer": ["Greek Evangelical Church", "The Church of Nails"]} 259 | {"question": "which airport to fly into in buenos aires?", "answer": ["Aeroparque Jorge Newbery", "Ministro Pistarini International Airport", "Don Torcuato Airport"]} 260 | {"question": "what language does australia use?", "answer": ["English Language"]} 261 | {"question": "what high school did lil wayne graduate from?", "answer": ["Mcmain Magnet Secondary School"]} 262 | {"question": "what do you call the chinese writing system?", "answer": ["Standard Mandarin"]} 263 | {"question": "where does robin williams live 2011?", "answer": ["San Francisco"]} 264 | {"question": "what country did germany invade first in ww1?", "answer": ["Belgium"]} 265 | 
{"question": "who was the leader of soviet union during wwii?", "answer": ["Joseph Stalin"]} 266 | {"question": "what flower is on the oklahoma quarter?", "answer": ["Scissor-tailed Flycatcher"]} 267 | {"question": "who plays stephanie plum in one for the money?", "answer": ["Katherine Heigl"]} 268 | {"question": "what did albert speer design?", "answer": ["Deutsches Stadion", "Volkshalle", "Reich Chancellery", "Olympic Stadium"]} 269 | {"question": "where was theodore roosevelt buried?", "answer": ["Youngs Memorial Cemetery"]} 270 | {"question": "what made the soviet union fall?", "answer": ["Cold War"]} 271 | {"question": "what is there to do in peoria illinois?", "answer": ["Peoria Zoo", "Judge Flanagan Residence", "George L. Luthy Memorial Botanical Garden", "Lakeview Museum of Arts and Sciences", "WeaverRidge Golf Club", "Heart of Illinois Fair", "Wildlife Prairie State Park", "Par-A-Dice Hotel and Casino", "Peoria Civic Center", "O'Brien Field"]} 272 | {"question": "what school did michael jordan attend?", "answer": ["University of North Carolina at Chapel Hill", "Emsley A. Laney High School"]} 273 | {"question": "what countries have english as their official language?", "answer": ["Canada", "Australia", "Kingdom of Great Britain", "United States of America", "United Kingdom", "Ireland", "New Zealand"]} 274 | {"question": "what did st augustine do?", "answer": ["Physician", "Writer", "Philosopher"]} 275 | {"question": "what was the ancient egyptians spoken language?", "answer": ["Egyptian Arabic"]} 276 | {"question": "where herman cain stance on the issues?", "answer": ["Évocateur: The Morton Downey Jr. Movie"]} 277 | {"question": "who was vice president after kennedy died?", "answer": ["Lyndon B. 
Johnson"]} 278 | {"question": "what was nikola tesla inventions?", "answer": ["Tesla coil"]} 279 | {"question": "where did hank marvin come from?", "answer": ["Newcastle upon Tyne"]} 280 | {"question": "what did baron de montesquie influence?", "answer": ["charles-louis de secondat montesquieu influenced edward gibbon"]} 281 | {"question": "what songs does smokey robinson sing?", "answer": ["Being With You", "Cruisin'", "Crusin'", "And I Love Her", "The Tracks of My Tears", "Quiet Storm", "Tracks of my Tears", "Santa Claus is Coming to Town"]} 282 | {"question": "what is the australian dollar called?", "answer": ["Australian dollar"]} 283 | {"question": "what kind government does egypt have?", "answer": ["Semi-presidential system", "Constitutional republic", "Republic", "Unitary state"]} 284 | {"question": "where is the galapagos islands located on a world map?", "answer": ["Pacific Ocean", "Galápagos Province"]} 285 | {"question": "what all does google now do?", "answer": ["Google Maps", "Nexus 7", "Google Buzz", "Nexus 10", "Nexus One", "Nexus S", "Google Chrome", "Google Earth", "Google Wave"]} 286 | {"question": "who did sir francis drake marry?", "answer": ["Mary Newman", "Elizabeth Sydenham"]} 287 | {"question": "what county is west st paul in?", "answer": ["Dakota County"]} 288 | {"question": "what was the capital city of the east roman empire?", "answer": ["Constantinople"]} 289 | {"question": "when did chipper jones get drafted?", "answer": ["1990 Major League Baseball Draft"]} 290 | {"question": "what instruments does justin bieber use?", "answer": ["guitar", "Piano", "trumpet", "Drums"]} 291 | {"question": "what electorate does anna bligh represent?", "answer": ["Electoral district of South Brisbane"]} 292 | {"question": "what role did alexander hamilton play in the constitution?", "answer": ["Financier"]} 293 | {"question": "where did margaret hoover go to college?", "answer": ["Davidson College"]} 294 | {"question": "in which continent is germany?", 
"answer": ["Europe"]} 295 | {"question": "who did france surrender to in ww2?", "answer": ["Germany"]} 296 | {"question": "who did queen elizabeth 1 executed?", "answer": ["queen elizabeth i of england she executed mary queen of scots"]} 297 | {"question": "where can i go running in sacramento?", "answer": ["Boulevard Park"]} 298 | {"question": "what county is texarkana arkansas in?", "answer": ["Miller County"]} 299 | {"question": "what are the two official languages of paraguay?", "answer": ["Paraguayan Guaraní", "Spanish Language"]} 300 | {"question": "what language is spoken in haiti today?", "answer": ["French Language", "Haitian Creole French Language"]} 301 | {"question": "where was the vietnam war location?", "answer": ["South Vietnam", "North Vietnam", "Southeast Asia", "Cambodia", "Vietnam", "Laos"]} 302 | {"question": "where does name pennsylvania come from?", "answer": ["William Penn"]} 303 | {"question": "what is the money of switzerland called?", "answer": ["Swiss franc"]} 304 | {"question": "what countries does greece share borders with?", "answer": ["Turkey", "Republic of Macedonia", "Albania", "Lake Prespa", "Bulgaria"]} 305 | {"question": "what year was lebron james rookie season?", "answer": ["2003–04 NBA season"]} 306 | {"question": "what is cindy sherman known for?", "answer": ["Photographer"]} 307 | {"question": "what movies has carmen electra been in?", "answer": ["The Mating Habits of the Earthbound Human", "Scary Movie", "Getting Played", "Cheaper by the Dozen 2", "Meet the Spartans", "I Want Candy", "Full of It", "The Chosen One: Legend of the Raven", "Scary Movie 4", "Dirty Love"]} 308 | {"question": "what is the political structure of china?", "answer": ["Single-party state", "Communist state", "Socialist state"]} 309 | {"question": "which states does the colorado river run through?", "answer": ["Utah", "Arizona", "Nevada", "California", "Colorado"]} 310 | {"question": "who is moira en x men?", "answer": ["Mutant"]} 311 | {"question": 
"what are the major cities in ukraine?", "answer": ["Kiev"]} 312 | {"question": "what countries are part of the uk?", "answer": ["Scotland", "England", "Wales", "Northern Ireland"]} 313 | {"question": "who was the soviet leader during world war ii?", "answer": ["Alexei Negmatov"]} 314 | {"question": "what kind of government does libya have today?", "answer": ["Provisional government", "Parliamentary republic"]} 315 | {"question": "where did aaron rodgers go to high school?", "answer": ["Pleasant Valley High School"]} 316 | {"question": "what year was the first miss america pageant held?", "answer": ["1930 Miss America"]} 317 | {"question": "what did george clemenceau do?", "answer": ["Newspaper", "Physician", "Statesman", "Publisher"]} 318 | {"question": "what state is mount st. helens in?", "answer": ["Washington"]} 319 | {"question": "what is st anthony patron saint of?", "answer": ["Padua"]} 320 | {"question": "who did france surrender to in ww2?", "answer": ["Germany"]} 321 | {"question": "who plays edward scissorhands?", "answer": ["Johnny Depp"]} 322 | {"question": "where is the university of maryland medical school?", "answer": ["Maryland", "United States of America", "Baltimore"]} 323 | {"question": "what language do people speak in the netherlands?", "answer": ["Frisian languages", "West Flemish", "Dutch Language"]} 324 | {"question": "what countries are part of the baltic?", "answer": ["Latvia", "Lithuania", "Estonia"]} 325 | {"question": "when did florida marlins join mlb?", "answer": ["1993 Major League Baseball Season"]} 326 | {"question": "who plays the voice of kitt in knight rider?", "answer": ["William Daniels"]} 327 | {"question": "what did the scientist chadwick discovered?", "answer": ["Neutron"]} 328 | {"question": "where did queensland get its name from?", "answer": ["Queen Victoria"]} 329 | {"question": "what music period did beethoven live in?", "answer": ["Opera", "Classical music"]} 330 | {"question": "which countries speak german 
officially?", "answer": ["Canada", "German Democratic Republic", "Luxembourg", "Switzerland", "Liechtenstein", "Germany", "West Germany", "Belgium", "Vatican City", "Second Polish Republic"]} 331 | {"question": "what year did president william henry harrison take office?", "answer": ["3/4/1841"]} 332 | {"question": "what are the songs that justin bieber wrote?", "answer": ["Catching Feelings", "Down to Earth", "Beauty and a Beat", "All Around The World (featuring Ludacris)", "Die in Your Arms", "As Long As You Love Me (featuring Big Sean)", "Baby", "Believe", "Be Alright", "Boyfriend"]} 333 | {"question": "what language brazil speak?", "answer": ["Brazilian Portuguese", "Portuguese Language", "Italian Language"]} 334 | {"question": "which countries share a border with russia?", "answer": ["Ukraine", "Belarus", "Kazakhstan", "Poland", "Lithuania", "Azerbaijan", "Mongolia", "North Korea", "Georgia", "Norway"]} 335 | {"question": "who is princess leia in star wars?", "answer": ["Carrie Fisher"]} 336 | {"question": "what are some of the traditions of islam?", "answer": ["Zakāt", "Salah", "Hajj", "Islamic dietary laws", "Mosque Carpet", "Jihad", "Sawm", "Halal food", "Shahada", "Adab"]} 337 | {"question": "what team was chris paul on?", "answer": ["Los Angeles Clippers"]} 338 | {"question": "what capital of austria?", "answer": ["Vienna"]} 339 | {"question": "what county is greeley colorado in?", "answer": ["Weld County"]} 340 | {"question": "what was lebron james first team?", "answer": ["Cleveland Cavaliers"]} 341 | {"question": "what industry does walmart operate in?", "answer": ["Department Stores", "Retail", "Variety Stores"]} 342 | {"question": "what was the title of the book charles darwin wrote?", "answer": ["The Structure and Distribution of Coral Reefs", "On evolution", "A student's introduction to Charles Darwin", "Climbing Plants", "The Expression of the Emotions in Man and Animals", "The origin of species : complete and fully illustrated", "The Origin of 
Species", "The Life of Erasmus Darwin", "The Autobiography of Charles Darwin", "The Descent of Man, and Selection in Relation to Sex"]} 343 | {"question": "what was robert burns?", "answer": ["Poet", "Writer", "Bard", "Author"]} 344 | {"question": "what battles did stonewall jackson fight in?", "answer": ["Battle of Manassas Station Ops.", "Battle of McDowell", "Battle of Port Republic", "Battle of Rappahannock Station I", "Battle of Chancellorsville", "Battle of Front Royal", "Battle of Hoke's Run", "Battle of Cedar Mountain", "First Battle of Winchester", "Battle of Hancock"]} 345 | {"question": "who is governor of ohio 2011?", "answer": ["John Kasich"]} 346 | {"question": "who does donnie wahlberg play in the sixth sense?", "answer": ["Vincent Grey"]} 347 | {"question": "where is laos in world map?", "answer": ["Cambodia"]} 348 | {"question": "what do you call the chinese writing system?", "answer": ["Standard Mandarin"]} 349 | {"question": "what was jesse james killed with?", "answer": ["Assassination", "Firearm"]} 350 | {"question": "when does the mayan calendar end exactly?", "answer": ["2012"]} 351 | {"question": "who does jordan palmer play for?", "answer": ["Jacksonville Jaguars"]} 352 | {"question": "who was vice president after kennedy died?", "answer": ["Lyndon B. 
Johnson"]} 353 | {"question": "where did salvador dali study art?", "answer": ["Real Academia de Bellas Artes de San Fernando"]} 354 | {"question": "what was john quincy adams famous for?", "answer": ["Secretary of State", "President", "Ambassador", "Member of Congress", "Senator"]} 355 | {"question": "where did giuliana rancic grow up?", "answer": ["Naples"]} 356 | {"question": "where is the university of the rockies located?", "answer": ["United States of America", "Colorado", "Colorado Springs"]} 357 | {"question": "what did george orwell died of?", "answer": ["Tuberculosis"]} 358 | {"question": "what type of sports do japanese play?", "answer": ["Japan national football team", "Japan women's national handball team", "Japan men's national volleyball team", "Japan national handball team", "Japan women's national volleyball team", "Japan national baseball team"]} 359 | {"question": "who is michael buble?", "answer": ["Singer", "Actor", "Singer-songwriter"]} 360 | {"question": "what countries in the world speak chinese?", "answer": ["Canada", "Brunei", "Singapore", "Malaysia", "Shěn", "Vietnam", "China"]} 361 | {"question": "who is the head coach of inter milan?", "answer": ["Andrea Stramaccioni"]} 362 | {"question": "what are the important holidays of islam?", "answer": ["Eid al-Fitr", "Ramadan", "Eid al-Adha"]} 363 | {"question": "what language do the maasai tribe speak?", "answer": ["Maasai Language"]} 364 | {"question": "where did william morris go to college?", "answer": ["Marlborough College", "Exeter College, Oxford", "University of Oxford"]} 365 | {"question": "what type of music did john lennon sing?", "answer": ["Experimental rock", "Pop rock", "Pop music", "Blues-rock", "Art rock", "Soft rock", "Psychedelic rock", "Rock music", "Experimental music"]} 366 | {"question": "what form of currency does china have?", "answer": ["Renminbi"]} 367 | {"question": "where did dutch language come from?", "answer": ["Europeans"]} 368 | {"question": "who did paul revere 
marry?", "answer": ["Sarah Revere"]} 369 | {"question": "what country did germany invade first in ww1?", "answer": ["Belgium"]} 370 | {"question": "what did stephen hawking become famous for?", "answer": ["Author", "Mathematician", "Professor", "Writer", "Cosmologist", "Physicist", "Actor", "Astronomer", "Scientist", "Science writer"]} 371 | {"question": "what countries share borders with spain?", "answer": ["Perejil Island", "Portugal", "France", "Andorra", "Morocco", "Gibraltar"]} 372 | {"question": "who was the father of king george vi?", "answer": ["George V"]} 373 | {"question": "who was the leader of soviet union during wwii?", "answer": ["Joseph Stalin"]} 374 | {"question": "what did drita find out?", "answer": ["Football Superleague of Kosovo"]} 375 | {"question": "what did james k polk do before he was president?", "answer": ["Lawyer"]} 376 | {"question": "what make of bike did steve mcqueen ride in the great escape?", "answer": ["Yucatan"]} 377 | {"question": "what type of music did claude debussy play?", "answer": ["Ballet", "French opera", "Art song", "Incidental music", "Classical music", "20th-century classical music"]} 378 | {"question": "what language does egyptian people speak?", "answer": ["Modern Standard Arabic"]} 379 | {"question": "who won the battle of gettysburg union or confederate?", "answer": ["Union"]} 380 | {"question": "what works of art did leonardo da vinci produce?", "answer": ["Ginevra de' Benci", "The Last Supper", "The Virgin and Child with St Anne and St John the Baptist", "Mona Lisa", "Benois Madonna", "Madonna Litta", "Lady with an Ermine", "St. John the Baptist", "The Virgin and Child with St. 
Anne", "Annunciation"]} 381 | {"question": "when was president john adams elected?", "answer": ["John Adams Presidential Campaign, 1796", "John Adams Presidential Campaign, 1800"]} 382 | {"question": "what are the four main languages spoken in spain?", "answer": ["Basque Language", "Galician Language", "Catalan language", "Occitan language"]} 383 | {"question": "who did john kennedy have affairs with?", "answer": ["william averell harriman appointed by john fitzgerald kennedy"]} 384 | {"question": "who are the colorado representatives?", "answer": ["Wayne Allard", "Ken Salazar", "Gordon L. Allott", "Hank Brown", "Edwin C. Johnson", "William L. Armstrong", "Floyd K. Haskell", "Mark Udall", "Michael Bennet"]} 385 | {"question": "where did andy murray started playing tennis?", "answer": ["United Kingdom"]} 386 | {"question": "what does ringo sing?", "answer": ["Yellow Submarine", "\"Pinocchio Medley (\"Do You See the Noses Growing?\"", "Good Night", "I Shall Be Released", "Roll Over Beethoven", "Tommy's Holiday Camp", "Sweet Little Sixteen", "California Calling", "Honey Don't", "Don’t Pass Me By"]} 387 | {"question": "what team did david beckham play for in 2011?", "answer": ["LA Galaxy"]} 388 | {"question": "who is washington redskins backup qb?", "answer": ["Rex Grossman"]} 389 | {"question": "what was lucille ball?", "answer": ["Singer", "Model", "Comedian", "Television Producer", "Actor"]} 390 | {"question": "where is the nra headquarters located?", "answer": ["Fairfax"]} 391 | {"question": "what type of government does the us follow?", "answer": ["Presidential system", "Federal republic", "Representative democracy", "Two-party system", "Constitutional republic", "Republic"]} 392 | {"question": "who did carrie ann inaba get engaged to?", "answer": ["Jesse Sloan"]} 393 | {"question": "where do american bulldogs originate from?", "answer": ["United States of America"]} 394 | {"question": "what is william taft known for?", "answer": ["President of the United 
def parse_args():
    """Parse the command-line options describing a Tsinghua Cloud share link.

    Returns:
        argparse.Namespace with the attributes ``link`` (required),
        ``password``, ``save`` and ``file``, read from ``sys.argv``.
    """
    # (long flag, short flag, extra keyword arguments), in registration order.
    option_specs = (
        ('--link', '-l', dict(required=True, help='Share link of Tsinghua Cloud')),
        ('--password', '-p', dict(default='', help='Password of the share link')),
        ('--save', '-s', dict(default='./', help='Save directory')),
        ('--file', '-f', dict(default=None, help='File name, support regex, if not set, download all files')),
    )
    cli = argparse.ArgumentParser()
    for long_flag, short_flag, extra in option_specs:
        # Every option is a plain string value.
        cli.add_argument(long_flag, short_flag, type=str, **extra)
    return cli.parse_args()
def download_single_file(url: str, fname: str):
    """Stream ``url`` into the local file ``fname`` with a tqdm progress bar.

    Args:
        url: Download URL, fetched through the module-level ``sess`` session.
        fname: Destination path. Missing parent directories are created; a
            bare file name (no directory part) is written to the current
            working directory.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        OSError: If the destination cannot be created or written.
    """
    dir_name = os.path.dirname(fname)
    # A bare file name has an empty dirname and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)

    # The context manager releases the streamed connection even on failure.
    with sess.get(url, stream=True) as resp:
        # Fail loudly instead of silently saving an error page as the file.
        resp.raise_for_status()
        # The server may omit content-length; tqdm then gets a total of 0.
        total = int(resp.headers.get('content-length', 0))
        with open(fname, 'wb') as file, tqdm(
            total=total,
            ncols=120,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=1024):
                size = file.write(data)
                bar.update(size)
def make_data(sample):
    """Turn one raw generator-training sample into a (source, target) pair.

    The source is every reference rendered as ``Reference [i]: <text>``
    (numbered from 1) followed by ``Question: <question>`` and ``Answer:``,
    with a single literal backslash after each reference and after the
    question. The target is the sample's answer. In both strings every
    newline and carriage return is replaced by one space, so each fits on a
    single line of the ``.source`` / ``.target`` files.

    Args:
        sample: Mapping with ``'references'`` (iterable of str),
            ``'question'`` (str) and ``'answer'`` (str).

    Returns:
        Tuple ``(source, target)`` of single-line strings.
    """
    # One translation table replaces both line-break characters in one pass.
    flatten = str.maketrans("\n\r", "  ")

    pieces = [
        "Reference [%d]: %s\\" % (ix, ref)
        for ix, ref in enumerate(sample['references'], start=1)
    ]
    # Explicit 1-tuple so the question is always formatted as a single value.
    pieces.append("Question: %s\\Answer:" % (sample['question'],))

    source = "".join(pieces).translate(flatten)
    target = sample['answer'].translate(flatten)

    return source, target
def main():
    """Evaluate WebGLM on the task selected by ``args.task``.

    Supported tasks are ``'triviaqa'`` (evaluated by ``evaluate.triviaqa``)
    and ``'nq_open'`` / ``'web_questions'`` (both evaluated by
    ``evaluate.eval``). The result returned by the evaluator is printed.

    Raises:
        ValueError: if ``args.task`` is not one of the supported task names.
    """
    args = get_args()

    task = args.task
    # Resolve the evaluator first so a bad task name fails before the
    # model is loaded. Imported under an alias to avoid shadowing the
    # builtin ``eval``.
    if task == 'triviaqa':
        from evaluate.triviaqa import eval as run_eval
    elif task in ('nq_open', 'web_questions'):
        from evaluate.eval import eval as run_eval
    else:
        # Raising a plain string is itself a TypeError in Python 3.
        raise ValueError(f'Task Name Error! Unsupported task: {task!r}')

    webglm = load_model(args)

    print('WebGLM Initialize Done. Start Evaluating...')
    result = run_eval(webglm, args)
    print(f'Result: {result}')
    print('Evaluate Done')
def extract(extractor, tokenizer, example):
    """Extract a short answer span from ``example["predict"]`` and score it.

    The question and the predicted text are encoded together (padded or
    truncated to 512 tokens) and passed to ``extractor``. The best start/end
    token pair is chosen by ``get_best_valid_start_end_idx`` among the top 8
    candidates with a span width of at most 16. The decoded span is stored in
    ``example["output"]``; ``example["match"]`` records whether any
    normalised alias of it equals an alias of ``example["answer"]``.

    Args:
        extractor: Question-answering model whose output supports
            ``to_tuple()`` yielding (start_scores, end_scores).
        tokenizer: Tokenizer matching ``extractor``.
        example: Dict with ``"question"``, ``"predict"`` and ``"answer"``;
            it is updated in place.

    Returns:
        The same ``example`` dict, with ``"output"`` and ``"match"`` set.
    """
    encoding = tokenizer(
        example["question"],
        example["predict"],
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # Follow the extractor's own device instead of assuming CUDA.
    device = next(extractor.parameters()).device
    input_ids = encoding['input_ids'].to(device)

    with torch.no_grad():
        start_scores, end_scores = extractor(input_ids=input_ids).to_tuple()

    # These are token positions, not scores.
    start_idx, end_idx = get_best_valid_start_end_idx(start_scores[0], end_scores[0], top_k=8, max_size=16)

    all_tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())
    answer_tokens = all_tokens[int(start_idx): int(end_idx) + 1]

    example["output"] = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))

    answers = expand_to_aliases(example["answer"], ignore_prefix=True, ignore_suffix=True)
    predictions = expand_to_aliases([example["output"]], ignore_prefix=True)

    example["match"] = bool(answers & predictions)

    return example
class WebGLM:
    """Retrieval-augmented question answering around a seq2seq checkpoint.

    ``query`` and ``stream_query`` first ask the reference retriever for
    passages, then build a prompt from those passages plus the question and
    let the model generate the answer.
    """

    # Captures the text after the start-of-piece marker up to the
    # end-of-piece marker, or up to the end of the text when generation
    # stopped before emitting it. DOTALL lets answers span several lines.
    _ANSWER_PATTERN = re.compile(
        r"<\|startofpiece\|>(.*?)(?:<\|endofpiece\|>|$)", re.DOTALL
    )

    def __init__(self, webglm_ckpt_path, retriever_ckpt_path, device=None, filter_max_batch_size=400, searcher_name="serpapi") -> None:
        """Load the retriever, tokenizer and half-precision generator.

        Args:
            webglm_ckpt_path: Checkpoint path or hub name of the generator.
            retriever_ckpt_path: Checkpoint path passed to the retriever.
            device: Device the model is moved to; left in place when falsy.
            filter_max_batch_size: Forwarded to the retriever.
            searcher_name: Search backend name forwarded to the retriever.
        """
        self.device = device
        self.ref_retriever = ReferenceRetiever(retriever_ckpt_path, device, filter_max_batch_size, searcher_name)
        self.tokenizer = AutoTokenizer.from_pretrained(webglm_ckpt_path, trust_remote_code=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(webglm_ckpt_path, trust_remote_code=True)
        self.model = self.model.half()
        if device:
            self.model.to(device)
        self.model.eval()

    @staticmethod
    def _build_prompt(question, refs):
        """Render numbered references and the question into the model prompt."""
        parts = []
        for ix, ref in enumerate(refs, start=1):
            txt = ref["text"]
            parts.append(f'Reference [{ix}]: {txt}\\')
        parts.append(f'Question: {question}\\Answer: [gMASK]')
        return ''.join(parts)

    @classmethod
    def _extract_answer(cls, decoded):
        """Return the stripped generated span from the decoded model output.

        Raises:
            RuntimeError: if the output has no start-of-piece marker.
        """
        found = cls._ANSWER_PATTERN.search(decoded)
        if found is None:
            # An explicit error replaces the former assert, which is
            # stripped when Python runs with -O.
            raise RuntimeError("Generated text contains no <|startofpiece|> marker")
        return found.group(1).strip()

    def _generate_answer(self, question, refs):
        """Run generation for ``question`` grounded on ``refs``."""
        inputs = self.tokenizer(self._build_prompt(question, refs), return_tensors="pt")
        inputs = self.tokenizer.build_inputs_for_generation(inputs, max_gen_length=1024)
        if self.device:
            inputs = inputs.to(self.device)
        outputs = self.model.generate(
            **inputs,
            max_length=1024,
            eos_token_id=self.tokenizer.eop_token_id,
            pad_token_id=self.tokenizer.eop_token_id,
        )
        return self._extract_answer(self.tokenizer.decode(outputs[0].tolist()))

    def query(self, question):
        """Answer ``question`` in one shot.

        Returns:
            Dict with ``"answer"`` (str) and ``"references"`` (the retrieved
            passages). Both are empty when nothing was retrieved.
        """
        refs = self.ref_retriever.query(question)
        if not refs:
            return { "references": [], "answer": "" }
        return { "answer": self._generate_answer(question, refs), "references": refs}

    def stream_query(self, question):
        """Answer ``question`` as a generator of partial results.

        Yields ``{"references": refs}`` as soon as retrieval finishes, then
        ``{"answer": text}``. When nothing was retrieved, a single dict with
        empty references and an empty answer is yielded.
        """
        refs = self.ref_retriever.query(question)
        if not refs:
            yield { "references": [], "answer": "" }
            return
        yield { "references": refs }
        yield { "answer": self._generate_answer(question, refs) }
class ReferenceRetiever():
    """Search, fetch, extract and filter pipeline producing references."""

    def __init__(self, retriever_ckpt_path, device=None, filter_max_batch_size=400, searcher="serpapi") -> None:
        """Create the four pipeline stages.

        Args:
            retriever_ckpt_path: Checkpoint path given to ``ReferenceFilter``.
            device: Device given to ``ReferenceFilter``.
            filter_max_batch_size: Batch size limit given to ``ReferenceFilter``.
            searcher: Backend name given to ``create_searcher``.
        """
        self.searcher = create_searcher(searcher)
        self.fetcher = Fetcher()
        self.extractor = Extractor()
        self.filter = ReferenceFilter(retriever_ckpt_path, device, filter_max_batch_size)

    def query(self, question) -> Optional[List[Dict[str, str]]]:
        """Retrieve references for ``question``.

        Each stage prints a progress line. The pipeline stops early and
        returns ``None`` when the search yields no URL, when nothing could
        be fetched, or when no paragraph survives extraction.

        Returns:
            Whatever ``ReferenceFilter.produce_references`` returns for the
            extracted paragraphs (each a dict with ``"url"``, ``"title"``
            and ``"text"``), or ``None`` when a stage came back empty.
        """
        print("[System] Searching ...")
        search_results = self.searcher.search(question)
        urls = [result.url for result in search_results]
        titles = {result.url: result.title for result in search_results}
        print("[System] Count of available urls: ", len(urls))
        if len(urls) == 0:
            print("[System] No available urls. Please check your network connection.")
            return None

        print("[System] Fetching ...")
        fetch_results = self.fetcher.fetch(urls)
        # Total size of everything fetched; zero means no page was obtained.
        cnt = sum(len(fetch_results[key]) for key in fetch_results)
        print("[System] Count of available fetch results: ", cnt)
        if cnt == 0:
            print("[System] No available fetch results. Please check playwright or your network.")
            return None

        print("[System] Extracting ...")
        data_list = []
        for url in fetch_results:
            extract_results = self.extractor.extract_by_html2text(fetch_results[url])
            for value in extract_results:
                data_list.append({
                    "url": url,
                    "title": titles[url],
                    "text": value
                })
        print("[System] Count of paragraphs: ", len(data_list))
        if len(data_list) == 0:
            print("[System] No available paragraphs. The references provide no useful information.")
            return None

        print("[System] Filtering ...")
        # NOTE(review): 5 is presumably the number of references to keep; confirm against ReferenceFilter.
        return self.filter.produce_references(question, data_list, 5)
def extracting(html: str) -> List[str]:
    """Split an HTML document into its non-empty, stripped text lines.

    Newlines in the markup are replaced by spaces before parsing, so the
    line breaks in the result come only from the ``"\n"`` separator that
    ``get_text`` inserts between text nodes. Text is taken from ``<body>``
    when the document has one, otherwise from the whole document.
    """
    soup = BeautifulSoup(html.replace("\n", " "), 'html.parser')
    body = soup.find('body')
    root = body if body else soup
    text = root.get_text("\n")
    stripped = (chunk.strip() for chunk in text.split("\n"))
    return [line for line in stripped if line]
10 | 11 | try: 12 | True 13 | except NameError: 14 | setattr(__builtins__, 'True', 1) 15 | setattr(__builtins__, 'False', 0) 16 | 17 | def has_key(x, y): 18 | if hasattr(x, 'has_key'): return x.has_key(y) 19 | else: return y in x 20 | 21 | try: 22 | import htmlentitydefs 23 | import urlparse 24 | import HTMLParser 25 | except ImportError: #Python3 26 | import html.entities as htmlentitydefs 27 | import urllib.parse as urlparse 28 | import html.parser as HTMLParser 29 | try: #Python3 30 | import urllib.request as urllib 31 | except: 32 | import urllib 33 | import optparse, re, sys, codecs, types 34 | 35 | try: from textwrap import wrap 36 | except: pass 37 | 38 | # Use Unicode characters instead of their ascii psuedo-replacements 39 | UNICODE_SNOB = 0 40 | 41 | # Escape all special characters. Output is less readable, but avoids corner case formatting issues. 42 | ESCAPE_SNOB = 0 43 | 44 | # Put the links after each paragraph instead of at the end. 45 | LINKS_EACH_PARAGRAPH = 0 46 | 47 | # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) 48 | BODY_WIDTH = 78 49 | 50 | # Don't show internal links (href="#local-anchor") -- corresponding link targets 51 | # won't be visible in the plain text file anyway. 
def name2cp(k):
    """Return the Unicode code point for the HTML entity name ``k``.

    ``'apos'`` is handled explicitly. Other names are looked up in
    ``htmlentitydefs.name2codepoint`` when the module provides it, and
    otherwise derived from ``htmlentitydefs.entitydefs``.

    Raises:
        KeyError: if ``k`` is not a known entity name.
    """
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        entity = htmlentitydefs.entitydefs[k]
        # Numeric character references look like "&#8212;". Checking for the
        # "&#" prefix keeps plain characters away from int().
        if entity.startswith("&#") and entity.endswith(";"):
            return int(entity[2:-1]) # not in latin-1
        if isinstance(entity, bytes):
            return ord(codecs.latin_1_decode(entity)[0])
        return ord(entity)
def dumb_css_parser(data):
    """returns a hash of css selectors, each of which contains a hash of css attributes

    ``@import`` statements are removed first. The remaining text is split on
    ``}`` into rules, and each rule of the form ``selector { declarations``
    is stored under its stripped selector, with the declarations parsed by
    ``dumb_property_dict``. A rule that does not contain exactly one ``{``
    (for example a nested ``@media`` block) is skipped on its own, so it no
    longer discards every other rule in the stylesheet.
    """
    # remove @import sentences
    # The appended ';' guarantees every @import has a terminator to cut at.
    data += ';'
    import_index = data.find('@import')
    while import_index != -1:
        data = data[:import_index] + data[data.find(';', import_index) + 1:]
        import_index = data.find('@import')

    elements = {}
    for rule in data.split('}'):
        if '{' not in rule:
            continue
        parts = rule.split('{')
        if len(parts) != 2:
            continue # malformed or nested rule: not that important
        selector, declarations = parts
        elements[selector.strip()] = dumb_property_dict(declarations)

    return elements
def list_numbering_start(attrs):
    """extract numbering from list element attributes

    Returns the value of the ``start`` attribute minus one. Returns 0 when
    the attribute is missing, has no value (``None``) or is not an integer,
    so malformed markup such as ``<ol start="a">`` does not abort the parse.
    """
    if 'start' not in attrs:
        return 0
    try:
        return int(attrs['start']) - 1
    except (TypeError, ValueError):
        # TypeError: valueless attribute (None); ValueError: non-numeric text.
        return 0
= INLINE_LINKS 199 | self.google_list_indent = GOOGLE_LIST_INDENT 200 | self.no_markdown = NO_MARKDOWN 201 | self.ignore_links = IGNORE_ANCHORS 202 | self.ignore_images = IGNORE_IMAGES 203 | self.ignore_emphasis = IGNORE_EMPHASIS 204 | self.google_doc = False 205 | self.ul_item_mark = '*' 206 | self.emphasis_mark = '_' 207 | self.strong_mark = '**' 208 | self.hr_mark = '* * *' 209 | self.blockquote_marks = ('> ', '') 210 | 211 | if out is None: 212 | self.out = self.outtextf 213 | else: 214 | self.out = out 215 | 216 | self.outtextlist = [] # empty list to store output characters before they are "joined" 217 | 218 | try: 219 | self.outtext = unicode() 220 | except NameError: # Python3 221 | self.outtext = str() 222 | 223 | self.quiet = 0 224 | self.p_p = 0 # number of newline character to print before next output 225 | self.outcount = 0 226 | self.start = 1 227 | self.space = 0 228 | self.a = [] 229 | self.astack = [] 230 | self.maybe_automatic_link = None 231 | self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://') 232 | self.acount = 0 233 | self.list = [] 234 | self.blockquote = 0 235 | self.pre = 0 236 | self.startpre = 0 237 | self.code = False 238 | self.br_toggle = '' 239 | self.lastWasNL = 0 240 | self.lastWasList = False 241 | self.style = 0 242 | self.style_def = {} 243 | self.tag_stack = [] 244 | self.emphasis = 0 245 | self.drop_white_space = 0 246 | self.inheader = False 247 | self.abbr_title = None # current abbreviation definition 248 | self.abbr_data = None # last inner HTML (for abbr being defined) 249 | self.abbr_list = {} # stack of abbreviations to write later 250 | self.baseurl = baseurl 251 | self.last_tag_started = None # holds the most recent tag we entered 252 | 253 | try: del unifiable_n[name2cp('nbsp')] 254 | except KeyError: pass 255 | unifiable['nbsp'] = ' _place_holder;' 256 | 257 | def normalise_options(self): 258 | """ Configure options just before handle """ 259 | if self.no_markdown: 260 | # Configure for plain text output 261 | 
self.body_width = 0 262 | self.escape_snob = False 263 | self.ignore_links = True 264 | self.ignore_images = True 265 | self.ignore_emphasis = True 266 | if self.unicode_snob: 267 | self.ul_item_mark = u'\u2013' 268 | self.blockquote_marks = (u'\u201C', u'\u201D') 269 | self.hr_mark = u'\u2014\u2014\u2014' 270 | else: 271 | self.ul_item_mark = '-' 272 | self.blockquote_marks = ('"', '"') 273 | self.hr_mark = '---' 274 | 275 | def feed(self, data): 276 | data = data.replace("' + 'script>", "") 277 | HTMLParser.HTMLParser.feed(self, data) 278 | 279 | def handle(self, data): 280 | self.normalise_options() 281 | self.feed(data) 282 | self.feed(" ") 283 | return self.post_process(self.close()) 284 | 285 | def outtextf(self, s): 286 | self.outtextlist.append(s) 287 | if s: self.lastWasNL = s[-1] == '\n' 288 | 289 | def close(self): 290 | HTMLParser.HTMLParser.close(self) 291 | 292 | self.pbr() 293 | self.o('', 0, 'end') 294 | 295 | self.outtext = self.outtext.join(self.outtextlist) 296 | if self.unicode_snob: 297 | nbsp = unichr(name2cp('nbsp')) 298 | else: 299 | nbsp = u' ' 300 | self.outtext = self.outtext.replace(u' _place_holder;', nbsp) 301 | 302 | return self.outtext 303 | 304 | def handle_charref(self, c): 305 | self.o(self.charref(c), 1) 306 | 307 | def handle_entityref(self, c): 308 | self.o(self.entityref(c), 1) 309 | 310 | def handle_starttag(self, tag, attrs): 311 | self.handle_tag(tag, attrs, 1) 312 | 313 | def handle_endtag(self, tag): 314 | self.handle_tag(tag, None, 0) 315 | 316 | def previousIndex(self, attrs): 317 | """ returns the index of certain set of attributes (of a link) in the 318 | self.a list 319 | 320 | If the set of attributes is not found, returns None 321 | """ 322 | if not has_key(attrs, 'href'): return None 323 | 324 | i = -1 325 | for a in self.a: 326 | i += 1 327 | match = 0 328 | 329 | if has_key(a, 'href') and a['href'] == attrs['href']: 330 | if has_key(a, 'title') or has_key(attrs, 'title'): 331 | if (has_key(a, 'title') and 
has_key(attrs, 'title') and 332 | a['title'] == attrs['title']): 333 | match = True 334 | else: 335 | match = True 336 | 337 | if match: return i 338 | 339 | def drop_last(self, nLetters): 340 | if not self.quiet: 341 | self.outtext = self.outtext[:-nLetters] 342 | 343 | def handle_emphasis(self, start, tag_style, parent_style): 344 | """handles various text emphases""" 345 | tag_emphasis = google_text_emphasis(tag_style) 346 | parent_emphasis = google_text_emphasis(parent_style) 347 | 348 | # handle Google's text emphasis 349 | strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough 350 | bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis 351 | italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis 352 | fixed = google_fixed_width_font(tag_style) and not \ 353 | google_fixed_width_font(parent_style) and not self.pre 354 | 355 | if start: 356 | # crossed-out text must be handled before other attributes 357 | # in order not to output qualifiers unnecessarily 358 | if bold or italic or fixed: 359 | self.emphasis += 1 360 | if strikethrough: 361 | self.quiet += 1 362 | if italic: 363 | self.o(self.emphasis_mark) 364 | self.drop_white_space += 1 365 | if bold: 366 | self.o(self.strong_mark) 367 | self.drop_white_space += 1 368 | if fixed: 369 | self.o('`') 370 | self.drop_white_space += 1 371 | self.code = True 372 | else: 373 | if bold or italic or fixed: 374 | # there must not be whitespace before closing emphasis mark 375 | self.emphasis -= 1 376 | self.space = 0 377 | self.outtext = self.outtext.rstrip() 378 | if fixed: 379 | if self.drop_white_space: 380 | # empty emphasis, drop it 381 | self.drop_last(1) 382 | self.drop_white_space -= 1 383 | else: 384 | self.o('`') 385 | self.code = False 386 | if bold: 387 | if self.drop_white_space: 388 | # empty emphasis, drop it 389 | self.drop_last(2) 390 | self.drop_white_space -= 1 391 | else: 392 | self.o(self.strong_mark) 393 | if italic: 394 | if self.drop_white_space: 395 
| # empty emphasis, drop it 396 | self.drop_last(1) 397 | self.drop_white_space -= 1 398 | else: 399 | self.o(self.emphasis_mark) 400 | # space is only allowed after *all* emphasis marks 401 | if (bold or italic) and not self.emphasis: 402 | self.o(" ") 403 | if strikethrough: 404 | self.quiet -= 1 405 | 406 | def handle_tag(self, tag, attrs, start): 407 | #attrs = fixattrs(attrs) 408 | if attrs is None: 409 | attrs = {} 410 | else: 411 | attrs = dict(attrs) 412 | if start: 413 | self.last_tag_started = tag 414 | 415 | if self.google_doc: 416 | # the attrs parameter is empty for a closing tag. in addition, we 417 | # need the attributes of the parent nodes in order to get a 418 | # complete style description for the current element. we assume 419 | # that google docs export well formed html. 420 | parent_style = {} 421 | if start: 422 | if self.tag_stack: 423 | parent_style = self.tag_stack[-1][2] 424 | tag_style = element_style(attrs, self.style_def, parent_style) 425 | self.tag_stack.append((tag, attrs, tag_style)) 426 | else: 427 | dummy, attrs, tag_style = self.tag_stack.pop() 428 | if self.tag_stack: 429 | parent_style = self.tag_stack[-1][2] 430 | 431 | if hn(tag): 432 | self.p() 433 | if not self.no_markdown: 434 | if start: 435 | self.inheader = True 436 | # self.o(hn(tag)*"#" + ' ') 437 | else: 438 | self.inheader = False 439 | return # prevent redundant emphasis marks on headers 440 | 441 | if tag in ['p', 'div']: 442 | if self.google_doc: 443 | if start and google_has_height(tag_style): 444 | self.p() 445 | else: 446 | self.soft_br() 447 | else: 448 | self.p() 449 | 450 | if tag == "br" and start: 451 | self.o(" \n") 452 | 453 | if tag == "hr" and start: 454 | self.p() 455 | self.o(self.hr_mark) 456 | self.p() 457 | 458 | if tag in ["head", "style", 'script']: 459 | if start: self.quiet += 1 460 | else: self.quiet -= 1 461 | 462 | if tag == "style": 463 | if start: self.style += 1 464 | else: self.style -= 1 465 | 466 | if tag == "body": 467 | self.quiet 
= 0 # sites like 9rules.com never close
468 | 469 | if tag == "blockquote": 470 | if start: 471 | self.p(); 472 | self.o(self.blockquote_marks[0], 0, 1) 473 | self.start = 1 474 | self.blockquote += 1 475 | else: 476 | if self.no_markdown: 477 | # remove whitespace and extra quotes before adding our own quotes 478 | self.rstrip_outtext(all_quotes) 479 | self.o(self.blockquote_marks[1], 0, 1) 480 | self.blockquote -= 1 481 | self.p() 482 | 483 | if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: 484 | self.o(self.emphasis_mark) 485 | if tag in ['strong', 'b'] and not self.ignore_emphasis: 486 | self.o(self.strong_mark) 487 | if tag in ['del', 'strike', 's'] and not self.no_markdown: 488 | if start: 489 | self.o("<"+tag+">") 490 | else: 491 | self.o(""+tag+">") 492 | 493 | if self.google_doc: 494 | if not self.inheader and not self.no_markdown: 495 | # handle some font attributes, but leave headers clean 496 | self.handle_emphasis(start, tag_style, parent_style) 497 | 498 | if tag in ["code", "tt"] and not self.pre: 499 | self.o('`') #TODO: `` `this` `` 500 | if tag == "abbr": 501 | if start: 502 | self.abbr_title = None 503 | self.abbr_data = '' 504 | if has_key(attrs, 'title'): 505 | self.abbr_title = attrs['title'] 506 | else: 507 | if self.abbr_title != None: 508 | self.abbr_list[self.abbr_data] = self.abbr_title 509 | self.abbr_title = None 510 | self.abbr_data = '' 511 | 512 | if tag == "a" and not self.ignore_links: 513 | if start: 514 | if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')): 515 | self.astack.append(attrs) 516 | self.maybe_automatic_link = attrs['href'] 517 | else: 518 | self.astack.append(None) 519 | else: 520 | if self.astack: 521 | a = self.astack.pop() 522 | if self.maybe_automatic_link: 523 | self.maybe_automatic_link = None 524 | elif a: 525 | if self.inline_links: 526 | self.o("](" + escape_md(a['href']) + ")") 527 | else: 528 | i = self.previousIndex(a) 529 | if i is not None: 530 | a = self.a[i] 531 | else: 532 | self.acount += 
1 533 | a['count'] = self.acount 534 | a['outcount'] = self.outcount 535 | self.a.append(a) 536 | self.o("][" + str(a['count']) + "]") 537 | 538 | if tag == "img" and start and not self.ignore_images: 539 | if has_key(attrs, 'src'): 540 | attrs['href'] = attrs['src'] 541 | alt = attrs.get('alt', '') 542 | self.o("![" + escape_md(alt) + "]") 543 | 544 | if self.inline_links: 545 | self.o("(" + escape_md(attrs['href']) + ")") 546 | else: 547 | i = self.previousIndex(attrs) 548 | if i is not None: 549 | attrs = self.a[i] 550 | else: 551 | self.acount += 1 552 | attrs['count'] = self.acount 553 | attrs['outcount'] = self.outcount 554 | self.a.append(attrs) 555 | self.o("[" + str(attrs['count']) + "]") 556 | 557 | if tag == 'dl' and start: self.p() 558 | if tag == 'dt' and not start: self.pbr() 559 | if tag == 'dd' and start: self.o(' ') 560 | if tag == 'dd' and not start: self.pbr() 561 | 562 | if tag in ["ol", "ul"]: 563 | # Google Docs create sub lists as top level lists 564 | if (not self.list) and (not self.lastWasList): 565 | self.p() 566 | if start: 567 | if self.google_doc: 568 | list_style = google_list_style(tag_style) 569 | else: 570 | list_style = tag 571 | numbering_start = list_numbering_start(attrs) 572 | self.list.append({'name':list_style, 'num':numbering_start}) 573 | else: 574 | if self.list: self.list.pop() 575 | self.lastWasList = True 576 | else: 577 | self.lastWasList = False 578 | 579 | if tag == 'li': 580 | self.pbr() 581 | if start: 582 | if self.list: li = self.list[-1] 583 | else: li = {'name':'ul', 'num':0} 584 | if self.google_doc: 585 | nest_count = self.google_nest_count(tag_style) 586 | else: 587 | nest_count = len(self.list) 588 | self.o(" " * nest_count) #TODO: line upstuff... 
640 | data = "\n" + data 641 | 642 | if puredata and self.last_tag_started == 'blockquote' and self.no_markdown: 643 | data = data.lstrip(' \t\n\r'+all_quotes) 644 | 645 | bq = '' 646 | if not self.no_markdown: 647 | bq = (">" * self.blockquote) 648 | if not (force and data and data[0] == ">") and self.blockquote: 649 | bq += " " 650 | 651 | if self.pre: 652 | if not self.list: 653 | bq += " " 654 | #else: list content is already partially indented 655 | for i in range(len(self.list)): 656 | bq += " " 657 | data = data.replace("\n", "\n"+bq) 658 | 659 | if self.startpre: 660 | self.startpre = 0 661 | if self.list: 662 | data = data.lstrip("\n") # use existing initial indentation 663 | 664 | if self.start: 665 | self.space = 0 666 | self.p_p = 0 667 | self.start = 0 668 | 669 | if force == 'end': 670 | # It's the end. 671 | self.p_p = 0 672 | self.out("\n") 673 | self.space = 0 674 | 675 | if self.p_p: 676 | self.out((self.br_toggle+'\n'+bq)*self.p_p) 677 | self.space = 0 678 | self.br_toggle = '' 679 | 680 | if self.space: 681 | if not self.lastWasNL: self.out(' ') 682 | self.space = 0 683 | 684 | if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"): 685 | if force == "end": self.out("\n") 686 | 687 | newa = [] 688 | for link in self.a: 689 | if self.outcount > link['outcount']: 690 | self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href'])) 691 | if has_key(link, 'title'): self.out(" ("+link['title']+")") 692 | self.out("\n") 693 | else: 694 | newa.append(link) 695 | 696 | if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. 
def google_nest_count(self, style):
    """Calculate the nesting level of a Google Docs list item.

    The depth is derived from the ``margin-left`` entry of ``style``: its
    numeric part is divided by ``self.google_list_indent``.

    Args:
        style: mapping of CSS property names to their string values.

    Returns:
        The nesting depth as an ``int``; ``0`` when ``margin-left`` is absent.
    """
    nest_count = 0
    if 'margin-left' in style:
        # Drop the last two characters (presumably a unit suffix such as
        # "px"/"pt") before parsing the number.
        # Floor division keeps the result an int on Python 3 as well: the
        # caller uses it as a string repetition count (`" " * nest_count`),
        # which raises TypeError for a float.
        nest_count = int(style['margin-left'][:-2]) // self.google_list_indent
    return nest_count
# dot 830 | (?=\s) # lookahead assert whitespace 831 | """, re.MULTILINE | re.VERBOSE) 832 | md_plus_matcher = re.compile(r""" 833 | ^ 834 | (\s*) 835 | (\+) 836 | (?=\s) 837 | """, flags=re.MULTILINE | re.VERBOSE) 838 | md_dash_matcher = re.compile(r""" 839 | ^ 840 | (\s*) 841 | (-) 842 | (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr) 843 | # or another dash (header or hr) 844 | """, flags=re.MULTILINE | re.VERBOSE) 845 | slash_chars = r'\`*_{}[]()#+-.!' 846 | md_backslash_matcher = re.compile(r''' 847 | (\\) # match one slash 848 | (?=[%s]) # followed by a char that requires escaping 849 | ''' % re.escape(slash_chars), 850 | flags=re.VERBOSE) 851 | 852 | def skipwrap(para): 853 | # If the text begins with four spaces or one tab, it's a code block; don't wrap 854 | if para[0:4] == ' ' or para[0] == '\t': 855 | return True 856 | # If the text begins with only two "--", possibly preceded by whitespace, that's 857 | # an emdash; so wrap. 858 | stripped = para.lstrip() 859 | if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-": 860 | return False 861 | # I'm not sure what this is for; I thought it was to detect lists, but there's 862 | # a
def main():
    """Command-line entry point: convert HTML to text and write it to stdout.

    Usage: ``prog [(filename|url) [encoding]]``. With no positional argument
    the HTML is read from standard input. Parsed options are copied onto an
    ``HTML2Text`` instance before the conversion runs.
    """
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("--no-markdown", dest="no_markdown", action="store_true",
        default=NO_MARKDOWN, help="don't use markdown syntax and display nicely as plain text")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    # NOTE(review): encoding starts as "utf-8", so the `encoding is None`
    # auto-detection branches below are currently unreachable. They are kept
    # unchanged so enabling detection later only needs a different default.
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith(('http://', 'https://')):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            # Context manager so the file handle is closed deterministically.
            with open(file_, 'rb') as f:
                data = f.read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        # On Python 3 sys.stdin.read() returns text, which has no .decode();
        # read raw bytes from the underlying buffer when one is available.
        data = getattr(sys.stdin, 'buffer', sys.stdin).read()

    # Only byte strings need decoding (stdin may already have produced text).
    if isinstance(data, bytes):
        data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    # google_nest_count() reads `google_list_indent`; assigning only
    # `list_indent` left the -i/--google-list-indent option without effect.
    h.google_list_indent = options.list_indent
    h.list_indent = options.list_indent  # kept for backward compatibility
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.no_markdown = options.no_markdown
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
class Fetcher:
    """Download the raw text of a batch of URLs through the playwright crawler."""

    def __init__(self) -> None:
        self.loop = asyncio.get_event_loop()
        # TODO delete loop -> loop.close()

    def _pre_handle_urls(self, urls: List[str]) -> List[str]:
        """Normalise and de-duplicate `urls`, keeping first-seen order.

        A URL without an explicit ``http://`` or ``https://`` scheme gets
        ``http://`` prepended. A URL is skipped when it, or its ``http://`` /
        ``https://`` prefixed form, has already been kept.
        """
        seen = set()  # O(1) membership instead of scanning the output list
        urls_new = []
        for url in urls:
            if url in seen or "http://%s" % url in seen or "https://%s" % url in seen:
                continue
            # Test for the full scheme: a bare "http" prefix would also match
            # scheme-less hosts such as "httpbin.org".
            if not url.startswith(("http://", "https://")):
                url = "http://%s" % url
            seen.add(url)
            urls_new.append(url)
        return urls_new

    def fetch(self, urls: List[str]) -> Dict[str, str]:
        """Fetch `urls` and return a mapping of URL to page text.

        URLs whose fetch produced no text are left out of the result.
        """
        urls = self._pre_handle_urls(urls)
        if not urls:
            # Nothing to crawl; also avoids handing asyncio an empty task set.
            return {}

        self.loop.run_until_complete(get_raw_pages(urls, close_browser=True))
        # Look the dict up on the module after the crawl: the crawler rebinds
        # its module-level `results` for every batch.
        fetched = playwright_based_crawl_new.results

        ret = dict()
        for url in urls:
            resp = fetched.get(url)
            # Each entry is a (status, text) pair; keep only non-empty text.
            if resp and resp[1]:
                ret[url] = resp[1]

        return ret
async def get_raw_pages_(context, urls):
    """Fetch every URL in `urls` concurrently.

    Resets the module-level `results` dict, then runs one `one_page_handle`
    task per URL and waits up to 10 seconds for them to finish.
    """
    global results
    results = {}
    if not urls:
        # asyncio.wait() raises ValueError when given an empty collection.
        return

    # Wrap each fetch in its own task so they run concurrently.
    tasks = [asyncio.create_task(one_page_handle(context, url)) for url in urls]

    _, pending = await asyncio.wait(tasks, timeout=10)

    # asyncio.wait() does not cancel tasks on timeout. Cancel the stragglers
    # so they cannot write into `results` after the next batch has reset it.
    for task in pending:
        task.cancel()
    if pending:
        await asyncio.gather(*pending, return_exceptions=True)
def get_query_embeddings(self, sentences: List[str]) -> torch.Tensor:
    """Embed `sentences` with the query encoder.

    The sentences are tokenized, moved to `self.device` and run through
    `self.query_encoder` without gradient tracking. The first encoder output
    is then averaged over the positions where the attention mask is set.
    """
    torch.cuda.empty_cache()
    with torch.no_grad():
        # Tokenize, then move every tensor of the batch to the target device.
        encoded = self.tokenizer(sentences, padding=True,
                                 truncation=True, return_tensors='pt')
        batch = {name: encoded[name].to(self.device) for name in encoded}
        hidden = self.query_encoder(**batch)[0]

        # Mean pooling: zero the masked-out positions, then divide the sum
        # by the number of positions the mask keeps.
        attention = batch["attention_mask"]
        keep = attention[..., None].bool()
        hidden = hidden.masked_fill(~keep, 0.)
        totals = hidden.sum(dim=1)
        counts = attention.sum(dim=1)[..., None]
        pooled = totals / counts
    return pooled
def select_topk(self, query: str, documents: List[str], k=1):
    """
    Score `documents` against `query` in chunks of `self.max_batch_size`
    and return the `k` highest-scoring ones (fewer if there are fewer documents).

    Returns:
        `ret`: `torch.return_types.topk`, use `ret.values` or `ret.indices` to get value or index tensor
    """
    if not documents:
        # Concatenating an empty list of tensors raises; hand back an empty
        # top-k result so callers can index `.values` / `.indices` as usual.
        return torch.empty(0).topk(0)

    step = self.max_batch_size
    # Score chunk by chunk and collect the scores on the CPU.
    scores = torch.cat([
        self.score_documents_on_query(query, documents[start:start + step]).to('cpu')
        for start in range(0, len(documents), step)
    ])
    return scores.topk(min(k, len(scores)))
def get_bing_search_raw_page(question: str):
    """Search Bing for `question` and scrape titles and links from the result page.

    Launches Chromium through Playwright, opens the Bing results page and
    collects every `.b_algo h2` entry that contains a link with an `href`.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts (possibly empty).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote_plus

    results = []
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            context = browser.new_context()
            page = context.new_page()
            try:
                # Percent-encode the question: raw '#', '&' or '+' characters
                # would otherwise truncate or split the `q` parameter.
                page.goto(f"https://www.bing.com/search?q={quote_plus(question)}")
            except Exception:
                # Fallback: open the home page and submit the search form.
                page.goto(f"https://www.bing.com")
                page.fill('input[name="q"]', question)
                page.press('input[name="q"]', 'Enter')
            try:
                page.wait_for_load_state('networkidle', timeout=3000)
            except Exception:
                # Best effort: scrape whatever has loaded after the timeout.
                pass
            for result in page.query_selector_all('.b_algo h2'):
                title = result.inner_text()
                a_tag = result.query_selector('a')
                if not a_tag:
                    continue
                url = a_tag.get_attribute('href')
                if not url:
                    continue
                results.append({
                    'title': title,
                    'url': url
                })
        finally:
            # Close the browser even when scraping raises.
            browser.close()
    return results
def serp_api(query: str, api_key: str, timeout=None):
    """Query Google through SerpApi and return the organic results.

    Args:
        query: search string.
        api_key: SerpApi key.
        timeout: optional timeout in seconds passed to `requests.get`;
            `None` (the default) keeps the previous behaviour of not
            setting one.

    Returns:
        A list of `SearchResult` built from the organic results that carry
        all of `title`, `link` and `snippet`.

    Raises:
        Exception: if SerpApi answers with a status code other than 200.
    """
    params = {
        "engine": "google",
        "q": query,
        "api_key": api_key
    }
    resp = requests.get("https://serpapi.com/search", params=params, timeout=timeout)
    if resp.status_code != 200:
        raise Exception("Serpapi returned %d\n%s"%(resp.status_code, resp.text))
    result = resp.json()
    # Treat a missing 'organic_results' key as "no results" instead of
    # raising KeyError (presumably the case for queries without organic
    # hits - confirm against the SerpApi response format).
    return [
        SearchResult(item['title'], item['link'], item['snippet'])
        for item in result.get('organic_results', [])
        if "title" in item and "link" in item and "snippet" in item
    ]
after 29 | afterwards 30 | again 31 | against 32 | ah 33 | ain't 34 | all 35 | allow 36 | allows 37 | almost 38 | alone 39 | along 40 | already 41 | also 42 | although 43 | always 44 | am 45 | among 46 | amongst 47 | an 48 | and 49 | announce 50 | another 51 | any 52 | anybody 53 | anyhow 54 | anymore 55 | anyone 56 | anything 57 | anyway 58 | anyways 59 | anywhere 60 | apart 61 | apparently 62 | appear 63 | appreciate 64 | appropriate 65 | approximately 66 | are 67 | area 68 | areas 69 | aren 70 | aren't 71 | arent 72 | arise 73 | around 74 | as 75 | aside 76 | ask 77 | asked 78 | asking 79 | asks 80 | associated 81 | at 82 | auth 83 | available 84 | away 85 | awfully 86 | b 87 | back 88 | backed 89 | backing 90 | backs 91 | be 92 | became 93 | because 94 | become 95 | becomes 96 | becoming 97 | been 98 | before 99 | beforehand 100 | began 101 | begin 102 | beginning 103 | beginnings 104 | begins 105 | behind 106 | being 107 | beings 108 | believe 109 | below 110 | beside 111 | besides 112 | best 113 | better 114 | between 115 | beyond 116 | big 117 | biol 118 | both 119 | brief 120 | briefly 121 | but 122 | by 123 | c 124 | c'mon 125 | c's 126 | ca 127 | came 128 | can 129 | can't 130 | cannot 131 | cant 132 | case 133 | cases 134 | cause 135 | causes 136 | certain 137 | certainly 138 | changes 139 | clear 140 | clearly 141 | co 142 | com 143 | come 144 | comes 145 | concerning 146 | consequently 147 | consider 148 | considering 149 | contain 150 | containing 151 | contains 152 | corresponding 153 | could 154 | couldn't 155 | couldnt 156 | course 157 | currently 158 | d 159 | date 160 | definitely 161 | describe 162 | described 163 | despite 164 | did 165 | didn't 166 | differ 167 | different 168 | differently 169 | discuss 170 | do 171 | does 172 | doesn't 173 | doing 174 | don't 175 | done 176 | down 177 | downed 178 | downing 179 | downs 180 | downwards 181 | due 182 | during 183 | e 184 | each 185 | early 186 | ed 187 | edu 188 | effect 189 | eg 190 | eight 
191 | eighty 192 | either 193 | else 194 | elsewhere 195 | end 196 | ended 197 | ending 198 | ends 199 | enough 200 | entirely 201 | especially 202 | et 203 | et-al 204 | etc 205 | even 206 | evenly 207 | ever 208 | every 209 | everybody 210 | everyone 211 | everything 212 | everywhere 213 | ex 214 | exactly 215 | example 216 | except 217 | f 218 | face 219 | faces 220 | fact 221 | facts 222 | far 223 | felt 224 | few 225 | ff 226 | fifth 227 | find 228 | finds 229 | first 230 | five 231 | fix 232 | followed 233 | following 234 | follows 235 | for 236 | former 237 | formerly 238 | forth 239 | found 240 | four 241 | from 242 | full 243 | fully 244 | further 245 | furthered 246 | furthering 247 | furthermore 248 | furthers 249 | g 250 | gave 251 | general 252 | generally 253 | get 254 | gets 255 | getting 256 | give 257 | given 258 | gives 259 | giving 260 | go 261 | goes 262 | going 263 | gone 264 | good 265 | goods 266 | got 267 | gotten 268 | great 269 | greater 270 | greatest 271 | greetings 272 | group 273 | grouped 274 | grouping 275 | groups 276 | h 277 | had 278 | hadn't 279 | happens 280 | hardly 281 | has 282 | hasn't 283 | have 284 | haven't 285 | having 286 | he 287 | he's 288 | hed 289 | hello 290 | help 291 | hence 292 | her 293 | here 294 | here's 295 | hereafter 296 | hereby 297 | herein 298 | heres 299 | hereupon 300 | hers 301 | herself 302 | hes 303 | hi 304 | hid 305 | high 306 | higher 307 | highest 308 | him 309 | himself 310 | his 311 | hither 312 | home 313 | hopefully 314 | how 315 | howbeit 316 | however 317 | hundred 318 | i 319 | i'd 320 | i'll 321 | i'm 322 | i've 323 | id 324 | ie 325 | if 326 | ignored 327 | im 328 | immediate 329 | immediately 330 | importance 331 | important 332 | in 333 | inasmuch 334 | inc 335 | include 336 | indeed 337 | index 338 | indicate 339 | indicated 340 | indicates 341 | information 342 | inner 343 | insofar 344 | instead 345 | interest 346 | interested 347 | interesting 348 | interests 349 | into 350 | 
invention 351 | inward 352 | is 353 | isn't 354 | it 355 | it'd 356 | it'll 357 | it's 358 | itd 359 | its 360 | itself 361 | j 362 | just 363 | k 364 | keep 365 | keeps 366 | kept 367 | keys 368 | kg 369 | kind 370 | km 371 | knew 372 | know 373 | known 374 | knows 375 | l 376 | large 377 | largely 378 | last 379 | lately 380 | later 381 | latest 382 | latter 383 | latterly 384 | least 385 | less 386 | lest 387 | let 388 | let's 389 | lets 390 | like 391 | liked 392 | likely 393 | line 394 | little 395 | long 396 | longer 397 | longest 398 | look 399 | looking 400 | looks 401 | ltd 402 | m 403 | made 404 | mainly 405 | make 406 | makes 407 | making 408 | man 409 | many 410 | may 411 | maybe 412 | me 413 | mean 414 | means 415 | meantime 416 | meanwhile 417 | member 418 | members 419 | men 420 | merely 421 | mg 422 | might 423 | million 424 | miss 425 | ml 426 | more 427 | moreover 428 | most 429 | mostly 430 | mr 431 | mrs 432 | much 433 | mug 434 | must 435 | my 436 | myself 437 | n 438 | n't 439 | na 440 | name 441 | namely 442 | nay 443 | nd 444 | near 445 | nearly 446 | necessarily 447 | necessary 448 | need 449 | needed 450 | needing 451 | needs 452 | neither 453 | never 454 | nevertheless 455 | new 456 | newer 457 | newest 458 | next 459 | nine 460 | ninety 461 | no 462 | nobody 463 | non 464 | none 465 | nonetheless 466 | noone 467 | nor 468 | normally 469 | nos 470 | not 471 | noted 472 | nothing 473 | novel 474 | now 475 | nowhere 476 | number 477 | numbers 478 | o 479 | obtain 480 | obtained 481 | obviously 482 | of 483 | off 484 | often 485 | oh 486 | ok 487 | okay 488 | old 489 | older 490 | oldest 491 | omitted 492 | on 493 | once 494 | one 495 | ones 496 | only 497 | onto 498 | open 499 | opened 500 | opening 501 | opens 502 | or 503 | ord 504 | order 505 | ordered 506 | ordering 507 | orders 508 | other 509 | others 510 | otherwise 511 | ought 512 | our 513 | ours 514 | ourselves 515 | out 516 | outside 517 | over 518 | overall 519 | owing 520 | own 
521 | p 522 | page 523 | pages 524 | part 525 | parted 526 | particular 527 | particularly 528 | parting 529 | parts 530 | past 531 | per 532 | perhaps 533 | place 534 | placed 535 | places 536 | please 537 | plus 538 | point 539 | pointed 540 | pointing 541 | points 542 | poorly 543 | possible 544 | possibly 545 | potentially 546 | pp 547 | predominantly 548 | present 549 | presented 550 | presenting 551 | presents 552 | presumably 553 | previously 554 | primarily 555 | probably 556 | problem 557 | problems 558 | promptly 559 | proud 560 | provides 561 | put 562 | puts 563 | q 564 | que 565 | quickly 566 | quite 567 | qv 568 | r 569 | ran 570 | rather 571 | rd 572 | re 573 | readily 574 | really 575 | reasonably 576 | recent 577 | recently 578 | ref 579 | refs 580 | regarding 581 | regardless 582 | regards 583 | related 584 | relatively 585 | research 586 | respectively 587 | resulted 588 | resulting 589 | results 590 | right 591 | room 592 | rooms 593 | run 594 | s 595 | said 596 | same 597 | saw 598 | say 599 | saying 600 | says 601 | sec 602 | second 603 | secondly 604 | seconds 605 | section 606 | see 607 | seeing 608 | seem 609 | seemed 610 | seeming 611 | seems 612 | seen 613 | sees 614 | self 615 | selves 616 | sensible 617 | sent 618 | serious 619 | seriously 620 | seven 621 | several 622 | shall 623 | she 624 | she'll 625 | shed 626 | shes 627 | should 628 | shouldn't 629 | show 630 | showed 631 | showing 632 | shown 633 | showns 634 | shows 635 | side 636 | sides 637 | significant 638 | significantly 639 | similar 640 | similarly 641 | since 642 | six 643 | slightly 644 | small 645 | smaller 646 | smallest 647 | so 648 | some 649 | somebody 650 | somehow 651 | someone 652 | somethan 653 | something 654 | sometime 655 | sometimes 656 | somewhat 657 | somewhere 658 | soon 659 | sorry 660 | specifically 661 | specified 662 | specify 663 | specifying 664 | state 665 | states 666 | still 667 | stop 668 | strongly 669 | sub 670 | substantially 671 | 
successfully 672 | such 673 | sufficiently 674 | suggest 675 | sup 676 | sure 677 | t 678 | t's 679 | take 680 | taken 681 | taking 682 | tell 683 | tends 684 | th 685 | than 686 | thank 687 | thanks 688 | thanx 689 | that 690 | that'll 691 | that's 692 | that've 693 | thats 694 | the 695 | their 696 | theirs 697 | them 698 | themselves 699 | then 700 | thence 701 | there 702 | there'll 703 | there's 704 | there've 705 | thereafter 706 | thereby 707 | thered 708 | therefore 709 | therein 710 | thereof 711 | therere 712 | theres 713 | thereto 714 | thereupon 715 | these 716 | they 717 | they'd 718 | they'll 719 | they're 720 | they've 721 | theyd 722 | theyre 723 | thing 724 | things 725 | think 726 | thinks 727 | third 728 | this 729 | thorough 730 | thoroughly 731 | those 732 | thou 733 | though 734 | thoughh 735 | thought 736 | thoughts 737 | thousand 738 | three 739 | throug 740 | through 741 | throughout 742 | thru 743 | thus 744 | til 745 | tip 746 | to 747 | today 748 | together 749 | too 750 | took 751 | toward 752 | towards 753 | tried 754 | tries 755 | truly 756 | try 757 | trying 758 | ts 759 | turn 760 | turned 761 | turning 762 | turns 763 | twice 764 | two 765 | u 766 | un 767 | under 768 | unfortunately 769 | unless 770 | unlike 771 | unlikely 772 | until 773 | unto 774 | up 775 | upon 776 | ups 777 | us 778 | use 779 | used 780 | useful 781 | usefully 782 | usefulness 783 | uses 784 | using 785 | usually 786 | uucp 787 | v 788 | value 789 | various 790 | very 791 | via 792 | viz 793 | vol 794 | vols 795 | vs 796 | w 797 | want 798 | wanted 799 | wanting 800 | wants 801 | was 802 | wasn't 803 | way 804 | ways 805 | we 806 | we'd 807 | we'll 808 | we're 809 | we've 810 | wed 811 | welcome 812 | well 813 | wells 814 | went 815 | were 816 | weren't 817 | what 818 | what'll 819 | what's 820 | whatever 821 | whats 822 | when 823 | whence 824 | whenever 825 | where 826 | where's 827 | whereafter 828 | whereas 829 | whereby 830 | wherein 831 | wheres 832 | 
whereupon 833 | wherever 834 | whether 835 | which 836 | while 837 | whim 838 | whither 839 | who 840 | who'll 841 | who's 842 | whod 843 | whoever 844 | whole 845 | whom 846 | whomever 847 | whos 848 | whose 849 | why 850 | widely 851 | will 852 | willing 853 | wish 854 | with 855 | within 856 | without 857 | won't 858 | wonder 859 | words 860 | work 861 | worked 862 | working 863 | works 864 | world 865 | would 866 | wouldn't 867 | www 868 | x 869 | y 870 | year 871 | years 872 | yes 873 | yet 874 | you 875 | you'd 876 | you'll 877 | you're 878 | you've 879 | youd 880 | young 881 | younger 882 | youngest 883 | your 884 | youre 885 | yours 886 | yourself 887 | yourselves 888 | z 889 | zero 890 | zt 891 | zz 892 | ! 893 | """" 894 | # 895 | $ 896 | % 897 | & 898 | ' 899 | ( 900 | ) 901 | * 902 | + 903 | "," 904 | - 905 | . 906 | / 907 | : 908 | ; 909 | < 910 | = 911 | > 912 | ? 913 | @ 914 | [ 915 | \ 916 | ] 917 | ^ 918 | _ 919 | ` 920 | { 921 | | 922 | } 923 | ~ -------------------------------------------------------------------------------- /model/stopwords/explaination: -------------------------------------------------------------------------------- 1 | reason 2 | reasons -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | import re, os 2 | from rouge_score import rouge_scorer, tokenize 3 | 4 | class DataUtils: 5 | @staticmethod 6 | def split_segments(statement: str): 7 | all_statements = [] 8 | statement = re.sub(' +', ' ', statement.replace('\n', ' ')) 9 | split_pattern = r'(? 
@staticmethod
def get_ideal_citations(all_scores, raw_citations, citation_threshold, extra_bonus=0.3):
    """Choose, for every segment, the reference indices it should cite.

    Args:
        all_scores: One list per segment; entry ``i`` is that segment's
            score against reference ``i``.
        raw_citations: One list per segment holding the 0-based reference
            indices that were attached to the segment before correction.
        citation_threshold: Minimum bonus-adjusted score a reference needs
            in order to be cited.
        extra_bonus: Bonus split equally among the references a segment
            already cites, added to their scores before thresholding.

    Returns:
        A list with one list of 0-based reference indices per segment.
        A segment that had raw citations but no reference reaching the
        threshold falls back to its single best-scoring reference.  A
        segment with no scores at all (no references) gets no citation.
    """
    assert len(all_scores) == len(raw_citations)

    ideal_citations = []
    for scores, cited in zip(all_scores, raw_citations):
        chosen = []
        best_idx = 0
        best_scr = 0
        for idx, score in enumerate(scores):
            if idx in cited:
                # References the segment already cites get a head start.
                score += extra_bonus / len(cited)
            if score >= citation_threshold:
                chosen.append(idx)
            if score > best_scr:
                best_idx = idx
                best_scr = score
        # Keep a previously cited segment cited, but only when there is at
        # least one reference to point at: with an empty score list,
        # best_idx is still its initial 0 and would name a reference that
        # does not exist.
        if len(chosen) == 0 and len(cited) > 0 and len(scores) > 0:
            chosen.append(best_idx)
        ideal_citations.append(chosen)
    return ideal_citations
#!/bin/bash
# Run evaluate.py on the TriviaQA task with the WebGLM generator checkpoint.
#
# The sourced config is presumably what defines GENERATOR_CHECKPOINT_PATH
# (it is not set anywhere in this script) -- verify against configs/.
script_path=$(realpath "$0")
script_dir=$(dirname "$script_path")
main_dir=$(dirname "$script_dir")

source "${main_dir}/configs/model_webglm.sh"

# The dataset file in the repository is data/trivia_qa.jsonl (with an
# underscore); the previous value "data/triviaqa.jsonl" does not exist.
DATA_PATH="data/trivia_qa.jsonl"

run_cmd="python ${main_dir}/evaluate.py \
    --webglm_ckpt_path $GENERATOR_CHECKPOINT_PATH \
    --task triviaqa \
    --evaluate_task_data_path $DATA_PATH"

eval ${run_cmd}
class QuestionReferenceDensity(torch.nn.Module):
    """Dual-encoder scorer with separate question and reference encoders.

    Both encoders are initialised from "facebook/contriever-msmarco".
    ``forward`` returns question-vs-positive and question-vs-negative
    similarity matrices; ``loss``, ``num_correct`` and ``acc`` consume them.
    """

    def __init__(self, temp=None) -> None:
        """Load both encoders and print the total parameter count.

        Args:
            temp: Optional temperature that divides the question embedding.
                When ``None`` (the default) ``forward`` falls back to the
                module-level ``args.temp``, as before.
        """
        super().__init__()
        self.temp = temp
        self.question_encoder = AutoModel.from_pretrained("facebook/contriever-msmarco")
        self.reference_encoder = AutoModel.from_pretrained("facebook/contriever-msmarco")

        total = sum(param.nelement() for param in self.parameters())
        print("Number of parameter: %.2fM" % (total / 1e6))

    def mean_pooling(self, token_embeddings, mask):
        """Average token embeddings over dim 1, ignoring masked positions.

        Positions where ``mask`` is zero are filled with 0 before summing,
        and the sum is divided by the number of unmasked positions.
        """
        # assumes token_embeddings is (batch, seq, dim) and mask is
        # (batch, seq) -- TODO confirm against the encoder outputs
        token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
        return token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]

    def forward(self, question, pos, neg):
        """Return ``(l_pos, l_neg)`` similarity matrices for one batch.

        Args:
            question, pos, neg: Tokenizer outputs (each must provide
                "attention_mask") for the questions, positive references
                and negative references.

        Returns:
            ``l_pos[i, j]`` is the dot product of question ``i`` (scaled by
            1/temp) with positive reference ``j``; ``l_neg`` is the same
            against the negative references.
        """
        # Explicit constructor value wins; otherwise keep the original
        # behaviour of reading the training script's global namespace.
        temp = self.temp if self.temp is not None else args.temp

        q = self.question_encoder(**question)
        r_pos = self.reference_encoder(**pos)
        r_neg = self.reference_encoder(**neg)

        cls_q = self.mean_pooling(q[0], question["attention_mask"]) / temp
        cls_r_pos = self.mean_pooling(r_pos[0], pos["attention_mask"])
        cls_r_neg = self.mean_pooling(r_neg[0], neg["attention_mask"])

        l_pos = torch.matmul(cls_q, torch.transpose(cls_r_pos, 0, 1))
        l_neg = torch.matmul(cls_q, torch.transpose(cls_r_neg, 0, 1))
        return l_pos, l_neg

    @staticmethod
    def loss(l_pos, l_neg):
        """Cross-entropy over ``[l_pos | l_neg]`` with target ``i`` for row ``i``.

        The targets are created on ``l_pos.device`` so the method does not
        depend on a module-level ``args`` object.
        """
        targets = torch.arange(0, len(l_pos), dtype=torch.long, device=l_pos.device)
        return torch.nn.functional.cross_entropy(torch.cat([l_pos, l_neg], dim=1), targets)

    @staticmethod
    def num_correct(l_pos, l_neg):
        """Count rows whose diagonal ``l_pos`` entry exceeds the diagonal ``l_neg`` entry."""
        return (torch.diag(l_pos) > torch.diag(l_neg)).sum()

    @staticmethod
    def acc(l_pos, l_neg):
        """Return ``num_correct`` divided by the number of rows in ``l_pos``."""
        return QuestionReferenceDensity.num_correct(l_pos, l_neg) / len(l_pos)
def collate(data):
    """Tokenize one batch of training examples and move it to ``args.device``.

    Every item in ``data`` is read for its "question",
    "positive_reference" and "negative_reference" entries.  Uses the
    module-level ``tokenizer`` and ``args``.

    Returns:
        A ``(question, positive_reference, negative_reference)`` tuple of
        tokenizer outputs whose values have been moved to ``args.device``.
    """
    def encode(field):
        # Padded, truncated PyTorch tensors for one text field of the batch.
        batch = tokenizer(
            [item[field] for item in data],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        for key in batch:
            batch[key] = batch[key].to(args.device)
        return batch

    return (
        encode("question"),
        encode("positive_reference"),
        encode("negative_reference"),
    )
if __name__ == "__main__":
    # Command-line options.  The parsed namespace ends up in the
    # module-level ``args`` that the model, loss and collate function read.
    args = argparse.ArgumentParser()
    for _flag, _type, _default in (
        ("--max_epoch", int, 3),
        ("--eval_step", int, 40),
        ("--save_step", int, 40),
        ("--print_step", int, 40),
        ("--device", str, "cuda"),
        ("--temp", float, 0.05),
        ("--train_batch_size", int, 64),
        ("--eval_batch_size", int, 32),
        ("--lr", float, 1e-6),
        ("--warmup", int, 100),
        ("--total", int, 1000),
        ("--ratio", float, 0.0),
        ("--save_dir", str, "./retriever_runs"),
    ):
        args.add_argument(_flag, type=_type, default=_default)
    args.add_argument("--train_data_dir", type=str, required=True)

    args = args.parse_args()

    # Each run writes its checkpoints under a timestamped sub-directory.
    log_dir = os.path.join(args.save_dir, time.strftime("%Y%m%d-%H%M%S"))

    # Datasets are read from "<train_data_dir>/train" and "<train_data_dir>/eval".
    train_set = load_from_disk(os.path.join(args.train_data_dir, "train"))
    eval_set = load_from_disk(os.path.join(args.train_data_dir, "eval"))

    tokenizer = AutoTokenizer.from_pretrained("facebook/contriever-msmarco")
    train_loader = DataLoader(train_set, batch_size=args.train_batch_size, collate_fn=collate)
    eval_loader = DataLoader(eval_set, batch_size=args.eval_batch_size, collate_fn=collate)

    model = QuestionReferenceDensity().to(args.device)
    opt = AdamW(
        model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0.01,
    )
    scheduler_args = dict(warmup=args.warmup, total=args.total, ratio=args.ratio)
    scheduler = WarmupLinearScheduler(opt, **scheduler_args)
    temp = args.temp

    train(
        max_epoch=args.max_epoch,
        eval_step=args.eval_step,
        save_step=args.save_step,
        print_step=args.print_step,
    )
46 | 47 | yield answer, "" 48 | 49 | for resp in webglm.stream_query(query): 50 | if "references" in resp: 51 | refs = resp["references"] 52 | if "answer" in resp: 53 | answer = resp["answer"] 54 | answer = citation_correction(answer, [ref['text'] for ref in refs]) 55 | yield answer, "31 | [{index}] {title} 32 | 33 | 34 | 35 | 36 |
37 |{text}
38 |References (Click to Expand)
" + "\n".join([ref_html.format(**item, index = idx + 1) for idx, item in enumerate(refs)]) 56 | 57 | if __name__ == '__main__': 58 | 59 | arg = argparse.ArgumentParser() 60 | add_model_config_args(arg) 61 | args = arg.parse_args() 62 | 63 | webglm = load_model(args) 64 | 65 | with gr.Blocks(theme=gr.themes.Base(), css=CSS) as demo: 66 | 67 | with gr.Column(elem_id='col'): 68 | gr.Markdown( 69 | """ 70 | # WebGLM Demo 71 | """) 72 | with gr.Row(): 73 | # with gr.Column(scale=8): 74 | query_box = gr.Textbox(show_label=False, placeholder="Enter question and press ENTER").style(container=False) 75 | # with gr.Column(scale=1, min_width=60): 76 | # query_button = gr.Button('Query') 77 | 78 | answer_box = gr.Textbox(show_label=False, value='', lines=5) 79 | 80 | # with gr.Box(): 81 | ref_boxes = gr.HTML(label="References") 82 | 83 | # with gr.Column() as refs_col: 84 | # ref_boxes = [] 85 | # for i in range(TOTAL_NUM): 86 | # ref_boxes.append(gr.Textbox(f"Textbox {i}", visible=False)) 87 | 88 | query_box.submit(query, query_box, [answer_box, ref_boxes]) 89 | # query_button.click(query, query_box, [answer_box, ref_boxes]) 90 | 91 | demo.queue() 92 | demo.launch() --------------------------------------------------------------------------------