├── .gitignore
├── LICENSE
├── MODEL_LICENSE
├── README.md
├── README_zh.md
├── arguments.py
├── assets
    ├── cases
    │   ├── 0.png
    │   ├── 1.png
    │   ├── 2.png
    │   ├── 3.png
    │   ├── 4.png
    │   ├── 5.png
    │   ├── 6.png
    │   ├── 7.png
    │   ├── 8.png
    │   └── 9.png
    └── main_process.png
├── cli_demo.py
├── data
    ├── nq_open.jsonl
    ├── trivia_qa.jsonl
    └── web_questions.jsonl
├── download.py
├── evaluate.py
├── evaluate
    ├── __init__.py
    ├── eval.py
    └── triviaqa.py
├── model
    ├── __init__.py
    ├── modeling_webglm.py
    ├── retriever
    │   ├── __init__.py
    │   ├── extracting
    │   │   ├── __init__.py
    │   │   ├── extracting_by_bs4.py
    │   │   └── html2text.py
    │   ├── fetching
    │   │   ├── __init__.py
    │   │   └── playwright_based_crawl_new.py
    │   ├── filtering
    │   │   ├── __init__.py
    │   │   └── contriver.py
    │   └── searching
    │   │   ├── __init__.py
    │   │   ├── bing_search.py
    │   │   ├── searcher.py
    │   │   └── serpapi.py
    ├── stopwords
    │   ├── english
    │   └── explaination
    └── utils.py
├── requirements.txt
├── scripts
    ├── nq_open.sh
    ├── triviaqa.sh
    └── web_questions.sh
├── train_retriever.py
└── web_demo.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | webglm_data
3 | error.html
4 | TODO.md
5 | *.ipynb
6 | download/
7 | %*
8 | retriever_runs


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright Hanyu Lai, Hao Yu, Xiao Liu
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.


--------------------------------------------------------------------------------
/MODEL_LICENSE:
--------------------------------------------------------------------------------
 1 | 1. Definitions
 2 | 
 3 | “Licensor” means the WebGLM Team that distributes its Software.
 4 | 
 5 | “Software” means the WebGLM model parameters and data made available under this license.
 6 | 
 7 | 2. License Grant
 8 | 
 9 | Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.
10 | 
11 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
12 | 
13 | 3. Restriction
14 | 
15 | You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.
16 | 
17 | You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
18 | 
19 | 4. Disclaimer
20 | 
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | 
23 | 5. Limitation of Liability
24 | 
25 | EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
26 | 
27 | 6. Dispute Resolution
28 | 
29 | This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
30 | 
31 | Note that the license is subject to update to a more comprehensive version.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1>WebGLM: Towards An Efficient Web-enhanced Question Answering System with Human Preferences</h1>
  2 | 
  3 | <p align="center">📃 <a href="https://arxiv.org/pdf/2306.07906.pdf" target="_blank">Paper (KDD'23)</a> • 🌐 <a href="https://github.com/THUDM/WebGLM/blob/main/README_zh.md" target="_blank">中文 README</a> • 🤗 HF Repo <a href="https://huggingface.co/THUDM/WebGLM" target="_blank">[WebGLM-10B]</a> <a href="https://huggingface.co/THUDM/WebGLM-2B" target="_blank">[WebGLM-2B]</a> • 📚 Dataset <a href="https://huggingface.co/datasets/THUDM/webglm-qa" target="_blank">[WebGLM-QA]</a></p>
  4 | 
  5 | This is the official implementation of WebGLM. If you find our open-source efforts useful, please 🌟 the repo to encourage our ongoing development!
  6 | 
  7 | <!--https://github.com/THUDM/WebGLM/assets/129033897/d2e1dd35-6340-4175-ac2d-fd585daa17cf-->
  8 | 
  9 | **[Please click to watch the demo!]**
 10 | 
 11 | [![Click to Watch Demo!](https://img.youtube.com/vi/ohjrlYCLLEU/0.jpg)](https://www.youtube.com/watch?v=ohjrlYCLLEU)
 12 | 
 13 | _Read this in [中文](README_zh.md)._
 14 | 
 15 | ## Update
 16 | **[2023/06/25]** Release [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), an updated version of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) which introduces several new features:
 17 | 
 18 | 1. **Stronger Performance**: we have fully upgraded the ChatGLM2-6B. It uses the hybrid objective function of [GLM](https://github.com/THUDM/GLM), and has undergone pre-training with 1.4T bilingual tokens and human preference alignment training. The [evaluation results](README.md#evaluation-results) show that, compared to the first-generation model, ChatGLM2-6B has achieved substantial improvements in performance on datasets like MMLU (+23%), CEval (+33%), GSM8K (+571%), BBH (+60%), showing strong competitiveness among models of the same size.
 19 | 2. **Longer Context**: Based on [FlashAttention](https://github.com/HazyResearch/flash-attention) technique, we have extended the context length of the base model from 2K in ChatGLM-6B to 32K, and trained with a context length of 8K during the dialogue alignment, allowing for more rounds of dialogue. However, the current version of ChatGLM2-6B has limited understanding of single-round ultra-long documents, which we will focus on optimizing in future iterations.
 20 | 3. **More Efficient Inference**: Based on [Multi-Query Attention](http://arxiv.org/abs/1911.02150) technique, ChatGLM2-6B has more efficient inference speed and lower GPU memory usage: under the official  implementation, the inference speed has increased by 42% compared to the first generation; under INT4 quantization, the dialogue length supported by 6G GPU memory has increased from 1K to 8K.
 21 | 
 22 | For more details, please refer to [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B).
 23 | 
 24 | <!-- TOC -->
 25 | 
 26 | -   [Overview](#overview)
 27 |     -   [Features](#features)
 28 | -   [News](#news)
 29 | -   [Preparation](#preparation)
 30 |     -   [Prepare Code and Environments](#prepare-code-and-environments)
 31 |     -   [Prepare SerpAPI Key](#prepare-serpapi-key)
 32 |     -   [Prepare Retriever Checkpoint](#prepare-retriever-checkpoint)
 33 | -   [Try WebGLM](#try-webglm)
 34 |     -   [Export Environment Variables](#export-environment-variables)
 35 |     -   [Run as Command Line Interface](#run-as-command-line-interface)
 36 |     -   [Run as Web Service](#run-as-web-service)
 37 | -   [Train WebGLM](#train-webglm)
 38 |     -   [Train Generator](#train-generator)
 39 |         -   [Prepare Data](#prepare-data-webglm-qa)
 40 |         -   [Training](#training)
 41 |     -   [Train Retriever](#train-retriever)
 42 |         -   [Prepare Data](#prepare-data)
 43 |         -   [Training](#training-1)
 44 | -   [Evaluation](#evaluation)
 45 | -   [Real Application Cases](#real-application-cases)
 46 | -   [Citation](#citation)
 47 | 
 48 | # Overview
 49 | 
 50 | ![paper](./assets/main_process.png)
 51 | 
 52 | WebGLM aspires to provide an efficient and cost-effective web-enhanced question-answering system using the 10-billion-parameter General Language Model (GLM). It aims to improve real-world application deployment by integrating web search and retrieval capabilities into the pre-trained language model.
 53 | 
 54 | ## Features
 55 | 
 56 | -   **LLM-augmented Retriever**: Enhances the retrieval of relevant web content to better aid in answering questions accurately.
 57 | -   **Bootstrapped Generator**: Generates human-like responses to questions, leveraging the power of the GLM to provide refined answers.
 58 | -   **Human Preference-aware Scorer**: Estimates the quality of generated responses by prioritizing human preferences, ensuring the system produces useful and engaging content.
 59 | 
 60 | # News
 61 | 
 62 | -   **[2023-06-24]** We support searching via [Bing](https://www.bing.com/) now!
 63 | -   **[2023-06-14]** We release our code and the [paper](https://arxiv.org/pdf/2306.07906.pdf) of WebGLM!
 64 | 
 65 | # Preparation
 66 | 
 67 | ## Prepare Code and Environments
 68 | 
 69 | Clone this repo, and install python requirements.
 70 | 
 71 | ```bash
 72 | pip install -r requirements.txt
 73 | ```
 74 | 
 75 | Install Nodejs.
 76 | 
 77 | ```bash
 78 | apt install nodejs # If you use Ubuntu
 79 | ```
 80 | 
 81 | Install playwright dependencies.
 82 | 
 83 | ```bash
 84 | playwright install
 85 | ```
 86 | 
 87 | If no browser environment is installed on your host, you need to install one. Don't worry: if that is the case, playwright will print instructions the first time you run it.
 88 | 
 89 | ## Prepare SerpAPI Key
 90 | 
 91 | During the search process, we use SerpAPI to get search results. You need to get a SerpAPI key from [here](https://serpapi.com/).
 92 | 
 93 | Then, set the environment variable `SERPAPI_KEY` to your key.
 94 | 
 95 | ```bash
 96 | export SERPAPI_KEY="YOUR KEY"
 97 | ```
 98 | 
 99 | Alternatively, you can use Bing search with local browser environment (playwright). You can add `--searcher bing` to start command lines to use Bing search. (See [Run as Command Line Interface](#run-as-command-line-interface) and [Run as Web Service](#run-as-web-service))
100 | 
101 | ## Prepare Retriever Checkpoint
102 | 
103 | Download the checkpoint on [ModelScope](https://www.modelscope.cn/models/shawliu9/webglm-contriever) by running the command line below.
104 | 
105 | You can manually specify the path to save the checkpoint by `--save SAVE_PATH`.
106 | 
107 | ```bash
108 | python download.py retriever-pretrained-checkpoint
109 | ```
110 | 
111 | # Try WebGLM
112 | 
113 | Before you run the code, make sure your device has enough free space.
114 | 
115 | ## Export Environment Variables
116 | 
117 | Export the environment variable `WEBGLM_RETRIEVER_CKPT` to the path of the retriever checkpoint. If you have downloaded the retriever checkpoint in the default path, you can simply run the command line below.
118 | 
119 | ```bash
120 | export WEBGLM_RETRIEVER_CKPT=./download/retriever-pretrained-checkpoint
121 | ```
122 | 
123 | ## Run as Command Line Interface
124 | 
125 | You can try WebGLM-2B model by:
126 | 
127 | ```bash
128 | python cli_demo.py -w THUDM/WebGLM-2B
129 | ```
130 | 
131 | Or directly for WebGLM-10B model:
132 | 
133 | ```bash
134 | python cli_demo.py
135 | ```
136 | 
137 | If you want to use Bing search instead of SerpAPI, you can add `--searcher bing` to the command line, for example:
138 | 
139 | ```bash
140 | python cli_demo.py -w THUDM/WebGLM-2B --searcher bing
141 | ```
142 | 
143 | ## Run as Web Service
144 | 
145 | Run `web_demo.py` with the same arguments as `cli_demo.py` to start a web service.
146 | For example, you can try WebGLM-2B model with Bing search by:
147 | 
148 | ```bash
149 | python web_demo.py -w THUDM/WebGLM-2B --searcher bing
150 | ```
151 | 
152 | # Train WebGLM
153 | 
154 | ## Train Generator
155 | 
156 | ### Prepare Data (WebGLM-QA)
157 | 
158 | Download the training data (WebGLM-QA) on [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/d/d290dcfc92e342f9a017/) by running the command line below.
159 | 
160 | ```bash
161 | python download.py generator-training-data
162 | ```
163 | 
164 | It will automatically download all the data and preprocess them into the seq2seq form that can be used immediately in `./download`.
165 | 
166 | ### Training
167 | 
168 | Please refer to [GLM repo](https://github.com/THUDM/GLM#train-with-your-own-data) for seq2seq training.
169 | 
170 | ## Train Retriever
171 | 
172 | ### Prepare Data
173 | 
174 | Download the training data on [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/d/3927b67a834c475288e2/) by running the command line below.
175 | 
176 | ```bash
177 | python download.py retriever-training-data
178 | ```
179 | 
180 | ### Training
181 | 
182 | Run the following command line to train the retriever. If you have downloaded the retriever training data in the default path, you can simply run the command line below.
183 | 
184 | ```bash
185 | python train_retriever.py --train_data_dir ./download/retriever-training-data
186 | ```
187 | 
188 | # Evaluation
189 | 
190 | You can reproduce our results on TriviaQA, WebQuestions and NQ Open. Taking TriviaQA as an example, simply run the command line below:
191 | 
192 | ```bash
193 | bash scripts/triviaqa.sh
194 | ```
195 | 
196 | and start running the experiment.
197 | 
198 | # Real Application Cases
199 | 
200 | [Here](assets/cases) you can see some examples of WebGLM real application scenarios.
201 | 
202 | <details><summary><b>When will the COVID-19 disappear?</b></summary>
203 | 
204 | ![](assets/cases/0.png)
205 | 
206 | </details>
207 | 
208 | <details><summary><b>How to balance career and hobbies?</b></summary>
209 | 
210 | ![](assets/cases/1.png)
211 | 
212 | </details>
213 | 
214 | <details><summary><b>FL Studio and Cubase, which is better?</b></summary>
215 | 
216 | ![](assets/cases/2.png)
217 | 
218 | </details>
219 | 
220 | <details><summary><b>Is attention better than CNN?</b></summary>
221 | 
222 | ![](assets/cases/3.png)
223 | 
224 | </details>
225 | 
226 | <details><summary><b>How to survive in the first-tier cities without a high-salary work?</b></summary>
227 | 
228 | ![](assets/cases/4.png)
229 | 
230 | </details>
231 | 
232 | <details><summary><b>What do you think of version 3.5 of Genshin Impact?</b></summary>
233 | 
234 | ![](assets/cases/5.png)
235 | 
236 | </details>
237 | 
238 | <details><summary><b>transformers are originated in NLP, but why they can be applied in CV?</b></summary>
239 | 
240 | ![](assets/cases/6.png)
241 | 
242 | </details>
243 | 
244 | <details><summary><b>Who proposed Music Transformer? How does it work?</b></summary>
245 | 
246 | ![](assets/cases/7.png)
247 | 
248 | </details>
249 | 
250 | <details><summary><b>What is the backbone of Toolformer?</b></summary>
251 | 
252 | ![](assets/cases/8.png)
253 | 
254 | </details>
255 | 
256 | # License
257 | 
258 | This repository is licensed under the [Apache-2.0 License](LICENSE). The use of model weights is subject to the [Model_License](MODEL_LICENSE). All open-sourced data is for research purposes only.
259 | 
260 | # Citation
261 | 
262 | If you use this code for your research, please cite our paper.
263 | 
264 | ```
265 | @misc{liu2023webglm,
266 |       title={WebGLM: Towards An Efficient Web-Enhanced Question Answering System with Human Preferences},
267 |       author={Xiao Liu and Hanyu Lai and Hao Yu and Yifan Xu and Aohan Zeng and Zhengxiao Du and Peng Zhang and Yuxiao Dong and Jie Tang},
268 |       year={2023},
269 |       eprint={2306.07906},
270 |       archivePrefix={arXiv},
271 |       primaryClass={cs.CL}
272 | }
273 | ```
274 | 
275 | > This repo is simplified for easier deployment.
276 | 


--------------------------------------------------------------------------------
/README_zh.md:
--------------------------------------------------------------------------------
  1 | <h1>WebGLM: 基于人类偏好的高效网络增强问答系统</h1>
  2 | 
  3 | <p align="center">📃 <a href="https://arxiv.org/pdf/2306.07906.pdf" target="_blank">论文 (KDD 2023)</a></p>
  4 | 
  5 | 本项目为 WebGLM 的官方实现。
  6 | 
  7 | https://github.com/THUDM/WebGLM/assets/129033897/d2e1dd35-6340-4175-ac2d-fd585daa17cf
  8 | 
  9 | _Read this in [English](README.md)._
 10 | 
 11 | <!-- TOC -->
 12 | 
 13 | -   [概述](#概述)
 14 |     -   [特点](#特点)
 15 | -   [开发准备](#开发准备)
 16 |     -   [准备代码和环境](#准备代码和环境)
 17 |     -   [准备 SerpAPI 密钥](#准备-serpapi-密钥)
 18 |     -   [下载检索器权重](#下载检索器权重)
 19 | -   [尝试 WebGLM](#尝试-webglm)
 20 |     -   [导出环境变量](#导出环境变量)
 21 |     -   [以命令行界面运行](#以命令行界面运行)
 22 |     -   [以 Web 服务形式运行](#以-web-服务形式运行)
 23 | -   [训练 WebGLM](#训练-webglm)
 24 |     -   [训练生成器](#训练生成器)
 25 |         -   [准备数据](#准备数据)
 26 |         -   [训练](#训练)
 27 |     -   [训练检索器](#训练检索器)
 28 |         -   [准备数据](#准备数据-1)
 29 |         -   [训练](#训练-1)
 30 | -   [评测](#评测)
 31 | -   [真实应用案例](#真实应用案例)
 32 | -   [引用](#引用)
 33 | 
 34 | # 概述
 35 | 
 36 | ![paper](./assets/main_process.png)
 37 | 
 38 | WebGLM 旨在使用 100 亿参数的通用语言模型（GLM）提供一种高效且低成本的网络增强问答系统。它通过将网络搜索和召回功能集成到预训练的语言模型中，以改进实际应用的部署。
 39 | 
 40 | ## 特点
 41 | 
 42 | -   **大模型增强检索器**：增强了相关网络内容的检索能力，以更好地准确回答问题。
 43 | -   **自举生成器**：利用 GLM 的能力为问题生成回复，提供详细的答案。
 44 | -   **基于人类偏好的打分器**：通过优先考虑人类偏好来评估生成回复的质量，确保系统能够产生有用和吸引人的内容。
 45 | 
 46 | # 开发准备
 47 | 
 48 | ## 准备代码和环境
 49 | 
 50 | 克隆此仓库，并安装所需第三方库
 51 | 
 52 | ```bash
 53 | pip install -r requirements.txt
 54 | ```
 55 | 
 56 | 安装 Nodejs。
 57 | 
 58 | ```bash
 59 | apt install nodejs # 如果你使用Ubuntu
 60 | ```
 61 | 
 62 | 安装 playwright 依赖项。
 63 | 
 64 | ```bash
 65 | playwright install
 66 | ```
 67 | 
 68 | 如果你的主机中没有安装浏览器环境，则需要安装。不用担心，如果是这种情况，playwright 会在首次执行时出现说明。
 69 | 
 70 | ## 准备 SerpAPI 密钥
 71 | 
 72 | 在搜索过程中，我们使用 SerpAPI 获取搜索结果。你需要从[这里](https://serpapi.com/)获取 SerpAPI 密钥。
 73 | 
 74 | 然后将环境变量`SERPAPI_KEY`设置为你的密钥。
 75 | 
 76 | 或者，你可以通过 playwright 使用 Bing。你可以在 WebGLM 的启动命令行中添加 `--searcher bing` 以使用 Bing 搜索。
 77 | 
 78 | ```bash
 79 | export SERPAPI_KEY="YOUR KEY"
 80 | ```
 81 | 
 82 | ## 下载检索器权重
 83 | 
 84 | 通过运行以下命令从[清华云](https://cloud.tsinghua.edu.cn/d/54056861b2f34bbfb3f9/)下载检索器的权重。
 85 | 
 86 | 你可以通过 `--save SAVE_PATH` 手动指定检索器权重的保存路径。
 87 | 
 88 | ```bash
 89 | python download.py retriever-pretrained-checkpoint
 90 | ```
 91 | 
 92 | # 尝试 WebGLM
 93 | 
 94 | 在运行代码之前，请确保你的设备空间足够。
 95 | 
 96 | ## 导出环境变量
 97 | 
 98 | 将环境变量`WEBGLM_RETRIEVER_CKPT`设定为检索器权重的路径。如果你已将检索器权重下载到默认路径，可以直接运行以下命令行。
 99 | 
100 | ```bash
101 | export WEBGLM_RETRIEVER_CKPT=./download/retriever-pretrained-checkpoint
102 | ```
103 | 
104 | ## 以命令行界面运行
105 | 
106 | 你可以尝试 WebGLM-2B 模型：
107 | 
108 | ```bash
109 | python cli_demo.py -w THUDM/WebGLM-2B
110 | ```
111 | 
112 | 或直接尝试 WebGLM-10B 模型：
113 | 
114 | ```bash
115 | python cli_demo.py
116 | ```
117 | 
118 | 如果你想使用 Bing 搜索而不是 SerpAPI，可以在命令行中添加 `--searcher bing`，例如：
119 | 
120 | ```bash
121 | python cli_demo.py -w THUDM/WebGLM-2B --searcher bing
122 | ```
123 | 
124 | ## 以 Web 服务形式运行
125 | 
126 | 使用与 `cli_demo.py` 相同的参数运行 `web_demo.py`。例如，你可以通过 Bing 搜索使用 WebGLM-2B 模型：
127 | 
128 | ```bash
129 | python web_demo.py -w THUDM/WebGLM-2B --searcher bing
130 | ```
131 | 
132 | # 训练 WebGLM
133 | 
134 | ## 训练生成器
135 | 
136 | ### 准备数据
137 | 
138 | 运行下面的命令行从[清华云](https://cloud.tsinghua.edu.cn/d/ae204894f2e842f19a3f/)下载训练数据。
139 | 
140 | ```bash
141 | python download.py generator-training-data
142 | ```
143 | 
144 | 它将自动下载所有数据，并将其预处理为可直接使用的 seq2seq 格式，保存在 `./download` 中。
145 | 
146 | ### 训练
147 | 
148 | 请参考[GLM 仓库](https://github.com/THUDM/GLM#train-with-your-own-data)进行 seq2seq 训练。
149 | 
150 | ## 训练检索器
151 | 
152 | ### 准备数据
153 | 
154 | 通过运行以下命令行，从[清华云](https://cloud.tsinghua.edu.cn/d/fa5e6eb1afac4f08a4c6/)下载训练数据。
155 | 
156 | ```bash
157 | python download.py retriever-training-data
158 | ```
159 | 
160 | ### 训练
161 | 
162 | 运行以下命令行来训练检索器。如果你已经在默认路径下载了检索器训练数据，可以直接运行以下命令行。
163 | 
164 | ```bash
165 | python train_retriever.py --train_data_dir ./download/retriever-training-data
166 | ```
167 | 
168 | # 评测
169 | 
170 | 你可以在 TriviaQA、WebQuestions 和 NQ Open 上重现我们的结果。以 TriviaQA 为例，可以运行以下命令行：
171 | 
172 | ```bash
173 | bash scripts/triviaqa.sh
174 | ```
175 | 
176 | 运行后即可开始评测。
177 | 
178 | # 真实应用案例
179 | 
180 | 你可以在[这里](assets/cases)查看一些 WebGLM 实际应用场景的示例。
181 | 
182 | # 引用
183 | 
184 | 如果你在研究中使用了本仓库的代码，请引用我们的论文。
185 | 
186 | ```
187 | @misc{liu2023webglm,
188 |     title={WebGLM: Towards An Efficient Web-Enhanced Question Answering System with Human Preferences},
189 |     author={Xiao Liu and Hanyu Lai and Hao Yu and Yifan Xu and Aohan Zeng and Zhengxiao Du and Peng Zhang and Yuxiao Dong and Jie Tang},
190 |     year={2023},
191 |     eprint={2306.07906},
192 |     archivePrefix={arXiv},
193 |     primaryClass={cs.CL}
194 | }
195 | ```
196 | 
197 | > 该仓库已进行简化以便于部署。
198 | 


--------------------------------------------------------------------------------
/arguments.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | def add_model_config_args(parser):
 4 |     """Model arguments"""
 5 |     parser.add_argument("-w", "--webglm_ckpt_path", type=str, default=None, help="path to the webglm checkpoint, default to $WEBGLM_CKPT or THUDM/WebGLM")
 6 |     
 7 |     parser.add_argument("-r", "--retriever_ckpt_path", type=str, default=None, help="path to the retriever checkpoint, default to $WEBGLM_RETRIEVER_CKPT")
 8 |     
 9 |     parser.add_argument("-d", "--device", type=str, default="cuda", help="device to run the model, default to cuda")
10 |     
11 |     parser.add_argument("-b", "--filter_max_batch_size", type=int, default=50, help="max batch size for the retriever, default to 50")
12 |     
13 |     parser.add_argument("-s", "--serpapi_key", type=str, default=None, help="serpapi key for the searcher, default to $SERPAPI_KEY")
14 |     parser.add_argument("--searcher", type=str, default="serpapi", help="searcher to use (serpapi or bing), default to serpapi")
15 |     
16 |     return parser
17 | 
def add_evaluation_args(parser):
    """Register the evaluation options (task name and data path) on ``parser``.

    Both options are optional strings defaulting to ``None``. The parser is
    modified in place and returned.
    """
    # Both options share the same value type and default.
    optional_string = dict(type=str, default=None)

    parser.add_argument(
        "-t", "--task",
        help="evaluate task, choose from nq_open, web_questions, triviaqa",
        **optional_string
    )
    parser.add_argument(
        "-p", "--evaluate_task_data_path",
        help="data path of the evaluate task",
        **optional_string
    )

    return parser
25 | 
def get_args(args_list=None, parser=None):
    """Build the complete argument parser and parse the arguments.

    Args:
        args_list: Optional sequence of argument strings to parse. When
            ``None`` (the default), argparse reads ``sys.argv[1:]``.
        parser: Optional ``argparse.ArgumentParser`` to extend with the model
            and evaluation options. A new parser is created when omitted.

    Returns:
        The ``argparse.Namespace`` produced by parsing the arguments.
    """
    if parser is None:
        parser = argparse.ArgumentParser(description='webglm')
    else:
        assert isinstance(parser, argparse.ArgumentParser)

    parser = add_model_config_args(parser)
    parser = add_evaluation_args(parser)

    # Pass args_list through so callers can parse an explicit argument list;
    # with None, argparse falls back to the process command line.
    return parser.parse_args(args_list)


--------------------------------------------------------------------------------
/assets/cases/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/0.png


--------------------------------------------------------------------------------
/assets/cases/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/1.png


--------------------------------------------------------------------------------
/assets/cases/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/2.png


--------------------------------------------------------------------------------
/assets/cases/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/3.png


--------------------------------------------------------------------------------
/assets/cases/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/4.png


--------------------------------------------------------------------------------
/assets/cases/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/5.png


--------------------------------------------------------------------------------
/assets/cases/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/6.png


--------------------------------------------------------------------------------
/assets/cases/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/7.png


--------------------------------------------------------------------------------
/assets/cases/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/8.png


--------------------------------------------------------------------------------
/assets/cases/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/cases/9.png


--------------------------------------------------------------------------------
/assets/main_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/assets/main_process.png


--------------------------------------------------------------------------------
/cli_demo.py:
--------------------------------------------------------------------------------
 1 | from model import load_model, citation_correction
 2 | import argparse
 3 | from arguments import add_model_config_args
 4 | 
 5 | if __name__ == '__main__':
 6 |     
 7 |     arg = argparse.ArgumentParser()
 8 |     add_model_config_args(arg)
 9 |     args = arg.parse_args()
10 |     
11 |     webglm = load_model(args)
12 |     
13 |     while True:
14 |         question = input("[Enter to Exit] >>> ")
15 |         question = question.strip()
16 |         if not question:
17 |             break
18 |         if question == "quit":
19 |             break
20 |         final_results = {}
21 |         for results in webglm.stream_query(question):
22 |             final_results.update(results)
23 |             if "references" in results:
24 |                 for ix, ref in enumerate(results["references"]):
25 |                     print("Reference [%d](%s): %s"%(ix + 1, ref['url'], ref['text']))
26 |             if "answer" in results:
27 |                 print("\n%s\n"%citation_correction(results["answer"], [ref['text'] for ref in final_results["references"]]))


--------------------------------------------------------------------------------
/data/nq_open.jsonl:
--------------------------------------------------------------------------------
  1 | {"question": "the era of the great mughals began with the accession of", "answer": ["Akbar the Great", "Babur"]}
  2 | {"question": "ethiopia flight 961 crashes in to the sea", "answer": ["23 November 1996"]}
  3 | {"question": "what position did doug peterson play in the nfl", "answer": ["holder on placekicks", "quarterback"]}
  4 | {"question": "when did houston go to the american league", "answer": ["2013"]}
  5 | {"question": "who has the power to approve or veto legislation constitution", "answer": ["the President"]}
  6 | {"question": "how many seasons are there for lost girl", "answer": ["5", "five"]}
  7 | {"question": "a single period of precession of earth's axis is completed in about", "answer": ["approximately 26,000 years", "26,000 years"]}
  8 | {"question": "when did frank sinatra first sing new york new york", "answer": ["1980", "1979"]}
  9 | {"question": "who made the poppies at tower of london", "answer": ["Paul Cummins", "Tom Piper", "stage designer Tom Piper"]}
 10 | {"question": "what album is help by the beatles on", "answer": ["Help!"]}
 11 | {"question": "when did taylor swift's first album release", "answer": ["October 24, 2006", "2005"]}
 12 | {"question": "architectural elements forming rib vaults eg wells cathedral", "answer": ["an armature of piped masonry", "barrel vaults", "two to three barrel vaults"]}
 13 | {"question": "when did the rational dress society begin to work", "answer": ["1881"]}
 14 | {"question": "when did the study of media effects begin", "answer": ["1919", "1975"]}
 15 | {"question": "what is the order of the netflix marvel shows", "answer": ["Marvel's Iron Fist", "Marvel's Daredevil", "Marvel's The Punisher", "Marvel's Jessica Jones", "Marvel's The Defenders", "Marvel's Luke Cage"]}
 16 | {"question": "when was the south asian association for regional co-operation (saarc) formed", "answer": ["December 1985", "8 December 1985"]}
 17 | {"question": "who sang i ran all the way home", "answer": ["The Impalas"]}
 18 | {"question": "what are the active materials of a lead acid battery", "answer": ["Lead", "sulfuric acid", "Lead and lead dioxide", "lead dioxide"]}
 19 | {"question": "last episode of what happens to my family", "answer": ["53"]}
 20 | {"question": "who is the first indian woman to be canonized as a saint", "answer": ["Saint Alphonsa"]}
 21 | {"question": "who was the french chef given credit for developing the classic kitchen​ brigade", "answer": ["Georges Auguste Escoffier"]}
 22 | {"question": "who wins the next iron chef super chefs", "answer": ["Zakarian", "Geoffrey Zakarian"]}
 23 | {"question": "what is the population of st petersburg fl", "answer": ["260,999", "257,083"]}
 24 | {"question": "yeh hai mohabbatein serial star cast real name", "answer": ["Divyanka Tripathi and Karan Patel"]}
 25 | {"question": "what episode does goku give up against cell", "answer": ["165", "180"]}
 26 | {"question": "where did the battle of corinth take place", "answer": ["in Corinth, Mississippi", "Corinth, Mississippi"]}
 27 | {"question": "what is the angle of the tower of pisa", "answer": ["about 3.99 degrees", "3.99 degrees", "at about 3.99 degrees"]}
 28 | {"question": "who plays the dad in pretty in pink", "answer": ["Harry Dean Stanton"]}
 29 | {"question": "who turns into a bear in the hobbit", "answer": ["Beorn"]}
 30 | {"question": "who won last year's ncaa women's basketball", "answer": ["South Carolina"]}
 31 | {"question": "who has the most followers in the world on instagram", "answer": ["Instagram's own account", "Instagram"]}
 32 | {"question": "who wrote knock knock knocking on heavens door", "answer": ["Bob Dylan"]}
 33 | {"question": "who did bette midler portray in the rose", "answer": ["Mary Rose Foster"]}
 34 | {"question": "when is the last time the vikings were in the nfc championship", "answer": ["1976", "2017/18"]}
 35 | {"question": "what is the name of the hyena in lion king", "answer": ["Banzai", "Shenzi", "Ed"]}
 36 | {"question": "who sang the song good morning good morning", "answer": ["Gene Kelly", "Donald O'Connor", "Judy Garland", "Debbie Reynolds", "Mickey Rooney"]}
 37 | {"question": "who is the lead singer of depeche mode", "answer": ["David Gahan"]}
 38 | {"question": "where does the path train stop in newark", "answer": ["Newark Penn Station"]}
 39 | {"question": "when is the met office leaving the bbc", "answer": ["31 March 2018"]}
 40 | {"question": "when does the miz and maryse show start", "answer": ["2018"]}
 41 | {"question": "how many seasons of vampire diaries r there", "answer": ["eight", "8"]}
 42 | {"question": "who played the twins in darling buds of may", "answer": ["Christina Giles", "Katherine Giles"]}
 43 | {"question": "coldplay song i will try to fix you", "answer": ["\"Fix You\""]}
 44 | {"question": "what is upstream project in oil and gas", "answer": ["drilling exploratory wells"]}
 45 | {"question": "where does no game no life anime end", "answer": ["the Elkia Federation", "the sixth volume"]}
 46 | {"question": "when did ole miss beat alabama in football", "answer": ["October 3, 1970", "September 11, 1976", "October 13, 2001", "October 27, 1894", "October 18, 2003", "November 5, 1910", "October 8, 1988", "October 4, 2014", "September 19, 2015", "October 5, 1968"]}
 47 | {"question": "how many games in a row have the uconn women's basketball team won", "answer": ["111 straight wins", "111", "90"]}
 48 | {"question": "what's the population of prince edward island", "answer": ["142,907 residents", "142,907"]}
 49 | {"question": "where do you get a cashiers check from", "answer": ["a bank", "bank"]}
 50 | {"question": "rizal finished all the chapters of the novel noli me tangere in", "answer": ["December 1886", "Spanish"]}
 51 | {"question": "how much money did it cost to make gta v", "answer": ["137"]}
 52 | {"question": "who does stefan marry in the vampire diaries", "answer": ["Caroline Forbes"]}
 53 | {"question": "who won the award for best goalkeeper in football world cup 2006", "answer": ["Gianluigi Buffon"]}
 54 | {"question": "who has been appointed as the election commissioner of india", "answer": ["Om Prakash Rawat"]}
 55 | {"question": "when is season 3 of grace and frankie being released", "answer": ["March 24, 2017"]}
 56 | {"question": "who plays the robot on the orville show", "answer": ["Mark Jackson"]}
 57 | {"question": "when was the latest version of chrome released", "answer": ["2018-01-22"]}
 58 | {"question": "who plays timon in lion king on broadway", "answer": ["Max Casella"]}
 59 | {"question": "where do the sharks play in san jose", "answer": ["the SAP Center", "SAP Center", "SAP Center at San Jose"]}
 60 | {"question": "who was the famous scientist that ran the research lab moseley went to in manchester", "answer": ["Sir Ernest Rutherford"]}
 61 | {"question": "what grade was arnold from hey arnold in", "answer": ["fourth"]}
 62 | {"question": "who sings every light in the house is on", "answer": ["Trace Adkins"]}
 63 | {"question": "what are the ranks in the us navy", "answer": ["E-8s senior chief petty officer", "E-9s master chief petty officer"]}
 64 | {"question": "who controlled the house and the senate in 2012", "answer": ["Republican", "Democratic"]}
 65 | {"question": "who plays auggie in the movie the wonder", "answer": ["Jacob Tremblay"]}
 66 | {"question": "who is the king and queen of the netherlands", "answer": ["Queen Máxima of the Netherlands", "King Willem-Alexander"]}
 67 | {"question": "how many breeds of pigs are there in the uk", "answer": ["---"]}
 68 | {"question": "who does demetrius love in a midsummer night dream", "answer": ["Helena", "Hermia"]}
 69 | {"question": "where is arachidonic acid found in the body", "answer": ["brain", "muscles", "liver"]}
 70 | {"question": "who wrote he ain't heavy he's my brother lyrics", "answer": ["Bobby Scott", "Bob Russell"]}
 71 | {"question": "when was the last time the jets won a playoff game", "answer": ["2010"]}
 72 | {"question": "who was kat slater's sisters in eastenders", "answer": ["Zoe", "Little Mo", "Lynne"]}
 73 | {"question": "what was tom hanks character name in castaway", "answer": ["Chuck Noland"]}
 74 | {"question": "how many seasons of rules of engagement is there", "answer": ["7", "seven"]}
 75 | {"question": "the outer layer of the skin that contains no blood or nerve supply is the", "answer": ["epidermis"]}
 76 | {"question": "points on a sphere or angles in a circle are measured in units called", "answer": ["radians"]}
 77 | {"question": "when did they stop cigarette advertising on television", "answer": ["January 2, 1971", "1970"]}
 78 | {"question": "when is the publishers clearing house sweepstakes drawing", "answer": ["just after the Super Bowl"]}
 79 | {"question": "where is the capital city of alabama located", "answer": ["Montgomery"]}
 80 | {"question": "epidemiologists attempt to explain the link between health and variables such as", "answer": ["biological agents", "disease conditions in defined populations", "smoking", "stress", "chemicals", "alcohol"]}
 81 | {"question": "what year is it for the jewish calendar", "answer": ["AM 5778", "5778"]}
 82 | {"question": "batman the enemy with episode 5 release date", "answer": ["March 27, 2018"]}
 83 | {"question": "who plays gram on the young and the restless", "answer": ["Max Shippee"]}
 84 | {"question": "when was the term social justice first used", "answer": ["the 1840s", "1840s"]}
 85 | {"question": "how much for a passport in the philippines", "answer": ["$60 abroad", "₱950"]}
 86 | {"question": "who plays joker in batman the dark knight", "answer": ["Ledger"]}
 87 | {"question": "when was the minimum wage established in the united states", "answer": ["1938", "1933", "1912"]}
 88 | {"question": "where was the first session of the assam association held in1905", "answer": ["Guwahati"]}
 89 | {"question": "who plays the saint of killers on preacher", "answer": ["Graham McTavish"]}
 90 | {"question": "when did skiing halfpipe become an olympic event", "answer": ["2014"]}
 91 | {"question": "derek and meredith get back together season 3", "answer": ["Staring at the Sun"]}
 92 | {"question": "who played the mom on what's eating gilbert grape", "answer": ["Darlene Cates"]}
 93 | {"question": "name two fibres which are made of proteins", "answer": ["feathers", "hair", "wool", "fur", "silk"]}
 94 | {"question": "what year does the quiet man take place", "answer": ["the 1920s", "In the 1920s"]}
 95 | {"question": "when did mcdonald's sell 1 million burgers", "answer": ["By 1965"]}
 96 | {"question": "who won battle of the sexes tennis game", "answer": ["Billie Jean King"]}
 97 | {"question": "who won the battle of the first battle of bull run", "answer": ["Confederate victory", "Confederate forces", "Confederate"]}
 98 | {"question": "swan lake the sleeping beauty and the nutcracker are three famous ballets by", "answer": ["Pyotr Ilyich Tchaikovsky"]}
 99 | {"question": "when does the new gotham season come out", "answer": ["September 21, 2017", "September 21, 2017"]}
100 | {"question": "where do they put the tomb vampires in order to burn them during founders day", "answer": ["the Gilbert building"]}
101 | {"question": "what is the meaning of the dragon boat festival", "answer": ["commemorating fealty and filial piety"]}
102 | {"question": "who played ice queen in chronicles of narnia", "answer": ["Tilda Swinton", "Laura Brent"]}
103 | {"question": "who made the song falling in love with you", "answer": ["Hugo Peretti", "George David Weiss", "Elvis Presley", "Luigi Creatore"]}
104 | {"question": "who plays artemisia in 300 rise of an empire", "answer": ["Caitlin Carmichael", "Eva Green", "Jade Chynoweth"]}
105 | {"question": "which is produced in plants of narora kakrapar tarapur", "answer": ["Atomic Power"]}
106 | {"question": "who is the team that beat the eagles this season", "answer": ["Dallas Cowboys", "Seattle Seahawks", "Kansas City Chiefs"]}
107 | {"question": "who played amy grant i i can only imagine", "answer": ["Nicole DuPort"]}
108 | {"question": "when was the first australian prime minister elected", "answer": ["Sir Edmund Barton", "1901"]}
109 | {"question": "what type of political system does el salvador have", "answer": ["a presidential representative democratic republic", "\"flawed democracy\"", "presidential representative democratic republic"]}
110 | {"question": "what is the current population of bora bora", "answer": ["10,605"]}
111 | {"question": "what year did the us hockey team won the olympics", "answer": ["1960", "1960 and 1980", "1980"]}
112 | {"question": "what is the oath that new citizens take", "answer": ["United States Oath of Allegiance"]}
113 | {"question": "who plays the dragon queen from game of thrones", "answer": ["Emilia Clarke"]}
114 | {"question": "element named after fictional planet from which superman came", "answer": ["Kryptonite"]}
115 | {"question": "who is playing the halftime show at super bowl 2016", "answer": ["Beyoncé", "Coldplay", "Bruno Mars"]}
116 | {"question": "who had a baby at 100 in the bible", "answer": ["Sarah", "Abraham"]}
117 | {"question": "who plays matthew on anne with an e", "answer": ["R. H. Thomson"]}
118 | {"question": "when did the united states host the world cup", "answer": ["1994"]}
119 | {"question": "when did mcgee became a regular on ncis", "answer": ["in season two", "season two"]}
120 | {"question": "when do primary ossification centers appear in an embryo", "answer": ["prenatal development"]}
121 | {"question": "what was the real name of saudi arabia", "answer": ["the Saudi Arab kingdom"]}
122 | {"question": "who won college basketball player of the year", "answer": ["A'ja Wilson", "Jalen Brunson"]}
123 | {"question": "when did lionel messi play his first game for barcelona", "answer": ["2002", "2001", "October 2004"]}
124 | {"question": "the group that officially elects the president of the united states is called", "answer": ["the U.S. Electoral College", "U.S. Electoral College"]}
125 | {"question": "where are alkali metals located on the periodic table", "answer": ["in the s-block", "group 1"]}
126 | {"question": "who did the broncos beat in the super bowl", "answer": ["Carolina Panthers", "Atlanta Falcons", "Green Bay Packers"]}
127 | {"question": "what type of government did the ming dynasty have", "answer": ["imperial rule"]}
128 | {"question": "what was the final episode of quantum leap", "answer": ["\"Mirror Image\""]}
129 | {"question": "who won the super heavyweight gold medal at the 2000 olympics", "answer": ["Audley Harrison"]}
130 | {"question": "where is lord's prayer found in bible", "answer": ["in the Gospel of Luke"]}
131 | {"question": "how many languages in harry potter translated into", "answer": ["over 74", "over 74 languages"]}
132 | {"question": "who owns the crown plaza hotel in chicago illinois", "answer": ["InterContinental Hotels Group"]}
133 | {"question": "who became the king of ayodhya after ram", "answer": ["Kusha"]}
134 | {"question": "who is under the mask of darth vader", "answer": ["Anakin Skywalker"]}
135 | {"question": "other than water what else has hydrogen bonds", "answer": ["inorganic molecules such as water"]}
136 | {"question": "who plays dusty in the movie pure country", "answer": ["George Strait"]}
137 | {"question": "what is the pirates of the caribbean in order", "answer": ["On Stranger Tides", "At World's End", "Dead Men Tell No Tales", "Dead Man's Chest"]}
138 | {"question": "who plays the dad in nanny mcphee and the big bang", "answer": ["Ewan McGregor"]}
139 | {"question": "when did the golden state warriors win the finals", "answer": ["1947", "1975", "1956", "2015", "2017"]}
140 | {"question": "what engine is in a holden v8 supercar", "answer": ["V8-engine"]}
141 | {"question": "what's the dog's name on tom and jerry", "answer": ["Spike"]}
142 | {"question": "who is edmund on days of our lives", "answer": ["Adam Caine"]}
143 | {"question": "who is opening for little mix glory days tour", "answer": ["Sheppard", "Zoe Badwi", "Louisa Johnson", "Bronnie", "The Vamps", "Ella Eyre", "Conor Maynard"]}
144 | {"question": "who starred in an officer and a gentleman", "answer": ["Richard Gere", "David Keith", "Louis Gossett Jr.", "Debra Winger", "Phillip J. Salmon"]}
145 | {"question": "who is the actor that plays dr. sean murphy", "answer": ["Freddie Highmore"]}
146 | {"question": "what is the ethnic background of the shib sibs", "answer": ["Japanese"]}
147 | {"question": "what is the name of the first earthquake early warning system", "answer": ["1991", "The Mexican Seismic Alert System"]}
148 | {"question": "most passing yards in nfl history in a game", "answer": ["Norm Van Brocklin", "554"]}
149 | {"question": "who has won the eurovision song contest the most times", "answer": ["Ireland's Johnny Logan", "Ireland"]}
150 | {"question": "when was united nations convention on the rights of the child created", "answer": ["20 November 1989"]}
151 | {"question": "who sings the song it ain't me", "answer": ["Selena Gomez", "American singer Selena Gomez"]}
152 | {"question": "when was the biltmore house opened to the public", "answer": ["March 1930"]}
153 | {"question": "why was hong kong important to the british empire", "answer": ["a centre for international trade"]}
154 | {"question": "what is katie running from in safe haven", "answer": ["her abusive husband"]}
155 | {"question": "how tall is the actor who plays hagrid in harry potter", "answer": ["6ft 1in"]}
156 | {"question": "who proved that cells come from other cells", "answer": ["Matthias Schleiden", "Robert Hooke", "Robert Remak", "Theodor Schwann", "Rudolf Virchow"]}
157 | {"question": "where did the butchers in the slaughterhouse cases live", "answer": ["New Orleans"]}
158 | {"question": "who plays captain phasma in star wars the force awakens", "answer": ["Gwendoline Christie"]}
159 | {"question": "who sang picking up pebbles and throwing them into the sea", "answer": ["Matt Flinders"]}
160 | {"question": "how many episodes of corrie has there been", "answer": ["9,436"]}
161 | {"question": "who is the first president to be impeached", "answer": ["Andrew Johnson", "Johnson"]}
162 | {"question": "what is billy last name in where the red fern grows", "answer": ["Colman", "Billy Colman"]}
163 | {"question": "nuclear power plant that blew up in russia", "answer": ["Chernobyl Nuclear Power Plant", "Chernobyl", "the Chernobyl Nuclear Power Plant"]}
164 | {"question": "where does a roadrunner live in the desert", "answer": ["mountainous shrubland", "arid lowland"]}
165 | {"question": "how many pieces in a terry's chocolate orange", "answer": ["six", "20"]}
166 | {"question": "who did puerto rico belong to before the u.s", "answer": ["Taíno", "indigenous Taíno people", "Spain"]}
167 | {"question": "who made the most free throws in nba history", "answer": ["Karl Malone"]}
168 | {"question": "criminal minds what episode does jj find out she pregnant", "answer": ["in April 2011", "The Crossing"]}
169 | {"question": "who made delhi as capital for the first time", "answer": ["the Pandavas"]}
170 | {"question": "when does the champions league quarter finals start", "answer": ["16 March 2018", "3–4 April 2018"]}
171 | {"question": "who developed the concept of total quality management", "answer": ["W. Edwards Deming"]}
172 | {"question": "who plays at the prudential center in newark", "answer": ["Seton Hall Pirates", "New Jersey Devils"]}
173 | {"question": "when do new episodes of riverdale season 2 come out", "answer": ["February 7, 2018", "April 18, 2018", "October 11, 2017", "TBA", "January 31, 2018"]}
174 | {"question": "who sings the song i'll never forget you", "answer": ["Mariah Carey", "Zara Larsson and MNEK", "Noisettes"]}
175 | {"question": "who did america declare war on in ww1", "answer": ["Austria-Hungary"]}
176 | {"question": "where does hydrogen peroxide come from in the body", "answer": ["nearly all living cells"]}
177 | {"question": "what is the name for the ch3coo- ion", "answer": ["polyatomic anion"]}
178 | {"question": "who wrote old flames cant hold a candle to you", "answer": ["Pebe Sebert", "Pebe Sebert and Hugh Moffatt", "Patricia Rose Sebert", "Hugh Moffatt"]}
179 | {"question": "how many wars held between india and pakistan", "answer": ["four"]}
180 | {"question": "where is gall bladder situated in human body", "answer": ["beneath the liver"]}
181 | {"question": "nba record for most double doubles in a season", "answer": ["Tim Duncan"]}
182 | {"question": "where is simple squamous epithelium found in the body", "answer": ["alveoli", "outer layer of skin", "capillaries", "glomeruli"]}
183 | {"question": "who hit the first home run in the houston astrodome", "answer": ["Mickey Mantle"]}
184 | {"question": "what season does bart bass die in gossip girl", "answer": [")"]}
185 | {"question": "where does tropic of cancer pass in india", "answer": ["Chhattisgarh", "West Bengal", "Gujarat", "State of Tripura", "Jharkhand", "State of Mizoram", "Madhya Pradesh", "Rajasthan"]}
186 | {"question": "who played bat masterson in the tv series", "answer": ["Gene Barry"]}
187 | {"question": "who was originally cast to play indiana jones", "answer": ["Tom Selleck"]}
188 | {"question": "what song is played while raising the american flag", "answer": ["Reveille", "\"Reveille\""]}
189 | {"question": "when did the the regulatory reform (fire safety) order 2005 first come into effect", "answer": ["1 October 2006"]}
190 | {"question": "how many paintings of sunflowers did van gogh paint", "answer": ["two"]}
191 | {"question": "what category was hurricane charley when it hit florida", "answer": ["4", "Category 4", "Category 4"]}
192 | {"question": "who is president of india in present time", "answer": ["Ram Nath Kovind"]}
193 | {"question": "when were the winnie the pooh books written", "answer": ["1924", "1926", "1927", "1928"]}
194 | {"question": "when was the debating club established in almora", "answer": ["1871 A.D.", "1871"]}
195 | {"question": "number 4 in roman numerals on clock faces", "answer": ["IV"]}
196 | {"question": "which country has won maximum number of gold medal in asian game 2014", "answer": ["China"]}
197 | {"question": "when does the last episode of adventure time air", "answer": ["TBA"]}
198 | {"question": "where did the dewey decimal system come from", "answer": ["Melvil Dewey"]}
199 | {"question": "where does the formation of atp take place", "answer": ["plasma membrane in bacteria", "inner mitochondrial membrane", "thylakoid membrane", "mitochondrial membrane in eukaryotes"]}
200 | {"question": "who has won the most college football national champions", "answer": ["Princeton"]}
201 | {"question": "when did amnesia the dark descent come out", "answer": ["8 September 2010"]}
202 | {"question": "where was the first colony in north america located", "answer": ["Virginia"]}
203 | {"question": "who did the minnesota vikings lose to in the super bowl", "answer": ["Pittsburgh Steelers", "Oakland Raiders", "Miami Dolphins", "Kansas City Chiefs"]}
204 | {"question": "when did the movie napoleon dynamite come out", "answer": ["June 11, 2004", "2004"]}
205 | {"question": "what is the hot coffee mod in san andreas", "answer": ["a normally inaccessible mini-game"]}
206 | {"question": "who wrote cant get you out of my head lyrics", "answer": ["Cathy Dennis and Rob Davis", "Rob Davis", "Cathy Dennis"]}
207 | {"question": "where does a brisket come from on a cow", "answer": ["the breast or lower chest", "breast or lower chest"]}
208 | {"question": "where did the last name wallace come from", "answer": ["a Scottish surname"]}
209 | {"question": "who are the cast members of ncis new orleans", "answer": ["Zoe McLellan", "Lucas Black", "Daryl \"Chill\" Mitchell", "Shalita Grant", "Rob Kerkovich", "Vanessa Ferlito", "Daryl Mitchell", "Scott Bakula", "CCH Pounder"]}
210 | {"question": "what type of database is library literature and information science", "answer": ["bibliographic database", "bibliographic"]}
211 | {"question": "who holds the world record for the most world records", "answer": ["Ashrita Furman"]}
212 | {"question": "who played solomon in little house on the prairie", "answer": ["Todd Bridges"]}
213 | {"question": "who did america declare war on in ww1", "answer": ["Austria-Hungary"]}
214 | {"question": "jonny cash one piece at a time car", "answer": ["Cadillac"]}
215 | {"question": "who developed the central processing unit (cpu)", "answer": ["John von Neumann"]}
216 | {"question": "when did marathon change its name to snickers", "answer": ["19 July 1990"]}
217 | {"question": "who sang let me tell you about the birds and the bees", "answer": ["Jewel Akens"]}
218 | {"question": "how old is the actress who plays phyllis on y&r", "answer": ["Gina Tognoni", "age 44"]}
219 | {"question": "when does the miz and maryse show start", "answer": ["2018"]}
220 | {"question": "who is the owner of reading football club", "answer": ["Xiu Li Dai", "Dai Xiuli", "Dai Yongge", "Yongge Dai"]}
221 | {"question": "when was the last episode of vampire diaries aired", "answer": ["March 10, 2017", "March 10, 2017"]}
222 | {"question": "when did the eagles play in the superbowl", "answer": ["February 6, 2005"]}
223 | {"question": "who does eric end up with in that 70s show", "answer": ["Donna"]}
224 | {"question": "who ran the fastest 40 yard dash in the nfl", "answer": ["Jakeem Grant", "John Ross"]}
225 | {"question": "who opens the church of the holy sepulchre", "answer": ["the Sunni Muslim family", "the Nusaybah family"]}
226 | {"question": "when does the miz and maryse show start", "answer": ["2018"]}
227 | {"question": "where does the last name galvez come from", "answer": ["Spanish surname", "Spanish"]}
228 | {"question": "who sang rip it up and start again", "answer": ["Scottish post-punk band Orange Juice", "Orange Juice"]}
229 | {"question": "when was the young and the restless first aired", "answer": ["March 26, 1973"]}
230 | {"question": "who is given credit for the gnu initiative", "answer": ["Richard Stallman"]}
231 | {"question": "what was the initial effect of the transition from command to market economies in eastern europe", "answer": ["Inequality of opportunity"]}
232 | {"question": "which state is located in the centre of india", "answer": ["Chhattisgarh", "Madhya Pradesh"]}
233 | {"question": "when did they replace lead with graphite in pencils", "answer": ["never contained the element lead"]}
234 | {"question": "when was the taming ofthe shrew first performed", "answer": ["prior to June 1592"]}
235 | {"question": "who wrote lyrics for phantom of the opera", "answer": ["Charles Hart", "Charles Hart and Richard Stilgoe", "Richard Stilgoe"]}
236 | {"question": "where is the tibia and fibula bone located", "answer": ["leg"]}
237 | {"question": "how many gold medals did australia win in the 2000 olympics", "answer": ["16"]}
238 | {"question": "who played gino in a place to call home", "answer": ["Aldo Mignone"]}
239 | {"question": "who is the minister of local government in zimbabwe", "answer": ["Hon July Moyo"]}
240 | {"question": "when was harry potter and the philosophers stone published", "answer": ["in 1997", "1997"]}
241 | {"question": "who are the co hosts on the real", "answer": ["Tamar Braxton", "Loni Love", "Adrienne Houghton", "Tamera Mowry-Housley", "Jeannie Mai"]}
242 | {"question": "how many episodes are there in dragon ball z", "answer": ["291 episodes", "291"]}
243 | {"question": "who has the power (judicial) to make decisions in courts of law", "answer": ["judges"]}
244 | {"question": "where was the killing of a sacred deer filmed", "answer": ["Cincinnati"]}
245 | {"question": "when do the new episodes of supernatural start", "answer": ["May 10, 2018"]}
246 | {"question": "what type of economic system was utilized in the soviet union", "answer": ["communism", "state ownership"]}
247 | {"question": "when did the east india company take control of india", "answer": ["in 1757", "1757", "1799", "1612"]}
248 | {"question": "when was the $1 000 bill discontinued", "answer": ["1969", "December 27, 1945", "July 14, 1969"]}
249 | {"question": "what are the three fifty shades of grey books", "answer": ["Fifty Shades of Grey", "Fifty Shades Darker", "Fifty Shades Freed"]}
250 | {"question": "who plays alec ramsay in the black stallion", "answer": ["Kelly Reno"]}
251 | {"question": "when was the last time the ducks won the stanley cup", "answer": ["(2006–07)", "2006–07"]}
252 | {"question": "product-market fit means being in a good market with a product that can satisfy that market", "answer": ["Mark Andreessen"]}
253 | {"question": "who lasted the longest in the royal rumble", "answer": ["Rey Mysterio"]}
254 | {"question": "who won the mens single ice skating 2018", "answer": ["Yuzuru Hanyu", "Javier Fernández", "Shoma Uno"]}
255 | {"question": "who played alex cross in along came a spider", "answer": ["Morgan Freeman"]}
256 | {"question": "the cast of don't tell mom the babysitter's dead", "answer": ["Kimmy Robertson", "Jeff Bollow", "John Getz", "Keith Coogan", "Joanna Cassidy", "Concetta Tomei", "Robert Hy Gorman", "David Duchovny", "Jayne Brook", "Eda Reiss Merin", "Christopher Pettiet", "Dan Castellaneta (voice)", "Josh Charles", "Danielle Harris", "Michael Kopelow", "Christina Applegate"]}
257 | {"question": "i was a great islamic scholar and mathematician who died in 1131 ce", "answer": ["Omar Khayyam"]}
258 | {"question": "how many seasons of the bastard executioner are there", "answer": ["one", "one season"]}
259 | {"question": "where did the butchers in the slaughterhouse cases live", "answer": ["New Orleans"]}
260 | {"question": "is parallax more pronounced with nearby stars or with distant stars", "answer": ["nearby objects", "nearby"]}
261 | {"question": "is a network connection device that can build tables that identify addresses on each network", "answer": ["routing table", "a router"]}
262 | {"question": "who sings don't take your guns to town", "answer": ["U2", "Johnny Cash"]}
263 | {"question": "who was the great wall of china built to defend against", "answer": ["nomads from Inner Asia", "nomads from Inner Asia."]}
264 | {"question": "when did canada get rid of the death penalty", "answer": ["July 14, 1976", "1976", "1998"]}
265 | {"question": "types of skiing in the winter olympics 2018", "answer": ["Slalom", "Downhill", "Super-G", "Giant slalom", "Slalom – (SC)"]}
266 | {"question": "what is final season of game of thrones", "answer": ["The eighth", "eighth", "the eighth season"]}
267 | {"question": "name the four major layers of the earth in order", "answer": ["a liquid outer core", "an outer silicate solid crust", "a highly viscous mantle", "a solid inner core"]}
268 | {"question": "who wrote it's a long long way to pasadena", "answer": ["John Young", "Harry Vanda", "David Hemmings", "George Young"]}
269 | {"question": "what is the maximum data rate for the 802.11a standard select one", "answer": ["54 Mbit/s"]}
270 | {"question": "how many gold medals did australia win in the 2000 olympics", "answer": ["16"]}
271 | {"question": "where does dividends go on cash flow statement", "answer": ["the financing activities section"]}
272 | {"question": "when did the first ice age come out", "answer": ["2002", "March 15, 2002"]}
273 | {"question": "who is the designer in devil wears prada", "answer": ["Valentino Garavani"]}
274 | {"question": "who captained the first european ship to sail around the tip of africa", "answer": ["Bartolomeu Dias"]}
275 | {"question": "what is the baby elephants name in jungle book", "answer": ["Hathi Jr."]}
276 | {"question": "where is the left anterior descending artery located", "answer": ["the left coronary artery"]}
277 | {"question": "when was corporal punishment banned in south africa", "answer": ["1997"]}
278 | {"question": "under article 1 what is the minimum age required to serve in the house of representatives", "answer": ["25", "25 years old"]}
279 | {"question": "when was the last year the eagles went to the superbowl", "answer": ["following the 2017 season", "2017"]}
280 | {"question": "who has won 2017 women's singles korean open series badminton championship", "answer": ["P. V. Sindhu"]}
281 | {"question": "who does bryce dallas howard play in the grinch", "answer": ["Surprised Who"]}
282 | {"question": "which country is the last member of saarc", "answer": ["Afghanistan"]}
283 | {"question": "who played the colorado kid in rio bravo", "answer": ["Ricky Nelson"]}
284 | {"question": "who played tom in four weddings and a funeral", "answer": ["James Fleet"]}
285 | {"question": "where is the citrus bowl held this year", "answer": ["Camping World Stadium"]}
286 | {"question": "what age do you need to be to buy a bb gun", "answer": ["18"]}
287 | {"question": "who played the mad hatter in the batman tv show", "answer": ["Roddy McDowall", "David Wayne", "Benedict Samuel"]}
288 | {"question": "is it marley and me or marley and i", "answer": ["Marley & Me"]}
289 | {"question": "where does route 66 start on the west coast", "answer": ["in Santa Monica"]}
290 | {"question": "the oligodynamic effect is a phenomenon that describes", "answer": ["a biocidal effect of metals"]}
291 | {"question": "who plays heather in beauty and the beast", "answer": ["Nicole Gale Anderson"]}
292 | {"question": "what age do you need to be to buy a bb gun", "answer": ["18"]}
293 | {"question": "who is the girl in green day 21 guns", "answer": ["Lisa Stelly"]}
294 | {"question": "what is the meaning of the harp in ireland", "answer": ["the arms of Ireland"]}
295 | {"question": "who does the head of the fbi report to", "answer": ["the Director of National Intelligence", "the Attorney General"]}
296 | {"question": "what is the first book of percy jackson", "answer": ["The Lightning Thief"]}
297 | {"question": "if a piece of music is perceived to have changed key then we say the piece has", "answer": ["transposed", "transposition"]}
298 | {"question": "when do you celebrate birthday if born on feb 29", "answer": ["February 29", "February 28", "March 1"]}
299 | {"question": "when did the eagles win last super bowl", "answer": ["2017"]}
300 | {"question": "who is tinker air force base named after", "answer": ["Major General Clarence L. Tinker"]}
301 | {"question": "who had created the second bank of the united states", "answer": ["President James Madison", "James Madison"]}
302 | {"question": "when did gaurdians of the galaxy 2 come out", "answer": ["2017", "May 5, 2017"]}
303 | {"question": "what size engine does a 2005 honda civic have", "answer": ["1169 cc"]}
304 | {"question": "kings and queens of england in the 1900s", "answer": ["George V", "George VI", "Edward VIII", "Elizabeth II", "Edward VII"]}
305 | {"question": "when was rosencrantz and guildenstern are dead written", "answer": ["1966"]}
306 | {"question": "which country has the most coastline in the world", "answer": ["Canada"]}
307 | {"question": "who was the king of england in 1756", "answer": ["George II"]}
308 | {"question": "what are the colors of the netherlands flag", "answer": ["blue", "white", "red"]}
309 | {"question": "when was the first nuclear power plant opened", "answer": ["December 20, 1951", "June 27, 1954", "On June 27, 1954", "the USSR", "December 2, 1942"]}
310 | {"question": "what is the function of a political action committee (pac)", "answer": ["pools campaign contributions from members"]}
311 | {"question": "who sang the song i wanna be sedated", "answer": ["the Ramones"]}
312 | {"question": "what proposition made the insurance commissioner an elected position", "answer": ["Proposition 103", "Proposition 103 in 1988"]}
313 | {"question": "who are nominated for president of india 2017", "answer": ["Meira Kumar", "Ram Nath Kovind"]}
314 | {"question": "who wrote and performed i can only imagine", "answer": ["Bart Millard", "Christian rock band MercyMe", "MercyMe"]}
315 | {"question": "location of the ten commandments in the bible", "answer": ["Exodus", "Deuteronomy"]}
316 | {"question": "who sings she's like the wind lyrics", "answer": ["Wendy Fraser", "Patrick Swayze"]}
317 | {"question": "when did the us not go to the olympics", "answer": ["The 1980 Summer Olympics", "1980"]}
318 | {"question": "when does body temperature tend to be lowest", "answer": ["11 p.m. to 3 a.m."]}
319 | {"question": "who will win 2018 election in sri lanka", "answer": ["Sri Lanka Podujana Peramuna"]}
320 | {"question": "where does the white witch live in narnia", "answer": ["her castle"]}
321 | {"question": "what is the oldest street in the philippines", "answer": ["Cebu City", "Colon Street"]}
322 | {"question": "how many countries does cadbury sell its products", "answer": ["more than 50 countries worldwide", "more than 50"]}
323 | {"question": "who was the viceroy when the simon commission visited india", "answer": ["Lord Irwin"]}
324 | {"question": "what languages are spoken in india the most", "answer": ["Bengali", "Telugu", "Marathi", "Hindi", "English"]}
325 | {"question": "who plays hannibal in silence of the lambs", "answer": ["Anthony Hopkins"]}
326 | {"question": "when did the bill of rights come out", "answer": ["1689", "16 December 1689"]}
327 | {"question": "when does season 5 of the blacklist resume", "answer": ["January 31, 2018"]}
328 | {"question": "where did the allies go after north africa", "answer": ["Italy", "the Italian Campaign"]}
329 | {"question": "when did toyota start making cars in the us", "answer": ["by the early 1980s", "1984"]}
330 | {"question": "when did the royal proclamation of 1763 end", "answer": ["the American Revolutionary War", "with the American Revolutionary War", "1783"]}
331 | {"question": "when did the movie varsity blues come out", "answer": ["1999", "January 15, 1999"]}
332 | {"question": "who played shmuel in the boy in the striped pyjamas", "answer": ["Jack Scanlon"]}
333 | {"question": "what is an example of a tricyclic antidepressant", "answer": ["Amineptine"]}
334 | {"question": "where is creatine phosphate found in the body", "answer": ["brain", "pancreas", "skeletal muscle and the brain", "muscle cells", "heart"]}
335 | {"question": "who appoints the chair of the federal reserve system", "answer": ["President of the United States"]}
336 | {"question": "locations for the film an englishman who went up a hill", "answer": ["Llanrhaeadr-ym-Mochnant", "Llansilin in Powys"]}
337 | {"question": "who sings sugar sugar you are my candy girl", "answer": ["the Archies"]}
338 | {"question": "how many seasons of the rugrats are there", "answer": ["9 seasons", "9"]}
339 | {"question": "who played zoe hart on hart of dixie", "answer": ["Rachel Sarah Bilson"]}
340 | {"question": "where is the highest level of fluoride stored in the teeth", "answer": ["surface of the enamel"]}
341 | {"question": "who plays noah newman on the young and the restless", "answer": ["Robert Gillespie Adamson IV"]}
342 | {"question": "when was the canadian pacific railway started and finished", "answer": ["between 1881 and 1885"]}
343 | {"question": "how many super bowl games has the patriots played in", "answer": ["10", "ten", "ten times"]}
344 | {"question": "who has the most catches in nfl history", "answer": ["Jerry Rice"]}
345 | {"question": "when did the sims 4 toddlers come out", "answer": ["January 2017", "January 12, 2017", "the January 2017 patch"]}
346 | {"question": "when was the last time the military drafted", "answer": ["1973", "1972", "December 1972"]}
347 | {"question": "where does new york drinking water come from", "answer": ["the eastern Catskill Mountains"]}
348 | {"question": "when does the day of the dead end", "answer": ["November 2"]}
349 | {"question": "what is the name of the main artery which takes blood from the heart to the body", "answer": ["The aorta", "aorta"]}
350 | {"question": "who sings the theme song for the tv show cops", "answer": ["Inner Circle", "Jamaican reggae band Inner Circle"]}
351 | {"question": "what is the minimum wage in france per hour", "answer": ["11.16", "€9.88 per hour."]}
352 | {"question": "when is the fourth movie of the divergent series coming out", "answer": ["never made"]}
353 | {"question": "when did the word of wisdom become mandatory", "answer": ["February 1834"]}
354 | {"question": "the chinese dragons are protectors of how many seas diggy", "answer": ["Four Seas", "Four"]}
355 | {"question": "hazels boyfriend in the fault in our stars", "answer": ["Augustus Waters"]}
356 | {"question": "who was the ruler of england in 1616", "answer": ["James I"]}
357 | {"question": "when does sam realize he is jim in ghost whisperer", "answer": ["Leap of Faith"]}
358 | {"question": "what are the parts of a domain name called", "answer": ["subdomain", "top-level domain", "hostname"]}
359 | {"question": "where was percy jackson and the olympians filmed", "answer": ["Vancouver", "Mission, British Columbia"]}
360 | {"question": "who plays general hux in the last jedi", "answer": ["Domhnall Gleeson"]}
361 | {"question": "who started ww2 and how did it start", "answer": ["Nazi Germany"]}
362 | {"question": "when did the royal mint move to wales", "answer": ["1968", "the 1960s.", "17 December 1968", "the 1960s"]}
363 | {"question": "what does istj mean in a personality test", "answer": ["Extroverted Thinking (Te)", "Extroverted Intuition (Ne)", "Introverted Sensing (Si)", "Introverted Feeling (Fi)"]}
364 | {"question": "who won the champions league final in 2016", "answer": ["Real Madrid"]}
365 | {"question": "when was the last time new zealand had an earthquake", "answer": ["11 Jul 2017"]}
366 | {"question": "who performed the first c section in 1794", "answer": ["Dr. Jesse Bennett"]}
367 | {"question": "who has won the most games in nfl 2017", "answer": ["Dallas Cowboys"]}
368 | {"question": "who has the most gold medals in the winter olympics of all time", "answer": ["Norway"]}
369 | {"question": "what caused the breakup of the democratic republican party", "answer": ["the disputed 1824 presidential election"]}
370 | {"question": "who voices randy in f is for family", "answer": ["T.J. Miller"]}
371 | {"question": "when does the dlc for rainbow six siege come out", "answer": ["January 2018"]}
372 | {"question": "who are the australia's got talent judges", "answer": ["Kelly Osbourne", "Ian Dickson", "Ian \"Dicko\" Dickson", "Eddie Perfect", "Sophie Monk"]}
373 | {"question": "what does hp mean in war and order", "answer": ["hit points or health points"]}
374 | {"question": "who is the biggest selling female group of all time", "answer": ["Spice Girls"]}
375 | {"question": "an object that moves around an external axis is said to be", "answer": ["revolution or orbital revolution", "orbit"]}
376 | {"question": "when did seattle slew win the triple crown", "answer": ["1977", "in 1977"]}
377 | {"question": "bible verse taking the lord's name in vain", "answer": ["Exodus 20:7"]}
378 | {"question": "who wrote papa got a brand new bag", "answer": ["James Brown"]}
379 | {"question": "when did the united states host the world cup", "answer": ["1994"]}
380 | {"question": "abbreviated name of the highest peak in tasmania", "answer": ["Mount Ossa"]}
381 | {"question": "where was the salvation army's christmas collection kettle first introduced", "answer": ["San Francisco", "in San Francisco"]}
382 | {"question": "when was the first election held in india", "answer": ["1951–52"]}
383 | {"question": "when did the sat become out of 1600", "answer": ["March 2016", "2014", "2016"]}
384 | {"question": "where is fe best absorbed in the body", "answer": ["in the duodenum", "the duodenum"]}
385 | {"question": "who sings too much time on my hands lyrics", "answer": ["Tommy Shaw"]}
386 | {"question": "who does the voice of nala in the lion king", "answer": ["Niketa Calame", "Sally Dworsky", "Moira Kelly", "Laura Williams"]}
387 | {"question": "when did seat belts become law in ontario", "answer": ["January 1, 1976"]}
388 | {"question": "all the motor neurons that control the skeletal muscles are", "answer": ["efferent nerves", "Somatic motor neurons", "Somatic"]}
389 | {"question": "who is the first wife on sister wives", "answer": ["Meri"]}
390 | {"question": "who played the elephant man in the film", "answer": ["John Hurt"]}
391 | {"question": "when was coffee first made into a drink", "answer": ["15th century", "the 15th century"]}
392 | {"question": "when was the last time oklahoma won a national championship in football", "answer": ["2003", "2000"]}
393 | {"question": "who is jared on the bold and the beautiful", "answer": ["Andrew Collins"]}
394 | {"question": "who plays chummy's mother in call the midwife", "answer": ["Cheryl Campbell"]}
395 | {"question": "when's the last time army won the army navy game", "answer": ["2017", "Army"]}
396 | {"question": "when did the nba create the 3 point line", "answer": ["the 1979–80 season", "1979–80 season"]}
397 | {"question": "what is the share of agriculture in indian economy", "answer": ["17.32%", "23%"]}
398 | {"question": "chief ministers of tamil nadu mentioned on wikipedia", "answer": ["Ramakrishna Ranga Rao", "Tanguturi Prakasam", "P. Subbarayan", "M. G. Ramachandran", "Janaki Ramachandran", "P. T. Rajan", "J. Jayalalithaa", "M. Karunanidhi", "A. Subbarayalu Reddiar", "C. N. Annadurai", "P. S. Kumaraswamy Raja", "K. Palaniswami", "O. Panneerselvam", "V.R. Nedunchezhiyan", "Raja of Panagal", "O. P. Ramaswamy Reddiyar", "Kurma Venkata Reddy Naidu", "B. Munuswamy Naidu", "C. Rajagopalachari"]}
399 | {"question": "who plays unis in she's the man", "answer": ["Emily Perkins"]}
400 | {"question": "when did the united states start using the death penalty", "answer": ["the beginning", "1608"]}
401 | 


--------------------------------------------------------------------------------
/data/web_questions.jsonl:
--------------------------------------------------------------------------------
  1 | {"question": "who did benjamin franklin get married to?", "answer": ["Deborah Read"]}
  2 | {"question": "what is the currency in egypt 2012?", "answer": ["Egyptian pound"]}
  3 | {"question": "what language turkey people speak?", "answer": ["Turkish Language"]}
  4 | {"question": "what does jamaican people speak?", "answer": ["Jamaican Creole English Language", "Jamaican English"]}
  5 | {"question": "what language do people from thailand speak?", "answer": ["Mon Language", "Lao Language", "Khmer language", "Hmong language", "Thai Language", "Cham language", "Mlabri language", "Malay, Pattani Language", "Nyaw Language", "Saek language"]}
  6 | {"question": "what else did ben franklin invent?", "answer": ["Lightning rod", "Franklin stove", "Bifocals", "Glass harmonica"]}
  7 | {"question": "which country does greenland belong to?", "answer": ["Denmark"]}
  8 | {"question": "who did michael j fox marry?", "answer": ["Tracy Pollan"]}
  9 | {"question": "where did eleanor roosevelt die?", "answer": ["New York City"]}
 10 | {"question": "what airport is near arlington tx?", "answer": ["Arlington Municipal Airport"]}
 11 | {"question": "which country was justin bieber born in?", "answer": ["Canada"]}
 12 | {"question": "who is the minority leader of the house of representatives now?", "answer": ["Nancy Pelosi"]}
 13 | {"question": "where did clay matthews go to school?", "answer": ["Agoura High School", "University of Southern California"]}
 14 | {"question": "what to do today in atlanta with kids?", "answer": ["Atlanta History Center", "Atlanta Cyclorama & Civil War Museum", "Atlanta Ballet", "Fernbank Museum of Natural History", "Woodruff Arts Center", "Zoo Atlanta", "Atlanta Symphony Orchestra", "Centennial Olympic Park", "Martin Luther King, Jr., National Historic Site", "Fernbank Science Center"]}
 15 | {"question": "what team did ronaldo play for in 2003?", "answer": ["Real Madrid C.F."]}
 16 | {"question": "what is the currency of puerto rico called?", "answer": ["United States dollar"]}
 17 | {"question": "what tv shows did shawnee smith play in?", "answer": ["Anger Management", "The Tom Show", "Scream Queens", "Brand New Life", "30 Days of Night: Dust to Dust", "Arsenio", "Becker", "The Stand", "All is Forgiven"]}
 18 | {"question": "what language does cuba speak?", "answer": ["Spanish Language"]}
 19 | {"question": "where is mount st helens volcano?", "answer": ["Skamania County"]}
 20 | {"question": "what school did karl benz go to?", "answer": ["Karlsruhe Institute of Technology", "University of Karlsruhe"]}
 21 | {"question": "where are samsung based?", "answer": ["Seoul"]}
 22 | {"question": "where did joe flacco attend college?", "answer": ["University of Delaware"]}
 23 | {"question": "who did jackie robinson first play for?", "answer": ["Montreal Royals", "Kansas City Monarchs", "Los Angeles Bulldogs", "Brooklyn Dodgers", "UCLA Bruins football"]}
 24 | {"question": "what are the religions practiced in indonesia?", "answer": ["Protestantism", "Hinduism", "Catholicism", "Islam"]}
 25 | {"question": "what country did buddha come from?", "answer": ["India"]}
 26 | {"question": "where george lopez was born?", "answer": ["Mission Hills"]}
 27 | {"question": "where are yamaha outboard motors manufactured?", "answer": ["Shizuoka Prefecture"]}
 28 | {"question": "where is the carpathian mountain range located?", "answer": ["Ukraine", "Europe", "Romania", "Czech Republic", "Poland", "Serbia", "Slovakia", "Hungary"]}
 29 | {"question": "who is emma stone father?", "answer": ["Jeff Stone"]}
 30 | {"question": "what is the oregon ducks 2012 football schedule?", "answer": ["University of Oregon"]}
 31 | {"question": "which airport to fly into rome?", "answer": ["Ciampino – G.B. Pastine International Airport", "Roma Termini railway station", "Civitavecchia Ferry Terminal", "Leonardo da Vinci–Fiumicino Airport"]}
 32 | {"question": "who plays juni cortez?", "answer": ["Daryl Sabara"]}
 33 | {"question": "what school did ben roethlisberger go to?", "answer": ["Miami University"]}
 34 | {"question": "what type of government does germany have now?", "answer": ["Constitutional republic", "Multi-party system", "Federal republic", "Democracy", "Parliamentary republic"]}
 35 | {"question": "what language do british speak?", "answer": ["Scottish Gaelic language", "Scots Language", "Cornish Language", "Irish", "English Language", "Welsh Language", "Guernésiais", "Ulster Scots dialects", "Jèrriais", "Manx Language"]}
 36 | {"question": "what super bowl did peyton manning win?", "answer": ["2006 NFL season"]}
 37 | {"question": "who did tim tebow play college football for?", "answer": ["University of Florida"]}
 38 | {"question": "where does the zambezi river originate?", "answer": ["Tanzania"]}
 39 | {"question": "when did annie open?", "answer": ["Annie (1977 original Broadway cast)"]}
 40 | {"question": "who did tim tebow play college football for?", "answer": ["University of Florida"]}
 41 | {"question": "who inspired obama?", "answer": ["Saul Alinsky", "Nipsey Russell"]}
 42 | {"question": "who is gimli's father in the hobbit?", "answer": ["Gloin"]}
 43 | {"question": "who was the leader of the us during wwii?", "answer": ["Gerald Ford"]}
 44 | {"question": "what shows are shot in new york?", "answer": ["Flight of the Conchords", "The Stand"]}
 45 | {"question": "who plays kenneth?", "answer": ["Jack McBrayer"]}
 46 | {"question": "where did dolly parton grow up?", "answer": ["Tennessee"]}
 47 | {"question": "what type of government does iraq have now?", "answer": ["Parliamentary system", "Federation", "Republic", "Federal republic", "Parliamentary republic"]}
 48 | {"question": "who has played lex luthor?", "answer": ["Anthony LaPaglia", "Kevin Spacey", "Clancy Brown", "James Marsters", "Gene Hackman"]}
 49 | {"question": "who did scarlett johansson date?", "answer": ["Justin Timberlake", "Josh Hartnett", "Benicio del Toro", "Jared Leto", "Derek Jeter"]}
 50 | {"question": "who did cam newton sign with?", "answer": ["Carolina Panthers"]}
 51 | {"question": "who did mozart write his four horn concertos for?", "answer": ["wolfgang amadeus mozart used story by pierre beaumarchais"]}
 52 | {"question": "where did mitt romney's parents come from?", "answer": ["Bloomfield Hills"]}
 53 | {"question": "what county is brentwood tennessee in?", "answer": ["Williamson County"]}
 54 | {"question": "what timezone is utah in?", "answer": ["Mountain Time Zone"]}
 55 | {"question": "who developed the tcp ip reference model?", "answer": ["Robert E. Kahn", "Vint Cerf"]}
 56 | {"question": "where did jovan belcher kill himself?", "answer": ["Kansas City"]}
 57 | {"question": "what is there to do for fun in kansas city?", "answer": ["Kemper Arena", "Starlight Theatre", "Kauffman Stadium", "Municipal Stadium", "Ward Parkway Center", "Arrowhead Stadium", "Blue Ridge Mall", "Blue Ridge Crossing", "Crown Center", "TWA Corporate Headquarters' Building"]}
 58 | {"question": "what team does jordan own?", "answer": ["Jordan national football team"]}
 59 | {"question": "what the zip code for seattle washington?", "answer": ["98109", "98108", "98105", "98104", "98107", "98106", "98101", "98103", "98102", "98117"]}
 60 | {"question": "what time in hilo hawaii?", "answer": ["Hawaii–Aleutian Time Zone"]}
 61 | {"question": "where is perpignan located?", "answer": ["France"]}
 62 | {"question": "who is the state governor of tennessee?", "answer": ["Bill Haslam"]}
 63 | {"question": "what things did martin luther king do?", "answer": ["Civil rights movement", "Civil disobedience", "Nonviolence"]}
 64 | {"question": "what is the australian dollar called?", "answer": ["Australian dollar"]}
 65 | {"question": "what movies did ron howard director?", "answer": ["How the Grinch Stole Christmas!"]}
 66 | {"question": "what are the three official languages of belgium?", "answer": ["French Language", "German Language", "Dutch Language"]}
 67 | {"question": "what are the major languages spoken in greece?", "answer": ["Albanian language", "Greek Language"]}
 68 | {"question": "what type of cancer did eva peron have?", "answer": ["Cervical cancer"]}
 69 | {"question": "what currency does russia use 2012?", "answer": ["Russian ruble"]}
 70 | {"question": "what did the scientist chadwick discovered?", "answer": ["Neutron"]}
 71 | {"question": "who plays london tipton in suite life on deck?", "answer": ["Brenda Song"]}
 72 | {"question": "what are the school colors for harvard university?", "answer": ["Crimson"]}
 73 | {"question": "who does lee clark manager?", "answer": ["Birmingham City F.C."]}
 74 | {"question": "where did george w bush live as a child?", "answer": ["New Haven"]}
 75 | {"question": "where was rihanna born and raised?", "answer": ["Saint Michael Parish", "Barbados"]}
 76 | {"question": "who was vp for lincoln?", "answer": ["Andrew Johnson", "Hannibal Hamlin"]}
 77 | {"question": "who was the italian leader in ww1?", "answer": ["Benito Mussolini"]}
 78 | {"question": "what year was george w bush elected?", "answer": ["George W. Bush presidential campaign, 2000"]}
 79 | {"question": "what are abraham sons names?", "answer": ["Zimran", "Ishbak", "Midian", "Shuah", "Ishmael", "Jokshan", "Isaac", "Medan"]}
 80 | {"question": "what did queen victoria say about the suffragettes?", "answer": ["I am every day more convinced that we women, if we are to be good women, feminine and amiable and domestic, are not fitted to reign; at least it is they that drive themselves to the work which it entails."]}
 81 | {"question": "what airport do you fly into to get to destin fl?", "answer": ["Northwest Florida Regional Airport", "Destin–Fort Walton Beach Airport"]}
 82 | {"question": "what is the currency used in italy?", "answer": ["Euro"]}
 83 | {"question": "where is the ufc headquarters?", "answer": ["Las Vegas"]}
 84 | {"question": "what is the song anna kendrick sings in pitch perfect?", "answer": ["Cups"]}
 85 | {"question": "what places in japan were bombed?", "answer": ["Hiroshima Prefecture"]}
 86 | {"question": "what language do chinese people write in?", "answer": ["Traditional Chinese characters", "Chinese", "Simplified Chinese character", "'Phags-pa script", "Nüshu script", "Chinese characters"]}
 87 | {"question": "what state is washington d.c. located?", "answer": ["Washington", "Washington, D.C."]}
 88 | {"question": "what artistic movement did henri matisse belong to?", "answer": ["Fauvism", "Impressionism", "Neo-impressionism", "Modernism"]}
 89 | {"question": "who does peyton manning play football for?", "answer": ["Denver Broncos"]}
 90 | {"question": "what region of the world is egypt associated with?", "answer": ["Middle East"]}
 91 | {"question": "where was the city of david?", "answer": ["Bethlehem"]}
 92 | {"question": "what are the sights to see in madrid?", "answer": ["Paseo del Prado", "Thyssen-Bornemisza Museum", "Almudena Cathedral", "Plaza de Cibeles", "Puerta del Sol", "Royal Palace of Madrid", "Museo de Lazaro Galdiano", "Gran Vía", "Museo Nacional Centro de Arte Reina Sofía", "Plaza Mayor, Madrid"]}
 93 | {"question": "who is khloe kardashian's husband?", "answer": ["Lamar Odom"]}
 94 | {"question": "who played on the jeffersons?", "answer": ["Isabel Sanford", "Marla Gibbs", "Sherman Hemsley"]}
 95 | {"question": "what kind of money should i take to costa rica?", "answer": ["Costa Rican colón"]}
 96 | {"question": "what state does romney live in?", "answer": ["Massachusetts"]}
 97 | {"question": "what time zone am i in california?", "answer": ["Pacific Time Zone", "UTC-8"]}
 98 | {"question": "where does archbishop desmond tutu live?", "answer": ["South Africa"]}
 99 | {"question": "what is new york city airport?", "answer": ["Flushing Airport", "Mitchel Air Force Base", "Downtown Manhattan Heliport", "LaGuardia Airport", "John F. Kennedy International Airport", "New York Skyports Inc. Seaplane Base", "East 34th Street Heliport"]}
100 | {"question": "what did anton van leeuwenhoek contribute to our knowledge of cells?", "answer": ["microscope first used by anton van leeuwenhoek"]}
101 | {"question": "what kind of monarchy does japan have?", "answer": ["Constitutional monarchy"]}
102 | {"question": "who rules denmark right now?", "answer": ["Helle Thorning-Schmidt"]}
103 | {"question": "who plays bilbo baggins in the hobbit?", "answer": ["Norman Bird", "Martin Freeman", "Ian Holm"]}
104 | {"question": "what did fred durst do?", "answer": ["Musician"]}
105 | {"question": "who will play mr gray in the film?", "answer": ["Karen Mulder"]}
106 | {"question": "where to exchange euros in new york city?", "answer": ["John F. Kennedy International Airport"]}
107 | {"question": "what kind of cancer did carl wilson have?", "answer": ["Lung cancer"]}
108 | {"question": "what to see near grand canyon?", "answer": ["Grand Canyon National Park Superintendent's Residence", "Grand Canyon South Rim Ranger's Dormitory", "Grand Canyon Village Historic District", "Grand Canyon North Rim Headquarters", "Grandview Mine", "Grand Canyon Water Reclamation Plant", "Buckey O'Neill Cabin", "El Tovar Hotel", "Grand Canyon Depot"]}
109 | {"question": "what type of government does usa follow?", "answer": ["Federal republic"]}
110 | {"question": "where is the chernobyl nuclear power plant?", "answer": ["Prypiat", "Ukrainian SSR", "Chernobyl Nuclear Power Plant", "Chernobyl"]}
111 | {"question": "what did the ancient romans speak?", "answer": ["Latin Language"]}
112 | {"question": "where was the temple of karnak built?", "answer": ["Egypt", "Luxor Governorate"]}
113 | {"question": "who is sir francis bacon?", "answer": ["Philosopher"]}
114 | {"question": "what to do in richardson dallas?", "answer": ["Wizard's Sports Cafe"]}
115 | {"question": "where did bristol palin go to school?", "answer": ["Wasilla High School", "West Anchorage High School", "Juneau-Douglas High School"]}
116 | {"question": "what was the title of the book charles darwin wrote?", "answer": ["The Structure and Distribution of Coral Reefs", "On evolution", "A student's introduction to Charles Darwin", "Climbing Plants", "The Expression of the Emotions in Man and Animals", "The origin of species : complete and fully illustrated", "The Origin of Species", "The Life of Erasmus Darwin", "The Autobiography of Charles Darwin", "The Descent of Man, and Selection in Relation to Sex"]}
117 | {"question": "where did francisco coronado come from?", "answer": ["Salamanca"]}
118 | {"question": "who is eli whitney and what did he invent?", "answer": ["Cotton gin"]}
119 | {"question": "what type of books did agatha christie wrote?", "answer": ["Crime writer"]}
120 | {"question": "what type of government does the us follow?", "answer": ["Presidential system", "Federal republic", "Representative democracy", "Two-party system", "Constitutional republic", "Republic"]}
121 | {"question": "what is my timezone in louisiana?", "answer": ["Central Time Zone", "UTC−06:00"]}
122 | {"question": "what type of government does australia have?", "answer": ["Parliamentary system", "Federation", "Constitutional monarchy"]}
123 | {"question": "where was country singer george jones born?", "answer": ["Saratoga"]}
124 | {"question": "who is the next governor of indiana?", "answer": ["Mitch Daniels"]}
125 | {"question": "who is willow smith mom name?", "answer": ["Jada Pinkett Smith"]}
126 | {"question": "what part did winona ryder play in star trek?", "answer": ["Amanda Grayson"]}
127 | {"question": "where is tyrese gibson from?", "answer": ["Watts"]}
128 | {"question": "what did stephen hawking study?", "answer": ["Physics"]}
129 | {"question": "where does bradley walsh live?", "answer": ["England"]}
130 | {"question": "what county is frederick md in?", "answer": ["Frederick County"]}
131 | {"question": "where did drew brees go to college wikianswers?", "answer": ["Purdue University"]}
132 | {"question": "where was benjamin franklin educated?", "answer": ["Boston Latin School"]}
133 | {"question": "where english is spoken?", "answer": ["Canada", "Australia", "South Africa", "Zambia", "United Kingdom", "Zimbabwe", "Uganda", "New Zealand", "Turks and Caicos Islands", "Tanzania"]}
134 | {"question": "who did carlos boozer play for?", "answer": ["Utah Jazz", "Cleveland Cavaliers"]}
135 | {"question": "what did president carter do in office?", "answer": ["Social development", "Human rights", "Economic development"]}
136 | {"question": "who founded the pittsburgh steelers in 1933?", "answer": ["Rooney family"]}
137 | {"question": "what did randy savage died of?", "answer": ["Myocardial infarction", "Traffic collision"]}
138 | {"question": "what type of government system does italy have?", "answer": ["Constitutional republic", "Parliamentary republic", "Unitary state"]}
139 | {"question": "what time zone is anaheim california?", "answer": ["Pacific Time Zone"]}
140 | {"question": "what two continents is turkey on?", "answer": ["Europe", "Eurasia", "Asia"]}
141 | {"question": "what year did the orioles go to the world series?", "answer": ["1983 World Series", "1966 World Series", "1970 World Series"]}
142 | {"question": "what is the nigeria time?", "answer": ["West Africa Time", "UTC+01:00"]}
143 | {"question": "what type of currency do they use in england?", "answer": ["UK £"]}
144 | {"question": "what type of government does france use?", "answer": ["Semi-presidential system", "Constitutional republic", "Unitary state"]}
145 | {"question": "what is serbian language called?", "answer": ["Serbian language"]}
146 | {"question": "who is the senior senator of louisiana?", "answer": ["Mary Landrieu"]}
147 | {"question": "who plays donna noble?", "answer": ["Catherine Tate"]}
148 | {"question": "who did vasco de gama explore for?", "answer": ["Portugal"]}
149 | {"question": "who wrote the jana gana mana?", "answer": ["Ram Singh Thakur", "Rabindranath Tagore"]}
150 | {"question": "what county is kansas city kansas?", "answer": ["Wyandotte County"]}
151 | {"question": "who was richard nixon married to?", "answer": ["Pat Nixon"]}
152 | {"question": "what countries are part of the uk?", "answer": ["Scotland", "England", "Wales", "Northern Ireland"]}
153 | {"question": "what killed john bonham?", "answer": ["Inhalation of vomit"]}
154 | {"question": "what instruments did louis armstrong play?", "answer": ["trumpet", "Cornet"]}
155 | {"question": "where are the gobi desert located on a map?", "answer": ["Mongolia"]}
156 | {"question": "what country did buddha come from?", "answer": ["India"]}
157 | {"question": "what are the names of the city states in ancient greece?", "answer": ["Athens"]}
158 | {"question": "what type of cancer did gilda radner die of?", "answer": ["Ovarian cancer"]}
159 | {"question": "what do people in australia speak?", "answer": ["Lojban", "Esperanto Language", "English Language"]}
160 | {"question": "what were amelia earhart's achievements?", "answer": ["Writer", "Pilot"]}
161 | {"question": "what kind of guitar did george harrison use?", "answer": ["Fender Stratocaster", "Rickenbacker 360/12"]}
162 | {"question": "what position did vince lombardi play in college?", "answer": ["Right Guard"]}
163 | {"question": "where was martin luther king jr raised?", "answer": ["Atlanta"]}
164 | {"question": "what countries share a land border with indonesia?", "answer": ["Australia", "East Malaysia"]}
165 | {"question": "who is hammurabi and what did he do?", "answer": ["Monarch"]}
166 | {"question": "what movies has john williams score?", "answer": ["A.I. Artificial Intelligence", "Always", "Catch Me If You Can", "1941", "Daddy-O", "Amistad", "Close Encounters of the Third Kind", "E.T. the Extra-Terrestrial", "Empire of the Sun", "Earthquake"]}
167 | {"question": "what was robert burns famous for?", "answer": ["Poet"]}
168 | {"question": "what national team does cristiano ronaldo play for?", "answer": ["Portugal national football team"]}
169 | {"question": "who did armie hammer play in the social network?", "answer": ["Jesse Eisenberg"]}
170 | {"question": "what state does romney live in?", "answer": ["Massachusetts"]}
171 | {"question": "what countries have spanish as the national language?", "answer": ["Spain"]}
172 | {"question": "who did gerald ford select as his vice president when he became president?", "answer": ["Nelson Rockefeller"]}
173 | {"question": "who fought in the gulf war 1991?", "answer": ["Saudi Arabia", "Australia", "United States of America", "France", "United Kingdom", "Argentina", "Iraq"]}
174 | {"question": "who plays captain kirk in star trek?", "answer": ["William Shatner"]}
175 | {"question": "what type of artist is henri matisse?", "answer": ["Sculpture", "Printmaking", "Collage", "Painting", "Drawing"]}
176 | {"question": "what are republicans views on health care?", "answer": ["20003"]}
177 | {"question": "what was the first book charles dickens wrote?", "answer": ["Oliver Twist"]}
178 | {"question": "when did the wright brothers created their first plane?", "answer": ["1900 Wright Glider"]}
179 | {"question": "when was the last time the toronto maple leafs were in the stanley cup finals?", "answer": ["1967 Stanley Cup Finals"]}
180 | {"question": "where was elvis costello born?", "answer": ["Paddington"]}
181 | {"question": "what was thomas jefferson role in the declaration of independence?", "answer": ["Writer"]}
182 | {"question": "when did conflict start in ireland?", "answer": ["Viking invasion of Ireland"]}
183 | {"question": "what do they call money in japan?", "answer": ["Japanese yen"]}
184 | {"question": "where is jamarcus russell from?", "answer": ["Mobile"]}
185 | {"question": "where is the seychelles on world map?", "answer": ["Africa"]}
186 | {"question": "what makes elvis presley famous?", "answer": ["Singer"]}
187 | {"question": "what language does cuba speak?", "answer": ["Spanish Language"]}
188 | {"question": "what type of economy exists in china?", "answer": ["Socialist state"]}
189 | {"question": "what town was martin luther king assassinated in?", "answer": ["Memphis"]}
190 | {"question": "what books did agatha christie wrote?", "answer": ["And Then There Were None", "Le Grand alibi", "Ten Little Indians", "Appointment with Death", "Desyat Negrityat", "The Man in the Brown Suit", "Witness for the Prosecution"]}
191 | {"question": "what time does american horror story air?", "answer": ["Tom Selleck"]}
192 | {"question": "what team is hank baskett on 2010?", "answer": ["Philadelphia Eagles"]}
193 | {"question": "where was george washington carver from?", "answer": ["Diamond"]}
194 | {"question": "what county is st paul va in?", "answer": ["United States of America", "Wise County", "Russell County", "Virginia"]}
195 | {"question": "where did rudolf virchow conduct his research?", "answer": ["Humboldt University of Berlin", "University of Würzburg"]}
196 | {"question": "when does jewish new year start?", "answer": ["Yiddish Language"]}
197 | {"question": "what disease does robin roberts have?", "answer": ["Breast cancer"]}
198 | {"question": "who played obi wan in episode 2?", "answer": ["Ewan McGregor"]}
199 | {"question": "what type of government does the us follow?", "answer": ["Presidential system", "Federal republic", "Representative democracy", "Two-party system", "Constitutional republic", "Republic"]}
200 | {"question": "what does joey jordison play in slipknot?", "answer": ["Drums"]}
201 | {"question": "what is the capital of modern egypt?", "answer": ["Cairo"]}
202 | {"question": "who fought the battle of gettysburg?", "answer": ["Confederate States of America", "United States of America"]}
203 | {"question": "what art movement did leonardo da vinci belong to?", "answer": ["High Renaissance"]}
204 | {"question": "where obama went to school?", "answer": ["Occidental College", "Harvard Law School", "Noelani Elementary School", "Punahou School", "State Elementary School Menteng 01", "St. Francis of Assisi Catholic School", "Columbia University"]}
205 | {"question": "where did the iroquois indians come from?", "answer": ["Québec"]}
206 | {"question": "where is mission san buenaventura located?", "answer": ["Ventura County"]}
207 | {"question": "who played jacob black in twilight?", "answer": ["Taylor Lautner"]}
208 | {"question": "what kind of government is sweden?", "answer": ["Representative democracy", "Unitary state", "Parliamentary system", "Constitutional monarchy", "Hereditary monarchy", "Multi-party system"]}
209 | {"question": "who owns the portland press herald?", "answer": ["Blethen Maine Newspapers, Inc."]}
210 | {"question": "who plays ken barlow in coronation street?", "answer": ["Tony Warren"]}
211 | {"question": "who did the voice of darth vader in episode 3?", "answer": ["Hayden Christensen"]}
212 | {"question": "who plays the voice of brian on family guy?", "answer": ["Seth MacFarlane"]}
213 | {"question": "who was the first president of the afl?", "answer": ["Bud Adams", "Lamar Hunt"]}
214 | {"question": "where to get a marriage license in long island?", "answer": ["United States District Court for the Eastern District of New York"]}
215 | {"question": "what is the currency used in italy?", "answer": ["Euro"]}
216 | {"question": "which dawkins book to read first?", "answer": ["The Selfish Gene"]}
217 | {"question": "who is the coach of the sf giants?", "answer": ["Tim Flannery"]}
218 | {"question": "what produce does florida export?", "answer": ["Orange juice"]}
219 | {"question": "what state is the steelers from?", "answer": ["Pittsburgh"]}
220 | {"question": "where does delaware river start?", "answer": ["West Branch Delaware River", "Mount Jefferson"]}
221 | {"question": "when do world war ii end?", "answer": ["1942"]}
222 | {"question": "who is jimmy savile?", "answer": ["Presenter"]}
223 | {"question": "what is the national flower of hawaii?", "answer": ["Hawaiian hibiscus"]}
224 | {"question": "what countries does the panama canal go through?", "answer": ["Panama Canal Zone"]}
225 | {"question": "who was esther's husband?", "answer": ["Susa"]}
226 | {"question": "what did john irving wrote?", "answer": ["Trying to Save Piggy Sneed", "The Fourth Hand", "The Cider House Rules", "The 158-Pound Marriage", "The World According to Garp", "The Hotel New Hampshire", "A Widow for One Year", "A Prayer for Owen Meany", "The Water-Method Man", "Until I Find You"]}
227 | {"question": "what type of government was formed when italy unified?", "answer": ["Parliamentary republic"]}
228 | {"question": "what is the name of the san francisco newspaper?", "answer": ["The San Francisco Examiner", "California Star", "San Francisco Bay Guardian", "San Francisco Business Times", "San Francisco Bay Times", "San Francisco Chronicle", "Bay Area Reporter", "Sing Tao Daily", "AsianWeek", "San Francisco Call"]}
229 | {"question": "who did kim richards marry?", "answer": ["Greg Davis", "John Jackson", "G. Monty Brinson"]}
230 | {"question": "who plays blaine in batman?", "answer": ["Him/Herself"]}
231 | {"question": "what language does australians speak?", "answer": ["Greek Language", "English Language", "Italian Language", "Chinese language"]}
232 | {"question": "who did annie oakley married?", "answer": ["Frank E. Butler"]}
233 | {"question": "what university did romney graduated from?", "answer": ["Stanford University", "Cranbrook Schools", "Harvard Law School", "Harvard Business School", "Harvard University", "Brigham Young University"]}
234 | {"question": "what are the landlocked countries in latin america?", "answer": ["Bolivia", "Honduras", "Cuba", "El Salvador", "Guatemala", "Costa Rica", "Cuauhtémoc, D.F.", "Great Pyramid of Tenochtitlán", "Paraguay", "Belize"]}
235 | {"question": "what college did magic johnson play for?", "answer": ["Michigan State University"]}
236 | {"question": "where does the zambezi river start?", "answer": ["Mwinilunga"]}
237 | {"question": "who is shakira married to?", "answer": ["Gerard Piqué"]}
238 | {"question": "what has ian somerhalder acted in?", "answer": ["Smallville", "The Vampire Diaries", "Lost", "Tell Me You Love Me", "Fearless", "Young Americans"]}
239 | {"question": "where is the time zone line in south dakota?", "answer": ["Mountain Time Zone", "Central Time Zone", "UTC−07:00", "UTC−06:00"]}
240 | {"question": "who plays bilbo baggins in the hobbit?", "answer": ["Norman Bird", "Martin Freeman", "Ian Holm"]}
241 | {"question": "what time is it in texas houston right now?", "answer": ["Central Time Zone"]}
242 | {"question": "who is mary mcleod bethune for kids?", "answer": ["Educator"]}
243 | {"question": "what year does hitler die?", "answer": ["Hitler and His Generals: Military Conferences 1942-1945"]}
244 | {"question": "what countries do the united nations help?", "answer": ["Afghanistan", "Albania", "Angola", "Algeria", "Andorra", "Austria", "Australia", "Antigua and Barbuda", "Armenia", "Argentina"]}
245 | {"question": "when did charles goodyear invented rubber?", "answer": ["During the early 1830's he began inventing, filing six patents between 1830 and 1834, and during this period became interested in rubber, which he tried - unsuccessfully - to use in some of his mechanical inventions."]}
246 | {"question": "what places make up new england?", "answer": ["Maine"]}
247 | {"question": "what team does colin kaepernick play for?", "answer": ["San Francisco 49ers"]}
248 | {"question": "what did the islamic people believe in?", "answer": ["Zakāt", "Salah", "Hajj", "Islamic dietary laws", "Jihad", "Sawm", "Halal food", "Shahada", "Adab"]}
249 | {"question": "where to go fishing in roanoke va?", "answer": ["Rainbow Bluff Expedition"]}
250 | {"question": "who does peyton manning play football for?", "answer": ["Denver Broncos"]}
251 | {"question": "what is the state flower of arizona?", "answer": ["Saguaro"]}
252 | {"question": "where the missouri river ends?", "answer": ["Mississippi River"]}
253 | {"question": "what did shakespeare become famous for?", "answer": ["Poet", "Playwright", "Dramatist", "Lyricist", "Author"]}
254 | {"question": "where was st. lucy born?", "answer": ["Syracuse"]}
255 | {"question": "who is jamie little engaged to?", "answer": ["Cody Selman"]}
256 | {"question": "what super bowl did peyton manning win?", "answer": ["2006 NFL season"]}
257 | {"question": "which continental congress approve the declaration of independence?", "answer": ["Second Continental Congress"]}
258 | {"question": "what do christians believe about heaven hell and purgatory?", "answer": ["Greek Evangelical Church", "The Church of Nails"]}
259 | {"question": "which airport to fly into in buenos aires?", "answer": ["Aeroparque Jorge Newbery", "Ministro Pistarini International Airport", "Don Torcuato Airport"]}
260 | {"question": "what language does australia use?", "answer": ["English Language"]}
261 | {"question": "what high school did lil wayne graduate from?", "answer": ["Mcmain Magnet Secondary School"]}
262 | {"question": "what do you call the chinese writing system?", "answer": ["Standard Mandarin"]}
263 | {"question": "where does robin williams live 2011?", "answer": ["San Francisco"]}
264 | {"question": "what country did germany invade first in ww1?", "answer": ["Belgium"]}
265 | {"question": "who was the leader of soviet union during wwii?", "answer": ["Joseph Stalin"]}
266 | {"question": "what flower is on the oklahoma quarter?", "answer": ["Scissor-tailed Flycatcher"]}
267 | {"question": "who plays stephanie plum in one for the money?", "answer": ["Katherine Heigl"]}
268 | {"question": "what did albert speer design?", "answer": ["Deutsches Stadion", "Volkshalle", "Reich Chancellery", "Olympic Stadium"]}
269 | {"question": "where was theodore roosevelt buried?", "answer": ["Youngs Memorial Cemetery"]}
270 | {"question": "what made the soviet union fall?", "answer": ["Cold War"]}
271 | {"question": "what is there to do in peoria illinois?", "answer": ["Peoria Zoo", "Judge Flanagan Residence", "George L. Luthy Memorial Botanical Garden", "Lakeview Museum of Arts and Sciences", "WeaverRidge Golf Club", "Heart of Illinois Fair", "Wildlife Prairie State Park", "Par-A-Dice Hotel and Casino", "Peoria Civic Center", "O'Brien Field"]}
272 | {"question": "what school did michael jordan attend?", "answer": ["University of North Carolina at Chapel Hill", "Emsley A. Laney High School"]}
273 | {"question": "what countries have english as their official language?", "answer": ["Canada", "Australia", "Kingdom of Great Britain", "United States of America", "United Kingdom", "Ireland", "New Zealand"]}
274 | {"question": "what did st augustine do?", "answer": ["Physician", "Writer", "Philosopher"]}
275 | {"question": "what was the ancient egyptians spoken language?", "answer": ["Egyptian Arabic"]}
276 | {"question": "where herman cain stance on the issues?", "answer": ["Évocateur: The Morton Downey Jr. Movie"]}
277 | {"question": "who was vice president after kennedy died?", "answer": ["Lyndon B. Johnson"]}
278 | {"question": "what was nikola tesla inventions?", "answer": ["Tesla coil"]}
279 | {"question": "where did hank marvin come from?", "answer": ["Newcastle upon Tyne"]}
280 | {"question": "what did baron de montesquie influence?", "answer": ["charles-louis de secondat montesquieu influenced edward gibbon"]}
281 | {"question": "what songs does smokey robinson sing?", "answer": ["Being With You", "Cruisin'", "Crusin'", "And I Love Her", "The Tracks of My Tears", "Quiet Storm", "Tracks of my Tears", "Santa Claus is Coming to Town"]}
282 | {"question": "what is the australian dollar called?", "answer": ["Australian dollar"]}
283 | {"question": "what kind government does egypt have?", "answer": ["Semi-presidential system", "Constitutional republic", "Republic", "Unitary state"]}
284 | {"question": "where is the galapagos islands located on a world map?", "answer": ["Pacific Ocean", "Galápagos Province"]}
285 | {"question": "what all does google now do?", "answer": ["Google Maps", "Nexus 7", "Google Buzz", "Nexus 10", "Nexus One", "Nexus S", "Google Chrome", "Google Earth", "Google Wave"]}
286 | {"question": "who did sir francis drake marry?", "answer": ["Mary Newman", "Elizabeth Sydenham"]}
287 | {"question": "what county is west st paul in?", "answer": ["Dakota County"]}
288 | {"question": "what was the capital city of the east roman empire?", "answer": ["Constantinople"]}
289 | {"question": "when did chipper jones get drafted?", "answer": ["1990 Major League Baseball Draft"]}
290 | {"question": "what instruments does justin bieber use?", "answer": ["guitar", "Piano", "trumpet", "Drums"]}
291 | {"question": "what electorate does anna bligh represent?", "answer": ["Electoral district of South Brisbane"]}
292 | {"question": "what role did alexander hamilton play in the constitution?", "answer": ["Financier"]}
293 | {"question": "where did margaret hoover go to college?", "answer": ["Davidson College"]}
294 | {"question": "in which continent is germany?", "answer": ["Europe"]}
295 | {"question": "who did france surrender to in ww2?", "answer": ["Germany"]}
296 | {"question": "who did queen elizabeth 1 executed?", "answer": ["queen elizabeth i of england she executed mary queen of scots"]}
297 | {"question": "where can i go running in sacramento?", "answer": ["Boulevard Park"]}
298 | {"question": "what county is texarkana arkansas in?", "answer": ["Miller County"]}
299 | {"question": "what are the two official languages of paraguay?", "answer": ["Paraguayan Guaraní", "Spanish Language"]}
300 | {"question": "what language is spoken in haiti today?", "answer": ["French Language", "Haitian Creole French Language"]}
301 | {"question": "where was the vietnam war location?", "answer": ["South Vietnam", "North Vietnam", "Southeast Asia", "Cambodia", "Vietnam", "Laos"]}
302 | {"question": "where does name pennsylvania come from?", "answer": ["William Penn"]}
303 | {"question": "what is the money of switzerland called?", "answer": ["Swiss franc"]}
304 | {"question": "what countries does greece share borders with?", "answer": ["Turkey", "Republic of Macedonia", "Albania", "Lake Prespa", "Bulgaria"]}
305 | {"question": "what year was lebron james rookie season?", "answer": ["2003–04 NBA season"]}
306 | {"question": "what is cindy sherman known for?", "answer": ["Photographer"]}
307 | {"question": "what movies has carmen electra been in?", "answer": ["The Mating Habits of the Earthbound Human", "Scary Movie", "Getting Played", "Cheaper by the Dozen 2", "Meet the Spartans", "I Want Candy", "Full of It", "The Chosen One: Legend of the Raven", "Scary Movie 4", "Dirty Love"]}
308 | {"question": "what is the political structure of china?", "answer": ["Single-party state", "Communist state", "Socialist state"]}
309 | {"question": "which states does the colorado river run through?", "answer": ["Utah", "Arizona", "Nevada", "California", "Colorado"]}
310 | {"question": "who is moira en x men?", "answer": ["Mutant"]}
311 | {"question": "what are the major cities in ukraine?", "answer": ["Kiev"]}
312 | {"question": "what countries are part of the uk?", "answer": ["Scotland", "England", "Wales", "Northern Ireland"]}
313 | {"question": "who was the soviet leader during world war ii?", "answer": ["Alexei Negmatov"]}
314 | {"question": "what kind of government does libya have today?", "answer": ["Provisional government", "Parliamentary republic"]}
315 | {"question": "where did aaron rodgers go to high school?", "answer": ["Pleasant Valley High School"]}
316 | {"question": "what year was the first miss america pageant held?", "answer": ["1930 Miss America"]}
317 | {"question": "what did george clemenceau do?", "answer": ["Newspaper", "Physician", "Statesman", "Publisher"]}
318 | {"question": "what state is mount st. helens in?", "answer": ["Washington"]}
319 | {"question": "what is st anthony patron saint of?", "answer": ["Padua"]}
320 | {"question": "who did france surrender to in ww2?", "answer": ["Germany"]}
321 | {"question": "who plays edward scissorhands?", "answer": ["Johnny Depp"]}
322 | {"question": "where is the university of maryland medical school?", "answer": ["Maryland", "United States of America", "Baltimore"]}
323 | {"question": "what language do people speak in the netherlands?", "answer": ["Frisian languages", "West Flemish", "Dutch Language"]}
324 | {"question": "what countries are part of the baltic?", "answer": ["Latvia", "Lithuania", "Estonia"]}
325 | {"question": "when did florida marlins join mlb?", "answer": ["1993 Major League Baseball Season"]}
326 | {"question": "who plays the voice of kitt in knight rider?", "answer": ["William Daniels"]}
327 | {"question": "what did the scientist chadwick discovered?", "answer": ["Neutron"]}
328 | {"question": "where did queensland get its name from?", "answer": ["Queen Victoria"]}
329 | {"question": "what music period did beethoven live in?", "answer": ["Opera", "Classical music"]}
330 | {"question": "which countries speak german officially?", "answer": ["Canada", "German Democratic Republic", "Luxembourg", "Switzerland", "Liechtenstein", "Germany", "West Germany", "Belgium", "Vatican City", "Second Polish Republic"]}
331 | {"question": "what year did president william henry harrison take office?", "answer": ["3/4/1841"]}
332 | {"question": "what are the songs that justin bieber wrote?", "answer": ["Catching Feelings", "Down to Earth", "Beauty and a Beat", "All Around The World (featuring Ludacris)", "Die in Your Arms", "As Long As You Love Me (featuring Big Sean)", "Baby", "Believe", "Be Alright", "Boyfriend"]}
333 | {"question": "what language brazil speak?", "answer": ["Brazilian Portuguese", "Portuguese Language", "Italian Language"]}
334 | {"question": "which countries share a border with russia?", "answer": ["Ukraine", "Belarus", "Kazakhstan", "Poland", "Lithuania", "Azerbaijan", "Mongolia", "North Korea", "Georgia", "Norway"]}
335 | {"question": "who is princess leia in star wars?", "answer": ["Carrie Fisher"]}
336 | {"question": "what are some of the traditions of islam?", "answer": ["Zakāt", "Salah", "Hajj", "Islamic dietary laws", "Mosque Carpet", "Jihad", "Sawm", "Halal food", "Shahada", "Adab"]}
337 | {"question": "what team was chris paul on?", "answer": ["Los Angeles Clippers"]}
338 | {"question": "what capital of austria?", "answer": ["Vienna"]}
339 | {"question": "what county is greeley colorado in?", "answer": ["Weld County"]}
340 | {"question": "what was lebron james first team?", "answer": ["Cleveland Cavaliers"]}
341 | {"question": "what industry does walmart operate in?", "answer": ["Department Stores", "Retail", "Variety Stores"]}
342 | {"question": "what was the title of the book charles darwin wrote?", "answer": ["The Structure and Distribution of Coral Reefs", "On evolution", "A student's introduction to Charles Darwin", "Climbing Plants", "The Expression of the Emotions in Man and Animals", "The origin of species : complete and fully illustrated", "The Origin of Species", "The Life of Erasmus Darwin", "The Autobiography of Charles Darwin", "The Descent of Man, and Selection in Relation to Sex"]}
343 | {"question": "what was robert burns?", "answer": ["Poet", "Writer", "Bard", "Author"]}
344 | {"question": "what battles did stonewall jackson fight in?", "answer": ["Battle of Manassas Station Ops.", "Battle of McDowell", "Battle of Port Republic", "Battle of Rappahannock Station I", "Battle of Chancellorsville", "Battle of Front Royal", "Battle of Hoke's Run", "Battle of Cedar Mountain", "First Battle of Winchester", "Battle of Hancock"]}
345 | {"question": "who is governor of ohio 2011?", "answer": ["John Kasich"]}
346 | {"question": "who does donnie wahlberg play in the sixth sense?", "answer": ["Vincent Grey"]}
347 | {"question": "where is laos in world map?", "answer": ["Cambodia"]}
348 | {"question": "what do you call the chinese writing system?", "answer": ["Standard Mandarin"]}
349 | {"question": "what was jesse james killed with?", "answer": ["Assassination", "Firearm"]}
350 | {"question": "when does the mayan calendar end exactly?", "answer": ["2012"]}
351 | {"question": "who does jordan palmer play for?", "answer": ["Jacksonville Jaguars"]}
352 | {"question": "who was vice president after kennedy died?", "answer": ["Lyndon B. Johnson"]}
353 | {"question": "where did salvador dali study art?", "answer": ["Real Academia de Bellas Artes de San Fernando"]}
354 | {"question": "what was john quincy adams famous for?", "answer": ["Secretary of State", "President", "Ambassador", "Member of Congress", "Senator"]}
355 | {"question": "where did giuliana rancic grow up?", "answer": ["Naples"]}
356 | {"question": "where is the university of the rockies located?", "answer": ["United States of America", "Colorado", "Colorado Springs"]}
357 | {"question": "what did george orwell died of?", "answer": ["Tuberculosis"]}
358 | {"question": "what type of sports do japanese play?", "answer": ["Japan national football team", "Japan women's national handball team", "Japan men's national volleyball team", "Japan national handball team", "Japan women's national volleyball team", "Japan national baseball team"]}
359 | {"question": "who is michael buble?", "answer": ["Singer", "Actor", "Singer-songwriter"]}
360 | {"question": "what countries in the world speak chinese?", "answer": ["Canada", "Brunei", "Singapore", "Malaysia", "Shěn", "Vietnam", "China"]}
361 | {"question": "who is the head coach of inter milan?", "answer": ["Andrea Stramaccioni"]}
362 | {"question": "what are the important holidays of islam?", "answer": ["Eid al-Fitr", "Ramadan", "Eid al-Adha"]}
363 | {"question": "what language do the maasai tribe speak?", "answer": ["Maasai Language"]}
364 | {"question": "where did william morris go to college?", "answer": ["Marlborough College", "Exeter College, Oxford", "University of Oxford"]}
365 | {"question": "what type of music did john lennon sing?", "answer": ["Experimental rock", "Pop rock", "Pop music", "Blues-rock", "Art rock", "Soft rock", "Psychedelic rock", "Rock music", "Experimental music"]}
366 | {"question": "what form of currency does china have?", "answer": ["Renminbi"]}
367 | {"question": "where did dutch language come from?", "answer": ["Europeans"]}
368 | {"question": "who did paul revere marry?", "answer": ["Sarah Revere"]}
369 | {"question": "what country did germany invade first in ww1?", "answer": ["Belgium"]}
370 | {"question": "what did stephen hawking become famous for?", "answer": ["Author", "Mathematician", "Professor", "Writer", "Cosmologist", "Physicist", "Actor", "Astronomer", "Scientist", "Science writer"]}
371 | {"question": "what countries share borders with spain?", "answer": ["Perejil Island", "Portugal", "France", "Andorra", "Morocco", "Gibraltar"]}
372 | {"question": "who was the father of king george vi?", "answer": ["George V"]}
373 | {"question": "who was the leader of soviet union during wwii?", "answer": ["Joseph Stalin"]}
374 | {"question": "what did drita find out?", "answer": ["Football Superleague of Kosovo"]}
375 | {"question": "what did james k polk do before he was president?", "answer": ["Lawyer"]}
376 | {"question": "what make of bike did steve mcqueen ride in the great escape?", "answer": ["Yucatan"]}
377 | {"question": "what type of music did claude debussy play?", "answer": ["Ballet", "French opera", "Art song", "Incidental music", "Classical music", "20th-century classical music"]}
378 | {"question": "what language does egyptian people speak?", "answer": ["Modern Standard Arabic"]}
379 | {"question": "who won the battle of gettysburg union or confederate?", "answer": ["Union"]}
380 | {"question": "what works of art did leonardo da vinci produce?", "answer": ["Ginevra de' Benci", "The Last Supper", "The Virgin and Child with St Anne and St John the Baptist", "Mona Lisa", "Benois Madonna", "Madonna Litta", "Lady with an Ermine", "St. John the Baptist", "The Virgin and Child with St. Anne", "Annunciation"]}
381 | {"question": "when was president john adams elected?", "answer": ["John Adams Presidential Campaign, 1796", "John Adams Presidential Campaign, 1800"]}
382 | {"question": "what are the four main languages spoken in spain?", "answer": ["Basque Language", "Galician Language", "Catalan language", "Occitan language"]}
383 | {"question": "who did john kennedy have affairs with?", "answer": ["william averell harriman appointed by john fitzgerald kennedy"]}
384 | {"question": "who are the colorado representatives?", "answer": ["Wayne Allard", "Ken Salazar", "Gordon L. Allott", "Hank Brown", "Edwin C. Johnson", "William L. Armstrong", "Floyd K. Haskell", "Mark Udall", "Michael Bennet"]}
385 | {"question": "where did andy murray started playing tennis?", "answer": ["United Kingdom"]}
386 | {"question": "what does ringo sing?", "answer": ["Yellow Submarine", "\"Pinocchio Medley (\"Do You See the Noses Growing?\"", "Good Night", "I Shall Be Released", "Roll Over Beethoven", "Tommy's Holiday Camp", "Sweet Little Sixteen", "California Calling", "Honey Don't", "Don’t Pass Me By"]}
387 | {"question": "what team did david beckham play for in 2011?", "answer": ["LA Galaxy"]}
388 | {"question": "who is washington redskins backup qb?", "answer": ["Rex Grossman"]}
389 | {"question": "what was lucille ball?", "answer": ["Singer", "Model", "Comedian", "Television Producer", "Actor"]}
390 | {"question": "where is the nra headquarters located?", "answer": ["Fairfax"]}
391 | {"question": "what type of government does the us follow?", "answer": ["Presidential system", "Federal republic", "Representative democracy", "Two-party system", "Constitutional republic", "Republic"]}
392 | {"question": "who did carrie ann inaba get engaged to?", "answer": ["Jesse Sloan"]}
393 | {"question": "where do american bulldogs originate from?", "answer": ["United States of America"]}
394 | {"question": "what is william taft known for?", "answer": ["President of the United States"]}
395 | {"question": "who led the campaign in the shenandoah valley?", "answer": ["Jackson's Valley Campaign"]}
396 | {"question": "what did peter tchaikovsky do?", "answer": ["Composer"]}
397 | {"question": "what university did gordon brown attend?", "answer": ["University of Edinburgh"]}
398 | {"question": "when did michael jordan return to the nba?", "answer": ["2001–02 NBA season"]}
399 | {"question": "what is the zip code for midland tx?", "answer": ["79702", "79710", "79708", "79706", "79707", "79704", "79705", "79711", "79703", "79701"]}
400 | {"question": "what document did james madison write?", "answer": ["The Federalist Papers", "The Papers of James Madison: Presidential Series", "The Papers of James Madison: Congressional Series", "The Papers of James Madison: Secretary of State Series", "The Papers of James Madison", "The Papers of James Madison: Retirement Series"]}
401 | 


--------------------------------------------------------------------------------
/download.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import re
  4 | from tqdm import tqdm
  5 | import requests
  6 | import json, argparse
  7 | 
  8 | sess = requests.Session()
  9 | 
 10 | def parse_args():
 11 |     parser = argparse.ArgumentParser()
 12 |     parser.add_argument('--link', '-l', type=str, required=True, help='Share link of Tsinghua Cloud')
 13 |     parser.add_argument('--password', '-p', type=str, default='', help='Password of the share link')
 14 |     parser.add_argument('--save', '-s', type=str, default='./', help='Save directory')
 15 |     parser.add_argument('--file', '-f', type=str, default=None, help='File name, support regex, if not set, download all files')
 16 |     return parser.parse_args()
 17 | 
 18 | def get_share_key(url):
 19 |     prefix = 'https://cloud.tsinghua.edu.cn/d/'
 20 |     if not url.startswith(prefix):
 21 |         raise ValueError('Share link of Tsinghua Cloud should start with {}'.format(prefix))
 22 |     share_key = url[len(prefix):].replace('/', '')     
 23 |     print('Share key: {}'.format(share_key))
 24 |     
 25 |     return share_key
 26 |         
 27 |     
 28 | def dfs_search_files(share_key: str, path="/"):
 29 |     global sess
 30 |     filelist = []
 31 |     print('https://cloud.tsinghua.edu.cn/api/v2.1/share-links/{}/dirents/?path={}'.format(share_key, path))
 32 |     r = sess.get('https://cloud.tsinghua.edu.cn/api/v2.1/share-links/{}/dirents/?path={}'.format(share_key, path))
 33 |     objects = r.json()['dirent_list']
 34 |     for obj in objects:
 35 |         if obj["is_dir"]:
 36 |             filelist += dfs_search_files(share_key, obj['folder_path'])
 37 |         else:
 38 |             filelist.append(obj)
 39 | 
 40 |     return filelist
 41 |     
 42 | def download_single_file(url: str, fname: str):
 43 |     global sess
 44 |     resp = sess.get(url, stream=True)
 45 |     total = int(resp.headers.get('content-length', 0))
 46 |     dir_name = os.path.dirname(fname)
 47 |     if not os.path.exists(dir_name):
 48 |         os.makedirs(dir_name)
 49 |     with open(fname, 'wb') as file, tqdm(
 50 |         total=total,
 51 |         ncols=120,
 52 |         unit='iB',
 53 |         unit_scale=True,
 54 |         unit_divisor=1024,
 55 |     ) as bar:
 56 |         for data in resp.iter_content(chunk_size=1024):
 57 |             size = file.write(data)
 58 |             bar.update(size)
 59 | 
 60 | def download(url, save_dir):
 61 |     share_key = get_share_key(url)
 62 |     
 63 |     print("Searching for files to be downloaded...")
 64 |     search_files = dfs_search_files(share_key)
 65 |     # for file in search_files:
 66 |     #     print(file['is_dir'], file.keys())
 67 |     filelist = sorted(search_files, key=lambda x: x['file_path'])
 68 |     print("Found {} files in the share link.".format(len(filelist)))
 69 |     print("Last Modified Time".ljust(25), " ", "File Size".rjust(10), " ", "File Path")
 70 |     print("-" * 100)
 71 |     for file in filelist:
 72 |         print(file["last_modified"], " ", str(file["size"]).rjust(10), " ", file["file_path"])
 73 |     print("-" * 100)
 74 |     
 75 |     if not args.yes:
 76 |         while True:
 77 |             key = input("Start downloading? [y/n]")
 78 |             if key == 'y':
 79 |                 break
 80 |             elif key == 'n':
 81 |                 return
 82 |     
 83 |     flag = True
 84 |     for i, file in enumerate(filelist):
 85 |         file_url = 'https://cloud.tsinghua.edu.cn/d/{}/files/?p={}&dl=1'.format(share_key, file["file_path"])
 86 |         save_path = os.path.join(save_dir, file["file_path"][1:])
 87 |         if not os.path.exists(save_dir):
 88 |             os.makedirs(save_dir)
 89 |         print("[{}/{}] Downloading File: {}".format(i + 1, len(filelist), save_path))
 90 |         try:
 91 |             download_single_file(file_url, save_path)
 92 |         except Exception as e:
 93 |             print("Error happened when downloading file: {}".format(save_path))
 94 |             print(e)
 95 |             flag = False
 96 |     if flag:
 97 |         print("Download finished.")
 98 |     else:
 99 |         print("Download finished with error.")
100 |     
101 |     return flag
102 | 
def make_data(sample):
    """Turn one raw sample into a ``(source, target)`` pair of single lines.

    The source is every reference, numbered from 1, followed by the question
    and an ``Answer:`` cue, with the pieces separated by a backslash. The
    target is the answer. Line breaks in both are replaced by spaces.
    """
    newline_to_space = str.maketrans("\n\r", "  ")

    pieces = [
        "Reference [%d]: %s\\" % (number, ref)
        for number, ref in enumerate(sample['references'], start=1)
    ]
    pieces.append("Question: %s\\Answer:" % (sample['question']))

    source = "".join(pieces).translate(newline_to_space)
    target = sample['answer'].translate(newline_to_space)
    return source, target
112 |     
if __name__ == "__main__":
    # Command-line entry point: download the requested artifact(s) and, for
    # the generator training data, convert each raw JSONL split into
    # line-aligned `.source` / `.target` text files.

    arg = argparse.ArgumentParser()
    arg.add_argument('target', type=str, choices=["generator-training-data", "retriever-training-data", "retriever-pretrained-checkpoint", "all"], help='Target to download')
    arg.add_argument('--save', '-s', type=str, default='./download', help='Save directory')
    arg.add_argument("-y", "--yes", action="store_true", help="Download without confirmation")
    # NOTE: `args` must remain a module-level name; download() consults it
    # for the `yes` flag.
    args = arg.parse_args()

    if args.target in ["all", "generator-training-data"]:

        save_dir = os.path.join(args.save, 'generator-training-data', 'raw')
        if download('https://cloud.tsinghua.edu.cn/d/d290dcfc92e342f9a017/', save_dir):

            processed_dir = os.path.join(args.save, 'generator-training-data', 'processed')
            os.makedirs(processed_dir, exist_ok=True)

            for split in ['train', 'val', 'test']:
                # Context managers guarantee the handles are closed even if a
                # record fails to parse; blank lines are skipped.
                with open(f'{save_dir}/{split}.jsonl', encoding='utf-8') as raw_file:
                    ds = [json.loads(line) for line in raw_file if line.strip()]

                source_path = os.path.join(processed_dir, f'{split}.source')
                target_path = os.path.join(processed_dir, f'{split}.target')
                with open(source_path, 'w', encoding='utf-8') as source_out, \
                        open(target_path, 'w', encoding='utf-8') as target_out:
                    for sample in tqdm(ds):
                        source, target = make_data(sample)
                        source_out.write(source + '\n')
                        target_out.write(target + '\n')

    if args.target in ["all", "retriever-training-data"]:
        download("https://cloud.tsinghua.edu.cn/d/3927b67a834c475288e2/", os.path.join(args.save, 'retriever-training-data'))

    if args.target in ["all", "retriever-pretrained-checkpoint"]:
        download("https://cloud.tsinghua.edu.cn/d/bc96946dd9a14c84b8d4/", os.path.join(args.save, 'retriever-pretrained-checkpoint'))
146 | 


--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
 1 | from arguments import get_args
 2 | from model import load_model
 3 | 
 4 | def main():
 5 |     args = get_args()
 6 |     
 7 |     webglm = load_model(args)
 8 |     
 9 |     task = args.task
10 |     if task == 'triviaqa':
11 |         from evaluate.triviaqa import eval
12 |     elif task == 'nq_open':
13 |         from evaluate.eval import eval
14 |     elif task == 'web_questions':
15 |         from evaluate.eval import eval
16 |     else:
17 |         raise "Task Name Error!"
18 |     
19 |     print('WebGLM Initialize Done. Start Evaluating...')
20 |     result = eval(webglm, args)
21 |     print(f'Result: {result}')
22 |     print('Evaluate Done')
23 | 
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/evaluate/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/WebGLM/dd03d8fe05b504dc734f52e8689818deff643912/evaluate/__init__.py


--------------------------------------------------------------------------------
/evaluate/eval.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from tqdm import tqdm
 3 | 
 4 | def eval(model, args):
 5 |     ds = [json.loads(data_str) for data_str in open(args.evaluate_task_data_path).readlines()]
 6 |     
 7 |     correct, total = 0, 0
 8 |     
 9 |     for ix, sample in enumerate(tqdm(ds)):
10 |         predict = model.query(sample['question'])['answer']
11 |         for label in sample['answer']:
12 |             if label in predict:
13 |                 correct += 1
14 |                 break
15 |         total += 1
16 |     
17 |     return correct / total


--------------------------------------------------------------------------------
/evaluate/triviaqa.py:
--------------------------------------------------------------------------------
 1 | from transformers import BigBirdTokenizer, BigBirdForQuestionAnswering
 2 | import torch
 3 | import json
 4 | from tqdm import tqdm
 5 | 
# Characters that expand_to_aliases() replaces with a space when normalizing answers.
PUNCTUATION_SET_TO_EXCLUDE = set(''.join(['‘', '’', '´', '`', '.', ',', '-', '"', '\'', '[', ']', '{', '}', '(', ')', '!', '?']))
 7 | 
 8 | def get_sub_answers(answers, begin=0, end=None):
 9 |     return [" ".join(x.split(" ")[begin:end]) for x in answers if len(x.split(" ")) > 1]
10 | 
def expand_to_aliases(given_answers, ignore_prefix=False, ignore_suffix=False):
    """Normalize answers into a set of comparable aliases.

    Args:
        given_answers: List of answer strings.
        ignore_prefix: Also include each multi-word answer without its first word.
        ignore_suffix: Also include each multi-word answer (including the
            prefix-stripped variants, if enabled) without its last word.

    Returns:
        A set of aliases: lower-cased, underscores and excluded punctuation
        turned into spaces, and whitespace runs collapsed.
    """
    candidates = given_answers
    if ignore_prefix:
        candidates = candidates + get_sub_answers(candidates, begin=1)
    if ignore_suffix:
        candidates = candidates + get_sub_answers(candidates, end=-1)

    def normalize(text):
        lowered = text.replace('_', ' ').lower()
        cleaned = ''.join(' ' if ch in PUNCTUATION_SET_TO_EXCLUDE else ch for ch in lowered)
        # Joining the split pieces already removes leading/trailing whitespace.
        return ' '.join(cleaned.split())

    return {normalize(candidate) for candidate in candidates}
22 | 
23 | 
def get_best_valid_start_end_idx(start_scores, end_scores, top_k=1, max_size=100):
    """Pick the best-scoring (start, end) index pair forming a valid span.

    Only the ``top_k`` highest start and end scores are considered. A pair is
    penalized heavily when the end precedes the start or the span is wider
    than ``max_size``.

    Returns:
        A ``(start_index, end_index)`` tuple of 0-dim tensors.
    """
    start_top = torch.topk(start_scores, top_k)
    end_top = torch.topk(end_scores, top_k)
    start_pos, end_pos = start_top.indices, end_top.indices

    # Rows enumerate end candidates, columns enumerate start candidates.
    span_widths = end_pos.unsqueeze(1) - start_pos.unsqueeze(0)
    invalid = torch.logical_or(span_widths < 0, span_widths > max_size)
    pair_scores = (end_top.values.unsqueeze(1) + start_top.values.unsqueeze(0)) - (1e8 * invalid)

    # argmax works on the flattened matrix; recover (row, column) from it.
    end_rank, start_rank = divmod(torch.argmax(pair_scores).item(), top_k)
    return start_pos[start_rank], end_pos[end_rank]
34 | 
def extract(extractor, tokenizer, example):
    """Extract a short answer span from the prediction and compare it with the gold answers.

    The question and predicted text are encoded together, the extractor scores
    every position, and the best valid span is decoded back to text.

    Args:
        extractor: Model called with ``input_ids``; its output's ``to_tuple()``
            is unpacked as (start scores, end scores).
        tokenizer: Tokenizer matching ``extractor``.
        example: Dict with ``"question"``, ``"predict"`` and ``"answer"``.

    Returns:
        The same ``example`` dict, updated in place with ``"output"`` (the
        extracted text) and ``"match"`` (whether any normalized alias of the
        output equals a normalized alias of an answer).
    """
    encoded = tokenizer(
        example["question"],
        example["predict"],
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    device_ids = encoded['input_ids'].to("cuda")

    with torch.no_grad():
        begin_scores, finish_scores = extractor(input_ids=device_ids).to_tuple()

    span_start, span_end = get_best_valid_start_end_idx(begin_scores[0], finish_scores[0], top_k=8, max_size=16)

    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0].tolist())
    span_tokens = tokens[span_start: span_end + 1]
    example["output"] = tokenizer.decode(tokenizer.convert_tokens_to_ids(span_tokens))

    gold_aliases = expand_to_aliases(example["answer"], ignore_prefix=True, ignore_suffix=True)
    predicted_aliases = expand_to_aliases([example["output"]], ignore_prefix=True)
    example["match"] = bool(gold_aliases & predicted_aliases)

    return example
55 | 
56 | 
def eval(model, args):
    """Evaluate ``model`` on TriviaQA-style data and report accuracy per label.

    Each question is answered by ``model.query``; a BigBird QA model then
    extracts a short span from the answer, which is matched against the gold
    answers.

    Args:
        model: Object whose ``query(question)`` returns a dict with an
            ``'answer'`` string.
        args: Namespace providing ``evaluate_task_data_path``, a JSONL file
            whose records have ``'question'``, ``'answer'`` and ``'labels'``.

    Returns:
        Dict mapping every label found in the data to the fraction of its
        samples that matched.
    """
    # Close the file deterministically and decode as UTF-8; skip blank lines.
    with open(args.evaluate_task_data_path, encoding="utf-8") as data_file:
        ds = [json.loads(line) for line in data_file if line.strip()]

    for sample in tqdm(ds):
        sample['predict'] = model.query(sample['question'])['answer']

    print('Start Extracting Answer...')

    tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-base-trivia-itc")
    extractor = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-base-trivia-itc").to("cuda")

    # label -> [matched count, total count]
    scores = {}
    for sample in tqdm(ds):
        match = extract(extractor, tokenizer, sample)['match']
        for label in sample['labels']:
            counts = scores.setdefault(label, [0, 0])
            counts[1] += 1
            if match:
                counts[0] += 1

    # Every label present has a total of at least 1, so the division is safe.
    return {split: matched / total for split, (matched, total) in scores.items()}


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .modeling_webglm import WebGLM, load_model
2 | from .utils import citation_correction


--------------------------------------------------------------------------------
/model/modeling_webglm.py:
--------------------------------------------------------------------------------
 1 | from .retriever import ReferenceRetiever
 2 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 3 | import re, os
 4 | 
 5 | class WebGLM:
 6 |     def __init__(self, webglm_ckpt_path, retriever_ckpt_path, device=None, filter_max_batch_size=400, searcher_name="serpapi") -> None:
 7 |         self.device = device
 8 |         self.ref_retriever = ReferenceRetiever(retriever_ckpt_path, device, filter_max_batch_size, searcher_name)
 9 |         self.tokenizer = AutoTokenizer.from_pretrained(webglm_ckpt_path, trust_remote_code=True)
10 |         self.model = AutoModelForSeq2SeqLM.from_pretrained(webglm_ckpt_path, trust_remote_code=True)
11 |         self.model = self.model.half()
12 |         if device:
13 |             self.model.to(device)
14 |         self.model.eval()
15 |     
16 |     def query(self, question):
17 |         refs = self.ref_retriever.query(question)
18 |         if not refs:
19 |             return { "references": [], "answer": "" }
20 |         prompt = ''
21 |         for ix, ref in enumerate(refs):
22 |             txt = ref["text"]
23 |             prompt += f'Reference [{ix+1}]: {txt}' '\\'
24 |         prompt += f'Question: {question}\\Answer: [gMASK]'
25 |         inputs = self.tokenizer(prompt, return_tensors="pt")
26 |         inputs = self.tokenizer.build_inputs_for_generation(inputs, max_gen_length=1024)
27 |         if self.device:
28 |             inputs = inputs.to(self.device)
29 |         outputs = self.model.generate(**inputs, max_length=1024, eos_token_id = self.tokenizer.eop_token_id, pad_token_id=self.tokenizer.eop_token_id)
30 |         f = re.findall(r"<\|startofpiece\|>(.+)<\|endofpiece\|>", self.tokenizer.decode(outputs[0].tolist()))
31 |         assert len(f) > 0
32 |         return { "answer": f[0].strip(), "references": refs}
33 |     
34 |     def stream_query(self, question):
35 |         refs = self.ref_retriever.query(question)
36 |         if not refs:
37 |             yield { "references": [], "answer": "" }
38 |             return
39 |         yield { "references": refs }
40 |         prompt = ''
41 |         for ix, ref in enumerate(refs):
42 |             txt = ref["text"]
43 |             prompt += f'Reference [{ix+1}]: {txt}' '\\'
44 |         prompt += f'Question: {question}\\Answer: [gMASK]'
45 |         inputs = self.tokenizer(prompt, return_tensors="pt")
46 |         inputs = self.tokenizer.build_inputs_for_generation(inputs, max_gen_length=1024)
47 |         if self.device:
48 |             inputs = inputs.to(self.device)
49 |         outputs = self.model.generate(**inputs, max_length=1024, eos_token_id = self.tokenizer.eop_token_id, pad_token_id=self.tokenizer.eop_token_id)
50 |         f = re.findall(r"<\|startofpiece\|>(.+)<\|endofpiece\|>", self.tokenizer.decode(outputs[0].tolist()))
51 |         assert len(f) > 0
52 |         yield { "answer": f[0].strip() }
53 | 
54 | 
def load_model(args):
    """Build a :class:`WebGLM` instance from parsed arguments and the environment.

    Checkpoint paths come from ``args`` first, then from the ``WEBGLM_CKPT`` /
    ``WEBGLM_RETRIEVER_CKPT`` environment variables; the generator falls back
    to ``'THUDM/WebGLM'``. If ``args.serpapi_key`` is set it is exported as
    ``SERPAPI_KEY``.

    Args:
        args: Namespace with ``webglm_ckpt_path``, ``retriever_ckpt_path``,
            ``serpapi_key``, ``device``, ``filter_max_batch_size`` and
            ``searcher``.

    Returns:
        The initialized ``WebGLM`` object.

    Raises:
        SystemExit: with status 1 when no retriever checkpoint is configured.
    """
    webglm_ckpt_path = args.webglm_ckpt_path or os.getenv("WEBGLM_CKPT") or 'THUDM/WebGLM'
    retriever_ckpt_path = args.retriever_ckpt_path or os.getenv("WEBGLM_RETRIEVER_CKPT")
    if not retriever_ckpt_path:
        print('Retriever checkpoint not specified, please specify it with --retriever_ckpt_path or $WEBGLM_RETRIEVER_CKPT')
        # `exit()` is injected by the `site` module for interactive use and is
        # not always available; raising SystemExit has the same effect.
        raise SystemExit(1)
    if args.serpapi_key:
        os.environ["SERPAPI_KEY"] = args.serpapi_key

    print('WebGLM Initializing...')

    webglm = WebGLM(webglm_ckpt_path, retriever_ckpt_path, args.device, args.filter_max_batch_size, args.searcher)

    print('WebGLM Loaded')

    return webglm


--------------------------------------------------------------------------------
/model/retriever/__init__.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from .searching import create_searcher
 3 | from .fetching import Fetcher
 4 | from .extracting import Extractor
 5 | from .filtering import ReferenceFilter
 6 | 
 7 | from typing import Optional, Union, List, Dict, Tuple, Iterable, Callable, Any
 8 | 
 9 | class ReferenceRetiever():
10 |     def __init__(self, retriever_ckpt_path, device=None, filter_max_batch_size=400, searcher="serpapi") -> None:
11 |         self.searcher = create_searcher(searcher)
12 |         self.fetcher = Fetcher()
13 |         self.extractor = Extractor()
14 |         self.filter = ReferenceFilter(retriever_ckpt_path, device, filter_max_batch_size)
15 | 
16 |     def query(self, question) -> List[Dict[str, str]]:
17 |         print("[System] Searching ...")
18 |         search_results = self.searcher.search(question)
19 |         urls = [result.url for result in search_results]
20 |         titles = {result.url: result.title for result in search_results}
21 |         print("[System] Count of available urls: ", len(urls))
22 |         if len(urls) == 0:
23 |             print("[System] No available urls. Please check your network connection.")
24 |             return None
25 |             
26 |         print("[System] Fetching ...")
27 |         fetch_results = self.fetcher.fetch(urls)
28 |         cnt = sum([len(fetch_results[key]) for key in fetch_results])
29 |         print("[System] Count of available fetch results: ", cnt)
30 |         if cnt == 0:
31 |             print("[System] No available fetch results. Please check playwright or your network.")
32 |             return None
33 |             
34 |         print("[System] Extracting ...")
35 |         data_list = []
36 |         for url in fetch_results:
37 |             extract_results = self.extractor.extract_by_html2text(fetch_results[url])
38 |             for value in extract_results:
39 |                 data_list.append({
40 |                     "url": url,
41 |                     "title": titles[url],
42 |                     "text": value
43 |                 })
44 |         print("[System] Count of paragraphs: ", len(data_list))
45 |         if len(data_list) == 0:
46 |             print("[System] No available paragraphs. The references provide no useful information.")
47 |             return None
48 |         
49 |         print("[System] Filtering ...")
50 |         return self.filter.produce_references(question, data_list, 5)


--------------------------------------------------------------------------------
/model/retriever/extracting/__init__.py:
--------------------------------------------------------------------------------
 1 | from .extracting_by_bs4 import extracting as bs4
 2 | from .html2text import html2text
 3 | 
 4 | from typing import List, Dict
 5 | import re
 6 | 
 7 | class Extractor:
 8 |     def __init__(self) -> None:
 9 |         pass
10 |     
11 |     def _pre_filter(self, paragraphs):
12 |         # sorted_paragraphs = sorted(paragraphs, key=lambda x: len(x))
13 |         # if len(sorted_paragraphs[-1]) < 10:
14 |         #     return []
15 |         ret = []
16 |         for item in paragraphs:
17 |             item = item.strip()
18 |             item = re.sub(r"\[\d+\]", "", item) 
19 |             if len(item) < 50:
20 |                 continue
21 |             if len(item) > 1200:
22 |                 item = item[:1200] + "..."
23 |             ret.append(item)
24 |         return ret
25 |     
26 |     def extract_by_bs4(self, html) -> List[str]:
27 |         return self._pre_filter(bs4(html))
28 |     
29 |     def extract_by_html2text(self, html) -> List[str]:
30 |         return self._pre_filter(html2text(html).split("\n"))


--------------------------------------------------------------------------------
/model/retriever/extracting/extracting_by_bs4.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup
 2 | import asyncio
 3 | import multiprocessing
 4 | import json
 5 | import os
 6 | import sys
 7 | from typing import List, Dict
 8 | 
 9 | def extracting(html: str) -> List[str]:
10 |     html = html.replace("\n", " ")
11 |     soup = BeautifulSoup(html, 'html.parser')
12 |     raw = soup.find('body')
13 |     if raw:
14 |         raw = raw.get_text("\n")
15 |     else:
16 |         raw = soup.get_text("\n")
17 |     paragraphs = []
18 |     for item in raw.split("\n"):
19 |         item = item.strip()
20 |         if not item:
21 |             continue
22 |         paragraphs.append(item)
23 |     return paragraphs


--------------------------------------------------------------------------------
/model/retriever/extracting/html2text.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """html2text: Turn HTML into equivalent text (Markdown or plain text)."""
  3 | __version__ = "3.200.3"
  4 | __author__ = "Aaron Swartz (me@aaronsw.com)"
  5 | __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
  6 | __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
  7 | 
  8 | # TODO:
  9 | #   Support decoded entities with unifiable.
 10 | 
 11 | try:
 12 |     True
 13 | except NameError:
 14 |     setattr(__builtins__, 'True', 1)
 15 |     setattr(__builtins__, 'False', 0)
 16 | 
 17 | def has_key(x, y):
 18 |     if hasattr(x, 'has_key'): return x.has_key(y)
 19 |     else: return y in x
 20 | 
 21 | try:
 22 |     import htmlentitydefs
 23 |     import urlparse
 24 |     import HTMLParser
 25 | except ImportError: #Python3
 26 |     import html.entities as htmlentitydefs
 27 |     import urllib.parse as urlparse
 28 |     import html.parser as HTMLParser
 29 | try: #Python3
 30 |     import urllib.request as urllib
 31 | except:
 32 |     import urllib
 33 | import optparse, re, sys, codecs, types
 34 | 
 35 | try: from textwrap import wrap
 36 | except: pass
 37 | 
 38 | # Use Unicode characters instead of their ascii pseudo-replacements
 39 | UNICODE_SNOB = 0
 40 | 
 41 | # Escape all special characters.  Output is less readable, but avoids corner case formatting issues.
 42 | ESCAPE_SNOB = 0
 43 | 
 44 | # Put the links after each paragraph instead of at the end.
 45 | LINKS_EACH_PARAGRAPH = 0
 46 | 
 47 | # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
 48 | BODY_WIDTH = 78
 49 | 
 50 | # Don't show internal links (href="#local-anchor") -- corresponding link targets
 51 | # won't be visible in the plain text file anyway.
 52 | SKIP_INTERNAL_LINKS = True
 53 | 
 54 | # Use inline, rather than reference, formatting for images and links
 55 | INLINE_LINKS = True
 56 | 
 57 | # Number of pixels Google indents nested lists
 58 | GOOGLE_LIST_INDENT = 36
 59 | 
 60 | # Don't add markdown elements and output nicely for plain reading
 61 | NO_MARKDOWN = True
 62 | 
 63 | IGNORE_ANCHORS = True
 64 | IGNORE_IMAGES = True
 65 | IGNORE_EMPHASIS = True
 66 | 
 67 | ### Entity Nonsense ###
 68 | 
 69 | def name2cp(k):
 70 |     if k == 'apos': return ord("'")
 71 |     if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
 72 |         return htmlentitydefs.name2codepoint[k]
 73 |     else:
 74 |         k = htmlentitydefs.entitydefs[k]
 75 |         if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
 76 |         return ord(codecs.latin_1_decode(k)[0])
 77 | 
# Entity name -> plain-text stand-in. entityref() returns these replacements
# when unicode_snob is off.
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
'lrm':'', 'rlm':''}

# All types of possible quotation marks - this is used to strip any existing
# quotes around a blockquote before we add our own, for plain text formatting.
all_quotes = u'\u0022\u0027\u00AB\u00BB\u2018\u2019\u201A\u201B\u201C\u201D\u201E\u201F\u2039\u203A'

# Same replacements as ``unifiable`` but keyed by code point, so numeric
# character references (see charref()) can be unified as well.
unifiable_n = {}

for k in unifiable.keys():
    unifiable_n[name2cp(k)] = unifiable[k]
 96 | 
 97 | ### End Entity Nonsense ###
 98 | 
 99 | def onlywhite(line):
100 |     """Return true if the line does only consist of whitespace characters."""
101 |     for c in line:
102 |         if c != ' ' and c !=  '  ':
103 |             return c == ' '
104 |     return line
105 | 
def hn(tag):
    """Return the heading level (1-9) of tags ``h1``..``h9``, otherwise 0.

    Always returns an int; the earlier version fell through and returned
    ``None`` for most non-heading tags and raised ``IndexError`` on an
    empty tag name.
    """
    if len(tag) != 2 or tag[0] != 'h':
        return 0
    try:
        level = int(tag[1])
    except ValueError:
        # e.g. 'hr': second character is not a digit
        return 0
    if 1 <= level <= 9:
        return level
    return 0
112 | 
def dumb_property_dict(style):
    """Parse a CSS declaration list (``"a: b; c: d"``) into a dict.

    Declarations without a colon are ignored; names and values are
    stripped of surrounding whitespace. A later duplicate wins.
    """
    properties = {}
    for declaration in style.split(';'):
        if ':' not in declaration:
            continue
        name, value = declaration.split(':', 1)
        properties[name.strip()] = value.strip()
    return properties
116 | 
def dumb_css_parser(data):
    """Parse a stylesheet into ``{selector: {property: value}}``.

    ``@import`` statements are removed first. If any rule contains more
    than one ``{`` the whole stylesheet is discarded and an empty dict is
    returned.
    """
    # Guarantee that every @import statement has a terminating ';'.
    data += ';'
    while True:
        import_at = data.find('@import')
        if import_at == -1:
            break
        statement_end = data.find(';', import_at)
        data = data[:import_at] + data[statement_end + 1:]

    # Kept as dict([...]) rather than a dict comprehension to support
    # older Pythons.
    rules = [chunk.split('{') for chunk in data.split('}') if '{' in chunk.strip()]
    try:
        return dict([(selector.strip(), dumb_property_dict(body))
                     for selector, body in rules])
    except ValueError:
        # a rule did not split into exactly (selector, body); not that important
        return {}
134 | 
def element_style(attrs, style_def, parent_style):
    """Return the effective style dict of an element.

    Starts from a copy of ``parent_style``, overlays the rules of every
    CSS class named in ``attrs['class']`` (looked up in ``style_def`` as
    ``'.classname'``), then overlays the inline ``style`` attribute.

    Classes with no rule in ``style_def`` are skipped instead of raising
    ``KeyError``, and valueless ``class``/``style`` attributes (parsed as
    ``None``) are treated as empty.
    """
    style = parent_style.copy()
    for css_class in (attrs.get('class') or '').split():
        style.update(style_def.get('.' + css_class, {}))
    inline_style = attrs.get('style')
    if inline_style:
        style.update(dumb_property_dict(inline_style))
    return style
146 | 
def google_list_style(style):
    """Classify a list as ``'ul'`` or ``'ol'`` from its ``list-style-type``.

    Bullet-like types (disc, circle, square, none) mean unordered; any
    other value, or no value at all, means ordered.
    """
    unordered_types = ('disc', 'circle', 'square', 'none')
    if style.get('list-style-type') in unordered_types:
        return 'ul'
    return 'ol'
154 | 
def google_has_height(style):
    """Tell whether the element's style explicitly defines ``height``."""
    return 'height' in style
160 | 
def google_text_emphasis(style):
    """Collect the element's emphasis modifiers as a list.

    The values of ``text-decoration``, ``font-style`` and ``font-weight``
    are returned in that order, skipping properties that are not set.
    """
    emphasis_properties = ('text-decoration', 'font-style', 'font-weight')
    return [style[name] for name in emphasis_properties if name in style]
171 | 
def google_fixed_width_font(style):
    """Tell whether the style's ``font-family`` is a known fixed-width font.

    Only the exact values ``'Courier New'`` and ``'Consolas'`` qualify.
    """
    font_family = style['font-family'] if 'font-family' in style else ''
    return font_family in ('Courier New', 'Consolas')
180 | 
def list_numbering_start(attrs):
    """Return the counter value preceding a list's first item.

    Reads the ``start`` attribute and returns ``start - 1`` (each ``<li>``
    increments the counter before printing it). A missing, valueless or
    non-numeric ``start`` falls back to 0 instead of aborting the parse.
    """
    if 'start' in attrs:
        try:
            return int(attrs['start']) - 1
        except (ValueError, TypeError):
            # e.g. start="abc" or a bare ``start`` attribute (None)
            pass
    return 0
187 | 
188 | class HTML2Text(HTMLParser.HTMLParser):
189 |     def __init__(self, out=None, baseurl=''):
190 |         HTMLParser.HTMLParser.__init__(self)
191 | 
192 |         # Config options
193 |         self.unicode_snob = UNICODE_SNOB
194 |         self.escape_snob = ESCAPE_SNOB
195 |         self.links_each_paragraph = LINKS_EACH_PARAGRAPH
196 |         self.body_width = BODY_WIDTH
197 |         self.skip_internal_links = SKIP_INTERNAL_LINKS
198 |         self.inline_links = INLINE_LINKS
199 |         self.google_list_indent = GOOGLE_LIST_INDENT
200 |         self.no_markdown = NO_MARKDOWN
201 |         self.ignore_links = IGNORE_ANCHORS
202 |         self.ignore_images = IGNORE_IMAGES
203 |         self.ignore_emphasis = IGNORE_EMPHASIS
204 |         self.google_doc = False
205 |         self.ul_item_mark = '*'
206 |         self.emphasis_mark = '_'
207 |         self.strong_mark = '**'
208 |         self.hr_mark = '* * *'
209 |         self.blockquote_marks = ('> ', '')
210 | 
211 |         if out is None:
212 |             self.out = self.outtextf
213 |         else:
214 |             self.out = out
215 | 
216 |         self.outtextlist = []  # empty list to store output characters before they are "joined"
217 | 
218 |         try:
219 |             self.outtext = unicode()
220 |         except NameError:  # Python3
221 |             self.outtext = str()
222 | 
223 |         self.quiet = 0
224 |         self.p_p = 0  # number of newline character to print before next output
225 |         self.outcount = 0
226 |         self.start = 1
227 |         self.space = 0
228 |         self.a = []
229 |         self.astack = []
230 |         self.maybe_automatic_link = None
231 |         self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
232 |         self.acount = 0
233 |         self.list = []
234 |         self.blockquote = 0
235 |         self.pre = 0
236 |         self.startpre = 0
237 |         self.code = False
238 |         self.br_toggle = ''
239 |         self.lastWasNL = 0
240 |         self.lastWasList = False
241 |         self.style = 0
242 |         self.style_def = {}
243 |         self.tag_stack = []
244 |         self.emphasis = 0
245 |         self.drop_white_space = 0
246 |         self.inheader = False
247 |         self.abbr_title = None  # current abbreviation definition
248 |         self.abbr_data = None  # last inner HTML (for abbr being defined)
249 |         self.abbr_list = {}  # stack of abbreviations to write later
250 |         self.baseurl = baseurl
251 |         self.last_tag_started = None # holds the most recent tag we entered
252 | 
253 |         try: del unifiable_n[name2cp('nbsp')]
254 |         except KeyError: pass
255 |         unifiable['nbsp'] = '&nbsp_place_holder;'
256 | 
257 |     def normalise_options(self):
258 |         """ Configure options just before handle """
259 |         if self.no_markdown:
260 |             # Configure for plain text output
261 |             self.body_width = 0
262 |             self.escape_snob = False
263 |             self.ignore_links = True
264 |             self.ignore_images = True
265 |             self.ignore_emphasis = True
266 |             if self.unicode_snob:
267 |                 self.ul_item_mark = u'\u2013'
268 |                 self.blockquote_marks = (u'\u201C', u'\u201D')
269 |                 self.hr_mark = u'\u2014\u2014\u2014'
270 |             else:
271 |                 self.ul_item_mark = '-'
272 |                 self.blockquote_marks = ('"', '"')
273 |                 self.hr_mark = '---'
274 | 
275 |     def feed(self, data):
276 |         data = data.replace("</' + 'script>", "</ignore>")
277 |         HTMLParser.HTMLParser.feed(self, data)
278 | 
279 |     def handle(self, data):
280 |         self.normalise_options()
281 |         self.feed(data)
282 |         self.feed(" ") 
283 |         return self.post_process(self.close())
284 | 
285 |     def outtextf(self, s):
286 |         self.outtextlist.append(s)
287 |         if s: self.lastWasNL = s[-1] == '\n'
288 | 
289 |     def close(self):
290 |         HTMLParser.HTMLParser.close(self)
291 | 
292 |         self.pbr()
293 |         self.o('', 0, 'end')
294 | 
295 |         self.outtext = self.outtext.join(self.outtextlist)
296 |         if self.unicode_snob:
297 |             nbsp = unichr(name2cp('nbsp'))
298 |         else:
299 |             nbsp = u' '
300 |         self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)
301 | 
302 |         return self.outtext
303 | 
304 |     def handle_charref(self, c):
305 |         self.o(self.charref(c), 1)
306 | 
307 |     def handle_entityref(self, c):
308 |         self.o(self.entityref(c), 1)
309 | 
310 |     def handle_starttag(self, tag, attrs):
311 |         self.handle_tag(tag, attrs, 1)
312 | 
313 |     def handle_endtag(self, tag):
314 |         self.handle_tag(tag, None, 0)
315 | 
316 |     def previousIndex(self, attrs):
317 |         """ returns the index of certain set of attributes (of a link) in the
318 |             self.a list
319 | 
320 |             If the set of attributes is not found, returns None
321 |         """
322 |         if not has_key(attrs, 'href'): return None
323 | 
324 |         i = -1
325 |         for a in self.a:
326 |             i += 1
327 |             match = 0
328 | 
329 |             if has_key(a, 'href') and a['href'] == attrs['href']:
330 |                 if has_key(a, 'title') or has_key(attrs, 'title'):
331 |                         if (has_key(a, 'title') and has_key(attrs, 'title') and
332 |                             a['title'] == attrs['title']):
333 |                             match = True
334 |                 else:
335 |                     match = True
336 | 
337 |             if match: return i
338 | 
339 |     def drop_last(self, nLetters):
340 |         if not self.quiet:
341 |             self.outtext = self.outtext[:-nLetters]
342 | 
    def handle_emphasis(self, start, tag_style, parent_style):
        """Emit or close emphasis marks derived from CSS styles.

        Called from handle_tag() for Google-doc input.

        start        -- truthy when the tag is being opened, falsy when closed
        tag_style    -- style dict of the element itself
        parent_style -- style dict of its parent; a mark is only emitted when
                        the element adds an emphasis its parent lacks

        Reads ``self.hide_strikethrough``, which must be set on the instance
        before this method runs.
        """
        tag_emphasis = google_text_emphasis(tag_style)
        parent_emphasis = google_text_emphasis(parent_style)

        # handle Google's text emphasis
        strikethrough =  'line-through' in tag_emphasis and self.hide_strikethrough
        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
        fixed = google_fixed_width_font(tag_style) and not \
                google_fixed_width_font(parent_style) and not self.pre

        if start:
            # crossed-out text must be handled before other attributes
            # in order not to output qualifiers unnecessarily
            if bold or italic or fixed:
                self.emphasis += 1
            if strikethrough:
                # suppress all output until the matching close
                self.quiet += 1
            if italic:
                self.o(self.emphasis_mark)
                self.drop_white_space += 1
            if bold:
                self.o(self.strong_mark)
                self.drop_white_space += 1
            if fixed:
                self.o('`')
                self.drop_white_space += 1
                self.code = True
        else:
            if bold or italic or fixed:
                # there must not be whitespace before closing emphasis mark
                self.emphasis -= 1
                self.space = 0
                self.outtext = self.outtext.rstrip()
            # marks are closed in the reverse order they were opened
            if fixed:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o('`')
                self.code = False
            if bold:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(2)
                    self.drop_white_space -= 1
                else:
                    self.o(self.strong_mark)
            if italic:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o(self.emphasis_mark)
            # space is only allowed after *all* emphasis marks
            if (bold or italic) and not self.emphasis:
                    self.o(" ")
            if strikethrough:
                self.quiet -= 1
405 | 
    def handle_tag(self, tag, attrs, start):
        """Shared handler for opening and closing tags.

        tag   -- tag name as reported by the parser
        attrs -- sequence of (name, value) pairs for an opening tag, or None
                 for a closing tag; converted to a dict below
        start -- truthy for an opening tag, falsy for a closing tag

        Updates the converter state and emits the output associated with
        the tag via o(), p(), pbr() and soft_br().
        """
        #attrs = fixattrs(attrs)
        if attrs is None:
            attrs = {}
        else:
            attrs = dict(attrs)
        if start:
            self.last_tag_started = tag

        if self.google_doc:
            # the attrs parameter is empty for a closing tag. in addition, we
            # need the attributes of the parent nodes in order to get a
            # complete style description for the current element. we assume
            # that google docs export well formed html.
            parent_style = {}
            if start:
                if self.tag_stack:
                  parent_style = self.tag_stack[-1][2]
                tag_style = element_style(attrs, self.style_def, parent_style)
                self.tag_stack.append((tag, attrs, tag_style))
            else:
                # NOTE(review): pops unconditionally, so an unmatched closing
                # tag on an empty stack would raise IndexError here.
                dummy, attrs, tag_style = self.tag_stack.pop()
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]

        # headings: always start a new paragraph
        if hn(tag):
            self.p()
            if not self.no_markdown:
                if start:
                    self.inheader = True
                    # self.o(hn(tag)*"#" + ' ')
                else:
                    self.inheader = False
                    return # prevent redundant emphasis marks on headers

        if tag in ['p', 'div']:
            if self.google_doc:
                if start and google_has_height(tag_style):
                    self.p()
                else:
                    self.soft_br()
            else:
                self.p()

        if tag == "br" and start:
            self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o(self.hr_mark)
            self.p()

        # content of these elements is not output
        if tag in ["head", "style", 'script']:
            if start: self.quiet += 1
            else: self.quiet -= 1

        # while inside <style>, handle_data() feeds text to the CSS parser
        if tag == "style":
            if start: self.style += 1
            else: self.style -= 1

        if tag == "body":
            self.quiet = 0 # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p(); 
                self.o(self.blockquote_marks[0], 0, 1)
                self.start = 1
                self.blockquote += 1
            else:
                if self.no_markdown:
                    # remove whitespace and extra quotes before adding our own quotes
                    self.rstrip_outtext(all_quotes)
                self.o(self.blockquote_marks[1], 0, 1)
                self.blockquote -= 1
                self.p()

        # the same mark is written for both the opening and the closing tag
        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis:
            self.o(self.emphasis_mark)
        if tag in ['strong', 'b'] and not self.ignore_emphasis: 
            self.o(self.strong_mark)
        if tag in ['del', 'strike', 's'] and not self.no_markdown:
            if start:
                self.o("<"+tag+">")
            else:
                self.o("</"+tag+">")

        if self.google_doc:
            if not self.inheader and not self.no_markdown:
                # handle some font attributes, but leave headers clean
                self.handle_emphasis(start, tag_style, parent_style)

        if tag in ["code", "tt"] and not self.pre: 
            self.o('`') #TODO: `` `this` ``
        if tag == "abbr":
            if start:
                self.abbr_title = None
                # o() appends output to abbr_data while it is not None
                self.abbr_data = ''
                if has_key(attrs, 'title'):
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title != None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == "a" and not self.ignore_links:
            if start:
                if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    # handle_data() decides whether this becomes <url> or [text](url)
                    self.maybe_automatic_link = attrs['href']
                else:
                    # placeholder keeps astack balanced for links that are skipped
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if self.maybe_automatic_link:
                        self.maybe_automatic_link = None
                    elif a:
                        if self.inline_links:
                            self.o("](" + escape_md(a['href']) + ")")
                        else:
                            # reference style: reuse the number of an identical link
                            i = self.previousIndex(a)
                            if i is not None:
                                a = self.a[i]
                            else:
                                self.acount += 1
                                a['count'] = self.acount
                                a['outcount'] = self.outcount
                                self.a.append(a)
                            self.o("][" + str(a['count']) + "]")

        if tag == "img" and start and not self.ignore_images:
            if has_key(attrs, 'src'):
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                self.o("![" + escape_md(alt) + "]")

                if self.inline_links:
                    self.o("(" + escape_md(attrs['href']) + ")")
                else:
                    i = self.previousIndex(attrs)
                    if i is not None:
                        attrs = self.a[i]
                    else:
                        self.acount += 1
                        attrs['count'] = self.acount
                        attrs['outcount'] = self.outcount
                        self.a.append(attrs)
                    self.o("[" + str(attrs['count']) + "]")

        if tag == 'dl' and start: self.p()
        if tag == 'dt' and not start: self.pbr()
        if tag == 'dd' and start: self.o('    ')
        if tag == 'dd' and not start: self.pbr()

        if tag in ["ol", "ul"]:
            # Google Docs create sub lists as top level lists
            if (not self.list) and (not self.lastWasList):
                self.p()
            if start:
                if self.google_doc:
                    list_style = google_list_style(tag_style)
                else:
                    list_style = tag
                numbering_start = list_numbering_start(attrs)
                self.list.append({'name':list_style, 'num':numbering_start})
            else:
                if self.list: self.list.pop()
            self.lastWasList = True
        else:
            self.lastWasList = False

        if tag == 'li':
            self.pbr()
            if start:
                # an <li> outside any list is rendered as an unordered item
                if self.list: li = self.list[-1]
                else: li = {'name':'ul', 'num':0}
                if self.google_doc:
                    nest_count = self.google_nest_count(tag_style)
                else:
                    nest_count = len(self.list)
                self.o("  " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul": self.o(self.ul_item_mark + " ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num'])+". ")
                self.start = 1

        if tag in ["table", "tr"] and start: self.p()
        if tag == 'td': self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()
605 | 
606 |     def pbr(self):
607 |         if self.p_p == 0:
608 |             self.p_p = 1
609 | 
610 |     def p(self):
611 |         self.p_p = 2
612 | 
613 |     def soft_br(self):
614 |         self.pbr()
615 |         self.br_toggle = '  '
616 | 
617 |     def o(self, data, puredata=0, force=0):
618 |         if self.abbr_data is not None:
619 |             self.abbr_data += data
620 | 
621 |         if not self.quiet:
622 |             if self.google_doc:
623 |                 # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
624 |                 lstripped_data = data.lstrip()
625 |                 if self.drop_white_space and not (self.pre or self.code):
626 |                     data = lstripped_data
627 |                 if lstripped_data != '':
628 |                     self.drop_white_space = 0
629 | 
630 |             if puredata and not self.pre:
631 |                 data = re.sub('\s+', ' ', data)
632 |                 if data and data[0] == ' ':
633 |                     self.space = 1
634 |                     data = data[1:]
635 |             if not data and not force: return
636 | 
637 |             if self.startpre:
638 |                 #self.out(" :") #TODO: not output when already one there
639 |                 if not data.startswith("\n"):  # <pre>stuff...
640 |                     data = "\n" + data
641 | 
642 |             if puredata and self.last_tag_started == 'blockquote' and self.no_markdown:
643 |                 data = data.lstrip(' \t\n\r'+all_quotes)
644 | 
645 |             bq = ''
646 |             if not self.no_markdown:
647 |                 bq = (">" * self.blockquote)
648 |                 if not (force and data and data[0] == ">") and self.blockquote:
649 |                     bq += " "
650 | 
651 |             if self.pre:
652 |                 if not self.list:
653 |                     bq += "    "
654 |                 #else: list content is already partially indented
655 |                 for i in range(len(self.list)):
656 |                     bq += "    "
657 |                 data = data.replace("\n", "\n"+bq)
658 | 
659 |             if self.startpre:
660 |                 self.startpre = 0
661 |                 if self.list:
662 |                     data = data.lstrip("\n") # use existing initial indentation
663 | 
664 |             if self.start:
665 |                 self.space = 0
666 |                 self.p_p = 0
667 |                 self.start = 0
668 | 
669 |             if force == 'end':
670 |                 # It's the end.
671 |                 self.p_p = 0
672 |                 self.out("\n")
673 |                 self.space = 0
674 | 
675 |             if self.p_p:
676 |                 self.out((self.br_toggle+'\n'+bq)*self.p_p)
677 |                 self.space = 0
678 |                 self.br_toggle = ''
679 | 
680 |             if self.space:
681 |                 if not self.lastWasNL: self.out(' ')
682 |                 self.space = 0
683 | 
684 |             if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
685 |                 if force == "end": self.out("\n")
686 | 
687 |                 newa = []
688 |                 for link in self.a:
689 |                     if self.outcount > link['outcount']:
690 |                         self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
691 |                         if has_key(link, 'title'): self.out(" ("+link['title']+")")
692 |                         self.out("\n")
693 |                     else:
694 |                         newa.append(link)
695 | 
696 |                 if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
697 | 
698 |                 self.a = newa
699 | 
700 |             if self.abbr_list and force == "end":
701 |                 for abbr, definition in self.abbr_list.items():
702 |                     self.out("  *[" + abbr + "]: " + definition + "\n")
703 | 
704 |             self.p_p = 0
705 |             self.out(data)
706 |             self.outcount += 1
707 | 
708 |     def handle_data(self, data):
709 |         if r'\/script>' in data: self.quiet -= 1
710 | 
711 |         if self.style:
712 |             self.style_def.update(dumb_css_parser(data))
713 | 
714 |         if not self.maybe_automatic_link is None:
715 |             href = self.maybe_automatic_link
716 |             if href == data and self.absolute_url_matcher.match(href):
717 |                 self.o("<" + data + ">")
718 |                 return
719 |             else:
720 |                 self.o("[")
721 |                 self.maybe_automatic_link = None
722 | 
723 |         if not self.code and not self.pre and not self.no_markdown:
724 |             data = escape_md_section(data, snob=self.escape_snob)
725 |         self.o(data, 1)
726 | 
727 |     def unknown_decl(self, data): pass
728 | 
729 |     def charref(self, name):
730 |         if name[0] in ['x','X']:
731 |             c = int(name[1:], 16)
732 |         else:
733 |             c = int(name)
734 | 
735 |         if not self.unicode_snob and c in unifiable_n.keys():
736 |             return unifiable_n[c]
737 |         else:
738 |             try:
739 |                 return unichr(c)
740 |             except NameError: #Python3
741 |                 return chr(c)
742 | 
743 |     def entityref(self, c):
744 |         if not self.unicode_snob and c in unifiable.keys():
745 |             return unifiable[c]
746 |         else:
747 |             try: 
748 |                 name2cp(c)
749 |             except KeyError: 
750 |                 if self.no_markdown:
751 |                     # let original ampersand and character through
752 |                     return "&" + c
753 |                 else:
754 |                     return "&" + c + ';'
755 |             else:
756 |                 try:
757 |                     return unichr(name2cp(c))
758 |                 except NameError: #Python3
759 |                     return chr(name2cp(c))
760 | 
761 |     def replaceEntities(self, s):
762 |         s = s.group(1)
763 |         if s[0] == "#":
764 |             return self.charref(s[1:])
765 |         else: return self.entityref(s)
766 | 
    # Matches a character/entity reference; group 1 is the text between
    # '&' and ';' (consumed by replaceEntities()).
    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
768 |     def unescape(self, s):
769 |         return self.r_unescape.sub(self.replaceEntities, s)
770 | 
771 |     def google_nest_count(self, style):
772 |         """calculate the nesting count of google doc lists"""
773 |         nest_count = 0
774 |         if 'margin-left' in style:
775 |             nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
776 |         return nest_count
777 | 
778 |     def post_process(self, text):
779 |         if self.no_markdown:
780 |             # Tidy up for plain text response
781 |             text = remove_multi_blank_lines(text)
782 |         else:
783 |             # Wrapping does not work with plain text yet, as the criteria in skipwrap
784 |             # depends on markdown formatting and syntax
785 |             text = self.optwrap(text)
786 |         return text
787 | 
788 |     def optwrap(self, text):
789 |         """Wrap all paragraphs in the provided text."""
790 |         if not self.body_width:
791 |             return text
792 | 
793 |         assert wrap, "Requires Python 2.3."
794 |         result = ''
795 |         newlines = 0
796 |         for para in text.split("\n"):
797 |             if len(para) > 0:
798 |                 if not skipwrap(para):
799 |                     result += "\n".join(wrap(para, self.body_width))
800 |                     if para.endswith('  '):
801 |                         result += "  \n"
802 |                         newlines = 1
803 |                     else:
804 |                         result += "\n\n"
805 |                         newlines = 2
806 |                 else:
807 |                     if not onlywhite(para):
808 |                         result += para + "\n"
809 |                         newlines = 1
810 |             else:
811 |                 if newlines < 2:
812 |                     result += "\n"
813 |                     newlines += 1
814 |         return result
815 | 
816 |     def rstrip_outtext(self, additional_chars):
817 |         """ Remove whitespace at the end of the outtext """
818 |         if self.outtextlist:
819 |             self.outtextlist[-1] = self.outtextlist[-1].rstrip(' \r\t\n'+additional_chars)
820 | 
# Three or more consecutive (possibly space/tab-only) line ends; collapsed to
# a single blank line by remove_multi_blank_lines().
multi_blank_line_matcher = re.compile(r'([ \t]*\n){3,}')
# List item prefixes, used by skipwrap() to leave list lines unwrapped.
ordered_list_matcher = re.compile(r'\d+\.\s')
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Characters escaped by escape_md() inside link/image constructs.
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# Characters escaped by escape_md_section() when ``snob`` is requested.
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
# A number followed by a dot at line start (would read as an ordered list).
md_dot_matcher = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """, re.MULTILINE | re.VERBOSE)
# A plus sign at line start followed by whitespace (would read as a bullet).
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)
# A dash at line start that would read as a bullet, header underline or rule.
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)
# Characters that a backslash can escape; a backslash in front of any of
# them is itself escaped by escape_md_section().
slash_chars = r'\`*_{}[]()#+-.!'
md_backslash_matcher = re.compile(r'''
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)
851 | 
def skipwrap(para):
    """Tell whether the line ``para`` must be left unwrapped.

    True for code blocks (four leading spaces or a leading tab), lines
    starting with ``-`` or ``*``, and list items; False otherwise,
    including for an empty string (which previously raised IndexError).
    """
    # If the text begins with four spaces or one tab, it's a code block; don't wrap
    if para.startswith(('    ', '\t')):
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # Purpose not fully clear: it looks like list detection, but a
    # <br>-inside-<span> case in the tests also depends upon it.
    if stripped[0:1] in ('-', '*'):
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
871 | 
def wrapwrite(text):
    """Encode `text` as UTF-8 and write the bytes to standard output.

    The binary `sys.stdout.buffer` is tried first; if an AttributeError is
    raised, the encoded bytes are written to `sys.stdout` directly.
    """
    payload = text.encode('utf-8')
    try:
        # Python 3 exposes the underlying binary stream as `.buffer`.
        binary_stdout = sys.stdout.buffer
        binary_stdout.write(payload)
    except AttributeError:
        sys.stdout.write(payload)
878 | 
def html2text(html, baseurl=''):
    """Convert `html` with a fresh HTML2Text instance created for `baseurl`.

    Returns whatever `HTML2Text.handle` returns for the given markup.
    """
    return HTML2Text(baseurl=baseurl).handle(html)
882 | 
def unescape(s, unicode_snob=False):
    """Run `HTML2Text.unescape` on `s` using a throwaway converter.

    `unicode_snob` is copied onto the converter before the call.
    """
    converter = HTML2Text()
    converter.unicode_snob = unicode_snob
    unescaped = converter.unescape(s)
    return unescaped
887 | 
def escape_md(text):
    """Backslash-escape every character matched by `md_chars_matcher` in `text`."""
    escaped = md_chars_matcher.sub(r"\\\1", text)
    return escaped
891 | 
def escape_md_section(text, snob=False):
    """Escape markdown-sensitive characters across a whole document section.

    Backslashes matched by `md_backslash_matcher` are escaped first. When
    `snob` is true, every character matched by `md_chars_matcher_all` is
    escaped as well. Finally the line-start dot, plus and dash patterns are
    escaped, in that order.
    """
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        text = md_chars_matcher_all.sub(r"\\\1", text)
    # Each of these matchers captures (prefix, marker); keep the prefix and
    # put a backslash in front of the marker.
    for line_start_matcher in (md_dot_matcher, md_plus_matcher, md_dash_matcher):
        text = line_start_matcher.sub(r"\1\\\2", text)
    return text
901 | 
def remove_multi_blank_lines(text):
    """Replace every match of `multi_blank_line_matcher` in `text` with one blank line."""
    collapsed = multi_blank_line_matcher.sub('\n\n', text)
    return collapsed
905 | 
def main():
    """Command-line entry point: convert HTML to text and write it to stdout.

    Usage: ``prog [(filename|url) [encoding]]``. With no positional argument
    the HTML is read from standard input. The optional second argument
    overrides the default "utf-8" input encoding. Command-line options are
    copied onto an `HTML2Text` instance before conversion.
    """
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("--no-markdown", dest="no_markdown", action="store_true",
        default=NO_MARKDOWN, help="don't use markdown syntax and display nicely as plain text")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters.  Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    # NOTE(review): `encoding` starts as "utf-8", so the `encoding is None`
    # auto-detection branches below are never taken as written.
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            try:
                data = j.read()
                headers = j.headers
            finally:
                # Release the connection instead of leaving it to the GC.
                j.close()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            # Context manager so the file handle is always closed.
            with open(file_, 'rb') as f:
                data = f.read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        # Prefer the binary stdin stream (Python 3) so the bytes can be decoded
        # with the chosen encoding; fall back to sys.stdin itself otherwise.
        data = getattr(sys.stdin, 'buffer', sys.stdin).read()

    # Text-mode stdin on Python 3 already yields str, which has no .decode().
    if isinstance(data, bytes):
        data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash: h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.no_markdown = options.no_markdown
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
986 | 
987 | 
988 | if __name__ == "__main__":
989 |     main()
990 | 


--------------------------------------------------------------------------------
/model/retriever/fetching/__init__.py:
--------------------------------------------------------------------------------
 1 | from .playwright_based_crawl_new import get_raw_pages
 2 | from .import playwright_based_crawl_new
 3 | 
 4 | import asyncio
 5 |     
 6 | from typing import List, Dict
 7 | 
 8 | class Fetcher:
 9 |     def __init__(self) -> None:
10 |         self.loop = asyncio.get_event_loop()
11 |         # TODO delete loop -> loop.close()
12 | 
13 |     
14 |     def _pre_handle_urls(self, urls: List[str]) -> List[str]:
15 |         urls_new = []
16 |         for url in urls:
17 |             if url in urls_new or "http://%s"%url in urls_new or "https://%s"%url in urls_new:
18 |                 continue
19 |             if not url.startswith("http"):
20 |                 url = "http://%s" % url
21 |             urls_new.append(url)
22 |         return urls_new
23 | 
24 |     def fetch(self, urls: List[str]) -> Dict[str, List[str]]:
25 |         
26 |         urls = self._pre_handle_urls(urls)
27 |         
28 |         self.loop.run_until_complete(get_raw_pages(urls, close_browser=True))
29 |         responses = [playwright_based_crawl_new.results[url] for url in urls] 
30 | 
31 |         ret = dict()
32 |         for url, resp in zip(urls, responses):
33 |             if not resp[1]:
34 |                 pass
35 |             else:
36 |                 ret[url] = resp[1]
37 | 
38 |         return ret
39 | 


--------------------------------------------------------------------------------
/model/retriever/fetching/playwright_based_crawl_new.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from playwright.async_api import async_playwright, Page
 3 |  
 4 | 
# Per-URL crawl output, filled by one_page_handle: [None, None] until the
# request succeeds, then (HTTP status, response text). Rebound to a fresh dict
# at the start of every get_raw_pages_ call.
results ={}
# NOTE(review): not referenced anywhere in the visible part of this module.
status ={}
# Browser context created on first use by get_conetent and reused afterwards.
context = None
 8 |         
 9 | 
async def one_page_handle(context, url):
    """Request `url` through `context` and record the outcome in the global `results`.

    `results[url]` is first set to `[None, None]`; on success it is replaced
    by `(status, text)`. Any exception is swallowed, leaving the placeholder
    (or nothing, if the placeholder itself could not be stored).
    """
    global results
    try:
        # Placeholder so the URL has an entry even when the request fails.
        results[url] = [None, None]
        response = await context.request.get(url, timeout=5000)
        status_code = response.status
        # Wait for the response body to be fully available.
        body = await response.text()
        results[url] = (status_code, body)
    except Exception:
        pass
22 |  
async def get_conetent():
    """Return the shared browser context, creating it on first use.

    On the first call a playwright driver is started, Firefox is launched and
    a new context is opened; the context is stored in the module-level
    `context` and returned on every later call.
    """
    global context
    if context:
        return context
    # Start the playwright driver and launch the browser.
    driver = await async_playwright().start()
    firefox = await driver.firefox.launch()
    # Open a fresh browser context and cache it.
    context = await firefox.new_context()
    return context
32 |     
33 | 
async def close_browser(browser):
     # Close the browser driver by awaiting the given object's close().
    await browser.close()
37 | 
async def get_raw_pages_(context, urls):
    """Fetch all `urls` concurrently, waiting at most 10 seconds overall.

    The module-level `results` dict is reset, then one `one_page_handle` task
    is scheduled per URL. Tasks still running after the timeout are cancelled
    so they cannot write into the `results` dict of a later call.

    Args:
        context: Browser context passed through to `one_page_handle`.
        urls: Iterable of URLs to fetch; may be empty.
    """
    global results
    results = {}
    # Wrap each page fetch in its own task so they run concurrently.
    tasks = [asyncio.create_task(one_page_handle(context, url)) for url in urls]
    if not tasks:
        # asyncio.wait() raises ValueError when given no tasks.
        return

    _, pending = await asyncio.wait(tasks, timeout=10)
    if pending:
        for task in pending:
            task.cancel()
        # Let the cancellations finish; return_exceptions keeps the resulting
        # CancelledError from propagating to the caller.
        await asyncio.gather(*pending, return_exceptions=True)
47 | 
48 |    
async def get_raw_pages(urls, close_browser=False):
    """Crawl `urls` using the shared browser context.

    The fetched data is stored in the module-level `results` by
    `get_raw_pages_`; nothing is returned. The `close_browser` argument is
    accepted but not used by this function.
    """
    shared_context = await get_conetent()
    await get_raw_pages_(shared_context, urls)
52 |     
53 | 


--------------------------------------------------------------------------------
/model/retriever/filtering/__init__.py:
--------------------------------------------------------------------------------
1 | from .contriver import ReferenceFilter


--------------------------------------------------------------------------------
/model/retriever/filtering/contriver.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from transformers import AutoTokenizer, AutoModel
 3 | import os
 4 | 
 5 | from typing import Optional, Union, List, Dict, Tuple, Iterable, Callable, Any
 6 | 
 7 | class ContrieverScorer:
 8 |     def __init__(self, retriever_ckpt_path, device=None, max_batch_size=400) -> None:
 9 |         query_encoder_path = os.path.join(retriever_ckpt_path, 'query_encoder')
10 |         reference_encoder_path = os.path.join(retriever_ckpt_path, 'reference_encoder')
11 |             
12 |         self.tokenizer = AutoTokenizer.from_pretrained("facebook/contriever-msmarco")
13 |         self.query_encoder = AutoModel.from_pretrained(query_encoder_path)
14 |         self.reference_encoder = AutoModel.from_pretrained(reference_encoder_path)
15 |         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if not device else device
16 |         self.query_encoder = self.query_encoder.to(self.device).eval()
17 |         self.reference_encoder = self.reference_encoder.to(self.device).eval()
18 |         assert max_batch_size > 0
19 |         self.max_batch_size = max_batch_size
20 | 
21 |     def get_query_embeddings(self, sentences: List[str]) -> torch.Tensor:
22 |         # Tokenization and Inference
23 |         torch.cuda.empty_cache()
24 |         with torch.no_grad():
25 |             inputs = self.tokenizer(sentences, padding=True,
26 |                                     truncation=True, return_tensors='pt')
27 |             for key in inputs:
28 |                 inputs[key] = inputs[key].to(self.device)
29 |             outputs = self.query_encoder(**inputs)
30 |             # Mean Pool
31 |             token_embeddings = outputs[0]
32 |             mask = inputs["attention_mask"]
33 |             token_embeddings = token_embeddings.masked_fill(
34 |                 ~mask[..., None].bool(), 0.)
35 |             sentence_embeddings = token_embeddings.sum(
36 |                 dim=1) / mask.sum(dim=1)[..., None]
37 |             return sentence_embeddings
38 |     
39 |     def get_embeddings(self, sentences: List[str]) -> torch.Tensor:
40 |         # Tokenization and Inference
41 |         torch.cuda.empty_cache()
42 |         with torch.no_grad():
43 |             inputs = self.tokenizer(sentences, padding=True,
44 |                                     truncation=True, return_tensors='pt')
45 |             for key in inputs:
46 |                 inputs[key] = inputs[key].to(self.device)
47 |             outputs = self.reference_encoder(**inputs)
48 |             # Mean Pool
49 |             token_embeddings = outputs[0]
50 |             mask = inputs["attention_mask"]
51 |             token_embeddings = token_embeddings.masked_fill(
52 |                 ~mask[..., None].bool(), 0.)
53 |             sentence_embeddings = token_embeddings.sum(
54 |                 dim=1) / mask.sum(dim=1)[..., None]
55 |             return sentence_embeddings
56 | 
57 |     def score_documents_on_query(self, query: str, documents: List[str]) -> torch.Tensor:
58 |         query_embedding = self.get_query_embeddings([query])[0]
59 |         document_embeddings = self.get_embeddings(documents)
60 |         return query_embedding@document_embeddings.t()
61 | 
62 |     def select_topk(self, query: str, documents: List[str], k=1):
63 |         """
64 |         Returns:
65 |             `ret`: `torch.return_types.topk`, use `ret.values` or `ret.indices` to get value or index tensor
66 |         """
67 |         scores = []
68 |         for i in range((len(documents) + self.max_batch_size - 1) // self.max_batch_size):
69 |             scores.append(self.score_documents_on_query(query, documents[self.max_batch_size*i:self.max_batch_size*(i+1)]).to('cpu'))
70 |         scores = torch.concat(scores)
71 |         return scores.topk(min(k, len(scores)))
72 | 
73 | 
class ReferenceFilter:
    """Selects the paragraphs that score highest against a query."""

    def __init__(self, retriever_ckpt_path, device=None, max_batch_size=400) -> None:
        """Build the underlying `ContrieverScorer` with the given settings."""
        self.scorer = ContrieverScorer(retriever_ckpt_path, device, max_batch_size)

    def produce_references(self, query, paragraphs: List[Dict[str, str]], topk=5) -> List[Dict[str, str]]:
        """Individually calculate scores of each paragraph, and return the best `topk`.

        Args:
            query: The query string.
            paragraphs: A list of dicts; each must have a 'text' key
                (the original docstring describes them as {title, url, text}).
            topk: Maximum number of paragraphs to return.

        Returns:
            The selected paragraph dicts, in the order given by the scorer's
            top-k indices. An empty `paragraphs` list returns `[]`.
        """
        if not paragraphs:
            # Nothing to score; avoids running the encoders on an empty batch.
            return []
        texts = [item['text'] for item in paragraphs]
        best = self.scorer.select_topk(query, texts, topk)
        return [paragraphs[idx] for idx in best.indices.tolist()]
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/model/retriever/searching/__init__.py:
--------------------------------------------------------------------------------
 1 | from .serpapi import Searcher as SerpAPISearcher
 2 | from .bing_search import Searcher as BingSearcher
 3 | from .searcher import SearchResult, SearcherInterface
 4 | 
 5 | def create_searcher(name: str) -> SearcherInterface:
 6 |     if name == "serpapi":
 7 |         return SerpAPISearcher()
 8 |     elif name == "bing":
 9 |         return BingSearcher()
10 |     else:
11 |         raise NotImplementedError()


--------------------------------------------------------------------------------
/model/retriever/searching/bing_search.py:
--------------------------------------------------------------------------------
 1 | from playwright.sync_api import sync_playwright
 2 | from .searcher import *
 3 | from typing import List, Dict, Tuple, Optional
 4 | 
 5 | import json
 6 | 
 7 | def get_bing_search_raw_page(question: str):
 8 |     results = []
 9 |     with sync_playwright() as p:
10 |         browser = p.chromium.launch()
11 |         context = browser.new_context()
12 |         page = context.new_page()
13 |         try:
14 |             page.goto(f"https://www.bing.com/search?q={question}")
15 |         except:
16 |             page.goto(f"https://www.bing.com")
17 |             page.fill('input[name="q"]', question)
18 |             page.press('input[name="q"]', 'Enter')
19 |         try:
20 |             page.wait_for_load_state('networkidle', timeout=3000)
21 |         except:
22 |             pass
23 |         # page.wait_for_load_state('networkidle')
24 |         search_results = page.query_selector_all('.b_algo h2')
25 |         for result in search_results:
26 |             title = result.inner_text()
27 |             a_tag = result.query_selector('a')
28 |             if not a_tag: continue
29 |             url = a_tag.get_attribute('href')
30 |             if not url: continue
31 |             # print(title, url)
32 |             results.append({
33 |                 'title': title,
34 |                 'url': url
35 |             })
36 |         browser.close()
37 |     return results
38 | 
def query_bing(question, max_tries=3):
    """Query Bing up to `max_tries` times and return the first non-empty result list.

    When every attempt comes back empty (or no attempt is made), prints
    'No Bing Result' and returns None.
    """
    attempts_left = max_tries
    while attempts_left > 0:
        attempts_left -= 1
        found = get_bing_search_raw_page(question)
        if found:
            return found
    print('No Bing Result')
    return None
48 | 
49 | 
# Ad-hoc manual check: when the module is executed as a script, run one sample
# query and dump the result to crawl.json in the current directory.
if __name__ == '__main__':
    
    with open('crawl.json', 'w', encoding='utf-8') as f:
        json.dump(query_bing('how to cook a steak'), f, ensure_ascii=False, indent=4)
        
    # NOTE(review): exiting here means nothing below this guard runs when the
    # module is executed as a script.
    exit(0)
56 | 
57 | 
class Searcher(SearcherInterface):
    """Search backend that scrapes Bing result pages via `query_bing`."""

    def __init__(self) -> None:
        """No configuration is needed for this backend."""
        pass

    def _parse(self, result) -> List[SearchResult]:
        """Turn scraped ``{'title', 'url'}`` dicts into `SearchResult`s (snippet is None).

        Returns None when `result` is empty or None.
        """
        if not result:
            return None
        return [SearchResult(entry['title'], entry['url'], None) for entry in result]

    def search(self, query) -> List[SearchResult]:
        """Run `query` on Bing and return the parsed results, or None when nothing was found."""
        raw_hits = query_bing(query)
        return self._parse(raw_hits)
72 | 
73 | 
74 | 
# Second manual check: print a sample query's results as JSON.
# NOTE(review): an earlier `if __name__ == '__main__'` guard in this module
# calls exit(0), so this block is not reached when the module runs as a script.
if __name__ == '__main__':
    
    print(json.dumps(query_bing('how to cook a cake?'), ensure_ascii=False, indent=4))


--------------------------------------------------------------------------------
/model/retriever/searching/searcher.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from typing import List, Dict
 3 | 
 4 | class SearchResult:
 5 |     def __init__(self, title, url, snip) -> None:
 6 |         self.title = title
 7 |         self.url = url
 8 |         self.snip = snip
 9 | 
10 |     def dump(self):
11 |         return {
12 |             "title": self.title,
13 |             "url": self.url,
14 |             "snip": self.snip
15 |         }
16 | 
17 |     def __str__(self) -> str:
18 |         return json.dumps(self.dump())
19 |     
class SearcherInterface:
    """Base interface for search backends; subclasses override `search`."""
    def search(self, query) -> List[SearchResult]:
        """Return the search results for `query`.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()


--------------------------------------------------------------------------------
/model/retriever/searching/serpapi.py:
--------------------------------------------------------------------------------
 1 | import json, os
 2 | import requests
 3 | from .searcher import *
 4 | from typing import List, Dict
 5 |     
 6 | 
 7 | 
 8 | def serp_api(query: str, api_key: str):
 9 |     params = {
10 |         "engine": "google",
11 |         "q": query,
12 |         "api_key": api_key
13 |     }
14 |     resp = requests.get("https://serpapi.com/search", params=params)
15 |     if resp.status_code != 200:
16 |         raise Exception("Serpapi returned %d\n%s"%(resp.status_code, resp.text))
17 |     result = resp.json()
18 |     ret = []
19 |     for item in result['organic_results']:
20 |         if "title" not in item or "link" not in item or "snippet" not in item:
21 |             continue
22 |         ret.append(SearchResult(item['title'], item['link'], item['snippet']))
23 |     return ret
24 | 
25 | 
26 | 
def dump_results(results: List[SearchResult]):
    """Serialize `results` to a JSON array string using each item's `dump()`."""
    serializable = []
    for hit in results:
        serializable.append(hit.dump())
    return json.dumps(serializable)
29 | 
30 | 
class Searcher(SearcherInterface):
    """Search backend backed by SerpApi; the key comes from the SERPAPI_KEY env var."""

    def __init__(self) -> None:
        """Read SERPAPI_KEY from the environment; print an error and exit(1) if unset."""
        self.SERPAPI_KEY = os.getenv("SERPAPI_KEY")
        if self.SERPAPI_KEY:
            return
        print("[Error] SERPAPI_KEY is not set, please set it to use serpapi")
        exit(1)

    def _parse(self, result) -> List[SearchResult]:
        """Convert dicts with 'ref', 'url' and 'snip' keys into `SearchResult`s.

        Returns None when `result` is empty or None.
        """
        if not result:
            return None
        return [SearchResult(entry['ref'], entry['url'], entry['snip']) for entry in result]

    def search(self, query) -> List[SearchResult]:
        """Return the `serp_api` results for `query` using the stored key."""
        api_key = self.SERPAPI_KEY
        return serp_api(query, api_key)
48 | 


--------------------------------------------------------------------------------
/model/stopwords/english:
--------------------------------------------------------------------------------
  1 | 'd
  2 | 'll
  3 | 'm
  4 | 're
  5 | 's
  6 | 't
  7 | 've
  8 | ZT
  9 | ZZ
 10 | a
 11 | a's
 12 | able
 13 | about
 14 | above
 15 | abst
 16 | accordance
 17 | according
 18 | accordingly
 19 | across
 20 | act
 21 | actually
 22 | added
 23 | adj
 24 | adopted
 25 | affected
 26 | affecting
 27 | affects
 28 | after
 29 | afterwards
 30 | again
 31 | against
 32 | ah
 33 | ain't
 34 | all
 35 | allow
 36 | allows
 37 | almost
 38 | alone
 39 | along
 40 | already
 41 | also
 42 | although
 43 | always
 44 | am
 45 | among
 46 | amongst
 47 | an
 48 | and
 49 | announce
 50 | another
 51 | any
 52 | anybody
 53 | anyhow
 54 | anymore
 55 | anyone
 56 | anything
 57 | anyway
 58 | anyways
 59 | anywhere
 60 | apart
 61 | apparently
 62 | appear
 63 | appreciate
 64 | appropriate
 65 | approximately
 66 | are
 67 | area
 68 | areas
 69 | aren
 70 | aren't
 71 | arent
 72 | arise
 73 | around
 74 | as
 75 | aside
 76 | ask
 77 | asked
 78 | asking
 79 | asks
 80 | associated
 81 | at
 82 | auth
 83 | available
 84 | away
 85 | awfully
 86 | b
 87 | back
 88 | backed
 89 | backing
 90 | backs
 91 | be
 92 | became
 93 | because
 94 | become
 95 | becomes
 96 | becoming
 97 | been
 98 | before
 99 | beforehand
100 | began
101 | begin
102 | beginning
103 | beginnings
104 | begins
105 | behind
106 | being
107 | beings
108 | believe
109 | below
110 | beside
111 | besides
112 | best
113 | better
114 | between
115 | beyond
116 | big
117 | biol
118 | both
119 | brief
120 | briefly
121 | but
122 | by
123 | c
124 | c'mon
125 | c's
126 | ca
127 | came
128 | can
129 | can't
130 | cannot
131 | cant
132 | case
133 | cases
134 | cause
135 | causes
136 | certain
137 | certainly
138 | changes
139 | clear
140 | clearly
141 | co
142 | com
143 | come
144 | comes
145 | concerning
146 | consequently
147 | consider
148 | considering
149 | contain
150 | containing
151 | contains
152 | corresponding
153 | could
154 | couldn't
155 | couldnt
156 | course
157 | currently
158 | d
159 | date
160 | definitely
161 | describe
162 | described
163 | despite
164 | did
165 | didn't
166 | differ
167 | different
168 | differently
169 | discuss
170 | do
171 | does
172 | doesn't
173 | doing
174 | don't
175 | done
176 | down
177 | downed
178 | downing
179 | downs
180 | downwards
181 | due
182 | during
183 | e
184 | each
185 | early
186 | ed
187 | edu
188 | effect
189 | eg
190 | eight
191 | eighty
192 | either
193 | else
194 | elsewhere
195 | end
196 | ended
197 | ending
198 | ends
199 | enough
200 | entirely
201 | especially
202 | et
203 | et-al
204 | etc
205 | even
206 | evenly
207 | ever
208 | every
209 | everybody
210 | everyone
211 | everything
212 | everywhere
213 | ex
214 | exactly
215 | example
216 | except
217 | f
218 | face
219 | faces
220 | fact
221 | facts
222 | far
223 | felt
224 | few
225 | ff
226 | fifth
227 | find
228 | finds
229 | first
230 | five
231 | fix
232 | followed
233 | following
234 | follows
235 | for
236 | former
237 | formerly
238 | forth
239 | found
240 | four
241 | from
242 | full
243 | fully
244 | further
245 | furthered
246 | furthering
247 | furthermore
248 | furthers
249 | g
250 | gave
251 | general
252 | generally
253 | get
254 | gets
255 | getting
256 | give
257 | given
258 | gives
259 | giving
260 | go
261 | goes
262 | going
263 | gone
264 | good
265 | goods
266 | got
267 | gotten
268 | great
269 | greater
270 | greatest
271 | greetings
272 | group
273 | grouped
274 | grouping
275 | groups
276 | h
277 | had
278 | hadn't
279 | happens
280 | hardly
281 | has
282 | hasn't
283 | have
284 | haven't
285 | having
286 | he
287 | he's
288 | hed
289 | hello
290 | help
291 | hence
292 | her
293 | here
294 | here's
295 | hereafter
296 | hereby
297 | herein
298 | heres
299 | hereupon
300 | hers
301 | herself
302 | hes
303 | hi
304 | hid
305 | high
306 | higher
307 | highest
308 | him
309 | himself
310 | his
311 | hither
312 | home
313 | hopefully
314 | how
315 | howbeit
316 | however
317 | hundred
318 | i
319 | i'd
320 | i'll
321 | i'm
322 | i've
323 | id
324 | ie
325 | if
326 | ignored
327 | im
328 | immediate
329 | immediately
330 | importance
331 | important
332 | in
333 | inasmuch
334 | inc
335 | include
336 | indeed
337 | index
338 | indicate
339 | indicated
340 | indicates
341 | information
342 | inner
343 | insofar
344 | instead
345 | interest
346 | interested
347 | interesting
348 | interests
349 | into
350 | invention
351 | inward
352 | is
353 | isn't
354 | it
355 | it'd
356 | it'll
357 | it's
358 | itd
359 | its
360 | itself
361 | j
362 | just
363 | k
364 | keep
365 | keeps
366 | kept
367 | keys
368 | kg
369 | kind
370 | km
371 | knew
372 | know
373 | known
374 | knows
375 | l
376 | large
377 | largely
378 | last
379 | lately
380 | later
381 | latest
382 | latter
383 | latterly
384 | least
385 | less
386 | lest
387 | let
388 | let's
389 | lets
390 | like
391 | liked
392 | likely
393 | line
394 | little
395 | long
396 | longer
397 | longest
398 | look
399 | looking
400 | looks
401 | ltd
402 | m
403 | made
404 | mainly
405 | make
406 | makes
407 | making
408 | man
409 | many
410 | may
411 | maybe
412 | me
413 | mean
414 | means
415 | meantime
416 | meanwhile
417 | member
418 | members
419 | men
420 | merely
421 | mg
422 | might
423 | million
424 | miss
425 | ml
426 | more
427 | moreover
428 | most
429 | mostly
430 | mr
431 | mrs
432 | much
433 | mug
434 | must
435 | my
436 | myself
437 | n
438 | n't
439 | na
440 | name
441 | namely
442 | nay
443 | nd
444 | near
445 | nearly
446 | necessarily
447 | necessary
448 | need
449 | needed
450 | needing
451 | needs
452 | neither
453 | never
454 | nevertheless
455 | new
456 | newer
457 | newest
458 | next
459 | nine
460 | ninety
461 | no
462 | nobody
463 | non
464 | none
465 | nonetheless
466 | noone
467 | nor
468 | normally
469 | nos
470 | not
471 | noted
472 | nothing
473 | novel
474 | now
475 | nowhere
476 | number
477 | numbers
478 | o
479 | obtain
480 | obtained
481 | obviously
482 | of
483 | off
484 | often
485 | oh
486 | ok
487 | okay
488 | old
489 | older
490 | oldest
491 | omitted
492 | on
493 | once
494 | one
495 | ones
496 | only
497 | onto
498 | open
499 | opened
500 | opening
501 | opens
502 | or
503 | ord
504 | order
505 | ordered
506 | ordering
507 | orders
508 | other
509 | others
510 | otherwise
511 | ought
512 | our
513 | ours
514 | ourselves
515 | out
516 | outside
517 | over
518 | overall
519 | owing
520 | own
521 | p
522 | page
523 | pages
524 | part
525 | parted
526 | particular
527 | particularly
528 | parting
529 | parts
530 | past
531 | per
532 | perhaps
533 | place
534 | placed
535 | places
536 | please
537 | plus
538 | point
539 | pointed
540 | pointing
541 | points
542 | poorly
543 | possible
544 | possibly
545 | potentially
546 | pp
547 | predominantly
548 | present
549 | presented
550 | presenting
551 | presents
552 | presumably
553 | previously
554 | primarily
555 | probably
556 | problem
557 | problems
558 | promptly
559 | proud
560 | provides
561 | put
562 | puts
563 | q
564 | que
565 | quickly
566 | quite
567 | qv
568 | r
569 | ran
570 | rather
571 | rd
572 | re
573 | readily
574 | really
575 | reasonably
576 | recent
577 | recently
578 | ref
579 | refs
580 | regarding
581 | regardless
582 | regards
583 | related
584 | relatively
585 | research
586 | respectively
587 | resulted
588 | resulting
589 | results
590 | right
591 | room
592 | rooms
593 | run
594 | s
595 | said
596 | same
597 | saw
598 | say
599 | saying
600 | says
601 | sec
602 | second
603 | secondly
604 | seconds
605 | section
606 | see
607 | seeing
608 | seem
609 | seemed
610 | seeming
611 | seems
612 | seen
613 | sees
614 | self
615 | selves
616 | sensible
617 | sent
618 | serious
619 | seriously
620 | seven
621 | several
622 | shall
623 | she
624 | she'll
625 | shed
626 | shes
627 | should
628 | shouldn't
629 | show
630 | showed
631 | showing
632 | shown
633 | showns
634 | shows
635 | side
636 | sides
637 | significant
638 | significantly
639 | similar
640 | similarly
641 | since
642 | six
643 | slightly
644 | small
645 | smaller
646 | smallest
647 | so
648 | some
649 | somebody
650 | somehow
651 | someone
652 | somethan
653 | something
654 | sometime
655 | sometimes
656 | somewhat
657 | somewhere
658 | soon
659 | sorry
660 | specifically
661 | specified
662 | specify
663 | specifying
664 | state
665 | states
666 | still
667 | stop
668 | strongly
669 | sub
670 | substantially
671 | successfully
672 | such
673 | sufficiently
674 | suggest
675 | sup
676 | sure
677 | t
678 | t's
679 | take
680 | taken
681 | taking
682 | tell
683 | tends
684 | th
685 | than
686 | thank
687 | thanks
688 | thanx
689 | that
690 | that'll
691 | that's
692 | that've
693 | thats
694 | the
695 | their
696 | theirs
697 | them
698 | themselves
699 | then
700 | thence
701 | there
702 | there'll
703 | there's
704 | there've
705 | thereafter
706 | thereby
707 | thered
708 | therefore
709 | therein
710 | thereof
711 | therere
712 | theres
713 | thereto
714 | thereupon
715 | these
716 | they
717 | they'd
718 | they'll
719 | they're
720 | they've
721 | theyd
722 | theyre
723 | thing
724 | things
725 | think
726 | thinks
727 | third
728 | this
729 | thorough
730 | thoroughly
731 | those
732 | thou
733 | though
734 | thoughh
735 | thought
736 | thoughts
737 | thousand
738 | three
739 | throug
740 | through
741 | throughout
742 | thru
743 | thus
744 | til
745 | tip
746 | to
747 | today
748 | together
749 | too
750 | took
751 | toward
752 | towards
753 | tried
754 | tries
755 | truly
756 | try
757 | trying
758 | ts
759 | turn
760 | turned
761 | turning
762 | turns
763 | twice
764 | two
765 | u
766 | un
767 | under
768 | unfortunately
769 | unless
770 | unlike
771 | unlikely
772 | until
773 | unto
774 | up
775 | upon
776 | ups
777 | us
778 | use
779 | used
780 | useful
781 | usefully
782 | usefulness
783 | uses
784 | using
785 | usually
786 | uucp
787 | v
788 | value
789 | various
790 | very
791 | via
792 | viz
793 | vol
794 | vols
795 | vs
796 | w
797 | want
798 | wanted
799 | wanting
800 | wants
801 | was
802 | wasn't
803 | way
804 | ways
805 | we
806 | we'd
807 | we'll
808 | we're
809 | we've
810 | wed
811 | welcome
812 | well
813 | wells
814 | went
815 | were
816 | weren't
817 | what
818 | what'll
819 | what's
820 | whatever
821 | whats
822 | when
823 | whence
824 | whenever
825 | where
826 | where's
827 | whereafter
828 | whereas
829 | whereby
830 | wherein
831 | wheres
832 | whereupon
833 | wherever
834 | whether
835 | which
836 | while
837 | whim
838 | whither
839 | who
840 | who'll
841 | who's
842 | whod
843 | whoever
844 | whole
845 | whom
846 | whomever
847 | whos
848 | whose
849 | why
850 | widely
851 | will
852 | willing
853 | wish
854 | with
855 | within
856 | without
857 | won't
858 | wonder
859 | words
860 | work
861 | worked
862 | working
863 | works
864 | world
865 | would
866 | wouldn't
867 | www
868 | x
869 | y
870 | year
871 | years
872 | yes
873 | yet
874 | you
875 | you'd
876 | you'll
877 | you're
878 | you've
879 | youd
880 | young
881 | younger
882 | youngest
883 | your
884 | youre
885 | yours
886 | yourself
887 | yourselves
888 | z
889 | zero
890 | zt
891 | zz
892 | !
893 | """"
894 | #
895 | $
896 | %
897 | &
898 | '
899 | (
900 | )
901 | *
902 | +
903 | ","
904 | -
905 | .
906 | /
907 | :
908 | ;
909 | <
910 | =
911 | >
912 | ?
913 | @
914 | [
915 | \
916 | ]
917 | ^
918 | _
919 | `
920 | {
921 | |
922 | }
923 | ~


--------------------------------------------------------------------------------
/model/stopwords/explaination:
--------------------------------------------------------------------------------
1 | reason
2 | reasons


--------------------------------------------------------------------------------
/model/utils.py:
--------------------------------------------------------------------------------
  1 | import re, os
  2 | from rouge_score import rouge_scorer, tokenize
  3 | 
  4 | class DataUtils:
  5 |     @staticmethod
  6 |     def split_segments(statement: str):
  7 |         all_statements = []
  8 |         statement = re.sub(' +', ' ', statement.replace('\n', ' '))
  9 |         split_pattern = r'(?<!\w\.\w.)(?<![A-Z]\.)(?<![A-Z][a-z]\.)(?<! [a-z]\.)(?<![A-Z][a-z][a-z]\.)(?<=\.|\?|\!)\"*\s*\s*(?:\W*)([A-Z])'
 10 |         tmp_statements = []
 11 |         
 12 |         for s in re.split(r"(\[\d+\])", statement):
 13 |             if not s:
 14 |                 continue
 15 |             cites = re.findall(r"\[(\d+)\]", s)
 16 |             if not cites: # Segment
 17 |                 tmp_statements.append([s, []])
 18 |             elif not tmp_statements: # Citation Mark, but no Segments
 19 |                 continue
 20 |             else: # Citation Mark
 21 |                 for item in cites:
 22 |                     tmp_statements[-1][1].append(int(item) - 1)
 23 |         
 24 |         for s, cite in tmp_statements:
 25 |             prefix = ""
 26 |             for ix, seg in enumerate(re.split(split_pattern, s)):
 27 |                 if len(prefix) > 20:
 28 |                     all_statements.append([prefix, []])
 29 |                     prefix = ""
 30 |                 prefix += seg
 31 |                 if prefix and prefix[-1] in ['.!?:']:
 32 |                     prefix += " "
 33 |             if prefix:
 34 |                 if all_statements and len(prefix) < 20:
 35 |                     all_statements[-1][0] += prefix
 36 |                 else:
 37 |                     all_statements.append([prefix, []])
 38 |             if all_statements:
 39 |                 all_statements[-1][1] += cite
 40 |         
 41 |         return [seg[0] for seg in all_statements], [seg[1] for seg in all_statements]
 42 |     
 43 |     @staticmethod
 44 |     def matching_score(all_statements, references):
 45 |         def remove_stopwords(stmt: str):
 46 |             stmt = tokenize.tokenize(stmt, None)
 47 |             ret = []
 48 |             for item in stmt:
 49 |                 if item in stopwords:
 50 |                     continue
 51 |                 ret.append(item)
 52 |             return " ".join(ret)
 53 |         
 54 |         all_statements = [remove_stopwords(item) for item in all_statements]
 55 |         references = [remove_stopwords(item) for item in references]
 56 |         
 57 |         # return None
 58 |         scorer = rouge_scorer.RougeScorer(['rouge1'])
 59 |         all_scores = []
 60 |         for statement in all_statements:
 61 |             if len(tokenize.tokenize(statement, None)) < 5:
 62 |                 all_scores.append([0] * len(references))
 63 |                 continue
 64 |             ref_score = []
 65 |             for idx, ref in enumerate(references):
 66 |                 rouge = scorer.score(ref, statement)['rouge1'].precision
 67 |                 # print(rouge)
 68 |                 ref_score.append(rouge)
 69 |             all_scores.append(ref_score)
 70 |         return all_scores
 71 |     
 72 |     @staticmethod
 73 |     def get_ideal_citations(all_scores, raw_citations, citation_threshold, extra_bonus=0.3):
 74 |         
 75 |         assert len(all_scores) == len(raw_citations)
 76 |         
 77 |         ideal_citations = []
 78 |         for seg_idx, scores in enumerate(all_scores):
 79 |             idc = []
 80 |             best_idx = 0
 81 |             best_scr = 0
 82 |             for idx, score in enumerate(scores):
 83 |                 if idx in raw_citations[seg_idx]:
 84 |                     score += extra_bonus / len(raw_citations[seg_idx])
 85 |                 if score >= citation_threshold:
 86 |                     idc.append(idx)
 87 |                 if score > best_scr:
 88 |                     best_idx = idx
 89 |                     best_scr = score
 90 |             if len(idc) == 0 and len(raw_citations[seg_idx]) > 0:
 91 |                 idc.append(best_idx)
 92 |             ideal_citations.append(idc)
 93 |         return ideal_citations
 94 |     
 95 |     @staticmethod
 96 |     def recompose(all_statements, raw_citations, references, sep=" ", citation_threshold=0.75) -> str:
 97 |         scores = DataUtils.matching_score(all_statements, references)
 98 |         ret = ""
 99 |         ideal_citations = DataUtils.get_ideal_citations(scores, raw_citations, citation_threshold)
100 |         for seg, cit in zip(all_statements, ideal_citations):
101 |             # judge if seg[0] is alphanumeric
102 |             if ret and ret[-1] == "]" and seg and seg[0].isalnum():
103 |                 ret += sep
104 |             ret += seg
105 |             for c in cit:
106 |                 ret += "[%d]"%(c+1)
107 |             if ret and ret[-1] in ".!?:":
108 |                 ret += sep
109 |         return ret.strip()
110 | 
class Stopwords:
    """Loader for the stopword lists stored in the ``stopwords`` directory next to this module."""

    @staticmethod
    def load():
        """Read every stopword list and return the words as one list.

        The files are located relative to this module rather than the current
        working directory, so loading works no matter where the process was
        started from.

        Returns:
            A list with one stripped entry per line of each file, in file order.
        """
        base_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "stopwords")
        ret = []
        for name in ("english", "explaination"):
            with open(os.path.join(base_dir, name), "r", encoding="utf-8") as f:
                ret += [word.strip() for word in f]
        return ret
123 | 
124 | 
# Built once at import time; DataUtils.matching_score filters tokens against it.
stopwords = {*Stopwords.load()}
126 | 
def citation_correction(original_answer, references):
    """Return ``original_answer`` with its citation marks recomputed against ``references``.

    The answer is split into segments with their raw citations, and the result
    is handed to ``DataUtils.recompose`` together with the reference texts.
    """
    return DataUtils.recompose(*DataUtils.split_segments(original_answer), references)


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | beautifulsoup4==4.11.2
 2 | chardet==5.1.0
 3 | datasets==2.12.0
 4 | feedparser==6.0.10
 5 | gradio==3.33.1
 6 | HTMLParser==0.0.2
 7 | playwright==1.26.0
 8 | Requests==2.31.0
 9 | rouge_score==0.1.2
10 | torch==1.12.1
11 | tqdm==4.65.0
12 | transformers==4.29.1
13 | 


--------------------------------------------------------------------------------
/scripts/nq_open.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | script_path=$(realpath $0)
 3 | script_dir=$(dirname $script_path)
 4 | main_dir=$(dirname $script_dir)
 5 | 
 6 | source "${main_dir}/configs/model_webglm.sh"
 7 | 
 8 | DATA_PATH="data/nq_open.jsonl"
 9 | 
10 | run_cmd="python ${main_dir}/evaluate.py \
11 |        --webglm_ckpt_path $GENERATOR_CHECKPOINT_PATH \
12 |        --task nq_open \
13 |        --evaluate_task_data_path $DATA_PATH"
14 | 
15 | eval ${run_cmd}


--------------------------------------------------------------------------------
/scripts/triviaqa.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | script_path=$(realpath $0)
 3 | script_dir=$(dirname $script_path)
 4 | main_dir=$(dirname $script_dir)
 5 | 
 6 | source "${main_dir}/configs/model_webglm.sh"
 7 | 
 8 | DATA_PATH="data/triviaqa.jsonl"
 9 | 
10 | run_cmd="python ${main_dir}/evaluate.py \
11 |        --webglm_ckpt_path $GENERATOR_CHECKPOINT_PATH \
12 |        --task triviaqa \
13 |        --evaluate_task_data_path $DATA_PATH"
14 | 
15 | eval ${run_cmd}


--------------------------------------------------------------------------------
/scripts/web_questions.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | script_path=$(realpath $0)
 3 | script_dir=$(dirname $script_path)
 4 | main_dir=$(dirname $script_dir)
 5 | 
 6 | source "${main_dir}/configs/model_webglm.sh"
 7 | 
 8 | DATA_PATH="data/web_questions.jsonl"
 9 | 
10 | run_cmd="python ${main_dir}/evaluate.py \
11 |        --webglm_ckpt_path $GENERATOR_CHECKPOINT_PATH \
12 |        --task web_questions \
13 |        --evaluate_task_data_path $DATA_PATH"
14 | 
15 | eval ${run_cmd}


--------------------------------------------------------------------------------
/train_retriever.py:
--------------------------------------------------------------------------------
  1 | from transformers import RobertaTokenizer, RobertaModel, AutoModelWithLMHead, AutoTokenizer, Trainer, AutoModel, BertLMHeadModel
  2 | from datasets.load import load_dataset, load_from_disk
  3 | import torch, os, sys, time, random, json, argparse
  4 | from rouge_score.rouge_scorer import RougeScorer
  5 | 
  6 | from torch.utils.data import Dataset, DataLoader
  7 | from torch.optim import AdamW
  8 | from torch.utils.data.distributed import DistributedSampler
  9 | 
 10 | class QuestionReferenceDensity(torch.nn.Module):
 11 |     def __init__(self) -> None:
 12 |         super().__init__()
 13 |         self.question_encoder = AutoModel.from_pretrained("facebook/contriever-msmarco")
 14 |         self.reference_encoder = AutoModel.from_pretrained("facebook/contriever-msmarco")
 15 | 
 16 |         total = sum([param.nelement() for param in self.parameters()])
 17 |         print("Number of parameter: %.2fM" % (total / 1e6))
 18 |     
 19 |     def mean_pooling(self, token_embeddings, mask):
 20 |         token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
 21 |         sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
 22 |         return sentence_embeddings
 23 |         
 24 |     
 25 |     def forward(self, question, pos, neg):
 26 |         global args
 27 |         
 28 |         q = self.question_encoder(**question)
 29 |         r_pos = self.reference_encoder(**pos)
 30 |         r_neg = self.reference_encoder(**neg)
 31 |         cls_q = self.mean_pooling(q[0], question["attention_mask"])
 32 |         cls_q /= args.temp
 33 |         cls_r_pos = self.mean_pooling(r_pos[0], pos["attention_mask"])
 34 |         cls_r_neg = self.mean_pooling(r_neg[0], neg["attention_mask"])
 35 |         
 36 |         l_pos = torch.matmul(cls_q, torch.transpose(cls_r_pos, 0, 1))
 37 | 
 38 |         l_neg = torch.matmul(cls_q, torch.transpose(cls_r_neg, 0, 1))
 39 | 
 40 |         return l_pos, l_neg
 41 |         
 42 |     @staticmethod
 43 |     def loss(l_pos, l_neg):
 44 |         return torch.nn.functional.cross_entropy(torch.cat([l_pos, l_neg], dim=1), torch.arange(0, len(l_pos), dtype=torch.long, device=args.device))
 45 |     
 46 |     @staticmethod
 47 |     def num_correct(l_pos, l_neg):
 48 |         return ((torch.diag(l_pos) > torch.diag(l_neg))==True).sum()
 49 | 
 50 |     @staticmethod
 51 |     def acc(l_pos, l_neg):
 52 |         return ((torch.diag(l_pos) > torch.diag(l_neg))==True).sum() / len(l_pos)
 53 | 
 54 | 
 55 | class WarmupLinearScheduler(torch.optim.lr_scheduler.LambdaLR):
 56 |     def __init__(self, optimizer, warmup, total, ratio, last_epoch=-1):
 57 |         self.warmup = warmup
 58 |         self.total = total
 59 |         self.ratio = ratio
 60 |         super(WarmupLinearScheduler, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
 61 | 
 62 |     def lr_lambda(self, step):
 63 |         if step < self.warmup:
 64 |             return (1 - self.ratio) * step / float(max(1, self.warmup))
 65 | 
 66 |         return max(
 67 |             0.0,
 68 |             1.0 + (self.ratio - 1) * (step - self.warmup) / float(max(1.0, self.total - self.warmup)),
 69 |         )
 70 | 
 71 | 
 72 | def move_dict_to_device(obj, device):
 73 |     for key in obj:
 74 |         obj[key] = obj[key].to(device)
 75 | 
 76 | def collate(data):
 77 |     question = tokenizer([item["question"] for item in data], return_tensors="pt", padding=True, truncation=True)
 78 |     positive_reference = tokenizer([item["positive_reference"] for item in data], return_tensors="pt", padding=True, truncation=True)
 79 |     negative_reference = tokenizer([item["negative_reference"] for item in data], return_tensors="pt", padding=True, truncation=True)
 80 | 
 81 |     for key in question: question[key] = question[key].to(args.device)
 82 |     for key in positive_reference: positive_reference[key] = positive_reference[key].to(args.device)
 83 |     for key in negative_reference: negative_reference[key] = negative_reference[key].to(args.device)
 84 | 
 85 |     return question, positive_reference, negative_reference
 86 | 
 87 | def eval():
 88 |     # print("EVAL ...")
 89 |     model.eval()
 90 |     with torch.no_grad():
 91 |         total_acc = 0
 92 |         for q, pos, neg in eval_loader:
 93 |             results = model(q, pos, neg)
 94 |             # print(results)
 95 |             # exit()
 96 |             tot_cr = model.num_correct(*results)
 97 |             total_acc += tot_cr
 98 | 
 99 |         print("EVALUATION, Acc: %10.6f"%(total_acc / len(eval_set)))
100 |     
def save(name):
    """Save both encoders of the module-level ``model`` under ``log_dir/name``."""
    os.makedirs(log_dir, exist_ok=True)
    target_dir = os.path.join(log_dir, name)
    encoders = (
        (model.question_encoder, "query_encoder"),
        (model.reference_encoder, "reference_encoder"),
    )
    for encoder, subdir in encoders:
        encoder.save_pretrained(os.path.join(target_dir, subdir))
105 | 
def train(max_epoch = 10, eval_step = 200, save_step = 400, print_step = 50):
    """Train the module-level ``model`` on ``train_loader``.

    Args:
        max_epoch: Number of passes over ``train_loader``.
        eval_step: Call ``eval()`` every this many optimisation steps.
        save_step: Save a checkpoint every this many optimisation steps.
        print_step: Print loss and accuracy every this many optimisation steps.

    A checkpoint is also saved at the end of every epoch. The step counter
    keeps running across epochs.
    """
    step = 0
    for epoch in range(max_epoch):
        print("EPOCH %d"%epoch)
        for batch in train_loader:
            # eval() switches to evaluation mode, so re-enable training mode each step.
            model.train()
            step += 1
            opt.zero_grad()
            outputs = model(*batch)
            loss = model.loss(*outputs)

            if not step % print_step:
                print("Step %4d, Loss, Acc: %10.6f, %10.6f"%(step, loss, model.acc(*outputs)))

            loss.backward()
            opt.step()

            scheduler.step()
            model.zero_grad()
            if not step % eval_step:
                eval()
            if not step % save_step:
                save("step-%d"%(step))

        save("step-%d-epoch-%d"%(step, epoch))
134 | 
if __name__ == "__main__":
    # Command-line options; every flag except --train_data_dir has a default.
    parser = argparse.ArgumentParser()
    for flag, kind, default in (
        ("--max_epoch", int, 3),
        ("--eval_step", int, 40),
        ("--save_step", int, 40),
        ("--print_step", int, 40),
        ("--device", str, "cuda"),
        ("--temp", float, 0.05),
        ("--train_batch_size", int, 64),
        ("--eval_batch_size", int, 32),
        ("--lr", float, 1e-6),
        ("--warmup", int, 100),
        ("--total", int, 1000),
        ("--ratio", float, 0.0),
        ("--save_dir", str, "./retriever_runs"),
    ):
        parser.add_argument(flag, type=kind, default=default)
    parser.add_argument("--train_data_dir", type=str, required=True)

    # The functions above read these module-level names.
    args = parser.parse_args()

    # One timestamped output directory per run.
    log_dir = os.path.join(args.save_dir, time.strftime("%Y%m%d-%H%M%S"))

    train_set = load_from_disk(os.path.join(args.train_data_dir, "train"))
    eval_set = load_from_disk(os.path.join(args.train_data_dir, "eval"))

    tokenizer = AutoTokenizer.from_pretrained("facebook/contriever-msmarco")
    train_loader = DataLoader(train_set, batch_size=args.train_batch_size, collate_fn=collate)
    eval_loader = DataLoader(eval_set, batch_size=args.eval_batch_size, collate_fn=collate)

    model = QuestionReferenceDensity().to(args.device)
    opt = AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01)
    scheduler_args = {key: getattr(args, key) for key in ("warmup", "total", "ratio")}
    scheduler = WarmupLinearScheduler(opt, **scheduler_args)
    temp = args.temp

    train(
        max_epoch=args.max_epoch,
        eval_step=args.eval_step,
        save_step=args.save_step,
        print_step=args.print_step,
    )
175 | 
176 | 


--------------------------------------------------------------------------------
/web_demo.py:
--------------------------------------------------------------------------------
 1 | import gradio as gr
 2 | from model import citation_correction, load_model
 3 | import argparse
 4 | 
 5 | from arguments import add_model_config_args
 6 | 
 7 | TOTAL_NUM = 10
 8 | CSS = """
 9 |     #col {
10 |         width: min(100%, 800px);
11 |         top: 0;
12 |         right: 0;
13 |         bottom: 0;
14 |         left: 0;
15 |         margin: auto;
16 |     }
17 |     
18 |     footer{display:none !important}
19 | """
20 | 
21 | 
22 |     
23 | # a summary structure ( use <summary> tag in html )
24 | # title is in summary, click to expand
25 | # in the container, there is an icon that can be clicked to jump to url.
26 | # the other part is the text.
27 | ref_html = """
28 | 
29 | <details style="border: 1px solid #ccc; padding: 10px; border-radius: 4px; margin-bottom: 4px">
30 |     <summary style="display: flex; align-items: center; font-weight: bold;">
31 |         <span style="margin-right: 10px;">[{index}] {title}</span>
32 |         <a href="{url}" style="text-decoration: none; background: none !important;" target="_blank">
33 |             <!--[Here should be a link icon]-->
34 |             <i style="border: solid #000; border-width: 0 2px 2px 0; display: inline-block; padding: 3px; transform:rotate(-45deg); -webkit-transform(-45deg)"></i>   
35 |         </a>
36 |     </summary>
37 |     <p style="margin-top: 10px;">{text}</p>
38 | </details>
39 | 
40 | """
41 | 
def query(query: str):
    """Stream the answer and the rendered reference list for a question.

    Yields ``(answer_text, references_html)`` pairs: first a "Loading ..."
    placeholder, then one pair for every response produced by
    ``webglm.stream_query``. Answers are passed through ``citation_correction``
    using the text of the references seen so far.
    """
    # Local import keeps this block self-contained.
    import html

    refs = []
    answer = "Loading ..."

    yield answer, ""

    for resp in webglm.stream_query(query):
        if "references" in resp:
            refs = resp["references"]
        if "answer" in resp:
            answer = resp["answer"]
            answer = citation_correction(answer, [ref['text'] for ref in refs])

        # Reference fields come from crawled web pages, so escape them before
        # interpolating into HTML.
        # NOTE(review): this does not validate the URL scheme of `url`.
        rendered = [
            ref_html.format(
                **{key: html.escape(str(value)) for key, value in item.items()},
                index=idx + 1,
            )
            for idx, item in enumerate(refs)
        ]
        yield answer, "<h3>References (Click to Expand)</h3>" + "\n".join(rendered)
56 |     
if __name__ == '__main__':
    # Parse the model options and load the model that query() streams from.
    parser = argparse.ArgumentParser()
    add_model_config_args(parser)
    args = parser.parse_args()

    webglm = load_model(args)

    # Same text as the original triple-quoted heading literal.
    heading = "\n" + " " * 12 + "# WebGLM Demo\n" + " " * 12

    demo = gr.Blocks(theme=gr.themes.Base(), css=CSS)
    with demo:
        with gr.Column(elem_id='col'):
            gr.Markdown(heading)
            with gr.Row():
                query_box = gr.Textbox(
                    show_label=False,
                    placeholder="Enter question and press ENTER",
                ).style(container=False)

            answer_box = gr.Textbox(show_label=False, value='', lines=5)
            ref_boxes = gr.HTML(label="References")

        # Pressing ENTER in the query box streams into the answer and reference widgets.
        query_box.submit(query, query_box, [answer_box, ref_boxes])

    demo.queue()
    demo.launch()


--------------------------------------------------------------------------------