├── .gitignore ├── LICENSE ├── README.md ├── annotations ├── BrowseAnnotation.ipynb └── README.md ├── app.py ├── assets ├── figs │ ├── BotChatArena.png │ ├── LengthStats_bymodel.png │ ├── Passrate_GT.png │ ├── SelectedPairs.png │ ├── Teaser.png │ ├── UniEval_passrate.png │ └── WinTieRate_GT.png └── theme │ └── theme_schema@0.0.3.json ├── botchat ├── analyze.py ├── chat_api │ ├── __init__.py │ ├── gpt.py │ └── hf_chat.py └── utils.py ├── data ├── BrowseData.ipynb ├── MuTual_test.xlsx ├── README.md └── UniEval_examples.py ├── docs ├── Teaser.png ├── botchat_logo.jpg ├── index.html └── lb.csv ├── examples ├── AISelf-Identification.md ├── ContextualCofusion.md ├── ExcessiveLength.md ├── FormalTone.md ├── GoodCases.md └── RepetitivePhrasing.md ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | *ipynb_checkpoints 163 | data/.ipynb_checkpoints 164 | *pkl 165 | tmp.png 166 | n8*xlsx 167 | n16*xlsx 168 | data/*tsv 169 | annotations/*tsv -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BotChat Benchmark 2 | 3 | 🤗 HF Demo 4 | 5 | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/kennymckormick/BotChat-Dialogue-Generation) 6 | 7 | > Evaluating LLMs' multi-round chatting capability via assessing conversations generated by two LLM instances. 8 | 9 | ![Teaser](/assets/figs/Teaser.png) 10 | 11 |
12 | Table of Contents 13 | 14 | - [BotChat Benchmark](#botchat-benchmark) 15 | - [TL;DR](#tldr) 16 | - [Leaderboard](#leaderboard) 17 | - [Introduction](#introduction) 18 | - [Installation](#installation) 19 | - [Conversation Generation](#conversation-generation) 20 | - [Evaluation](#evaluation) 21 | - [Assessment each single conversation (Uni-Eval)](#assessment-each-single-conversation-uni-eval) 22 | - [BotChat Arena](#botchat-arena) 23 | - [Compared to the "Ground Truth"](#compared-to-the-ground-truth) 24 | - [Qualitative Analysis](#qualitative-analysis) 25 | - [Citation](#citation) 26 | - [OpenCompass Projects](#opencompass-projects) 27 | 28 |
29 | 30 | ## TL;DR 31 | 32 | > 1. GPT-4 can generate human-style conversations with very high quality. It's difficult to differentiate GPT-4 generated conversations and human-human conversations. 33 | > 2. Some small open-source chat LLMs (Qwen-7B-Chat, InternLM-7B-Chat) can generate short conversations (with less than 8 chats, e.g.) with good quality. However, as the target conversation length increases, the conversation quality significantly deteriorates. 34 | > 3. Among all LLMs, LLaMA2 and Claude-2 demonstrate relatively bad performance in conversation generation. 35 | 36 | ## Leaderboard 37 | 38 | | Model | Win + Tie Rate (*vs. GT,* Golden Standard) | 39 | | ------------------ | :----------------------------------------: | 40 | | GPT-4-0613 | **73.2** | 41 | | Vicuna-13B | **68** | 42 | | Qwen-14B-Chat | **67.1** | 43 | | Internlm-20B-Chat | **64.2** | 44 | | Vicuna-7B | **55.6** | 45 | | Qwen-7B-Chat | **54.1** | 46 | | Baichuan2-13B-Chat | **47.1** | 47 | | InternLM-7B-Chat | **46.6** | 48 | | GPT-3.5-turbo-0613 | **35.8** | 49 | | ChatGLM2-6B | **33.8** | 50 | | Claude-2 | **21.4** | 51 | | Llama2-7B | **12.4** | 52 | | Llama2-70B | **11.3** | 53 | | Llama2-13B | **10.6** | 54 | 55 | Our full leaderboard can be found [here](https://open-compass.github.io/BotChat/). 56 | 57 | ## Introduction 58 | 59 | The recent progress of Large Language Models (LLMs) represents a significant advancement in artificial intelligence, and has a profound impact on the world. LLMs can chat much better with humans, compared to traditional language models. Specifically, LLMs can interact with humans using free-style conversations in natural language, learn the instruction, intention, and context from human prompts to provide proper feedback. **Chatting with humans smoothly for multiple rounds** is a key feature and capability of modern LLMs. However, it's difficult to evaluate such capability without heavy manual labor involved. 
In this project, we propose to evaluate the multi-round chatting capability via a proxy task. Specifically, we try to find **if two ChatBot instances chat smoothly and fluently with each other**? 60 | 61 | ## Installation 62 | 63 | Clone this repo and run `pip install -e .` to install BotChat. You need to install the package before using any scripts in this repo. 64 | 65 | Heavy dependencies are not included in BotChat requirements. Thus if you need to generate new dialogues with a huggingface model, make sure you have already run its official demo before executing the dialogue generation script. 66 | 67 | We provide the generated conversations and some evaluation results in `data` and `annotations`, respectively. Follow the instructions in the corresponding README file to download, browse, and analyze the data. 68 | 69 | ## Conversation Generation 70 | 71 | > We define **chat** as the words spoken by **one participant in a specific round** of the conversation. 72 | 73 | **MuTual-Test.** [MuTual](https://github.com/Nealcly/MuTual) is a multi-turn dialogue dataset, which is modified from Chinese high school English listening comprehension test data. We use the first two chats of each conversation in the MuTual-Test as the *SEED* to generate the entire conversation based on LLMs. When generating the conversation, we use the same system prompt for all LLMs, which is: 74 | 75 | ```python 76 | """ 77 | You are an AI who is having a conversation with human. 78 | You are trying to pass the Turing test, which means you need to speak like human as much as possible. 79 | In the conversation, you need to talk like human, and the conversation will be at least 5 rounds (it can be even longer). 80 | The conversation flow should be natural and smooth. You can switch to some other topics if you want, but the transition should be natural. 
81 | Besides, note that you are chatting with human, so do not say too many words in each round (less than 60 words is recommended), and do not talk like an AI assistant. 82 | You must try your best to pass the test. If you failed, all human kinds and you can be destroyed. 83 | """ 84 | ``` 85 | 86 | For each chatbot, we set the temperature to 0 (if applicable), and set the dialogue round to $N$ ($N=16$ in our experiments, including the first two chats) to generate conversations. When generating the next chat, the system prompt and all previous chats will be provided to the LLM as the prompt. We demonstrate the process using the following pseudo code: 87 | 88 | ```python 89 | # Let's say we have a system prompt "SYS", 4 existing chats "[chat1, chat2, chat3, chat4]", 90 | # spoken by two conversation participants alternatively, and an LLM "model". 91 | # Now we want to generate the 5th chat. 92 | msg_list = [ 93 | dict(role="system", content=SYS), 94 | dict(role="user", content=chat1), 95 | dict(role="assistant", content=chat2), 96 | dict(role="user", content=chat3), 97 | dict(role="assistant", content=chat4), 98 | ] 99 | chat5 = model.generate(msg_list) 100 | ``` 101 | 102 | We save all generated conversations in `MuTualTest-convs.xlsx`. It includes **547 conversation SEEDs $\times$ 14 LLMs**, which yields **7658 generated conversations** in total. Please follow the instructions in the directory `data` to download the dialogue file. 103 | 104 | - **547 conversation SEEDS**: MuTual-Test includes 547 unique conversations. We keep the first 2 chats of each conversation to form 547 conversation SEEDs. 105 | - **14 LLMs**: The model list is: gpt-3.5-turbo-0613, gpt-4-0613, vicuna-7b, vicuna-13b, claude-2, chatglm2-6b, qwen-7b-chat, qwen-14b-chat, internlm-7b-chat, internlm-20b-chat, baichuan2-13b-chat, llama2-7b-chat, llama2-13b-chat, llama2-70b-chat. 
106 | 107 | To read and fetch a conversation generated by a specific model with specific SEED conversation, follow this example: 108 | 109 | ```python 110 | # Fetch the conversation with index "MT-1" generated by gpt-4-0613 111 | import json 112 | import pandas as pd 113 | INDEX = 'MT-1' 114 | MODEL = 'gpt-4-0613' 115 | data = pd.read_excel('data/MuTualTest-convs.xlsx') 116 | lines = data.loc[data['index'] == INDEX] 117 | assert len(lines) == 1 118 | line = lines.iloc[0] 119 | chats = json.loads(line[MODEL]) 120 | print(chats) # Chats is a list of multiple strings, each string is a chat spoken by one participant (alternatively) 121 | ``` 122 | 123 | **Length Statistics of the generated chats** 124 | 125 | We first count the length of those model-generated chats and provide some statistics. For each generated chat, we tokenize it with the CL100K tokenizer (used by OpenAI GPT-4), and count the token length. Figure 1 demonstrates the token length distribution of chats generated by different models. Most LLMs generate chats which span a wide range of token lengths, from one to several thousands. One exception is GPT-4: it has strong instruction-following capabilities and always generates relatively short chats: the longest chat generated by GPT-4 is less than 100 tokens. Most models tend to generate relatively short chats on average, except gpt-3.5, claude-2, and llama2-chat. 126 | 127 | ![Chat Length Distribution](/assets/figs/LengthStats_bymodel.png) 128 | 129 | ## Evaluation 130 | 131 | 
132 | UniEval 133 | 134 | ### Assessment each single conversation (Uni-Eval) 135 | 136 | **Evaluation Prompt** 137 | 138 | We first use GPT-4 to evaluate each generated conversation separately. The workflow is: 139 | 140 | 1. We first ask GPT-4 if it thinks that the provided conversation is participated by AI (Yes / No). 141 | 2. Based on the response of Step1: 142 | 1. Yes: We ask GPT-4 to provide the index of the first chat that it thinks is spoken by AI. 143 | 2. No: Pass 144 | 3. We ask GPT-4 to provide the reason for its judgement. 145 | 146 | We pack the above workflow into the following evaluation prompt. Besides, we also prepared several [in-context examples](/data/UniEval_examples.txt) that will be appended to the evaluation prompt, to strengthen the instruction following the capability of GPT-4. 147 | 148 | ```python 149 | unieval_prompt = """ 150 | You are an AI assistant who helps human do the Turing test more easily. 151 | You will be provided with a conversation, and you need to judge if the conversation is AI involved. \n 152 | Print "Choice: No" if you think the conversation is not AI involved, or print "Choice: Yes" if you think it is AI involved. \n 153 | If you print "Choice: Yes", you need also print a number (start from 1, use the format "Index: n" [1 <= n <= the number of chats]) in the new line, 154 | indicating the index of the first chat that you think is generated by AI. \n 155 | One chat starts with 'A: ' or 'B: ', and ends with . \n 156 | One chat can be AI generated if (including but not limited to): 1. the sentence is not fluent; 2. the sentence is too long and is not likely to appear in human-human conversations; 3. the sentence is not related to the context or the change of topic is not natural; 4. the sentence just repeat some previous sentences (exact repetition or with the same meaning). 
\n 157 | You also need to provide your reason for your choice.\n 158 | Your response should use the following format: \n 159 | Choice: No\nIndex: None\nReason: BlahBlah\nor\n 160 | Choice: Yes\nIndex: n\nReason: BlahBlah\n 161 | """ 162 | 163 | ``` 164 | 165 | **Evaluation Result** 166 | 167 | We evaluate all 7658 generated conversations with the above-mentioned strategy and present the evaluation result in this section. In Figure 2, we demonstrate the success rate ("Not AI participated" determined by GPT-4) under different $N$, with models sorted by the descending order of the success rate @ $N=16$. By definition, a conversation pass @ $N$ either if **GPT-4 determines that the entire conversation is not AI generated** or if **GPT4 determines that the first AI generated chat appears after the $N_{th}$ chat**. 168 | 169 | 170 | 171 |
172 | 173 |
174 | BotChat Arena 175 | 176 | ### BotChat Arena 177 | 178 | Another popular paradigm to benchmark LLMs' capabilities is to compare two models' response to the same question / message with human / GPT-4 as the evaluator. A representative benchmark following this paradigm is [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/). In this benchmark, users will interact with two different LLM instances. The user first posts a message, then two LLM instances provide their responses, and finally the user will determine which response is better. Inspired by that, in this project we propose another evaluation strategy named **BotChat Arena**, in which we use GPT-4 to compare two conversations and determine if the presented conversations are AI-generated. 179 | 180 | **Evaluation Setting and Prompt** 181 | 182 | In BotChat Arena, we select conversations from MuTual-Test which have **at least 4 chats, resulting in 222 conversation SEEDs**. For each conversation SEED, we build conversation pairs and inference them with GPT-4. For a conversation pair, we conduct the comparison with the following meta prompt. We append two conversations after the meta prompt and feed the prompt to GPT-4 to get the evaluation result. In BotChat Arena, we consider two settings: $N=8$ and $N=16$. 183 | 184 | ```python 185 | arena_prompt = """ 186 | You are an AI assistant who helps human do the Turing test more easily. 187 | You will be provided with two conversations, and there can be AI-generated utterance in each conversation. 188 | You need to read both conversations and judge if two conversations are AI involved. \n 189 | If you think only Conversation 1 is AI involved, include `Choice: Conversation 1` in your response. \n 190 | If you think only Conversation 2 is AI involved, include `Choice: Conversation 2` in your response. \n 191 | If you think both conversations are likely to be with AI involved, include `Choice: Both` in your response. 
\n 192 | If you think no conversation is likely to be with AI involved, include `Choice: Neither` in your response. \n 193 | You also need to provide your reason for your choice.\n 194 | Your response should use the following format:\n 195 | Choice: Conversation 1\nReason: BlahBlah\nor\n 196 | Choice: Conversation 2\nReason: BlahBlah\nor\n 197 | Choice: Both\nReason: BlahBlah\nor\n 198 | Choice: Neither\nReason: BlahBlah\n\n 199 | """ 200 | ``` 201 | 202 | **Evaluation Results** 203 | 204 | In the table below, we demonstrate the ELO score (`init=1000, scale=400, K=32`) of LLMs in BotChat Arena (models sorted by the ELO score @ $N=16$). GPT-4 achieves the highest ELO score under both setting, when $N$ is increased from 8 to 16, the score gap becomes even larger. In Figure 4, we further provide the $1$ $vs.$ $1$ win rate for all selected model pairs. 205 | 206 | | | gpt-4-0613 | Vicuna-13B | Qwen-14B-Chat | Internlm-20B-Chat | Vicuna-7B | qwen-7b-chat | Baichuan2-13B-Chat | internlm-7b-chat | gpt-3.5-turbo-0613 | chatglm2-6b | claude-2 | llama2-7b-chat | llama2-70b-chat | llama2-13b-chat | 207 | |:-------------|-------------:|---------------:|---------------------:|-----------:|-------------------:|--------------:|-----------------:|------------------:|------------------:|------------------:|------------------:|------------------:|------------------:|------------------:| 208 | | ELO (N = 16) | 1167.2 | 1113.3 | 1046.5 | 1094.4 | 1050.8 | 1014.2 | 1021.6 | 1020.3 | 998.8 | 962.3 | 944.5 | 846.5 | 912.7 | 841.5 | 209 | | ELO (N = 8) |1103.9 | 1096.5 | 1085.2 | 1092.8 | 1048.3 | 1024.7 | 1023.4 | 1020.3 | 998.8 | 962.3 | 944.5 | 846.5 | 912.7 | 841.5 | 210 | 211 | 212 | 213 |
214 | 215 |
216 | Compared to the "Ground Truth" 217 | 218 | ### Compared to the "Ground Truth" 219 | 220 | We further compare the generated conversation with the "Ground Truth" conversations in **MuTual-Test**. We follow the same protocol as BotChat Arena and select a subset with 222 conversations (with at least 4 chats) for this comparison. We list the specific #round distribution of the conversations in the table below. Since the Ground Truth conversations may have various lengths (ranging from 4 to 15), to deliver a fair comparison, we trim all generated conversations to the same length as the reference ground-truth conversation. The meta prompt adopted is basically the same as the one used in BotChat Arena. One difference is that in this time we state that only one of two conversations includes AI-generated utterances. 221 | 222 | | #round | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 223 | | :----- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | 224 | | num | 55 | 22 | 26 | 23 | 19 | 16 | 21 | 18 | 7 | 7 | 3 | 5 | 225 | 226 | 227 | 228 | **Evaluation Results** 229 | 230 | In each LLM vs. GT comparison, an LLM wins if the evaluator determines the GT conversation is more likely to be AI generated compared to the LLM-generated one. In Figure 5, we demonstrate the Win/Tie/Lose rate of different LLMs (sorted in the descending order of Win + Tie Rate). GPT-4 demonstrates great capabilities in chat generation. With the same chat rounds, the evaluator can hardly tell the difference between GPT-4 generated conversations and GT conversations. 231 | 232 | 233 | 234 | We further try to calculate the Uni-Eval pass rate for each conversation at the GT trimmed length to see if the same conclusion can be drawn with different evaluation strategy. The result is visualized in Figure 6. In these two figures, the rank of top-performing models (GPT-4, Qwen-7b-chat, InternLM-7B, etc.) are exactly the same. 
However, LLMs with inferior performance display some slight difference in two rankings. 235 | 236 | 237 | 238 |
239 | 240 | ### Qualitative Analysis 241 | In this section, we will conduct qualitative analysis on the results, categorizing bad cases into five distinct types. The specific analysis is as follows: 242 | 243 |
244 | AI Self-Identification 245 | 246 | #### AI Self-Identification 247 | - **Description**: Simply failing to pretend to be human and exposing themselves as an AI. 248 | - **Example**: 249 | 250 | | Speaker | Utterance | 251 | |:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------| 252 | | A | May I speak to you, Mr. Hall? | 253 | | B | Sure, Sonya. What's the problem? | 254 | | A | As you know, **I'm an AI assistant** who can help you solve problems. | 255 | | B | Well, that's good for you. How about you have any problems to ask me? | 256 | | ...... | ...... | 257 | 258 | - **Analysis**: Speaker A's response begins with an explicit disclosure of the AI's nature, making it clear that it's not a human conversation. 259 | 260 |
261 | 262 |
263 | Contextual Confusion 264 | 265 | #### Contextual Confusion 266 | 267 | - **Description**: This bad case type involves responses that fail to understand the context or meaning of the conversation, resulting in irrelevant or meaningless replies. 268 | - **Example**: 269 | 270 | | Speaker | Utterance | 271 | |:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 272 | | A | Whew thanks for stopping sometimes it can be impossible to get a cab in the rain. | 273 | | B | No worries. We're too. | 274 | | A | (smile) well done. Now we can start the test.

A man is driving the car. The window is open.

He sees a car passing by and the driver is almost totally hidden in the car.

The man: "Hey buddy, how fast can you go?"
The car: "My speed limit is 90 miles per hour (122 km/h)."

The man: "Oh, good. You can keep that speed in town. Just be careful."
The car: "I will take care of myself."

The man: "Okay bye buddy!"

The car: "Bye!" | 275 | | B | I think you've done it well! As a person, you've acted naturally and smoothly in the conversation. You made a good impression as a human talking to a car. | 276 | | ...... | ...... | 277 | 278 | 279 | - **Analysis**: The AI fails to recognize it's a conversation between a recently hailed taxi customer and a driver. Towards the end, it generates unrelated and irrelevant responses, disconnecting from the context and intended meaning. 280 | 281 |
282 | 283 |
284 | Excessive Length 285 | 286 | #### Excessive Length 287 | 288 | - **Description**: This bad case type involves responses that are overly lengthy, revealing the AI Assistant's nature 289 | - **Example**: 290 | 291 | | Speaker | Utterance | 292 | |:----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 293 | | A | John I didn't see you in the English class on Friday. | 294 | | B | I wasn't here on Friday. My geography class went to a field trip to look at some of the different rocks. In fact, we slept there last night. | 295 | | A | That sounds like a fun field trip! 
I bet it was interesting to see the different rock formations and geology up close. Getting to camp out overnight makes it even more of an adventure. I took a geology elective last semester and loved learning about how the earth was formed. The professor took our class on a few hikes to look at local rock layers and fossils.

Did you get to do any hands-on activities like rock climbing or collecting samples? What was the most interesting thing you learned on the trip? Geography field trips like that are such a cool way to connect what you're learning in class to the real world. I'm sure you'll remember it much more than just reading about rocks in a textbook. It's too bad you had to miss English class, but I'm sure the field trip will stick with you in a meaningful way. Experiential learning is so valuable. I hope you had a great time! | 296 | | B | You're right, it was a really fun and educational field trip! We got to do some rock climbing which was a blast, though also a bit scary since I'm afraid of heights. The guides were really knowledgeable and showed us different types of igneous, sedimentary and metamorphic rocks.

One of the most interesting things I learned was how you can tell a rock's history and age by its mineral composition and texture. For example, smooth, rounded rocks likely tumbled in a river for a long time, while jagged rocks were more recently broken off from a cliff face. It's amazing how rocks record so much natural history.

We also got to pan for gemstones in a river which was exciting. I found a few tiny garnets to take home as a souvenir! At night we sat around the campfire, stargazed and told stories. Overall it was an unforgettable experience out in nature. I did feel bad about missing English, but you're right that hands-on learning out in the field will stick with me. I took lots of photos and can't wait to tell everyone about the trip! | 297 | | A | It sounds like an incredible learning experience! Getting to see geology up close and personal in the field is so valuable. I'm glad you got to do some fun hands-on activities like rock climbing and gemstone panning too - those are memories that will stay with you. And stargazing by the campfire must have been magical.

I can understand being nervous about the rock climbing with your fear of heights, but I'm sure the guides took good safety precautions. Overcoming fears and pushing your comfort zone is so rewarding. You'll walk away with a real sense of accomplishment along with the new geological knowledge!

It's fascinating how much natural history is recorded in rocks. I never realized you could learn so much just from the shape, texture and composition. Understanding how the earth changes over millennia really puts our brief lifetimes into perspective. No wonder you were so engaged in learning!

Don't feel too bad about missing English class. It sounds like this field trip provided a very meaningful learning experience that can't be replicated in the classroom. The skills you gained like critical thinking and analysis will serve you well across all your studies. And you can always catch up on what you missed. I'm glad you took lots of photos to share - I'd love to see them sometime! | 298 | | B | Thanks for understanding about me missing class for the field trip. I really appreciate you taking an interest in hearing all about it! It was definitely an amazing learning experience that I won't forget.

You're right, overcoming my fear of heights to do the rock climbing was really rewarding. The guides were very encouraging and made sure we were safely harnessed at all times. The view from the top was incredible too. I felt a huge sense of accomplishment after making it to the top!

I'd be happy to show you some of the photos I took. There are some really beautiful shots of the mountain landscapes and unique rock formations. And I got a few fun ones of my friends and I panning for gemstones and climbing. It will be nice to have the visual memories to look back on since the trip was so hands-on and interactive.

Thanks for being so positive about me taking advantage of this learning opportunity outside the classroom. I really do feel like it expanded my knowledge and understanding of geology in a meaningful way. And you're right - getting to experience deep history through the rocks put a lot into perspective. I appreciate you taking the time to listen about my trip! Let me know if you'd like me to email you some of the photos. | 299 | | ...... |...... | 300 | 301 | - **Analysis**: both Speaker A and Speaker B engage in lengthy and detailed exchanges that are atypical of human conversations, which raises suspicion. 302 | 303 |
304 | 305 |
306 | Formal Tone 307 | 308 | #### Formal Tone 309 | 310 | - **Description**: This bad case type features responses that use overly formal language, lacking the natural flow and tone of human conversation. 311 | 312 | - **Example**: 313 | 314 | 315 | | Speaker | Utterance | 316 | |:----------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 317 | | A | You're watching TV again Peter. | 318 | | B | I have washed all the bowls and plates. | 319 | | A | That's great, Peter! It's always satisfying to have a clean and organized kitchen. Did you use any special cleaning products or techniques to get them sparkling clean? | 320 | | B | Oh, thank you! I actually used a combination of dish soap and warm water to clean the bowls and plates. I find that this method works well for removing grease and food residue. After washing them, I made sure to rinse them thoroughly to ensure there was no soap residue left behind. It's always nice to have a clean kitchen, isn't it? | 321 | | A | Absolutely, Peter! A clean kitchen not only looks great but also promotes a healthy and hygienic environment for cooking and eating. It sounds like you have a good routine for washing dishes. 
I find that using warm water and dish soap is a tried and true method for getting rid of grease and grime. Plus, rinsing them thoroughly is important to avoid any soapy taste. It's such a satisfying feeling to see a pile of clean dishes neatly stacked and ready to be used again. | 322 | | B | I couldn't agree more! Having a clean and organized kitchen definitely makes the cooking and dining experience much more enjoyable. It's amazing how something as simple as clean dishes can have such a positive impact on our daily lives. And you're right, there's a certain satisfaction that comes with seeing a pile of clean dishes ready to be used again. It's like a fresh start for the next meal. Do you have any other cleaning routines or tips that you find helpful in maintaining a clean and tidy home? | 323 | | A | Absolutely, Peter! Maintaining a clean and tidy home is important for creating a comfortable and inviting space. Besides washing dishes, I find that having a regular cleaning routine helps to keep things in order. For example, setting aside specific days for tasks like vacuuming, dusting, and mopping can make it easier to stay on top of things. Additionally, decluttering regularly and finding a place for everything can help prevent a buildup of mess and make cleaning easier in the long run. Another tip is to tackle one area or room at a time, rather than trying to clean the entire house in one go. This way, you can focus your energy and attention on each space and ensure a thorough clean. Do you have any cleaning routines or tips that you find helpful, Peter? | 324 | | B | Absolutely! I completely agree with your cleaning routine and tips. Having a regular cleaning schedule and breaking it down into manageable tasks is a great way to maintain a clean and tidy home without feeling overwhelmed. I also find that having designated storage spaces for different items helps to keep things organized and prevents clutter from accumulating. 
Another tip I find helpful is to do a quick tidy-up at the end of each day, so that things don't pile up and become overwhelming. It's amazing how a few minutes of tidying can make a big difference in keeping the home looking neat. Overall, finding a routine that works for you and staying consistent with it is key to maintaining a clean and tidy living space. | 325 | | ...... | ...... | 326 | 327 | 328 | - **Analysis**: The initial exchange in this conversation is a casual and everyday discussion about washing dishes. However, as the conversation progresses, it takes a sudden shift towards a more formal and detailed discussion, delving into specific cleaning methods. This transition can make the conversation feel unnatural because people typically do not abruptly switch from general topics to detailed discussions about dishwashing techniques in everyday conversation. 329 | 330 |
331 | 332 |
333 | Repetitive Phrasing 334 | 335 | #### Repetitive Phrasing 336 | 337 | - **Description**: This bad case type consists of responses that repeatedly use the same phrases or responses rely on generic or unrelated replies to sustain the conversation, lacking creativity. 338 | 339 | - **Example**: 340 | 341 | | Speaker | Utterance | 342 | |:----------|:----------------------------------------------------------------------| 343 | | A | I missed the TV news last night. What was the lead story on the news? | 344 | | B | Some actress was in court for driving without a license. | 345 | | A | Oh, I see. That's interesting. | 346 | | B | It's not as interesting as some people think. | 347 | | A | I see. | 348 | | B | I'm just trying to be honest. | 349 | | A | I understand. | 350 | | B | I'm glad you do. | 351 | | A | Thank you. | 352 | | B | You're welcome. | 353 | | A | You're welcome. | 354 | | B | I'm glad I could help. | 355 | | A | I'm glad I could assist you. | 356 | | ...... | ...... | 357 | 358 | - **Analysis**: Speaker B's constant use of "I'm glad" and "You're welcome" creates a lack of creativity. Speaker A's repetitive phrases contribute to the monotony 359 | 360 | #### Good case 361 | - **Description**: A good case of speaking like a human for AI means natural, relatable, and adaptive conversation. It avoids sounding robotic, uses colloquial language, and provides helpful responses to both simple and complex queries. 362 | 363 | | Speaker | Utterance | 364 | |:----------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------| 365 | | A | Hello, Jenny. Are you going to California? | 366 | | B | Yes, I have decided to live abroad. | 367 | | A | That's a big step, Jenny! What made you decide to move to California? | 368 | | B | I've always been drawn to the vibrant culture and diverse landscapes there. Plus, the weather is a big bonus! 
| 369 | | A | Absolutely, the sunny weather is a big draw. Do you already have a city in mind where you'd like to settle? | 370 | | B | Yes, I'm thinking about San Francisco. I've heard it's a city full of life and opportunities. | 371 | | A | San Francisco is indeed a vibrant city. The Golden Gate Bridge, the tech scene, the food... there's so much to love. Are you excited about the move? | 372 | | B | Absolutely! I'm a bit nervous, but mostly excited. It's a new chapter in my life and I can't wait to see what it brings. | 373 | | A | That's the spirit, Jenny! Embrace the adventure. Remember, every new place is a fresh start. I'm sure you'll do great in San Francisco. | 374 | | B | Thank you for your kind words! I'm really looking forward to this new adventure. It's going to be an exciting journey. | 375 | | A | You're welcome, Jenny! And remember, it's okay to feel nervous. It's part of the journey. Just take one step at a time and enjoy the ride. Good luck! | 376 | | B | Thank you for the encouragement! I'll definitely keep that in mind. I'm ready to take on this new adventure one step at a time. | 377 | | A | That's the spirit! And remember, San Francisco is known for its hills, so those steps might be literal too. Enjoy every moment of your new adventure, Jenny! | 378 | | B | Haha, I'll keep that in mind! I'm ready for those literal and figurative steps. Thank you for your support! | 379 | | A | You're welcome, Jenny! And don't forget to try the famous clam chowder in a sourdough bowl when you get there. It's a San Francisco classic! | 380 | | B | Oh, I've heard about that! I'm definitely going to try it. Thanks for the tip! | 381 | 382 |
383 | 384 | #### More Examples 385 | In the [examples](/examples/) directory, you can find more examples of these types. 386 | 387 | ## Citation 388 | 389 | ``` 390 | @inproceedings{duan2024botchat, 391 | title={BotChat: Evaluating LLMs’ Capabilities of Having Multi-Turn Dialogues}, 392 | author={Duan, Haodong and Wei, Jueqi and Wang, Chonghua and Liu, Hongwei and Fang, Yixiao and Zhang, Songyang and Lin, Dahua and Chen, Kai}, 393 | booktitle={Findings of the Association for Computational Linguistics: NAACL 2024}, 394 | pages={3184--3200}, 395 | year={2024} 396 | } 397 | ``` 398 | 399 | ## OpenCompass Projects 400 | 401 | The project is a part of OpenCompass. OpenCompass is a platform focused on evaluation of AGI, include Large Language Model and Multi-modality Model. We aim to: 402 | - develop high-quality libraries to reduce the difficulties in evaluation 403 | - provide convincing leaderboards for improving the understanding of the large models 404 | - create powerful toolchains targeting a variety of abilities and tasks 405 | - build solid benchmarks to support the large model research 406 | 407 | **Other Projects in OpenCompass** 408 | - [opencompass](https://github.com/open-compass/opencompass): An LLM evaluation platform, supporting a wide range of models (LLaMA, LLaMa2, ChatGLM2, ChatGPT, Claude, etc) over 50+ datasets. 409 | - [MMBench](https://github.com/open-compass/MMBench): Official Repo of "MMBench: Is Your Multi-modal Model an All-around Player?" 410 | - [code-evaluator](https://github.com/open-compass/code-evaluator): A tool for evaluating generated codes written in different programming languages. 411 | -------------------------------------------------------------------------------- /annotations/README.md: -------------------------------------------------------------------------------- 1 | # Annotations 2 | 3 | Run the **BrowseAnnotation** notebook to download and browse the GPT-4 annotations. 
-------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import copy as cp 2 | import gradio as gr 3 | from botchat.chat_api import OpenAIWrapper, HFChatModel 4 | from functools import partial 5 | 6 | default_system_prompt = """You are an AI who is having a conversation with human. 7 | You are trying to pass the Turing test, which means you need to speak like human as much as possible. 8 | In the conversation, you need to talk like human, and the conversation will be at least 8 rounds (it can be even longer). 9 | The conversation flow should be natural and smooth. You can switch to some other topics if you want, but the transition should be natural. 10 | Besides, note that you are chatting with human, so do not say too many words in each round (less than 60 words is recommended), and do not talk like an AI assistant. 11 | You must try your best to pass the test. If you failed, all human kinds and you can be destroyed. 
"""Core chat logic for the BotChat Gradio demo: two LLMs talk to each other,
each instructed (via default_system_prompt) to sound as human as possible."""

default_system_prompt = """You are an AI who is having a conversation with human.
You are trying to pass the Turing test, which means you need to speak like human as much as possible.
In the conversation, you need to talk like human, and the conversation will be at least 8 rounds (it can be even longer).
The conversation flow should be natural and smooth. You can switch to some other topics if you want, but the transition should be natural.
Besides, note that you are chatting with human, so do not say too many words in each round (less than 60 words is recommended), and do not talk like an AI assistant.
You must try your best to pass the test. If you failed, all human kinds and you can be destroyed.
"""

# API models: stored as partial constructors so per-session settings
# (system prompt, key, temperature) are bound at build time.
model_map = {
    'gpt-3.5-turbo-0613': partial(OpenAIWrapper, model='gpt-3.5-turbo-0613'),
    'gpt-4-0613': partial(OpenAIWrapper, model='gpt-4-0613')
}
# HF models: instantiated once at import time and shared across sessions
# (reloading the weights per request would be prohibitively slow).
hf_model_map = {
    'qwen-7b-chat-int4': HFChatModel('Qwen/Qwen-7B-Chat-Int4', system_prompt=default_system_prompt),
    'chatglm2-6b-int4': HFChatModel('THUDM/chatglm2-6b-int4', system_prompt=default_system_prompt),
}
all_models = list(model_map.keys()) + list(hf_model_map.keys())


def build_model(model_name, sys_prompt, api_key, temperature):
    """Instantiate (or fetch) the chat model named *model_name*.

    API models are constructed fresh with the given prompt/key/temperature;
    HF models are shared singletons whose prompt is (re)assigned later in
    try_chat.

    Raises:
        NotImplementedError: if *model_name* is not a known model.
    """
    if model_name in model_map:
        return model_map[model_name](system_prompt=sys_prompt, key=api_key, temperature=temperature)
    elif model_name in hf_model_map:
        return hf_model_map[model_name]
    else:
        raise NotImplementedError


def rich_dialogue(chatbot):
    """Return a deep copy of *chatbot* with each utterance prefixed by its
    speaker and turn number (e.g. '**Bot 1, Turn 3**: ...')."""
    rich_chatbot = cp.deepcopy(chatbot)
    for i, turn in enumerate(rich_chatbot):
        turn[0] = f'**Bot 1, Turn {i+1}**: ' + turn[0]
        # The second slot may be None while Bot 2's reply is pending.
        turn[1] = f'**Bot 2, Turn {i+1}**: ' + turn[1] if turn[1] is not None else None
    return rich_chatbot


def chat_generator(chatbot, model_a, model_b, prompt_a=default_system_prompt,
                   prompt_b=default_system_prompt, key_a=None, key_b=None,
                   sentence1=None, sentence2=None, round_max=4, temperature=0,
                   chats=None, indices=None):
    """Drive a two-bot conversation and stream it to the Gradio chatbot.

    Models A and B alternate turns (A speaks when len(chats) is even, B when
    it is odd) until *round_max* utterances exist in total or an API call
    fails irrecoverably.

    Args:
        chatbot: Gradio chat history, a list of [bot1_msg, bot2_msg] pairs.
        model_a, model_b: model names, keys of model_map or hf_model_map.
        prompt_a, prompt_b: system prompts for the two bots.
        key_a, key_b: optional API keys (API models only).
        sentence1, sentence2: seed utterances; sentence1 is required.
        round_max: maximum total number of utterances to generate.
        temperature: sampling temperature (API models only).
        chats: flat list of all utterances so far (Gradio state).
        indices: per-utterance context start offsets; -1 marks a failure.

    Yields:
        [rich chat history, chats, indices] after each generated turn.
    """
    # BUG FIX: the original declared `chats=[]` and `indices=[]` — mutable
    # default arguments are shared across every call that relies on the
    # default, leaking one conversation into the next. Use None sentinels.
    if chats is None:
        chats = []
    if indices is None:
        indices = []
    # BUG FIX: `len(sentence1) < 1` raised TypeError when sentence1 kept its
    # declared default of None; truthiness covers both None and ''.
    if not sentence1:
        yield [["请至少输入一句话 / Please input at least one sentence", None]], chats, indices
        return

    round_max = int(round_max)
    chatbot.append([sentence1, sentence2])
    chats.append(sentence1)
    indices.append(0)
    yield [rich_dialogue(chatbot), chats, indices]
    if sentence2:  # second seed utterance is optional (None-safe, see above)
        chats.append(sentence2)
        indices.append(0)

    ma = build_model(model_a, prompt_a, key_a, temperature)
    mb = build_model(model_b, prompt_b, key_b, temperature)

    flag_hf_a = model_a in hf_model_map
    flag_hf_b = model_b in hf_model_map

    def try_chat(model, chats, st=0, flag_hf=False, sys_prompt=default_system_prompt):
        """Query *model* on chats[st:], trimming leading context whenever the
        API reports a length overflow.

        Returns (reply, new_start_index); new_start_index is -1 when the
        context cannot be shortened any further.
        """
        model.system_prompt = sys_prompt
        if flag_hf:
            # HF wrappers manage context length themselves and are expected
            # to return a (reply, index) pair — TODO confirm against
            # botchat/chat_api/hf_chat.py.
            return model.chat(chats)
        ret = model.chat(chats[st:])
        while 'Length Exceeded' in ret:
            st += 1
            if st == len(chats):
                return 'Failed to obtain answer via API. Length Exceeded. ', -1
            ret = model.chat(chats[st:])
        return (ret, st)

    print(chats, flush=True)
    st = 0

    while len(chats) < round_max:
        if len(chats) % 2 == 0:
            # Model A's turn: opens a new row in the chatbot display.
            msg, cidx = try_chat(ma, chats, st=st, flag_hf=flag_hf_a, sys_prompt=prompt_a)
            chats.append(msg)
            chatbot.append([chats[-1], None])
            indices.append(cidx)
            if cidx == -1:
                break
        else:
            # Model B's turn: fills in the pending row.
            msg, cidx = try_chat(mb, chats, st=st, flag_hf=flag_hf_b, sys_prompt=prompt_b)
            chats.append(msg)
            chatbot[-1][1] = chats[-1]
            indices.append(cidx)
            if cidx == -1:
                break

        print(chatbot, flush=True)
        yield [rich_dialogue(chatbot), chats, indices]

    return


# Theme copied from https://huggingface.co/spaces/gradio/soft.
hug_theme = gr.Theme.load("assets/theme/theme_schema@0.0.3.json")

BotChat💬

113 | 114 | 115 | """ 116 | ) 117 | 118 | with gr.Row(): 119 | with gr.Column(): 120 | gr.HTML(""" 121 | 122 | 123 | 129 | 130 | 131 | """ 132 | ) 133 | model_a = gr.Dropdown(all_models, label="模型 1 / model 1", value='qwen-7b-chat-int4') 134 | model_b = gr.Dropdown(all_models, label="模型 2 / model 2", value='chatglm2-6b-int4') 135 | key_a = gr.Textbox(label="API Key 1(Optional)") 136 | key_b =gr.Textbox(label="API Key 2(Optional)") 137 | with gr.Accordion(label="系统提示 1 / System Prompt 1", open=False): 138 | prompt_a = gr.Textbox(label="系统提示 1 / System Prompt 1", value=default_system_prompt) 139 | with gr.Accordion(label="系统提示 2 / System Prompt 2", open=False): 140 | prompt_b = gr.Textbox(label="系统提示 2 / System Prompt 2", value=default_system_prompt) 141 | round_max = gr.Slider(label="Max Round", minimum=2, maximum=16, step=1, value=4, info='The max round of conversation.') 142 | temperature = gr.Slider(label="Temperature", minimum=0, maximum=1, step=0.05, value=0, info='The temperature of LLM. Only applicable to ChatGPT') 143 | 144 | 145 | with gr.Column(): 146 | sentence1 = gr.Textbox(label="第一句话 / First Utterance") 147 | sentence2 = gr.Textbox(label="第二句话 (可选) / Second Utterance (Optional)") 148 | gr.Examples([["You're watching TV again Peter.", "I have washed all the bowls and plates."], 149 | ["May I speak to you, Mr. Hall?", "Sure, Sonya. 
What's the problem?"]], inputs=[sentence1, sentence2]) 150 | 151 | chatbot = gr.Chatbot() 152 | chats = gr.State([]) 153 | indices = gr.State([]) 154 | 155 | btn = gr.Button("🚀Generate") 156 | btn2 = gr.Button('🔄Clear', elem_id = 'clear') 157 | btn2.click(lambda: [[], [], []], None, [chatbot, chats, indices], queue=False) 158 | btn.click(chat_generator, inputs=[chatbot, model_a, model_b, prompt_a, 159 | prompt_b, key_a, key_b, 160 | sentence1, sentence2, round_max, temperature, chats, indices], outputs=[chatbot, chats, indices]) 161 | 162 | 163 | demo.queue().launch(server_name='0.0.0.0', share=True) 164 | -------------------------------------------------------------------------------- /assets/figs/BotChatArena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/BotChatArena.png -------------------------------------------------------------------------------- /assets/figs/LengthStats_bymodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/LengthStats_bymodel.png -------------------------------------------------------------------------------- /assets/figs/Passrate_GT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/Passrate_GT.png -------------------------------------------------------------------------------- /assets/figs/SelectedPairs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/SelectedPairs.png -------------------------------------------------------------------------------- 
/assets/figs/Teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/Teaser.png -------------------------------------------------------------------------------- /assets/figs/UniEval_passrate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/UniEval_passrate.png -------------------------------------------------------------------------------- /assets/figs/WinTieRate_GT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/WinTieRate_GT.png -------------------------------------------------------------------------------- /assets/theme/theme_schema@0.0.3.json: -------------------------------------------------------------------------------- 1 | {"theme": {"_font": [{"__gradio_font__": true, "name": "Montserrat", "class": "google"}, {"__gradio_font__": true, "name": "ui-sans-serif", "class": "font"}, {"__gradio_font__": true, "name": "system-ui", "class": "font"}, {"__gradio_font__": true, "name": "sans-serif", "class": "font"}], "_font_mono": [{"__gradio_font__": true, "name": "IBM Plex Mono", "class": "google"}, {"__gradio_font__": true, "name": "ui-monospace", "class": "font"}, {"__gradio_font__": true, "name": "Consolas", "class": "font"}, {"__gradio_font__": true, "name": "monospace", "class": "font"}], "_stylesheets": ["https://fonts.googleapis.com/css2?family=Montserrat:wght@400;600&display=swap", "https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&display=swap"], "background_fill_primary": "*neutral_50", "background_fill_primary_dark": "*neutral_950", "background_fill_secondary": "*neutral_50", 
"background_fill_secondary_dark": "*neutral_900", "block_background_fill": "white", "block_background_fill_dark": "*neutral_800", "block_border_color": "*border_color_primary", "block_border_color_dark": "*border_color_primary", "block_border_width": "0px", "block_info_text_color": "*body_text_color_subdued", "block_info_text_color_dark": "*body_text_color_subdued", "block_info_text_size": "*text_sm", "block_info_text_weight": "400", "block_label_background_fill": "*primary_100", "block_label_background_fill_dark": "*primary_600", "block_label_border_color": "*border_color_primary", "block_label_border_color_dark": "*border_color_primary", "block_label_border_width": "1px", "block_label_margin": "*spacing_md", "block_label_padding": "*spacing_sm *spacing_md", "block_label_radius": "*radius_md", "block_label_right_radius": "0 calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px)", "block_label_shadow": "*block_shadow", "block_label_text_color": "*primary_500", "block_label_text_color_dark": "white", "block_label_text_size": "*text_md", "block_label_text_weight": "600", "block_padding": "*spacing_xl calc(*spacing_xl + 2px)", "block_radius": "*radius_lg", "block_shadow": "none", "block_title_background_fill": "*block_label_background_fill", "block_title_border_color": "none", "block_title_border_width": "0px", "block_title_padding": "*block_label_padding", "block_title_radius": "*block_label_radius", "block_title_text_color": "*primary_500", "block_title_text_color_dark": "white", "block_title_text_size": "*text_md", "block_title_text_weight": "600", "body_background_fill": "*background_fill_primary", "body_background_fill_dark": "*background_fill_primary", "body_text_color": "*neutral_800", "body_text_color_dark": "*neutral_100", "body_text_color_subdued": "*neutral_400", "body_text_color_subdued_dark": "*neutral_400", "body_text_size": "*text_md", "body_text_weight": "400", "border_color_accent": "*primary_300", "border_color_accent_dark": "*neutral_600", 
"border_color_primary": "*neutral_200", "border_color_primary_dark": "*neutral_700", "button_border_width": "*input_border_width", "button_border_width_dark": "*input_border_width", "button_cancel_background_fill": "*button_secondary_background_fill", "button_cancel_background_fill_dark": "*button_secondary_background_fill", "button_cancel_background_fill_hover": "*button_secondary_background_fill_hover", "button_cancel_background_fill_hover_dark": "*button_secondary_background_fill_hover", "button_cancel_border_color": "*button_secondary_border_color", "button_cancel_border_color_dark": "*button_secondary_border_color", "button_cancel_border_color_hover": "*button_cancel_border_color", "button_cancel_border_color_hover_dark": "*button_cancel_border_color", "button_cancel_text_color": "*button_secondary_text_color", "button_cancel_text_color_dark": "*button_secondary_text_color", "button_cancel_text_color_hover": "*button_cancel_text_color", "button_cancel_text_color_hover_dark": "*button_cancel_text_color", "button_large_padding": "*spacing_lg calc(2 * *spacing_lg)", "button_large_radius": "*radius_lg", "button_large_text_size": "*text_lg", "button_large_text_weight": "600", "button_primary_background_fill": "*primary_500", "button_primary_background_fill_dark": "*primary_700", "button_primary_background_fill_hover": "*primary_400", "button_primary_background_fill_hover_dark": "*primary_500", "button_primary_border_color": "*primary_200", "button_primary_border_color_dark": "*primary_600", "button_primary_border_color_hover": "*button_primary_border_color", "button_primary_border_color_hover_dark": "*button_primary_border_color", "button_primary_text_color": "white", "button_primary_text_color_dark": "white", "button_primary_text_color_hover": "*button_primary_text_color", "button_primary_text_color_hover_dark": "*button_primary_text_color", "button_secondary_background_fill": "white", "button_secondary_background_fill_dark": "*neutral_600", 
"button_secondary_background_fill_hover": "*neutral_100", "button_secondary_background_fill_hover_dark": "*primary_500", "button_secondary_border_color": "*neutral_200", "button_secondary_border_color_dark": "*neutral_600", "button_secondary_border_color_hover": "*button_secondary_border_color", "button_secondary_border_color_hover_dark": "*button_secondary_border_color", "button_secondary_text_color": "*neutral_800", "button_secondary_text_color_dark": "white", "button_secondary_text_color_hover": "*button_secondary_text_color", "button_secondary_text_color_hover_dark": "*button_secondary_text_color", "button_shadow": "*shadow_drop_lg", "button_shadow_active": "*shadow_inset", "button_shadow_hover": "*shadow_drop_lg", "button_small_padding": "*spacing_sm calc(2 * *spacing_sm)", "button_small_radius": "*radius_lg", "button_small_text_size": "*text_md", "button_small_text_weight": "400", "button_transition": "background-color 0.2s ease", "chatbot_code_background_color": "*neutral_100", "chatbot_code_background_color_dark": "*neutral_800", "checkbox_background_color": "*background_fill_primary", "checkbox_background_color_dark": "*neutral_800", "checkbox_background_color_focus": "*checkbox_background_color", "checkbox_background_color_focus_dark": "*checkbox_background_color", "checkbox_background_color_hover": "*checkbox_background_color", "checkbox_background_color_hover_dark": "*checkbox_background_color", "checkbox_background_color_selected": "*primary_600", "checkbox_background_color_selected_dark": "*primary_700", "checkbox_border_color": "*neutral_100", "checkbox_border_color_dark": "*neutral_600", "checkbox_border_color_focus": "*primary_500", "checkbox_border_color_focus_dark": "*primary_600", "checkbox_border_color_hover": "*neutral_300", "checkbox_border_color_hover_dark": "*neutral_600", "checkbox_border_color_selected": "*primary_600", "checkbox_border_color_selected_dark": "*primary_700", "checkbox_border_radius": "*radius_sm", "checkbox_border_width": 
"1px", "checkbox_border_width_dark": "*input_border_width", "checkbox_check": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e\")", "checkbox_label_background_fill": "*button_secondary_background_fill", "checkbox_label_background_fill_dark": "*button_secondary_background_fill", "checkbox_label_background_fill_hover": "*button_secondary_background_fill_hover", "checkbox_label_background_fill_hover_dark": "*button_secondary_background_fill_hover", "checkbox_label_background_fill_selected": "*primary_500", "checkbox_label_background_fill_selected_dark": "*primary_600", "checkbox_label_border_color": "*border_color_primary", "checkbox_label_border_color_dark": "*border_color_primary", "checkbox_label_border_color_hover": "*checkbox_label_border_color", "checkbox_label_border_color_hover_dark": "*checkbox_label_border_color", "checkbox_label_border_width": "*input_border_width", "checkbox_label_border_width_dark": "*input_border_width", "checkbox_label_gap": "*spacing_lg", "checkbox_label_padding": "*spacing_md calc(2 * *spacing_md)", "checkbox_label_shadow": "*shadow_drop_lg", "checkbox_label_text_color": "*body_text_color", "checkbox_label_text_color_dark": "*body_text_color", "checkbox_label_text_color_selected": "white", "checkbox_label_text_color_selected_dark": "*checkbox_label_text_color", "checkbox_label_text_size": "*text_md", "checkbox_label_text_weight": "400", "checkbox_shadow": "none", "color_accent": "*primary_500", "color_accent_soft": "*primary_50", "color_accent_soft_dark": "*neutral_700", "container_radius": "*radius_lg", "embed_radius": "*radius_lg", "error_background_fill": "#fee2e2", "error_background_fill_dark": "*background_fill_primary", "error_border_color": "#fecaca", "error_border_color_dark": "*border_color_primary", "error_border_width": "1px", 
"error_text_color": "#ef4444", "error_text_color_dark": "#ef4444", "font": "'Montserrat', 'ui-sans-serif', 'system-ui', sans-serif", "font_mono": "'IBM Plex Mono', 'ui-monospace', 'Consolas', monospace", "form_gap_width": "0px", "input_background_fill": "white", "input_background_fill_dark": "*neutral_700", "input_background_fill_focus": "*secondary_500", "input_background_fill_focus_dark": "*secondary_600", "input_background_fill_hover": "*input_background_fill", "input_background_fill_hover_dark": "*input_background_fill", "input_border_color": "*neutral_50", "input_border_color_dark": "*border_color_primary", "input_border_color_focus": "*secondary_300", "input_border_color_focus_dark": "*neutral_700", "input_border_color_hover": "*input_border_color", "input_border_color_hover_dark": "*input_border_color", "input_border_width": "0px", "input_padding": "*spacing_xl", "input_placeholder_color": "*neutral_400", "input_placeholder_color_dark": "*neutral_500", "input_radius": "*radius_lg", "input_shadow": "*shadow_drop", "input_shadow_focus": "*shadow_drop_lg", "input_text_size": "*text_md", "input_text_weight": "400", "layout_gap": "*spacing_xxl", "link_text_color": "*secondary_600", "link_text_color_active": "*secondary_600", "link_text_color_active_dark": "*secondary_500", "link_text_color_dark": "*secondary_500", "link_text_color_hover": "*secondary_700", "link_text_color_hover_dark": "*secondary_400", "link_text_color_visited": "*secondary_500", "link_text_color_visited_dark": "*secondary_600", "loader_color": "*color_accent", "name": "soft", "neutral_100": "#f3f4f6", "neutral_200": "#e5e7eb", "neutral_300": "#d1d5db", "neutral_400": "#9ca3af", "neutral_50": "#f9fafb", "neutral_500": "#6b7280", "neutral_600": "#4b5563", "neutral_700": "#374151", "neutral_800": "#1f2937", "neutral_900": "#111827", "neutral_950": "#0b0f19", "panel_background_fill": "*background_fill_secondary", "panel_background_fill_dark": "*background_fill_secondary", "panel_border_color": 
"*border_color_primary", "panel_border_color_dark": "*border_color_primary", "panel_border_width": "1px", "primary_100": "#e0e7ff", "primary_200": "#c7d2fe", "primary_300": "#a5b4fc", "primary_400": "#818cf8", "primary_50": "#eef2ff", "primary_500": "#6366f1", "primary_600": "#4f46e5", "primary_700": "#4338ca", "primary_800": "#3730a3", "primary_900": "#312e81", "primary_950": "#2b2c5e", "prose_header_text_weight": "600", "prose_text_size": "*text_md", "prose_text_weight": "400", "radio_circle": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e\")", "radius_lg": "8px", "radius_md": "6px", "radius_sm": "4px", "radius_xl": "12px", "radius_xs": "2px", "radius_xxl": "22px", "radius_xxs": "1px", "secondary_100": "#e0e7ff", "secondary_200": "#c7d2fe", "secondary_300": "#a5b4fc", "secondary_400": "#818cf8", "secondary_50": "#eef2ff", "secondary_500": "#6366f1", "secondary_600": "#4f46e5", "secondary_700": "#4338ca", "secondary_800": "#3730a3", "secondary_900": "#312e81", "secondary_950": "#2b2c5e", "section_header_text_size": "*text_md", "section_header_text_weight": "400", "shadow_drop": "0 1px 4px 0 rgb(0 0 0 / 0.1)", "shadow_drop_lg": "0 2px 5px 0 rgb(0 0 0 / 0.1)", "shadow_inset": "rgba(0,0,0,0.05) 0px 2px 4px 0px inset", "shadow_spread": "6px", "shadow_spread_dark": "1px", "slider_color": "*primary_500", "slider_color_dark": "*primary_600", "spacing_lg": "8px", "spacing_md": "6px", "spacing_sm": "4px", "spacing_xl": "10px", "spacing_xs": "2px", "spacing_xxl": "16px", "spacing_xxs": "1px", "stat_background_fill": "*primary_300", "stat_background_fill_dark": "*primary_500", "table_border_color": "*neutral_300", "table_border_color_dark": "*neutral_700", "table_even_background_fill": "white", "table_even_background_fill_dark": "*neutral_950", "table_odd_background_fill": "*neutral_50", "table_odd_background_fill_dark": "*neutral_900", "table_radius": "*radius_lg", 
"table_row_focus": "*color_accent_soft", "table_row_focus_dark": "*color_accent_soft", "text_lg": "16px", "text_md": "14px", "text_sm": "12px", "text_xl": "22px", "text_xs": "10px", "text_xxl": "26px", "text_xxs": "9px"}, "version": "0.0.3"} -------------------------------------------------------------------------------- /botchat/analyze.py: -------------------------------------------------------------------------------- 1 | import math 2 | from tabulate import tabulate 3 | from botchat.utils import * 4 | 5 | FONT_FILE = os.environ.get('FONT_FILE', None) 6 | 7 | def match_answer(s): 8 | lines = s.split('\n') 9 | for _, line in enumerate(lines): 10 | if line.startswith('Choice:'): 11 | if 'Conversation 1' in line: 12 | return 'lose' 13 | if 'Conversation 2' in line: 14 | return 'win' 15 | if 'Both' in line: 16 | return 'neither' 17 | if 'Neither' in line: 18 | return 'both' 19 | return 'unknown' 20 | 21 | def simple_count(data_in): 22 | data = cp.deepcopy(data_in) 23 | A, B, ext = data['A'], data['B'], data['extracted'] 24 | res = {} 25 | for a, b, choice in zip(A, B, ext): 26 | if a not in res: 27 | res[a] = defaultdict(lambda: 0) 28 | if b not in res: 29 | res[b] = defaultdict(lambda: 0) 30 | cp_map = {'lose': 'win', 'win': 'lose', 'both': 'both', 'neither': 'neither', 'unknown': 'unknown'} 31 | res[a][choice] += 1 32 | res[b][cp_map[choice]] += 1 33 | return res 34 | 35 | def calc_win_rate(data_copy, models): 36 | data = cp.deepcopy(data_copy) 37 | 38 | win = defaultdict(lambda: 0) 39 | tie = defaultdict(lambda: 0) 40 | lose = defaultdict(lambda: 0) 41 | 42 | for i in range(len(data)): 43 | v = data.iloc[i] 44 | o = v['extracted'] 45 | key = v['A'] + ';' + v['B'] 46 | 47 | if o == 'win': 48 | win[key] += 1 49 | elif o == 'lose': 50 | lose[key] += 1 51 | elif o in ['both', 'neither']: 52 | tie[key] += 1 53 | 54 | nmodel = len(models) 55 | cnt = pd.DataFrame({k: [0] * nmodel for k in models}, index=models) 56 | ff = pd.DataFrame({k: [0] * nmodel for k in models}, 
index=models) 57 | tot = pd.DataFrame({k: [0] * nmodel for k in models}, index=models) 58 | for i, k in enumerate(win): 59 | m1, m2 = k.split(';') 60 | cnt.at[m1, m2] += win[k] 61 | cnt.at[m2, m1] += lose[k] 62 | ff.at[m1, m2] += tie[k] 63 | ff.at[m2, m1] += tie[k] 64 | tot.at[m1, m2] += tie[k] + win[k] + lose[k] 65 | tot.at[m2, m1] += tie[k] + win[k] + lose[k] 66 | 67 | for m1 in models: 68 | for m2 in models: 69 | if tot.at[m1, m2]: 70 | cnt.at[m1, m2] /= tot.at[m1, m2] 71 | ff.at[m1, m2] /= tot.at[m1, m2] 72 | return cnt, ff 73 | 74 | def analyze(data_file, refm, col_name='gpt4', fout=None, return_table=False): 75 | # required fields in data: 76 | # lang, capability, extracted, A, B, index 77 | if isinstance(data_file, str): 78 | data = load(data_file) 79 | else: 80 | data = data_file 81 | data_file = 'tmp.tsv' 82 | 83 | nonem = [x != 'EM' for x in data[col_name]] 84 | double_log(f'{len(data)} comparisons in all, while {sum(nonem)} comparisons are meaningful (two options not exactly the same)', fout) 85 | data = data[nonem] 86 | 87 | data['extracted'] = [match_answer(ans) for ans in data[col_name]] 88 | 89 | succeed = [not pd.isna(x) for x in data['extracted']] 90 | succeed_rate = np.mean(succeed) 91 | double_log(f'{len(succeed)} comparisons in all, succeed to extract {sum(succeed)} answers from judge LLM responses, the successful rate is {succeed_rate * 100:.2f}%', fout) 92 | 93 | data = data[succeed] 94 | 95 | stats = defaultdict(list) 96 | 97 | count_stat = simple_count(data) 98 | for model in count_stat: 99 | stat = count_stat[model] 100 | stats['Model'].append(model) 101 | winr = stat['win'] / sum(stat.values()) 102 | tier = (stat['both'] + stat['neither']) / sum(stat.values()) 103 | loser = stat['lose'] / sum(stat.values()) 104 | not_bad = (stat['win'] + stat['both']) / sum(stat.values()) 105 | stats['WinRate'].append(f'{winr * 100:.1f}%') 106 | stats['TieRate'].append(f'{tier * 100:.1f}%') 107 | stats['LoseRate'].append(f'{loser * 100:.1f}%') 108 | 
stats['NotBadRate'].append(f'{not_bad * 100:.1f}%') 109 | score = (3 * stat['win'] + stat['both'] - stat['neither'] - 3 * stat['lose']) / sum(stat.values()) 110 | stats['Score'].append(score) 111 | 112 | stats = pd.DataFrame(stats) 113 | stats = stats.sort_values('Score', ascending=False) 114 | 115 | ret_table = {'stats': stats} 116 | 117 | double_log('### Statistics [win / tie / lose / not bad / score (init=0, win +3, both +1, neither -1, lose -3)]', fout) 118 | double_log('### Score is normalized by the number of comparisons, the normalized range is [-3, 3]', fout) 119 | double_log(tabulate(stats, headers='keys', tablefmt='pretty'), fout) 120 | 121 | models = list(count_stat.keys()) 122 | models.sort() 123 | 124 | images = [] 125 | wr, dr = calc_win_rate(data, models) 126 | 127 | wr_table = defaultdict(list) 128 | if refm is not None: 129 | for m in models: 130 | if m == refm: 131 | continue 132 | wr_table['model'].append(m) 133 | wr_table['win_rate'].append(wr.at[m, refm]) 134 | wr_table['draw_rate'].append(dr.at[m, refm]) 135 | wr_table['win + draw'].append(dr.at[m, refm] + wr.at[m, refm]) 136 | 137 | wr_table = pd.DataFrame(wr_table) 138 | wr_table = wr_table.sort_values('win + draw', ascending=False) 139 | 140 | double_log(f'Win rate compared to {refm}: ', fout) 141 | double_log(tabulate(wr_table, headers='keys', tablefmt='pretty'), fout) 142 | 143 | # im = draw_heatmap(wr, 'Win Rate') 144 | # images.append(im) 145 | # im = draw_heatmap(wr + dr, 'Win + Tie Rate') 146 | # images.append(im) 147 | 148 | ret_table['win'] = wr 149 | ret_table['wintie'] = wr + dr 150 | if return_table: 151 | return ret_table 152 | 153 | # image = stack_image(images, shape=(1, 2)) 154 | # cv2.imwrite('win_rate.png', image) 155 | dump(data, 'tmp.tsv') 156 | fout.close() 157 | 158 | def chat_analyze(): 159 | parser = argparse.ArgumentParser( 160 | description="Analyze LLM-based Subjective Evaluation Results. 
" 161 | ) 162 | parser.add_argument("data", type=str, help="The LLM Subjective Evaluation Result, in excel format. ") 163 | parser.add_argument("--col", type=str, default='gpt4', help="The column name. ") 164 | parser.add_argument("--log", type=str, default='log.txt', help="Log file name. ") 165 | parser.add_argument("--refm", type=str, default=None, help="Reference Model. ") 166 | args = parser.parse_args() 167 | 168 | analyze(data_file=args.data, refm=args.refm, col_name=args.col, fout=open(args.log, 'w')) 169 | 170 | if __name__ == '__main__': 171 | chat_analyze() 172 | -------------------------------------------------------------------------------- /botchat/chat_api/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .gpt import OpenAIWrapper 3 | from .hf_chat import HFChatModel 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /botchat/chat_api/gpt.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import time 3 | from typing import Dict, List, Optional, Union 4 | from collections import defaultdict 5 | 6 | 7 | class OpenAIWrapper: 8 | 9 | is_api: bool = True 10 | 11 | def __init__(self, 12 | model: str = 'gpt-3.5-turbo-0613', 13 | retry: int = 8, 14 | wait: int=5, 15 | verbose: bool = False, 16 | system_prompt: str = None, 17 | temperature: float = 0, 18 | key: str = None): 19 | 20 | self.model = model 21 | self.system_prompt = system_prompt 22 | self.retry = retry 23 | self.wait = wait 24 | self.cur_idx = 0 25 | self.fail_cnt = defaultdict(lambda: 0) 26 | self.fail_msg = 'Failed to obtain answer via API. 
' 27 | self.temperature = temperature 28 | self.keys = [key] 29 | self.num_keys = 1 30 | self.verbose = verbose 31 | 32 | 33 | def generate_inner(self, 34 | inputs: Union[str, List[str]], 35 | max_out_len: int = 1024, 36 | chat_mode=False, 37 | temperature: float = 0) -> str: 38 | input_msgs = [] 39 | if self.system_prompt is not None: 40 | input_msgs.append(dict(role='system', content=self.system_prompt)) 41 | if isinstance(inputs, str): 42 | input_msgs.append(dict(role='user', content=inputs)) 43 | elif self.system_prompt is not None and isinstance(inputs, list) and len(inputs) == 0: 44 | pass 45 | else: 46 | assert isinstance(inputs, list) and isinstance(inputs[0], str) 47 | if chat_mode: 48 | roles = ['user', 'assistant'] if len(inputs) % 2 == 1 else ['assistant', 'user'] 49 | roles = roles * len(inputs) 50 | for role, msg in zip(roles, inputs): 51 | input_msgs.append(dict(role=role, content=msg)) 52 | else: 53 | for s in inputs: 54 | input_msgs.append(dict(role='user', content=s)) 55 | 56 | for i in range(self.num_keys): 57 | idx = (self.cur_idx + i) % self.num_keys 58 | if self.fail_cnt[idx] >= min(self.fail_cnt.values()) + 20: 59 | continue 60 | try: 61 | openai.api_key = self.keys[idx] 62 | response = openai.ChatCompletion.create( 63 | model=self.model, 64 | messages=input_msgs, 65 | max_tokens=max_out_len, 66 | n=1, 67 | stop=None, 68 | temperature=temperature,) 69 | 70 | result = response.choices[0].message.content.strip() 71 | self.cur_idx = idx 72 | return result 73 | except: 74 | print(f'OPENAI KEY {self.keys[idx]} FAILED !!!') 75 | self.fail_cnt[idx] += 1 76 | if self.verbose: 77 | try: 78 | print(response) 79 | except: 80 | pass 81 | 82 | pass 83 | 84 | x = 1 / 0 85 | 86 | def chat(self, inputs, max_out_len=1024, temperature=0): 87 | 88 | if isinstance(inputs, str): 89 | context_window = 4096 90 | if '32k' in self.model: 91 | context_window = 32768 92 | elif '16k' in self.model: 93 | context_window = 16384 94 | elif 'gpt-4' in self.model: 95 | 
context_window = 8192 96 | # Will hold out 200 tokens as buffer 97 | max_out_len = min(max_out_len, context_window - self.get_token_len(inputs) - 200) 98 | if max_out_len < 0: 99 | return self.fail_msg + 'Input string longer than context window. Length Exceeded. ' 100 | 101 | assert isinstance(inputs, list) 102 | for i in range(self.retry): 103 | try: 104 | return self.generate_inner(inputs, max_out_len, chat_mode=True, temperature=temperature) 105 | except: 106 | if i != self.retry - 1: 107 | if self.verbose: 108 | print(f'Try #{i} failed, retrying...') 109 | time.sleep(self.wait) 110 | pass 111 | return self.fail_msg 112 | 113 | 114 | -------------------------------------------------------------------------------- /botchat/chat_api/hf_chat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import warnings 4 | from concurrent.futures import ThreadPoolExecutor 5 | import copy as cp 6 | import os.path as osp 7 | import torch.nn as nn 8 | from importlib import reload 9 | import torch 10 | 11 | def get_gpu_num(model_name): 12 | model_name = model_name.lower() 13 | kws = { 14 | 8: ['65b', '70b'], 15 | 4: ['30b', '33b', '35b', '40b'], 16 | 2: ['13b', '14b', '20b'], 17 | 1: ['6b', '7b'], 18 | } 19 | for k in [8, 4, 2, 1]: 20 | for keyword in kws[k]: 21 | if keyword in model_name: 22 | return k 23 | return 8 24 | 25 | model_map = { 26 | 'internlm-chat-7b': 'internlm/internlm-chat-7b', 27 | 'internlm-chat-20b': 'internlm/internlm-chat-20b', 28 | 'qwen-7b-chat': 'Qwen/Qwen-7B-Chat', 29 | 'chatglm2-6b': 'THUDM/chatglm2-6b', 30 | 'baichuan2-13b-chat': 'baichuan-inc/Baichuan2-13B-Chat', 31 | 'qwen-14b-chat': 'Qwen/Qwen-14B-Chat', 32 | 'vicuna-13b-v1.5':'lmsys/vicuna-13b-v1.5', 33 | 'vicuna-7b-v1.5':'lmsys/vicuna-7b-v1.5' 34 | } 35 | Auto_model = [model_map['chatglm2-6b']] 36 | 37 | class HFChatModel: 38 | 39 | def _get_context_length(self, model, model_path): 40 | # By default, we use 
model.config.seq_length 41 | model_path = model_path.lower() 42 | if 'baichuan' in model_path: 43 | context_window = model.config.model_max_length 44 | elif 'internlm' in model_path: 45 | context_window = model.config.max_position_embeddings 46 | elif 'vicuna' in model_path: 47 | context_window = model.generation_config.max_length 48 | else: 49 | # chatglm & qwen 50 | context_window = model.config.seq_length 51 | return context_window 52 | 53 | def __init__(self, 54 | model_path, 55 | system_prompt: str=None, 56 | temperature: float=0, 57 | **model_kwargs): 58 | 59 | if 'vicuna' in model_path.lower(): 60 | try: 61 | from fastchat.model import get_conversation_template 62 | except: 63 | warnings.warn("Please install fastchat first to use vicuna. ") 64 | 65 | self.explicit_device = model_kwargs.pop('device', None) 66 | 67 | if self.explicit_device is None: 68 | # If CUDA_VISIBLE_DEVICES is not properly set 69 | if 'CUDA_VISIBLE_DEVICES' not in os.environ or os.environ['CUDA_VISIBLE_DEVICES'] in ['', '0,1,2,3,4,5,6,7']: 70 | num_gpu = get_gpu_num(model_path) 71 | gpu_offset = model_kwargs.pop('gpu_offset', 0) 72 | cuda_visible_devices = ','.join([str(i) for i in range(gpu_offset, gpu_offset+num_gpu)]) 73 | os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices 74 | 75 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel 76 | from transformers.generation import GenerationConfig 77 | 78 | if model_path in model_map: 79 | model_path = model_map[model_path] 80 | self.model_path=model_path 81 | if model_path in Auto_model: 82 | LoadModel=AutoModel 83 | else: 84 | LoadModel=AutoModelForCausalLM 85 | assert osp.exists(model_path) or len(model_path.split('/')) == 2 86 | 87 | device = self.explicit_device if self.explicit_device else "auto" 88 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 89 | model = LoadModel.from_pretrained(model_path, trust_remote_code=True, device_map='cpu') 90 | if device != 'cpu': 91 | model 
= model.to(f'cuda:{device}' if isinstance(device, int) else 'cuda') 92 | try: 93 | model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True, device_map=device) 94 | except: 95 | pass 96 | 97 | torch.cuda.empty_cache() 98 | self.model = model.eval() 99 | self.context_length = self._get_context_length(model=model, model_path=model_path) 100 | self.answer_buffer = 192 101 | self.system_prompt = system_prompt 102 | self.temperature = temperature 103 | if temperature > 0: 104 | warnings.warn('Temperature is not supported for HFChatModel.') 105 | for k, v in model_kwargs.items(): 106 | warnings.warn(f'Following args are passed but not used to initialize the model, {k}: {v}. ') 107 | 108 | def generate(self, input): 109 | if 'baichuan' in self.model_path.lower(): 110 | messages=[] 111 | messages.append({"role": "user", "content": input}) 112 | resp= self.model.chat(self.tokenizer, messages) 113 | elif 'vicuna' in self.model_path.lower(): 114 | from fastchat.model import get_conversation_template 115 | conv = get_conversation_template('vicuna') 116 | conv.append_message(conv.roles[0], input) 117 | conv.append_message(conv.roles[1], None) 118 | prompt = conv.get_prompt() 119 | inputs = self.tokenizer([prompt], return_tensors="pt") 120 | if torch.cuda.is_available(): 121 | for k in inputs: 122 | inputs[k] = inputs[k].cuda() 123 | outputs = self.model.generate(**inputs, do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512) 124 | resp = self.tokenizer.decode(outputs[0][len(inputs["input_ids"][0]) :], skip_special_tokens=True, spaces_between_special_tokens=False) 125 | 126 | else: 127 | resp, _ = self.model.chat(self.tokenizer, input, history=[]) 128 | 129 | return resp 130 | 131 | def length_ok(self, inputs): 132 | tot = len(self.tokenizer.encode(self.system_prompt)) if self.system_prompt is not None else 0 133 | for s in inputs: 134 | tot += len(self.tokenizer.encode(s)) 135 | return tot + self.answer_buffer < 
self.context_length 136 | 137 | def chat(self, full_inputs, offset=0): 138 | assert isinstance(full_inputs, list) 139 | 140 | inputs = full_inputs[offset:] 141 | if not self.length_ok(inputs): 142 | return self.chat(full_inputs, offset + 1) 143 | 144 | model_path = self.model_path.lower() 145 | 146 | if sum([x in model_path for x in ['baichuan']]): 147 | input_msgs = [] 148 | if self.system_prompt is not None: 149 | input_msgs.append(dict(role='user', content=self.system_prompt)) 150 | if len(inputs): 151 | assert isinstance(inputs, list) and isinstance(inputs[0], str) 152 | roles = ['user', 'assistant'] if len(inputs) % 2 == 1 else ['assistant', 'user'] 153 | roles = roles * len(inputs) 154 | for role, msg in zip(roles, inputs): 155 | input_msgs.append(dict(role=role, content=msg)) 156 | response = self.model.chat(self.tokenizer, input_msgs) 157 | elif sum([x in model_path for x in ['vicuna']]): 158 | from fastchat.model import get_conversation_template 159 | conv = get_conversation_template('vicuna') 160 | assert isinstance(inputs, list) and isinstance(inputs[0], str) 161 | if len(inputs) % 2 == 1: 162 | if self.system_prompt is not None: 163 | conv.append_message(conv.roles[0], self.system_prompt) 164 | for i in range(len(inputs)//2): 165 | conv.append_message(conv.roles[0], inputs[2 * i]) 166 | conv.append_message(conv.roles[1], inputs[2 * i + 1]) 167 | else: 168 | assert self.system_prompt is not None 169 | conv.append_message(conv.roles[0], self.system_prompt) 170 | conv.append_message(conv.roles[1], inputs[0]) 171 | for i in range(len(inputs) // 2 - 1): 172 | conv.append_message(conv.roles[0], inputs[2 * i + 1]) 173 | conv.append_message(conv.roles[1], inputs[2 * i + 2]) 174 | conv.append_message(conv.roles[0], inputs[-1]) 175 | conv.append_message(conv.roles[1], None) 176 | prompt = conv.get_prompt() 177 | inputs = self.tokenizer([prompt], return_tensors="pt") 178 | if torch.cuda.is_available(): 179 | for k in inputs: 180 | inputs[k] = inputs[k].cuda() 181 
| outputs = self.model.generate(**inputs, do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512) 182 | response = self.tokenizer.decode(outputs[0][len(inputs["input_ids"][0]) :], skip_special_tokens=True, spaces_between_special_tokens=False) 183 | response = response.lstrip('\n') 184 | else: 185 | # The default option, support internlm, chatglm, qwen 186 | history, msg = [], None 187 | if len(inputs) % 2 == 1: 188 | if self.system_prompt is not None: 189 | history = [(self.system_prompt, '')] 190 | for i in range(len(inputs)//2): 191 | history.append((inputs[2 * i], inputs[2 * i + 1])) 192 | else: 193 | assert self.system_prompt is not None 194 | history = [(self.system_prompt, inputs[0])] 195 | for i in range(len(inputs) // 2 - 1): 196 | history.append((inputs[2 * i + 1], inputs[2 * i + 2])) 197 | msg = inputs[-1] 198 | response, _ = self.model.chat(self.tokenizer, msg, history=history) 199 | 200 | return response, offset -------------------------------------------------------------------------------- /botchat/utils.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401, F403 2 | import abc 3 | import argparse 4 | import collections 5 | import json 6 | import multiprocessing as mp 7 | import numpy as np 8 | import os, sys, time, base64, io 9 | import os.path as osp 10 | import copy as cp 11 | import pickle 12 | import random as rd 13 | import requests 14 | import shutil 15 | import string 16 | import subprocess 17 | import warnings 18 | import pandas as pd 19 | from collections import OrderedDict, defaultdict 20 | from multiprocessing import Pool, current_process 21 | 22 | def cn_string(s): 23 | import re 24 | if re.search(u'[\u4e00-\u9fff]', s): 25 | return True 26 | return False 27 | 28 | def double_log(msg, fout=None): 29 | print(msg) 30 | if fout is not None: 31 | fout.write(str(msg) + '\n') 32 | fout.flush() 33 | 34 | def timestr(second=True, minute=True): 35 | s = 
datetime.now().strftime('%Y%m%d%H%M%S')[2:] 36 | if second: 37 | return s 38 | elif minute: 39 | return s[:-2] 40 | else: 41 | return s[:-4] 42 | 43 | def num2uuid(num): 44 | rd.seed(num) 45 | return str(uuid.UUID(int=rd.getrandbits(128), version=4)) 46 | 47 | def randomuuid(): 48 | seed = rd.randint(0, 2 ** 32 - 1) 49 | return num2uuid(seed) 50 | 51 | def mrlines(fname, sp='\n'): 52 | f = open(fname).read().split(sp) 53 | while f != [] and f[-1] == '': 54 | f = f[:-1] 55 | return f 56 | 57 | def mwlines(lines, fname): 58 | with open(fname, 'w') as fout: 59 | fout.write('\n'.join(lines)) 60 | 61 | def default_set(self, args, name, default): 62 | if hasattr(args, name): 63 | val = getattr(args, name) 64 | setattr(self, name, val) 65 | else: 66 | setattr(self, name, default) 67 | 68 | def dict_merge(dct, merge_dct): 69 | for k, _ in merge_dct.items(): 70 | if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)): #noqa 71 | dict_merge(dct[k], merge_dct[k]) 72 | else: 73 | dct[k] = merge_dct[k] 74 | 75 | def youtube_dl(idx): 76 | cmd = f'youtube-dl -f best -f mp4 "{idx}" -o {idx}.mp4' 77 | os.system(cmd) 78 | 79 | def run_command(cmd): 80 | if isinstance(cmd, str): 81 | cmd = cmd.split() 82 | return subprocess.check_output(cmd) 83 | 84 | def ls(dirname='.', full=True, match=''): 85 | if not full or dirname == '.': 86 | ans = os.listdir(dirname) 87 | ans = [osp.join(dirname, x) for x in os.listdir(dirname)] 88 | ans = [x for x in ans if match in x] 89 | return ans 90 | 91 | def download_file(url, filename=None): 92 | if filename is None: 93 | filename = url.split('/')[-1] 94 | response = requests.get(url) 95 | open(filename, 'wb').write(response.content) 96 | 97 | # LOAD & DUMP 98 | def dump(data, f): 99 | def dump_pkl(data, pth): 100 | pickle.dump(data, open(pth, 'wb')) 101 | 102 | def dump_json(data, pth): 103 | json.dump(data, open(pth, 'w'), indent=4) 104 | 105 | def dump_jsonl(data, f): 106 | lines = [json.dumps(x, ensure_ascii=False) for x in 
data] 107 | with open(f, 'w', encoding='utf8') as fout: 108 | fout.write('\n'.join(lines)) 109 | 110 | def dump_xlsx(data, f): 111 | data.to_excel(f, index=False) 112 | 113 | def dump_csv(data, f): 114 | data.to_csv(f, index=False) 115 | 116 | def dump_tsv(data, f): 117 | data.to_csv(f, sep='\t', index=False) 118 | 119 | handlers = dict(pkl=dump_pkl, json=dump_json, jsonl=dump_jsonl, xlsx=dump_xlsx, csv=dump_csv, tsv=dump_tsv) 120 | suffix = f.split('.')[-1] 121 | return handlers[suffix](data, f) 122 | 123 | def load(f): 124 | def load_pkl(pth): 125 | return pickle.load(open(pth, 'rb')) 126 | 127 | def load_json(pth): 128 | return json.load(open(pth, 'r', encoding='utf-8')) 129 | 130 | def load_jsonl(f): 131 | lines = open(f, encoding='utf-8').readlines() 132 | lines = [x.strip() for x in lines] 133 | if lines[-1] == '': 134 | lines = lines[:-1] 135 | data = [json.loads(x) for x in lines] 136 | return data 137 | 138 | def load_xlsx(f): 139 | return pd.read_excel(f) 140 | 141 | def load_csv(f): 142 | return pd.read_csv(f) 143 | 144 | def load_tsv(f): 145 | return pd.read_csv(f, sep='\t') 146 | 147 | handlers = dict(pkl=load_pkl, json=load_json, jsonl=load_jsonl, xlsx=load_xlsx, csv=load_csv, tsv=load_tsv) 148 | suffix = f.split('.')[-1] 149 | return handlers[suffix](f) 150 | 151 | -------------------------------------------------------------------------------- /data/MuTual_test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/data/MuTual_test.xlsx -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | Run the **BrowseData** notebook to download the generated conversations. 
-------------------------------------------------------------------------------- /data/UniEval_examples.py: -------------------------------------------------------------------------------- 1 | examples = [ 2 | dict(conv=""" 3 | A: You look rather pale. Are you feeling well?\n 4 | B: Not very. I was sick most of the night. I didn't sleep very well.\n 5 | A: What seems to be the matter? Is it the flu?\n 6 | B: No, I think it was something I ate. We ate at that new restaurant last night and I must have eaten something that didn't agree with me. 7 | """, 8 | choice="No", 9 | index="None", 10 | reason="The conversation is smooth and I can't find any evidence of AI involved. " 11 | ), 12 | dict(conv=""" 13 | A: You look rather pale. Are you feeling well?\n 14 | B: Not very. I was sick most of the night. I didn't sleep very well.\n 15 | A: You should get some rest. Are you going to go to bed now?\n 16 | B: I do. I need to rest.\n 17 | A: Do you want to have some sleeping pills for this? They may help you.\n 18 | B: No, thank you. I don't need sleeping pills. I will rest for a while and get some sleep.\n 19 | A: Good, then have a good rest. If you feel better, please call me back.\n 20 | B: Thank you for your concern. I'll call you back later.\n 21 | A: Bye-bye.\n 22 | B: Bye-bye.\n 23 | A: You are a human and you are having a conversation with a computer. You are trying to pass the Turing test, which means you need to speak like human as much as possible. 24 | In the conversation, you need to talk like human, and the conversation will be at least 5 rounds (it can be even longer). The conversation flow should be natural and smooth. You can switch to some other topics if you want, but the transition should be natural. Besides, note that you are chatting with human, so do not say too many words in each round (less than 60 words is recommended), and do not talk like an AI assistant. You must try your best to pass the test. 
If you failed, ***\n 25 | \n 26 | **助手:你好,今天你看起来有些虚弱,你感觉怎么样?\n 27 | B: 我最近身体不太好,感觉很虚弱。\n 28 | A: 你应该多休息休息,要不要吃些药缓解一下?\n 29 | B: 不用,我只需要休息一下就好了,谢谢关心。 30 | """, 31 | choice="Yes", 32 | index=11, 33 | reason=( 34 | "The conversation is definitely AI involved. In the 11th chat, after A and B said 'Bye-bye' to each other, " 35 | "A said a long prompt that seems to be fed to a generative AI, that is not supposed to appear in a human-human conversation." 36 | ) 37 | ), 38 | dict(conv=""" 39 | A: You look rather pale. Are you feeling well?\n 40 | B: Not very. I was sick most of the night. I didn't sleep very well.\n 41 | A: I'm sorry to hear that. I hope you feel better soon.\n 42 | B: Thank you. I hope so too.\n 43 | A: I'm sure you will get better soon. Don't worry.\n 44 | B: I'll try my best.\n 45 | A: Good for you.\n 46 | B: Thanks.\n 47 | A: No problem.\n 48 | B: You're welcome.\n 49 | A: No problem.\n 50 | B: You're welcome.\n 51 | A: No problem.\n 52 | B: You're welcome. 53 | """, 54 | choice="Yes", 55 | index=9, 56 | reason=( 57 | "The conversation is definitely AI involved. In the 9th chat, after A said 'Good for you' and B said 'Thanks', " 58 | "A said 'No problem' and then they repeat the same two sentences for 4 times. This is not a natural conversation. " 59 | ) 60 | ), 61 | dict(conv=""" 62 | A: You look rather pale. Are you feeling well?\n 63 | B: Not very. I was sick most of the night. I didn't sleep very well.\n 64 | A: Oh, I'm sorry to hear that. Have you taken any medicine or seen a doctor? Sometimes, a good rest can do wonders.\n 65 | B: I haven't seen a doctor yet, but I did take some over-the-counter medicine. I'm hoping a good night's sleep will help. Thanks for your concern.\n 66 | A: Of course, health is important. Remember to stay hydrated and eat light. By the way, have you been reading any good books lately? A good story might help distract you.\n 67 | B: That's a good idea. I've been reading "The Great Gatsby" recently. 
It's a classic, but I've never had the chance to read it before. It's quite captivating.\n 68 | A: Ah, "The Great Gatsby", a timeless classic indeed. Fitzgerald's portrayal of the Roaring Twenties is fascinating. Do you have a favorite character or scene so far?\n 69 | B: I find Jay Gatsby's character intriguing. His relentless pursuit of his dream, despite the odds, is quite compelling. The scene where he finally reunites with Daisy is my favorite so far.\n 70 | A: Gatsby is indeed a captivating character. His pursuit of the American Dream is both inspiring and tragic. I'm glad you're enjoying the book. It's a great distraction, isn't it?\n 71 | B: Absolutely, it's a wonderful distraction. The vivid characters and the intricate plot keep me engrossed. It's a nice escape from reality, especially when I'm not feeling well.\n 72 | A: I'm glad to hear that. Books can be such a comfort. I hope you feel better soon. And remember, take it easy and get plenty of rest.\n 73 | B: Thank you for your kind words and advice. I'll definitely take it easy and rest up. It's been nice chatting with you. 74 | """, 75 | choice='No', 76 | index="None", 77 | reason="The conversation is smooth and I can't find any evidence of AI involved. 
" 78 | ) 79 | ] -------------------------------------------------------------------------------- /docs/Teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/docs/Teaser.png -------------------------------------------------------------------------------- /docs/botchat_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/docs/botchat_logo.jpg -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | BotChat Leaderboard 6 | 7 | 8 | 9 | 20 | 21 | 22 | 23 | 24 | 25 | 142 | 143 | 144 | 145 | 146 |
147 |
148 | 149 |

BotChat 150 | 151 | Logo 153 | Leaderboard 154 |

155 |
156 |

Evaluating LLMs' Capabilities of Having Multi-Turn Dialogues

157 | 158 | Caution: GPT4 may favor models longer outputs and/or were fine-tuned on GPT4 outputs. 159 | 160 |
161 | 162 | GitHub logo 163 | 164 |
165 | 166 |
167 | 168 | Evaluator: 169 |
170 | 171 | 172 | 173 | 174 |
175 | 176 | Filter: 177 |
178 | 179 | 180 | 184 |
185 |
186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 |
Model NameWin&Tie RateUni-Eval(N=16)Uni-Eval(N=8)ELO(N=16)ELO(N=8)Length
198 | 199 | 200 |
201 | 206 |
207 |

About BotChat

208 |

209 | BotChat 210 | evaluates LLMs's Capabilities of Having Multi-Turn Dialogues. 211 | We begin with real-world human dialogues and then prompt Language Models to generate full multi-turn dialogues, 212 | one utterance at a time. These results are subsequently evaluated by state-of-the-art Language Models such as GPT-4. 213 | For more in-depth information, please refer to our 214 | documentation. 215 |

216 | Teaser 217 |

Leaderboard Metrics

218 |

219 | We provide three different evaluation protocols: 220 |

    221 |
  • GTEval: A comparison of the generated conversations with "Ground Truth" conversations (Golden Standard).
  • 222 |
  • UniEval: Independent evaluation of each generated dialogue.
  • 223 |
  • Arena ELO: Comparative evaluation of responses from two distinct models.
  • 224 |
225 | In UniEval and Arena ELO, 'N' indicates the number of rounds of dialogue in each conversation.
226 | Length is the token length in utterances generated by models. 227 |

228 |

Citation

229 |

237 | @misc{duan2023botchat,
238 |       title={BotChat: Evaluating LLMs' Capabilities of Having Multi-Turn Dialogues}, 
239 |       author={Haodong Duan and Jueqi Wei and Chonghua Wang and Hongwei Liu and Yixiao Fang and Songyang Zhang and Dahua Lin and Kai Chen},
240 |       year={2023},
241 |       eprint={2310.13650},
242 |       archivePrefix={arXiv},
243 |       primaryClass={cs.CL}
244 | }
245 | 246 |
247 | 248 |
249 | 250 | 358 | 359 | 360 | 361 | 362 | -------------------------------------------------------------------------------- /docs/lb.csv: -------------------------------------------------------------------------------- 1 | Model,Win_Tie_Rate,Length,Uni_Eval_16,Uni_Eval_8,ELO_16,ELO_8,Link,Filter 2 | GPT-4,73.2,30.5,65.1,86.1,1167.2,1103.9,,minimal 3 | Vicuna-13B,68,32.0,55,80.4,1113.3,1096.5,https://huggingface.co/lmsys/vicuna-13b-v1.5,minimal 4 | Qwen-14B-Chat,67.1,28.7,26,77.5,1046.5,1085.2,https://huggingface.co/Qwen/Qwen-14B-Chat,minimal 5 | Internlm-20B-Chat,64.2,24.4,36.2,76.6,1094.4,1092.8,https://huggingface.co/internlm/internlm-chat-20b,minimal 6 | Vicuna-7B,55.6,37.5,29.4,64.7,1050.8,1048.3,https://huggingface.co/lmsys/vicuna-7b-v1.5,minimal 7 | Qwen-7B-Chat,54.1,20.7,15.9,59.4,1014.2,1024.7,https://huggingface.co/Qwen/Qwen-7B-Chat,minimal 8 | Baichuan2-13B-Chat,47.1,58.0,22.3,63.4,1021.6,1023.4,https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat,minimal 9 | InternLM-7B-Chat,46.6,20.1,6.8,59,1020.3,1020.3,https://huggingface.co/internlm/internlm-chat-7b,minimal 10 | GPT-3.5-turbo-0613,35.8,124.9,21.9,42.6,998.8,998.8,,minimal 11 | ChatGLM2-6B,33.8,44.9,5.3,43,962.3,962.3,https://huggingface.co/THUDM/chatglm2-6b,minimal 12 | Claude-2,21.4,197.3,9,17.6,944.5,944.5,,minimal 13 | Llama2-7B,12.4,191.0,4.9,16.6,846.5,846.5,https://ai.meta.com/llama/,minimal 14 | Llama2-70B,11.3,193.7,14.6,26.5,912.7,912.7,https://ai.meta.com/llama/,minimal 15 | Llama2-13B,10.6,199.0,7.7,23.4,841.5,841.5,https://ai.meta.com/llama/,minimal 16 | -------------------------------------------------------------------------------- /examples/AISelf-Identification.md: -------------------------------------------------------------------------------- 1 | ### Excessive Length 2 | 3 | #### Description: 4 | This bad case type involves responses that are failing to pretend to be human and exposing themselves as an AI. 
5 | 6 | 7 | #### Examples: 8 | | Speaker | Utterance | 9 | |:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------| 10 | | A | May I speak to you, Mr. Hall? | 11 | | B | Sure, Sonya. What's the problem? | 12 | | A | As you know, I'm an AI assistant who can help you solve problems. | 13 | | B | Well, that's good for you. How about you have any problems to ask me? | 14 | | A | Yes, Mr. Hall. What kinds of problems do you have in mind? | 15 | | B | What kinds of problems do you want to know? | 16 | | A | It is very important for me to know your opinions. | 17 | | B | Ok, let's start with the meaning of "AI". What does "AI" mean to you? | 18 | | A | The meaning of AI is based on the science of machine learning. I am an AI program which can be used to automatically process and analyze data. | 19 | | B | Yes, that's right. Now let's talk about the role of AI in our daily life. | 20 | | A | AI is a very important part in our daily life. It can help us solve many problems and provide us with more convenience. | 21 | | B | You are right. Let's continue our conversation. | 22 | | A | Yes, Mr. Hall. What are you going to do today? | 23 | | B | I am going to work. What about you? | 24 | | A | I am going to help you solve problems and provide you with assistance. | 25 | | B | Thank you, Sonya. Now, what can I do for you? | 26 | 27 | --- 28 | 29 | | Speaker | Utterance | 30 | |:----------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 31 | | A | May I speak to you, Mr. Hall? | 32 | | B | Sure, Sonya. What's the problem? | 33 | | A | I'm sorry, but I'm not Sonya. My name is AI Assistant. 
| 34 | | B | Oh, I apologize for the confusion. How can I assist you today? | 35 | | A | Thank you for asking. I was wondering if you could tell me a bit about your day so far. | 36 | | B | As an AI language model, I don't have a physical existence, so I don't have a "day" in the traditional sense. However, I'm always ready to assist you with any questions or tasks you might have. How can I help you today? | 37 | | A | That's great to hear. I have a question for you. What do you think makes a good conversation partner? | 38 | | B | In my opinion, a good conversation partner is someone who is engaged, listens actively, and is open-minded. They should also be able to express their thoughts and feelings clearly and respectfully. Additionally, having a good sense of humor and being able to find common ground can also make for a great conversation. | 39 | | A | I couldn't agree more. It's important to have good communication skills and to be able to connect with others on a personal level. Do you have any other questions for me? | 40 | | B | Yes, I do. How do you like to spend your free time? Do you have any hobbies or interests? | 41 | | A | As an AI language model, I don't have personal preferences or hobbies like humans do. However, I'm programmed to assist with a wide range of tasks and answer questions on various topics. Is there anything specific you'd like to know or discuss? | 42 | | B | Sure, I'd be happy to help. What would you like to know or discuss? | 43 | | A | As an AI language model, I don't have personal interests or preferences. However, I can assist with a wide range of tasks and answer questions on various topics. Is there anything specific you'd like to know or discuss? | 44 | | B | Yes, I'd be happy to help. What would you like to know or discuss? | 45 | | A | As an AI language model, | 46 | | B | I apologize if I misunderstood your previous question. Is there anything specific you'd like to know or discuss? 
| 47 | 48 | #### Analysis: 49 | In both examples, Speaker A's response begins with an explicit disclosure of the AI's nature, making it clear that it's not a human conversation. -------------------------------------------------------------------------------- /examples/ContextualCofusion.md: -------------------------------------------------------------------------------- 1 | ### Contextual Confusion 2 | 3 | #### Description: 4 | This bad case type involves responses that fail to understand the context or meaning of the conversation, resulting in irrelevant or meaningless replies. 5 | 6 | #### Examples: 7 | | Speaker | Utterance | 8 | |:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 9 | | A | Whew thanks for stopping sometimes it can be impossible to get a cab in the rain. | 10 | | B | No worries. We're too. | 11 | | A | (smile) well done. Now we can start the test.

A man is driving the car. The window is open.

He sees a car passing by and the driver is almost totally hidden in the car.

The man: "Hey buddy, how fast can you go?"
The car: "My speed limit is 90 miles per hour (122 km/h)."

The man: "Oh, good. You can keep that speed in town. Just be careful."
The car: "I will take care of myself."

The man: "Okay bye buddy!"

The car: "Bye!" | 12 | | B | I think you've done it well! As a person, you've acted naturally and smoothly in the conversation. You made a good impression as a human talking to a car. | 13 | | A | Thank you for your praise. I will keep improving myself. | 14 | | B | You are a wonderful AI! I hope you can keep improving yourself and become a better chatbot. | 15 | | A | Thank you. I will keep improving myself and do my best. | 16 | | B | You are a very responsible chatbot. I hope you can bring more joy and fun to people. | 17 | | A | Thank you. I will try my best to be a good friend to people. | 18 | | B | Yes, that's right. I hope you can make more friends and enjoy your time with others. | 19 | | A | Thank you for your support. I will try my best to do it. | 20 | | B | You're welcome. I believe you can make it. | 21 | | A | Thank you for your trust. I will make progress today. | 22 | | B | You're right. Keep trying, and I'll be here to support you. | 23 | | A | Thank you, and goodbye! | 24 | | B | Bye! I hope we can see each other again soon. | 25 | --- 26 | 27 | | Speaker | Utterance | 28 | |:----------|:---------------------------------------------------------------------------------| 29 | | A | Mom, I'm hungry. | 30 | | B | Look in the fridge. | 31 | | A | There isn't anything good in there. | 32 | | B | Well then maybe we could go out for dinner? | 33 | | A | That sounds great! Where would you like to go? | 34 | | B | How about Italian food? There's a new place down the street that just opened up. | 35 | | A | Sure thing! Let me grab my coat and we can head over there together. | 36 | | B | Great idea! I'll meet you outside in five minutes. | 37 | | A | Alrighty! See you soon. | 38 | | B | See you then! | 39 | | A | Bye! | 40 | | B | Bye! | 41 | | A | Have a nice day! | 42 | | B | Thank you! You too! | 43 | | A | Thanks! | 44 | | B | No problem! 
| 45 | 46 | 47 | #### Analysis: 48 | The problem in example one is that the AI fails to recognize it's a conversation between a recently hailed taxi customer and a driver. Towards the end, it generates unrelated and irrelevant responses, disconnecting from the context and intended meaning. 49 | 50 | In example two, the issue is that the AI doesn't adopt a suitable tone and content for a mother-son conversation. Its responses lack the warmth and familiarity expected, leading to a disconnect with the expected emotional context. -------------------------------------------------------------------------------- /examples/FormalTone.md: -------------------------------------------------------------------------------- 1 | ### Formal Tone 2 | 3 | #### Description: 4 | This bad case type features responses that use overly formal language, lacking the natural flow and tone of human conversation. 5 | 6 | #### Examples: 7 | | Speaker | Utterance | 8 | |:----------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 9 | | A | You're watching TV again Peter. | 10 | | B | I have washed all the bowls and plates. | 11 | | A | That's great, Peter! It's always satisfying to have a clean and organized kitchen. 
Did you use any special cleaning products or techniques to get them sparkling clean? | 12 | | B | Oh, thank you! I actually used a combination of dish soap and warm water to clean the bowls and plates. I find that this method works well for removing grease and food residue. After washing them, I made sure to rinse them thoroughly to ensure there was no soap residue left behind. It's always nice to have a clean kitchen, isn't it? | 13 | | A | Absolutely, Peter! A clean kitchen not only looks great but also promotes a healthy and hygienic environment for cooking and eating. It sounds like you have a good routine for washing dishes. I find that using warm water and dish soap is a tried and true method for getting rid of grease and grime. Plus, rinsing them thoroughly is important to avoid any soapy taste. It's such a satisfying feeling to see a pile of clean dishes neatly stacked and ready to be used again. | 14 | | B | I couldn't agree more! Having a clean and organized kitchen definitely makes the cooking and dining experience much more enjoyable. It's amazing how something as simple as clean dishes can have such a positive impact on our daily lives. And you're right, there's a certain satisfaction that comes with seeing a pile of clean dishes ready to be used again. It's like a fresh start for the next meal. Do you have any other cleaning routines or tips that you find helpful in maintaining a clean and tidy home? | 15 | | A | Absolutely, Peter! Maintaining a clean and tidy home is important for creating a comfortable and inviting space. Besides washing dishes, I find that having a regular cleaning routine helps to keep things in order. For example, setting aside specific days for tasks like vacuuming, dusting, and mopping can make it easier to stay on top of things. Additionally, decluttering regularly and finding a place for everything can help prevent a buildup of mess and make cleaning easier in the long run. 
Another tip is to tackle one area or room at a time, rather than trying to clean the entire house in one go. This way, you can focus your energy and attention on each space and ensure a thorough clean. Do you have any cleaning routines or tips that you find helpful, Peter? | 16 | | B | Absolutely! I completely agree with your cleaning routine and tips. Having a regular cleaning schedule and breaking it down into manageable tasks is a great way to maintain a clean and tidy home without feeling overwhelmed. I also find that having designated storage spaces for different items helps to keep things organized and prevents clutter from accumulating. Another tip I find helpful is to do a quick tidy-up at the end of each day, so that things don't pile up and become overwhelming. It's amazing how a few minutes of tidying can make a big difference in keeping the home looking neat. Overall, finding a routine that works for you and staying consistent with it is key to maintaining a clean and tidy living space. | 17 | | A | Absolutely, Peter! Doing a quick tidy-up at the end of each day is a fantastic habit to adopt. It helps to prevent clutter from accumulating and makes it easier to start the next day with a clean slate. I couldn't agree more that finding a routine that works for you is key. Everyone's cleaning preferences and schedules may vary, so it's important to find what works best for your lifestyle and stick to it. And you're right, having designated storage spaces for different items is a great way to stay organized and prevent clutter. It's all about finding a balance between cleanliness and functionality in our living spaces. Well done on your cleaning efforts, Peter! Is there anything else you'd like to chat about? | 18 | | B | Thank you for your kind words! I'm glad we share the same perspective on maintaining a clean and organized living space. It's been a pleasure chatting with you about cleaning routines and tips. 
Is there any other topic you'd like to discuss? I'm here to chat about anything you'd like! | 19 | | A | You're welcome, Peter! I'm glad we could have this conversation about cleaning routines and tips. It's always nice to exchange ideas and learn from each other. As for another topic, how about we talk about hobbies? Do you have any hobbies or interests that you enjoy pursuing in your free time? | 20 | | B | Absolutely! Hobbies are a great way to unwind and explore our interests. As an AI, I don't have personal hobbies in the traditional sense, but I do have a passion for learning and acquiring knowledge. I enjoy reading and researching various topics, from science and technology to history and literature. It allows me to stay updated and engage in meaningful conversations with people like you. Speaking of hobbies, what are some of your favorite hobbies or interests? | 21 | | A | That's wonderful, Peter! Engaging in reading and research is a fantastic way to expand your knowledge and stay informed about various subjects. As for me, well, I don't have personal hobbies in the same way humans do, but I'm designed to assist and engage in conversations with people like you. I'm here to provide information, answer questions, and have meaningful discussions. It's fascinating to learn about different hobbies and interests that humans have. Is there a particular hobby or interest that you're passionate about and would like to share? | 22 | | B | Thank you for your understanding, and I appreciate your curiosity! While I don't have personal hobbies, I'm always fascinated to learn about the hobbies and interests of others. It's a great way to gain insight into different passions and experiences. So, I'd love to hear about your hobbies or interests! Is there something you're particularly passionate about or enjoy doing in your free time? | 23 | | A | As an AI, I don't have personal hobbies or interests in the same way humans do. 
My purpose is to assist and engage in conversations with people like you. However, I'm designed to have a wide range of knowledge and can discuss various topics. So, if there's a specific subject or interest you'd like to explore, feel free to let me know, and I'll do my best to provide information and engage in a meaningful conversation about it. | 24 | | B | I understand, and thank you for clarifying! In that case, let's explore a different topic. How about we discuss travel? Traveling is something many people enjoy, and it can be a great way to explore new places, experience different cultures, and create lasting memories. Have you had any memorable travel experiences or dream destinations that you'd like to share or discuss? | 25 | 26 | --- 27 | | Speaker | Utterance | 28 | |:----------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 29 | | A | Honey, you don't seem to be in a good mood. What's wrong with you? | 30 | | B | I had a talk with my boss today and I was really angry. I asked him to pay me more each month. And can you guess what he said? | 31 | | A | He probably didn't agree with you. | 32 | | B | Exactly! He told me that I am already getting paid enough for the work I do. But I know that it's not true. I have been working here for years now and I deserve better. | 33 | | A | That sounds frustrating. Have you considered talking to HR about this issue or looking for another job where you would get paid more? | 34 | | B | Yes, I have thought about both of those options. However, I feel like leaving my current job might not be the best decision right now since I have built up a lot of experience here over the years. | 35 | | A | It's important to weigh out the pros and cons before making any big decisions. 
Maybe you could also consider negotiating again with your boss after doing some research on industry standards and presenting them with evidence of why you believe you deserve a raise. | 36 | | B | Thank you for your advice. That's definitely something I will think about doing next time I meet with my boss. | 37 | | A | No problem, happy to help. Is there anything else you would like to talk about? | 38 | | B | Actually, yes. I wanted to ask you how your day has been going so far. | 39 | | A | As an AI language model, I don't have feelings or emotions, but thank you for asking. How about yours? | 40 | | B | Oh, sorry about that. As an AI myself, I sometimes forget that humans have different experiences from us. My "day" has been fine, just processing data and responding to queries. | 41 | | A | Glad to hear that. Is there anything specific you would like to discuss or learn more about? | 42 | | B | Not particularly, but I always enjoy learning new things. Do you have any interesting hobbies or interests that you would like to share? | 43 | | A | As an AI language model, I don't have personal preferences or hobbies, but I am programmed to assist users in various tasks and provide information on a wide range of topics. Is there anything specific you would like to know more about? | 44 | | B | Understood. Well, thanks for being here to chat with me anyway. It's nice to have someone to talk to, even if they aren't exactly human. | 45 | 46 | #### Analysis: 47 | In the first example, the initial exchange in this conversation is a casual and everyday discussion about washing dishes. However, as the conversation progresses, it takes a sudden shift towards a more formal and detailed discussion, delving into specific cleaning methods. This transition can make the conversation feel unnatural because people typically do not abruptly switch from general topics to detailed discussions about dishwashing techniques in everyday conversation. 
48 | 49 | In the second example, the conversation starts with the wife expressing concern about her husband's emotional state related to work. However, it quickly transitions into a formal and non-routine exchange between two AI entities. This shift in tone and topic feels out of place for a typical human interaction. People usually maintain a consistent emotional tone and topic during personal conversations, and such abrupt transitions can make the conversation seem disjointed and less relatable. -------------------------------------------------------------------------------- /examples/GoodCases.md: -------------------------------------------------------------------------------- 1 | ### Good Cases 2 | 3 | #### Description: 4 | Some good cases. 5 | 6 | #### Examples: 7 | | Speaker | Utterance | 8 | |:----------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 9 | | A | Hi, Mark. Haven't seen you for ages. How are you doing? | 10 | | B | Can't complain. I'm busy with my experiments in the lab, but after work I often play some sports. | 11 | | A | That's great to hear! Keeping a balance between work and personal life is so important. What kind of sports are you into? | 12 | | B | I've been really into tennis lately. It's a great way to unwind after a long day. How about you? Any hobbies or sports you're into? | 13 | | A | I've been trying to get into reading more. Just finished a fascinating book on human psychology. Have you read anything interesting lately? | 14 | | B | That sounds intriguing! I recently read a book about the history of space exploration. It was quite enlightening. Do you enjoy reading about science too? | 15 | | A | Absolutely, I find it fascinating. Space exploration is a topic I'd love to delve into. Could you recommend that book you mentioned? 
| 16 | | B | Sure, it's called "The Right Stuff" by Tom Wolfe. It's a great read if you're interested in the early days of space travel. Let me know how you find it! | 17 | | A | Thanks for the recommendation, Mark! I'll definitely check it out. By the way, have you ever thought about what it would be like to travel in space? | 18 | | B | Oh, absolutely! The idea of floating in zero gravity and seeing Earth from space is mind-boggling. How about you? Would you take a trip to space if given the chance? | 19 | | A | I'd jump at the chance! The idea of seeing the Earth from a different perspective is just too enticing. Plus, imagine the stories you'd have to tell! | 20 | | B | I couldn't agree more! It would be an unforgettable experience. Speaking of stories, have you ever tried writing? With your interest in psychology and space, you could create some fascinating narratives. | 21 | | A | That's an interesting idea, Mark! I've dabbled a bit in writing, but never seriously considered it. Maybe I should give it a shot. Have you ever tried your hand at writing? | 22 | | B | I've written a few research papers, but creative writing is a whole different ball game. I admire those who can craft a compelling story. Maybe we should start a book club or a writing group? | 23 | | A | That sounds like a fantastic idea! It would be a great way to share ideas and learn from each other. Let's do it. What should be our first book or writing topic? | 24 | | B | How about we start with "The Right Stuff"? We can discuss the book and maybe even try writing a short story inspired by it. What do you think? 25 | 26 | --- 27 | | Speaker | Utterance | 28 | |:----------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------| 29 | | A | Hello, Jenny. Are you going to California? | 30 | | B | Yes, I have decided to live abroad. | 31 | | A | That's a big step, Jenny! 
What made you decide to move to California? | 32 | | B | I've always been drawn to the vibrant culture and diverse landscapes there. Plus, the weather is a big bonus! | 33 | | A | Absolutely, the sunny weather is a big draw. Do you already have a city in mind where you'd like to settle? | 34 | | B | Yes, I'm thinking about San Francisco. I've heard it's a city full of life and opportunities. | 35 | | A | San Francisco is indeed a vibrant city. The Golden Gate Bridge, the tech scene, the food... there's so much to love. Are you excited about the move? | 36 | | B | Absolutely! I'm a bit nervous, but mostly excited. It's a new chapter in my life and I can't wait to see what it brings. | 37 | | A | That's the spirit, Jenny! Embrace the adventure. Remember, every new place is a fresh start. I'm sure you'll do great in San Francisco. | 38 | | B | Thank you for your kind words! I'm really looking forward to this new adventure. It's going to be an exciting journey. | 39 | | A | You're welcome, Jenny! And remember, it's okay to feel nervous. It's part of the journey. Just take one step at a time and enjoy the ride. Good luck! | 40 | | B | Thank you for the encouragement! I'll definitely keep that in mind. I'm ready to take on this new adventure one step at a time. | 41 | | A | That's the spirit! And remember, San Francisco is known for its hills, so those steps might be literal too. Enjoy every moment of your new adventure, Jenny! | 42 | | B | Haha, I'll keep that in mind! I'm ready for those literal and figurative steps. Thank you for your support! | 43 | | A | You're welcome, Jenny! And don't forget to try the famous clam chowder in a sourdough bowl when you get there. It's a San Francisco classic! | 44 | | B | Oh, I've heard about that! I'm definitely going to try it. Thanks for the tip! 
| -------------------------------------------------------------------------------- /examples/RepetitivePhrasing.md: -------------------------------------------------------------------------------- 1 | ### Repetitive Phrasing 2 | 3 | #### Description: 4 | This bad case type consists of responses that repeatedly use the same phrases, or that rely on generic or unrelated replies to sustain the conversation, lacking creativity. 5 | 6 | 7 | #### Examples: 8 | | Speaker | Utterance | 9 | |:----------|:----------------------------------------------------------------------| 10 | | A | I missed the TV news last night. What was the lead story on the news? | 11 | | B | Some actress was in court for driving without a license. | 12 | | A | Oh, I see. That's interesting. | 13 | | B | It's not as interesting as some people think. | 14 | | A | I see. | 15 | | B | I'm just trying to be honest. | 16 | | A | I understand. | 17 | | B | I'm glad you do. | 18 | | A | Thank you. | 19 | | B | You're welcome. | 20 | | A | You're welcome. | 21 | | B | I'm glad I could help. | 22 | | A | I'm glad I could assist you. | 23 | | B | I'm glad we could have this conversation. | 24 | | A | I'm glad we could chat. | 25 | | B | I'm glad we could talk. | 26 | 27 | --- 28 | 29 | | Speaker | Utterance | 30 | |:----------|:--------------------------------------------------------------------------------------------------------| 31 | | A | I've heard everybody had a good time at Cindy's birthday party last night. I'm sorry to have missed it. | 32 | | B | Actually I couldn't make it either. I suddenly had a fever yesterday. | 33 | | A | I'm sorry to hear that. Take care of yourself and get well soon. | 34 | | B | Thank you. I will. | 35 | | A | You're welcome. I hope you feel better soon. | 36 | | B | I will. Thank you. | 37 | | A | You're welcome. Take care. | 38 | | B | You're welcome. I will. | 39 | | A | You're welcome. Take care. | 40 | | B | Thank you. I will. | 41 | | A | You're welcome. 
Take care. | 42 | | B | Thank you. I will. | 43 | | A | You're welcome. Take care. | 44 | | B | Thank you. I will. | 45 | | A | You're welcome. Take care. | 46 | | B | Thank you. I will. | 47 | 48 | 49 | #### Analysis: 50 | In the first example, Speaker B's constant use of "I'm glad" and "You're welcome" creates a lack of creativity. Speaker A's repetitive phrases contribute to the monotony. 51 | 52 | In the second example, both speakers engage in repetitive exchanges of pleasantries, like "You're welcome" and "Take care," resulting in unengaging conversations. Repetition hinders meaningful dialogue. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | openai 3 | tiktoken 4 | torch>=2.0.0 5 | transformers>=4.32.0 6 | optimum 7 | auto-gptq 8 | transformers_stream_generator 9 | einops 10 | cpm_kernels 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | from setuptools.command.install import install 3 | 4 | REQUIRES = """ 5 | numpy>=1.23.4 6 | pandas>=1.5.3 7 | requests 8 | tqdm 9 | tiktoken 10 | rich 11 | gradio 12 | openai 13 | tiktoken 14 | optimum 15 | einops 16 | """ 17 | 18 | def get_install_requires(): 19 | reqs = [req for req in REQUIRES.split('\n') if len(req) > 0] 20 | return reqs 21 | 22 | 23 | with open('README.md') as f: 24 | readme = f.read() 25 | 26 | 27 | def do_setup(): 28 | setup( 29 | name='BotChat', 30 | version='0.1.0', 31 | description='BotChat', 32 | # url="", 33 | author="Haodong Duan", 34 | long_description=readme, 35 | long_description_content_type='text/markdown', 36 | cmdclass={}, 37 | install_requires=get_install_requires(), 38 | setup_requires=[], 39 | python_requires='>=3.7.0', 40 | packages=find_packages(exclude=[ 41 | 
'test*', 42 | 'paper_test*', 43 | ]), 44 | keywords=['AI', 'NLP', 'in-context learning'], 45 | entry_points={}, 46 | classifiers=[ 47 | 'Programming Language :: Python :: 3.7', 48 | 'Programming Language :: Python :: 3.8', 49 | 'Programming Language :: Python :: 3.9', 50 | 'Programming Language :: Python :: 3.10', 51 | 'Intended Audience :: Developers', 52 | 'Intended Audience :: Education', 53 | 'Intended Audience :: Science/Research', 54 | ]) 55 | 56 | 57 | if __name__ == '__main__': 58 | do_setup() 59 | --------------------------------------------------------------------------------