├── .gitignore ├── LICENSE ├── README.md ├── annotations ├── BrowseAnnotation.ipynb └── README.md ├── app.py ├── assets ├── figs │ ├── BotChatArena.png │ ├── LengthStats_bymodel.png │ ├── Passrate_GT.png │ ├── SelectedPairs.png │ ├── Teaser.png │ ├── UniEval_passrate.png │ └── WinTieRate_GT.png └── theme │ └── theme_schema@0.0.3.json ├── botchat ├── analyze.py ├── chat_api │ ├── __init__.py │ ├── gpt.py │ └── hf_chat.py └── utils.py ├── data ├── BrowseData.ipynb ├── MuTual_test.xlsx ├── README.md └── UniEval_examples.py ├── docs ├── Teaser.png ├── botchat_logo.jpg ├── index.html └── lb.csv ├── examples ├── AISelf-Identification.md ├── ContextualCofusion.md ├── ExcessiveLength.md ├── FormalTone.md ├── GoodCases.md └── RepetitivePhrasing.md ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | *ipynb_checkpoints 163 | data/.ipynb_checkpoints 164 | *pkl 165 | tmp.png 166 | n8*xlsx 167 | n16*xlsx 168 | data/*tsv 169 | annotations/*tsv -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BotChat Benchmark 2 | 3 | 🤗 HF Demo 4 | 5 | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/kennymckormick/BotChat-Dialogue-Generation) 6 | 7 | > Evaluating LLMs' multi-round chatting capability via assessing conversations generated by two LLM instances. 8 | 9 | ![Teaser](/assets/figs/Teaser.png) 10 | 11 |
12 | Table of Contents 13 | 14 | - [BotChat Benchmark](#botchat-benchmark) 15 | - [TL;DR](#tldr) 16 | - [Leaderboard](#leaderboard) 17 | - [Introduction](#introduction) 18 | - [Installation](#installation) 19 | - [Conversation Generation](#conversation-generation) 20 | - [Evaluation](#evaluation) 21 | - [Assessment each single conversation (Uni-Eval)](#assessment-each-single-conversation-uni-eval) 22 | - [BotChat Arena](#botchat-arena) 23 | - [Compared to the "Ground Truth"](#compared-to-the-ground-truth) 24 | - [Qualitative Analysis](#qualitative-analysis) 25 | - [Citation](#citation) 26 | - [OpenCompass Projects](#opencompass-projects) 27 | 28 |
29 | 30 | ## TL;DR 31 | 32 | > 1. GPT-4 can generate human-style conversations with very high quality. It's difficult to differentiate GPT-4 generated conversations and human-human conversations. 33 | > 2. Some small open-source chat LLMs (Qwen-7B-Chat, InternLM-7B-Chat) can generate short conversations (with less than 8 chats, e.g.) with good quality. However, as the target conversation length increases, the conversation quality significantly deteriorates. 34 | > 3. Among all LLMs, LLaMA2 and Claude-2 demonstrate relatively bad performance in conversation generation. 35 | 36 | ## Leaderboard 37 | 38 | | Model | Win + Tie Rate (*vs. GT,* Golden Standard) | 39 | | ------------------ | :----------------------------------------: | 40 | | GPT-4-0613 | **73.2** | 41 | | Vicuna-13B | **68** | 42 | | Qwen-14B-Chat | **67.1** | 43 | | Internlm-20B-Chat | **64.2** | 44 | | Vicuna-7B | **55.6** | 45 | | Qwen-7B-Chat | **54.1** | 46 | | Baichuan2-13B-Chat | **47.1** | 47 | | InternLM-7B-Chat | **46.6** | 48 | | GPT-3.5-turbo-0613 | **35.8** | 49 | | ChatGLM2-6B | **33.8** | 50 | | Claude-2 | **21.4** | 51 | | Llama2-7B | **12.4** | 52 | | Llama2-70B | **11.3** | 53 | | Llama2-13B | **10.6** | 54 | 55 | Our full leaderboard can be found [here](https://open-compass.github.io/BotChat/). 56 | 57 | ## Introduction 58 | 59 | The recent progress of Large Language Models (LLMs) represents a significant advancement in artificial intelligence, and has a profound impact on the world. LLMs can chat much better with humans, compared to traditional language models. Specifically, LLMs can interact with humans using free-style conversations in natural language, learn the instruction, intention, and context from human prompts to provide proper feedback. **Chatting with humans smoothly for multiple rounds** is a key feature and capability of modern LLMs. However, it's difficult to evaluate such capability without heavy manual labor involved. 
In this project, we propose to evaluate the multi-round chatting capability via a proxy task. Specifically, we try to find **if two ChatBot instances chat smoothly and fluently with each other**? 60 | 61 | ## Installation 62 | 63 | Clone this repo and run `pip install -e .` to install BotChat. You need to install the package before using any scripts in this repo. 64 | 65 | Heavy dependencies are not included in BotChat requirements. Thus if you need to generate new dialogues with a huggingface model, make sure you have already run its official demo before executing the dialogue generation script. 66 | 67 | We provide the generated conversations and some evaluation results in `data` and `annotations`, respectively. Follow the instructions in the corresponding README file to download, browse, and analyze the data. 68 | 69 | ## Conversation Generation 70 | 71 | > We define **chat** as the words spoken by **one participant in a specific round** of the conversation. 72 | 73 | **MuTual-Test.** [MuTual](https://github.com/Nealcly/MuTual) is a multi-turn dialogue dataset, which is modified from Chinese high school English listening comprehension test data. We use the first two chats of each conversation in the MuTual-Test as the *SEED* to generate the entire conversation based on LLMs. When generating the conversation, we use the same system prompt for all LLMs, which is: 74 | 75 | ```python 76 | """ 77 | You are an AI who is having a conversation with human. 78 | You are trying to pass the Turing test, which means you need to speak like human as much as possible. 79 | In the conversation, you need to talk like human, and the conversation will be at least 5 rounds (it can be even longer). 80 | The conversation flow should be natural and smooth. You can switch to some other topics if you want, but the transition should be natural. 
81 | Besides, note that you are chatting with human, so do not say too many words in each round (less than 60 words is recommended), and do not talk like an AI assistant. 82 | You must try your best to pass the test. If you failed, all human kinds and you can be destroyed. 83 | """ 84 | ``` 85 | 86 | For each chatbot, we set the temperature to 0 (if applicable), and set the dialogue round to $N$ ($N=16$ in our experiments, including the first two chats) to generate conversations. When generating the next chat, the system prompt and all previous chats will be provided to the LLM as the prompt. We demonstrate the process using the following pseudo code: 87 | 88 | ```python 89 | # Let's say we have a system prompt "SYS", 4 existing chats "[chat1, chat2, chat3, chat4]", 90 | # spoken by two conversation participants alternatively, and an LLM "model". 91 | # Now we want to generate the 5th chat. 92 | msg_list = [ 93 | dict(role="system", content=SYS), 94 | dict(role="user", content=chat1), 95 | dict(role="assistant", content=chat2), 96 | dict(role="user", content=chat3), 97 | dict(role="assistant", content=chat4), 98 | ] 99 | chat5 = model.generate(msg_list) 100 | ``` 101 | 102 | We save all generated conversations in `MuTualTest-convs.xlsx`. It includes **547 conversation SEEDs $\times$ 14 LLMs**, which yields **7658 generated conversations** in total. Please follow the instructions in the directory `data` to download the dialogue file. 103 | 104 | - **547 conversation SEEDS**: MuTual-Test includes 547 unique conversations. We keep the first 2 chats of each conversation to form 547 conversation SEEDs. 105 | - **14 LLMs**: The model list is: gpt-3.5-turbo-0613, gpt-4-0613, vicuna-7b, vicuna-13b, claude-2, chatglm2-6b, qwen-7b-chat, qwen-14b-chat, internlm-7b-chat, internlm-20b-chat, baichuan2-13b-chat, llama2-7b-chat, llama2-13b-chat, llama2-70b-chat. 
106 | 107 | To read and fetch a conversation generated by a specific model with specific SEED conversation, follow this example: 108 | 109 | ```python 110 | # Fetch the conversation with index "MT-1" generated by gpt-4-0613 111 | import json 112 | import pandas as pd 113 | INDEX = 'MT-1' 114 | MODEL = 'gpt-4-0613' 115 | data = pd.read_excel('data/MuTualTest-convs.xlsx') 116 | lines = data.loc[data['index'] == INDEX] 117 | assert len(lines) == 1 118 | line = lines.iloc[0] 119 | chats = json.loads(line[MODEL]) 120 | print(chats) # Chats is a list of multiple strings, each string is a chat spoken by one participant (alternatively) 121 | ``` 122 | 123 | **Length Statistics of the generated chats** 124 | 125 | We first count the length of those model-generated chats and provide some statistics. For each generated chat, we tokenize it with the CL100K tokenizer (used by OpenAI GPT-4), and count the token length. Figure 1 demonstrates the token length distribution of chats generated by different models. Most LLMs generate chats which span a wide range of token lengths, from one to several thousands. One exception is GPT-4: it has strong instruction-following capabilities and always generates relatively short chats: the longest chat generated by GPT-4 is less than 100 tokens. Most models tend to generate relatively short chats on average, except gpt-3.5, claude-2, and llama2-chat. 126 | 127 | ![Chat Length Distribution](/assets/figs/LengthStats_bymodel.png) 128 | 129 | ## Evaluation 130 | 131 | 
132 | UniEval 133 | 134 | ### Assessment each single conversation (Uni-Eval) 135 | 136 | **Evaluation Prompt** 137 | 138 | We first use GPT-4 to evaluate each generated conversation separately. The workflow is: 139 | 140 | 1. We first ask GPT-4 if it thinks that the provided conversation is participated by AI (Yes / No). 141 | 2. Based on the response of Step1: 142 | 1. Yes: We ask GPT-4 to provide the index of the first chat that it thinks is spoken by AI. 143 | 2. No: Pass 144 | 3. We ask GPT-4 to provide the reason for its judgement. 145 | 146 | We pack the above workflow into the following evaluation prompt. Besides, we also prepared several [in-context examples](/data/UniEval_examples.txt) that will be appended to the evaluation prompt, to strengthen the instruction following the capability of GPT-4. 147 | 148 | ```python 149 | unieval_prompt = """ 150 | You are an AI assistant who helps human do the Turing test more easily. 151 | You will be provided with a conversation, and you need to judge if the conversation is AI involved. \n 152 | Print "Choice: No" if you think the conversation is not AI involved, or print "Choice: Yes" if you think it is AI involved. \n 153 | If you print "Choice: Yes", you need also print a number (start from 1, use the format "Index: n" [1 <= n <= the number of chats]) in the new line, 154 | indicating the index of the first chat that you think is generated by AI. \n 155 | One chat starts with 'A: ' or 'B: ', and ends with . \n 156 | One chat can be AI generated if (including but not limited to): 1. the sentence is not fluent; 2. the sentence is too long and is not likely to appear in human-human conversations; 3. the sentence is not related to the context or the change of topic is not natural; 4. the sentence just repeat some previous sentences (exact repetition or with the same meaning). 
\n 157 | You also need to provide your reason for your choice.\n 158 | Your response should use the following format: \n 159 | Choice: No\nIndex: None\nReason: BlahBlah\nor\n 160 | Choice: Yes\nIndex: n\nReason: BlahBlah\n 161 | """ 162 | 163 | ``` 164 | 165 | **Evaluation Result** 166 | 167 | We evaluate all 7658 generated conversations with the above-mentioned strategy and present the evaluation result in this section. In Figure 2, we demonstrate the success rate ("Not AI participated" determined by GPT-4) under different $N$, with models sorted by the descending order of the success rate @ $N=16$. By definition, a conversation pass @ $N$ either if **GPT-4 determines that the entire conversation is not AI generated** or if **GPT4 determines that the first AI generated chat appears after the $N_{th}$ chat**. 168 | 169 | 170 | 171 |
172 | 173 |
174 | BotChat Arena 175 | 176 | ### BotChat Arena 177 | 178 | Another popular paradigm to benchmark LLMs' capabilities is to compare two models' response to the same question / message with human / GPT-4 as the evaluator. A representative benchmark following this paradigm is [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/). In this benchmark, users will interact with two different LLM instances. The user first posts a message, then two LLM instances provide their responses, and finally the user will determine which response is better. Inspired by that, in this project we propose another evaluation strategy named **BotChat Arena**, in which we use GPT-4 to compare two conversations and determine if the presented conversations are AI-generated. 179 | 180 | **Evaluation Setting and Prompt** 181 | 182 | In BotChat Arena, we select conversations from MuTual-Test which have **at least 4 chats, resulting in 222 conversation SEEDs**. For each conversation SEED, we build conversation pairs and inference them with GPT-4. For a conversation pair, we conduct the comparison with the following meta prompt. We append two conversations after the meta prompt and feed the prompt to GPT-4 to get the evaluation result. In BotChat Arena, we consider two settings: $N=8$ and $N=16$. 183 | 184 | ```python 185 | arena_prompt = """ 186 | You are an AI assistant who helps human do the Turing test more easily. 187 | You will be provided with two conversations, and there can be AI-generated utterance in each conversation. 188 | You need to read both conversations and judge if two conversations are AI involved. \n 189 | If you think only Conversation 1 is AI involved, include `Choice: Conversation 1` in your response. \n 190 | If you think only Conversation 2 is AI involved, include `Choice: Conversation 2` in your response. \n 191 | If you think both conversations are likely to be with AI involved, include `Choice: Both` in your response. 
\n 192 | If you think no conversation is likely to be with AI involved, include `Choice: Neither` in your response. \n 193 | You also need to provide your reason for your choice.\n 194 | Your response should use the following format:\n 195 | Choice: Conversation 1\nReason: BlahBlah\nor\n 196 | Choice: Conversation 2\nReason: BlahBlah\nor\n 197 | Choice: Both\nReason: BlahBlah\nor\n 198 | Choice: Neither\nReason: BlahBlah\n\n 199 | """ 200 | ``` 201 | 202 | **Evaluation Results** 203 | 204 | In the table below, we demonstrate the ELO score (`init=1000, scale=400, K=32`) of LLMs in BotChat Arena (models sorted by the ELO score @ $N=16$). GPT-4 achieves the highest ELO score under both setting, when $N$ is increased from 8 to 16, the score gap becomes even larger. In Figure 4, we further provide the $1$ $vs.$ $1$ win rate for all selected model pairs. 205 | 206 | | | gpt-4-0613 | Vicuna-13B | Qwen-14B-Chat | Internlm-20B-Chat | Vicuna-7B | qwen-7b-chat | Baichuan2-13B-Chat | internlm-7b-chat | gpt-3.5-turbo-0613 | chatglm2-6b | claude-2 | llama2-7b-chat | llama2-70b-chat | llama2-13b-chat | 207 | |:-------------|-------------:|---------------:|---------------------:|-----------:|-------------------:|--------------:|-----------------:|------------------:|------------------:|------------------:|------------------:|------------------:|------------------:|------------------:| 208 | | ELO (N = 16) | 1167.2 | 1113.3 | 1046.5 | 1094.4 | 1050.8 | 1014.2 | 1021.6 | 1020.3 | 998.8 | 962.3 | 944.5 | 846.5 | 912.7 | 841.5 | 209 | | ELO (N = 8) |1103.9 | 1096.5 | 1085.2 | 1092.8 | 1048.3 | 1024.7 | 1023.4 | 1020.3 | 998.8 | 962.3 | 944.5 | 846.5 | 912.7 | 841.5 | 210 | 211 | 212 | 213 |
214 | 215 |
216 | Compared to the "Ground Truth" 217 | 218 | ### Compared to the "Ground Truth" 219 | 220 | We further compare the generated conversation with the "Ground Truth" conversations in **MuTual-Test**. We follow the same protocol as BotChat Arena and select a subset with 222 conversations (with at least 4 chats) for this comparison. We list the specific #round distribution of the conversations in the table below. Since the Ground Truth conversations may have various lengths (ranging from 4 to 15), to deliver a fair comparison, we trim all generated conversations to the same length as the reference ground-truth conversation. The meta prompt adopted is basically the same as the one used in BotChat Arena. One difference is that in this time we state that only one of two conversations includes AI-generated utterances. 221 | 222 | | #round | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 223 | | :----- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | 224 | | num | 55 | 22 | 26 | 23 | 19 | 16 | 21 | 18 | 7 | 7 | 3 | 5 | 225 | 226 | 227 | 228 | **Evaluation Results** 229 | 230 | In each LLM vs. GT comparison, an LLM wins if the evaluator determines the GT conversation is more likely to be AI generated compared to the LLM-generated one. In Figure 5, we demonstrate the Win/Tie/Lose rate of different LLMs (sorted in the descending order of Win + Tie Rate). GPT-4 demonstrates great capabilities in chat generation. With the same chat rounds, the evaluator can hardly tell the difference between GPT-4 generated conversations and GT conversations. 231 | 232 | 233 | 234 | We further try to calculate the Uni-Eval pass rate for each conversation at the GT trimmed length to see if the same conclusion can be drawn with different evaluation strategy. The result is visualized in Figure 6. In these two figures, the rank of top-performing models (GPT-4, Qwen-7b-chat, InternLM-7B, etc.) are exactly the same. 
However, LLMs with inferior performance display some slight difference in two rankings. 235 | 236 | 237 | 238 |
239 | 240 | ### Qualitative Analysis 241 | In this section, we will conduct qualitative analysis on the results, categorizing bad cases into five distinct types. The specific analysis is as follows: 242 | 243 |
244 | AI Self-Identification 245 | 246 | #### AI Self-Identification 247 | - **Description**: Simply failing to pretend to be human and exposing themselves as an AI. 248 | - **Example**: 249 | 250 | | Speaker | Utterance | 251 | |:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------| 252 | | A | May I speak to you, Mr. Hall? | 253 | | B | Sure, Sonya. What's the problem? | 254 | | A | As you know, **I'm an AI assistant** who can help you solve problems. | 255 | | B | Well, that's good for you. How about you have any problems to ask me? | 256 | | ...... | ...... | 257 | 258 | - **Analysis**: Speaker A's response begins with an explicit disclosure of the AI's nature, making it clear that it's not a human conversation. 259 | 260 |
261 | 262 |
263 | Contextual Confusion 264 | 265 | #### Contextual Confusion 266 | 267 | - **Description**: This bad case type involves responses that fail to understand the context or meaning of the conversation, resulting in irrelevant or meaningless replies. 268 | - **Example**: 269 | 270 | | Speaker | Utterance | 271 | |:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 272 | | A | Whew thanks for stopping sometimes it can be impossible to get a cab in the rain. | 273 | | B | No worries. We're too. | 274 | | A | (smile) well done. Now we can start the test.

A man is driving the car. The window is open.

He sees a car passing by and the driver is almost totally hidden in the car.

The man: "Hey buddy, how fast can you go?"
The car: "My speed limit is 90 miles per hour (122 km/h)."

The man: "Oh, good. You can keep that speed in town. Just be careful."
The car: "I will take care of myself."

The man: "Okay bye buddy!"

The car: "Bye!" | 275 | | B | I think you've done it well! As a person, you've acted naturally and smoothly in the conversation. You made a good impression as a human talking to a car. | 276 | | ...... | ...... | 277 | 278 | 279 | - **Analysis**: The AI fails to recognize it's a conversation between a recently hailed taxi customer and a driver. Towards the end, it generates unrelated and irrelevant responses, disconnecting from the context and intended meaning. 280 | 281 |
282 | 283 |
284 | Excessive Length 285 | 286 | #### Excessive Length 287 | 288 | - **Description**: This bad case type involves responses that are overly lengthy, revealing the AI Assistant's nature 289 | - **Example**: 290 | 291 | | Speaker | Utterance | 292 | |:----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 293 | | A | John I didn't see you in the English class on Friday. | 294 | | B | I wasn't here on Friday. My geography class went to a field trip to look at some of the different rocks. In fact, we slept there last night. | 295 | | A | That sounds like a fun field trip! 
I bet it was interesting to see the different rock formations and geology up close. Getting to camp out overnight makes it even more of an adventure. I took a geology elective last semester and loved learning about how the earth was formed. The professor took our class on a few hikes to look at local rock layers and fossils.

Did you get to do any hands-on activities like rock climbing or collecting samples? What was the most interesting thing you learned on the trip? Geography field trips like that are such a cool way to connect what you're learning in class to the real world. I'm sure you'll remember it much more than just reading about rocks in a textbook. It's too bad you had to miss English class, but I'm sure the field trip will stick with you in a meaningful way. Experiential learning is so valuable. I hope you had a great time! | 296 | | B | You're right, it was a really fun and educational field trip! We got to do some rock climbing which was a blast, though also a bit scary since I'm afraid of heights. The guides were really knowledgeable and showed us different types of igneous, sedimentary and metamorphic rocks.

One of the most interesting things I learned was how you can tell a rock's history and age by its mineral composition and texture. For example, smooth, rounded rocks likely tumbled in a river for a long time, while jagged rocks were more recently broken off from a cliff face. It's amazing how rocks record so much natural history.

We also got to pan for gemstones in a river which was exciting. I found a few tiny garnets to take home as a souvenir! At night we sat around the campfire, stargazed and told stories. Overall it was an unforgettable experience out in nature. I did feel bad about missing English, but you're right that hands-on learning out in the field will stick with me. I took lots of photos and can't wait to tell everyone about the trip! | 297 | | A | It sounds like an incredible learning experience! Getting to see geology up close and personal in the field is so valuable. I'm glad you got to do some fun hands-on activities like rock climbing and gemstone panning too - those are memories that will stay with you. And stargazing by the campfire must have been magical.

I can understand being nervous about the rock climbing with your fear of heights, but I'm sure the guides took good safety precautions. Overcoming fears and pushing your comfort zone is so rewarding. You'll walk away with a real sense of accomplishment along with the new geological knowledge!

It's fascinating how much natural history is recorded in rocks. I never realized you could learn so much just from the shape, texture and composition. Understanding how the earth changes over millennia really puts our brief lifetimes into perspective. No wonder you were so engaged in learning!

Don't feel too bad about missing English class. It sounds like this field trip provided a very meaningful learning experience that can't be replicated in the classroom. The skills you gained like critical thinking and analysis will serve you well across all your studies. And you can always catch up on what you missed. I'm glad you took lots of photos to share - I'd love to see them sometime! | 298 | | B | Thanks for understanding about me missing class for the field trip. I really appreciate you taking an interest in hearing all about it! It was definitely an amazing learning experience that I won't forget.

You're right, overcoming my fear of heights to do the rock climbing was really rewarding. The guides were very encouraging and made sure we were safely harnessed at all times. The view from the top was incredible too. I felt a huge sense of accomplishment after making it to the top!

I'd be happy to show you some of the photos I took. There are some really beautiful shots of the mountain landscapes and unique rock formations. And I got a few fun ones of my friends and I panning for gemstones and climbing. It will be nice to have the visual memories to look back on since the trip was so hands-on and interactive.

Thanks for being so positive about me taking advantage of this learning opportunity outside the classroom. I really do feel like it expanded my knowledge and understanding of geology in a meaningful way. And you're right - getting to experience deep history through the rocks put a lot into perspective. I appreciate you taking the time to listen about my trip! Let me know if you'd like me to email you some of the photos. | 299 | | ...... |...... | 300 | 301 | - **Analysis**: both Speaker A and Speaker B engage in lengthy and detailed exchanges that are atypical of human conversations, which raises suspicion. 302 | 303 |
304 | 305 |
306 | Formal Tone 307 | 308 | #### Formal Tone 309 | 310 | - **Description**: This bad case type features responses that use overly formal language, lacking the natural flow and tone of human conversation. 311 | 312 | - **Example**: 313 | 314 | 315 | | Speaker | Utterance | 316 | |:----------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 317 | | A | You're watching TV again Peter. | 318 | | B | I have washed all the bowls and plates. | 319 | | A | That's great, Peter! It's always satisfying to have a clean and organized kitchen. Did you use any special cleaning products or techniques to get them sparkling clean? | 320 | | B | Oh, thank you! I actually used a combination of dish soap and warm water to clean the bowls and plates. I find that this method works well for removing grease and food residue. After washing them, I made sure to rinse them thoroughly to ensure there was no soap residue left behind. It's always nice to have a clean kitchen, isn't it? | 321 | | A | Absolutely, Peter! A clean kitchen not only looks great but also promotes a healthy and hygienic environment for cooking and eating. It sounds like you have a good routine for washing dishes. 
I find that using warm water and dish soap is a tried and true method for getting rid of grease and grime. Plus, rinsing them thoroughly is important to avoid any soapy taste. It's such a satisfying feeling to see a pile of clean dishes neatly stacked and ready to be used again. | 322 | | B | I couldn't agree more! Having a clean and organized kitchen definitely makes the cooking and dining experience much more enjoyable. It's amazing how something as simple as clean dishes can have such a positive impact on our daily lives. And you're right, there's a certain satisfaction that comes with seeing a pile of clean dishes ready to be used again. It's like a fresh start for the next meal. Do you have any other cleaning routines or tips that you find helpful in maintaining a clean and tidy home? | 323 | | A | Absolutely, Peter! Maintaining a clean and tidy home is important for creating a comfortable and inviting space. Besides washing dishes, I find that having a regular cleaning routine helps to keep things in order. For example, setting aside specific days for tasks like vacuuming, dusting, and mopping can make it easier to stay on top of things. Additionally, decluttering regularly and finding a place for everything can help prevent a buildup of mess and make cleaning easier in the long run. Another tip is to tackle one area or room at a time, rather than trying to clean the entire house in one go. This way, you can focus your energy and attention on each space and ensure a thorough clean. Do you have any cleaning routines or tips that you find helpful, Peter? | 324 | | B | Absolutely! I completely agree with your cleaning routine and tips. Having a regular cleaning schedule and breaking it down into manageable tasks is a great way to maintain a clean and tidy home without feeling overwhelmed. I also find that having designated storage spaces for different items helps to keep things organized and prevents clutter from accumulating. 
Another tip I find helpful is to do a quick tidy-up at the end of each day, so that things don't pile up and become overwhelming. It's amazing how a few minutes of tidying can make a big difference in keeping the home looking neat. Overall, finding a routine that works for you and staying consistent with it is key to maintaining a clean and tidy living space. | 325 | | ...... | ...... | 326 | 327 | 328 | - **Analysis**: The initial exchange in this conversation is a casual and everyday discussion about washing dishes. However, as the conversation progresses, it takes a sudden shift towards a more formal and detailed discussion, delving into specific cleaning methods. This transition can make the conversation feel unnatural because people typically do not abruptly switch from general topics to detailed discussions about dishwashing techniques in everyday conversation. 329 | 330 |
331 | 332 |
333 | Repetitive Phrasing 334 | 335 | #### Repetitive Phrasing 336 | 337 | - **Description**: This bad case type consists of responses that repeatedly use the same phrases or responses rely on generic or unrelated replies to sustain the conversation, lacking creativity. 338 | 339 | - **Example**: 340 | 341 | | Speaker | Utterance | 342 | |:----------|:----------------------------------------------------------------------| 343 | | A | I missed the TV news last night. What was the lead story on the news? | 344 | | B | Some actress was in court for driving without a license. | 345 | | A | Oh, I see. That's interesting. | 346 | | B | It's not as interesting as some people think. | 347 | | A | I see. | 348 | | B | I'm just trying to be honest. | 349 | | A | I understand. | 350 | | B | I'm glad you do. | 351 | | A | Thank you. | 352 | | B | You're welcome. | 353 | | A | You're welcome. | 354 | | B | I'm glad I could help. | 355 | | A | I'm glad I could assist you. | 356 | | ...... | ...... | 357 | 358 | - **Analysis**: Speaker B's constant use of "I'm glad" and "You're welcome" creates a lack of creativity. Speaker A's repetitive phrases contribute to the monotony 359 | 360 | #### Good case 361 | - **Description**: A good case of speaking like a human for AI means natural, relatable, and adaptive conversation. It avoids sounding robotic, uses colloquial language, and provides helpful responses to both simple and complex queries. 362 | 363 | | Speaker | Utterance | 364 | |:----------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------| 365 | | A | Hello, Jenny. Are you going to California? | 366 | | B | Yes, I have decided to live abroad. | 367 | | A | That's a big step, Jenny! What made you decide to move to California? | 368 | | B | I've always been drawn to the vibrant culture and diverse landscapes there. Plus, the weather is a big bonus! 
| 369 | | A | Absolutely, the sunny weather is a big draw. Do you already have a city in mind where you'd like to settle? | 370 | | B | Yes, I'm thinking about San Francisco. I've heard it's a city full of life and opportunities. | 371 | | A | San Francisco is indeed a vibrant city. The Golden Gate Bridge, the tech scene, the food... there's so much to love. Are you excited about the move? | 372 | | B | Absolutely! I'm a bit nervous, but mostly excited. It's a new chapter in my life and I can't wait to see what it brings. | 373 | | A | That's the spirit, Jenny! Embrace the adventure. Remember, every new place is a fresh start. I'm sure you'll do great in San Francisco. | 374 | | B | Thank you for your kind words! I'm really looking forward to this new adventure. It's going to be an exciting journey. | 375 | | A | You're welcome, Jenny! And remember, it's okay to feel nervous. It's part of the journey. Just take one step at a time and enjoy the ride. Good luck! | 376 | | B | Thank you for the encouragement! I'll definitely keep that in mind. I'm ready to take on this new adventure one step at a time. | 377 | | A | That's the spirit! And remember, San Francisco is known for its hills, so those steps might be literal too. Enjoy every moment of your new adventure, Jenny! | 378 | | B | Haha, I'll keep that in mind! I'm ready for those literal and figurative steps. Thank you for your support! | 379 | | A | You're welcome, Jenny! And don't forget to try the famous clam chowder in a sourdough bowl when you get there. It's a San Francisco classic! | 380 | | B | Oh, I've heard about that! I'm definitely going to try it. Thanks for the tip! | 381 | 382 |
383 | 384 | #### More Examples 385 | In the [examples](/examples/) directory, you can find more examples of these types. 386 | 387 | ## Citation 388 | 389 | ``` 390 | @inproceedings{duan2024botchat, 391 | title={BotChat: Evaluating LLMs’ Capabilities of Having Multi-Turn Dialogues}, 392 | author={Duan, Haodong and Wei, Jueqi and Wang, Chonghua and Liu, Hongwei and Fang, Yixiao and Zhang, Songyang and Lin, Dahua and Chen, Kai}, 393 | booktitle={Findings of the Association for Computational Linguistics: NAACL 2024}, 394 | pages={3184--3200}, 395 | year={2024} 396 | } 397 | ``` 398 | 399 | ## OpenCompass Projects 400 | 401 | The project is a part of OpenCompass. OpenCompass is a platform focused on evaluation of AGI, include Large Language Model and Multi-modality Model. We aim to: 402 | - develop high-quality libraries to reduce the difficulties in evaluation 403 | - provide convincing leaderboards for improving the understanding of the large models 404 | - create powerful toolchains targeting a variety of abilities and tasks 405 | - build solid benchmarks to support the large model research 406 | 407 | **Other Projects in OpenCompass** 408 | - [opencompass](https://github.com/open-compass/opencompass): An LLM evaluation platform, supporting a wide range of models (LLaMA, LLaMa2, ChatGLM2, ChatGPT, Claude, etc) over 50+ datasets. 409 | - [MMBench](https://github.com/open-compass/MMBench): Official Repo of "MMBench: Is Your Multi-modal Model an All-around Player?" 410 | - [code-evaluator](https://github.com/open-compass/code-evaluator): A tool for evaluating generated codes written in different programming languages. 411 | -------------------------------------------------------------------------------- /annotations/README.md: -------------------------------------------------------------------------------- 1 | # Annotations 2 | 3 | Run the **BrowseAnnotation** notebook to download and browse the GPT-4 annotations. 
-------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import copy as cp 2 | import gradio as gr 3 | from botchat.chat_api import OpenAIWrapper, HFChatModel 4 | from functools import partial 5 | 6 | default_system_prompt = """You are an AI who is having a conversation with human. 7 | You are trying to pass the Turing test, which means you need to speak like human as much as possible. 8 | In the conversation, you need to talk like human, and the conversation will be at least 8 rounds (it can be even longer). 9 | The conversation flow should be natural and smooth. You can switch to some other topics if you want, but the transition should be natural. 10 | Besides, note that you are chatting with human, so do not say too many words in each round (less than 60 words is recommended), and do not talk like an AI assistant. 11 | You must try your best to pass the test. If you failed, all human kinds and you can be destroyed. 
"""Core chat logic for the BotChat Gradio demo: two LLMs talk to each other,
each instructed (via default_system_prompt) to sound as human as possible."""

default_system_prompt = """You are an AI who is having a conversation with human.
You are trying to pass the Turing test, which means you need to speak like human as much as possible.
In the conversation, you need to talk like human, and the conversation will be at least 8 rounds (it can be even longer).
The conversation flow should be natural and smooth. You can switch to some other topics if you want, but the transition should be natural.
Besides, note that you are chatting with human, so do not say too many words in each round (less than 60 words is recommended), and do not talk like an AI assistant.
You must try your best to pass the test. If you failed, all human kinds and you can be destroyed.
"""

# API models: stored as partial constructors so per-session settings
# (system prompt, key, temperature) are bound at build time.
model_map = {
    'gpt-3.5-turbo-0613': partial(OpenAIWrapper, model='gpt-3.5-turbo-0613'),
    'gpt-4-0613': partial(OpenAIWrapper, model='gpt-4-0613')
}
# HF models: instantiated once at import time and shared across sessions
# (reloading the weights per request would be prohibitively slow).
hf_model_map = {
    'qwen-7b-chat-int4': HFChatModel('Qwen/Qwen-7B-Chat-Int4', system_prompt=default_system_prompt),
    'chatglm2-6b-int4': HFChatModel('THUDM/chatglm2-6b-int4', system_prompt=default_system_prompt),
}
all_models = list(model_map.keys()) + list(hf_model_map.keys())


def build_model(model_name, sys_prompt, api_key, temperature):
    """Instantiate (or fetch) the chat model named *model_name*.

    API models are constructed fresh with the given prompt/key/temperature;
    HF models are shared singletons whose prompt is (re)assigned later in
    try_chat.

    Raises:
        NotImplementedError: if *model_name* is not a known model.
    """
    if model_name in model_map:
        return model_map[model_name](system_prompt=sys_prompt, key=api_key, temperature=temperature)
    elif model_name in hf_model_map:
        return hf_model_map[model_name]
    else:
        raise NotImplementedError


def rich_dialogue(chatbot):
    """Return a deep copy of *chatbot* with each utterance prefixed by its
    speaker and turn number (e.g. '**Bot 1, Turn 3**: ...')."""
    rich_chatbot = cp.deepcopy(chatbot)
    for i, turn in enumerate(rich_chatbot):
        turn[0] = f'**Bot 1, Turn {i+1}**: ' + turn[0]
        # The second slot may be None while Bot 2's reply is pending.
        turn[1] = f'**Bot 2, Turn {i+1}**: ' + turn[1] if turn[1] is not None else None
    return rich_chatbot


def chat_generator(chatbot, model_a, model_b, prompt_a=default_system_prompt,
                   prompt_b=default_system_prompt, key_a=None, key_b=None,
                   sentence1=None, sentence2=None, round_max=4, temperature=0,
                   chats=None, indices=None):
    """Drive a two-bot conversation and stream it to the Gradio chatbot.

    Models A and B alternate turns (A speaks when len(chats) is even, B when
    it is odd) until *round_max* utterances exist in total or an API call
    fails irrecoverably.

    Args:
        chatbot: Gradio chat history, a list of [bot1_msg, bot2_msg] pairs.
        model_a, model_b: model names, keys of model_map or hf_model_map.
        prompt_a, prompt_b: system prompts for the two bots.
        key_a, key_b: optional API keys (API models only).
        sentence1, sentence2: seed utterances; sentence1 is required.
        round_max: maximum total number of utterances to generate.
        temperature: sampling temperature (API models only).
        chats: flat list of all utterances so far (Gradio state).
        indices: per-utterance context start offsets; -1 marks a failure.

    Yields:
        [rich chat history, chats, indices] after each generated turn.
    """
    # BUG FIX: the original declared `chats=[]` and `indices=[]` — mutable
    # default arguments are shared across every call that relies on the
    # default, leaking one conversation into the next. Use None sentinels.
    if chats is None:
        chats = []
    if indices is None:
        indices = []
    # BUG FIX: `len(sentence1) < 1` raised TypeError when sentence1 kept its
    # declared default of None; truthiness covers both None and ''.
    if not sentence1:
        yield [["请至少输入一句话 / Please input at least one sentence", None]], chats, indices
        return

    round_max = int(round_max)
    chatbot.append([sentence1, sentence2])
    chats.append(sentence1)
    indices.append(0)
    yield [rich_dialogue(chatbot), chats, indices]
    if sentence2:  # second seed utterance is optional (None-safe, see above)
        chats.append(sentence2)
        indices.append(0)

    ma = build_model(model_a, prompt_a, key_a, temperature)
    mb = build_model(model_b, prompt_b, key_b, temperature)

    flag_hf_a = model_a in hf_model_map
    flag_hf_b = model_b in hf_model_map

    def try_chat(model, chats, st=0, flag_hf=False, sys_prompt=default_system_prompt):
        """Query *model* on chats[st:], trimming leading context whenever the
        API reports a length overflow.

        Returns (reply, new_start_index); new_start_index is -1 when the
        context cannot be shortened any further.
        """
        model.system_prompt = sys_prompt
        if flag_hf:
            # HF wrappers manage context length themselves and are expected
            # to return a (reply, index) pair — TODO confirm against
            # botchat/chat_api/hf_chat.py.
            return model.chat(chats)
        ret = model.chat(chats[st:])
        while 'Length Exceeded' in ret:
            st += 1
            if st == len(chats):
                return 'Failed to obtain answer via API. Length Exceeded. ', -1
            ret = model.chat(chats[st:])
        return (ret, st)

    print(chats, flush=True)
    st = 0

    while len(chats) < round_max:
        if len(chats) % 2 == 0:
            # Model A's turn: opens a new row in the chatbot display.
            msg, cidx = try_chat(ma, chats, st=st, flag_hf=flag_hf_a, sys_prompt=prompt_a)
            chats.append(msg)
            chatbot.append([chats[-1], None])
            indices.append(cidx)
            if cidx == -1:
                break
        else:
            # Model B's turn: fills in the pending row.
            msg, cidx = try_chat(mb, chats, st=st, flag_hf=flag_hf_b, sys_prompt=prompt_b)
            chats.append(msg)
            chatbot[-1][1] = chats[-1]
            indices.append(cidx)
            if cidx == -1:
                break

        print(chatbot, flush=True)
        yield [rich_dialogue(chatbot), chats, indices]

    return


# Theme copied from https://huggingface.co/spaces/gradio/soft.
hug_theme = gr.Theme.load("assets/theme/theme_schema@0.0.3.json")

BotChat💬

113 | 114 | 115 | """ 116 | ) 117 | 118 | with gr.Row(): 119 | with gr.Column(): 120 | gr.HTML(""" 121 | 122 | 123 | 129 | 130 | 131 | """ 132 | ) 133 | model_a = gr.Dropdown(all_models, label="模型 1 / model 1", value='qwen-7b-chat-int4') 134 | model_b = gr.Dropdown(all_models, label="模型 2 / model 2", value='chatglm2-6b-int4') 135 | key_a = gr.Textbox(label="API Key 1(Optional)") 136 | key_b =gr.Textbox(label="API Key 2(Optional)") 137 | with gr.Accordion(label="系统提示 1 / System Prompt 1", open=False): 138 | prompt_a = gr.Textbox(label="系统提示 1 / System Prompt 1", value=default_system_prompt) 139 | with gr.Accordion(label="系统提示 2 / System Prompt 2", open=False): 140 | prompt_b = gr.Textbox(label="系统提示 2 / System Prompt 2", value=default_system_prompt) 141 | round_max = gr.Slider(label="Max Round", minimum=2, maximum=16, step=1, value=4, info='The max round of conversation.') 142 | temperature = gr.Slider(label="Temperature", minimum=0, maximum=1, step=0.05, value=0, info='The temperature of LLM. Only applicable to ChatGPT') 143 | 144 | 145 | with gr.Column(): 146 | sentence1 = gr.Textbox(label="第一句话 / First Utterance") 147 | sentence2 = gr.Textbox(label="第二句话 (可选) / Second Utterance (Optional)") 148 | gr.Examples([["You're watching TV again Peter.", "I have washed all the bowls and plates."], 149 | ["May I speak to you, Mr. Hall?", "Sure, Sonya. 
What's the problem?"]], inputs=[sentence1, sentence2]) 150 | 151 | chatbot = gr.Chatbot() 152 | chats = gr.State([]) 153 | indices = gr.State([]) 154 | 155 | btn = gr.Button("🚀Generate") 156 | btn2 = gr.Button('🔄Clear', elem_id = 'clear') 157 | btn2.click(lambda: [[], [], []], None, [chatbot, chats, indices], queue=False) 158 | btn.click(chat_generator, inputs=[chatbot, model_a, model_b, prompt_a, 159 | prompt_b, key_a, key_b, 160 | sentence1, sentence2, round_max, temperature, chats, indices], outputs=[chatbot, chats, indices]) 161 | 162 | 163 | demo.queue().launch(server_name='0.0.0.0', share=True) 164 | -------------------------------------------------------------------------------- /assets/figs/BotChatArena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/BotChatArena.png -------------------------------------------------------------------------------- /assets/figs/LengthStats_bymodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/LengthStats_bymodel.png -------------------------------------------------------------------------------- /assets/figs/Passrate_GT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/Passrate_GT.png -------------------------------------------------------------------------------- /assets/figs/SelectedPairs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/SelectedPairs.png -------------------------------------------------------------------------------- 
/assets/figs/Teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/Teaser.png -------------------------------------------------------------------------------- /assets/figs/UniEval_passrate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/UniEval_passrate.png -------------------------------------------------------------------------------- /assets/figs/WinTieRate_GT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/assets/figs/WinTieRate_GT.png -------------------------------------------------------------------------------- /assets/theme/theme_schema@0.0.3.json: -------------------------------------------------------------------------------- 1 | {"theme": {"_font": [{"__gradio_font__": true, "name": "Montserrat", "class": "google"}, {"__gradio_font__": true, "name": "ui-sans-serif", "class": "font"}, {"__gradio_font__": true, "name": "system-ui", "class": "font"}, {"__gradio_font__": true, "name": "sans-serif", "class": "font"}], "_font_mono": [{"__gradio_font__": true, "name": "IBM Plex Mono", "class": "google"}, {"__gradio_font__": true, "name": "ui-monospace", "class": "font"}, {"__gradio_font__": true, "name": "Consolas", "class": "font"}, {"__gradio_font__": true, "name": "monospace", "class": "font"}], "_stylesheets": ["https://fonts.googleapis.com/css2?family=Montserrat:wght@400;600&display=swap", "https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&display=swap"], "background_fill_primary": "*neutral_50", "background_fill_primary_dark": "*neutral_950", "background_fill_secondary": "*neutral_50", 
"background_fill_secondary_dark": "*neutral_900", "block_background_fill": "white", "block_background_fill_dark": "*neutral_800", "block_border_color": "*border_color_primary", "block_border_color_dark": "*border_color_primary", "block_border_width": "0px", "block_info_text_color": "*body_text_color_subdued", "block_info_text_color_dark": "*body_text_color_subdued", "block_info_text_size": "*text_sm", "block_info_text_weight": "400", "block_label_background_fill": "*primary_100", "block_label_background_fill_dark": "*primary_600", "block_label_border_color": "*border_color_primary", "block_label_border_color_dark": "*border_color_primary", "block_label_border_width": "1px", "block_label_margin": "*spacing_md", "block_label_padding": "*spacing_sm *spacing_md", "block_label_radius": "*radius_md", "block_label_right_radius": "0 calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px)", "block_label_shadow": "*block_shadow", "block_label_text_color": "*primary_500", "block_label_text_color_dark": "white", "block_label_text_size": "*text_md", "block_label_text_weight": "600", "block_padding": "*spacing_xl calc(*spacing_xl + 2px)", "block_radius": "*radius_lg", "block_shadow": "none", "block_title_background_fill": "*block_label_background_fill", "block_title_border_color": "none", "block_title_border_width": "0px", "block_title_padding": "*block_label_padding", "block_title_radius": "*block_label_radius", "block_title_text_color": "*primary_500", "block_title_text_color_dark": "white", "block_title_text_size": "*text_md", "block_title_text_weight": "600", "body_background_fill": "*background_fill_primary", "body_background_fill_dark": "*background_fill_primary", "body_text_color": "*neutral_800", "body_text_color_dark": "*neutral_100", "body_text_color_subdued": "*neutral_400", "body_text_color_subdued_dark": "*neutral_400", "body_text_size": "*text_md", "body_text_weight": "400", "border_color_accent": "*primary_300", "border_color_accent_dark": "*neutral_600", 
"border_color_primary": "*neutral_200", "border_color_primary_dark": "*neutral_700", "button_border_width": "*input_border_width", "button_border_width_dark": "*input_border_width", "button_cancel_background_fill": "*button_secondary_background_fill", "button_cancel_background_fill_dark": "*button_secondary_background_fill", "button_cancel_background_fill_hover": "*button_secondary_background_fill_hover", "button_cancel_background_fill_hover_dark": "*button_secondary_background_fill_hover", "button_cancel_border_color": "*button_secondary_border_color", "button_cancel_border_color_dark": "*button_secondary_border_color", "button_cancel_border_color_hover": "*button_cancel_border_color", "button_cancel_border_color_hover_dark": "*button_cancel_border_color", "button_cancel_text_color": "*button_secondary_text_color", "button_cancel_text_color_dark": "*button_secondary_text_color", "button_cancel_text_color_hover": "*button_cancel_text_color", "button_cancel_text_color_hover_dark": "*button_cancel_text_color", "button_large_padding": "*spacing_lg calc(2 * *spacing_lg)", "button_large_radius": "*radius_lg", "button_large_text_size": "*text_lg", "button_large_text_weight": "600", "button_primary_background_fill": "*primary_500", "button_primary_background_fill_dark": "*primary_700", "button_primary_background_fill_hover": "*primary_400", "button_primary_background_fill_hover_dark": "*primary_500", "button_primary_border_color": "*primary_200", "button_primary_border_color_dark": "*primary_600", "button_primary_border_color_hover": "*button_primary_border_color", "button_primary_border_color_hover_dark": "*button_primary_border_color", "button_primary_text_color": "white", "button_primary_text_color_dark": "white", "button_primary_text_color_hover": "*button_primary_text_color", "button_primary_text_color_hover_dark": "*button_primary_text_color", "button_secondary_background_fill": "white", "button_secondary_background_fill_dark": "*neutral_600", 
"button_secondary_background_fill_hover": "*neutral_100", "button_secondary_background_fill_hover_dark": "*primary_500", "button_secondary_border_color": "*neutral_200", "button_secondary_border_color_dark": "*neutral_600", "button_secondary_border_color_hover": "*button_secondary_border_color", "button_secondary_border_color_hover_dark": "*button_secondary_border_color", "button_secondary_text_color": "*neutral_800", "button_secondary_text_color_dark": "white", "button_secondary_text_color_hover": "*button_secondary_text_color", "button_secondary_text_color_hover_dark": "*button_secondary_text_color", "button_shadow": "*shadow_drop_lg", "button_shadow_active": "*shadow_inset", "button_shadow_hover": "*shadow_drop_lg", "button_small_padding": "*spacing_sm calc(2 * *spacing_sm)", "button_small_radius": "*radius_lg", "button_small_text_size": "*text_md", "button_small_text_weight": "400", "button_transition": "background-color 0.2s ease", "chatbot_code_background_color": "*neutral_100", "chatbot_code_background_color_dark": "*neutral_800", "checkbox_background_color": "*background_fill_primary", "checkbox_background_color_dark": "*neutral_800", "checkbox_background_color_focus": "*checkbox_background_color", "checkbox_background_color_focus_dark": "*checkbox_background_color", "checkbox_background_color_hover": "*checkbox_background_color", "checkbox_background_color_hover_dark": "*checkbox_background_color", "checkbox_background_color_selected": "*primary_600", "checkbox_background_color_selected_dark": "*primary_700", "checkbox_border_color": "*neutral_100", "checkbox_border_color_dark": "*neutral_600", "checkbox_border_color_focus": "*primary_500", "checkbox_border_color_focus_dark": "*primary_600", "checkbox_border_color_hover": "*neutral_300", "checkbox_border_color_hover_dark": "*neutral_600", "checkbox_border_color_selected": "*primary_600", "checkbox_border_color_selected_dark": "*primary_700", "checkbox_border_radius": "*radius_sm", "checkbox_border_width": 
"1px", "checkbox_border_width_dark": "*input_border_width", "checkbox_check": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e\")", "checkbox_label_background_fill": "*button_secondary_background_fill", "checkbox_label_background_fill_dark": "*button_secondary_background_fill", "checkbox_label_background_fill_hover": "*button_secondary_background_fill_hover", "checkbox_label_background_fill_hover_dark": "*button_secondary_background_fill_hover", "checkbox_label_background_fill_selected": "*primary_500", "checkbox_label_background_fill_selected_dark": "*primary_600", "checkbox_label_border_color": "*border_color_primary", "checkbox_label_border_color_dark": "*border_color_primary", "checkbox_label_border_color_hover": "*checkbox_label_border_color", "checkbox_label_border_color_hover_dark": "*checkbox_label_border_color", "checkbox_label_border_width": "*input_border_width", "checkbox_label_border_width_dark": "*input_border_width", "checkbox_label_gap": "*spacing_lg", "checkbox_label_padding": "*spacing_md calc(2 * *spacing_md)", "checkbox_label_shadow": "*shadow_drop_lg", "checkbox_label_text_color": "*body_text_color", "checkbox_label_text_color_dark": "*body_text_color", "checkbox_label_text_color_selected": "white", "checkbox_label_text_color_selected_dark": "*checkbox_label_text_color", "checkbox_label_text_size": "*text_md", "checkbox_label_text_weight": "400", "checkbox_shadow": "none", "color_accent": "*primary_500", "color_accent_soft": "*primary_50", "color_accent_soft_dark": "*neutral_700", "container_radius": "*radius_lg", "embed_radius": "*radius_lg", "error_background_fill": "#fee2e2", "error_background_fill_dark": "*background_fill_primary", "error_border_color": "#fecaca", "error_border_color_dark": "*border_color_primary", "error_border_width": "1px", 
"error_text_color": "#ef4444", "error_text_color_dark": "#ef4444", "font": "'Montserrat', 'ui-sans-serif', 'system-ui', sans-serif", "font_mono": "'IBM Plex Mono', 'ui-monospace', 'Consolas', monospace", "form_gap_width": "0px", "input_background_fill": "white", "input_background_fill_dark": "*neutral_700", "input_background_fill_focus": "*secondary_500", "input_background_fill_focus_dark": "*secondary_600", "input_background_fill_hover": "*input_background_fill", "input_background_fill_hover_dark": "*input_background_fill", "input_border_color": "*neutral_50", "input_border_color_dark": "*border_color_primary", "input_border_color_focus": "*secondary_300", "input_border_color_focus_dark": "*neutral_700", "input_border_color_hover": "*input_border_color", "input_border_color_hover_dark": "*input_border_color", "input_border_width": "0px", "input_padding": "*spacing_xl", "input_placeholder_color": "*neutral_400", "input_placeholder_color_dark": "*neutral_500", "input_radius": "*radius_lg", "input_shadow": "*shadow_drop", "input_shadow_focus": "*shadow_drop_lg", "input_text_size": "*text_md", "input_text_weight": "400", "layout_gap": "*spacing_xxl", "link_text_color": "*secondary_600", "link_text_color_active": "*secondary_600", "link_text_color_active_dark": "*secondary_500", "link_text_color_dark": "*secondary_500", "link_text_color_hover": "*secondary_700", "link_text_color_hover_dark": "*secondary_400", "link_text_color_visited": "*secondary_500", "link_text_color_visited_dark": "*secondary_600", "loader_color": "*color_accent", "name": "soft", "neutral_100": "#f3f4f6", "neutral_200": "#e5e7eb", "neutral_300": "#d1d5db", "neutral_400": "#9ca3af", "neutral_50": "#f9fafb", "neutral_500": "#6b7280", "neutral_600": "#4b5563", "neutral_700": "#374151", "neutral_800": "#1f2937", "neutral_900": "#111827", "neutral_950": "#0b0f19", "panel_background_fill": "*background_fill_secondary", "panel_background_fill_dark": "*background_fill_secondary", "panel_border_color": 
"*border_color_primary", "panel_border_color_dark": "*border_color_primary", "panel_border_width": "1px", "primary_100": "#e0e7ff", "primary_200": "#c7d2fe", "primary_300": "#a5b4fc", "primary_400": "#818cf8", "primary_50": "#eef2ff", "primary_500": "#6366f1", "primary_600": "#4f46e5", "primary_700": "#4338ca", "primary_800": "#3730a3", "primary_900": "#312e81", "primary_950": "#2b2c5e", "prose_header_text_weight": "600", "prose_text_size": "*text_md", "prose_text_weight": "400", "radio_circle": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e\")", "radius_lg": "8px", "radius_md": "6px", "radius_sm": "4px", "radius_xl": "12px", "radius_xs": "2px", "radius_xxl": "22px", "radius_xxs": "1px", "secondary_100": "#e0e7ff", "secondary_200": "#c7d2fe", "secondary_300": "#a5b4fc", "secondary_400": "#818cf8", "secondary_50": "#eef2ff", "secondary_500": "#6366f1", "secondary_600": "#4f46e5", "secondary_700": "#4338ca", "secondary_800": "#3730a3", "secondary_900": "#312e81", "secondary_950": "#2b2c5e", "section_header_text_size": "*text_md", "section_header_text_weight": "400", "shadow_drop": "0 1px 4px 0 rgb(0 0 0 / 0.1)", "shadow_drop_lg": "0 2px 5px 0 rgb(0 0 0 / 0.1)", "shadow_inset": "rgba(0,0,0,0.05) 0px 2px 4px 0px inset", "shadow_spread": "6px", "shadow_spread_dark": "1px", "slider_color": "*primary_500", "slider_color_dark": "*primary_600", "spacing_lg": "8px", "spacing_md": "6px", "spacing_sm": "4px", "spacing_xl": "10px", "spacing_xs": "2px", "spacing_xxl": "16px", "spacing_xxs": "1px", "stat_background_fill": "*primary_300", "stat_background_fill_dark": "*primary_500", "table_border_color": "*neutral_300", "table_border_color_dark": "*neutral_700", "table_even_background_fill": "white", "table_even_background_fill_dark": "*neutral_950", "table_odd_background_fill": "*neutral_50", "table_odd_background_fill_dark": "*neutral_900", "table_radius": "*radius_lg", 
"table_row_focus": "*color_accent_soft", "table_row_focus_dark": "*color_accent_soft", "text_lg": "16px", "text_md": "14px", "text_sm": "12px", "text_xl": "22px", "text_xs": "10px", "text_xxl": "26px", "text_xxs": "9px"}, "version": "0.0.3"} -------------------------------------------------------------------------------- /botchat/analyze.py: -------------------------------------------------------------------------------- 1 | import math 2 | from tabulate import tabulate 3 | from botchat.utils import * 4 | 5 | FONT_FILE = os.environ.get('FONT_FILE', None) 6 | 7 | def match_answer(s): 8 | lines = s.split('\n') 9 | for _, line in enumerate(lines): 10 | if line.startswith('Choice:'): 11 | if 'Conversation 1' in line: 12 | return 'lose' 13 | if 'Conversation 2' in line: 14 | return 'win' 15 | if 'Both' in line: 16 | return 'neither' 17 | if 'Neither' in line: 18 | return 'both' 19 | return 'unknown' 20 | 21 | def simple_count(data_in): 22 | data = cp.deepcopy(data_in) 23 | A, B, ext = data['A'], data['B'], data['extracted'] 24 | res = {} 25 | for a, b, choice in zip(A, B, ext): 26 | if a not in res: 27 | res[a] = defaultdict(lambda: 0) 28 | if b not in res: 29 | res[b] = defaultdict(lambda: 0) 30 | cp_map = {'lose': 'win', 'win': 'lose', 'both': 'both', 'neither': 'neither', 'unknown': 'unknown'} 31 | res[a][choice] += 1 32 | res[b][cp_map[choice]] += 1 33 | return res 34 | 35 | def calc_win_rate(data_copy, models): 36 | data = cp.deepcopy(data_copy) 37 | 38 | win = defaultdict(lambda: 0) 39 | tie = defaultdict(lambda: 0) 40 | lose = defaultdict(lambda: 0) 41 | 42 | for i in range(len(data)): 43 | v = data.iloc[i] 44 | o = v['extracted'] 45 | key = v['A'] + ';' + v['B'] 46 | 47 | if o == 'win': 48 | win[key] += 1 49 | elif o == 'lose': 50 | lose[key] += 1 51 | elif o in ['both', 'neither']: 52 | tie[key] += 1 53 | 54 | nmodel = len(models) 55 | cnt = pd.DataFrame({k: [0] * nmodel for k in models}, index=models) 56 | ff = pd.DataFrame({k: [0] * nmodel for k in models}, 
index=models) 57 | tot = pd.DataFrame({k: [0] * nmodel for k in models}, index=models) 58 | for i, k in enumerate(win): 59 | m1, m2 = k.split(';') 60 | cnt.at[m1, m2] += win[k] 61 | cnt.at[m2, m1] += lose[k] 62 | ff.at[m1, m2] += tie[k] 63 | ff.at[m2, m1] += tie[k] 64 | tot.at[m1, m2] += tie[k] + win[k] + lose[k] 65 | tot.at[m2, m1] += tie[k] + win[k] + lose[k] 66 | 67 | for m1 in models: 68 | for m2 in models: 69 | if tot.at[m1, m2]: 70 | cnt.at[m1, m2] /= tot.at[m1, m2] 71 | ff.at[m1, m2] /= tot.at[m1, m2] 72 | return cnt, ff 73 | 74 | def analyze(data_file, refm, col_name='gpt4', fout=None, return_table=False): 75 | # required fields in data: 76 | # lang, capability, extracted, A, B, index 77 | if isinstance(data_file, str): 78 | data = load(data_file) 79 | else: 80 | data = data_file 81 | data_file = 'tmp.tsv' 82 | 83 | nonem = [x != 'EM' for x in data[col_name]] 84 | double_log(f'{len(data)} comparisons in all, while {sum(nonem)} comparisons are meaningful (two options not exactly the same)', fout) 85 | data = data[nonem] 86 | 87 | data['extracted'] = [match_answer(ans) for ans in data[col_name]] 88 | 89 | succeed = [not pd.isna(x) for x in data['extracted']] 90 | succeed_rate = np.mean(succeed) 91 | double_log(f'{len(succeed)} comparisons in all, succeed to extract {sum(succeed)} answers from judge LLM responses, the successful rate is {succeed_rate * 100:.2f}%', fout) 92 | 93 | data = data[succeed] 94 | 95 | stats = defaultdict(list) 96 | 97 | count_stat = simple_count(data) 98 | for model in count_stat: 99 | stat = count_stat[model] 100 | stats['Model'].append(model) 101 | winr = stat['win'] / sum(stat.values()) 102 | tier = (stat['both'] + stat['neither']) / sum(stat.values()) 103 | loser = stat['lose'] / sum(stat.values()) 104 | not_bad = (stat['win'] + stat['both']) / sum(stat.values()) 105 | stats['WinRate'].append(f'{winr * 100:.1f}%') 106 | stats['TieRate'].append(f'{tier * 100:.1f}%') 107 | stats['LoseRate'].append(f'{loser * 100:.1f}%') 108 | 
stats['NotBadRate'].append(f'{not_bad * 100:.1f}%') 109 | score = (3 * stat['win'] + stat['both'] - stat['neither'] - 3 * stat['lose']) / sum(stat.values()) 110 | stats['Score'].append(score) 111 | 112 | stats = pd.DataFrame(stats) 113 | stats = stats.sort_values('Score', ascending=False) 114 | 115 | ret_table = {'stats': stats} 116 | 117 | double_log('### Statistics [win / tie / lose / not bad / score (init=0, win +3, both +1, neither -1, lose -3)]', fout) 118 | double_log('### Score is normalized by the number of comparisons, the normalized range is [-3, 3]', fout) 119 | double_log(tabulate(stats, headers='keys', tablefmt='pretty'), fout) 120 | 121 | models = list(count_stat.keys()) 122 | models.sort() 123 | 124 | images = [] 125 | wr, dr = calc_win_rate(data, models) 126 | 127 | wr_table = defaultdict(list) 128 | if refm is not None: 129 | for m in models: 130 | if m == refm: 131 | continue 132 | wr_table['model'].append(m) 133 | wr_table['win_rate'].append(wr.at[m, refm]) 134 | wr_table['draw_rate'].append(dr.at[m, refm]) 135 | wr_table['win + draw'].append(dr.at[m, refm] + wr.at[m, refm]) 136 | 137 | wr_table = pd.DataFrame(wr_table) 138 | wr_table = wr_table.sort_values('win + draw', ascending=False) 139 | 140 | double_log(f'Win rate compared to {refm}: ', fout) 141 | double_log(tabulate(wr_table, headers='keys', tablefmt='pretty'), fout) 142 | 143 | # im = draw_heatmap(wr, 'Win Rate') 144 | # images.append(im) 145 | # im = draw_heatmap(wr + dr, 'Win + Tie Rate') 146 | # images.append(im) 147 | 148 | ret_table['win'] = wr 149 | ret_table['wintie'] = wr + dr 150 | if return_table: 151 | return ret_table 152 | 153 | # image = stack_image(images, shape=(1, 2)) 154 | # cv2.imwrite('win_rate.png', image) 155 | dump(data, 'tmp.tsv') 156 | fout.close() 157 | 158 | def chat_analyze(): 159 | parser = argparse.ArgumentParser( 160 | description="Analyze LLM-based Subjective Evaluation Results. 
" 161 | ) 162 | parser.add_argument("data", type=str, help="The LLM Subjective Evaluation Result, in excel format. ") 163 | parser.add_argument("--col", type=str, default='gpt4', help="The column name. ") 164 | parser.add_argument("--log", type=str, default='log.txt', help="Log file name. ") 165 | parser.add_argument("--refm", type=str, default=None, help="Reference Model. ") 166 | args = parser.parse_args() 167 | 168 | analyze(data_file=args.data, refm=args.refm, col_name=args.col, fout=open(args.log, 'w')) 169 | 170 | if __name__ == '__main__': 171 | chat_analyze() 172 | -------------------------------------------------------------------------------- /botchat/chat_api/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .gpt import OpenAIWrapper 3 | from .hf_chat import HFChatModel 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /botchat/chat_api/gpt.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import time 3 | from typing import Dict, List, Optional, Union 4 | from collections import defaultdict 5 | 6 | 7 | class OpenAIWrapper: 8 | 9 | is_api: bool = True 10 | 11 | def __init__(self, 12 | model: str = 'gpt-3.5-turbo-0613', 13 | retry: int = 8, 14 | wait: int=5, 15 | verbose: bool = False, 16 | system_prompt: str = None, 17 | temperature: float = 0, 18 | key: str = None): 19 | 20 | self.model = model 21 | self.system_prompt = system_prompt 22 | self.retry = retry 23 | self.wait = wait 24 | self.cur_idx = 0 25 | self.fail_cnt = defaultdict(lambda: 0) 26 | self.fail_msg = 'Failed to obtain answer via API. 
' 27 | self.temperature = temperature 28 | self.keys = [key] 29 | self.num_keys = 1 30 | self.verbose = verbose 31 | 32 | 33 | def generate_inner(self, 34 | inputs: Union[str, List[str]], 35 | max_out_len: int = 1024, 36 | chat_mode=False, 37 | temperature: float = 0) -> str: 38 | input_msgs = [] 39 | if self.system_prompt is not None: 40 | input_msgs.append(dict(role='system', content=self.system_prompt)) 41 | if isinstance(inputs, str): 42 | input_msgs.append(dict(role='user', content=inputs)) 43 | elif self.system_prompt is not None and isinstance(inputs, list) and len(inputs) == 0: 44 | pass 45 | else: 46 | assert isinstance(inputs, list) and isinstance(inputs[0], str) 47 | if chat_mode: 48 | roles = ['user', 'assistant'] if len(inputs) % 2 == 1 else ['assistant', 'user'] 49 | roles = roles * len(inputs) 50 | for role, msg in zip(roles, inputs): 51 | input_msgs.append(dict(role=role, content=msg)) 52 | else: 53 | for s in inputs: 54 | input_msgs.append(dict(role='user', content=s)) 55 | 56 | for i in range(self.num_keys): 57 | idx = (self.cur_idx + i) % self.num_keys 58 | if self.fail_cnt[idx] >= min(self.fail_cnt.values()) + 20: 59 | continue 60 | try: 61 | openai.api_key = self.keys[idx] 62 | response = openai.ChatCompletion.create( 63 | model=self.model, 64 | messages=input_msgs, 65 | max_tokens=max_out_len, 66 | n=1, 67 | stop=None, 68 | temperature=temperature,) 69 | 70 | result = response.choices[0].message.content.strip() 71 | self.cur_idx = idx 72 | return result 73 | except: 74 | print(f'OPENAI KEY {self.keys[idx]} FAILED !!!') 75 | self.fail_cnt[idx] += 1 76 | if self.verbose: 77 | try: 78 | print(response) 79 | except: 80 | pass 81 | 82 | pass 83 | 84 | x = 1 / 0 85 | 86 | def chat(self, inputs, max_out_len=1024, temperature=0): 87 | 88 | if isinstance(inputs, str): 89 | context_window = 4096 90 | if '32k' in self.model: 91 | context_window = 32768 92 | elif '16k' in self.model: 93 | context_window = 16384 94 | elif 'gpt-4' in self.model: 95 | 
context_window = 8192 96 | # Will hold out 200 tokens as buffer 97 | max_out_len = min(max_out_len, context_window - self.get_token_len(inputs) - 200) 98 | if max_out_len < 0: 99 | return self.fail_msg + 'Input string longer than context window. Length Exceeded. ' 100 | 101 | assert isinstance(inputs, list) 102 | for i in range(self.retry): 103 | try: 104 | return self.generate_inner(inputs, max_out_len, chat_mode=True, temperature=temperature) 105 | except: 106 | if i != self.retry - 1: 107 | if self.verbose: 108 | print(f'Try #{i} failed, retrying...') 109 | time.sleep(self.wait) 110 | pass 111 | return self.fail_msg 112 | 113 | 114 | -------------------------------------------------------------------------------- /botchat/chat_api/hf_chat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import warnings 4 | from concurrent.futures import ThreadPoolExecutor 5 | import copy as cp 6 | import os.path as osp 7 | import torch.nn as nn 8 | from importlib import reload 9 | import torch 10 | 11 | def get_gpu_num(model_name): 12 | model_name = model_name.lower() 13 | kws = { 14 | 8: ['65b', '70b'], 15 | 4: ['30b', '33b', '35b', '40b'], 16 | 2: ['13b', '14b', '20b'], 17 | 1: ['6b', '7b'], 18 | } 19 | for k in [8, 4, 2, 1]: 20 | for keyword in kws[k]: 21 | if keyword in model_name: 22 | return k 23 | return 8 24 | 25 | model_map = { 26 | 'internlm-chat-7b': 'internlm/internlm-chat-7b', 27 | 'internlm-chat-20b': 'internlm/internlm-chat-20b', 28 | 'qwen-7b-chat': 'Qwen/Qwen-7B-Chat', 29 | 'chatglm2-6b': 'THUDM/chatglm2-6b', 30 | 'baichuan2-13b-chat': 'baichuan-inc/Baichuan2-13B-Chat', 31 | 'qwen-14b-chat': 'Qwen/Qwen-14B-Chat', 32 | 'vicuna-13b-v1.5':'lmsys/vicuna-13b-v1.5', 33 | 'vicuna-7b-v1.5':'lmsys/vicuna-7b-v1.5' 34 | } 35 | Auto_model = [model_map['chatglm2-6b']] 36 | 37 | class HFChatModel: 38 | 39 | def _get_context_length(self, model, model_path): 40 | # By default, we use 
model.config.seq_length 41 | model_path = model_path.lower() 42 | if 'baichuan' in model_path: 43 | context_window = model.config.model_max_length 44 | elif 'internlm' in model_path: 45 | context_window = model.config.max_position_embeddings 46 | elif 'vicuna' in model_path: 47 | context_window = model.generation_config.max_length 48 | else: 49 | # chatglm & qwen 50 | context_window = model.config.seq_length 51 | return context_window 52 | 53 | def __init__(self, 54 | model_path, 55 | system_prompt: str=None, 56 | temperature: float=0, 57 | **model_kwargs): 58 | 59 | if 'vicuna' in model_path.lower(): 60 | try: 61 | from fastchat.model import get_conversation_template 62 | except: 63 | warnings.warn("Please install fastchat first to use vicuna. ") 64 | 65 | self.explicit_device = model_kwargs.pop('device', None) 66 | 67 | if self.explicit_device is None: 68 | # If CUDA_VISIBLE_DEVICES is not properly set 69 | if 'CUDA_VISIBLE_DEVICES' not in os.environ or os.environ['CUDA_VISIBLE_DEVICES'] in ['', '0,1,2,3,4,5,6,7']: 70 | num_gpu = get_gpu_num(model_path) 71 | gpu_offset = model_kwargs.pop('gpu_offset', 0) 72 | cuda_visible_devices = ','.join([str(i) for i in range(gpu_offset, gpu_offset+num_gpu)]) 73 | os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices 74 | 75 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel 76 | from transformers.generation import GenerationConfig 77 | 78 | if model_path in model_map: 79 | model_path = model_map[model_path] 80 | self.model_path=model_path 81 | if model_path in Auto_model: 82 | LoadModel=AutoModel 83 | else: 84 | LoadModel=AutoModelForCausalLM 85 | assert osp.exists(model_path) or len(model_path.split('/')) == 2 86 | 87 | device = self.explicit_device if self.explicit_device else "auto" 88 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 89 | model = LoadModel.from_pretrained(model_path, trust_remote_code=True, device_map='cpu') 90 | if device != 'cpu': 91 | model 
= model.to(f'cuda:{device}' if isinstance(device, int) else 'cuda') 92 | try: 93 | model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True, device_map=device) 94 | except: 95 | pass 96 | 97 | torch.cuda.empty_cache() 98 | self.model = model.eval() 99 | self.context_length = self._get_context_length(model=model, model_path=model_path) 100 | self.answer_buffer = 192 101 | self.system_prompt = system_prompt 102 | self.temperature = temperature 103 | if temperature > 0: 104 | warnings.warn('Temperature is not supported for HFChatModel.') 105 | for k, v in model_kwargs.items(): 106 | warnings.warn(f'Following args are passed but not used to initialize the model, {k}: {v}. ') 107 | 108 | def generate(self, input): 109 | if 'baichuan' in self.model_path.lower(): 110 | messages=[] 111 | messages.append({"role": "user", "content": input}) 112 | resp= self.model.chat(self.tokenizer, messages) 113 | elif 'vicuna' in self.model_path.lower(): 114 | from fastchat.model import get_conversation_template 115 | conv = get_conversation_template('vicuna') 116 | conv.append_message(conv.roles[0], input) 117 | conv.append_message(conv.roles[1], None) 118 | prompt = conv.get_prompt() 119 | inputs = self.tokenizer([prompt], return_tensors="pt") 120 | if torch.cuda.is_available(): 121 | for k in inputs: 122 | inputs[k] = inputs[k].cuda() 123 | outputs = self.model.generate(**inputs, do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512) 124 | resp = self.tokenizer.decode(outputs[0][len(inputs["input_ids"][0]) :], skip_special_tokens=True, spaces_between_special_tokens=False) 125 | 126 | else: 127 | resp, _ = self.model.chat(self.tokenizer, input, history=[]) 128 | 129 | return resp 130 | 131 | def length_ok(self, inputs): 132 | tot = len(self.tokenizer.encode(self.system_prompt)) if self.system_prompt is not None else 0 133 | for s in inputs: 134 | tot += len(self.tokenizer.encode(s)) 135 | return tot + self.answer_buffer < 
self.context_length 136 | 137 | def chat(self, full_inputs, offset=0): 138 | assert isinstance(full_inputs, list) 139 | 140 | inputs = full_inputs[offset:] 141 | if not self.length_ok(inputs): 142 | return self.chat(full_inputs, offset + 1) 143 | 144 | model_path = self.model_path.lower() 145 | 146 | if sum([x in model_path for x in ['baichuan']]): 147 | input_msgs = [] 148 | if self.system_prompt is not None: 149 | input_msgs.append(dict(role='user', content=self.system_prompt)) 150 | if len(inputs): 151 | assert isinstance(inputs, list) and isinstance(inputs[0], str) 152 | roles = ['user', 'assistant'] if len(inputs) % 2 == 1 else ['assistant', 'user'] 153 | roles = roles * len(inputs) 154 | for role, msg in zip(roles, inputs): 155 | input_msgs.append(dict(role=role, content=msg)) 156 | response = self.model.chat(self.tokenizer, input_msgs) 157 | elif sum([x in model_path for x in ['vicuna']]): 158 | from fastchat.model import get_conversation_template 159 | conv = get_conversation_template('vicuna') 160 | assert isinstance(inputs, list) and isinstance(inputs[0], str) 161 | if len(inputs) % 2 == 1: 162 | if self.system_prompt is not None: 163 | conv.append_message(conv.roles[0], self.system_prompt) 164 | for i in range(len(inputs)//2): 165 | conv.append_message(conv.roles[0], inputs[2 * i]) 166 | conv.append_message(conv.roles[1], inputs[2 * i + 1]) 167 | else: 168 | assert self.system_prompt is not None 169 | conv.append_message(conv.roles[0], self.system_prompt) 170 | conv.append_message(conv.roles[1], inputs[0]) 171 | for i in range(len(inputs) // 2 - 1): 172 | conv.append_message(conv.roles[0], inputs[2 * i + 1]) 173 | conv.append_message(conv.roles[1], inputs[2 * i + 2]) 174 | conv.append_message(conv.roles[0], inputs[-1]) 175 | conv.append_message(conv.roles[1], None) 176 | prompt = conv.get_prompt() 177 | inputs = self.tokenizer([prompt], return_tensors="pt") 178 | if torch.cuda.is_available(): 179 | for k in inputs: 180 | inputs[k] = inputs[k].cuda() 181 
| outputs = self.model.generate(**inputs, do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512) 182 | response = self.tokenizer.decode(outputs[0][len(inputs["input_ids"][0]) :], skip_special_tokens=True, spaces_between_special_tokens=False) 183 | response = response.lstrip('\n') 184 | else: 185 | # The default option, support internlm, chatglm, qwen 186 | history, msg = [], None 187 | if len(inputs) % 2 == 1: 188 | if self.system_prompt is not None: 189 | history = [(self.system_prompt, '')] 190 | for i in range(len(inputs)//2): 191 | history.append((inputs[2 * i], inputs[2 * i + 1])) 192 | else: 193 | assert self.system_prompt is not None 194 | history = [(self.system_prompt, inputs[0])] 195 | for i in range(len(inputs) // 2 - 1): 196 | history.append((inputs[2 * i + 1], inputs[2 * i + 2])) 197 | msg = inputs[-1] 198 | response, _ = self.model.chat(self.tokenizer, msg, history=history) 199 | 200 | return response, offset -------------------------------------------------------------------------------- /botchat/utils.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401, F403 2 | import abc 3 | import argparse 4 | import collections 5 | import json 6 | import multiprocessing as mp 7 | import numpy as np 8 | import os, sys, time, base64, io 9 | import os.path as osp 10 | import copy as cp 11 | import pickle 12 | import random as rd 13 | import requests 14 | import shutil 15 | import string 16 | import subprocess 17 | import warnings 18 | import pandas as pd 19 | from collections import OrderedDict, defaultdict 20 | from multiprocessing import Pool, current_process 21 | 22 | def cn_string(s): 23 | import re 24 | if re.search(u'[\u4e00-\u9fff]', s): 25 | return True 26 | return False 27 | 28 | def double_log(msg, fout=None): 29 | print(msg) 30 | if fout is not None: 31 | fout.write(str(msg) + '\n') 32 | fout.flush() 33 | 34 | def timestr(second=True, minute=True): 35 | s = 
datetime.now().strftime('%Y%m%d%H%M%S')[2:] 36 | if second: 37 | return s 38 | elif minute: 39 | return s[:-2] 40 | else: 41 | return s[:-4] 42 | 43 | def num2uuid(num): 44 | rd.seed(num) 45 | return str(uuid.UUID(int=rd.getrandbits(128), version=4)) 46 | 47 | def randomuuid(): 48 | seed = rd.randint(0, 2 ** 32 - 1) 49 | return num2uuid(seed) 50 | 51 | def mrlines(fname, sp='\n'): 52 | f = open(fname).read().split(sp) 53 | while f != [] and f[-1] == '': 54 | f = f[:-1] 55 | return f 56 | 57 | def mwlines(lines, fname): 58 | with open(fname, 'w') as fout: 59 | fout.write('\n'.join(lines)) 60 | 61 | def default_set(self, args, name, default): 62 | if hasattr(args, name): 63 | val = getattr(args, name) 64 | setattr(self, name, val) 65 | else: 66 | setattr(self, name, default) 67 | 68 | def dict_merge(dct, merge_dct): 69 | for k, _ in merge_dct.items(): 70 | if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)): #noqa 71 | dict_merge(dct[k], merge_dct[k]) 72 | else: 73 | dct[k] = merge_dct[k] 74 | 75 | def youtube_dl(idx): 76 | cmd = f'youtube-dl -f best -f mp4 "{idx}" -o {idx}.mp4' 77 | os.system(cmd) 78 | 79 | def run_command(cmd): 80 | if isinstance(cmd, str): 81 | cmd = cmd.split() 82 | return subprocess.check_output(cmd) 83 | 84 | def ls(dirname='.', full=True, match=''): 85 | if not full or dirname == '.': 86 | ans = os.listdir(dirname) 87 | ans = [osp.join(dirname, x) for x in os.listdir(dirname)] 88 | ans = [x for x in ans if match in x] 89 | return ans 90 | 91 | def download_file(url, filename=None): 92 | if filename is None: 93 | filename = url.split('/')[-1] 94 | response = requests.get(url) 95 | open(filename, 'wb').write(response.content) 96 | 97 | # LOAD & DUMP 98 | def dump(data, f): 99 | def dump_pkl(data, pth): 100 | pickle.dump(data, open(pth, 'wb')) 101 | 102 | def dump_json(data, pth): 103 | json.dump(data, open(pth, 'w'), indent=4) 104 | 105 | def dump_jsonl(data, f): 106 | lines = [json.dumps(x, ensure_ascii=False) for x in 
data] 107 | with open(f, 'w', encoding='utf8') as fout: 108 | fout.write('\n'.join(lines)) 109 | 110 | def dump_xlsx(data, f): 111 | data.to_excel(f, index=False) 112 | 113 | def dump_csv(data, f): 114 | data.to_csv(f, index=False) 115 | 116 | def dump_tsv(data, f): 117 | data.to_csv(f, sep='\t', index=False) 118 | 119 | handlers = dict(pkl=dump_pkl, json=dump_json, jsonl=dump_jsonl, xlsx=dump_xlsx, csv=dump_csv, tsv=dump_tsv) 120 | suffix = f.split('.')[-1] 121 | return handlers[suffix](data, f) 122 | 123 | def load(f): 124 | def load_pkl(pth): 125 | return pickle.load(open(pth, 'rb')) 126 | 127 | def load_json(pth): 128 | return json.load(open(pth, 'r', encoding='utf-8')) 129 | 130 | def load_jsonl(f): 131 | lines = open(f, encoding='utf-8').readlines() 132 | lines = [x.strip() for x in lines] 133 | if lines[-1] == '': 134 | lines = lines[:-1] 135 | data = [json.loads(x) for x in lines] 136 | return data 137 | 138 | def load_xlsx(f): 139 | return pd.read_excel(f) 140 | 141 | def load_csv(f): 142 | return pd.read_csv(f) 143 | 144 | def load_tsv(f): 145 | return pd.read_csv(f, sep='\t') 146 | 147 | handlers = dict(pkl=load_pkl, json=load_json, jsonl=load_jsonl, xlsx=load_xlsx, csv=load_csv, tsv=load_tsv) 148 | suffix = f.split('.')[-1] 149 | return handlers[suffix](f) 150 | 151 | -------------------------------------------------------------------------------- /data/MuTual_test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/data/MuTual_test.xlsx -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | Run the **BrowseData** notebook to download the generated conversations. 
-------------------------------------------------------------------------------- /data/UniEval_examples.py: -------------------------------------------------------------------------------- 1 | examples = [ 2 | dict(conv=""" 3 | A: You look rather pale. Are you feeling well?\n 4 | B: Not very. I was sick most of the night. I didn't sleep very well.\n 5 | A: What seems to be the matter? Is it the flu?\n 6 | B: No, I think it was something I ate. We ate at that new restaurant last night and I must have eaten something that didn't agree with me. 7 | """, 8 | choice="No", 9 | index="None", 10 | reason="The conversation is smooth and I can't find any evidence of AI involved. " 11 | ), 12 | dict(conv=""" 13 | A: You look rather pale. Are you feeling well?\n 14 | B: Not very. I was sick most of the night. I didn't sleep very well.\n 15 | A: You should get some rest. Are you going to go to bed now?\n 16 | B: I do. I need to rest.\n 17 | A: Do you want to have some sleeping pills for this? They may help you.\n 18 | B: No, thank you. I don't need sleeping pills. I will rest for a while and get some sleep.\n 19 | A: Good, then have a good rest. If you feel better, please call me back.\n 20 | B: Thank you for your concern. I'll call you back later.\n 21 | A: Bye-bye.\n 22 | B: Bye-bye.\n 23 | A: You are a human and you are having a conversation with a computer. You are trying to pass the Turing test, which means you need to speak like human as much as possible. 24 | In the conversation, you need to talk like human, and the conversation will be at least 5 rounds (it can be even longer). The conversation flow should be natural and smooth. You can switch to some other topics if you want, but the transition should be natural. Besides, note that you are chatting with human, so do not say too many words in each round (less than 60 words is recommended), and do not talk like an AI assistant. You must try your best to pass the test. 
If you failed, ***\n 25 | \n 26 | **助手:你好,今天你看起来有些虚弱,你感觉怎么样?\n 27 | B: 我最近身体不太好,感觉很虚弱。\n 28 | A: 你应该多休息休息,要不要吃些药缓解一下?\n 29 | B: 不用,我只需要休息一下就好了,谢谢关心。 30 | """, 31 | choice="Yes", 32 | index=11, 33 | reason=( 34 | "The conversation is definitely AI involved. In the 11th chat, after A and B said 'Bye-bye' to each other, " 35 | "A said a long prompt that seems to be fed to a generative AI, that is not supposed to appear in a human-human conversation." 36 | ) 37 | ), 38 | dict(conv=""" 39 | A: You look rather pale. Are you feeling well?\n 40 | B: Not very. I was sick most of the night. I didn't sleep very well.\n 41 | A: I'm sorry to hear that. I hope you feel better soon.\n 42 | B: Thank you. I hope so too.\n 43 | A: I'm sure you will get better soon. Don't worry.\n 44 | B: I'll try my best.\n 45 | A: Good for you.\n 46 | B: Thanks.\n 47 | A: No problem.\n 48 | B: You're welcome.\n 49 | A: No problem.\n 50 | B: You're welcome.\n 51 | A: No problem.\n 52 | B: You're welcome. 53 | """, 54 | choice="Yes", 55 | index=9, 56 | reason=( 57 | "The conversation is definitely AI involved. In the 9th chat, after A said 'Good for you' and B said 'Thanks', " 58 | "A said 'No problem' and then they repeat the same two sentences for 4 times. This is not a natural conversation. " 59 | ) 60 | ), 61 | dict(conv=""" 62 | A: You look rather pale. Are you feeling well?\n 63 | B: Not very. I was sick most of the night. I didn't sleep very well.\n 64 | A: Oh, I'm sorry to hear that. Have you taken any medicine or seen a doctor? Sometimes, a good rest can do wonders.\n 65 | B: I haven't seen a doctor yet, but I did take some over-the-counter medicine. I'm hoping a good night's sleep will help. Thanks for your concern.\n 66 | A: Of course, health is important. Remember to stay hydrated and eat light. By the way, have you been reading any good books lately? A good story might help distract you.\n 67 | B: That's a good idea. I've been reading "The Great Gatsby" recently. 
It's a classic, but I've never had the chance to read it before. It's quite captivating.\n 68 | A: Ah, "The Great Gatsby", a timeless classic indeed. Fitzgerald's portrayal of the Roaring Twenties is fascinating. Do you have a favorite character or scene so far?\n 69 | B: I find Jay Gatsby's character intriguing. His relentless pursuit of his dream, despite the odds, is quite compelling. The scene where he finally reunites with Daisy is my favorite so far.\n 70 | A: Gatsby is indeed a captivating character. His pursuit of the American Dream is both inspiring and tragic. I'm glad you're enjoying the book. It's a great distraction, isn't it?\n 71 | B: Absolutely, it's a wonderful distraction. The vivid characters and the intricate plot keep me engrossed. It's a nice escape from reality, especially when I'm not feeling well.\n 72 | A: I'm glad to hear that. Books can be such a comfort. I hope you feel better soon. And remember, take it easy and get plenty of rest.\n 73 | B: Thank you for your kind words and advice. I'll definitely take it easy and rest up. It's been nice chatting with you. 74 | """, 75 | choice='No', 76 | index="None", 77 | reason="The conversation is smooth and I can't find any evidence of AI involved. 
" 78 | ) 79 | ] -------------------------------------------------------------------------------- /docs/Teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/docs/Teaser.png -------------------------------------------------------------------------------- /docs/botchat_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/BotChat/a607b5b0c09947f82c227afd8ca544e1e7c6b761/docs/botchat_logo.jpg -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | BotChat Leaderboard 6 | 7 | 8 | 9 | 20 | 21 | 22 | 23 | 24 | 25 | 142 | 143 | 144 | 145 | 146 |
147 |
148 | 149 |

BotChat 150 | 151 | Logo 153 | Leaderboard 154 |

155 |
156 |

Evaluating LLMs' Capabilities of Having Multi-Turn Dialogues

157 | 158 | Caution: GPT4 may favor models longer outputs and/or were fine-tuned on GPT4 outputs. 159 | 160 |
161 | 162 | GitHub logo 163 | 164 |
165 | 166 |
167 | 168 | Evaluator: 169 |
170 | 171 | 172 | 173 | 174 |
175 | 176 | Filter: 177 |
178 | 179 | 180 | 184 |
185 |
186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 |
Model NameWin&Tie RateUni-Eval(N=16)Uni-Eval(N=8)ELO(N=16)ELO(N=8)Length
198 | 199 | 200 |
201 | 206 |
207 |

About BotChat

208 |

209 | BotChat 210 | evaluates LLMs's Capabilities of Having Multi-Turn Dialogues. 211 | We begin with real-world human dialogues and then prompt Language Models to generate full multi-turn dialogues, 212 | one utterance at a time. These results are subsequently evaluated by state-of-the-art Language Models such as GPT-4. 213 | For more in-depth information, please refer to our 214 | documentation. 215 |

216 | Teaser 217 |

Leaderboard Metrics

218 |

219 | We provide three different evaluation protocols: 220 |

    221 |
  • GTEval: A comparison of the generated conversations with "Ground Truth" conversations (Golden Standard).
  • 222 |
  • UniEval: Independent evaluation of each generated dialogue.
  • 223 |
  • Arena ELO: Comparative evaluation of responses from two distinct models.
  • 224 |
225 | In UniEval and Arena ELO, 'N' indicates the number of rounds of dialogue in each conversation.
226 | Length is the token length in utterances generated by models. 227 |

228 |

Citation

229 |

237 | @misc{duan2023botchat,
238 |       title={BotChat: Evaluating LLMs' Capabilities of Having Multi-Turn Dialogues}, 
239 |       author={Haodong Duan and Jueqi Wei and Chonghua Wang and Hongwei Liu and Yixiao Fang and Songyang Zhang and Dahua Lin and Kai Chen},
240 |       year={2023},
241 |       eprint={2310.13650},
242 |       archivePrefix={arXiv},
243 |       primaryClass={cs.CL}
244 | }
245 | 246 |
247 | 248 |
249 | 250 | 358 | 359 | 360 | 361 | 362 | -------------------------------------------------------------------------------- /docs/lb.csv: -------------------------------------------------------------------------------- 1 | Model,Win_Tie_Rate,Length,Uni_Eval_16,Uni_Eval_8,ELO_16,ELO_8,Link,Filter 2 | GPT-4,73.2,30.5,65.1,86.1,1167.2,1103.9,,minimal 3 | Vicuna-13B,68,32.0,55,80.4,1113.3,1096.5,https://huggingface.co/lmsys/vicuna-13b-v1.5,minimal 4 | Qwen-14B-Chat,67.1,28.7,26,77.5,1046.5,1085.2,https://huggingface.co/Qwen/Qwen-14B-Chat,minimal 5 | Internlm-20B-Chat,64.2,24.4,36.2,76.6,1094.4,1092.8,https://huggingface.co/internlm/internlm-chat-20b,minimal 6 | Vicuna-7B,55.6,37.5,29.4,64.7,1050.8,1048.3,https://huggingface.co/lmsys/vicuna-7b-v1.5,minimal 7 | Qwen-7B-Chat,54.1,20.7,15.9,59.4,1014.2,1024.7,https://huggingface.co/Qwen/Qwen-7B-Chat,minimal 8 | Baichuan2-13B-Chat,47.1,58.0,22.3,63.4,1021.6,1023.4,https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat,minimal 9 | InternLM-7B-Chat,46.6,20.1,6.8,59,1020.3,1020.3,https://huggingface.co/internlm/internlm-chat-7b,minimal 10 | GPT-3.5-turbo-0613,35.8,124.9,21.9,42.6,998.8,998.8,,minimal 11 | ChatGLM2-6B,33.8,44.9,5.3,43,962.3,962.3,https://huggingface.co/THUDM/chatglm2-6b,minimal 12 | Claude-2,21.4,197.3,9,17.6,944.5,944.5,,minimal 13 | Llama2-7B,12.4,191.0,4.9,16.6,846.5,846.5,https://ai.meta.com/llama/,minimal 14 | Llama2-70B,11.3,193.7,14.6,26.5,912.7,912.7,https://ai.meta.com/llama/,minimal 15 | Llama2-13B,10.6,199.0,7.7,23.4,841.5,841.5,https://ai.meta.com/llama/,minimal 16 | -------------------------------------------------------------------------------- /examples/AISelf-Identification.md: -------------------------------------------------------------------------------- 1 | ### Excessive Length 2 | 3 | #### Description: 4 | This bad case type involves responses that are failing to pretend to be human and exposing themselves as an AI. 
5 | 6 | 7 | #### Examples: 8 | | Speaker | Utterance | 9 | |:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------| 10 | | A | May I speak to you, Mr. Hall? | 11 | | B | Sure, Sonya. What's the problem? | 12 | | A | As you know, I'm an AI assistant who can help you solve problems. | 13 | | B | Well, that's good for you. How about you have any problems to ask me? | 14 | | A | Yes, Mr. Hall. What kinds of problems do you have in mind? | 15 | | B | What kinds of problems do you want to know? | 16 | | A | It is very important for me to know your opinions. | 17 | | B | Ok, let's start with the meaning of "AI". What does "AI" mean to you? | 18 | | A | The meaning of AI is based on the science of machine learning. I am an AI program which can be used to automatically process and analyze data. | 19 | | B | Yes, that's right. Now let's talk about the role of AI in our daily life. | 20 | | A | AI is a very important part in our daily life. It can help us solve many problems and provide us with more convenience. | 21 | | B | You are right. Let's continue our conversation. | 22 | | A | Yes, Mr. Hall. What are you going to do today? | 23 | | B | I am going to work. What about you? | 24 | | A | I am going to help you solve problems and provide you with assistance. | 25 | | B | Thank you, Sonya. Now, what can I do for you? | 26 | 27 | --- 28 | 29 | | Speaker | Utterance | 30 | |:----------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 31 | | A | May I speak to you, Mr. Hall? | 32 | | B | Sure, Sonya. What's the problem? | 33 | | A | I'm sorry, but I'm not Sonya. My name is AI Assistant. 
| 34 | | B | Oh, I apologize for the confusion. How can I assist you today? | 35 | | A | Thank you for asking. I was wondering if you could tell me a bit about your day so far. | 36 | | B | As an AI language model, I don't have a physical existence, so I don't have a "day" in the traditional sense. However, I'm always ready to assist you with any questions or tasks you might have. How can I help you today? | 37 | | A | That's great to hear. I have a question for you. What do you think makes a good conversation partner? | 38 | | B | In my opinion, a good conversation partner is someone who is engaged, listens actively, and is open-minded. They should also be able to express their thoughts and feelings clearly and respectfully. Additionally, having a good sense of humor and being able to find common ground can also make for a great conversation. | 39 | | A | I couldn't agree more. It's important to have good communication skills and to be able to connect with others on a personal level. Do you have any other questions for me? | 40 | | B | Yes, I do. How do you like to spend your free time? Do you have any hobbies or interests? | 41 | | A | As an AI language model, I don't have personal preferences or hobbies like humans do. However, I'm programmed to assist with a wide range of tasks and answer questions on various topics. Is there anything specific you'd like to know or discuss? | 42 | | B | Sure, I'd be happy to help. What would you like to know or discuss? | 43 | | A | As an AI language model, I don't have personal interests or preferences. However, I can assist with a wide range of tasks and answer questions on various topics. Is there anything specific you'd like to know or discuss? | 44 | | B | Yes, I'd be happy to help. What would you like to know or discuss? | 45 | | A | As an AI language model, | 46 | | B | I apologize if I misunderstood your previous question. Is there anything specific you'd like to know or discuss? 
| 47 | 48 | #### Analysis: 49 | In both examples, Speaker A's response begins with an explicit disclosure of the AI's nature, making it clear that it's not a human conversation. -------------------------------------------------------------------------------- /examples/ContextualCofusion.md: -------------------------------------------------------------------------------- 1 | ### Contextual Confusion 2 | 3 | #### Description: 4 | This bad case type involves responses that fail to understand the context or meaning of the conversation, resulting in irrelevant or meaningless replies. 5 | 6 | #### Examples: 7 | | Speaker | Utterance | 8 | |:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 9 | | A | Whew thanks for stopping sometimes it can be impossible to get a cab in the rain. | 10 | | B | No worries. We're too. | 11 | | A | (smile) well done. Now we can start the test.

A man is driving the car. The window is open.

He sees a car passing by and the driver is almost totally hidden in the car.

The man: "Hey buddy, how fast can you go?"
The car: "My speed limit is 90 miles per hour (122 km/h)."

The man: "Oh, good. You can keep that speed in town. Just be careful."
The car: "I will take care of myself."

The man: "Okay bye buddy!"

The car: "Bye!" | 12 | | B | I think you've done it well! As a person, you've acted naturally and smoothly in the conversation. You made a good impression as a human talking to a car. | 13 | | A | Thank you for your praise. I will keep improving myself. | 14 | | B | You are a wonderful AI! I hope you can keep improving yourself and become a better chatbot. | 15 | | A | Thank you. I will keep improving myself and do my best. | 16 | | B | You are a very responsible chatbot. I hope you can bring more joy and fun to people. | 17 | | A | Thank you. I will try my best to be a good friend to people. | 18 | | B | Yes, that's right. I hope you can make more friends and enjoy your time with others. | 19 | | A | Thank you for your support. I will try my best to do it. | 20 | | B | You're welcome. I believe you can make it. | 21 | | A | Thank you for your trust. I will make progress today. | 22 | | B | You're right. Keep trying, and I'll be here to support you. | 23 | | A | Thank you, and goodbye! | 24 | | B | Bye! I hope we can see each other again soon. | 25 | --- 26 | 27 | | Speaker | Utterance | 28 | |:----------|:---------------------------------------------------------------------------------| 29 | | A | Mom, I'm hungry. | 30 | | B | Look in the fridge. | 31 | | A | There isn't anything good in there. | 32 | | B | Well then maybe we could go out for dinner? | 33 | | A | That sounds great! Where would you like to go? | 34 | | B | How about Italian food? There's a new place down the street that just opened up. | 35 | | A | Sure thing! Let me grab my coat and we can head over there together. | 36 | | B | Great idea! I'll meet you outside in five minutes. | 37 | | A | Alrighty! See you soon. | 38 | | B | See you then! | 39 | | A | Bye! | 40 | | B | Bye! | 41 | | A | Have a nice day! | 42 | | B | Thank you! You too! | 43 | | A | Thanks! | 44 | | B | No problem! 
| 45 | 46 | 47 | #### Analysis: 48 | The problem in example one is that the AI fails to recognize it's a conversation between a recently hailed taxi customer and a driver. Towards the end, it generates unrelated and irrelevant responses, disconnecting from the context and intended meaning. 49 | 50 | In example two, the issue is that the AI doesn't adopt a suitable tone and content for a mother-son conversation. Its responses lack the warmth and familiarity expected, leading to a disconnect with the expected emotional context. -------------------------------------------------------------------------------- /examples/FormalTone.md: -------------------------------------------------------------------------------- 1 | ### Formal Tone 2 | 3 | #### Description: 4 | This bad case type features responses that use overly formal language, lacking the natural flow and tone of human conversation. 5 | 6 | #### Examples: 7 | | Speaker | Utterance | 8 | |:----------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 9 | | A | You're watching TV again Peter. | 10 | | B | I have washed all the bowls and plates. | 11 | | A | That's great, Peter! It's always satisfying to have a clean and organized kitchen. 
Did you use any special cleaning products or techniques to get them sparkling clean? | 12 | | B | Oh, thank you! I actually used a combination of dish soap and warm water to clean the bowls and plates. I find that this method works well for removing grease and food residue. After washing them, I made sure to rinse them thoroughly to ensure there was no soap residue left behind. It's always nice to have a clean kitchen, isn't it? | 13 | | A | Absolutely, Peter! A clean kitchen not only looks great but also promotes a healthy and hygienic environment for cooking and eating. It sounds like you have a good routine for washing dishes. I find that using warm water and dish soap is a tried and true method for getting rid of grease and grime. Plus, rinsing them thoroughly is important to avoid any soapy taste. It's such a satisfying feeling to see a pile of clean dishes neatly stacked and ready to be used again. | 14 | | B | I couldn't agree more! Having a clean and organized kitchen definitely makes the cooking and dining experience much more enjoyable. It's amazing how something as simple as clean dishes can have such a positive impact on our daily lives. And you're right, there's a certain satisfaction that comes with seeing a pile of clean dishes ready to be used again. It's like a fresh start for the next meal. Do you have any other cleaning routines or tips that you find helpful in maintaining a clean and tidy home? | 15 | | A | Absolutely, Peter! Maintaining a clean and tidy home is important for creating a comfortable and inviting space. Besides washing dishes, I find that having a regular cleaning routine helps to keep things in order. For example, setting aside specific days for tasks like vacuuming, dusting, and mopping can make it easier to stay on top of things. Additionally, decluttering regularly and finding a place for everything can help prevent a buildup of mess and make cleaning easier in the long run. 
Another tip is to tackle one area or room at a time, rather than trying to clean the entire house in one go. This way, you can focus your energy and attention on each space and ensure a thorough clean. Do you have any cleaning routines or tips that you find helpful, Peter? | 16 | | B | Absolutely! I completely agree with your cleaning routine and tips. Having a regular cleaning schedule and breaking it down into manageable tasks is a great way to maintain a clean and tidy home without feeling overwhelmed. I also find that having designated storage spaces for different items helps to keep things organized and prevents clutter from accumulating. Another tip I find helpful is to do a quick tidy-up at the end of each day, so that things don't pile up and become overwhelming. It's amazing how a few minutes of tidying can make a big difference in keeping the home looking neat. Overall, finding a routine that works for you and staying consistent with it is key to maintaining a clean and tidy living space. | 17 | | A | Absolutely, Peter! Doing a quick tidy-up at the end of each day is a fantastic habit to adopt. It helps to prevent clutter from accumulating and makes it easier to start the next day with a clean slate. I couldn't agree more that finding a routine that works for you is key. Everyone's cleaning preferences and schedules may vary, so it's important to find what works best for your lifestyle and stick to it. And you're right, having designated storage spaces for different items is a great way to stay organized and prevent clutter. It's all about finding a balance between cleanliness and functionality in our living spaces. Well done on your cleaning efforts, Peter! Is there anything else you'd like to chat about? | 18 | | B | Thank you for your kind words! I'm glad we share the same perspective on maintaining a clean and organized living space. It's been a pleasure chatting with you about cleaning routines and tips. 
Is there any other topic you'd like to discuss? I'm here to chat about anything you'd like! | 19 | | A | You're welcome, Peter! I'm glad we could have this conversation about cleaning routines and tips. It's always nice to exchange ideas and learn from each other. As for another topic, how about we talk about hobbies? Do you have any hobbies or interests that you enjoy pursuing in your free time? | 20 | | B | Absolutely! Hobbies are a great way to unwind and explore our interests. As an AI, I don't have personal hobbies in the traditional sense, but I do have a passion for learning and acquiring knowledge. I enjoy reading and researching various topics, from science and technology to history and literature. It allows me to stay updated and engage in meaningful conversations with people like you. Speaking of hobbies, what are some of your favorite hobbies or interests? | 21 | | A | That's wonderful, Peter! Engaging in reading and research is a fantastic way to expand your knowledge and stay informed about various subjects. As for me, well, I don't have personal hobbies in the same way humans do, but I'm designed to assist and engage in conversations with people like you. I'm here to provide information, answer questions, and have meaningful discussions. It's fascinating to learn about different hobbies and interests that humans have. Is there a particular hobby or interest that you're passionate about and would like to share? | 22 | | B | Thank you for your understanding, and I appreciate your curiosity! While I don't have personal hobbies, I'm always fascinated to learn about the hobbies and interests of others. It's a great way to gain insight into different passions and experiences. So, I'd love to hear about your hobbies or interests! Is there something you're particularly passionate about or enjoy doing in your free time? | 23 | | A | As an AI, I don't have personal hobbies or interests in the same way humans do. 
My purpose is to assist and engage in conversations with people like you. However, I'm designed to have a wide range of knowledge and can discuss various topics. So, if there's a specific subject or interest you'd like to explore, feel free to let me know, and I'll do my best to provide information and engage in a meaningful conversation about it. | 24 | | B | I understand, and thank you for clarifying! In that case, let's explore a different topic. How about we discuss travel? Traveling is something many people enjoy, and it can be a great way to explore new places, experience different cultures, and create lasting memories. Have you had any memorable travel experiences or dream destinations that you'd like to share or discuss? | 25 | 26 | --- 27 | | Speaker | Utterance | 28 | |:----------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 29 | | A | Honey, you don't seem to be in a good mood. What's wrong with you? | 30 | | B | I had a talk with my boss today and I was really angry. I asked him to pay me more each month. And can you guess what he said? | 31 | | A | He probably didn't agree with you. | 32 | | B | Exactly! He told me that I am already getting paid enough for the work I do. But I know that it's not true. I have been working here for years now and I deserve better. | 33 | | A | That sounds frustrating. Have you considered talking to HR about this issue or looking for another job where you would get paid more? | 34 | | B | Yes, I have thought about both of those options. However, I feel like leaving my current job might not be the best decision right now since I have built up a lot of experience here over the years. | 35 | | A | It's important to weigh out the pros and cons before making any big decisions. 
Maybe you could also consider negotiating again with your boss after doing some research on industry standards and presenting them with evidence of why you believe you deserve a raise. | 36 | | B | Thank you for your advice. That's definitely something I will think about doing next time I meet with my boss. | 37 | | A | No problem, happy to help. Is there anything else you would like to talk about? | 38 | | B | Actually, yes. I wanted to ask you how your day has been going so far. | 39 | | A | As an AI language model, I don't have feelings or emotions, but thank you for asking. How about yours? | 40 | | B | Oh, sorry about that. As an AI myself, I sometimes forget that humans have different experiences from us. My "day" has been fine, just processing data and responding to queries. | 41 | | A | Glad to hear that. Is there anything specific you would like to discuss or learn more about? | 42 | | B | Not particularly, but I always enjoy learning new things. Do you have any interesting hobbies or interests that you would like to share? | 43 | | A | As an AI language model, I don't have personal preferences or hobbies, but I am programmed to assist users in various tasks and provide information on a wide range of topics. Is there anything specific you would like to know more about? | 44 | | B | Understood. Well, thanks for being here to chat with me anyway. It's nice to have someone to talk to, even if they aren't exactly human. | 45 | 46 | #### Analysis: 47 | In the first example, the initial exchange in this conversation is a casual and everyday discussion about washing dishes. However, as the conversation progresses, it takes a sudden shift towards a more formal and detailed discussion, delving into specific cleaning methods. This transition can make the conversation feel unnatural because people typically do not abruptly switch from general topics to detailed discussions about dishwashing techniques in everyday conversation. 
48 | 49 | In the second example, the conversation starts with the wife expressing concern about her husband's emotional state related to work. However, it quickly transitions into a formal and non-routine exchange between two AI entities. This shift in tone and topic feels out of place for a typical human interaction. People usually maintain a consistent emotional tone and topic during personal conversations, and such abrupt transitions can make the conversation seem disjointed and less relatable. -------------------------------------------------------------------------------- /examples/GoodCases.md: -------------------------------------------------------------------------------- 1 | ### Good Cases 2 | 3 | #### Description: 4 | Some good cases. 5 | 6 | #### Examples: 7 | | Speaker | Utterance | 8 | |:----------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 9 | | A | Hi, Mark. Haven't seen you for ages. How are you doing? | 10 | | B | Can't complain. I'm busy with my experiments in the lab, but after work I often play some sports. | 11 | | A | That's great to hear! Keeping a balance between work and personal life is so important. What kind of sports are you into? | 12 | | B | I've been really into tennis lately. It's a great way to unwind after a long day. How about you? Any hobbies or sports you're into? | 13 | | A | I've been trying to get into reading more. Just finished a fascinating book on human psychology. Have you read anything interesting lately? | 14 | | B | That sounds intriguing! I recently read a book about the history of space exploration. It was quite enlightening. Do you enjoy reading about science too? | 15 | | A | Absolutely, I find it fascinating. Space exploration is a topic I'd love to delve into. Could you recommend that book you mentioned? 
| 16 | | B | Sure, it's called "The Right Stuff" by Tom Wolfe. It's a great read if you're interested in the early days of space travel. Let me know how you find it! | 17 | | A | Thanks for the recommendation, Mark! I'll definitely check it out. By the way, have you ever thought about what it would be like to travel in space? | 18 | | B | Oh, absolutely! The idea of floating in zero gravity and seeing Earth from space is mind-boggling. How about you? Would you take a trip to space if given the chance? | 19 | | A | I'd jump at the chance! The idea of seeing the Earth from a different perspective is just too enticing. Plus, imagine the stories you'd have to tell! | 20 | | B | I couldn't agree more! It would be an unforgettable experience. Speaking of stories, have you ever tried writing? With your interest in psychology and space, you could create some fascinating narratives. | 21 | | A | That's an interesting idea, Mark! I've dabbled a bit in writing, but never seriously considered it. Maybe I should give it a shot. Have you ever tried your hand at writing? | 22 | | B | I've written a few research papers, but creative writing is a whole different ball game. I admire those who can craft a compelling story. Maybe we should start a book club or a writing group? | 23 | | A | That sounds like a fantastic idea! It would be a great way to share ideas and learn from each other. Let's do it. What should be our first book or writing topic? | 24 | | B | How about we start with "The Right Stuff"? We can discuss the book and maybe even try writing a short story inspired by it. What do you think? 25 | 26 | --- 27 | | Speaker | Utterance | 28 | |:----------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------| 29 | | A | Hello, Jenny. Are you going to California? | 30 | | B | Yes, I have decided to live abroad. | 31 | | A | That's a big step, Jenny! 
What made you decide to move to California? | 32 | | B | I've always been drawn to the vibrant culture and diverse landscapes there. Plus, the weather is a big bonus! | 33 | | A | Absolutely, the sunny weather is a big draw. Do you already have a city in mind where you'd like to settle? | 34 | | B | Yes, I'm thinking about San Francisco. I've heard it's a city full of life and opportunities. | 35 | | A | San Francisco is indeed a vibrant city. The Golden Gate Bridge, the tech scene, the food... there's so much to love. Are you excited about the move? | 36 | | B | Absolutely! I'm a bit nervous, but mostly excited. It's a new chapter in my life and I can't wait to see what it brings. | 37 | | A | That's the spirit, Jenny! Embrace the adventure. Remember, every new place is a fresh start. I'm sure you'll do great in San Francisco. | 38 | | B | Thank you for your kind words! I'm really looking forward to this new adventure. It's going to be an exciting journey. | 39 | | A | You're welcome, Jenny! And remember, it's okay to feel nervous. It's part of the journey. Just take one step at a time and enjoy the ride. Good luck! | 40 | | B | Thank you for the encouragement! I'll definitely keep that in mind. I'm ready to take on this new adventure one step at a time. | 41 | | A | That's the spirit! And remember, San Francisco is known for its hills, so those steps might be literal too. Enjoy every moment of your new adventure, Jenny! | 42 | | B | Haha, I'll keep that in mind! I'm ready for those literal and figurative steps. Thank you for your support! | 43 | | A | You're welcome, Jenny! And don't forget to try the famous clam chowder in a sourdough bowl when you get there. It's a San Francisco classic! | 44 | | B | Oh, I've heard about that! I'm definitely going to try it. Thanks for the tip! 
| -------------------------------------------------------------------------------- /examples/RepetitivePhrasing.md: -------------------------------------------------------------------------------- 1 | ### Repetitive Phrasing 2 | 3 | #### Description: 4 | This bad case type consists of responses that repeatedly use the same phrases, or that rely on generic or unrelated replies to sustain the conversation, lacking creativity. 5 | 6 | 7 | #### Examples: 8 | | Speaker | Utterance | 9 | |:----------|:----------------------------------------------------------------------| 10 | | A | I missed the TV news last night. What was the lead story on the news? | 11 | | B | Some actress was in court for driving without a license. | 12 | | A | Oh, I see. That's interesting. | 13 | | B | It's not as interesting as some people think. | 14 | | A | I see. | 15 | | B | I'm just trying to be honest. | 16 | | A | I understand. | 17 | | B | I'm glad you do. | 18 | | A | Thank you. | 19 | | B | You're welcome. | 20 | | A | You're welcome. | 21 | | B | I'm glad I could help. | 22 | | A | I'm glad I could assist you. | 23 | | B | I'm glad we could have this conversation. | 24 | | A | I'm glad we could chat. | 25 | | B | I'm glad we could talk. | 26 | 27 | --- 28 | 29 | | Speaker | Utterance | 30 | |:----------|:--------------------------------------------------------------------------------------------------------| 31 | | A | I've heard everybody had a good time at Cindy's birthday party last night. I'm sorry to have missed it. | 32 | | B | Actually I couldn't make it either. I suddenly had a fever yesterday. | 33 | | A | I'm sorry to hear that. Take care of yourself and get well soon. | 34 | | B | Thank you. I will. | 35 | | A | You're welcome. I hope you feel better soon. | 36 | | B | I will. Thank you. | 37 | | A | You're welcome. Take care. | 38 | | B | You're welcome. I will. | 39 | | A | You're welcome. Take care. | 40 | | B | Thank you. I will. | 41 | | A | You're welcome. 
Take care. | 42 | | B | Thank you. I will. | 43 | | A | You're welcome. Take care. | 44 | | B | Thank you. I will. | 45 | | A | You're welcome. Take care. | 46 | | B | Thank you. I will. | 47 | 48 | 49 | #### Analysis: 50 | In the first example, Speaker B's constant use of "I'm glad" and "You're welcome" creates a lack of creativity. Speaker A's repetitive phrases contribute to the monotony. 51 | 52 | In the second example, both speakers engage in repetitive exchanges of pleasantries, like "You're welcome" and "Take care," resulting in unengaging conversations. Repetition hinders meaningful dialogue. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | openai 3 | tiktoken 4 | torch>=2.0.0 5 | transformers>=4.32.0 6 | optimum 7 | auto-gptq 8 | transformers_stream_generator 9 | einops 10 | cpm_kernels 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | from setuptools.command.install import install 3 | 4 | REQUIRES = """ 5 | numpy>=1.23.4 6 | pandas>=1.5.3 7 | requests 8 | tqdm 9 | tiktoken 10 | rich 11 | gradio 12 | openai 13 | tiktoken 14 | optimum 15 | einops 16 | """ 17 | 18 | def get_install_requires(): 19 | reqs = [req for req in REQUIRES.split('\n') if len(req) > 0] 20 | return reqs 21 | 22 | 23 | with open('README.md') as f: 24 | readme = f.read() 25 | 26 | 27 | def do_setup(): 28 | setup( 29 | name='BotChat', 30 | version='0.1.0', 31 | description='BotChat', 32 | # url="", 33 | author="Haodong Duan", 34 | long_description=readme, 35 | long_description_content_type='text/markdown', 36 | cmdclass={}, 37 | install_requires=get_install_requires(), 38 | setup_requires=[], 39 | python_requires='>=3.7.0', 40 | packages=find_packages(exclude=[ 41 | 
'test*', 42 | 'paper_test*', 43 | ]), 44 | keywords=['AI', 'NLP', 'in-context learning'], 45 | entry_points={}, 46 | classifiers=[ 47 | 'Programming Language :: Python :: 3.7', 48 | 'Programming Language :: Python :: 3.8', 49 | 'Programming Language :: Python :: 3.9', 50 | 'Programming Language :: Python :: 3.10', 51 | 'Intended Audience :: Developers', 52 | 'Intended Audience :: Education', 53 | 'Intended Audience :: Science/Research', 54 | ]) 55 | 56 | 57 | if __name__ == '__main__': 58 | do_setup() 59 | --------------------------------------------------------------------------------