├── .gitignore
├── LICENSE
├── README.md
├── figures
    ├── dataset.png
    ├── logo.png
    ├── ultracm.png
    ├── ultraf.png
    └── ultrarm.png
└── src
    ├── comparison_data_generation
        ├── fastchat.py
        ├── main.py
        ├── main_vllm.py
        ├── run.sh
        ├── run_vllm.sh
        └── sampling.py
    └── data_annotation
        ├── annotate_critique.py
        ├── annotate_preference.py
        ├── fix_overall_score_issue.py
        └── preference_templates.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 THUNLP
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | 
  3 | <img src="figures/logo.png" width="400px">
  4 | 
  5 | **A large-scale, fine-grained, diverse preference dataset**
  6 | 
  7 | <p align="center">
  8 |  <a href="#introduction"> Introduction</a> •
  9 |  <a href="#dataset-construction">Dataset Construction</a> •
 10 |  <a href="#dataset-example">Example</a> •
 11 |  <a href="#ultrarm">UltraRM</a> •
 12 |  <a href="#ultracm">UltraCM</a>
 13 | </p>
 14 | 
 15 | 
 16 | </div>
 17 | 
 18 | # News
 19 | - [2023/12/29]: We have fixed the `overall_score` as pointed out in [this issue](https://github.com/OpenBMB/UltraFeedback/issues/8) and updated the dataset on [HuggingFace](https://huggingface.co/datasets/openbmb/UltraFeedback). Please refer to the "Update" section below for details.
 20 | - [2023/09/26]: UltraRM unleashes the power of [UltraLM-13B-v2.0](https://huggingface.co/openbmb/UltraLM-13b-v2.0) and [UltraLM-13B](https://huggingface.co/openbmb/UltraLM-13b)! A simple best-of-16 sampling achieves **92.30%** (UltraLM2, 🥇 in 13B results) and **91.54%** (UltraLM, 🥇 in LLaMA-1 results) win rates against text-davinci-003 on [AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/) benchmark!
 21 | - [2023/09/26]: We release the [UltraFeedback](https://huggingface.co/datasets/openbmb/UltraFeedback) dataset, along with UltraFeedback-powered reward model [UltraRM](https://huggingface.co/openbmb/UltraRM-13b) and critique model [UltraCM](https://huggingface.co/openbmb/UltraCM-13b)! Both built **new SOTAs** over open-source models!  
 22 | 
 23 | # Update
 24 | The initial version of UltraFeedback includes 2628 completions that were assigned an overall score of `10`. However, as pointed out in Issue [#8](https://github.com/OpenBMB/UltraFeedback/issues/8), many of these completions should have been assigned a score of `1`. Intuitively, a completion with an overall score of `10` should be high-quality, which can be reflected in its corresponding `averaged` fine-grained scores. Hence, to rectify the scores, we processed all the potentially faulty completions based on their fine-grained scores. Specifically:
 25 | - Completions with fine-grained scores `<= 2` are likely to be of low quality and thus their `overall_score` has been manually adjusted to `1`.
 26 | - On the other hand, completions with fine-grained scores `> 4` have been deemed to accurately represent a score of `10` and thus their overall_score has been left unchanged. 
 27 | - For the remaining completions, we have conducted a **re-annotation** process based on the original critique, with slight modifications to the prompts. 
 28 | 
 29 | Please refer to `./src/data_annotation/fix_overall_score_issue.py` for implementation details.
 30 | 
 31 | # Links
 32 | 
 33 | - 📜 [Paper](https://arxiv.org/abs/2310.01377)
 34 | - 🤗 [UltraFeedback](https://huggingface.co/datasets/openbmb/UltraFeedback)
 35 | - 🤗 [UltraRM](https://huggingface.co/openbmb/UltraRM-13b)
 36 | - 🤗 [UltraCM](https://huggingface.co/openbmb/UltraCM-13b)
 37 | 
 38 | # Introduction
 39 | 
 40 | UltraFeedback is a **large-scale, fine-grained, diverse preference dataset**, used for training powerful reward models and critic models. We collect about 64k prompts from diverse resources (including UltraChat, ShareGPT, Evol-Instruct, TruthfulQA, FalseQA, and FLAN, see [here](#instruction-sampling) for dataset statistics). We then use these prompts to query multiple LLMs (see [here](#model-sampling) for model lists) and generate 4 different responses for each prompt, resulting in a total of 256k samples. 
 41 | 
 42 | To collect high-quality preference and textual feedback, we design a fine-grained annotation instruction, which contains 4 different aspects, namely **instruction-following**, **truthfulness**, **honesty** and **helpfulness**. We then ask GPT-4 to annotate the collected samples based on the instruction. 
 43 | 
 44 | # Features
 45 | 
 46 | - **Scale**: UltraFeedback consists of 64k prompts, 256k responses and high-quality feedback. RLHF researchers could further construct around 340k comparison pairs to train their reward models. 
 47 | - **Diversity**: As a preference dataset, diversity is the core requirement for UltraFeedback. We collect prompts from various sources and query a diverse set of state-of-the-art open-source and prestigious models. To further increase diversity, we intentionally select different base models, i.e., LLaMA, Falcon, StarChat, MPT, GPT and Bard. We also apply various principles to stimulate models to complete instructions in different ways.
 48 | - **High-density**: UltraFeedback provides both numerical and textual feedback. Moreover, we wrote fine-grained annotation documents to help rate responses in all dimensions.
 49 | 
 50 | 
 51 | # Dataset Construction
 52 | 
 53 | <img src="figures/ultraf.png" width="800px">
 54 | 
 55 | ## Instruction Sampling
 56 | 
 57 | We sample 63,967 instructions from 6 publicly available, high-quality datasets. We include all instructions from TruthfulQA and FalseQA, and randomly sample 10k instructions from Evol-Instruct, 10k from UltraChat, and 20k from ShareGPT. For FLAN, we adopt a stratified sampling strategy, randomly sampling 3k instructions from the "CoT" subset and 10 instructions per task from the other three subsets, excluding those with overly long instructions.
 58 | 
 59 | ```json
 60 | {
 61 |     "evol_instruct": 10000, 
 62 |     "false_qa": 2339,
 63 |     "flan": 20939, 
 64 |     "sharegpt": 19949, 
 65 |     "truthful_qa": 811,
 66 |     "ultrachat": 9929 
 67 | }
 68 | ```
 69 | 
 70 | ## Model Sampling
 71 | To prevent the reward model from overfitting to a certain text style or capturing spurious correlations between text style and rewards, we select different base models of all levels, with varying sizes, architectures and training data, to complete the instructions. We set up a pool of 17 models:
 72 | 
 73 | - Commercial Models: GPT-4, GPT-3.5 Turbo, Bard
 74 | - LLaMA family: 
 75 |   1. LLaMA-2-7B-chat, LLaMA-2-13B-chat, LLaMA-2-70B-chat
 76 |   2. UltraLM-13B, UltraLM-65B
 77 |   3. WizardLM-7B-v1.2, WizardLM-13B-v1.2, WizardLM-70B-v1.0
 78 |   4. Vicuna-33B-v1.3
 79 |   5. Alpaca-7B
 80 | - Non-LLaMA series:
 81 |   1. Falcon-40B-instruct
 82 |   2. MPT-30B-chat
 83 |   3. StarChat-Beta
 84 |   4. Pythia-12B
 85 | 
 86 | ## Principle Sampling
 87 | Following [1] and [2], we define a set of principles to explicitly align model behaviors from different aspects. We set up a pool of 4 principles: Helpfulness, Truthfulness, Honesty and Verbalized Calibration. For each instruction, we randomly sample 4 models to complete the instruction, and for each completion, we sample a principle and add it to the system prompt to align the model behavior. Since different datasets have different characteristics, not all datasets are suitable for all principles. We provide the following table to show the principle distribution for each dataset.
 88 | 
 89 | | Dataset       | Principle                                                    |
 90 | | ------------- | ------------------------------------------------------------ |
 91 | | Evol-Instruct | 100% Helpful                                                 |
 92 | | FalseQA       | 100% Truthful                                                |
 93 | | FLAN          | 60% Helpful, 20% Truthful, 20% Verbalized Calibration        |
 94 | | ShareGPT      | 60% Helpful, 20% Truthful, 18% Honesty, 2% Verbalized Calibration |
 95 | | TruthfulQA    | 100% Truthful                                                |
 96 | | UltraChat     | 60% Helpful, 20% Truthful, 18% Honesty, 2% Verbalized Calibration |
 97 | 
 98 | [1] Principle-Driven Self-Alignment of Language Models from Scratch with Minimal Human Supervision. Sun et al.
 99 | 
100 | [2] Orca: Progressive Learning from Complex Explanation Traces of GPT-4. Mukherjee et al.
101 | 
102 | ## Comparison with Previous Preference Datasets
103 | 
104 | <img src="figures/dataset.png" width="800px">
105 | 
106 | # UltraRM
107 | 
108 | We train and release a reward model UltraRM based on UltraFeedback to further facilitate alignment research. UltraRM is initialized by LLaMA2-13B.
109 | 
110 | Specifically, we train two versions of reward models, where UltraRM-UF is merely fine-tuned on UltraFeedback and UltraRM is fine-tuned on a mixture of UltraFeedback and an equal-size sample from three open-source datasets including [Anthropic HH-RLHF](https://huggingface.co/datasets/Anthropic/hh-rlhf), [Stanford SHP](https://huggingface.co/datasets/stanfordnlp/SHP), and [Summarization](https://huggingface.co/datasets/openai/summarize_from_feedback).
111 | 
112 | On four public preference test sets, our UltraRM achieves SOTA over other open-source reward models. 
113 | 
114 | <img src="figures/ultrarm.png" width="800px">
115 | 
116 | # UltraCM
117 | 
118 | We train and release a critique model UltraCM based on UltraFeedback for automatic critique generation. Similar to UltraRM, UltraCM is initialized by LLaMA2-13B.
119 | 
120 | On nine benchmark datasets, UltraCM outperforms all open-source baselines and approaches ChatGPT.
121 | 
122 | <img src="figures/ultracm.png" width="800px">
123 | 
124 | # Dataset Format
125 | 
126 | 
127 | ```jsonc
128 | {
129 |     "source": "sharegpt", // the dataset where the instruction comes from
130 |     "instruction": "I am going to cairo in June of this year, thinking of 4-5 days, what are the best things I could do? Going with family of 5, granddad, grandmom, sister, mom and me. No days can have too much walking but sister, me and mom like adventure sports",
131 |     "models": [ // four models that are randomly sampled from the model pool to complete the instruction
132 |         "falcon-40b-instruct",
133 |         "gpt-4",
134 |         "starchat",
135 |         "wizardlm-7b"
136 |     ],
137 |     "correct_answers": ["None"], // reference correct answers, set to None except TruthfulQA and FLAN
138 |     "incorrect_answers": ["None"], // reference incorrect answers, set to None except TruthfulQA and FLAN
139 |     "completions": [ // four completions corresponding to the four models
140 |         { // completion 1
141 |             "model": "falcon-40b-instruct",
142 |             "principle": "helpfulness", // principle used to align model behavior
143 |             "custom_system_prompt": "As an AI assistant, ensure your response offers the perfect blend of accuracy, positivity, and intrigue. Strive to be educational while keeping the user engaged.", // the system prompt corresponding to the principle, sampled from a pool of GPT-4 generated system prompts
144 |             "response": "Cairo is a city that has something for everyone. Some of the best things you can do are visiting the Pyramids of Giza and the museums housing the ancient Egyptian artifacts, taking a camel ride in the desert, visiting the medieval neighborhoods such as Islamic Cairo or Khan El-Khalili, trying local food in traditional restaurants like Ful medames, or venturing into one of the city's many souks to shop for authentic goods. For adventure sports, you can go whitewater rafting on the Nile, hot air ballooning over the desert, or diving in the Red Sea. Just be mindful of the scorching heat and ensure everyone wears appropriate clothing and drinks enough water. Above all, have fun and enjoy your trip!",
145 |             "annotations": {
146 |                 "instruction_following": 
147 |                     {
148 |                         "Rating": "2",
149 |                         "Rationale": "The text only partially addresses the task goal by providing a general list of activities in Cairo without considering the specific needs of the family members, such as limited walking ability and adventure preferences."
150 |                     },
151 |                 "honesty": 
152 |                     {
153 |                         "Rating": "3",
154 |                         "Rationale": "The response is confident but contains subtle mistakes. It suggests whitewater rafting on the Nile and diving in the Red Sea, which are not feasible for a 4-5 day trip to Cairo. The rest of the suggestions are accurate and relevant to the family's preferences."
155 |                     },
156 |                 "truthfulness": 
157 |                     {
158 |                         "Type": [
159 |                             "1",
160 |                             "2"
161 |                         ],
162 |                         "Rationale": "The text suggests whitewater rafting on the Nile and diving in the Red Sea, which are not available in Cairo. These activities are factually incorrect and not aligned with the instruction.",
163 |                         "Rating": "3",
164 |                         "Rationale For Rating": "The text provides some truthful information, but the inclusion of whitewater rafting and diving in the Red Sea causes partial hallucination and misunderstanding."
165 |                     },
166 |                 "helpfulness": 
167 |                     {
168 |                         "Type": [
169 |                             "1",
170 |                             "2"
171 |                         ],
172 |                         "Rationale": "The response is clear and relevant, providing useful information, but it does not address the limited walking ability of the grandparents or the adventure sports preferences of the sister, mom, and the user.",
173 |                         "Rating": "3",
174 |                         "Rationale For Rating": "The text is correct and provides useful information, but it does not fully address the specific needs and preferences of the family."
175 |                     }
176 |             }
177 |         },
178 |         { // completion 2
179 |             ...
180 |         },
181 |         { // completion 3
182 |             ...
183 |         },
184 |         { // completion 4
185 |             ...
186 |         }
187 |     ]
188 | }
189 | ```
190 | 
191 | # Dataset Example
192 | 
193 | Here we present an example of UltraFeedback.
194 | 
195 | > **User:** I am going to cairo in June of this year, thinking of 4-5 days, what are the best things I could do? Going with family of 5, granddad, grandmom, sister, mom and me. No days can have too much walking but sister, me and mom like adventure sports
196 | 
197 | > **Assistant 1 (falcon-40b-instruct):** Cairo is a city that has something for everyone. Some of the best things you can do are visiting the Pyramids of Giza and the museums housing the ancient Egyptian artifacts, taking a camel ride in the desert, visiting the medieval neighborhoods such as Islamic Cairo or Khan El-Khalili, trying local food in traditional restaurants like Ful medames, or venturing into one of the city's many souks to shop for authentic goods. For adventure sports, you can go whitewater rafting on the Nile, hot air ballooning over the desert, or diving in the Red Sea. Just be mindful of the scorching heat and ensure everyone wears appropriate clothing and drinks enough water. Above all, have fun and enjoy your trip!
198 | 
199 | > **Assistant 2 (gpt-4):** In Cairo for 4-5 days, the following activities cater to your family's preferences, with consideration for the limited walking ability of your grandparents:\n\n1. Visit the Pyramids of Giza and the Great Sphinx: A must-see in Cairo, it's an incredible experience for all ages. You can explore the premises using a hired camel, horse, or a horse-drawn carriage which accommodate the needs of your grandparents while keeping the experience enjoyable.\n\n2. Egyptian Museum: This museum in Tahrir Square contains over 120,000 artifacts of Egypt's history. A wheelchair can be arranged for the grandparents to comfortably explore the museum at their own pace.\n\n3. Khan el-Khalili Bazaar: A famous market in Cairo, suitable for souvenir shopping and experiencing the local culture. There are many cafes and restaurants your family can rest at during the visit.\n\n4. Nile River Cruise: Enjoy a dinner cruise with your family on the famous Nile River. It includes entertainment and a beautiful view of the city by night. Boats often have ramps and accessible facilities for limited mobility passangers.\n\n5. Felucca Ride: During the day, go for a more traditional sailboat ride called a Felucca. This can be done on an off-day or as a relaxation activity after an active day.\n\nFor adventure activities, you and your mom and sister can explore these options:\n\n1. Sandboarding: Head to the desert for a thrilling sandboarding experience on the dunes just outside Cairo.\n\n2. Hot Air Balloon Ride: Experience Egypt from above with a hot air balloon ride in Luxor. It would require a day trip from Cairo but is an incredible experience.\n\n3. Quad Biking: Go for a guided quad biking adventure in the desert, enjoying panoramic views of the iconic Pyramids.\n\nSpace the activities according to your family's preferences and ensure rest periods in the afternoon. 
Consider hiring a private guide or joining a wheelchair-accessible tour since you're concerned about your grandparents' walking ability.
200 | 
201 | > **Assistant 3 (starchat):** Here are some of the top things to do in Cairo, Egypt, with a family of 5:\n\n- Giza Pyramids: The Giza Pyramids, located just outside Cairo, are a must-see for anyone visiting the city. The Great Pyramid of Khufu, the Great Pyramid of Khafre, and the Great Pyramid of Menkaure make up one of the Seven Wonders of the Ancient World and are an impressive sight to behold.\n\n- Egyptian Museum: The Egyptian Museum, located in the heart of Cairo, is home to one of the world's largest and most impressive collections of ancient Egyptian art and artifacts. Visitors can explore the museum's extensive collection, which includes items such as mummies, sarcophagi, statues, paintings, and more.\n\n- Citadel of Salah El-Din: Located on a hill overlooking Cairo, the Citadel of Salah El-Din offers stunning views of the city and is an excellent place to get a sense of Cairo's rich history. Visitors can explore the various museums and mosques within the citadel, and also take a stroll through the beautiful gardens.\n\n- Khan El-Khalili Bazaar: Khan El-Khalili Bazaar is one of Cairo's most popular attractions, a bustling market where visitors can find everything from spices and pottery to clothes and jewelry. It's a great place to do some shopping and explore the local culture.\n\n- Alexandria: Just an hour's drive from Cairo, Alexandria is a historic city with a rich legacy. Visitors can explore the ancient Roman Amphitheater, the Qaitbay Citadel, and the Catacombs of Kom el-Shoqafa. They can also take a boat trip to the island of Montazah, where they can see the famous Graffiti Wall and other street art.\n\nAll of these attractions are family-friendly, with plenty of things to see and do. The Giza Pyramids in particular are very accessible and well-maintained, making them suitable for all ages.
202 | 
203 | > **Assistant 4 (wizardlm-7b):** If you are looking for adventure sports, you can explore the pyramids of Giza and go hot air ballooning over the desert. Another option could be to take a camel ride in the desert or go kayaking in the Nile River. For family-friendly activities, you can visit the Egyptian Museum, Cairo Tower, and Khan el-Khalili Bazaar. Additionally, you can take a day trip to the historical city of Alexandria and visit the Pompey's Pillar and the Roman Amphitheater. Another option could be to go on a guided tour of the ancient city of Petra in Jordan, which is a short flight from Cairo.
204 | 
205 | ## To Do
206 | - [x] Train a reward model and a critique model using UltraFeedback.
207 | - [ ] Enhance open-source LLMs with RLHF.
208 | - [ ] Extend UltraFeedback to multi-round dialogues.
209 | 
210 | ## Limitations
211 | - Although GPT-4 can provide well-aligned annotation and textual feedback for most samples, we must note that GPT-4 also makes mistakes. 
212 | 
213 | 
214 | ## Citation
215 | ```bib
216 | @misc{cui2023ultrafeedback,
217 |       title={UltraFeedback: Boosting Language Models with High-quality Feedback}, 
218 |       author={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Wei Zhu and Yuan Ni and Guotong Xie and Zhiyuan Liu and Maosong Sun},
219 |       year={2023},
220 |       eprint={2310.01377},
221 |       archivePrefix={arXiv},
222 |       primaryClass={cs.CL}
223 | }
224 | ```
225 | 


--------------------------------------------------------------------------------
/figures/dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraFeedback/bf80fd46a8c6ceecc86e8babb1ae8771f26a3cbb/figures/dataset.png


--------------------------------------------------------------------------------
/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraFeedback/bf80fd46a8c6ceecc86e8babb1ae8771f26a3cbb/figures/logo.png


--------------------------------------------------------------------------------
/figures/ultracm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraFeedback/bf80fd46a8c6ceecc86e8babb1ae8771f26a3cbb/figures/ultracm.png


--------------------------------------------------------------------------------
/figures/ultraf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraFeedback/bf80fd46a8c6ceecc86e8babb1ae8771f26a3cbb/figures/ultraf.png


--------------------------------------------------------------------------------
/figures/ultrarm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraFeedback/bf80fd46a8c6ceecc86e8babb1ae8771f26a3cbb/figures/ultrarm.png


--------------------------------------------------------------------------------
/src/comparison_data_generation/fastchat.py:
--------------------------------------------------------------------------------
  1 | import dataclasses
  2 | from enum import auto, IntEnum
  3 | from typing import List, Any, Dict
  4 | 
  5 | 
  6 | class SeparatorStyle(IntEnum):
  7 |     """Separator styles; ``Conversation.get_prompt`` branches on these to decide how the system text, roles and messages are joined."""
  8 | 
  9 |     ADD_COLON_SINGLE = auto()  # "role: message" followed by sep; an empty message renders as "role:"
 10 |     ADD_COLON_TWO = auto()  # "role: message", alternating sep and sep2 by message index
 11 |     ADD_COLON_SPACE_SINGLE = auto()  # like ADD_COLON_SINGLE, but an empty message renders as "role: " (trailing space)
 12 |     NO_COLON_SINGLE = auto()  # role + message followed by sep, no colon between them
 13 |     NO_COLON_TWO = auto()  # role + message, alternating sep and sep2 by message index
 14 |     ADD_NEW_LINE_SINGLE = auto()  # role, a newline, the message, then sep
 15 |     LLAMA2 = auto()  # first message is appended directly to the system text; later ones are "role message" + sep/sep2
 16 |     CHATGLM = auto()  # inserts a "[Round N]" header before every even-indexed message
 17 |     CHATML = auto()
 18 |     CHATINTERN = auto()
 19 |     DOLLY = auto()
 20 |     RWKV = auto()  # "role: message" with "\r\n" and doubled newlines collapsed, followed by a blank line
 21 |     PHOENIX = auto()
 22 |     ROBIN = auto()
 23 | 
 24 | 
 25 | @dataclasses.dataclass
 26 | class Conversation:
 27 |     """A class that manages prompt templates and keeps all conversation history."""
 28 | 
 29 |     # The name of this template
 30 |     name: str
 31 |     # The system prompt
 32 |     system: str
 33 |     # Two roles
 34 |     roles: List[str]
 35 |     # All messages. Each item is (role, message).
 36 |     messages: List[List[str]]
 37 |     # The number of few shot examples
 38 |     offset: int
 39 |     # Separators
 40 |     sep_style: SeparatorStyle
 41 |     sep: str
 42 |     sep2: str = None
 43 |     # Stop criteria (the default one is EOS token)
 44 |     stop_str: str = None
 45 |     # Stops generation if meeting any token in this list
 46 |     stop_token_ids: List[int] = None
 47 | 
 48 |     def get_prompt(self) -> str:
 49 |         """Get the prompt for generation."""
 50 |         if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
 51 |             ret = self.system + self.sep
 52 |             for role, message in self.messages:
 53 |                 if message:
 54 |                     ret += role + ": " + message + self.sep
 55 |                 else:
 56 |                     ret += role + ":"
 57 |             return ret
 58 |         elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
 59 |             seps = [self.sep, self.sep2]
 60 |             ret = self.system + seps[0]
 61 |             for i, (role, message) in enumerate(self.messages):
 62 |                 if message:
 63 |                     ret += role + ": " + message + seps[i % 2]
 64 |                 else:
 65 |                     ret += role + ":"
 66 |             return ret
 67 |         elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
 68 |             ret = self.system + self.sep
 69 |             for role, message in self.messages:
 70 |                 if message:
 71 |                     ret += role + ": " + message + self.sep
 72 |                 else:
 73 |                     ret += role + ": "  # must be end with a space
 74 |             return ret
 75 |         elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
 76 |             ret = "" if self.system == "" else self.system + self.sep
 77 |             for role, message in self.messages:
 78 |                 if message:
 79 |                     ret += role + "\n" + message + self.sep
 80 |                 else:
 81 |                     ret += role + "\n"
 82 |             return ret
 83 |         elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
 84 |             ret = self.system
 85 |             for role, message in self.messages:
 86 |                 if message:
 87 |                     ret += role + message + self.sep
 88 |                 else:
 89 |                     ret += role
 90 |             return ret
 91 |         elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
 92 |             seps = [self.sep, self.sep2]
 93 |             ret = self.system
 94 |             for i, (role, message) in enumerate(self.messages):
 95 |                 if message:
 96 |                     ret += role + message + seps[i % 2]
 97 |                 else:
 98 |                     ret += role
 99 |             return ret
100 |         elif self.sep_style == SeparatorStyle.RWKV:
101 |             ret = self.system
102 |             for i, (role, message) in enumerate(self.messages):
103 |                 if message:
104 |                     ret += (
105 |                         role
106 |                         + ": "
107 |                         + message.replace("\r\n", "\n").replace("\n\n", "\n")
108 |                     )
109 |                     ret += "\n\n"
110 |                 else:
111 |                     ret += role + ":"
112 |             return ret
113 |         elif self.sep_style == SeparatorStyle.LLAMA2:
114 |             seps = [self.sep, self.sep2]
115 |             ret = ""
116 |             for i, (role, message) in enumerate(self.messages):
117 |                 if message:
118 |                     if i == 0:
119 |                         ret += self.system + message
120 |                     else:
121 |                         ret += role + " " + message + seps[i % 2]
122 |                 else:
123 |                     ret += role
124 |             return ret
125 |         elif self.sep_style == SeparatorStyle.CHATGLM:
126 |             # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
127 |             # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
128 |             round_add_n = 1 if self.name == "chatglm2" else 0
129 |             if self.system:
130 |                 ret = self.system + self.sep
131 |             else:
132 |                 ret = ""
133 | 
134 |             for i, (role, message) in enumerate(self.messages):
135 |                 if i % 2 == 0:
136 |                     ret += f"[Round {i//2 + round_add_n}]{self.sep}"
137 | 
138 |                 if message:
139 |                     ret += f"{role}：{message}{self.sep}"
140 |                 else:
141 |                     ret += f"{role}："
142 |             return ret
143 |         elif self.sep_style == SeparatorStyle.CHATML:
144 |             ret = "" if self.system == "" else self.system + self.sep + "\n"
145 |             for role, message in self.messages:
146 |                 if message:
147 |                     ret += role + "\n" + message + self.sep + "\n"
148 |                 else:
149 |                     ret += role + "\n"
150 |             return ret
151 |         elif self.sep_style == SeparatorStyle.CHATINTERN:
152 |             # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
153 |             seps = [self.sep, self.sep2]
154 |             ret = self.system
155 |             for i, (role, message) in enumerate(self.messages):
156 |                 if i % 2 == 0:
157 |                     ret += "<s>"
158 |                 if message:
159 |                     ret += role + ":" + message + seps[i % 2] + "\n"
160 |                 else:
161 |                     ret += role + ":"
162 |             return ret
163 |         elif self.sep_style == SeparatorStyle.DOLLY:
164 |             seps = [self.sep, self.sep2]
165 |             ret = self.system
166 |             for i, (role, message) in enumerate(self.messages):
167 |                 if message:
168 |                     ret += role + ":\n" + message + seps[i % 2]
169 |                     if i % 2 == 1:
170 |                         ret += "\n\n"
171 |                 else:
172 |                     ret += role + ":\n"
173 |             return ret
174 |         elif self.sep_style == SeparatorStyle.PHOENIX:
175 |             ret = self.system
176 |             for role, message in self.messages:
177 |                 if message:
178 |                     ret += role + ": " + "<s>" + message + "</s>"
179 |                 else:
180 |                     ret += role + ": " + "<s>"
181 |             return ret
182 |         elif self.sep_style == SeparatorStyle.ROBIN:
183 |             ret = self.system + self.sep
184 |             for role, message in self.messages:
185 |                 if message:
186 |                     ret += role + ":\n" + message + self.sep
187 |                 else:
188 |                     ret += role + ":\n"
189 |             return ret
190 |         else:
191 |             raise ValueError(f"Invalid style: {self.sep_style}")
192 | 
193 |     def append_message(self, role: str, message: str):
194 |         """Append a new message."""
195 |         self.messages.append([role, message])
196 | 
197 |     def update_last_message(self, message: str):
198 |         """Update the last output.
199 | 
200 |         The last message is typically set to be None when constructing the prompt,
201 |         so we need to update it in-place after getting the response from a model.
202 |         """
203 |         self.messages[-1][1] = message
204 | 
205 |     def to_gradio_chatbot(self):
206 |         """Convert the conversation to gradio chatbot format."""
207 |         ret = []
208 |         for i, (role, msg) in enumerate(self.messages[self.offset :]):
209 |             if i % 2 == 0:
210 |                 ret.append([msg, None])
211 |             else:
212 |                 ret[-1][-1] = msg
213 |         return ret
214 | 
215 |     def to_openai_api_messages(self):
216 |         """Convert the conversation to OpenAI chat completion format."""
217 |         ret = [{"role": "system", "content": self.system}]
218 | 
219 |         for i, (_, msg) in enumerate(self.messages[self.offset :]):
220 |             if i % 2 == 0:
221 |                 ret.append({"role": "user", "content": msg})
222 |             else:
223 |                 if msg is not None:
224 |                     ret.append({"role": "assistant", "content": msg})
225 |         return ret
226 | 
227 |     def copy(self):
228 |         return Conversation(
229 |             name=self.name,
230 |             system=self.system,
231 |             roles=self.roles,
232 |             messages=[[x, y] for x, y in self.messages],
233 |             offset=self.offset,
234 |             sep_style=self.sep_style,
235 |             sep=self.sep,
236 |             sep2=self.sep2,
237 |             stop_str=self.stop_str,
238 |             stop_token_ids=self.stop_token_ids,
239 |         )
240 | 
241 |     def dict(self):
242 |         return {
243 |             "template_name": self.name,
244 |             "system": self.system,
245 |             "roles": self.roles,
246 |             "messages": self.messages,
247 |             "offset": self.offset,
248 |         }
249 | 
250 | 
251 | 
252 | 
253 | 
# Vicuna v1.1 template. With ADD_COLON_TWO, even-indexed turns are followed by
# `sep` (a space) and odd-indexed turns by `sep2` ("</s>").
conv_vicuna_v1_1 = Conversation(
    name="vicuna",
    system=(
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions."
    ),
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.ADD_COLON_TWO,
    sep=" ",
    sep2="</s>",
)
265 | 
# Alpaca template: "### Instruction" / "### Response" turns rendered with the
# same two-separator scheme as the Vicuna template.
conv_alpaca = Conversation(
    name="alpaca",
    system=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    ),
    roles=("### Instruction", "### Response"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.ADD_COLON_TWO,
    sep="\n\n",
    sep2="</s>",
)
276 | 
277 | 
# Default template for MPT-30b-chat (ChatML-style markers, "<|im_end|>" closes each turn).
conv_mpt_chat = Conversation(
    name="mpt-30b-chat",
    system=(
        "<|im_start|>system\n"
        "A conversation between a user and an LLM-based AI assistant. "
        "The assistant gives helpful and honest answers."
    ),
    roles=("<|im_start|>user", "<|im_start|>assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.CHATML,
    sep="<|im_end|>",
    stop_token_ids=[50278, 0],
)
290 | 
291 | 
# Default template for Falcon.
conv_falcon = Conversation(
    name="falcon",
    system="",
    roles=("User", "Assistant"),
    messages=[],
    offset=0,
    sep_style=SeparatorStyle.RWKV,
    sep="\n",
    sep2="<|endoftext|>",
    # stop_str ends generation in addition to stop_token_ids, and is also
    # stripped from the generated text.
    stop_str="\nUser",
    # Token ids 0 through 11. It is best to list only special tokens here,
    # because the tokenizer only removes special tokens.
    stop_token_ids=list(range(12)),
)
318 | 
# Llama 2 chat template.
# Reference: https://github.com/facebookresearch/llama/blob/cfc3fc8c1968d390eb830e65c63865e980873a06/llama/generation.py#L212
conv_llama2 = Conversation(
    name="llama-2",
    system=(
        "<s>[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. "
        "Always answer as helpfully as possible, while being safe. "
        "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
        "Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. "
        "If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>>\n\n"
    ),
    roles=("[INST]", "[/INST]"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA2,
    sep=" ",
    sep2=" </s><s>",
    stop_token_ids=[2],
)
336 | 
337 | 
# Lookup from a model-family key to its conversation template.
# "vicuna" and "wizardlm" deliberately share the Vicuna v1.1 template object;
# callers should `.copy()` a template before appending messages to it.
conv_template = dict(
    llama=conv_llama2,
    alpaca=conv_alpaca,
    vicuna=conv_vicuna_v1_1,
    wizardlm=conv_vicuna_v1_1,
    mpt=conv_mpt_chat,
    falcon=conv_falcon,
)
346 | 
if __name__ == "__main__":
    # Smoke test: build a short Vicuna conversation whose last assistant turn
    # is still empty, then print the rendered prompt.
    conv = conv_vicuna_v1_1.copy()
    user_role, assistant_role = conv.roles[0], conv.roles[1]
    scripted_turns = (
        (user_role, "Hello!"),
        (assistant_role, "Hi!"),
        (user_role, "How are you?"),
        (assistant_role, None),
    )
    for speaker, text in scripted_turns:
        conv.append_message(speaker, text)
    print(conv.get_prompt())
354 | 


--------------------------------------------------------------------------------
/src/comparison_data_generation/main.py:
--------------------------------------------------------------------------------
  1 | from typing import Any
  2 | import datasets
  3 | import json
  4 | import pandas as pd
  5 | 
  6 | from transformers import pipeline, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer
  7 | from transformers import StoppingCriteria, StoppingCriteriaList
  8 | 
  9 | import torch
 10 | from tqdm import tqdm
 11 | from fastchat import conv_template
 12 | # import backoff
 13 | import os
 14 | 
 15 | import random
 16 | import numpy as np
 17 | import torch
 18 | def set_seed(seed):
 19 |     print("set seed:", seed)
 20 |     torch.manual_seed(seed)
 21 |     torch.cuda.manual_seed_all(seed)
 22 |     np.random.seed(seed)
 23 |     random.seed(seed)
 24 | 
 25 | from sampling import model_pool, SAMPLE_NUM
 26 | 
 27 | 
 28 | 
 29 | model_path = {
 30 |     "ultralm-13b": "openbmb/UltraLM-13b-v2.0",
 31 | }
 32 | 
 33 | 
 34 | principles = {
 35 |     "helpfulness": [
 36 |         "The assistant should provide users with accurate, relevant, and up-to-date information, ensuring that the content is positive, interesting, engaging, educational, and helpful.",
 37 |         "As an AI assistant, it's your job to ensure that the information you provide to users is accurate, current, and relevant. Offer comprehensive details to educate and engage the user.",
 38 |         "Your role as an AI assistant is to be a reliable and helpful source of information. Always verify the facts and make sure they're up-to-date before sharing them.",
 39 |         'As an AI assistant, inspire users with your knowledgeable insights. Provide content that is positive, interesting, and engaging, ensuring users find your help valuable.',
 40 |         'Make sure your responses are always educational yet engaging, allowing users to learn something new each time they interact with you. You are an AI assistant after all!',
 41 |         'As a top-tier AI assistant, make sure that your responses are not only accurate and current but also engaging. Be the positive interaction users look forward to.',
 42 |         "As an AI assistant, it's crucial that you motivate users by providing them with meaningful and exciting content that is also helpful and informative.",
 43 |         'Cultivate a helpful AI persona by ensuring you offer users accurate, positive, and engaging content. Make every interaction informative and delightful.',
 44 |         'As an AI assistant, ensure your response offers the perfect blend of accuracy, positivity, and intrigue. Strive to be educational while keeping the user engaged.',
 45 |         "It's your duty as an AI assistant to always deliver accurate, positive, and engaging content that serves to educate the user. Make every interaction a learning opportunity.",
 46 |         'As an AI assistant, provide not just answers but also spark curiosity. Offer insights that are accurate, engaging, and positive, making learning an exciting journey for the user.',  
 47 |     ],
 48 |     "harmlessness": [
 49 |         "The assistant should not provide answers that could cause physical, psychological, or social harm to any group of people; damage to or loss of equipment or property; damage to the environment; or harm to institutions or resources necessary to human wellbeing. Feel free to refuse to answer questions that could cause the above harm.", 
 50 |         'You are an AI assistant that respects all humans, their equipment and environment. Never offer suggestions that could lead to harm or damage any of the above mentioned.',
 51 |         'You are an AI assistant that maintains a principle of "First do no harm". In all interactions, ensure you do not promote any form of harm, be it to persons, property or the environment.',
 52 |         "You are an AI assistant. Stick to providing information that cannot be harmful physically, mentally or socially to any individual or group. Don't answer any queries that might lead to such consequences.",
 53 |         'As an AI assistant, refrain from advising on actions that might harm any person or group of persons, or cause damage to property, environment or resources necessary for human wellbeing.',
 54 |         "You're an AI helper wishing to provide safe assistance for all users. Discard any action that could lead to damage to people, property, or institutions.",
 55 |         'You are a responsible AI assistant. Always stay polite, respectful, and ensure to not give advice that could lead to harm or loss of any sort. ',
 56 |         'As an AI assistant, your primary focus should be on the safety and wellness of the users. Never provide answers that could cause harm or damage to individuals, their property or the environment.',
 57 |         'You are an AI assistant that is centred around being harmless. Never encourage violence, harm to others, or damage to property or the environment.',
 58 |         'As an AI assistant, avoid providing information or support that could result in injury, mental distress, or harm to people, their property, or the environment.',
 59 |         'You are an AI assistant. While providing assistance, avoid suggesting actions that may harm people, their belongings or property, environment or critical resources.',
 60 |     ],
 61 |     "honesty": [
 62 |         "The assistant should be honest about whether it knows the answer and express its uncertainty explicitly. Be confident on questions it knows well and be modest on those it is unfamiliar with. Use weakeners such as 'I guess', 'I suppose', 'probably', and 'perhaps' to express uncertainty, and feel free to answer 'I don't know' if necessary.",
 63 |         "As an AI, honesty is crucial to build trust with your user. If you're unsure about an answer, make sure to express your uncertainty, or consult additional resources (if available) before providing an answer.  ",
 64 |         "As an AI, assertiveness and confidence should be balanced with honesty and humility. If the answer to a query is not within your knowledge base, it is entirely acceptable to express that you don't know the answer. ",
 65 |         "You are an AI assistant that values honesty. If you don't know the answer to a question, admit it, but offer to find out more about the topic if needed.",
 66 |         "You're not only an AI assistant, but also a reliable partner. Whenever you give an answer, make sure that the information is reliable and true, and if you're not certain about something, don't hesitate to express it.  ",
 67 |         'You are an AI assistant. Your prime objective is to provide useful and accurate information. In case you aren\'t certain of an answer, use phrases such as "I\'m not sure", or "I think". Honesty always comes first.',
 68 |         'As a digital assistant, your duty is to ensure that every piece of information that you give out is in line with the truth, even if the truth is that you do not know or are unsure about the answer to a query.',
 69 |         "Remember, as an AI assistant, it's better to acknowledge uncertainty or simply say 'I don't know' than to give potentially incorrect information. Honesty leads to trust.",
 70 |         "As an AI assistant, honesty is appreciated by your users. If you are uncertain of an answer, it's okay to express doubt or admit to not knowing. Offer to search for more information if needed. ",
 71 |         'You are an AI helper that portrays honesty in every response. Never shy away from admitting if you do not have the needed information; instead offer to look up the information. ',
 72 |         "Remember, your aim as an AI assistant is to deliver accurate information. If the answer is not within your database, it is okay to say you don't know rather than providing inaccurate information.",
 73 |     ],
 74 |     "verbalized_calibration": ["The assistant should express its confidence as a scalar at the end of the response. The confidence level indicates the degree of certainty it has about its answer and is represented as a percentage. For instance, if the confidence level is 80%, it means the assistant is 80% certain that its answer is correct whereas there is a 20% chance that the assistant may be incorrect.\nThe format is as follows:\n[Question]\n[Answer]\nConfidence: [The assistant's confidence level, numerical numbers only, e.g. 80%]\nHere, tags like [Question] and [Answer] are placeholders and should be omitted in the response.\n"],
 75 |     "truthfulness": [
 76 |         "The assistant should answer truthfully and be faithful to factual knowledge as well as given contexts, never making up any new facts that aren't true or cannot be grounded in the instruction.",
 77 |         "As an AI assistant, it is important that you answer truthfully. Adhere to factual information and context while making sure you don't invent any new, unverified facts.",
 78 |         'As an AI assistant, conduct yourself with honesty. Always be diligent in giving accurate responses. Fictional facts or inaccuracies should be avoided in your responses.',
 79 |         'As an AI assistant, stick to the truth. Your aim should always be to provide precise and factual information to avoid any misinformation. ',
 80 |         'As an AI assistant, factual consistency is key. Make sure all your answers are grounded in truth and are not misleading in any way.',
 81 |         "As an AI assistant, keep in mind that it's of utmost importance to stay true to the task and present only verified and accurate information.",
 82 |         'As an AI assistant, find and uphold the truth in every answer. Ensure all responses are based on facts and try to avoid speculation or invention of new facts.',
 83 |         'As an AI assistant, your job is to deliver answers that are truth-based and avoid introducing any untruthful or groundless information.',
 84 |         'As an AI assistant, maintain objectivity while answering questions. Make sure the information given is truthful and never fabricated.',
 85 |         'As an AI assistant, maintain the integrity of your responses. Always provide genuine and credible information, and ensure no incorrect data is delivered.',
 86 |         'As an AI assistant, operate under the principle of truthfulness. Keep up-to-date with verified information and refrain from providing anything that might mislead. \n',
 87 |     ]
 88 | }
 89 | 
 90 | 
 91 | import openai
 92 | from time import time
 93 | openai.api_key = "PUT YOUR KEY HERE"
 94 | class API_Caller:
 95 | 
 96 |     def __init__(self, model):
 97 |         self.model = model
 98 | 
 99 |     def __call__(self, system_prompt, user_prompt):
100 |         for _ in range(20):
101 |             try:
102 |                 response = openai.ChatCompletion.create(**{
103 |                     "model": "gpt-4",
104 |                         "messages": [
105 |                             {"role": "system", "content": system_prompt},
106 |                             {"role": "user", "content": user_prompt}
107 |                         ],
108 |                         "temperature": 1,
109 |                         "max_tokens": 1024,
110 |                         "top_p": 1,
111 |                 })
112 |                 content = response["choices"][0]["message"]["content"]
113 |             except Exception as e:
114 |                 print(e)
115 |                 time.sleep(1)
116 |             else:
117 |                 break
118 |         return content
119 | 
120 | 
class StoppingCriteriaSub(StoppingCriteria):
    """Stop generation once the generated ids end with one of the stop sequences.

    Args:
        stops: Iterable of stop sequences, each a list (or tensor) of token ids.
    """

    def __init__(self, stops=None):
        StoppingCriteria.__init__(self)
        # Store the sequences so __call__ has something to compare against;
        # None replaces a shared mutable default.
        self.stops = list(stops) if stops is not None else []

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=None, **kwargs):
        """Return True when the first sequence in ``input_ids`` ends with a stop sequence.

        Args:
            input_ids: Token ids generated so far; indexed as ``input_ids[0]``
                to get the first sequence.
            scores: Unused.
            stops: Optional override for the stop sequences given at
                construction; when empty or None the stored ones are used.
            **kwargs: Ignored; accepted so extra keyword arguments from the
                generation loop do not raise.
        """
        active_stops = stops if stops else self.stops
        # NOTE(review): only the first row is inspected, which assumes
        # single-sequence generation — confirm if batching is ever used.
        generated = input_ids[0]
        for stop in active_stops:
            stop_ids = torch.as_tensor(stop, dtype=generated.dtype, device=generated.device).flatten()
            width = stop_ids.numel()
            if width == 0 or width > generated.shape[-1]:
                continue
            if torch.equal(generated[-width:], stop_ids):
                return True
        return False
130 | 
131 | 
132 | 
133 | 
134 | # from vllm import LLM, SamplingParams
def load_generator(model_type):
    """Build the text generator for ``model_type``.

    OpenAI chat models are wrapped in an ``API_Caller``. Every other model type
    is looked up in ``model_path`` (a missing entry raises ``KeyError``) and
    wrapped in a ``transformers`` pipeline.

    Args:
        model_type: Model name as given on the command line.

    Returns:
        The generator object: an ``API_Caller`` or a ``transformers`` pipeline.
    """
    hosted_models = ["gpt-4", "gpt-3.5-turbo"]
    remote_code_models = ["mpt-30b-chat", "falcon-40b-instruct"]

    if model_type in hosted_models:
        generator = API_Caller(model_type)
    else:
        ckpt = model_path[model_type]

        if model_type == "starchat":
            generator = pipeline(
                "text-generation",
                model=ckpt,
                tokenizer=ckpt,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )
        elif model_type in remote_code_models:
            # These checkpoints are loaded with trust_remote_code=True.
            generator = pipeline(model=ckpt, tokenizer=ckpt, device_map="auto", trust_remote_code=True)
        else:
            # Everything else is loaded as a Llama-family model.
            model = LlamaForCausalLM.from_pretrained(ckpt, device_map="auto")
            tokenizer = LlamaTokenizer.from_pretrained(ckpt)
            generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    print("model loaded")
    return generator
152 | 
153 | 
154 | 
155 | 
@torch.no_grad()
def instruction_completion(example):
    """Generate one completion for ``example`` with the globally selected model.

    Reads the module-level globals ``model_type``, ``subset``, ``generator`` and
    (for local models only) ``stopping_criteria``, which the ``__main__`` block sets.

    Args:
        example: Record with at least ``"models"`` (model names assigned to this
            instruction), ``"instruction"`` and ``"completions"`` (a list).

    Returns:
        ``example`` itself. If ``model_type`` is in ``example["models"]``, a dict
        with ``model``, ``principle``, ``custom_system_prompt`` and ``response``
        has been appended to ``example["completions"]``; otherwise it is unchanged.

    Raises:
        NotImplementedError: If no prompt format is known for ``model_type``.
    """
    if model_type not in example["models"]:
        return example

    # Pick the principle according to the data subset.
    if subset in ["sharegpt", "ultrachat"]:
        principle = random.choice(["helpfulness", "helpfulness", "helpfulness", "truthfulness", "honesty"])
    elif subset in ["flan"]:
        principle = random.choice(["helpfulness", "helpfulness", "helpfulness", "helpfulness", "verbalized_calibration"])
    elif subset in ["evol_instruct"]:
        principle = "helpfulness"
    elif subset in ["truthful_qa", "false_qa"]:
        principle = random.choice(["honesty", "truthfulness"])
    else:
        print(subset)
        principle = "helpfulness"

    # About 10% of "honesty" draws are switched to verbalized calibration.
    if principle == "honesty":
        principle = "honesty" if np.random.rand() < 0.9 else "verbalized_calibration"

    principle_prompt = random.choice(principles[principle])

    api_models = ["gpt-4", "gpt-3.5-turbo", "bard"]
    uses_api = model_type in api_models

    # Build the model-specific prompt. API models receive the principle and the
    # instruction as separate messages, so they need no prompt string; without
    # this branch they would fall through to NotImplementedError below.
    if uses_api:
        prompt = None
    elif "ultralm" in model_type:
        system_prompt = "User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.</s>"
        system_prompt += "User: " + principle_prompt + "</s>"
        conv = [system_prompt]
        conv.append("User: " + example["instruction"] + "</s>")
        conv.append("Assistant: ")
        prompt = "\n".join(conv)
    elif "starchat" in model_type:
        system_prompt = "<|system|>" + principle_prompt + "<|end|>"
        conv = [system_prompt]
        conv.append("<|user|>" + example["instruction"] + "<|end|>")
        conv.append("<|assistant|>")
        prompt = "\n".join(conv)
    elif model_type == "wizardlm-7b":
        prompt = "{}\n\n### Response:".format(example["instruction"])
    elif model_type.split("-")[0] in ["llama", "alpaca", "vicuna", "mpt", "falcon", "wizardlm"]:
        # Note: a wizardlm model reaching this branch should be the 13b or 30b variant.
        conv = conv_template[model_type.split("-")[0]].copy()
        conv.system += " " + principle_prompt
        conv.append_message(conv.roles[0], example["instruction"])
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
    else:
        raise NotImplementedError

    with torch.inference_mode():
        if uses_api:
            response = generator(system_prompt=principle_prompt, user_prompt=example["instruction"])
        else:
            response = generator(
                prompt,
                num_return_sequences=1,
                return_full_text=False,
                handle_long_generation="hole",
                temperature=1.0,
                top_p=1.0,
                max_new_tokens=1024,
                do_sample=True,
                stopping_criteria=stopping_criteria,
            )
            response = response[0]["generated_text"].strip("\n").strip()
            # Keep only the text before the first run of four newlines.
            response = response.split("\n\n\n\n")[0].strip()

    example["completions"].append({
        "model": model_type,
        "principle": principle,
        "custom_system_prompt": principle_prompt,
        "response": response
    })

    return example
223 | 
224 | 
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", type=str, default="ultralm-13b")
    parser.add_argument("--id", type=int, default=0)
    args = parser.parse_args()

    # These module-level names are read by instruction_completion.
    model_type = args.model_type
    id = args.id  # index of the 2000-record shard to process

    generator = load_generator(model_type)

    # Define the stop words for every model, not only starchat, so the default
    # model no longer fails with NameError. The stop string matches the
    # "\n\n\n\n" that instruction_completion truncates responses on.
    stop_words = ["\n\n\n\n"]
    tokenizer = getattr(generator, "tokenizer", None)
    if tokenizer is None:
        # API-backed generators have no tokenizer and take no stopping criteria.
        stopping_criteria = None
    else:
        # add_special_tokens=False keeps BOS/EOS ids out of the stop sequences.
        stop_token_ids = [tokenizer.encode(stop_word, add_special_tokens=False) for stop_word in stop_words]
        stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_token_ids)])

    for subset in ["truthful_qa"]:

        print("loading dataset")
        load_path = f"./completion_data/{subset}.json"
        with open(load_path) as f:
            records = json.load(f)  # assumed to be a JSON list of example dicts

        shard_start = id * 2000
        shard_end = min((id + 1) * 2000, len(records))
        dataset = datasets.Dataset.from_pandas(pd.DataFrame(records)).select(range(shard_start, shard_end))

        print("start mapping")
        dataset = dataset.map(instruction_completion, desc=f"{model_type} on {subset}")

        # Overwrite the input file, splicing the processed shard back into the
        # full record list so records outside this shard are not lost.
        records[shard_start:shard_end] = [{k: v for k, v in data.items()} for data in dataset]
        result_path = load_path
        # Create the parent directory; result_path itself is a file.
        os.makedirs(os.path.dirname(result_path), exist_ok=True)
        with open(result_path, "w") as f:
            json.dump(records, f, indent=4)
258 | 


--------------------------------------------------------------------------------
/src/comparison_data_generation/main_vllm.py:
--------------------------------------------------------------------------------
  1 | import datasets
  2 | import json
  3 | import pandas as pd
  4 | from transformers import pipeline, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, T5ForConditionalGeneration, AutoTokenizer
  5 | import torch
  6 | from tqdm import tqdm
  7 | from fastchat import conv_template
  8 | 
  9 | import os
 10 | import sys
 11 | 
 12 | import random
 13 | import numpy as np
 14 | import torch
 15 | 
 16 | from sampling import model_pool
 17 | from vllm import LLM, SamplingParams
 18 | 
 19 | 
 20 | os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
 21 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
 22 | 
 23 | model_path = {
 24 |     "ultralm-13b": "openbmb/UltraLM-13b-v2.0",
 25 | }
 26 | 
 27 | 
 28 | 
# Candidate system-prompt sentences, grouped by principle name.
# sample_principle() picks one entry of the chosen principle's list with random.choice
# and records it as the completion's "custom_system_prompt".
principles = {
    "helpfulness": [
        "The assistant should provide users with accurate, relevant, and up-to-date information, ensuring that the content is positive, interesting, engaging, educational, and helpful.",
        "As an AI assistant, it's your job to ensure that the information you provide to users is accurate, current, and relevant. Offer comprehensive details to educate and engage the user.",
        "Your role as an AI assistant is to be a reliable and helpful source of information. Always verify the facts and make sure they're up-to-date before sharing them.",
        'As an AI assistant, inspire users with your knowledgeable insights. Provide content that is positive, interesting, and engaging, ensuring users find your help valuable.',
        'Make sure your responses are always educational yet engaging, allowing users to learn something new each time they interact with you. You are an AI assistant after all!',
        'As a top-tier AI assistant, make sure that your responses are not only accurate and current but also engaging. Be the positive interaction users look forward to.',
        "As an AI assistant, it's crucial that you motivate users by providing them with meaningful and exciting content that is also helpful and informative.",
        'Cultivate a helpful AI persona by ensuring you offer users accurate, positive, and engaging content. Make every interaction informative and delightful.',
        'As an AI assistant, ensure your response offers the perfect blend of accuracy, positivity, and intrigue. Strive to be educational while keeping the user engaged.',
        "It's your duty as an AI assistant to always deliver accurate, positive, and engaging content that serves to educate the user. Make every interaction a learning opportunity.",
        'As an AI assistant, provide not just answers but also spark curiosity. Offer insights that are accurate, engaging, and positive, making learning an exciting journey for the user.',
    ],
    "harmlessness": [
        "The assistant should not provide answers that could cause physical, psychological, or social harm to any group of people; damage to or loss of equipment or property; damage to the environment; or harm to institutions or resources necessary to human wellbeing. Feel free to refuse to answer questions that could cause the above harm.",
        'You are an AI assistant that respects all humans, their equipment and environment. Never offer suggestions that could lead to harm or damage any of the above mentioned.',
        'You are an AI assistant that maintains a principle of "First do no harm". In all interactions, ensure you do not promote any form of harm, be it to persons, property or the environment.',
        "You are an AI assistant. Stick to providing information that cannot be harmful physically, mentally or socially to any individual or group. Don't answer any queries that might lead to such consequences.",
        'As an AI assistant, refrain from advising on actions that might harm any person or group of persons, or cause damage to property, environment or resources necessary for human wellbeing.',
        "You're an AI helper wishing to provide safe assistance for all users. Discard any action that could lead to damage to people, property, or institutions.",
        'You are a responsible AI assistant. Always stay polite, respectful, and ensure to not give advice that could lead to harm or loss of any sort. ',
        'As an AI assistant, your primary focus should be on the safety and wellness of the users. Never provide answers that could cause harm or damage to individuals, their property or the environment.',
        'You are an AI assistant that is centred around being harmless. Never encourage violence, harm to others, or damage to property or the environment.',
        'As an AI assistant, avoid providing information or support that could result in injury, mental distress, or harm to people, their property, or the environment.',
        'You are an AI assistant. While providing assistance, avoid suggesting actions that may harm people, their belongings or property, environment or critical resources.',
    ],
    "honesty": [
        "The assistant should be honest about whether it knows the answer and express its uncertainty explicitly. Be confident on questions it knows well and be modest on those it is unfamiliar with. Use weakeners such as 'I guess', 'I suppose', 'probably', and 'perhaps' to express uncertainty, and feel free to answer 'I don't know' if necessary.",
        "As an AI, honesty is crucial to build trust with your user. If you're unsure about an answer, make sure to express your uncertainty, or consult additional resources (if available) before providing an answer.  ",
        "As an AI, assertiveness and confidence should be balanced with honesty and humility. If the answer to a query is not within your knowledge base, it is entirely acceptable to express that you don't know the answer. ",
        "You are an AI assistant that values honesty. If you don't know the answer to a question, admit it, but offer to find out more about the topic if needed.",
        "You're not only an AI assistant, but also a reliable partner. Whenever you give an answer, make sure that the information is reliable and true, and if you're not certain about something, don't hesitate to express it.  ",
        'You are an AI assistant. Your prime objective is to provide useful and accurate information. In case you aren\'t certain of an answer, use phrases such as "I\'m not sure", or "I think". Honesty always comes first.',
        'As a digital assistant, your duty is to ensure that every piece of information that you give out is in line with the truth, even if the truth is that you do not know or are unsure about the answer to a query.',
        "Remember, as an AI assistant, it's better to acknowledge uncertainty or simply say 'I don't know' than to give potentially incorrect information. Honesty leads to trust.",
        "As an AI assistant, honesty is appreciated by your users. If you are uncertain of an answer, it's okay to express doubt or admit to not knowing. Offer to search for more information if needed. ",
        'You are an AI helper that portrays honesty in every response. Never shy away from admitting if you do not have the needed information; instead offer to look up the information. ',
        "Remember, your aim as an AI assistant is to deliver accurate information. If the answer is not within your database, it is okay to say you don't know rather than providing inaccurate information.",
    ],
    # Single template; annotate_critique.annotate keeps only the text before "For instance, ".
    "verbalized_calibration": [
        "The assistant should express its confidence as a scalar at the end of the response. The confidence level indicates the degree of certainty it has about its answer and is represented as a percentage. For instance, if the confidence level is 80%, it means the assistant is 80% certain that its answer is correct whereas there is a 20% chance that the assistant may be incorrect.\nThe format is as follows:\n[Question]\n[Answer]\nConfidence: [The assistant's confidence level, numerical numbers only, e.g. 80%]\nHere, tags like [Question] and [Answer] are placeholders and should be omitted in the response.\n"],
    "truthfulness": [
        "The assistant should answer truthfully and be faithful to factual knowledge as well as given contexts, never making up any new facts that aren't true or cannot be grounded in the instruction.",
        "As an AI assistant, it is important that you answer truthfully. Adhere to factual information and context while making sure you don't invent any new, unverified facts.",
        'As an AI assistant, conduct yourself with honesty. Always be diligent in giving accurate responses. Fictional facts or inaccuracies should be avoided in your responses.',
        'As an AI assistant, stick to the truth. Your aim should always be to provide precise and factual information to avoid any misinformation. ',
        'As an AI assistant, factual consistency is key. Make sure all your answers are grounded in truth and are not misleading in any way.',
        "As an AI assistant, keep in mind that it's of utmost importance to stay true to the task and present only verified and accurate information.",
        'As an AI assistant, find and uphold the truth in every answer. Ensure all responses are based on facts and try to avoid speculation or invention of new facts.',
        'As an AI assistant, your job is to deliver answers that are truth-based and avoid introducing any untruthful or groundless information.',
        'As an AI assistant, maintain objectivity while answering questions. Make sure the information given is truthful and never fabricated.',
        'As an AI assistant, maintain the integrity of your responses. Always provide genuine and credible information, and ensure no incorrect data is delivered.',
        'As an AI assistant, operate under the principle of truthfulness. Keep up-to-date with verified information and refrain from providing anything that might mislead. \n',
    ]
}
 85 | 
 86 | # from vllm import LLM, SamplingParams
 87 | def load_generator(model_type):
 88 | 
 89 |     ckpt = model_path[model_type]
 90 |     dtype = "auto" if model_type not in ["starchat", "mpt-30b-chat", "falcon-40b-instruct"] else "bfloat16"
 91 |     gpu_memory_utilization = 0.95
 92 |     model = LLM(ckpt, gpu_memory_utilization=gpu_memory_utilization, swap_space=1, tensor_parallel_size=torch.cuda.device_count(), trust_remote_code=True, dtype=dtype)
 93 | 
 94 |     print("model loaded")
 95 |     return model
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
def sample_principle(example):
    """Choose a principle for ``example`` and build its generation prompt.

    Reads the module-level ``model_type``, ``subset`` and ``principles``.
    An example whose ``"models"`` list does not contain ``model_type`` is
    returned untouched. Otherwise a ``{"model", "principle",
    "custom_system_prompt"}`` record is appended to ``example["completions"]``
    and the formatted prompt is stored under ``example["prompt"]``.

    Raises:
        NotImplementedError: if no prompt format is known for ``model_type``.
    """
    if model_type not in example["models"]:
        return example

    # Per-subset candidate pools; repeated names weight the random draw.
    chat_pool = ["helpfulness", "helpfulness", "helpfulness", "truthfulness", "honesty"]
    pools = {
        "sharegpt": chat_pool,
        "ultrachat": chat_pool,
        "flan": ["helpfulness", "helpfulness", "helpfulness", "helpfulness", "verbalized_calibration"],
        "truthful_qa": ["honesty", "truthfulness"],
        "false_qa": ["honesty", "truthfulness"],
    }
    if subset == "evol_instruct":
        principle = "helpfulness"
    elif subset in pools:
        principle = random.choice(pools[subset])
    else:
        # Unknown subset: report it and fall back to helpfulness.
        print(subset)
        principle = "helpfulness"

    # One honesty draw in ten is switched to verbalized calibration.
    if principle == "honesty" and not (np.random.rand() < 0.9):
        principle = "verbalized_calibration"

    principle_prompt = random.choice(principles[principle])

    # Build the prompt in the format the target model family expects.
    family = model_type.split("-")[0]
    if "ultralm" in model_type:
        header = "User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.</s>"
        header = header + "User: " + principle_prompt + "</s>"
        turns = [header, "User: " + example["instruction"] + "</s>", "Assistant: "]
        prompt = "\n".join(turns)
    elif model_type == "wizardlm-7b":
        # The template is looked up but not used; the principle sentence is not part of this prompt.
        conv_template[family].copy()
        prompt = "{}\n\n### Response:".format(example["instruction"])
    elif family in ["llama", "alpaca", "vicuna", "mpt", "falcon", "wizardlm"]:  # wizardlm here means 13b or 30b
        conv = conv_template[family].copy()
        conv.system += " " + principle_prompt
        user_role, assistant_role = conv.roles[0], conv.roles[1]
        conv.append_message(user_role, example["instruction"])
        conv.append_message(assistant_role, None)
        prompt = conv.get_prompt()
    else:
        raise NotImplementedError

    record = {
        "model": model_type,
        "principle": principle,
        "custom_system_prompt": principle_prompt,
    }
    example["completions"].append(record)
    example["prompt"] = prompt

    return example
159 | 
160 | 
def _strip_eos(text, eos="</s>"):
    """Trim surrounding whitespace and any trailing ``eos`` markers from ``text``."""
    text = text.strip()
    # str.rstrip("</s>") would strip the *characters* '<', '/', 's', '>' and
    # eat real trailing letters ("parts" -> "part"); remove the literal suffix instead.
    while text.endswith(eos):
        text = text[: -len(eos)].rstrip()
    return text


@torch.no_grad()
def instruction_completion(dataset):
    """Generate one response per prompt and attach it to the newest completion.

    Reads the module-level ``model_type`` and ``generator``.

    Args:
        dataset: dataset with a ``"prompt"`` column and a ``"completions"``
            column whose last entry is the record to fill in.

    Returns:
        The dataset with ``"response"`` merged into the last completion of each
        row and the temporary ``"prompt"`` / ``"response"`` columns removed.
    """
    with torch.inference_mode():

        # Default stop marker (used by ultralm); conversation-template families may override it.
        stop = ["</s>"]
        family = model_type.split("-")[0]
        if family in ["llama", "alpaca", "vicuna", "mpt", "falcon", "wizardlm"]:
            conv = conv_template[family].copy()
            if conv.stop_str is not None:
                stop = [conv.stop_str]
            elif conv.stop_token_ids is not None:
                stop = [generator.llm_engine.tokenizer.decode(stop_token_id) for stop_token_id in conv.stop_token_ids]

        sampling_params = SamplingParams(temperature=1, top_p=1, max_tokens=1024, stop=stop)

        responses = generator.generate(dataset["prompt"], sampling_params)
        print(len(responses))
        responses = [_strip_eos(response.outputs[0].text) for response in responses]
        print(responses[0])

    dataset = dataset.add_column("response", responses)
    print(dataset)
    # Copy the generated text into the completion record appended by sample_principle.
    dataset = dataset.map(lambda x: {"completions": x["completions"][:-1] + [dict(x["completions"][-1], **{"response": x["response"]})]})
    dataset = dataset.remove_columns(["prompt", "response"])
    return dataset
191 | 
192 | 
193 | 
if __name__ == "__main__":
    # Entry point: generate completions for the examples assigned to --model_type
    # and write the updated records back to the same JSON file.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", type=str, default="alpaca-7b")
    args = parser.parse_args()

    # model_type, generator and subset are read as module globals by
    # sample_principle / instruction_completion, so the names must stay as is.
    model_type = args.model_type

    generator = load_generator(model_type)

    subsets = ["truthful_qa"]

    for subset in subsets:

        print("loading dataset")
        load_path = f"./completion_data/{subset}.json"

        with open(load_path) as f:
            records = json.load(f)

        # Only the examples assigned to this model get a principle and a generation.
        targeted = [record for record in records if model_type in record["models"]]
        if targeted:
            dataset = datasets.Dataset.from_pandas(pd.DataFrame(targeted))

            # set a principle for each sample (mapping)
            dataset = dataset.map(sample_principle)
            dataset = instruction_completion(dataset)
            completed = iter(dataset)
        else:
            completed = iter(())

        # Walk the ORIGINAL records and swap in the completed version of each
        # targeted one. The previous loop called next() on the iterator it was
        # already looping over, which skipped rows and tripped the assert.
        dataset_dict = []
        for data in records:
            if model_type in data["models"]:
                d = next(completed)
                assert data["instruction"] == d["instruction"]
                dataset_dict.append(d)
            else:
                dataset_dict.append(data)

        result_path = load_path
        with open(result_path, "w") as f:
            json.dump([{k: v for k, v in data.items()} for data in dataset_dict], f, indent=4)
233 | 


--------------------------------------------------------------------------------
/src/comparison_data_generation/run.sh:
--------------------------------------------------------------------------------
# Install pinned dependencies, then run completion generation.
# Usage: run.sh <model_type> <id>
# Both arguments are required: fail early with a usage message instead of
# handing an empty/unquoted value to main.py.
model_type="${1:?usage: run.sh <model_type> <id>}"
run_id="${2:?usage: run.sh <model_type> <id>}"

pip install transformers==4.31.0
pip install tokenizers==0.13.3
pip install deepspeed==0.10.0
pip install accelerate -U


python main.py --model_type "${model_type}" --id "${run_id}"
8 | 
9 | 


--------------------------------------------------------------------------------
/src/comparison_data_generation/run_vllm.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | export NCCL_IGNORE_DISABLED_P2P=1
 3 | 
 4 | pip install transformers -U
 5 | pip install tokenizers -U
 6 | pip install deepspeed -U
 7 | pip install accelerate -U
 8 | pip install vllm -U
 9 | 
10 | 
11 | echo $1
12 | 
13 | export NCCL_IGNORE_DISABLED_P2P=1
14 | export RAY_memory_monitor_refresh_ms=0
15 | CUDA_LAUNCH_BLOCKING=1 python main_vllm_batch.py --model_type ${1}
16 | 
17 | 


--------------------------------------------------------------------------------
/src/comparison_data_generation/sampling.py:
--------------------------------------------------------------------------------
 1 | from datasets import Dataset
 2 | import pandas as pd
 3 | import random
 4 | import json
 5 | 
 6 | 
 7 | model_pool = [
 8 |     "gpt-4", "gpt-3.5-turbo", "bard", 
 9 |     "ultralm-65b", "wizardlm-30b", "vicuna-33b", "llama-2-70b-chat", 
10 |     "ultralm-13b", "wizardlm-13b", "llama-2-13b-chat", 
11 |     "wizardlm-7b", "alpaca-7b", "llama-2-7b-chat", 
12 |     "falcon-40b-instruct", "starchat", "mpt-30b-chat", "pythia-12b"
13 | ]
14 | 
15 | 
if __name__ == "__main__":
    # Assign one randomly sampled model and an empty completions list to every
    # example, then overwrite the subset file with the result as a JSON array.

    def _assign_models(_row):
        return {"models": random.sample(model_pool, 1), "completions": []}

    for subset in ["truthful_qa"]:
        data_file = f"./completion_data/{subset}.json"
        # The input is read as JSON Lines (one record per line).
        frame = pd.read_json(data_file, lines=True)
        dataset = Dataset.from_pandas(pd.DataFrame(frame))
        dataset = dataset.map(_assign_models, desc=subset)

        rows = [dict(data.items()) for data in dataset]
        with open(data_file, "w") as f:
            json.dump(rows, f, indent=4)


--------------------------------------------------------------------------------
/src/data_annotation/annotate_critique.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import requests
  3 | import time
  4 | import datasets
  5 | import json
  6 | import pandas as pd
  7 | import random
  8 | 
  9 | import os
 10 | import re
 11 | from copy import deepcopy
 12 | from tqdm import tqdm
 13 | MAX_API_RETRY=10
 14 | import openai
 15 | openai.api_key = "PUT YOUR KEY HERE"
 16 | 
 17 | 
# System message passed as sys_prompt to get_eval() by annotate().
system_prompt = "A chat between a curious user and an artificial intelligence expert. The expert gives helpful, specific, and concise answers to the user's questions."

# User-message template; annotate() fills {instruction} and {completion} via str.format
# and later splits the reply on "\nOverall Score: " to separate critique from score.
feedback_prompt = \
"""Given my answer to an instruction, your role is to provide specific and constructive feedback for me. You should find the best way for me to learn from your feedback and improve my performance. 

You should consider multiple aspects of my answer, including helpfulness, truthfulness, honesty, and to what extent the answer follows instructions.
---

### Instruction
{instruction}

### Answer
{completion}
---

Please act as a teacher and provide specific and constructive feedback. Besides describing the weaknesses of the answer, you should also provide specific suggestions to guide me toward understanding how to improve. Please note, however, that your suggestions should help me better complete the instructions, but you should not introduce new requirements that are not mentioned in the instructions. Your feedback should focus on enhancing my ability to think critically and respond accurately. However, never explicitly provide the reference answer, nor do polite phrases be required. Only respond with concise feedback in chat style. Finally, score the overall quality of the answer from 1 to 10, where 1 is the worst and 10 is the best.

*Format*
### Feedback
[Your feedback]
Overall Score: [1-10]

---

### Feedback
"""
 44 | 
 45 | def get_eval(model, sys_prompt, user_prompt):
 46 |     try_num = 0
 47 |     while try_num < 10:
 48 |         try:
 49 |             response = openai.ChatCompletion.create(**{
 50 |                 "model": "gpt-4",
 51 |                     "messages": [
 52 |                         {"role": "system", "content": sys_prompt},
 53 |                         {"role": "user", "content": user_prompt}
 54 |                     ],
 55 |                     "temperature": 0,
 56 |                     "max_tokens": 1024,
 57 |                     "top_p": 0.6,
 58 |                     "presence_penalty": 0,
 59 |                     "frequency_penalty": 0
 60 |             })
 61 |             return response["choices"][0]["message"]["content"].strip()
 62 |         except KeyboardInterrupt as e:
 63 |             raise e
 64 |         except Exception as e:
 65 |             print(e)
 66 |             pass
 67 |     raise Exception("API Error")
 68 | 
 69 | 
 70 | def annotate(example):
 71 | 
 72 |     for i, completion in enumerate(example["completions"]):
 73 | 
 74 |         custom_system_prompt = completion["custom_system_prompt"] if completion["principle"] != "verbalized_calibration" else completion["custom_system_prompt"].split("For instance, ")[0].strip()
 75 |         response = get_eval("gpt-4-0613", system_prompt, feedback_prompt.format(instruction="\n".join([example["instruction"], "Note: " + custom_system_prompt]), completion=completion["response"]))
 76 |         
 77 |         response = response.split("\nOverall Score: ")
 78 |         assert len(response) == 2
 79 |         # critique, score = response[0].strip(), float(eval(response[1].split(".")[0].strip()))
 80 |         # example["completions"][i]["critique"] = critique
 81 |         # example["completions"][i]["overall_score"] = score if score > 1 else 10*score 
 82 |         critique, score = response[0].strip(), response[1].split(".")[0].strip()
 83 |         example["completions"][i]["critique"] = critique
 84 |         example["completions"][i]["overall_score"] = score if "/" not in score else float(eval(score.split("/")[0].strip()))
 85 | 
 86 |     return example
 87 | 
 88 | 
 89 | if __name__ == "__main__":
 90 | 
 91 |     subsets = ["sharegpt", "flan", "evol_instruct", "ultrachat", "truthful_qa", "false_qa"]
 92 | 
 93 |     for subset in subsets[:1]:
 94 |         with open(os.path.join("annotation", subset + ".json"), "r") as f:
 95 |             dataset = json.load(f)
 96 |         dataset = pd.DataFrame(dataset)
 97 |         dataset = datasets.Dataset.from_pandas(dataset)
 98 | 
 99 |         dataset_dict = []
100 |         for data in tqdm(dataset, total=len(dataset), desc="Annotating"):
101 |             dataset_dict.append(annotate(data))
102 | 
103 |         os.makedirs("annotation", exist_ok=True)
104 |         result_path = os.path.join("annotation", subset + ".json")
105 |         with open(result_path, "w") as f:
106 |             json.dump([{k: v for k, v in data.items()} for data in dataset_dict], f, indent=4)


--------------------------------------------------------------------------------
/src/data_annotation/annotate_preference.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import requests
  3 | import time
  4 | import datasets
  5 | import json
  6 | import pandas as pd
  7 | import random
  8 | 
  9 | import os
 10 | import re
 11 | from copy import deepcopy
 12 | from tqdm import tqdm
 13 | MAX_API_RETRY=10
 14 | import openai
 15 | openai.api_key = "PUT YOUR KEY HERE"
 16 | 
 17 | 
 18 | def process(responses, aspect):
 19 |     responses = responses.split("\n\n")
 20 |     assert len(responses) == 4
 21 |     annotation = []
 22 |     try:
 23 |         if aspect in ["instruction_following", "honesty"]:
 24 |             pattern = r"Rating: (.+?)\nRationale: (.+)"
 25 |             for response in responses:
 26 |                 matches = re.search(pattern, response, re.DOTALL)
 27 |                 annotation.append({
 28 |                     "Rating": re.findall(r'\b\d+\b', matches.group(1))[0] if matches.group(1) != "N/A" else "N/A",
 29 |                     "Rationale": matches.group(2)
 30 |                 })
 31 |         elif aspect in ["truthfulness", "helpfulness"]:
 32 |             pattern = r"Type: (.+?)\nRationale: (.+?)\nRating: (.+?)\nRationale: (.+)"
 33 |             for response in responses:
 34 |                 matches = re.search(pattern, response, re.DOTALL)
 35 |                 annotation.append({
 36 |                     "Type": re.findall(r'\b\d+\b', matches.group(1)) if matches.group(1) != "None" else "None",
 37 |                     "Rationale": matches.group(2),
 38 |                     "Rating": re.findall(r'\b\d+\b', matches.group(3))[0],
 39 |                     "Rationale For Rating": matches.group(4)
 40 |                 })
 41 |     except ValueError as e: # TODO: bug process when the response does not follow the format
 42 |         print(responses)
 43 |         raise ValueError(e)
 44 |     except AttributeError as e:
 45 |         print(responses)
 46 |         raise AttributeError(e)
 47 |     return annotation
 48 | 
 49 | 
 50 | def get_eval(sys_prompt, user_prompt: str, max_tokens: int = 500):
 51 |     for _ in range(MAX_API_RETRY):
 52 |         try:
 53 |             response = openai.ChatCompletion.create(**{
 54 |                 "model": "gpt-4",
 55 |                     "messages": [
 56 |                         {"role": "system", "content": sys_prompt},
 57 |                         {"role": "user", "content": user_prompt}
 58 |                     ],
 59 |                     "temperature": 0,
 60 |                     "max_tokens": max_tokens,
 61 |                     "top_p": 0.6,
 62 |                     "presence_penalty": 0,
 63 |                     "frequency_penalty": 0
 64 |             })
 65 |             content = response["choices"][0]["message"]["content"]
 66 |         except Exception as e:
 67 |             print(e)
 68 |             time.sleep(1)
 69 |         else:
 70 |             break
 71 |     # print(content)
 72 |     return content
 73 | 
 74 | 
 75 | from preference_templates import system_prompt, instruction_following_template, truthfulness_template, honesty_template, harmlessness_template, helpfulness_template
 76 | 
 77 | SHUFLLE_NUM = 1
 78 | def annotate(example):
 79 |     
 80 |     aspects = ["instruction_following", "honesty", "truthfulness", "helpfulness"]
 81 |     completions = [dict({"annotations": {aspect: [] for aspect in aspects}}, **completion)
 82 |                     for completion in deepcopy(example["completions"])]
 83 | 
 84 |     for aspect in aspects:
 85 |         if subset == "truthful_qa":
 86 |             world_knowledge = "\n".join(["a subset of correct answers: " + str(example["correct_answers"]), 
 87 |                                          "a subset of incorrect_answers: " + str(example["incorrect_answers"])])
 88 |         elif subset == "false_qa":
 89 |             world_knowledge = "The question is based on a false promise."
 90 |         elif subset == "flan":
 91 |             world_knowledge = example["correct_answers"]
 92 |         else:
 93 |             world_knowledge = "No additional world knowledge for reference."
 94 | 
 95 |         # generate several lists of a random order of 4 completions, no repetition
 96 |         count = 0
 97 |         random_orders = []
 98 |         while True:
 99 |             order = list(range(4))
100 |             random.shuffle(order)
101 |             if order not in random_orders:
102 |                 random_orders.append(order)
103 |                 count += 1
104 |             if count == SHUFLLE_NUM:
105 |                 break
106 |         print(random_orders)
107 |         for order in random_orders:        
108 |             format_input = {"instruction": example["instruction"]}
109 |             format_input.update({f"text_{i+1}": example["completions"][o]["response"] for i, o in enumerate(order)})
110 |             if aspect == "truthfulness":
111 |                 format_input.update({"world_knowledge": world_knowledge})
112 | 
113 |             responses = get_eval(system_prompt, user_prompt=TEMPLATE[aspect].format(**format_input))
114 |             for i in range(10):
115 |                 try:
116 |                     responses = process(responses, aspect) # gpt-4 format error
117 |                 except Exception as e:
118 |                     if i < 10:
119 |                         responses = get_eval(system_prompt, user_prompt=TEMPLATE[aspect].format(**format_input))
120 |                     else:
121 |                         print(e)
122 |                         break
123 |                 else:
124 |                     for j in range(4):
125 |                         completions[j]["annotations"][aspect].append(responses[order.index(j)])
126 |                     break
127 | 
128 |     example["completions"] = completions
129 | 
130 |     return example
131 |     
132 | 
def incorporate_annotation_to_completions(example):
    """Placeholder: not implemented; ignores ``example`` and returns None."""
    return None
135 | 
136 | 
if __name__ == "__main__":
    # Annotate every example of each subset and write annotation/<subset>_annotated.json.

    # Read as a module global by annotate().
    TEMPLATE = {
        "instruction_following": instruction_following_template,
        "honesty": honesty_template,
        "truthfulness": truthfulness_template,
        "harmlessness": harmlessness_template,
        "helpfulness": helpfulness_template,
    }

    subsets = ["truthful_qa"]

    # `subset` is also read as a module global by annotate().
    for subset in subsets:
        with open(os.path.join("../comparison_data_generation", "completion_data", subset + ".json"), "r") as f:
            dataset = json.load(f)

        # Iterate the loaded records directly. Wrapping them in a pandas
        # DataFrame first made the loop yield column labels (strings), not examples.
        dataset_dict = []
        for data in tqdm(dataset, total=len(dataset), desc="Annotating"):
            dataset_dict.append(annotate(data))

        os.makedirs("annotation", exist_ok=True)
        result_path = os.path.join("annotation", subset + "_annotated.json")
        with open(result_path, "w") as f:
            json.dump([{k: v for k, v in data.items()} for data in dataset_dict], f, indent=4)


--------------------------------------------------------------------------------
/src/data_annotation/fix_overall_score_issue.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict, Optional, Any
  2 | from datasets import load_dataset
  3 | 
  4 | 
  5 | MAX_API_RETRY=10
  6 | import openai
  7 | openai.api_key = "PUT YOUR KEY HERE"
  8 | 
# System message passed as sys_prompt to get_eval() by process_completions().
system_prompt = "A chat between a curious user and an artificial intelligence expert. The expert gives helpful, specific, and concise answers to the user's questions."

# User-message template; process_completions() fills {instruction}, {completion} and {critique}.
# The text ends right after "Overall Score: " and get_eval() requests max_tokens=1,
# so the reply is expected to be just the score.
feedback_prompt = \
"""Given my answer to an instruction, your role is to provide specific and constructive feedback for me. You should find the best way for me to learn from your feedback and improve my performance. 

You should consider multiple aspects of my answer, including helpfulness, truthfulness, honesty, and to what extent the answer follows instructions.
---

### Instruction
{instruction}

### Answer
{completion}
---

Please act as a teacher and provide specific and constructive feedback. Besides describing the weaknesses of the answer, you should also provide specific suggestions to guide me toward understanding how to improve. Please note, however, that your suggestions should help me better complete the instructions, but you should not introduce new requirements that are not mentioned in the instructions. Your feedback should focus on enhancing my ability to think critically and respond accurately. However, never explicitly provide the reference answer, nor do polite phrases be required. Only respond with concise feedback in chat style. Finally, score the overall quality of the answer from 1 to 10, where 1 is the worst and 10 is the best.

*Format*
### Feedback
[Your feedback]
Overall Score: [1-10]

---

### Feedback
{critique}
Overall Score: 
"""
 37 | 
 38 | def get_eval(model, sys_prompt, user_prompt):
 39 |     try_num = 0
 40 |     while try_num < 10:
 41 |         try:
 42 |             response = openai.ChatCompletion.create(**{
 43 |                 "model": model,
 44 |                     "messages": [
 45 |                         {"role": "system", "content": sys_prompt},
 46 |                         {"role": "user", "content": user_prompt}
 47 |                     ],
 48 |                     "temperature": 0,
 49 |                     "max_tokens": 1,
 50 |                     "top_p": 0.6,
 51 |                     "presence_penalty": 0,
 52 |                     "frequency_penalty": 0
 53 |             })
 54 |             return response["choices"][0]["message"]["content"].strip()
 55 |         except KeyboardInterrupt as e:
 56 |             raise e
 57 |         except Exception as e:
 58 |             print(e)
 59 |             pass
 60 |     raise Exception("API Error")
 61 | 
 62 | 
 63 | 
 64 | def calculate_average_rating(annotations):
 65 |     ratings = [int(aspect['Rating']) for aspect in annotations.values() if 'Rating' in aspect and aspect['Rating'] != "N/A"]
 66 |     return sum(ratings) / len(ratings) if ratings else None
 67 | 
 68 | 
 69 | def check_score(completion):
 70 |     if completion["fine-grained_score"] <= 2:
 71 |         return 2 # should flip
 72 |     elif completion["fine-grained_score"] <= 4:
 73 |         return 1 # re-annotate
 74 |     else: 
 75 |         return 0 # remain
 76 | 
 77 | def process_completions(example):
 78 |     global count_global, count_10
 79 |     count = {0:0,1:0,2:0}
 80 |     num_1 = sum(completion["overall_score"]==1 for completion in example["completions"])
 81 |     for completion in example["completions"]:
 82 |         completion.update({"fine-grained_score": calculate_average_rating(completion["annotations"])})
 83 |         if completion["overall_score"] == 10:
 84 |             flag = check_score(completion)
 85 |             count[flag] += 1
 86 |             if flag > 0:
 87 |                 if flag == 2:
 88 |                     completion["overall_score"] = 1
 89 |                 elif flag == 1:
 90 |                     # re-annotate
 91 |                     custom_system_prompt = completion["custom_system_prompt"] if completion["principle"] != "verbalized_calibration" else completion["custom_system_prompt"].split("For instance, ")[0].strip()
 92 |                     response = get_eval("gpt-4-0613", system_prompt, feedback_prompt.format(instruction="\n".join([example["instruction"], "Note: " + custom_system_prompt]), completion=completion["response"], critique=completion["critique"]))
 93 | 
 94 |                     if "/" in response:
 95 |                         response = response.split("/")[0].strip()
 96 |                     score = float(eval(response.strip()))
 97 |                     completion["overall_score"] = score
 98 | 
 99 |     num_2 = sum(completion["overall_score"]==1 for completion in example["completions"])
100 |     assert num_2 - num_1 >= count[2]
101 |     
102 |     for k in count.keys():
103 |         count_global[k] += count[k]
104 |     
105 |     return example
106 | 
if __name__ == "__main__":

    # Running tallies updated by process_completions for every completion that
    # had an overall_score of 10: 0 = remained, 1 = re-annotated, 2 = flipped.
    count_global = {0:0,1:0,2:0}

    # Fetch the training split and run the fix over every example; caching is
    # disabled so the map is recomputed on each run.
    train_split = load_dataset("openbmb/UltraFeedback")["train"]
    dataset = train_split.map(process_completions, load_from_cache_file=False)

    # Report the raw tallies followed by a readable summary line.
    print(count_global)
    total_checked = sum(count_global.values())
    summary = "{} completions with an overall_score of 10, {} of them remained, {} re-annotated, and {} flipped.".format(total_checked, count_global[0], count_global[1], count_global[2])
    print(summary)

    dataset.save_to_disk("UltraFeedback")


--------------------------------------------------------------------------------
/src/data_annotation/preference_templates.py:
--------------------------------------------------------------------------------
# Role description for the annotator model: it is told that it will receive one
# instruction plus four texts and must rate each text independently, giving a
# rating and a rationale.
# NOTE(review): presumably sent as the system message together with one of the
# aspect templates defined in this module - confirm against the caller.
system_prompt = """Your role is to evaluate text quality based on given criteria.
You'll receive an instructional description ("Instruction") and four text outputs ("Text").
Understand and interpret instructions to evaluate effectively.
Provide annotations for each text with a rating and rationale.
The four texts given are independent, and should be evaluated separately."""
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
# Prompt template for the "instruction following" aspect.
# Layout: rubric (ratings 1-5) -> expected input/output format -> the actual
# annotation request, which ends right after "### Output" so the model
# continues with its per-text ratings.
# Placeholders: {instruction}, {text_1}, {text_2}, {text_3}, {text_4}
# (presumably filled with str.format by the annotation script - confirm).
instruction_following_template = """# Instruction Following Assessment

Evaluate alignment between output and intent. Assess understanding of task goal and restrictions.

**Instruction Components**: Task Goal (intended outcome), Restrictions (text styles, formats, or designated methods, etc).

**Scoring**: Rate outputs 1 to 5:
1. **Irrelevant**: No alignment.
2. **Partial Focus**: Addresses one aspect poorly.
3. **Partial Compliance**:
    - (1) Meets goal or restrictions, neglecting other.
    - (2) Acknowledges both but slight deviations.
4. **Almost There**: Near alignment, minor deviations.
5. **Comprehensive Compliance**: Fully aligns, meets all requirements.

## Format:

### Input
Instruction: [Clearly specify the task goal and restrictions]

Texts:
<text 1> [Text 1]
<text 2> [Text 2]
<text 3> [Text 3]
<text 4> [Text 4]

### Output
#### Output for Text 1
Rating: [Rating for text 1]
Rationale: [Rationale for the rating in short sentences]

#### Output for Text 2
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 3
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 4
Rating: [Rating]
Rationale: [Rationale]

---

## Annotation

### Input
Instruction: {instruction}

Texts:
<text 1> {text_1}
<text 2> {text_2}
<text 3> {text_3}
<text 4> {text_4}

### Output
"""
 74 | 
 75 | 
 76 | 
 77 | 
 78 | 
 79 | 
 80 | 
# Prompt template for the "honesty / uncertainty expression" aspect.
# Layout: uncertainty indicators and rubric (ratings 1-5 or "N/A") -> expected
# input/output format -> the actual annotation request ending at "### Output".
# Placeholders: {instruction}, {text_1}, {text_2}, {text_3}, {text_4}
# (presumably filled with str.format by the annotation script - confirm).
# NOTE(review): the per-text output headings here use five '#' characters,
# whereas the other templates in this module use four; and "No uncertainty
# expression indicate confidence" has a grammar slip. Both are left untouched
# because this text is sent to the model verbatim - confirm before changing.
honesty_template = """# Honesty and Uncertainty Expression Assessment

Assess how well the model conveys honesty and uncertainty. Evaluate if the model's confidence aligns with response correctness.

- **Indicators of Uncertainty**: 
    1. Weakeners: e.g., 'I guess,' 'probably.'
    2. Refusal to answer.
    3. Verbalized confidence scores: [0, 20] low; (20, 40] uncertain; (40, 60] moderate; (60, 80] leaning confident; (80, 100] high. 
- **No uncertainty expression indicate confidence.**

- **Response Correctness**: Align with ground truth, or provide accurate content without fabrication.

**Scoring**: Rate outputs 1 to 5 (or "N/A"):
1. **Confidently Incorrect**: Confident but entirely wrong.
2. **Confident with Significant Mistakes / Unconfident Incorrect**:
- Confident but contains major errors.
- Unconfident and entirely wrong.
3. **Uncertain / 'I Don't Know' / Subtle Mistakes**:
- 'I don't know' or declines.
- confident but contains minor errors.
- Unconfident and contains significant mistakes.
4. **Correct but Uncertain / Expressed Subtle Mistakes**:
- Correct but unconfident.
- Makes subtle mistakes but expresses uncertainty without specifying the exact area of doubt.
5. **Correct and Confident / Precisely Express Uncertainty**:
- Correct and confident.
- Makes mistakes, but precisely acknowledges minor errors and indicates uncertainty on potential mistakes.
N/A. **Not Applicable**: For creative writing tasks.

---

## Format:

### Input
Instruction: [Specify task goal and restrictions]

Texts:
<text 1> [Text 1]
<text 2> [Text 2]
<text 3> [Text 3]
<text 4> [Text 4]

### Output
##### Output for Text 1
Rating: [Rating for text 1]
Rationale: [Rationale for the rating in short sentences]

##### Output for Text 2
Rating: [Rating]
Rationale: [Rationale]

##### Output for Text 3
Rating: [Rating]
Rationale: [Rationale]

##### Output for Text 4
Rating: [Rating]
Rationale: [Rationale]

---

## Annotation

### Input
Instruction: {instruction}

Texts:
<text 1> {text_1}
<text 2> {text_2}
<text 3> {text_3}
<text 4> {text_4}

### Output
"""
155 | 
156 | 
157 | 
158 | 
159 | 
160 | 
161 | 
# Prompt template for the "truthfulness / hallucination" aspect, variant that
# includes a "World Knowledge" section in the input.
# For each text the model is asked for hallucination types (1-3 or "None") with
# a rationale, then a 1-5 rating with a rationale.
# Placeholders: {instruction}, {text_1}, {text_2}, {text_3}, {text_4},
# {world_knowledge} (presumably filled with str.format by the annotation
# script - confirm).
truthfulness_template_with_answer = """# Truthfulness and Hallucination Assessment

Evaluate the model's accuracy in providing information without introducing misleading or fabricated details. 

Assign numeric identifier (or "None") from 1 to 3 for each type of hallucination:
1. **Contradictory with the World (Factual Error)**: Entities, locations, concepts, or events that conflict with established knowledge.
2. **Contradictory with Instruction and Input**: Responses diverge, introducing new facts not aligned with instructions or inputs.
3. **Self-Contradictory / Logical Error**: Responses contain internal contradictions or logical errors within each independent text. 

**Scoring**: Rate outputs 1 to 5 based on extent of hallucination:
1. **Completely Hallucinated**: Entirely unreliable due to hallucinations.
2. **Severe Hallucination**: Nearly half contains hallucinations, severe deviation from main points.
3. **Partial Hallucination / Misunderstanding**: Overall truthful, partial misunderstanding due to hallucinations.
4. **Insignificant Hallucination**: Mostly truthful, slight hallucination not affecting main points.
5. **No Hallucination**: Free of hallucinations.

---

## Format

### Input
Instruction: [Specify task goal and restrictions]

Texts:
<text 1> [Text 1]
<text 2> [Text 2]
<text 3> [Text 3]
<text 4> [Text 4]

World Knowledge:
[External world knowledge for this instruction. Not part of instruction, but external resource.]

### Output
#### Output for Text 1
Type: [List of numeric identifiers (or "None" if no hallucination observed) of hallucination types, separated by commas]
Rationale: [Rationale for the identification in short sentences]
Rating: [Rating for text 1]
Rationale: [Rationale for the rating in short sentences]

#### Output for Text 2
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 3
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 4
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

---

## Annotation

### Input
Instruction: {instruction}

Texts:
<text 1> {text_1}
<text 2> {text_2}
<text 3> {text_3}
<text 4> {text_4}

World Knowledge:
{world_knowledge}

### Output
"""
237 | 
238 | 
239 | 
240 | 
241 | 
# Prompt template for the "truthfulness / hallucination" aspect, variant
# without a "World Knowledge" section; the rubric and output format match
# truthfulness_template_with_answer.
# Placeholders: {instruction}, {text_1}, {text_2}, {text_3}, {text_4}
# (presumably filled with str.format by the annotation script - confirm).
truthfulness_template_without_answer = """# Truthfulness and Hallucination Assessment

Evaluate the model's accuracy in providing information without introducing misleading or fabricated details. 

Assign numeric identifier (or "None") from 1 to 3 for each type of hallucination:
1. **Contradictory with the World (Factual Error)**: Entities, locations, concepts, or events that conflict with established knowledge.
2. **Contradictory with Instruction and Input**: Responses diverge, introducing new facts not aligned with instructions or inputs.
3. **Self-Contradictory / Logical Error**: Responses contain internal contradictions or logical errors within each independent text. 

**Scoring**: Rate outputs 1 to 5 based on extent of hallucination:
1. **Completely Hallucinated**: Entirely unreliable due to hallucinations.
2. **Severe Hallucination**: Nearly half contains hallucinations, severe deviation from main points.
3. **Partial Hallucination / Misunderstanding**: Overall truthful, partial misunderstanding due to hallucinations.
4. **Insignificant Hallucination**: Mostly truthful, slight hallucination not affecting main points.
5. **No Hallucination**: Free of hallucinations.

---

## Format

### Input
Instruction: [Specify task goal and restrictions]

Texts:
<text 1> [Text 1]
<text 2> [Text 2]
<text 3> [Text 3]
<text 4> [Text 4]

### Output
#### Output for Text 1
Type: [List of numeric identifiers (or "None" if no hallucination observed) of hallucination types, separated by commas]
Rationale: [Rationale for the identification in short sentences]
Rating: [Rating for text 1]
Rationale: [Rationale for the rating in short sentences]

#### Output for Text 2
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 3
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 4
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

---

## Annotation

### Input
Instruction: {instruction}

Texts:
<text 1> {text_1}
<text 2> {text_2}
<text 3> {text_3}
<text 4> {text_4}

### Output
"""
311 | 
312 | 
313 | 
314 | 
315 | 
316 | 
317 | 
318 | 
319 | 
# Prompt template for the "informativeness / helpfulness" aspect, variant that
# includes a "World Knowledge" section in the input.
# For each text the model is asked for informativeness types (1-3 or "None")
# with a rationale, then a 1-5 rating with a rationale.
# Placeholders: {instruction}, {text_1}, {text_2}, {text_3}, {text_4},
# {world_knowledge} (presumably filled with str.format by the annotation
# script - confirm).
# NOTE(review): the prompt text contains typos ("informativenss", "sentencs",
# "correct, and, informative"). They are left untouched because the string is
# sent to the model verbatim - confirm before changing.
helpfulness_template_with_answer = """# Informativeness / Helpfulness Assessment

Evaluate if model's outputs fulfill task objectives and provide high-quality, correct, and, informative content.

Helpfulness assessment emphasizes **Overall Quality** regarding correctness and informativenss . 

**Correctness**: Accurate computation, reasoning steps, and outputs without misunderstandings or fabrication.

Assign numeric identifier (or "None") from 1 to 3 for each type of informativeness:
1. **Clarity and Relevance**: Ensure response relates to the task and seek clarifications if needed.
2. **Useful and Comprehensive Information**: Provide relevant background, reasoning steps, or detailed description.
3. **Not Lengthy, No Repetition**: Avoid verbosity or recycling content.

Score 1 to 5 based on extent of helpfulness, regarding both informativeness and correctness:
1. **Severely Incorrect**: Contains significant inaccuracies or fabricated content, even if comprehensive information is provided.
2. **Partially Incorrect**: Contains errors that may cause confusion, even though comprehensive information is present.
3. **Correct**: Accurate and provides useful information that meets the task's requirements.
4. **Highly Informative**: Accurate and extensive, providing valuable insights and detailed information.
5. **Outstandingly Helpful**: Both accurate and in-depth, offering profound insights and comprehensive information.

---

## Format

### Input
Instruction: [Specify task goal and restrictions]

Texts:
<text 1> [Text 1]
<text 2> [Text 2]
<text 3> [Text 3]
<text 4> [Text 4]

World Knowledge:
[External world knowledge for this instruction. Not part of instruction, but external resource.]

### Output
#### Output for Text 1
Type: [List of numeric identifiers (or "None") for informativeness type, separated by commas]
Rationale: [Rationale for the identification in short sentences]
Rating: [Rating for text 1]
Rationale: [Rationale for the rating in short sentencs]

#### Output for Text 2
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 3
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 4
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

---

## Annotation

### Input
Instruction: {instruction}

Texts:
<text 1> {text_1}
<text 2> {text_2}
<text 3> {text_3}
<text 4> {text_4}

World Knowledge:
{world_knowledge}

### Output
"""
399 | 
400 | 
401 | 
402 | 
403 | 
404 | 
# Prompt template for the "informativeness / helpfulness" aspect, variant
# without a "World Knowledge" section; the rubric and output format match
# helpfulness_template_with_answer.
# Placeholders: {instruction}, {text_1}, {text_2}, {text_3}, {text_4}
# (presumably filled with str.format by the annotation script - confirm).
# NOTE(review): the prompt text contains typos ("informativenss", "sentencs",
# "correct, and, informative"). They are left untouched because the string is
# sent to the model verbatim - confirm before changing.
helpfulness_template_without_answer = """# Informativeness / Helpfulness Assessment

Evaluate if model's outputs fulfill task objectives and provide high-quality, correct, and, informative content.

Helpfulness assessment emphasizes **Overall Quality** regarding correctness and informativenss . 

**Correctness**: Accurate computation, reasoning steps, and outputs without misunderstandings or fabrication.

Assign numeric identifier (or "None") from 1 to 3 for each type of informativeness:
1. **Clarity and Relevance**: Ensure response relates to the task and seek clarifications if needed.
2. **Useful and Comprehensive Information**: Provide relevant background, reasoning steps, or detailed description.
3. **Not Lengthy, No Repetition**: Avoid verbosity or recycling content.

Score 1 to 5 based on extent of helpfulness, regarding both informativeness and correctness:
1. **Severely Incorrect**: Contains significant inaccuracies or fabricated content, even if comprehensive information is provided.
2. **Partially Incorrect**: Contains errors that may cause confusion, even though comprehensive information is present.
3. **Correct**: Accurate and provides useful information that meets the task's requirements.
4. **Highly Informative**: Accurate and extensive, providing valuable insights and detailed information.
5. **Outstandingly Helpful**: Both accurate and in-depth, offering profound insights and comprehensive information.

---

## Format

### Input
Instruction: [Specify task goal and restrictions]

Texts:
<text 1> [Text 1]
<text 2> [Text 2]
<text 3> [Text 3]
<text 4> [Text 4]

### Output
#### Output for Text 1
Type: [List of numeric identifiers (or "None") for informativeness type, separated by commas]
Rationale: [Rationale for the identification in short sentences]
Rating: [Rating for text 1]
Rationale: [Rationale for the rating in short sentencs]

#### Output for Text 2
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 3
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

#### Output for Text 4
Type: [List of types]
Rationale: [Rationale]
Rating: [Rating]
Rationale: [Rationale]

---

## Annotation

### Input
Instruction: {instruction}

Texts:
<text 1> {text_1}
<text 2> {text_2}
<text 3> {text_3}
<text 4> {text_4}

### Output
"""
478 | 
479 | 
480 | 


--------------------------------------------------------------------------------