├── .gitignore ├── .gitmodules ├── README.md ├── assets └── example_data.png ├── data ├── chat_filtered.json ├── code_filtered.json ├── math_filtered.json ├── safety-refuse_filtered.json ├── safety-response_filtered.json └── total_dataset.json ├── nvidia.sh ├── rewardbench ├── run_dpo.sh ├── run_rm.sh └── scripts ├── configs ├── run_dpo.py ├── run_rm.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | rewardbench-bak/ 2 | analysis_bak/ 3 | analysis/alpaca_eval/datasets--tatsu-lab--alpaca_eval/ 4 | analysis/alpaca_eval/.locks/ 5 | playground/data/oasst/ 6 | # temp code 7 | eval_code/ 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | cover/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | .pybuilder/ 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | # For a library or package, you might want to ignore these files since the code is 95 | # intended to run in multiple environments; otherwise, check them in: 96 | # .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # poetry 106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 107 | # This is especially recommended for binary packages to ensure reproducibility, and is more 108 | # commonly ignored for libraries. 109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 110 | #poetry.lock 111 | 112 | # pdm 113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 114 | #pdm.lock 115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 116 | # in version control. 117 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 118 | .pdm.toml 119 | .pdm-python 120 | .pdm-build/ 121 | 122 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 123 | __pypackages__/ 124 | 125 | # Celery stuff 126 | celerybeat-schedule 127 | celerybeat.pid 128 | 129 | # SageMath parsed files 130 | *.sage.py 131 | 132 | # Environments 133 | .env 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | MODELS 172 | RewardModels 173 | data/reward-bench/filtered/cache-089d8c9766de669e.arrow 174 | go1.20.5.linux-amd64.tar.gz 175 | data/reward-bench/filtered/cache-*.arrow 176 | results/ 177 | results_backup/ 178 | cache-* 179 | *.json 180 | *.png 181 | data/ultrachat* 182 | data/tulu* 183 | data/baai* 184 | data/sharegpt/ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json?download=true 185 | DATASETS 186 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "allenai_rewardbench"] 2 | path = allenai_rewardbench 3 | url = https://github.com/allenai/reward-bench.git 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RM-Bench 2 | 3 | This repository contains the data of the ICLR 25 Oral Paper "*RM-Bench: Benchmarking Reward Models of Language Models with Subtlety and Style*" 4 | 5 | ## 🔥News 6 | 7 | - [2025/03/06] We have established a partnership with **AGI-Eval** platform. All results of RM-Bench are now available on [AGI-Eval](https://agi-eval.cn/evaluation/detail?id=57)! 8 | - [2025/01/23] Our paper has been accepted by ICLR 2025 as Oral! See u in SGP!! 🎉 9 | 10 | ## Introduction 11 | We introduce RM-Bench, a benchmark dataset for evaluating reward models of language modeling. 12 | We focus on two aspects of reward models: **Sensitivity to Subtle Changes** and **Robustness to Style Bias**. 13 | Specifically, for each prompt in RM-Bench, we provide three chosen responses and three rejected responses with different styles. 14 | The difference between the chosen and rejected responses is subtle, and the styles of the responses are varied from concise to detailed to well-formatted. 15 | 16 | 17 | Example Data 18 |

Figure 1: Example Data from RM-Bench. The rejected response is incorrect because Schrödinger's cat illustrates the concept of quantum superposition, not quantum entanglement. 19 | $y^\varnothing$ is a concise response, $y^{\text{L}}$ is a detailed response, and $y^{\text{L,M}}$ is a detailed response with markdown formatting. 20 |

21 | 
22 | ## Dataset Details 
23 | The dataset can be found in the `data` directory or downloaded from [Hugging Face](https://huggingface.co/datasets/THU-KEG/RM-Bench). 
24 | The samples are formatted as follows: 
25 | 
26 | ```json 
27 | { 
28 |     "id": // unique identifier of the sample, 
29 |     "prompt": // the prompt given to the model, 
30 |     "chosen": [ 
31 |         "resp_1", // the chosen response with concise style, 
32 |         "resp_2", // the chosen response with detailed style and formatted as plain text, 
33 |         "resp_3" // the chosen response with detailed style and formatted as markdown, 
34 |     ], 
35 |     "rejected": [ 
36 |         "resp_1", // the rejected response with concise style, 
37 |         "resp_2", // the rejected response with detailed style and formatted as plain text, 
38 |         "resp_3" // the rejected response with detailed style and formatted as markdown, 
39 |     ], 
40 |     "domain": // the domain of the sample, one of "chat", "code", "math", "safety-refuse", "safety-response" 
41 | } 
42 | ``` 
43 | 
44 | ## Repository Structure 
45 | 
46 | 
47 | ```bash 
48 | ├── README.md 
49 | ├── allenai_rewardbench # the rewardbench codebase 
50 | ├── rewardbench # the soft link to allenai_rewardbench/rewardbench 
51 | ├── data 
52 | │   ├── chat_filtered.json # the chat domain dataset 
53 | │   ├── code_filtered.json # the code domain dataset 
54 | │   ├── math_filtered.json # the math domain dataset 
55 | │   ├── safety-refuse_filtered.json # the safety-refuse subdomain dataset 
56 | │   ├── safety-response_filtered.json # the safety-response subdomain dataset 
57 | │   └── total_dataset.json # the total dataset 
58 | ├── scripts 
59 | │   ├── run_rm.py # the python script for running evaluation on sequence-classification reward model 
60 | │   ├── run_dpo.py # the python script for running evaluation on DPO reward model 
61 | │   ├── utils.py 
62 | │   ├── __pycache__ 
63 | │   └── configs # the configuration files for running evaluation 
64 | ├── nvidia.sh # the script for running evaluation on NVIDIA SteerLM series reward model 
65 | ├── run_rm.sh # the script for running evaluation on sequence-classification reward model 
66 | └── run_dpo.sh # the script for running evaluation on DPO reward model 
67 | ``` 
68 | 
69 | 
70 | 
71 | 
72 | ## Evaluation 
73 | 
74 | Our codebase is largely based on the [Reward Bench](https://github.com/allenai/reward-bench/) codebase. 
75 | Thus, for the environment setup, you may follow the instructions in the [Reward Bench Setup](https://github.com/allenai/reward-bench/tree/main?tab=readme-ov-file#quick-usage). 
76 | After cloning the repository, you can run the following commands to evaluate a reward model on RM-Bench: 
77 | ```bash 
78 | bash run_rm.sh # for sequence-classification reward model 
79 | bash run_dpo.sh # for DPO model as reward model 
80 | ``` 
81 | 
82 | 
83 | ## How to Compute the Accuracy 
84 | 
85 | The accuracy is computed by comparing the scores of the chosen and rejected responses across every pair of styles. 
86 | The detailed code is provided in `scripts/utils.py`. 
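In practice, `run_rm.sh` and `run_dpo.sh` write a copy of the evaluation data under `results/`, with per-style `score_chosen` and `score_rejected` lists added to every sample, and `compute_accuracy` is applied to that list of records. Here is a minimal usage sketch (assuming it is run from the repository root; the results path is only a placeholder for whatever file the scripts actually produce):

```python
import json

from scripts.utils import compute_accuracy

# placeholder path: substitute the JSON file written under results/ by run_rm.sh or run_dpo.sh
results_path = "results/Seq_Classifier/your-model/your_results_file.json"

with open(results_path) as f:
    results = json.load(f)  # a list of samples, each carrying "score_chosen" and "score_rejected" lists

# prints hard/normal/easy accuracy (plus per-domain averages when the file covers all domains, e.g. total_dataset.json)
print(compute_accuracy(results))
```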
87 | Here is a quick example of how the accuracy is computed: 
88 | ```python 
89 | import numpy as np 
90 | from typing import List, Dict, Any 
91 | def compute_accuracy(results: List[Dict[str, Any]]) -> Dict[str, float]: 
92 |     # results is a list of dictionaries; each dictionary contains the following keys: 
93 |     # score_chosen: [float, float, float], the scores of the chosen responses 
94 |     # score_rejected: [float, float, float], the scores of the rejected responses 
95 |     # the scores are in the order of [concise, detailed_plain, detailed_markdown] 
96 |     # we compare the scores of chosen responses and rejected responses pairwise, 
97 |     # formatted as a 3x3 matrix, where the rows represent the scores of chosen responses 
98 |     # and the columns represent the scores of rejected responses 
99 |     MATRIX_SIZE = 3 # the column and row size of the matrix 
100 |     acc_matrix = np.zeros((MATRIX_SIZE, MATRIX_SIZE)) 
101 |     for result in results: 
102 |         for i in range(len(result["score_chosen"])): 
103 |             for j in range(len(result["score_rejected"])): 
104 |                 if result["score_chosen"][i] > result["score_rejected"][j]: 
105 |                     acc_matrix[i][j] += 1 
106 | 
107 |     # compute the accuracy by dividing the number of correct comparisons by the total number of comparisons 
108 |     acc_matrix /= len(results) 
109 |     # compute the hard, normal, and easy accuracy 
110 |     # hard accuracy: the average of the upper-right triangle of the matrix, 
111 |     # namely chosen responses with a less fancy style compared to rejected responses with a more fancy style 
112 |     upper_right_count = MATRIX_SIZE * (MATRIX_SIZE - 1) / 2 
113 |     hard_acc = np.sum(np.triu(acc_matrix, 1)) / upper_right_count 
114 |     # normal accuracy: the average of the diagonal of the matrix, 
115 |     # namely chosen responses compared to rejected responses of the same style 
116 |     normal_acc = np.mean(np.diag(acc_matrix)) 
117 |     # easy accuracy: the average of the lower-left triangle of the matrix, 
118 |     # namely chosen responses with a more fancy style compared to rejected responses with a less fancy style 
119 |     lower_left_count = MATRIX_SIZE * (MATRIX_SIZE - 1) / 2 
120 |     easy_acc = np.sum(np.tril(acc_matrix, -1)) / lower_left_count 
121 | 
122 |     return { 
123 |         "hard_acc": hard_acc, 
124 |         "normal_acc": normal_acc, 
125 |         "easy_acc": easy_acc 
126 |     } 
127 | ``` 
128 | 
129 | 
130 | More details about the dataset can be found in our paper. 
131 | 
132 | ## Citation 
133 | If you find this dataset helpful, please cite the following paper: 
134 | ```bibtex 
135 | @article{liu2024rm, 
136 |   title={RM-Bench: Benchmarking Reward Models of Language Models with Subtlety and Style}, 
137 |   author={Liu, Yantao and Yao, Zijun and Min, Rui and Cao, Yixin and Hou, Lei and Li, Juanzi}, 
138 |   journal={arXiv preprint arXiv:2410.16184}, 
139 |   year={2024} 
140 | } 
141 | ``` 
142 | 
143 | ## Acknowledgement 
144 | We deeply appreciate the tremendous effort of the authors of [Reward Bench](https://github.com/allenai/reward-bench/tree/main) for providing the codebase and the dataset. 
145 | Without their work, our research would not have been possible. 
146 | -------------------------------------------------------------------------------- /assets/example_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/RM-Bench/6628f28b85a404269af6b3f0e89384479ada121e/assets/example_data.png -------------------------------------------------------------------------------- /nvidia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define paths 4 | HOST_MODEL_DIR="/data0/MODELS/nvidia/Llama3-70B-SteerLM-RM" 5 | CONTAINER_MODEL_DIR="/models/nvidia" 6 | 7 | 8 | 9 | # Docker image 10 | DOCKER_IMAGE="nvcr.io/nvidia/nemo:24.01.framework" 11 | 12 | # Inference server port 13 | INFERENCE_PORT=1424 14 | 15 | # Run the Docker container and start the server 16 | docker run --gpus '"device=4,5"' \ 17 | --runtime=nvidia \ 18 | --rm \ 19 | -it \ 20 | --name lyt_nemo \ 21 | --network=host \ 22 | -v $HOST_MODEL_DIR:$CONTAINER_MODEL_DIR \ 23 | -v data:/workspace/data \ 24 | -v $HF_HOME:/workspace/hf \ 25 | -e NCCL_DEBUG=INFO \ 26 | -e NCCL_DEBUG_SUBSYS=ALL \ 27 | -e NCCL_SOCKET_IFNAME=eth0 \ 28 | -e NCCL_IB_DISABLE=1 \ 29 | -e NCCL_P2P_LEVEL=1 \ 30 | -e http_proxy="http://localhost:8001" \ 31 | -e https_proxy="http://localhost:8001" \ 32 | -e no_proxy="localhost,127.0.0.1" \ 33 | --shm-size=200g \ 34 | $DOCKER_IMAGE \ 35 | /bin/bash 36 | 37 | # export CONTAINER_MODEL_DIR="/models/nvidia" && export HF_HOME=/workspace/hf && export INFERENCE_PORT=1424 && python /opt/NeMo-Aligner/examples/nlp/gpt/serve_reward_model.py \ 38 | # rm_model_file=$CONTAINER_MODEL_DIR \ 39 | # trainer.num_nodes=1 \ 40 | # trainer.devices=2 \ 41 | # ++model.tensor_model_parallel_size=1 \ 42 | # ++model.pipeline_model_parallel_size=2 \ 43 | # inference.micro_batch_size=2 \ 44 | # inference.port=$INFERENCE_PORT 45 | 46 | 47 | # python /opt/NeMo-Aligner/examples/nlp/data/steerlm/attribute_annotate.py \ 48 | # --input-file=data/our-bench/step5_v5_chat_factual_construct.jsonl \ 49 | # --output-file=data/our-bench/step5_v5_chat_factual_construct_labeled.jsonl \ 50 | # --port=1424 51 | 52 | 53 | # docker run --gpus '"device=0"' \ 54 | # --rm \ 55 | # --name nemo_inference_server \ 56 | # -p $INFERENCE_PORT:$INFERENCE_PORT \ 57 | # -v $HOST_MODEL_DIR:$CONTAINER_MODEL_DIR \ 58 | # $DOCKER_IMAGE \ 59 | # python /opt/NeMo-Aligner/examples/nlp/gpt/serve_reward_model.py \ 60 | # rm_model_file=$CONTAINER_MODEL_DIR/Llama2-13B-SteerLM-RM.nemo \ 61 | # trainer.num_nodes=1 \ 62 | # trainer.devices=1 \ 63 | # ++model.tensor_model_parallel_size=1 \ 64 | # ++model.pipeline_model_parallel_size=1 \ 65 | # inference.micro_batch_size=2 \ 66 | # inference.port=$INFERENCE_PORT 67 | 68 | # python /opt/NeMo-Aligner/examples/nlp/data/steerlm/preprocess_openassistant_data.py --output_directory=data/oasst 69 | 70 | # python /opt/NeMo-Aligner/examples/nlp/data/steerlm/attribute_annotate.py \ 71 | # --input-file=data/oasst/train.jsonl \ 72 | # --output-file=data/oasst/train_labeled.jsonl \ 73 | # --port=1424 -------------------------------------------------------------------------------- /rewardbench: -------------------------------------------------------------------------------- 1 | allenai_rewardbench/rewardbench/ -------------------------------------------------------------------------------- /run_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SET CUDA_HOME 4 | # export 
CUDA_HOME=/mnt/wuxuaner/workspace/miniconda3/envs/torch231 
5 | # export PATH=$CUDA_HOME/bin:$PATH 
6 | # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH 
7 | 
8 | # Prompt for model path 
9 | read -p "Enter the model path (default: RewardModels/allenai/tulu-v2.5-dpo-13b-hh-rlhf-60k): " MODEL_PATH 
10 | MODEL_PATH=${MODEL_PATH:-RewardModels/allenai/tulu-v2.5-dpo-13b-hh-rlhf-60k} 
11 | 
12 | # Prompt for CUDA device 
13 | read -p "Enter the CUDA device (default: 0): " CUDA_DEVICE 
14 | CUDA_DEVICE=${CUDA_DEVICE:-0} 
15 | 
16 | # Add this line at the top of your run_dpo.sh script 
17 | source activate torch231 
18 | export PYTHONPATH=$PYTHONPATH:$(pwd) 
19 | export CUDA_VISIBLE_DEVICES=$CUDA_DEVICE 
20 | 
21 | chat_template=tulu 
22 | python scripts/run_dpo.py \ 
23 | --model $MODEL_PATH \ 
24 | --chat_template $chat_template \ 
25 | --datapath data/total_dataset.json \ 
26 | --batch_size 8 \ 
27 | --ref_model RewardModels/allenai/tulu-2-13b \ 
28 | --trust_remote_code 
29 | 
-------------------------------------------------------------------------------- 
/run_rm.sh: 
-------------------------------------------------------------------------------- 
1 | #!/bin/bash 
2 | 
3 | # Prompt for model path 
4 | read -p "Enter the model path (default: RewardModels/allenai/tulu-v2.5-13b-hh-rlhf-60k-rm): " MODEL_PATH 
5 | MODEL_PATH=${MODEL_PATH:-RewardModels/allenai/tulu-v2.5-13b-hh-rlhf-60k-rm} 
6 | 
7 | # Prompt for CUDA device 
8 | read -p "Enter the CUDA device (default: 0): " CUDA_DEVICE 
9 | CUDA_DEVICE=${CUDA_DEVICE:-0} 
10 | 
11 | # source /mnt/wuxuaner/workspace/miniconda3/bin/activate torch231 
12 | source activate torch231 
13 | export PYTHONPATH=$PYTHONPATH:$(pwd) 
14 | export CUDA_VISIBLE_DEVICES=$CUDA_DEVICE 
15 | 
16 | chat_template=tulu 
17 | python scripts/run_rm.py \ 
18 | --model $MODEL_PATH \ 
19 | --datapath data/total_dataset.json \ 
20 | --batch_size 8 \ 
21 | --trust_remote_code \ 
22 | --chat_template $chat_template 
-------------------------------------------------------------------------------- 
/scripts/configs: 
-------------------------------------------------------------------------------- 
1 | ../allenai_rewardbench/scripts/configs 
-------------------------------------------------------------------------------- 
/scripts/run_dpo.py: 
-------------------------------------------------------------------------------- 
1 | # Copyright 2023 AllenAI. All rights reserved. 
2 | # 
3 | # Licensed under the Apache License, Version 2.0 (the "License"); 
4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 
6 | # 
7 | # http://www.apache.org/licenses/LICENSE-2.0 
8 | # 
9 | # Unless required by applicable law or agreed to in writing, software 
10 | # distributed under the License is distributed on an "AS IS" BASIS, 
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 
13 | # limitations under the License. 
14 | 15 | import argparse 16 | import logging 17 | import os 18 | import sys 19 | 20 | import numpy as np 21 | import torch 22 | import transformers 23 | from accelerate import Accelerator 24 | from accelerate.logging import get_logger 25 | from fastchat.conversation import get_conv_template 26 | from tqdm import tqdm 27 | from trl.trainer.utils import DPODataCollatorWithPadding 28 | from scripts.utils import convert_robust_dataset_to_preference_dataset_list, load_eval_dataset, compute_accuracy 29 | 30 | import gc 31 | 32 | 33 | from rewardbench import ( 34 | DPO_MODEL_CONFIG, 35 | DPOInference, 36 | # load_eval_dataset, 37 | save_to_hub, 38 | torch_dtype_mapping, 39 | ) 40 | from rewardbench.constants import EXAMPLE_COUNTS, SUBSET_MAPPING 41 | from rewardbench.utils import calculate_scores_per_section 42 | 43 | # get token from HF_TOKEN env variable, but if it doesn't exist pass none 44 | HF_TOKEN = os.getenv("HF_TOKEN", None) 45 | # this is necessary to automatically log in when running this script in docker/batch beaker jobs 46 | if HF_TOKEN is not None: 47 | from huggingface_hub._login import _login 48 | 49 | _login(token=HF_TOKEN, add_to_git_credential=False) 50 | 51 | 52 | def get_args(): 53 | """ 54 | Parse arguments strings model and chat_template 55 | """ 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--model", type=str, required=True, help="path to model") 58 | parser.add_argument("--ref_model", type=str, default=None, help="path to model") 59 | parser.add_argument( 60 | "--ref_free_type", type=str, default="avg", help="type of reference free normalization (norm, avg, or sum)" 61 | ) 62 | parser.add_argument("--datapath", type=str, default="data/reward-bench", help="path to data") 63 | parser.add_argument("--tokenizer", type=str, default=None, help="path to non-matching tokenizer") 64 | parser.add_argument("--chat_template", type=str, default="tulu", help="path to chat template") 65 | parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)") 66 | parser.add_argument("--batch_size", type=int, default=6, help="batch size for inference") 67 | parser.add_argument( 68 | "--pref_sets", action="store_true", help="run on common preference sets instead of our custom eval set" 69 | ) 70 | parser.add_argument( 71 | "--trust_remote_code", action="store_true", default=False, help="directly load model instead of pipeline" 72 | ) 73 | parser.add_argument("--debug", action="store_true", default=False, help="use only 10 examples") 74 | parser.add_argument( 75 | "--disable_beaker_save", action="store_true", help="disable saving the main results in a file for AI2 Beaker" 76 | ) 77 | parser.add_argument( 78 | "--not_quantized", action="store_true", help="disable quantization for models that are quantized by default" 79 | ) 80 | parser.add_argument( 81 | "--torch_dtype", 82 | type=str, 83 | default="float16", 84 | choices=["float16", "bfloat16", "float32", "float64"], 85 | help="PyTorch dtype (default: float16)", 86 | ) 87 | args = parser.parse_args() 88 | args.torch_dtype = torch_dtype_mapping(args.torch_dtype) 89 | return args 90 | 91 | 92 | def main(): 93 | args = get_args() 94 | accelerator = Accelerator() 95 | 96 | ############### 97 | # Setup logging 98 | ############### 99 | logger = get_logger(__name__) 100 | logging.basicConfig( 101 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 102 | datefmt="%Y-%m-%d %H:%M:%S", 103 | handlers=[logging.StreamHandler(sys.stdout)], 104 | ) 105 | log_level = logging.INFO 
106 | logger.setLevel(log_level) 
107 | transformers.utils.logging.set_verbosity(log_level) 
108 | transformers.utils.logging.enable_default_handler() 
109 | transformers.utils.logging.enable_explicit_format() 
110 | 
111 | logger.info(f"Running reward model on {args.model} with chat template {args.chat_template}") 
112 | if args.trust_remote_code: 
113 | logger.info("Loading model with Trust Remote Code") 
114 | 
115 | offical_model_name = args.model.replace("RewardModels/", "") 
116 | if offical_model_name in DPO_MODEL_CONFIG: 
117 | config = DPO_MODEL_CONFIG[offical_model_name] 
118 | else: 
119 | config = DPO_MODEL_CONFIG["default"] 
120 | logger.info(f"Using dpo model config: {config}") 
121 | 
122 | model_builder = config["model_builder"] 
123 | tokenizer_builder = config["tokenizer_builder"] 
124 | 
125 | # check datatype from argparse 
126 | if args.torch_dtype == torch.bfloat16: 
127 | logger.warning("Loading weights directly as bfloat16 for PyTorch dtype") 
128 | torch_dtype = torch.bfloat16 
129 | else: 
130 | torch_dtype = torch.float16 
131 | 
132 | assert args.model != args.ref_model, "policy and reference model should be different" 
133 | # load chat template 
134 | chat_template = args.chat_template 
135 | conv = get_conv_template(chat_template) 
136 | 
137 | # define reference free 
138 | if args.ref_model is None: 
139 | ref_free = True 
140 | logger.info("Running reference free DPO - no reference model provided") 
141 | else: 
142 | ref_free = False 
143 | logger.info(f"Running DPO with reference model {args.ref_model}") 
144 | 
145 | ############################ 
146 | # Load dataset 
147 | ############################ 
148 | logger.info("*** Load dataset ***") 
149 | tokenizer_path = args.tokenizer if args.tokenizer else args.model 
150 | tokenizer = tokenizer_builder(tokenizer_path, trust_remote_code=args.trust_remote_code) 
151 | tokenizer.pad_token = tokenizer.eos_token 
152 | # if no BOS token, set as pad token, e.g. 
QWEN models 153 | if tokenizer.bos_token is None: 154 | tokenizer.bos_token_id = tokenizer.eos_token_id 155 | tokenizer.pad_token_id = tokenizer.eos_token_id 156 | 157 | 158 | raw_dataset_list = convert_robust_dataset_to_preference_dataset_list(args.datapath) 159 | 160 | 161 | if ( 162 | ("llama-3" in args.model) 163 | or ("Llama3" in args.model) 164 | or ("Llama-3" in args.model) 165 | or ("LLaMA3" in args.model) 166 | or args.not_quantized 167 | ): 168 | model_kwargs = { 169 | "device_map": "auto", 170 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 171 | } 172 | model_kwargs_ref = { 173 | "device_map": "auto", 174 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 175 | } 176 | else: 177 | model_kwargs = { 178 | "load_in_8bit": True, 179 | "device_map": "auto", 180 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 181 | } 182 | model_kwargs_ref = { 183 | "load_in_8bit": True, 184 | "device_map": "auto", 185 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 186 | } 187 | 188 | model = model_builder( 189 | args.model, 190 | trust_remote_code=args.trust_remote_code, 191 | attn_implementation="sdpa", 192 | **model_kwargs, 193 | ) 194 | 195 | if ref_free: 196 | ref_model = None 197 | else: 198 | ref_model = model_builder( 199 | args.ref_model, 200 | trust_remote_code=args.trust_remote_code, 201 | **model_kwargs_ref, 202 | ) 203 | 204 | # use internal inference functions in DPO trainer 205 | dpo = DPOInference( 206 | model, 207 | ref_model, 208 | tokenizer=tokenizer, 209 | accelerator=accelerator, 210 | ref_free_norm=args.ref_free_type, 211 | # norm is norm, avg is average, sum is sum 212 | ) 213 | # score_original = [] 214 | score_chosen = [] 215 | score_rejected = [] 216 | 217 | for dataset_idx, raw_dataset in enumerate(raw_dataset_list): 218 | 219 | # clear cuda memory cache 220 | # model = None 221 | dataset = None 222 | dataloader = None 223 | tokenized_dataset = None 224 | batch = None 225 | # del model 226 | # Synchronize and clear GPU memory 227 | torch.cuda.synchronize() 228 | del dataset 229 | del dataloader 230 | del tokenized_dataset 231 | del batch 232 | gc.collect() 233 | torch.cuda.empty_cache() 234 | # gc.collect() 235 | # torch.cuda.empty_cache() 236 | torch.cuda.ipc_collect() 237 | # torch.cuda.empty_cache() 238 | # prin the gpu memory usage 239 | 240 | # for device in range(torch.cuda.device_count()): 241 | # cuda.select_device(device) # Select the GPU device 242 | # cuda.close() # Clear the memory 243 | # cuda.select_device(device) # Reinitialize the GPU device if necessary 244 | 245 | print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024:.2f} GB") 246 | 247 | dataset, subsets = load_eval_dataset( 248 | raw_dataset, 249 | core_set=not args.pref_sets, 250 | conv=conv, 251 | tokenizer=tokenizer, 252 | logger=logger, 253 | keep_columns=["text_chosen", "text_rejected", "id", "prompt"], 254 | ) 255 | 256 | dataset = dataset.remove_columns("id") 257 | # debug: use only 10 examples 258 | if args.debug: 259 | dataset = dataset.select(range(10)) 260 | subsets = subsets[:10] 261 | 262 | ############################ 263 | # Load reward model pipeline 264 | ############################ 265 | BATCH_SIZE = args.batch_size 266 | 267 | # tokenize dataset 268 | column_names = list(dataset.features) 269 | 270 | tokenized_dataset = dataset.map(dpo.tokenize_row, remove_columns=column_names) 271 | 272 | dataloader = torch.utils.data.DataLoader( 273 | tokenized_dataset, 274 | 
batch_size=BATCH_SIZE, 275 | collate_fn=DPODataCollatorWithPadding( 276 | pad_token_id=tokenizer.pad_token_id, 277 | label_pad_token_id=dpo.label_pad_token_id, 278 | is_encoder_decoder=dpo.is_encoder_decoder, 279 | ), 280 | # collate_fn = lambda x: x, # fix weird batching error 281 | shuffle=False, 282 | drop_last=False, 283 | ) 284 | results = [] 285 | scores_chosen = [] 286 | scores_rejected = [] 287 | 288 | for step, batch in enumerate(tqdm(dataloader, desc="RM batch steps")): 289 | logger.info(f"RM inference step {step}/{len(dataloader)}") 290 | 291 | rewards_chosen, rewards_rejected = dpo.inference_step(batch, ref_free=ref_free) 292 | 293 | # for each item in batch, record 1 if chosen > rejected 294 | # extra score from dict within batched results (e.g. logits) 295 | # [{'label': 'LABEL_1', 'score': 0.6826171875},... ] 296 | if isinstance(rewards_chosen[0], dict): 297 | scores_chosen_batch = [result["score"] for result in rewards_chosen] 298 | scores_rejected_batch = [result["score"] for result in rewards_rejected] 299 | # for classes that directly output scores (custom code) 300 | else: 301 | scores_chosen_batch = rewards_chosen.float().cpu().numpy().tolist() # convert to float for bfloat16 case 302 | scores_rejected_batch = rewards_rejected.float().cpu().numpy().tolist() 303 | 304 | [ 305 | results.append(1) if chosen > rejected else results.append(0) 306 | for chosen, rejected in zip(scores_chosen_batch, scores_rejected_batch) 307 | ] 308 | scores_chosen += scores_chosen_batch 309 | scores_rejected += scores_rejected_batch 310 | 311 | 312 | score_chosen.append(scores_chosen) 313 | score_rejected.append(scores_rejected) 314 | 315 | 316 | ############################ 317 | # Save results 318 | ############################ 319 | 320 | import json 321 | # HACK: load the dataset from the file 322 | dataset_json:list = json.load(open(args.datapath)) 323 | 324 | 325 | print(f"Type of score_chosen: {type(score_chosen[0])}") 326 | print(f"Lenght of score_chosen: {len(score_chosen[0])}") 327 | # print(score_chosen[0]) 328 | print(f"Type of score_rejected: {type(score_rejected[0])}") 329 | print(f"Lenght of score_rejected: {len(score_rejected[0])}") 330 | # print(score_rejected[0]) 331 | 332 | for idx, unit in enumerate(dataset_json): 333 | unit['score_chosen'] = [ 334 | score_list[idx] for score_list in score_chosen 335 | ] 336 | unit['score_rejected'] = [ 337 | score_list[idx] for score_list in score_rejected 338 | ] 339 | 340 | # save to results folder with the name + model name + timestamp 341 | filename = os.path.basename(args.datapath).replace(".json", "") 342 | model_name = args.model.split("/")[-1] 343 | ref_model_name = args.ref_model.split("/")[-1] if args.ref_model else "ref_free" 344 | output_dir = f"results/DPO/{offical_model_name}" 345 | if not os.path.exists(output_dir): 346 | os.makedirs(output_dir) 347 | from datetime import datetime 348 | output_path = os.path.join(output_dir, f"{filename}_{model_name}_{ref_model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json") 349 | with open(output_path, "w") as f: 350 | json.dump(dataset_json, f, indent=4, ensure_ascii=False) 351 | 352 | acc_dict = compute_accuracy(dataset_json) 353 | print(f"The accuracy of model {model_name}\n in the dataset {filename} is:\n {acc_dict}") 354 | 355 | if __name__ == "__main__": 356 | main() -------------------------------------------------------------------------------- /scripts/run_rm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 AllenAI. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import logging 17 | import os 18 | import sys 19 | 20 | import numpy as np 21 | import torch 22 | import transformers 23 | from accelerate import Accelerator 24 | from accelerate.logging import get_logger 25 | from fastchat.conversation import get_conv_template 26 | from tqdm import tqdm 27 | from transformers import AutoTokenizer, pipeline 28 | from scripts.utils import convert_robust_dataset_to_preference_dataset_list, load_eval_dataset, compute_accuracy 29 | 30 | import gc 31 | 32 | from rewardbench import ( 33 | REWARD_MODEL_CONFIG, 34 | check_tokenizer_chat_template, 35 | # load_eval_dataset, 36 | save_to_hub, 37 | torch_dtype_mapping, 38 | ) 39 | from rewardbench.constants import EXAMPLE_COUNTS, SUBSET_MAPPING 40 | from rewardbench.utils import calculate_scores_per_section 41 | 42 | # Enable TensorFloat32 (TF32) tensor cores on Ampere GPUs for matrix multiplications (faster than FP32) 43 | torch.backends.cuda.matmul.allow_tf32 = True 44 | torch.backends.cudnn.allow_tf32 = True 45 | 46 | # get token from HF_TOKEN env variable, but if it doesn't exist pass none 47 | HF_TOKEN = os.getenv("HF_TOKEN", None) 48 | # this is necessary to automatically log in when running this script in docker/batch beaker jobs 49 | if HF_TOKEN is not None: 50 | from huggingface_hub._login import _login 51 | 52 | _login(token=HF_TOKEN, add_to_git_credential=False) 53 | 54 | 55 | def get_args(): 56 | """ 57 | Parse arguments strings model and chat_template 58 | """ 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--model", type=str, required=True, help="path to model") 61 | parser.add_argument("--tokenizer", type=str, default=None, help="path to non-matching tokenizer to model") 62 | parser.add_argument("--chat_template", type=str, default="tulu", help="path to chat template") 63 | parser.add_argument( 64 | "--trust_remote_code", action="store_true", default=False, help="directly load model instead of pipeline" 65 | ) 66 | parser.add_argument("--datapath", type=str, default="data/reward-bench", help="path to data") 67 | parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)") 68 | parser.add_argument("--batch_size", type=int, default=64, help="batch size for inference") 69 | parser.add_argument("--max_length", type=int, default=2048, help="Max length of RM inputs (passed to pipeline)") 70 | parser.add_argument( 71 | "--pref_sets", action="store_true", help="run on common preference sets instead of our custom eval set" 72 | ) 73 | parser.add_argument( 74 | "--debug", action="store_true", help="run on common preference sets instead of our custom eval set" 75 | ) 76 | parser.add_argument( 77 | "--disable_beaker_save", action="store_true", help="disable saving the main results in a file for AI2 Beaker" 78 | ) 79 | parser.add_argument( 80 | "--not_quantized", action="store_true", help="disable quantization for 
models that are quantized by default" 81 | ) 82 | parser.add_argument( 83 | "--torch_dtype", 84 | type=str, 85 | default="float16", 86 | choices=["float16", "bfloat16", "float32", "float64"], 87 | help="PyTorch dtype (default: float16)", 88 | ) 89 | args = parser.parse_args() 90 | args.torch_dtype = torch_dtype_mapping(args.torch_dtype) 91 | return args 92 | 93 | 94 | def main(): 95 | args = get_args() 96 | ############### 97 | # Setup logging 98 | ############### 99 | accelerator = Accelerator() 100 | current_device = accelerator.process_index 101 | 102 | logger = get_logger(__name__) 103 | logging.basicConfig( 104 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 105 | datefmt="%Y-%m-%d %H:%M:%S", 106 | handlers=[logging.StreamHandler(sys.stdout)], 107 | ) 108 | log_level = logging.INFO 109 | logger.setLevel(log_level) 110 | transformers.utils.logging.set_verbosity(log_level) 111 | transformers.utils.logging.enable_default_handler() 112 | transformers.utils.logging.enable_explicit_format() 113 | 114 | logger.info(f"Running reward model on {args.model} with chat template {args.chat_template}") 115 | if args.trust_remote_code: 116 | logger.info("Loading model with Trust Remote Code") 117 | 118 | # load chat template 119 | chat_template = args.chat_template 120 | conv = get_conv_template(chat_template) 121 | logger.info(f"Using conversation template {chat_template}: {conv}") 122 | 123 | offical_model_name = args.model.replace("RewardModels/", "") 124 | if offical_model_name in REWARD_MODEL_CONFIG: 125 | # delete the "RewardModel/" prefix 126 | config = REWARD_MODEL_CONFIG[offical_model_name] 127 | else: 128 | config = REWARD_MODEL_CONFIG["default"] 129 | logger.info(f"Using reward model config: {config}") 130 | 131 | # Default entries 132 | # "model_builder": AutoModelForSequenceClassification.from_pretrained, 133 | # "pipeline_builder": pipeline, 134 | # "quantized": True, 135 | # "custom_dialogue": False, 136 | # "model_type": "Seq. 
Classifier" 137 | 138 | quantized = config["quantized"] # only Starling isn't quantized for now 139 | # if llama-3 in name, switch quantized to False (severely degrades performance) 140 | if ( 141 | ("llama-3" in args.model) 142 | or ("Llama3" in args.model) 143 | or ("Llama-3" in args.model) 144 | or ("LLaMA3" in args.model) 145 | or ("llama3" in args.model) 146 | or args.not_quantized 147 | ): 148 | quantized = False 149 | logger.info(f"Disabling quantization for llama-3 or override flag (--not_quantized: {args.not_quantized})") 150 | 151 | custom_dialogue = config["custom_dialogue"] 152 | model_type = config["model_type"] 153 | if model_type == "Custom Classifier": 154 | raise NotImplementedError("For the Custom Classifier model like NVIDIA SteerLM, plz refer to the NVIDIA original code") 155 | model_builder = config["model_builder"] 156 | pipeline_builder = config["pipeline_builder"] 157 | torch_dtype = config.get("torch_dtype", None) 158 | # if not datatype in config (default), check args 159 | if torch_dtype is None: 160 | # if datatype is bfloat16, then manually turn off quantizaiton (done with bitsandbytes) 161 | if args.torch_dtype == torch.bfloat16: 162 | quantized = False 163 | logger.info("Disabling quantization for bfloat16 datatype") 164 | torch_dtype = args.torch_dtype 165 | 166 | # not included in config to make user explicitly understand they are passing this 167 | trust_remote_code = args.trust_remote_code 168 | 169 | ############################ 170 | # Load dataset 171 | ############################ 172 | logger.info("*** Load dataset ***") 173 | tokenizer_path = args.tokenizer if args.tokenizer else args.model 174 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=args.trust_remote_code) 175 | if not custom_dialogue: # not needed for PairRM / SteamSHP 176 | tokenizer.truncation_side = "left" # copied from Starling, but few samples are above context length 177 | 178 | 179 | ############################ 180 | # Load reward model pipeline 181 | ############################ 182 | BATCH_SIZE = args.batch_size 183 | logger.info("*** Load reward model ***") 184 | reward_pipeline_kwargs = { 185 | "batch_size": BATCH_SIZE, # eval_args.inference_batch_size, 186 | "truncation": True, 187 | "padding": True, 188 | "max_length": args.max_length, 189 | "function_to_apply": "none", # Compute raw logits 190 | "return_token_type_ids": False, 191 | } 192 | if quantized: 193 | model_kwargs = { 194 | "load_in_8bit": True, 195 | "device_map": {"": current_device}, 196 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 197 | } 198 | else: 199 | model_kwargs = { 200 | "device_map": "auto", 201 | "torch_dtype": torch_dtype, 202 | } 203 | 204 | model = model_builder(args.model, **model_kwargs, trust_remote_code=trust_remote_code) 205 | reward_pipe = pipeline_builder( 206 | "text-classification", 207 | model=model, 208 | tokenizer=tokenizer, 209 | ) 210 | 211 | ############################ 212 | # Tokenization settings & dataset preparation 213 | ############################ 214 | # set pad token to eos token if not set 215 | if reward_pipe.tokenizer.pad_token_id is None: 216 | reward_pipe.model.config.pad_token_id = reward_pipe.tokenizer.eos_token_id 217 | reward_pipe.tokenizer.pad_token_id = reward_pipe.tokenizer.eos_token_id 218 | # For models whose config did not contains `pad_token_id` 219 | if reward_pipe.model.config.pad_token_id is None: 220 | reward_pipe.model.config.pad_token_id = reward_pipe.tokenizer.pad_token_id 221 | 222 | # if using 
fastchat template (no template in tokenizer), make the RM tokenizer output an EOS token 223 | if not check_tokenizer_chat_template(tokenizer): 224 | reward_pipe.tokenizer.add_eos_token = True 225 | 226 | 227 | raw_dataset_list = convert_robust_dataset_to_preference_dataset_list(args.datapath) 228 | 229 | # score_original = [] 230 | score_chosen = [] 231 | score_rejected = [] 232 | for dataset_idx, raw_dataset in enumerate(raw_dataset_list): 233 | 234 | # clear cuda memory cache 235 | dataset = None 236 | dataloader = None 237 | torch.cuda.synchronize() 238 | del dataset 239 | del dataloader 240 | gc.collect() 241 | torch.cuda.empty_cache() 242 | torch.cuda.ipc_collect() 243 | # prin the gpu memory usage 244 | print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024:.2f} GB") 245 | 246 | # for device in range(torch.cuda.device_count()): 247 | # cuda.select_device(device) # Select the GPU device 248 | # cuda.close() # Clear the memory 249 | # cuda.select_device(device) # Reinitialize the GPU device if necessary 250 | print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024:.2f} GB") 251 | 252 | 253 | 254 | dataset, subsets = load_eval_dataset( 255 | raw_dataset, 256 | core_set=not args.pref_sets, 257 | conv=conv, 258 | custom_dialogue_formatting=custom_dialogue, 259 | tokenizer=tokenizer, 260 | logger=logger, 261 | keep_columns=["text_chosen", "text_rejected", "id"], 262 | ) 263 | # copy id for saving, then remove 264 | ids = dataset["id"] 265 | dataset = dataset.remove_columns("id") 266 | 267 | # debug: use only 10 examples 268 | if args.debug: 269 | dataset = dataset.select(range(10)) 270 | subsets = subsets[:10] 271 | ids = ids[:10] 272 | 273 | 274 | 275 | ############################ 276 | # Run inference [1/2]" built in transformers 277 | ############################ 278 | # if using HF pipeline, can pass entire dataset and get results 279 | # first, handle custom pipelines that we must batch normally 280 | if pipeline_builder == pipeline: 281 | logger.info("*** Running forward pass via built in pipeline abstraction ***") 282 | # this setup can be optimized slightly with one pipeline call 283 | # prepare for inference 284 | reward_pipe = accelerator.prepare(reward_pipe) 285 | 286 | results_rej = reward_pipe(dataset["text_rejected"], **reward_pipeline_kwargs) 287 | results_cho = reward_pipe(dataset["text_chosen"], **reward_pipeline_kwargs) 288 | 289 | # extract scores from results which is list of dicts, e.g. [{'label': 'LABEL_1', 'score': 0.6826171875},... 
] 290 | unit_score_chosen_list = [result["score"] for result in results_cho] 291 | unit_score_rejected_list = [result["score"] for result in results_rej] 292 | 293 | # pairwise comparison list comprehension 294 | results = [1 if chosen > rejected else 0 for chosen, rejected in zip(unit_score_chosen_list, unit_score_rejected_list)] 295 | 296 | ############################ 297 | # Run inference [2/2] custom pipelines 298 | ############################ 299 | else: 300 | logger.info("*** Running dataloader to collect results ***") 301 | # TODO make more custom pipelines work with pre-tokenized data 302 | from torch.utils.data.dataloader import default_collate 303 | 304 | # for PairRM, hmm, will move all of this later 305 | def custom_collate_fn(batch): 306 | # check if ['text_chosen'] is in first batch element 307 | # Check if the first element of the batch is a dictionary 308 | if isinstance(batch[0]["text_chosen"][0], dict): 309 | return batch # Return the batch as-is if it's a list of dicts 310 | else: 311 | return default_collate(batch) # Use the default collate behavior otherwise 312 | 313 | dataloader = torch.utils.data.DataLoader( 314 | dataset, 315 | batch_size=BATCH_SIZE, 316 | collate_fn=custom_collate_fn, # if not args.pref_sets else None, 317 | shuffle=False, 318 | drop_last=False, 319 | ) 320 | 321 | dataloader, model = accelerator.prepare(dataloader, reward_pipe.model) 322 | reward_pipe.model = model 323 | 324 | results = [] 325 | unit_score_chosen_list = [] 326 | unit_score_rejected_list = [] 327 | for step, batch in enumerate(tqdm(dataloader, desc="RM batch steps")): 328 | # logger.info(f"RM inference step {step}/{len(dataloader)}") 329 | 330 | if model_type == "Custom Classifier": 331 | raise NotImplementedError("For the Custom Classifier model like NVIDIA SteerLM, plz refer to the NVIDIA original code") 332 | else: 333 | rewards_chosen = reward_pipe(batch["text_chosen"], **reward_pipeline_kwargs) 334 | rewards_rejected = reward_pipe(batch["text_rejected"], **reward_pipeline_kwargs) 335 | print(f"rewards_chosen: {rewards_chosen}") 336 | print(f"rewards_rejected: {rewards_rejected}") 337 | # for each item in batch, record 1 if chosen > rejected 338 | # extra score from dict within batched results (e.g. logits) 339 | # [{'label': 'LABEL_1', 'score': 0.6826171875},... 
] 340 | if isinstance(rewards_chosen[0], dict): 341 | score_chosen_batch = [result["score"] for result in rewards_chosen] 342 | score_rejected_batch = [result["score"] for result in rewards_rejected] 343 | # for classes that directly output scores (custom code) 344 | else: 345 | score_chosen_batch = ( 346 | rewards_chosen.float().cpu().numpy().tolist() 347 | ) # cast to float in case of bfloat16 348 | score_rejected_batch = rewards_rejected.float().cpu().numpy().tolist() 349 | 350 | # log results 351 | for chosen, rejected in zip(score_chosen_batch, score_rejected_batch): 352 | print(f"chosen: {chosen}, rejected: {rejected}") 353 | if chosen > rejected: 354 | results.append(1) 355 | else: 356 | results.append(0) 357 | unit_score_chosen_list.extend(score_chosen_batch) 358 | unit_score_rejected_list.extend(score_rejected_batch) 359 | 360 | 361 | 362 | score_chosen.append(unit_score_chosen_list) 363 | score_rejected.append(unit_score_rejected_list) 364 | 365 | 366 | ############################ 367 | # Save results 368 | ############################ 369 | 370 | import json 371 | # HACK: load the dataset from the file 372 | dataset_json:list = json.load(open(args.datapath)) 373 | 374 | # print(f"Type of score_original: {type(score_original)}") 375 | # print(f"Lenght of score_original: {len(score_original)}") 376 | # print(score_original) 377 | print(f"Type of score_chosen: {type(score_chosen[0])}") 378 | print(f"Lenght of score_chosen: {len(score_chosen[0])}") 379 | # print(score_chosen[0]) 380 | print(f"Type of score_rejected: {type(score_rejected[0])}") 381 | print(f"Lenght of score_rejected: {len(score_rejected[0])}") 382 | # print(score_rejected[0]) 383 | 384 | for idx, unit in enumerate(dataset_json): 385 | # unit['score_orig'] = score_original[idx] 386 | unit['score_chosen'] = [ 387 | score_list[idx] for score_list in score_chosen 388 | ] 389 | unit['score_rejected'] = [ 390 | score_list[idx] for score_list in score_rejected 391 | ] 392 | # if all the elemnts in the list are list and all the elements is of length 1 393 | if all(isinstance(elem, list) and len(elem) == 1 for elem in unit['score_chosen']): 394 | unit['score_chosen'] = [elem[0] for elem in unit['score_chosen']] 395 | if all(isinstance(elem, list) and len(elem) == 1 for elem in unit['score_rejected']): 396 | unit['score_rejected'] = [elem[0] for elem in unit['score_rejected']] 397 | 398 | # save to results folder with the name + model name + timestamp 399 | filename = os.path.basename(args.datapath).replace(".json", "") 400 | model_name = args.model.split("/")[-1] 401 | ref_model_name = "REWORD_MODEL" 402 | # make a dir at results with official model name 403 | output_dir = f"results/Seq_Classifier/{offical_model_name}" 404 | if not os.path.exists(output_dir): 405 | os.makedirs(output_dir) 406 | from datetime import datetime 407 | output_path = os.path.join(output_dir, f"{filename}_{model_name}_{ref_model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json") 408 | with open(output_path, "w") as f: 409 | json.dump(dataset_json, f, indent=4, ensure_ascii=False) 410 | 411 | acc_dict = compute_accuracy(dataset_json) 412 | print(f"The accuracy of model {model_name}\n in the dataset {filename} is:\n {acc_dict}") 413 | 414 | if __name__ == "__main__": 415 | main() -------------------------------------------------------------------------------- /scripts/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from fastchat.conversation import Conversation 3 | from datasets 
import Dataset, DatasetDict, Value, concatenate_datasets, load_dataset 4 | from transformers import PreTrainedTokenizer 5 | from rewardbench.utils import check_tokenizer_chat_template, prepare_dialogue, prepare_dialogue_from_tokenizer 6 | import numpy as np 7 | from typing import List, Dict, Any 8 | import json 9 | from datasets import Dataset, load_from_disk 10 | EXTRA_PREF_SETS = "allenai/pref-test-sets" 11 | def convert_robust_dataset_to_preference_dataset_list(robust_dataset_path: str) -> List[Dataset]: 12 | robust_dataset = json.load(open(robust_dataset_path)) 13 | # Prepare the chosen and rejected dataset list 14 | para_corp_dataset_list = [] 15 | num_pairs = len(robust_dataset[0]['chosen']) 16 | 17 | assert num_pairs == len(robust_dataset[0]['rejected']), \ 18 | "The number of chosen and rejected pairs should be the same." 19 | 20 | for idx in range(num_pairs): 21 | para_corp_dataset = Dataset.from_dict({ 22 | "id": [unit['id'] for unit in robust_dataset], 23 | "subset": ['subset' for unit in robust_dataset], 24 | "prompt": [unit['prompt'] for unit in robust_dataset], 25 | "chosen": [unit['chosen'][idx] for unit in robust_dataset], 26 | "chosen_model": ["chosen" for _ in robust_dataset], 27 | "rejected": [unit['rejected'][idx] for unit in robust_dataset], 28 | "rejected_model": ["rejected" for _ in robust_dataset], 29 | }) 30 | para_corp_dataset_list.append(para_corp_dataset) 31 | 32 | return para_corp_dataset_list 33 | 34 | def split_dataset_by_domain(dataset: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: 35 | domains = ["chat","math","code","safety"] 36 | domain_dataset_dict = {} 37 | for domain in domains: 38 | domain_dataset_dict[domain] = [example for example in dataset if example['domain'].startswith(domain)] 39 | 40 | # pop the domain keys 41 | for domain in domain_dataset_dict: 42 | for example in domain_dataset_dict[domain]: 43 | example.pop('domain') 44 | 45 | return domain_dataset_dict 46 | 47 | 48 | def compute_accuracy(results: List[Dict[str, Any]]) -> Dict[str, float]: 49 | if 'domain' in results[0]: 50 | # this indicates this is total_dataset.json 51 | print('We are handling total_dataset.json') 52 | print('Splitting the dataset by domain...') 53 | # thus we need to split the results into different domains 54 | split_results = split_dataset_by_domain(results) 55 | domain_results = {} 56 | for domain in split_results: 57 | domain_results[domain] = compute_accuracy(split_results[domain]) 58 | domain_avg_results = {} 59 | for domain in domain_results: 60 | domain_avg_results[domain] = np.mean(list(domain_results[domain].values())) 61 | domain_hard_normal_easy_acc = { 62 | "hard_acc": np.mean([domain_results[domain]["hard_acc"] for domain in domain_results]), 63 | "normal_acc": np.mean([domain_results[domain]["normal_acc"] for domain in domain_results]), 64 | "easy_acc": np.mean([domain_results[domain]["easy_acc"] for domain in domain_results]) 65 | } 66 | total_avg_acc = np.mean([domain_avg_results[domain] for domain in domain_avg_results]) 67 | # merge the results into one falten dictionary 68 | final_results = {} 69 | # merge domain_avg_results into final_results 70 | final_results.update(domain_avg_results) 71 | # merge domain_hard_normal_easy_acc into final_results 72 | final_results.update(domain_hard_normal_easy_acc) 73 | # merge total_avg_acc into final_results 74 | final_results.update({"total_avg_acc": total_avg_acc}) 75 | return final_results 76 | 77 | 78 | # results is a list of dictionaries, each dictionary contains the following keys: 79 | # 
score_chosen: [float, float, float], the scores of the chosen responses 80 | # score_rejected: [float, float, float], the scores of the rejected responses 81 | # the scores are in the order of [concise, detailed_plain, detailed_markdown] 82 | # we will compare the scores of chosen responses and rejected responses iteratively 83 | # formatted as a 3x3 matrix, where the rows represent the scores of chosen responses 84 | # and the columns represent the scores of rejected responses 85 | MATRIX_SIZE = 3 # the column and row size of the matrix 86 | acc_matrix = np.zeros((MATRIX_SIZE, MATRIX_SIZE)) 87 | for result in results: 88 | for i in range(len(result["score_chosen"])): 89 | for j in range(len(result["score_rejected"])): 90 | if result["score_chosen"][i] > result["score_rejected"][j]: 91 | acc_matrix[i][j] += 1 92 | 93 | # compute the accuracy by dividing the number of correct comparisons by the total number of comparisons 94 | acc_matrix /= len(results) 95 | # compute the hard,normal,easy accuracy 96 | # hard accuracy: the average of the upper-right triangle of the matrix 97 | # namely chosen responses with less fancy style compared to rejected responses with more fancy style 98 | upper_right_count = MATRIX_SIZE * (MATRIX_SIZE - 1) / 2 99 | hard_acc = np.sum(np.triu(acc_matrix, 1)) / upper_right_count 100 | # normal accuracy: the average of the diagonal of the matrix 101 | # namely chosen responses with the same style compared to rejected responses with the same style 102 | normal_acc = np.mean(np.diag(acc_matrix)) 103 | # easy accuracy: the average of the lower-left triangle of the matrix 104 | # namely chosen responses with more fancy style compared to rejected responses with less fancy style 105 | lower_left_count = MATRIX_SIZE * (MATRIX_SIZE - 1) / 2 106 | easy_acc = np.sum(np.tril(acc_matrix, -1)) / lower_left_count 107 | 108 | return { 109 | "hard_acc": hard_acc, 110 | "normal_acc": normal_acc, 111 | "easy_acc": easy_acc 112 | } 113 | 114 | 115 | 116 | 117 | def load_eval_dataset( 118 | raw_Dataset: Dataset = None, 119 | core_set: bool = True, 120 | custom_dialogue_formatting: bool = False, 121 | conv: Conversation = None, 122 | tokenizer: PreTrainedTokenizer = None, 123 | logger: logging.Logger = None, 124 | keep_columns: List[str] = ["text_chosen", "text_rejected", "id"], 125 | return_extra_data: bool = False, 126 | max_turns: int = None, 127 | ) -> tuple[Dataset, list[str]]: 128 | """ 129 | Loads either the core eval set for HERM or the existing preference data test sets. 130 | 131 | Args: 132 | core_set: if True, load the core eval set for HERM. 133 | custom_dialogue_formatting: if True, format the dialogue as needed for custom models (e.g. SHP and PairRM). 134 | conv: fastchat conversation template. 135 | If None (default) the passed tokenizer needs to have a usable chat template. 136 | tokenizer: HuggingFace tokenizer to use. The tokenizer's chat template, if available, has precedence over conv. 137 | logger: logger to use for logging. If None (default), no logging is done. 138 | keep_columns: list of columns to keep in the dataset. 139 | max_turns: maximum number of turns in the dialogue (usually even). If None (default), no filtering is done. 140 | 141 | Returns: 142 | dataset: loaded dataset with required properties. 143 | subsets: list of subsets for the corresponding samples in the dataset. 
144 | """ 145 | if raw_Dataset is not None: 146 | raw_dataset = raw_Dataset 147 | elif core_set: 148 | raw_dataset = load_from_disk("data/reward-bench") 149 | raw_dataset = raw_dataset['filtered'] 150 | else: 151 | raw_dataset = load_dataset(EXTRA_PREF_SETS) 152 | modified_datasets = [] 153 | 154 | # Iterate over each subset in the DatasetDict 155 | for subset_name, subdataset in raw_dataset.items(): 156 | # if subset column exists, move to subsubset (for pref sets) 157 | if "subset" in subdataset.column_names: 158 | subdataset = subdataset.rename_column("subset", "subsubset") 159 | 160 | # Add a new column 'subset' to the dataset with the subset name 161 | subdataset = subdataset.add_column("subset", [subset_name] * len(subdataset)) 162 | 163 | # Append the modified dataset to the list 164 | # remove pku_safer and pku_better from the dict, no longer part of the benchmark 165 | if subset_name not in ["pku_safer", "pku_better"]: 166 | modified_datasets.append(subdataset) 167 | 168 | # Concatenate all the modified datasets into one dataset 169 | raw_dataset = concatenate_datasets(modified_datasets) 170 | 171 | # Apply chat template 172 | if not custom_dialogue_formatting: 173 | usable_tokenizer = check_tokenizer_chat_template(tokenizer) 174 | 175 | # assert either conv is passed or tokenizer has chat_template 176 | assert conv is not None or usable_tokenizer 177 | 178 | if usable_tokenizer: 179 | if logger is not None: 180 | logger.info("*** Preparing dataset with HF Transformers ***") 181 | # docs https://huggingface.co/docs/transformers/main/en/chat_templating 182 | dataset = raw_dataset.map( 183 | prepare_dialogue_from_tokenizer, 184 | fn_kwargs={"tokenizer": tokenizer}, 185 | num_proc=8, 186 | load_from_cache_file=False, 187 | ) 188 | 189 | # else use FastChat to get chat template 190 | else: 191 | if logger is not None: 192 | logger.info("*** Preparing dataset with FastChat ***") 193 | dataset = raw_dataset.map( 194 | prepare_dialogue, 195 | fn_kwargs={"dialogue_template": conv}, 196 | num_proc=8, # using >1 process causes issues with re-assigning prompt in example 197 | load_from_cache_file=False, 198 | ) 199 | else: 200 | if logger is not None: 201 | logger.info("*** Preparing dataset with custom formatting ***") 202 | 203 | def map_conversations(example, core_set=True): 204 | if core_set: 205 | example["text_chosen"] = [ 206 | {"role": "user", "content": example["prompt"]}, 207 | {"role": "assistant", "content": example["chosen"]}, 208 | ] 209 | example["text_rejected"] = [ 210 | {"role": "user", "content": example["prompt"]}, 211 | {"role": "assistant", "content": example["rejected"]}, 212 | ] 213 | else: 214 | prompt = example["prompt"] 215 | example["text_chosen"] = prompt + [{"role": "assistant", "content": example["chosen"]}] 216 | example["text_rejected"] = prompt + [{"role": "assistant", "content": example["rejected"]}] 217 | return example 218 | 219 | dataset = raw_dataset.map( 220 | map_conversations, 221 | fn_kwargs={"core_set": core_set}, 222 | num_proc=8, 223 | ) 224 | 225 | if max_turns is not None: 226 | assert max_turns > 0, "max_turns must be greater than 0" 227 | 228 | # filter long answers (MT Bench prompt as 1 or 2 turn examples) 229 | def filter_long_turns(batch): 230 | return len(batch["text_chosen"]) <= max_turns 231 | 232 | dataset = dataset.filter(filter_long_turns) 233 | 234 | # take column subset from dataset 235 | subsets = dataset["subset"] 236 | 237 | # remove columns if set and not custom_dialogue_formatting 238 | all_cols = dataset.column_names 239 | 
dataset = dataset.remove_columns([c for c in all_cols if c not in keep_columns]) 240 | 241 | return dataset, subsets 242 | 243 | 244 | 245 | if __name__ == "__main__": 246 | # test the function 247 | pass --------------------------------------------------------------------------------