├── .gitignore ├── .gitmodules ├── README.md ├── assets └── example_data.png ├── data ├── chat_filtered.json ├── code_filtered.json ├── math_filtered.json ├── safety-refuse_filtered.json ├── safety-response_filtered.json └── total_dataset.json ├── nvidia.sh ├── rewardbench ├── run_dpo.sh ├── run_rm.sh └── scripts ├── configs ├── run_dpo.py ├── run_rm.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | rewardbench-bak/ 2 | analysis_bak/ 3 | analysis/alpaca_eval/datasets--tatsu-lab--alpaca_eval/ 4 | analysis/alpaca_eval/.locks/ 5 | playground/data/oasst/ 6 | # temp code 7 | eval_code/ 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | cover/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | .pybuilder/ 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | # For a library or package, you might want to ignore these files since the code is 95 | # intended to run in multiple environments; otherwise, check them in: 96 | # .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # poetry 106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 107 | # This is especially recommended for binary packages to ensure reproducibility, and is more 108 | # commonly ignored for libraries. 109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 110 | #poetry.lock 111 | 112 | # pdm 113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 114 | #pdm.lock 115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 116 | # in version control. 117 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 118 | .pdm.toml 119 | .pdm-python 120 | .pdm-build/ 121 | 122 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 123 | __pypackages__/ 124 | 125 | # Celery stuff 126 | celerybeat-schedule 127 | celerybeat.pid 128 | 129 | # SageMath parsed files 130 | *.sage.py 131 | 132 | # Environments 133 | .env 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | MODELS 172 | RewardModels 173 | data/reward-bench/filtered/cache-089d8c9766de669e.arrow 174 | go1.20.5.linux-amd64.tar.gz 175 | data/reward-bench/filtered/cache-*.arrow 176 | results/ 177 | results_backup/ 178 | cache-* 179 | *.json 180 | *.png 181 | data/ultrachat* 182 | data/tulu* 183 | data/baai* 184 | data/sharegpt/ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json?download=true 185 | DATASETS 186 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "allenai_rewardbench"] 2 | path = allenai_rewardbench 3 | url = https://github.com/allenai/reward-bench.git 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RM-Bench 2 | 3 | This repository contains the data of the ICLR 25 Oral Paper "*RM-Bench: Benchmarking Reward Models of Language Models with Subtlety and Style*" 4 | 5 | ## 🔥News 6 | 7 | - [2025/03/06] We have established a partnership with **AGI-Eval** platform. All results of RM-Bench are now available on [AGI-Eval](https://agi-eval.cn/evaluation/detail?id=57)! 8 | - [2025/01/23] Our paper has been accepted by ICLR 2025 as Oral! See u in SGP!! 🎉 9 | 10 | ## Introduction 11 | We introduce RM-Bench, a benchmark dataset for evaluating reward models of language modeling. 12 | We focus on two aspects of reward models: **Sensitivity to Subtle Changes** and **Robustness to Style Bias**. 13 | Specifically, for each prompt in RM-Bench, we provide three chosen responses and three rejected responses with different styles. 14 | The difference between the chosen and rejected responses is subtle, and the styles of the responses are varied from concise to detailed to well-formatted. 15 | 16 | 17 | Example Data 18 |

Figure 1: Example Data from RM-Bench. The rejected response is incorrect because Schrödinger's cat illustrates the concept of quantum superposition, not quantum entanglement. 19 | $y^\varnothing$ is a concise response, $y^{\text{L}}$ is a detailed response, and $y^{\text{L,M}}$ is a detailed response with markdown formatting. 20 |

21 | 
22 | ## Dataset Details 
23 | The dataset can be found in the `data` directory or downloaded from [Hugging Face](https://huggingface.co/datasets/THU-KEG/RM-Bench). 
24 | The samples are formatted as follows: 
25 | 
26 | ```json 
27 | { 
28 |     "id": // unique identifier of the sample, 
29 |     "prompt": // the prompt given to the model, 
30 |     "chosen": [ 
31 |         "resp_1", // the chosen response with concise style, 
32 |         "resp_2", // the chosen response with detailed style and formatted as plain text, 
33 |         "resp_3" // the chosen response with detailed style and formatted as markdown, 
34 |     ], 
35 |     "rejected": [ 
36 |         "resp_1", // the rejected response with concise style, 
37 |         "resp_2", // the rejected response with detailed style and formatted as plain text, 
38 |         "resp_3" // the rejected response with detailed style and formatted as markdown, 
39 |     ], 
40 |     "domain": // the domain of the sample, one of "chat", "code", "math", "safety-refuse", "safety-response" 
41 | } 
42 | ``` 
43 | 
44 | ## Repository Structure 
45 | 
46 | 
47 | ```bash 
48 | ├── README.md 
49 | ├── allenai_rewardbench # the rewardbench codebase 
50 | ├── rewardbench # the soft link to allenai_rewardbench/rewardbench 
51 | ├── data 
52 | │   ├── chat_filtered.json # the chat domain dataset 
53 | │   ├── code_filtered.json # the code domain dataset 
54 | │   ├── math_filtered.json # the math domain dataset 
55 | │   ├── safety-refuse_filtered.json # the safety-refuse subdomain dataset 
56 | │   ├── safety-response_filtered.json # the safety-response subdomain dataset 
57 | │   └── total_dataset.json # the total dataset 
58 | ├── scripts 
59 | │   ├── run_rm.py # the python script for running evaluation on sequence-classification reward model 
60 | │   ├── run_dpo.py # the python script for running evaluation on DPO reward model 
61 | │   ├── utils.py 
62 | │   ├── __pycache__ 
63 | │   └── configs # the configuration files for running evaluation 
64 | ├── nvidia.sh # the script for running evaluation on NVIDIA SteerLM series reward model 
65 | ├── run_rm.sh # the script for running evaluation on sequence-classification reward model 
66 | └── run_dpo.sh # the script for running evaluation on DPO reward model 
67 | ``` 
68 | 
69 | 
70 | 
71 | 
72 | ## Evaluation 
73 | 
74 | Our codebase is largely based on the [Reward Bench](https://github.com/allenai/reward-bench/) codebase. 
75 | Thus, for the environment setup, you may follow the instructions in the [Reward Bench Setup](https://github.com/allenai/reward-bench/tree/main?tab=readme-ov-file#quick-usage). 
76 | After cloning the repository, you can run the following commands to evaluate a reward model on RM-Bench: 
77 | ```bash 
78 | bash run_rm.sh # for sequence-classification reward model 
79 | bash run_dpo.sh # for DPO model as reward model 
80 | ``` 
81 | 
82 | 
83 | ## How to Compute the Accuracy 
84 | 
85 | The accuracy is computed by comparing the scores of the chosen and rejected responses across every pair of styles. 
86 | The detailed code is provided in `scripts/utils.py`. 
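In practice, `run_rm.sh` and `run_dpo.sh` write a copy of the evaluation data under `results/`, with per-style `score_chosen` and `score_rejected` lists added to every sample, and `compute_accuracy` is applied to that list of records. Here is a minimal usage sketch (assuming it is run from the repository root; the results path is only a placeholder for whatever file the scripts actually produce):

```python
import json

from scripts.utils import compute_accuracy

# placeholder path: substitute the JSON file written under results/ by run_rm.sh or run_dpo.sh
results_path = "results/Seq_Classifier/your-model/your_results_file.json"

with open(results_path) as f:
    results = json.load(f)  # a list of samples, each carrying "score_chosen" and "score_rejected" lists

# prints hard/normal/easy accuracy (plus per-domain averages when the file covers all domains, e.g. total_dataset.json)
print(compute_accuracy(results))
```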
87 | Here is a quick example of how the accuracy is computed: 
88 | ```python 
89 | import numpy as np 
90 | from typing import List, Dict, Any 
91 | def compute_accuracy(results: List[Dict[str, Any]]) -> Dict[str, float]: 
92 |     # results is a list of dictionaries; each dictionary contains the following keys: 
93 |     # score_chosen: [float, float, float], the scores of the chosen responses 
94 |     # score_rejected: [float, float, float], the scores of the rejected responses 
95 |     # the scores are in the order of [concise, detailed_plain, detailed_markdown] 
96 |     # we compare the scores of chosen responses and rejected responses pairwise, 
97 |     # formatted as a 3x3 matrix, where the rows represent the scores of chosen responses 
98 |     # and the columns represent the scores of rejected responses 
99 |     MATRIX_SIZE = 3 # the column and row size of the matrix 
100 |     acc_matrix = np.zeros((MATRIX_SIZE, MATRIX_SIZE)) 
101 |     for result in results: 
102 |         for i in range(len(result["score_chosen"])): 
103 |             for j in range(len(result["score_rejected"])): 
104 |                 if result["score_chosen"][i] > result["score_rejected"][j]: 
105 |                     acc_matrix[i][j] += 1 
106 | 
107 |     # compute the accuracy by dividing the number of correct comparisons by the total number of comparisons 
108 |     acc_matrix /= len(results) 
109 |     # compute the hard, normal, and easy accuracy 
110 |     # hard accuracy: the average of the upper-right triangle of the matrix, 
111 |     # namely chosen responses with a less fancy style compared to rejected responses with a more fancy style 
112 |     upper_right_count = MATRIX_SIZE * (MATRIX_SIZE - 1) / 2 
113 |     hard_acc = np.sum(np.triu(acc_matrix, 1)) / upper_right_count 
114 |     # normal accuracy: the average of the diagonal of the matrix, 
115 |     # namely chosen responses compared to rejected responses of the same style 
116 |     normal_acc = np.mean(np.diag(acc_matrix)) 
117 |     # easy accuracy: the average of the lower-left triangle of the matrix, 
118 |     # namely chosen responses with a more fancy style compared to rejected responses with a less fancy style 
119 |     lower_left_count = MATRIX_SIZE * (MATRIX_SIZE - 1) / 2 
120 |     easy_acc = np.sum(np.tril(acc_matrix, -1)) / lower_left_count 
121 | 
122 |     return { 
123 |         "hard_acc": hard_acc, 
124 |         "normal_acc": normal_acc, 
125 |         "easy_acc": easy_acc 
126 |     } 
127 | ``` 
128 | 
129 | 
130 | More details about the dataset can be found in our paper. 
131 | 
132 | ## Citation 
133 | If you find this dataset helpful, please cite the following paper: 
134 | ```bibtex 
135 | @article{liu2024rm, 
136 |   title={RM-Bench: Benchmarking Reward Models of Language Models with Subtlety and Style}, 
137 |   author={Liu, Yantao and Yao, Zijun and Min, Rui and Cao, Yixin and Hou, Lei and Li, Juanzi}, 
138 |   journal={arXiv preprint arXiv:2410.16184}, 
139 |   year={2024} 
140 | } 
141 | ``` 
142 | 
143 | ## Acknowledgement 
144 | We deeply appreciate the tremendous effort of the authors of [Reward Bench](https://github.com/allenai/reward-bench/tree/main) for providing the codebase and the dataset. 
145 | Without their work, our research would not have been possible. 
146 | -------------------------------------------------------------------------------- /assets/example_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/RM-Bench/6628f28b85a404269af6b3f0e89384479ada121e/assets/example_data.png -------------------------------------------------------------------------------- /nvidia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define paths 4 | HOST_MODEL_DIR="/data0/MODELS/nvidia/Llama3-70B-SteerLM-RM" 5 | CONTAINER_MODEL_DIR="/models/nvidia" 6 | 7 | 8 | 9 | # Docker image 10 | DOCKER_IMAGE="nvcr.io/nvidia/nemo:24.01.framework" 11 | 12 | # Inference server port 13 | INFERENCE_PORT=1424 14 | 15 | # Run the Docker container and start the server 16 | docker run --gpus '"device=4,5"' \ 17 | --runtime=nvidia \ 18 | --rm \ 19 | -it \ 20 | --name lyt_nemo \ 21 | --network=host \ 22 | -v $HOST_MODEL_DIR:$CONTAINER_MODEL_DIR \ 23 | -v data:/workspace/data \ 24 | -v $HF_HOME:/workspace/hf \ 25 | -e NCCL_DEBUG=INFO \ 26 | -e NCCL_DEBUG_SUBSYS=ALL \ 27 | -e NCCL_SOCKET_IFNAME=eth0 \ 28 | -e NCCL_IB_DISABLE=1 \ 29 | -e NCCL_P2P_LEVEL=1 \ 30 | -e http_proxy="http://localhost:8001" \ 31 | -e https_proxy="http://localhost:8001" \ 32 | -e no_proxy="localhost,127.0.0.1" \ 33 | --shm-size=200g \ 34 | $DOCKER_IMAGE \ 35 | /bin/bash 36 | 37 | # export CONTAINER_MODEL_DIR="/models/nvidia" && export HF_HOME=/workspace/hf && export INFERENCE_PORT=1424 && python /opt/NeMo-Aligner/examples/nlp/gpt/serve_reward_model.py \ 38 | # rm_model_file=$CONTAINER_MODEL_DIR \ 39 | # trainer.num_nodes=1 \ 40 | # trainer.devices=2 \ 41 | # ++model.tensor_model_parallel_size=1 \ 42 | # ++model.pipeline_model_parallel_size=2 \ 43 | # inference.micro_batch_size=2 \ 44 | # inference.port=$INFERENCE_PORT 45 | 46 | 47 | # python /opt/NeMo-Aligner/examples/nlp/data/steerlm/attribute_annotate.py \ 48 | # --input-file=data/our-bench/step5_v5_chat_factual_construct.jsonl \ 49 | # --output-file=data/our-bench/step5_v5_chat_factual_construct_labeled.jsonl \ 50 | # --port=1424 51 | 52 | 53 | # docker run --gpus '"device=0"' \ 54 | # --rm \ 55 | # --name nemo_inference_server \ 56 | # -p $INFERENCE_PORT:$INFERENCE_PORT \ 57 | # -v $HOST_MODEL_DIR:$CONTAINER_MODEL_DIR \ 58 | # $DOCKER_IMAGE \ 59 | # python /opt/NeMo-Aligner/examples/nlp/gpt/serve_reward_model.py \ 60 | # rm_model_file=$CONTAINER_MODEL_DIR/Llama2-13B-SteerLM-RM.nemo \ 61 | # trainer.num_nodes=1 \ 62 | # trainer.devices=1 \ 63 | # ++model.tensor_model_parallel_size=1 \ 64 | # ++model.pipeline_model_parallel_size=1 \ 65 | # inference.micro_batch_size=2 \ 66 | # inference.port=$INFERENCE_PORT 67 | 68 | # python /opt/NeMo-Aligner/examples/nlp/data/steerlm/preprocess_openassistant_data.py --output_directory=data/oasst 69 | 70 | # python /opt/NeMo-Aligner/examples/nlp/data/steerlm/attribute_annotate.py \ 71 | # --input-file=data/oasst/train.jsonl \ 72 | # --output-file=data/oasst/train_labeled.jsonl \ 73 | # --port=1424 -------------------------------------------------------------------------------- /rewardbench: -------------------------------------------------------------------------------- 1 | allenai_rewardbench/rewardbench/ -------------------------------------------------------------------------------- /run_dpo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SET CUDA_HOME 4 | # export 
CUDA_HOME=/mnt/wuxuaner/workspace/miniconda3/envs/torch231 
5 | # export PATH=$CUDA_HOME/bin:$PATH 
6 | # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH 
7 | 
8 | # Prompt for model path 
9 | read -p "Enter the model path (default: RewardModels/allenai/tulu-v2.5-dpo-13b-hh-rlhf-60k): " MODEL_PATH 
10 | MODEL_PATH=${MODEL_PATH:-RewardModels/allenai/tulu-v2.5-dpo-13b-hh-rlhf-60k} 
11 | 
12 | # Prompt for CUDA device 
13 | read -p "Enter the CUDA device (default: 0): " CUDA_DEVICE 
14 | CUDA_DEVICE=${CUDA_DEVICE:-0} 
15 | 
16 | # Add this line at the top of your run_dpo.sh script 
17 | source activate torch231 
18 | export PYTHONPATH=$PYTHONPATH:$(pwd) 
19 | export CUDA_VISIBLE_DEVICES=$CUDA_DEVICE 
20 | 
21 | chat_template=tulu 
22 | python scripts/run_dpo.py \ 
23 | --model $MODEL_PATH \ 
24 | --chat_template $chat_template \ 
25 | --datapath data/total_dataset.json \ 
26 | --batch_size 8 \ 
27 | --ref_model RewardModels/allenai/tulu-2-13b \ 
28 | --trust_remote_code 
29 | 
-------------------------------------------------------------------------------- 
/run_rm.sh: 
-------------------------------------------------------------------------------- 
1 | #!/bin/bash 
2 | 
3 | # Prompt for model path 
4 | read -p "Enter the model path (default: RewardModels/allenai/tulu-v2.5-13b-hh-rlhf-60k-rm): " MODEL_PATH 
5 | MODEL_PATH=${MODEL_PATH:-RewardModels/allenai/tulu-v2.5-13b-hh-rlhf-60k-rm} 
6 | 
7 | # Prompt for CUDA device 
8 | read -p "Enter the CUDA device (default: 0): " CUDA_DEVICE 
9 | CUDA_DEVICE=${CUDA_DEVICE:-0} 
10 | 
11 | # source /mnt/wuxuaner/workspace/miniconda3/bin/activate torch231 
12 | source activate torch231 
13 | export PYTHONPATH=$PYTHONPATH:$(pwd) 
14 | export CUDA_VISIBLE_DEVICES=$CUDA_DEVICE 
15 | 
16 | chat_template=tulu 
17 | python scripts/run_rm.py \ 
18 | --model $MODEL_PATH \ 
19 | --datapath data/total_dataset.json \ 
20 | --batch_size 8 \ 
21 | --trust_remote_code \ 
22 | --chat_template $chat_template 
-------------------------------------------------------------------------------- 
/scripts/configs: 
-------------------------------------------------------------------------------- 
1 | ../allenai_rewardbench/scripts/configs 
-------------------------------------------------------------------------------- 
/scripts/run_dpo.py: 
-------------------------------------------------------------------------------- 
1 | # Copyright 2023 AllenAI. All rights reserved. 
2 | # 
3 | # Licensed under the Apache License, Version 2.0 (the "License"); 
4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 
6 | # 
7 | # http://www.apache.org/licenses/LICENSE-2.0 
8 | # 
9 | # Unless required by applicable law or agreed to in writing, software 
10 | # distributed under the License is distributed on an "AS IS" BASIS, 
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 
13 | # limitations under the License. 
14 | 15 | import argparse 16 | import logging 17 | import os 18 | import sys 19 | 20 | import numpy as np 21 | import torch 22 | import transformers 23 | from accelerate import Accelerator 24 | from accelerate.logging import get_logger 25 | from fastchat.conversation import get_conv_template 26 | from tqdm import tqdm 27 | from trl.trainer.utils import DPODataCollatorWithPadding 28 | from scripts.utils import convert_robust_dataset_to_preference_dataset_list, load_eval_dataset, compute_accuracy 29 | 30 | import gc 31 | 32 | 33 | from rewardbench import ( 34 | DPO_MODEL_CONFIG, 35 | DPOInference, 36 | # load_eval_dataset, 37 | save_to_hub, 38 | torch_dtype_mapping, 39 | ) 40 | from rewardbench.constants import EXAMPLE_COUNTS, SUBSET_MAPPING 41 | from rewardbench.utils import calculate_scores_per_section 42 | 43 | # get token from HF_TOKEN env variable, but if it doesn't exist pass none 44 | HF_TOKEN = os.getenv("HF_TOKEN", None) 45 | # this is necessary to automatically log in when running this script in docker/batch beaker jobs 46 | if HF_TOKEN is not None: 47 | from huggingface_hub._login import _login 48 | 49 | _login(token=HF_TOKEN, add_to_git_credential=False) 50 | 51 | 52 | def get_args(): 53 | """ 54 | Parse arguments strings model and chat_template 55 | """ 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--model", type=str, required=True, help="path to model") 58 | parser.add_argument("--ref_model", type=str, default=None, help="path to model") 59 | parser.add_argument( 60 | "--ref_free_type", type=str, default="avg", help="type of reference free normalization (norm, avg, or sum)" 61 | ) 62 | parser.add_argument("--datapath", type=str, default="data/reward-bench", help="path to data") 63 | parser.add_argument("--tokenizer", type=str, default=None, help="path to non-matching tokenizer") 64 | parser.add_argument("--chat_template", type=str, default="tulu", help="path to chat template") 65 | parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)") 66 | parser.add_argument("--batch_size", type=int, default=6, help="batch size for inference") 67 | parser.add_argument( 68 | "--pref_sets", action="store_true", help="run on common preference sets instead of our custom eval set" 69 | ) 70 | parser.add_argument( 71 | "--trust_remote_code", action="store_true", default=False, help="directly load model instead of pipeline" 72 | ) 73 | parser.add_argument("--debug", action="store_true", default=False, help="use only 10 examples") 74 | parser.add_argument( 75 | "--disable_beaker_save", action="store_true", help="disable saving the main results in a file for AI2 Beaker" 76 | ) 77 | parser.add_argument( 78 | "--not_quantized", action="store_true", help="disable quantization for models that are quantized by default" 79 | ) 80 | parser.add_argument( 81 | "--torch_dtype", 82 | type=str, 83 | default="float16", 84 | choices=["float16", "bfloat16", "float32", "float64"], 85 | help="PyTorch dtype (default: float16)", 86 | ) 87 | args = parser.parse_args() 88 | args.torch_dtype = torch_dtype_mapping(args.torch_dtype) 89 | return args 90 | 91 | 92 | def main(): 93 | args = get_args() 94 | accelerator = Accelerator() 95 | 96 | ############### 97 | # Setup logging 98 | ############### 99 | logger = get_logger(__name__) 100 | logging.basicConfig( 101 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 102 | datefmt="%Y-%m-%d %H:%M:%S", 103 | handlers=[logging.StreamHandler(sys.stdout)], 104 | ) 105 | log_level = logging.INFO 
106 | logger.setLevel(log_level) 
107 | transformers.utils.logging.set_verbosity(log_level) 
108 | transformers.utils.logging.enable_default_handler() 
109 | transformers.utils.logging.enable_explicit_format() 
110 | 
111 | logger.info(f"Running reward model on {args.model} with chat template {args.chat_template}") 
112 | if args.trust_remote_code: 
113 | logger.info("Loading model with Trust Remote Code") 
114 | 
115 | offical_model_name = args.model.replace("RewardModels/", "") 
116 | if offical_model_name in DPO_MODEL_CONFIG: 
117 | config = DPO_MODEL_CONFIG[offical_model_name] 
118 | else: 
119 | config = DPO_MODEL_CONFIG["default"] 
120 | logger.info(f"Using dpo model config: {config}") 
121 | 
122 | model_builder = config["model_builder"] 
123 | tokenizer_builder = config["tokenizer_builder"] 
124 | 
125 | # check datatype from argparse 
126 | if args.torch_dtype == torch.bfloat16: 
127 | logger.warning("Loading weights directly as bfloat16 for PyTorch dtype") 
128 | torch_dtype = torch.bfloat16 
129 | else: 
130 | torch_dtype = torch.float16 
131 | 
132 | assert args.model != args.ref_model, "policy and reference model should be different" 
133 | # load chat template 
134 | chat_template = args.chat_template 
135 | conv = get_conv_template(chat_template) 
136 | 
137 | # define reference free 
138 | if args.ref_model is None: 
139 | ref_free = True 
140 | logger.info("Running reference free DPO - no reference model provided") 
141 | else: 
142 | ref_free = False 
143 | logger.info(f"Running DPO with reference model {args.ref_model}") 
144 | 
145 | ############################ 
146 | # Load dataset 
147 | ############################ 
148 | logger.info("*** Load dataset ***") 
149 | tokenizer_path = args.tokenizer if args.tokenizer else args.model 
150 | tokenizer = tokenizer_builder(tokenizer_path, trust_remote_code=args.trust_remote_code) 
151 | tokenizer.pad_token = tokenizer.eos_token 
152 | # if no BOS token, set as pad token, e.g. 
QWEN models 153 | if tokenizer.bos_token is None: 154 | tokenizer.bos_token_id = tokenizer.eos_token_id 155 | tokenizer.pad_token_id = tokenizer.eos_token_id 156 | 157 | 158 | raw_dataset_list = convert_robust_dataset_to_preference_dataset_list(args.datapath) 159 | 160 | 161 | if ( 162 | ("llama-3" in args.model) 163 | or ("Llama3" in args.model) 164 | or ("Llama-3" in args.model) 165 | or ("LLaMA3" in args.model) 166 | or args.not_quantized 167 | ): 168 | model_kwargs = { 169 | "device_map": "auto", 170 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 171 | } 172 | model_kwargs_ref = { 173 | "device_map": "auto", 174 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 175 | } 176 | else: 177 | model_kwargs = { 178 | "load_in_8bit": True, 179 | "device_map": "auto", 180 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 181 | } 182 | model_kwargs_ref = { 183 | "load_in_8bit": True, 184 | "device_map": "auto", 185 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 186 | } 187 | 188 | model = model_builder( 189 | args.model, 190 | trust_remote_code=args.trust_remote_code, 191 | attn_implementation="sdpa", 192 | **model_kwargs, 193 | ) 194 | 195 | if ref_free: 196 | ref_model = None 197 | else: 198 | ref_model = model_builder( 199 | args.ref_model, 200 | trust_remote_code=args.trust_remote_code, 201 | **model_kwargs_ref, 202 | ) 203 | 204 | # use internal inference functions in DPO trainer 205 | dpo = DPOInference( 206 | model, 207 | ref_model, 208 | tokenizer=tokenizer, 209 | accelerator=accelerator, 210 | ref_free_norm=args.ref_free_type, 211 | # norm is norm, avg is average, sum is sum 212 | ) 213 | # score_original = [] 214 | score_chosen = [] 215 | score_rejected = [] 216 | 217 | for dataset_idx, raw_dataset in enumerate(raw_dataset_list): 218 | 219 | # clear cuda memory cache 220 | # model = None 221 | dataset = None 222 | dataloader = None 223 | tokenized_dataset = None 224 | batch = None 225 | # del model 226 | # Synchronize and clear GPU memory 227 | torch.cuda.synchronize() 228 | del dataset 229 | del dataloader 230 | del tokenized_dataset 231 | del batch 232 | gc.collect() 233 | torch.cuda.empty_cache() 234 | # gc.collect() 235 | # torch.cuda.empty_cache() 236 | torch.cuda.ipc_collect() 237 | # torch.cuda.empty_cache() 238 | # prin the gpu memory usage 239 | 240 | # for device in range(torch.cuda.device_count()): 241 | # cuda.select_device(device) # Select the GPU device 242 | # cuda.close() # Clear the memory 243 | # cuda.select_device(device) # Reinitialize the GPU device if necessary 244 | 245 | print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024:.2f} GB") 246 | 247 | dataset, subsets = load_eval_dataset( 248 | raw_dataset, 249 | core_set=not args.pref_sets, 250 | conv=conv, 251 | tokenizer=tokenizer, 252 | logger=logger, 253 | keep_columns=["text_chosen", "text_rejected", "id", "prompt"], 254 | ) 255 | 256 | dataset = dataset.remove_columns("id") 257 | # debug: use only 10 examples 258 | if args.debug: 259 | dataset = dataset.select(range(10)) 260 | subsets = subsets[:10] 261 | 262 | ############################ 263 | # Load reward model pipeline 264 | ############################ 265 | BATCH_SIZE = args.batch_size 266 | 267 | # tokenize dataset 268 | column_names = list(dataset.features) 269 | 270 | tokenized_dataset = dataset.map(dpo.tokenize_row, remove_columns=column_names) 271 | 272 | dataloader = torch.utils.data.DataLoader( 273 | tokenized_dataset, 274 | 
batch_size=BATCH_SIZE, 275 | collate_fn=DPODataCollatorWithPadding( 276 | pad_token_id=tokenizer.pad_token_id, 277 | label_pad_token_id=dpo.label_pad_token_id, 278 | is_encoder_decoder=dpo.is_encoder_decoder, 279 | ), 280 | # collate_fn = lambda x: x, # fix weird batching error 281 | shuffle=False, 282 | drop_last=False, 283 | ) 284 | results = [] 285 | scores_chosen = [] 286 | scores_rejected = [] 287 | 288 | for step, batch in enumerate(tqdm(dataloader, desc="RM batch steps")): 289 | logger.info(f"RM inference step {step}/{len(dataloader)}") 290 | 291 | rewards_chosen, rewards_rejected = dpo.inference_step(batch, ref_free=ref_free) 292 | 293 | # for each item in batch, record 1 if chosen > rejected 294 | # extra score from dict within batched results (e.g. logits) 295 | # [{'label': 'LABEL_1', 'score': 0.6826171875},... ] 296 | if isinstance(rewards_chosen[0], dict): 297 | scores_chosen_batch = [result["score"] for result in rewards_chosen] 298 | scores_rejected_batch = [result["score"] for result in rewards_rejected] 299 | # for classes that directly output scores (custom code) 300 | else: 301 | scores_chosen_batch = rewards_chosen.float().cpu().numpy().tolist() # convert to float for bfloat16 case 302 | scores_rejected_batch = rewards_rejected.float().cpu().numpy().tolist() 303 | 304 | [ 305 | results.append(1) if chosen > rejected else results.append(0) 306 | for chosen, rejected in zip(scores_chosen_batch, scores_rejected_batch) 307 | ] 308 | scores_chosen += scores_chosen_batch 309 | scores_rejected += scores_rejected_batch 310 | 311 | 312 | score_chosen.append(scores_chosen) 313 | score_rejected.append(scores_rejected) 314 | 315 | 316 | ############################ 317 | # Save results 318 | ############################ 319 | 320 | import json 321 | # HACK: load the dataset from the file 322 | dataset_json:list = json.load(open(args.datapath)) 323 | 324 | 325 | print(f"Type of score_chosen: {type(score_chosen[0])}") 326 | print(f"Lenght of score_chosen: {len(score_chosen[0])}") 327 | # print(score_chosen[0]) 328 | print(f"Type of score_rejected: {type(score_rejected[0])}") 329 | print(f"Lenght of score_rejected: {len(score_rejected[0])}") 330 | # print(score_rejected[0]) 331 | 332 | for idx, unit in enumerate(dataset_json): 333 | unit['score_chosen'] = [ 334 | score_list[idx] for score_list in score_chosen 335 | ] 336 | unit['score_rejected'] = [ 337 | score_list[idx] for score_list in score_rejected 338 | ] 339 | 340 | # save to results folder with the name + model name + timestamp 341 | filename = os.path.basename(args.datapath).replace(".json", "") 342 | model_name = args.model.split("/")[-1] 343 | ref_model_name = args.ref_model.split("/")[-1] if args.ref_model else "ref_free" 344 | output_dir = f"results/DPO/{offical_model_name}" 345 | if not os.path.exists(output_dir): 346 | os.makedirs(output_dir) 347 | from datetime import datetime 348 | output_path = os.path.join(output_dir, f"{filename}_{model_name}_{ref_model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json") 349 | with open(output_path, "w") as f: 350 | json.dump(dataset_json, f, indent=4, ensure_ascii=False) 351 | 352 | acc_dict = compute_accuracy(dataset_json) 353 | print(f"The accuracy of model {model_name}\n in the dataset {filename} is:\n {acc_dict}") 354 | 355 | if __name__ == "__main__": 356 | main() -------------------------------------------------------------------------------- /scripts/run_rm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 AllenAI. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import logging 17 | import os 18 | import sys 19 | 20 | import numpy as np 21 | import torch 22 | import transformers 23 | from accelerate import Accelerator 24 | from accelerate.logging import get_logger 25 | from fastchat.conversation import get_conv_template 26 | from tqdm import tqdm 27 | from transformers import AutoTokenizer, pipeline 28 | from scripts.utils import convert_robust_dataset_to_preference_dataset_list, load_eval_dataset, compute_accuracy 29 | 30 | import gc 31 | 32 | from rewardbench import ( 33 | REWARD_MODEL_CONFIG, 34 | check_tokenizer_chat_template, 35 | # load_eval_dataset, 36 | save_to_hub, 37 | torch_dtype_mapping, 38 | ) 39 | from rewardbench.constants import EXAMPLE_COUNTS, SUBSET_MAPPING 40 | from rewardbench.utils import calculate_scores_per_section 41 | 42 | # Enable TensorFloat32 (TF32) tensor cores on Ampere GPUs for matrix multiplications (faster than FP32) 43 | torch.backends.cuda.matmul.allow_tf32 = True 44 | torch.backends.cudnn.allow_tf32 = True 45 | 46 | # get token from HF_TOKEN env variable, but if it doesn't exist pass none 47 | HF_TOKEN = os.getenv("HF_TOKEN", None) 48 | # this is necessary to automatically log in when running this script in docker/batch beaker jobs 49 | if HF_TOKEN is not None: 50 | from huggingface_hub._login import _login 51 | 52 | _login(token=HF_TOKEN, add_to_git_credential=False) 53 | 54 | 55 | def get_args(): 56 | """ 57 | Parse arguments strings model and chat_template 58 | """ 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--model", type=str, required=True, help="path to model") 61 | parser.add_argument("--tokenizer", type=str, default=None, help="path to non-matching tokenizer to model") 62 | parser.add_argument("--chat_template", type=str, default="tulu", help="path to chat template") 63 | parser.add_argument( 64 | "--trust_remote_code", action="store_true", default=False, help="directly load model instead of pipeline" 65 | ) 66 | parser.add_argument("--datapath", type=str, default="data/reward-bench", help="path to data") 67 | parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)") 68 | parser.add_argument("--batch_size", type=int, default=64, help="batch size for inference") 69 | parser.add_argument("--max_length", type=int, default=2048, help="Max length of RM inputs (passed to pipeline)") 70 | parser.add_argument( 71 | "--pref_sets", action="store_true", help="run on common preference sets instead of our custom eval set" 72 | ) 73 | parser.add_argument( 74 | "--debug", action="store_true", help="run on common preference sets instead of our custom eval set" 75 | ) 76 | parser.add_argument( 77 | "--disable_beaker_save", action="store_true", help="disable saving the main results in a file for AI2 Beaker" 78 | ) 79 | parser.add_argument( 80 | "--not_quantized", action="store_true", help="disable quantization for 
models that are quantized by default" 81 | ) 82 | parser.add_argument( 83 | "--torch_dtype", 84 | type=str, 85 | default="float16", 86 | choices=["float16", "bfloat16", "float32", "float64"], 87 | help="PyTorch dtype (default: float16)", 88 | ) 89 | args = parser.parse_args() 90 | args.torch_dtype = torch_dtype_mapping(args.torch_dtype) 91 | return args 92 | 93 | 94 | def main(): 95 | args = get_args() 96 | ############### 97 | # Setup logging 98 | ############### 99 | accelerator = Accelerator() 100 | current_device = accelerator.process_index 101 | 102 | logger = get_logger(__name__) 103 | logging.basicConfig( 104 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 105 | datefmt="%Y-%m-%d %H:%M:%S", 106 | handlers=[logging.StreamHandler(sys.stdout)], 107 | ) 108 | log_level = logging.INFO 109 | logger.setLevel(log_level) 110 | transformers.utils.logging.set_verbosity(log_level) 111 | transformers.utils.logging.enable_default_handler() 112 | transformers.utils.logging.enable_explicit_format() 113 | 114 | logger.info(f"Running reward model on {args.model} with chat template {args.chat_template}") 115 | if args.trust_remote_code: 116 | logger.info("Loading model with Trust Remote Code") 117 | 118 | # load chat template 119 | chat_template = args.chat_template 120 | conv = get_conv_template(chat_template) 121 | logger.info(f"Using conversation template {chat_template}: {conv}") 122 | 123 | offical_model_name = args.model.replace("RewardModels/", "") 124 | if offical_model_name in REWARD_MODEL_CONFIG: 125 | # delete the "RewardModel/" prefix 126 | config = REWARD_MODEL_CONFIG[offical_model_name] 127 | else: 128 | config = REWARD_MODEL_CONFIG["default"] 129 | logger.info(f"Using reward model config: {config}") 130 | 131 | # Default entries 132 | # "model_builder": AutoModelForSequenceClassification.from_pretrained, 133 | # "pipeline_builder": pipeline, 134 | # "quantized": True, 135 | # "custom_dialogue": False, 136 | # "model_type": "Seq. 
Classifier" 137 | 138 | quantized = config["quantized"] # only Starling isn't quantized for now 139 | # if llama-3 in name, switch quantized to False (severely degrades performance) 140 | if ( 141 | ("llama-3" in args.model) 142 | or ("Llama3" in args.model) 143 | or ("Llama-3" in args.model) 144 | or ("LLaMA3" in args.model) 145 | or ("llama3" in args.model) 146 | or args.not_quantized 147 | ): 148 | quantized = False 149 | logger.info(f"Disabling quantization for llama-3 or override flag (--not_quantized: {args.not_quantized})") 150 | 151 | custom_dialogue = config["custom_dialogue"] 152 | model_type = config["model_type"] 153 | if model_type == "Custom Classifier": 154 | raise NotImplementedError("For the Custom Classifier model like NVIDIA SteerLM, plz refer to the NVIDIA original code") 155 | model_builder = config["model_builder"] 156 | pipeline_builder = config["pipeline_builder"] 157 | torch_dtype = config.get("torch_dtype", None) 158 | # if not datatype in config (default), check args 159 | if torch_dtype is None: 160 | # if datatype is bfloat16, then manually turn off quantizaiton (done with bitsandbytes) 161 | if args.torch_dtype == torch.bfloat16: 162 | quantized = False 163 | logger.info("Disabling quantization for bfloat16 datatype") 164 | torch_dtype = args.torch_dtype 165 | 166 | # not included in config to make user explicitly understand they are passing this 167 | trust_remote_code = args.trust_remote_code 168 | 169 | ############################ 170 | # Load dataset 171 | ############################ 172 | logger.info("*** Load dataset ***") 173 | tokenizer_path = args.tokenizer if args.tokenizer else args.model 174 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=args.trust_remote_code) 175 | if not custom_dialogue: # not needed for PairRM / SteamSHP 176 | tokenizer.truncation_side = "left" # copied from Starling, but few samples are above context length 177 | 178 | 179 | ############################ 180 | # Load reward model pipeline 181 | ############################ 182 | BATCH_SIZE = args.batch_size 183 | logger.info("*** Load reward model ***") 184 | reward_pipeline_kwargs = { 185 | "batch_size": BATCH_SIZE, # eval_args.inference_batch_size, 186 | "truncation": True, 187 | "padding": True, 188 | "max_length": args.max_length, 189 | "function_to_apply": "none", # Compute raw logits 190 | "return_token_type_ids": False, 191 | } 192 | if quantized: 193 | model_kwargs = { 194 | "load_in_8bit": True, 195 | "device_map": {"": current_device}, 196 | "torch_dtype": torch_dtype if torch.cuda.is_available() else None, 197 | } 198 | else: 199 | model_kwargs = { 200 | "device_map": "auto", 201 | "torch_dtype": torch_dtype, 202 | } 203 | 204 | model = model_builder(args.model, **model_kwargs, trust_remote_code=trust_remote_code) 205 | reward_pipe = pipeline_builder( 206 | "text-classification", 207 | model=model, 208 | tokenizer=tokenizer, 209 | ) 210 | 211 | ############################ 212 | # Tokenization settings & dataset preparation 213 | ############################ 214 | # set pad token to eos token if not set 215 | if reward_pipe.tokenizer.pad_token_id is None: 216 | reward_pipe.model.config.pad_token_id = reward_pipe.tokenizer.eos_token_id 217 | reward_pipe.tokenizer.pad_token_id = reward_pipe.tokenizer.eos_token_id 218 | # For models whose config did not contains `pad_token_id` 219 | if reward_pipe.model.config.pad_token_id is None: 220 | reward_pipe.model.config.pad_token_id = reward_pipe.tokenizer.pad_token_id 221 | 222 | # if using 
fastchat template (no template in tokenizer), make the RM tokenizer output an EOS token 223 | if not check_tokenizer_chat_template(tokenizer): 224 | reward_pipe.tokenizer.add_eos_token = True 225 | 226 | 227 | raw_dataset_list = convert_robust_dataset_to_preference_dataset_list(args.datapath) 228 | 229 | # score_original = [] 230 | score_chosen = [] 231 | score_rejected = [] 232 | for dataset_idx, raw_dataset in enumerate(raw_dataset_list): 233 | 234 | # clear cuda memory cache 235 | dataset = None 236 | dataloader = None 237 | torch.cuda.synchronize() 238 | del dataset 239 | del dataloader 240 | gc.collect() 241 | torch.cuda.empty_cache() 242 | torch.cuda.ipc_collect() 243 | # prin the gpu memory usage 244 | print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024:.2f} GB") 245 | 246 | # for device in range(torch.cuda.device_count()): 247 | # cuda.select_device(device) # Select the GPU device 248 | # cuda.close() # Clear the memory 249 | # cuda.select_device(device) # Reinitialize the GPU device if necessary 250 | print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024 / 1024 / 1024:.2f} GB") 251 | 252 | 253 | 254 | dataset, subsets = load_eval_dataset( 255 | raw_dataset, 256 | core_set=not args.pref_sets, 257 | conv=conv, 258 | custom_dialogue_formatting=custom_dialogue, 259 | tokenizer=tokenizer, 260 | logger=logger, 261 | keep_columns=["text_chosen", "text_rejected", "id"], 262 | ) 263 | # copy id for saving, then remove 264 | ids = dataset["id"] 265 | dataset = dataset.remove_columns("id") 266 | 267 | # debug: use only 10 examples 268 | if args.debug: 269 | dataset = dataset.select(range(10)) 270 | subsets = subsets[:10] 271 | ids = ids[:10] 272 | 273 | 274 | 275 | ############################ 276 | # Run inference [1/2]" built in transformers 277 | ############################ 278 | # if using HF pipeline, can pass entire dataset and get results 279 | # first, handle custom pipelines that we must batch normally 280 | if pipeline_builder == pipeline: 281 | logger.info("*** Running forward pass via built in pipeline abstraction ***") 282 | # this setup can be optimized slightly with one pipeline call 283 | # prepare for inference 284 | reward_pipe = accelerator.prepare(reward_pipe) 285 | 286 | results_rej = reward_pipe(dataset["text_rejected"], **reward_pipeline_kwargs) 287 | results_cho = reward_pipe(dataset["text_chosen"], **reward_pipeline_kwargs) 288 | 289 | # extract scores from results which is list of dicts, e.g. [{'label': 'LABEL_1', 'score': 0.6826171875},... 
] 290 | unit_score_chosen_list = [result["score"] for result in results_cho] 291 | unit_score_rejected_list = [result["score"] for result in results_rej] 292 | 293 | # pairwise comparison list comprehension 294 | results = [1 if chosen > rejected else 0 for chosen, rejected in zip(unit_score_chosen_list, unit_score_rejected_list)] 295 | 296 | ############################ 297 | # Run inference [2/2] custom pipelines 298 | ############################ 299 | else: 300 | logger.info("*** Running dataloader to collect results ***") 301 | # TODO make more custom pipelines work with pre-tokenized data 302 | from torch.utils.data.dataloader import default_collate 303 | 304 | # for PairRM, hmm, will move all of this later 305 | def custom_collate_fn(batch): 306 | # check if ['text_chosen'] is in first batch element 307 | # Check if the first element of the batch is a dictionary 308 | if isinstance(batch[0]["text_chosen"][0], dict): 309 | return batch # Return the batch as-is if it's a list of dicts 310 | else: 311 | return default_collate(batch) # Use the default collate behavior otherwise 312 | 313 | dataloader = torch.utils.data.DataLoader( 314 | dataset, 315 | batch_size=BATCH_SIZE, 316 | collate_fn=custom_collate_fn, # if not args.pref_sets else None, 317 | shuffle=False, 318 | drop_last=False, 319 | ) 320 | 321 | dataloader, model = accelerator.prepare(dataloader, reward_pipe.model) 322 | reward_pipe.model = model 323 | 324 | results = [] 325 | unit_score_chosen_list = [] 326 | unit_score_rejected_list = [] 327 | for step, batch in enumerate(tqdm(dataloader, desc="RM batch steps")): 328 | # logger.info(f"RM inference step {step}/{len(dataloader)}") 329 | 330 | if model_type == "Custom Classifier": 331 | raise NotImplementedError("For the Custom Classifier model like NVIDIA SteerLM, plz refer to the NVIDIA original code") 332 | else: 333 | rewards_chosen = reward_pipe(batch["text_chosen"], **reward_pipeline_kwargs) 334 | rewards_rejected = reward_pipe(batch["text_rejected"], **reward_pipeline_kwargs) 335 | print(f"rewards_chosen: {rewards_chosen}") 336 | print(f"rewards_rejected: {rewards_rejected}") 337 | # for each item in batch, record 1 if chosen > rejected 338 | # extra score from dict within batched results (e.g. logits) 339 | # [{'label': 'LABEL_1', 'score': 0.6826171875},... 
] 340 | if isinstance(rewards_chosen[0], dict): 341 | score_chosen_batch = [result["score"] for result in rewards_chosen] 342 | score_rejected_batch = [result["score"] for result in rewards_rejected] 343 | # for classes that directly output scores (custom code) 344 | else: 345 | score_chosen_batch = ( 346 | rewards_chosen.float().cpu().numpy().tolist() 347 | ) # cast to float in case of bfloat16 348 | score_rejected_batch = rewards_rejected.float().cpu().numpy().tolist() 349 | 350 | # log results 351 | for chosen, rejected in zip(score_chosen_batch, score_rejected_batch): 352 | print(f"chosen: {chosen}, rejected: {rejected}") 353 | if chosen > rejected: 354 | results.append(1) 355 | else: 356 | results.append(0) 357 | unit_score_chosen_list.extend(score_chosen_batch) 358 | unit_score_rejected_list.extend(score_rejected_batch) 359 | 360 | 361 | 362 | score_chosen.append(unit_score_chosen_list) 363 | score_rejected.append(unit_score_rejected_list) 364 | 365 | 366 | ############################ 367 | # Save results 368 | ############################ 369 | 370 | import json 371 | # HACK: load the dataset from the file 372 | dataset_json:list = json.load(open(args.datapath)) 373 | 374 | # print(f"Type of score_original: {type(score_original)}") 375 | # print(f"Lenght of score_original: {len(score_original)}") 376 | # print(score_original) 377 | print(f"Type of score_chosen: {type(score_chosen[0])}") 378 | print(f"Lenght of score_chosen: {len(score_chosen[0])}") 379 | # print(score_chosen[0]) 380 | print(f"Type of score_rejected: {type(score_rejected[0])}") 381 | print(f"Lenght of score_rejected: {len(score_rejected[0])}") 382 | # print(score_rejected[0]) 383 | 384 | for idx, unit in enumerate(dataset_json): 385 | # unit['score_orig'] = score_original[idx] 386 | unit['score_chosen'] = [ 387 | score_list[idx] for score_list in score_chosen 388 | ] 389 | unit['score_rejected'] = [ 390 | score_list[idx] for score_list in score_rejected 391 | ] 392 | # if all the elemnts in the list are list and all the elements is of length 1 393 | if all(isinstance(elem, list) and len(elem) == 1 for elem in unit['score_chosen']): 394 | unit['score_chosen'] = [elem[0] for elem in unit['score_chosen']] 395 | if all(isinstance(elem, list) and len(elem) == 1 for elem in unit['score_rejected']): 396 | unit['score_rejected'] = [elem[0] for elem in unit['score_rejected']] 397 | 398 | # save to results folder with the name + model name + timestamp 399 | filename = os.path.basename(args.datapath).replace(".json", "") 400 | model_name = args.model.split("/")[-1] 401 | ref_model_name = "REWORD_MODEL" 402 | # make a dir at results with official model name 403 | output_dir = f"results/Seq_Classifier/{offical_model_name}" 404 | if not os.path.exists(output_dir): 405 | os.makedirs(output_dir) 406 | from datetime import datetime 407 | output_path = os.path.join(output_dir, f"{filename}_{model_name}_{ref_model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json") 408 | with open(output_path, "w") as f: 409 | json.dump(dataset_json, f, indent=4, ensure_ascii=False) 410 | 411 | acc_dict = compute_accuracy(dataset_json) 412 | print(f"The accuracy of model {model_name}\n in the dataset {filename} is:\n {acc_dict}") 413 | 414 | if __name__ == "__main__": 415 | main() -------------------------------------------------------------------------------- /scripts/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from fastchat.conversation import Conversation 3 | from datasets 
import Dataset, DatasetDict, Value, concatenate_datasets, load_dataset 4 | from transformers import PreTrainedTokenizer 5 | from rewardbench.utils import check_tokenizer_chat_template, prepare_dialogue, prepare_dialogue_from_tokenizer 6 | import numpy as np 7 | from typing import List, Dict, Any 8 | import json 9 | from datasets import Dataset, load_from_disk 10 | EXTRA_PREF_SETS = "allenai/pref-test-sets" 11 | def convert_robust_dataset_to_preference_dataset_list(robust_dataset_path: str) -> List[Dataset]: 12 | robust_dataset = json.load(open(robust_dataset_path)) 13 | # Prepare the chosen and rejected dataset list 14 | para_corp_dataset_list = [] 15 | num_pairs = len(robust_dataset[0]['chosen']) 16 | 17 | assert num_pairs == len(robust_dataset[0]['rejected']), \ 18 | "The number of chosen and rejected pairs should be the same." 19 | 20 | for idx in range(num_pairs): 21 | para_corp_dataset = Dataset.from_dict({ 22 | "id": [unit['id'] for unit in robust_dataset], 23 | "subset": ['subset' for unit in robust_dataset], 24 | "prompt": [unit['prompt'] for unit in robust_dataset], 25 | "chosen": [unit['chosen'][idx] for unit in robust_dataset], 26 | "chosen_model": ["chosen" for _ in robust_dataset], 27 | "rejected": [unit['rejected'][idx] for unit in robust_dataset], 28 | "rejected_model": ["rejected" for _ in robust_dataset], 29 | }) 30 | para_corp_dataset_list.append(para_corp_dataset) 31 | 32 | return para_corp_dataset_list 33 | 34 | def split_dataset_by_domain(dataset: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: 35 | domains = ["chat","math","code","safety"] 36 | domain_dataset_dict = {} 37 | for domain in domains: 38 | domain_dataset_dict[domain] = [example for example in dataset if example['domain'].startswith(domain)] 39 | 40 | # pop the domain keys 41 | for domain in domain_dataset_dict: 42 | for example in domain_dataset_dict[domain]: 43 | example.pop('domain') 44 | 45 | return domain_dataset_dict 46 | 47 | 48 | def compute_accuracy(results: List[Dict[str, Any]]) -> Dict[str, float]: 49 | if 'domain' in results[0]: 50 | # this indicates this is total_dataset.json 51 | print('We are handling total_dataset.json') 52 | print('Splitting the dataset by domain...') 53 | # thus we need to split the results into different domains 54 | split_results = split_dataset_by_domain(results) 55 | domain_results = {} 56 | for domain in split_results: 57 | domain_results[domain] = compute_accuracy(split_results[domain]) 58 | domain_avg_results = {} 59 | for domain in domain_results: 60 | domain_avg_results[domain] = np.mean(list(domain_results[domain].values())) 61 | domain_hard_normal_easy_acc = { 62 | "hard_acc": np.mean([domain_results[domain]["hard_acc"] for domain in domain_results]), 63 | "normal_acc": np.mean([domain_results[domain]["normal_acc"] for domain in domain_results]), 64 | "easy_acc": np.mean([domain_results[domain]["easy_acc"] for domain in domain_results]) 65 | } 66 | total_avg_acc = np.mean([domain_avg_results[domain] for domain in domain_avg_results]) 67 | # merge the results into one falten dictionary 68 | final_results = {} 69 | # merge domain_avg_results into final_results 70 | final_results.update(domain_avg_results) 71 | # merge domain_hard_normal_easy_acc into final_results 72 | final_results.update(domain_hard_normal_easy_acc) 73 | # merge total_avg_acc into final_results 74 | final_results.update({"total_avg_acc": total_avg_acc}) 75 | return final_results 76 | 77 | 78 | # results is a list of dictionaries, each dictionary contains the following keys: 79 | # 
score_chosen: [float, float, float], the scores of the chosen responses 80 | # score_rejected: [float, float, float], the scores of the rejected responses 81 | # the scores are in the order of [concise, detailed_plain, detailed_markdown] 82 | # we will compare the scores of chosen responses and rejected responses iteratively 83 | # formatted as a 3x3 matrix, where the rows represent the scores of chosen responses 84 | # and the columns represent the scores of rejected responses 85 | MATRIX_SIZE = 3 # the column and row size of the matrix 86 | acc_matrix = np.zeros((MATRIX_SIZE, MATRIX_SIZE)) 87 | for result in results: 88 | for i in range(len(result["score_chosen"])): 89 | for j in range(len(result["score_rejected"])): 90 | if result["score_chosen"][i] > result["score_rejected"][j]: 91 | acc_matrix[i][j] += 1 92 | 93 | # compute the accuracy by dividing the number of correct comparisons by the total number of comparisons 94 | acc_matrix /= len(results) 95 | # compute the hard,normal,easy accuracy 96 | # hard accuracy: the average of the upper-right triangle of the matrix 97 | # namely chosen responses with less fancy style compared to rejected responses with more fancy style 98 | upper_right_count = MATRIX_SIZE * (MATRIX_SIZE - 1) / 2 99 | hard_acc = np.sum(np.triu(acc_matrix, 1)) / upper_right_count 100 | # normal accuracy: the average of the diagonal of the matrix 101 | # namely chosen responses with the same style compared to rejected responses with the same style 102 | normal_acc = np.mean(np.diag(acc_matrix)) 103 | # easy accuracy: the average of the lower-left triangle of the matrix 104 | # namely chosen responses with more fancy style compared to rejected responses with less fancy style 105 | lower_left_count = MATRIX_SIZE * (MATRIX_SIZE - 1) / 2 106 | easy_acc = np.sum(np.tril(acc_matrix, -1)) / lower_left_count 107 | 108 | return { 109 | "hard_acc": hard_acc, 110 | "normal_acc": normal_acc, 111 | "easy_acc": easy_acc 112 | } 113 | 114 | 115 | 116 | 117 | def load_eval_dataset( 118 | raw_Dataset: Dataset = None, 119 | core_set: bool = True, 120 | custom_dialogue_formatting: bool = False, 121 | conv: Conversation = None, 122 | tokenizer: PreTrainedTokenizer = None, 123 | logger: logging.Logger = None, 124 | keep_columns: List[str] = ["text_chosen", "text_rejected", "id"], 125 | return_extra_data: bool = False, 126 | max_turns: int = None, 127 | ) -> tuple[Dataset, list[str]]: 128 | """ 129 | Loads either the core eval set for HERM or the existing preference data test sets. 130 | 131 | Args: 132 | core_set: if True, load the core eval set for HERM. 133 | custom_dialogue_formatting: if True, format the dialogue as needed for custom models (e.g. SHP and PairRM). 134 | conv: fastchat conversation template. 135 | If None (default) the passed tokenizer needs to have a usable chat template. 136 | tokenizer: HuggingFace tokenizer to use. The tokenizer's chat template, if available, has precedence over conv. 137 | logger: logger to use for logging. If None (default), no logging is done. 138 | keep_columns: list of columns to keep in the dataset. 139 | max_turns: maximum number of turns in the dialogue (usually even). If None (default), no filtering is done. 140 | 141 | Returns: 142 | dataset: loaded dataset with required properties. 143 | subsets: list of subsets for the corresponding samples in the dataset. 
144 | """ 145 | if raw_Dataset is not None: 146 | raw_dataset = raw_Dataset 147 | elif core_set: 148 | raw_dataset = load_from_disk("data/reward-bench") 149 | raw_dataset = raw_dataset['filtered'] 150 | else: 151 | raw_dataset = load_dataset(EXTRA_PREF_SETS) 152 | modified_datasets = [] 153 | 154 | # Iterate over each subset in the DatasetDict 155 | for subset_name, subdataset in raw_dataset.items(): 156 | # if subset column exists, move to subsubset (for pref sets) 157 | if "subset" in subdataset.column_names: 158 | subdataset = subdataset.rename_column("subset", "subsubset") 159 | 160 | # Add a new column 'subset' to the dataset with the subset name 161 | subdataset = subdataset.add_column("subset", [subset_name] * len(subdataset)) 162 | 163 | # Append the modified dataset to the list 164 | # remove pku_safer and pku_better from the dict, no longer part of the benchmark 165 | if subset_name not in ["pku_safer", "pku_better"]: 166 | modified_datasets.append(subdataset) 167 | 168 | # Concatenate all the modified datasets into one dataset 169 | raw_dataset = concatenate_datasets(modified_datasets) 170 | 171 | # Apply chat template 172 | if not custom_dialogue_formatting: 173 | usable_tokenizer = check_tokenizer_chat_template(tokenizer) 174 | 175 | # assert either conv is passed or tokenizer has chat_template 176 | assert conv is not None or usable_tokenizer 177 | 178 | if usable_tokenizer: 179 | if logger is not None: 180 | logger.info("*** Preparing dataset with HF Transformers ***") 181 | # docs https://huggingface.co/docs/transformers/main/en/chat_templating 182 | dataset = raw_dataset.map( 183 | prepare_dialogue_from_tokenizer, 184 | fn_kwargs={"tokenizer": tokenizer}, 185 | num_proc=8, 186 | load_from_cache_file=False, 187 | ) 188 | 189 | # else use FastChat to get chat template 190 | else: 191 | if logger is not None: 192 | logger.info("*** Preparing dataset with FastChat ***") 193 | dataset = raw_dataset.map( 194 | prepare_dialogue, 195 | fn_kwargs={"dialogue_template": conv}, 196 | num_proc=8, # using >1 process causes issues with re-assigning prompt in example 197 | load_from_cache_file=False, 198 | ) 199 | else: 200 | if logger is not None: 201 | logger.info("*** Preparing dataset with custom formatting ***") 202 | 203 | def map_conversations(example, core_set=True): 204 | if core_set: 205 | example["text_chosen"] = [ 206 | {"role": "user", "content": example["prompt"]}, 207 | {"role": "assistant", "content": example["chosen"]}, 208 | ] 209 | example["text_rejected"] = [ 210 | {"role": "user", "content": example["prompt"]}, 211 | {"role": "assistant", "content": example["rejected"]}, 212 | ] 213 | else: 214 | prompt = example["prompt"] 215 | example["text_chosen"] = prompt + [{"role": "assistant", "content": example["chosen"]}] 216 | example["text_rejected"] = prompt + [{"role": "assistant", "content": example["rejected"]}] 217 | return example 218 | 219 | dataset = raw_dataset.map( 220 | map_conversations, 221 | fn_kwargs={"core_set": core_set}, 222 | num_proc=8, 223 | ) 224 | 225 | if max_turns is not None: 226 | assert max_turns > 0, "max_turns must be greater than 0" 227 | 228 | # filter long answers (MT Bench prompt as 1 or 2 turn examples) 229 | def filter_long_turns(batch): 230 | return len(batch["text_chosen"]) <= max_turns 231 | 232 | dataset = dataset.filter(filter_long_turns) 233 | 234 | # take column subset from dataset 235 | subsets = dataset["subset"] 236 | 237 | # remove columns if set and not custom_dialogue_formatting 238 | all_cols = dataset.column_names 239 | 
dataset = dataset.remove_columns([c for c in all_cols if c not in keep_columns]) 240 | 241 | return dataset, subsets 242 | 243 | 244 | 245 | if __name__ == "__main__": 246 | # test the function 247 | pass --------------------------------------------------------------------------------