Repository layout of RLHFlow/RLHF-Reward-Modeling:

├── .gitignore
├── LICENSE
├── README.md
├── armo-rm
│   ├── README.md
│   ├── stage-1_prepare.py
│   ├── stage-1_train.py
│   ├── stage-2_prepare.py
│   └── stage-2_train.py
├── bradley-terry-rm
│   ├── README.md
│   ├── gemma_2B_rm.py
│   ├── llama3_8B_rm.py
│   └── mistral_7B_rm.py
├── decision_tree
│   ├── README.md
│   ├── collect_llm_preferences.py
│   └── get_embeddings.py
├── deepspeed_configs
│   ├── deepspeed_1.json
│   ├── deepspeed_2.json
│   └── deepspeed_3.json
├── math-rm
│   ├── README.md
│   ├── figs
│   │   ├── ds_gsm8k.png
│   │   ├── ds_gsm8k_more_test.png
│   │   ├── ds_math.png
│   │   ├── ds_math_more_test.png
│   │   ├── mistral_gsm8k.png
│   │   └── mistral_math.png
│   ├── llama-3.1-orm.yaml
│   ├── llama-3.1-prm.yaml
│   ├── orm_evaluate.py
│   ├── prm_evaluate.py
│   └── scalar_orm_train.py
├── odin
│   ├── README.md
│   ├── gemma_two_head.py
│   ├── odin-arch.jpg
│   └── serving_two_head.py
├── pair-pm
│   ├── README.md
│   ├── RRM
│   │   └── rrm_augmentation.py
│   ├── SSRM
│   │   ├── conf_threshold.py
│   │   └── pseudo_label.py
│   ├── gemma-2b-it.yaml
│   ├── gemma-9b-it.yaml
│   ├── llama3-8b-it.yaml
│   ├── prepare_model.py
│   └── process_pair_data.py
└── useful_code
    ├── eval_reward_bench_bt.py
    └── eval_reward_bench_pm.py

The raw content of every file in the tree is served at the same URL pattern, with <path> replaced by the file's path:

https://raw.githubusercontent.com/RLHFlow/RLHF-Reward-Modeling/HEAD/<path>

For example, bradley-terry-rm/llama3_8B_rm.py is available at
https://raw.githubusercontent.com/RLHFlow/RLHF-Reward-Modeling/HEAD/bradley-terry-rm/llama3_8B_rm.py
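Because every raw link follows that single pattern, any file in the listing can be pulled down programmatically. Below is a minimal sketch; the helper name fetch_file and the choice of urllib are illustrative assumptions, not part of the repository:

    import urllib.request

    # All raw links in this listing share one base URL (repository at HEAD).
    BASE = "https://raw.githubusercontent.com/RLHFlow/RLHF-Reward-Modeling/HEAD/"

    def fetch_file(path: str) -> str:
        """Download a single file from the listing above and return its text.

        NOTE: fetch_file is a hypothetical helper for illustration only.
        """
        with urllib.request.urlopen(BASE + path) as resp:
            return resp.read().decode("utf-8")

    # Example: fetch one of the DeepSpeed configs from the tree above.
    print(fetch_file("deepspeed_configs/deepspeed_1.json"))

The same call works for any path in the tree, e.g. fetch_file("pair-pm/process_pair_data.py").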