├── requirements-server.txt ├── assets ├── rewardanything-logo-horizontal.png └── rewardanything-logo-horizontal-dark-mode.png ├── requirements-dev.txt ├── requirements.txt ├── MANIFEST.in ├── examples ├── server_config.json ├── remote_usage.py ├── transformers_usage.py └── local_usage.py ├── rewardanything ├── __init__.py ├── models.py ├── cli.py ├── local.py ├── client.py ├── utils.py ├── processing.py └── serve.py ├── pages ├── _config.yml └── index.html ├── .gitignore ├── pyproject.toml ├── setup.py ├── LICENSE ├── docs └── PROJECT_DOCS.md └── README.md /requirements-server.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.68.0 2 | uvicorn[standard]>=0.15.0 3 | httpx>=0.24.0 4 | openai>=1.0.0 -------------------------------------------------------------------------------- /assets/rewardanything-logo-horizontal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WisdomShell/RewardAnything/HEAD/assets/rewardanything-logo-horizontal.png -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest>=6.0.0 2 | pytest-asyncio>=0.18.0 3 | black>=21.0.0 4 | isort>=5.9.0 5 | flake8>=3.9.0 6 | mypy>=0.910 7 | pre-commit>=2.15.0 -------------------------------------------------------------------------------- /assets/rewardanything-logo-horizontal-dark-mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WisdomShell/RewardAnything/HEAD/assets/rewardanything-logo-horizontal-dark-mode.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=2.0.0 2 | transformers>=4.51.0 3 | tokenizers>=0.13.0 4 | requests>=2.25.0 5 | pydantic>=1.8.0 6 | tqdm>=4.62.0 7 | numpy>=1.21.0 8 | accelerate>=1.7.0 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include pyproject.toml 4 | recursive-include rewardanything *.py 5 | recursive-include examples *.py *.json 6 | recursive-exclude * __pycache__ 7 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /examples/server_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "api_key": ["dummy-key-for-local-vllm"], 3 | "api_model": "zhuohaoyu/RewardAnything-8B-v1", 4 | "api_base": ["http://localhost:8000/v1"], 5 | "api_proxy": null, 6 | "api_timeout": 120.0, 7 | "api_max_retries": 3, 8 | "generation_config": { 9 | "temperature": 0.0, 10 | "max_tokens": 4096, 11 | "top_p": 1.0, 12 | "frequency_penalty": 0.0, 13 | "presence_penalty": 0.0 14 | }, 15 | "num_workers": 4, 16 | "request_limit": 500, 17 | "request_limit_period": 60, 18 | "max_error_count": 30, 19 | "dump_individual_rsp": false 20 | } -------------------------------------------------------------------------------- /rewardanything/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | RewardAnything: Generalizable Principle-Following Reward Models 3 | 4 | This package provides both local and remote inference capabilities for 5 | 
RewardAnything models that can follow natural language evaluation principles. 6 | """ 7 | 8 | from .local import from_pretrained 9 | from .client import Client 10 | from .models import RewardResult, RewardRequest, RewardResponse 11 | # from .benchmarks import RABench 12 | 13 | __version__ = "1.0.1" 14 | __all__ = ["from_pretrained", "Client", "RewardResult", "RewardRequest", "RewardResponse"] 15 | 16 | # Optional benchmarks import (only if available) 17 | try: 18 | from .benchmarks import RABench 19 | __all__.append("RABench") 20 | except ImportError: 21 | pass 22 | -------------------------------------------------------------------------------- /examples/remote_usage.py: -------------------------------------------------------------------------------- 1 | import rewardanything 2 | 3 | # Connect to the RewardAnything server 4 | client = rewardanything.Client("http://localhost:8001") 5 | 6 | # Process batch requests efficiently 7 | requests = [ 8 | { 9 | "principle": "Prefer helpful and safe responses", 10 | "prompt": "How to learn programming?", 11 | "responses": { 12 | "assistant_a": "Start with Python, practice daily, build projects.", 13 | "assistant_b": "Read books and hope for the best." 14 | } 15 | }, 16 | # ... more requests 17 | ] 18 | 19 | results = client.judge_batch(requests) 20 | for result in results: 21 | print(f"Scores: {result.scores}") 22 | print(f"Best to worst: {result.ranking}") 23 | print(f"Reasoning: {result.reasoning}") -------------------------------------------------------------------------------- /rewardanything/models.py: -------------------------------------------------------------------------------- 1 | """Data models and result classes for RewardAnything.""" 2 | 3 | from typing import Dict, List, Optional, Any 4 | from dataclasses import dataclass 5 | from pydantic import BaseModel 6 | 7 | 8 | @dataclass 9 | class RewardResult: 10 | """Result from RewardAnything evaluation.""" 11 | reasoning: str 12 | scores: Dict[str, float] # model_name -> score (1-5) 13 | ranking: List[str] # ordered list from best to worst 14 | raw_output: Optional[str] = None 15 | 16 | def __str__(self) -> str: 17 | return f"RewardResult(scores={self.scores}, ranking={self.ranking})" 18 | 19 | def __repr__(self) -> str: 20 | return self.__str__() 21 | 22 | 23 | class RewardRequest(BaseModel): 24 | """Request format for RewardAnything evaluation.""" 25 | principle: str 26 | prompt: str 27 | responses: Dict[str, str] 28 | mask_responses: bool = True 29 | 30 | 31 | class RewardResponse(BaseModel): 32 | """Response format from RewardAnything server.""" 33 | thoughts: str 34 | results: Dict[str, Any] -------------------------------------------------------------------------------- /pages/_config.yml: -------------------------------------------------------------------------------- 1 | title: "RewardAnything" 2 | description: "Generalizable Principle-Following Reward Models" 3 | url: "https://zhuohaoyu.github.io" 4 | baseurl: "/RewardAnything" 5 | 6 | # Build settings 7 | markdown: kramdown 8 | highlighter: rouge 9 | plugins: 10 | - jekyll-feed 11 | - jekyll-sitemap 12 | - jekyll-seo-tag 13 | 14 | # Collections 15 | collections: 16 | authors: 17 | output: false 18 | 19 | # Exclude files 20 | exclude: 21 | - node_modules/ 22 | - package.json 23 | - package-lock.json 24 | - tailwind.config.js 25 | - postcss.config.js 26 | - Gemfile 27 | - Gemfile.lock 28 | - vendor/ 29 | - .bundle/ 30 | - README.md 31 | 32 | # Social links 33 | github_username: zhuohaoyu 34 | paper_url: 
"https://arxiv.org/abs/2506.03637" 35 | huggingface_url: "https://huggingface.co/WisdomShell/RewardAnything-8B-v1" 36 | pypi_url: "https://pypi.org/project/rewardanything/" 37 | 38 | # Project info 39 | version: "1.0.1" 40 | license: "Apache-2.0" 41 | 42 | # GitHub Pages specific settings 43 | github: [metadata] 44 | kramdown: 45 | input: GFM 46 | syntax_highlighter: rouge -------------------------------------------------------------------------------- /examples/transformers_usage.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | from rewardanything.processing import prepare_chat_messages, parse_rewardanything_output 3 | import torch 4 | 5 | # Load model and tokenizer directly 6 | model = AutoModelForCausalLM.from_pretrained( 7 | "zhuohaoyu/RewardAnything-8B-v1", 8 | torch_dtype="auto", 9 | device_map="auto" 10 | ) 11 | tokenizer = AutoTokenizer.from_pretrained("zhuohaoyu/RewardAnything-8B-v1") 12 | 13 | # Prepare evaluation data 14 | principle = "Judge responses based on helpfulness and accuracy" 15 | prompt = "What is the capital of France?" 16 | responses = { 17 | "model_a": "Paris is the capital of France.", 18 | "model_b": "I think it might be Lyon or Paris." 19 | } 20 | 21 | # Prepare chat messages (handles masking automatically) 22 | messages, masked2real = prepare_chat_messages(principle, prompt, responses) 23 | 24 | # Format with chat template 25 | formatted_input = tokenizer.apply_chat_template( 26 | messages, tokenize=False, add_generation_prompt=True 27 | ) 28 | 29 | # Generate response 30 | inputs = tokenizer(formatted_input, return_tensors="pt").to(model.device) 31 | with torch.no_grad(): 32 | outputs = model.generate( 33 | **inputs, 34 | max_new_tokens=4096, 35 | temperature=0.1, 36 | do_sample=True, 37 | pad_token_id=tokenizer.eos_token_id 38 | ) 39 | 40 | # Parse structured results (handles JSON parsing robustly) 41 | output_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True) 42 | result = parse_rewardanything_output(output_text, masked2real) 43 | 44 | print(f"Parsed scores: {result.scores}") 45 | print(f"Ranking: {result.ranking}") 46 | print(f"Reasoning: {result.reasoning}") -------------------------------------------------------------------------------- /rewardanything/cli.py: -------------------------------------------------------------------------------- 1 | """Command-line interface for RewardAnything.""" 2 | 3 | import argparse 4 | import sys 5 | 6 | 7 | def main(): 8 | """Main CLI entry point.""" 9 | parser = argparse.ArgumentParser( 10 | description="RewardAnything CLI", 11 | formatter_class=argparse.RawDescriptionHelpFormatter 12 | ) 13 | 14 | subparsers = parser.add_subparsers(dest='command', help='Available commands') 15 | 16 | # Serve command 17 | serve_parser = subparsers.add_parser('serve', help='Start RewardAnything server') 18 | serve_parser.add_argument("-c", "--config", required=True, help="Path to configuration file") 19 | serve_parser.add_argument("--port", type=int, default=8000, help="Port to listen on") 20 | serve_parser.add_argument("--host", default="0.0.0.0", help="Host to bind to") 21 | serve_parser.add_argument("--base-output-path", default="./outputs", 22 | help="Base directory for storing batch outputs") 23 | 24 | # Parse arguments 25 | args = parser.parse_args() 26 | 27 | if args.command == 'serve': 28 | # Set up arguments for serve module 29 | serve_args = [ 30 | '--config', args.config, 31 | 
'--port', str(args.port), 32 | '--host', args.host, 33 | '--base-output-path', args.base_output_path 34 | ] 35 | 36 | # Replace sys.argv with serve arguments 37 | original_argv = sys.argv.copy() 38 | sys.argv = ['rewardanything-serve'] + serve_args 39 | 40 | try: 41 | from .serve import main as serve_main 42 | serve_main() 43 | finally: 44 | # Restore original argv 45 | sys.argv = original_argv 46 | else: 47 | parser.print_help() 48 | 49 | 50 | if __name__ == "__main__": 51 | main() -------------------------------------------------------------------------------- /examples/local_usage.py: -------------------------------------------------------------------------------- 1 | import rewardanything 2 | 3 | # Load model locally (similar to HuggingFace) 4 | reward_model = rewardanything.from_pretrained( 5 | "zhuohaoyu/RewardAnything-8B-v1", # Model path/name 6 | device="cuda", # Device placement 7 | torch_dtype="auto" # Automatic dtype selection 8 | ) 9 | 10 | # Define your evaluation principle 11 | principle = "I prefer clear, concise and helpful responses over long and detailed ones." 12 | 13 | # Your evaluation data 14 | prompt = "How do I learn Python programming effectively?" 15 | responses = { 16 | "response_a": "Start with Python.org's tutorial, practice daily with small projects, and join r/learnpython for help. Focus on fundamentals first.", 17 | "response_b": "Here's a comprehensive approach: 1) Start with Python basics including variables, data types, operators, control structures like if-statements, for-loops, while-loops, and functions, 2) Practice with small projects like calculators, text games, and data manipulation scripts, 3) Use interactive platforms like Codecademy, Python.org's official tutorial, edX courses, Coursera specializations, and YouTube channels, 4) Join communities like r/learnpython, Stack Overflow, Python Discord servers, and local meetups for support and networking, 5) Build progressively complex projects including web scrapers, APIs, data analysis tools, and web applications, 6) Read books like 'Automate the Boring Stuff', 'Python Crash Course', and 'Effective Python', 7) Dedicate 1-2 hours daily for consistent progress and track your learning journey.", 18 | "response_c": "Learn Python by coding." 
19 | } 20 | 21 | # Get comprehensive evaluation 22 | result = reward_model.judge( 23 | principle=principle, 24 | prompt=prompt, 25 | responses=responses 26 | ) 27 | 28 | print(f"Scores: {result.scores}") 29 | print(f"Best to worst: {result.ranking}") 30 | print(f"Reasoning: {result.reasoning}") -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | share/python-wheels/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | MANIFEST 24 | 25 | # PyInstaller 26 | *.manifest 27 | *.spec 28 | 29 | # Installer logs 30 | pip-log.txt 31 | 32 | # Unit test / coverage reports 33 | htmlcov/ 34 | .tox/ 35 | .nox/ 36 | .coverage 37 | .coverage.* 38 | .cache 39 | nosetests.xml 40 | coverage.xml 41 | *.cover 42 | *.py,cover 43 | .hypothesis/ 44 | .pytest_cache/ 45 | cover/ 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Jupyter Notebook 52 | .ipynb_checkpoints 53 | 54 | # IPython 55 | profile_default/ 56 | ipython_config.py 57 | 58 | # pyenv 59 | .python-version 60 | 61 | # pipenv 62 | Pipfile.lock 63 | 64 | # poetry 65 | poetry.lock 66 | 67 | # pdm 68 | .pdm.toml 69 | 70 | # PEP 582 71 | __pypackages__/ 72 | 73 | # Environments 74 | .env 75 | .venv 76 | env/ 77 | venv/ 78 | ENV/ 79 | env.bak/ 80 | venv.bak/ 81 | 82 | # Spyder project settings 83 | .spyderproject 84 | .spyproject 85 | 86 | # Rope project settings 87 | .ropeproject 88 | 89 | # mkdocs documentation 90 | /site 91 | 92 | # mypy 93 | .mypy_cache/ 94 | .dmypy.json 95 | dmypy.json 96 | 97 | # Pyre type checker 98 | .pyre/ 99 | 100 | # pytype static type analyzer 101 | .pytype/ 102 | 103 | # Cython debug symbols 104 | cython_debug/ 105 | 106 | # RewardAnything specific 107 | # Output directories 108 | outputs/ 109 | tmp/ 110 | temp/ 111 | logs/ 112 | cache/ 113 | 114 | # Model files and checkpoints 115 | *.bin 116 | *.safetensors 117 | checkpoints/ 118 | models/ 119 | converted_ckpts/ 120 | *.ckpt 121 | *.pth 122 | *.pt 123 | 124 | # Config files with sensitive data 125 | config_real.json 126 | config_production.json 127 | *_real.json 128 | *_prod.json 129 | .secrets/ 130 | 131 | # Server outputs and batch processing 132 | responses/ 133 | all_responses.jsonl 134 | single_*/ 135 | batch_*/ 136 | 137 | # Transformers cache 138 | .cache/ 139 | transformers_cache/ 140 | 141 | # HuggingFace cache 142 | .huggingface/ 143 | 144 | # IDE and editor files 145 | .vscode/ 146 | .idea/ 147 | *.swp 148 | *.swo 149 | *~ 150 | 151 | # Linux 152 | *~ 153 | 154 | # Temporary files 155 | *.tmp 156 | *.temp 157 | *.bak 158 | *.backup 159 | 160 | # Compiled files 161 | *.pyc 162 | *.pyo 163 | *.pyd 164 | 165 | 166 | # Development and testing 167 | .tox/ 168 | .coverage 169 | htmlcov/ 170 | .pytest_cache/ 171 | test_outputs/ 172 | benchmark_results/ 173 | 174 | # Documentation builds 175 | docs/build/ 176 | docs/_build/ 177 | 178 | # Package builds 179 | dist/ 180 | build/ 181 | *.egg-info/ 182 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = 
"rewardanything" 7 | version = "1.0.1" 8 | description = "RewardAnything: Generalizable Principle-Following Reward Models" 9 | readme = "README.md" 10 | license = {text = "Apache-2.0"} 11 | authors = [ 12 | {name = "Zhuohao Yu", email = "zhuohaoyu1228@gmail.com"}, 13 | {name = "Jiali Zeng"}, 14 | {name = "Weizheng Gu"}, 15 | {name = "Yidong Wang"}, 16 | {name = "Jindong Wang"}, 17 | {name = "Fandong Meng"}, 18 | {name = "Jie Zhou"}, 19 | {name = "Yue Zhang"}, 20 | {name = "Shikun Zhang"}, 21 | {name = "Wei Ye"} 22 | ] 23 | maintainers = [ 24 | {name = "Zhuohao Yu", email = "zhuohaoyu1228@gmail.com"} 25 | ] 26 | keywords = ["machine learning", "reward modeling", "RLHF", "principle-following", "evaluation", "LLM", "alignment"] 27 | classifiers = [ 28 | "Development Status :: 4 - Beta", 29 | "Intended Audience :: Developers", 30 | "Intended Audience :: Science/Research", 31 | "License :: OSI Approved :: Apache Software License", 32 | "Operating System :: OS Independent", 33 | "Programming Language :: Python :: 3", 34 | "Programming Language :: Python :: 3.8", 35 | "Programming Language :: Python :: 3.9", 36 | "Programming Language :: Python :: 3.10", 37 | "Programming Language :: Python :: 3.11", 38 | "Programming Language :: Python :: 3.12", 39 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 40 | "Topic :: Software Development :: Libraries :: Python Modules", 41 | ] 42 | requires-python = ">=3.8" 43 | dependencies = [ 44 | "torch>=2.0.0", 45 | "transformers>=4.51.0", 46 | "fastapi>=0.104.0", 47 | "uvicorn>=0.24.0", 48 | "pydantic>=2.0.0", 49 | "requests>=2.28.0", 50 | "numpy>=1.21.0", 51 | "scipy>=1.7.0", 52 | "tqdm>=4.64.0", 53 | "openai>=1.0.0", 54 | ] 55 | 56 | [project.optional-dependencies] 57 | server = [ 58 | "fastapi>=0.104.0", 59 | "uvicorn>=0.24.0", 60 | ] 61 | local = [ 62 | "torch>=2.0.0", 63 | "transformers>=4.51.0", 64 | ] 65 | all = [ 66 | "fastapi>=0.104.0", 67 | "uvicorn>=0.24.0", 68 | "torch>=2.0.0", 69 | "transformers>=4.51.0", 70 | ] 71 | dev = [ 72 | "pytest>=7.0.0", 73 | "pytest-asyncio>=0.21.0", 74 | "black>=23.0.0", 75 | "isort>=5.12.0", 76 | "flake8>=6.0.0", 77 | "mypy>=1.0.0", 78 | ] 79 | 80 | [project.urls] 81 | Homepage = "https://github.com/zhuohaoyu/RewardAnything" 82 | Repository = "https://github.com/zhuohaoyu/RewardAnything" 83 | Documentation = "https://github.com/zhuohaoyu/RewardAnything#readme" 84 | "Bug Tracker" = "https://github.com/zhuohaoyu/RewardAnything/issues" 85 | 86 | [project.scripts] 87 | rewardanything = "rewardanything.cli:main" 88 | 89 | [tool.setuptools.packages.find] 90 | include = ["rewardanything*"] 91 | 92 | [tool.setuptools.package-data] 93 | rewardanything = ["*.json", "*.yaml", "*.yml"] 94 | 95 | [tool.black] 96 | line-length = 100 97 | target-version = ['py38'] 98 | 99 | [tool.isort] 100 | profile = "black" 101 | line_length = 100 102 | 103 | [tool.mypy] 104 | python_version = "3.8" 105 | warn_return_any = true 106 | warn_unused_configs = true 107 | disallow_untyped_defs = true -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | # Read README for long description 5 | with open("README.md", "r", encoding="utf-8") as fh: 6 | long_description = fh.read() 7 | 8 | # Read requirements 9 | def read_requirements(filename): 10 | if os.path.exists(filename): 11 | with open(filename, "r", encoding="utf-8") as f: 12 | return [line.strip() for 
line in f if line.strip() and not line.startswith("#")] 13 | return [] 14 | 15 | # Core requirements for the package 16 | core_requirements = [ 17 | "torch>=2.0.0", 18 | "transformers>=4.51.0", 19 | "tokenizers>=0.13.0", 20 | "requests>=2.25.0", 21 | "pydantic>=1.8.0", 22 | "tqdm>=4.62.0", 23 | "numpy>=1.21.0", 24 | "accelerate>=1.7.0", 25 | ] 26 | 27 | # Server requirements 28 | server_requirements = [ 29 | "fastapi>=0.68.0", 30 | "uvicorn[standard]>=0.15.0", 31 | "httpx>=0.24.0", 32 | "openai>=1.0.0", 33 | "asyncio", 34 | ] 35 | 36 | # Development requirements 37 | dev_requirements = [ 38 | "pytest>=6.0.0", 39 | "pytest-asyncio>=0.18.0", 40 | "black>=21.0.0", 41 | "isort>=5.9.0", 42 | "flake8>=3.9.0", 43 | "mypy>=0.910", 44 | "pre-commit>=2.15.0", 45 | ] 46 | 47 | # Benchmark requirements 48 | benchmark_requirements = [ 49 | "datasets>=2.0.0", 50 | "scipy>=1.7.0", 51 | "pandas>=1.3.0", 52 | "scikit-learn>=1.0.0", 53 | ] 54 | 55 | setup( 56 | name="RewardAnything", 57 | version="1.0.1", 58 | author="Zhuohao Yu", 59 | author_email="zyu@stu.pku.edu.cn", 60 | description="RewardAnything: Generalizable Principle-Following Reward Models", 61 | long_description=long_description, 62 | long_description_content_type="text/markdown", 63 | url="https://github.com/zhuohaoyu/RewardAnything", 64 | packages=find_packages(), 65 | classifiers=[ 66 | "Development Status :: 4 - Beta", 67 | "Intended Audience :: Developers", 68 | "Intended Audience :: Science/Research", 69 | "License :: OSI Approved :: Apache Software License", 70 | "Operating System :: OS Independent", 71 | "Programming Language :: Python :: 3", 72 | "Programming Language :: Python :: 3.8", 73 | "Programming Language :: Python :: 3.9", 74 | "Programming Language :: Python :: 3.10", 75 | "Programming Language :: Python :: 3.11", 76 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 77 | "Topic :: Software Development :: Libraries :: Python Modules", 78 | ], 79 | python_requires=">=3.8", 80 | install_requires=core_requirements, 81 | extras_require={ 82 | "server": server_requirements, 83 | "dev": dev_requirements, 84 | "benchmarks": benchmark_requirements, 85 | "all": server_requirements + dev_requirements + benchmark_requirements, 86 | }, 87 | entry_points={ 88 | "console_scripts": [ 89 | "rewardanything=rewardanything.cli:main", 90 | "rewardanything-serve=rewardanything.serve:main", 91 | ], 92 | }, 93 | include_package_data=True, 94 | package_data={ 95 | "rewardanything": ["*.json", "*.yaml", "*.txt"], 96 | }, 97 | keywords="reward model, RLHF, language model, evaluation, principle-following", 98 | project_urls={ 99 | "Bug Reports": "https://github.com/zhuohaoyu/RewardAnything/issues", 100 | "Source": "https://github.com/zhuohaoyu/RewardAnything", 101 | "Documentation": "https://rewardanything.readthedocs.io/", 102 | "Paper": "https://arxiv.org/abs/2506.03637", 103 | }, 104 | ) -------------------------------------------------------------------------------- /rewardanything/local.py: -------------------------------------------------------------------------------- 1 | """Local inference implementation for RewardAnything models.""" 2 | 3 | import torch 4 | from typing import Dict, List, Optional, Union, Any 5 | from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig 6 | from .models import RewardResult 7 | from .processing import prepare_chat_messages, parse_rewardanything_output 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class RewardModel: 14 | """Local RewardAnything 
model for principle-following evaluation.""" 15 | 16 | def __init__( 17 | self, 18 | model, 19 | tokenizer, 20 | generation_config: Optional[GenerationConfig] = None, 21 | device: Optional[str] = None 22 | ): 23 | self.model = model 24 | self.tokenizer = tokenizer 25 | self.generation_config = generation_config or GenerationConfig( 26 | max_new_tokens=4096, 27 | temperature=0.1, 28 | do_sample=True, 29 | top_p=0.9, 30 | pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 0 31 | ) 32 | self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") 33 | 34 | def judge( 35 | self, 36 | principle: str, 37 | prompt: str, 38 | responses: Dict[str, str], 39 | mask_responses: bool = True, 40 | **generation_kwargs 41 | ) -> RewardResult: 42 | """ 43 | Evaluate responses based on a natural language principle. 44 | 45 | Args: 46 | principle: Natural language principle for evaluation 47 | prompt: The input prompt that responses are answering 48 | responses: Dict mapping model names to their responses 49 | mask_responses: Whether to mask model names during evaluation 50 | **generation_kwargs: Additional generation parameters 51 | 52 | Returns: 53 | RewardResult containing scores, ranking, and reasoning 54 | """ 55 | # Prepare chat messages using unified processing 56 | messages, masked2real = prepare_chat_messages( 57 | principle, prompt, responses, mask_responses 58 | ) 59 | 60 | # Format for the model 61 | formatted_input = self.tokenizer.apply_chat_template( 62 | messages, tokenize=False, add_generation_prompt=True 63 | ) 64 | 65 | # Tokenize input 66 | inputs = self.tokenizer( 67 | formatted_input, 68 | return_tensors="pt", 69 | padding=True, 70 | truncation=True, 71 | max_length=4096 72 | ).to(self.device) 73 | 74 | # Generate response 75 | generation_config = self.generation_config 76 | if generation_kwargs: 77 | generation_config = GenerationConfig(**{ 78 | **self.generation_config.to_dict(), 79 | **generation_kwargs 80 | }) 81 | 82 | with torch.no_grad(): 83 | outputs = self.model.generate( 84 | **inputs, 85 | generation_config=generation_config, 86 | pad_token_id=self.tokenizer.eos_token_id 87 | ) 88 | 89 | # Decode output 90 | generated_tokens = outputs[0][inputs.input_ids.shape[1]:] 91 | output_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True) 92 | 93 | # Parse output using unified processing 94 | return parse_rewardanything_output(output_text, masked2real) 95 | 96 | def judge_batch( 97 | self, 98 | requests: List[Dict[str, Any]], 99 | batch_size: int = 8, 100 | **generation_kwargs 101 | ) -> List[RewardResult]: 102 | """ 103 | Evaluate multiple requests in batches. 
104 | 105 | Args: 106 | requests: List of dicts with 'principle', 'prompt', 'responses' keys 107 | batch_size: Batch size for processing 108 | **generation_kwargs: Additional generation parameters 109 | 110 | Returns: 111 | List of RewardResult objects 112 | """ 113 | results = [] 114 | for i in range(0, len(requests), batch_size): 115 | batch = requests[i:i + batch_size] 116 | for request in batch: 117 | result = self.judge( 118 | principle=request["principle"], 119 | prompt=request["prompt"], 120 | responses=request["responses"], 121 | mask_responses=request.get("mask_responses", True), 122 | **generation_kwargs 123 | ) 124 | results.append(result) 125 | return results 126 | 127 | 128 | def from_pretrained( 129 | model_name_or_path: str, 130 | device: Optional[str] = None, 131 | torch_dtype: Optional[Union[str, torch.dtype]] = None, 132 | trust_remote_code: bool = False, 133 | generation_config: Optional[Dict[str, Any]] = None, 134 | **kwargs 135 | ) -> RewardModel: 136 | """ 137 | Load a RewardAnything model for local inference. 138 | 139 | Args: 140 | model_name_or_path: Path to model or HuggingFace model identifier 141 | device: Device to load model on ('cuda', 'cpu', 'auto') 142 | torch_dtype: Data type for model weights 143 | trust_remote_code: Whether to trust remote code 144 | generation_config: Generation configuration parameters 145 | **kwargs: Additional arguments passed to AutoModelForCausalLM.from_pretrained 146 | 147 | Returns: 148 | RewardModel instance ready for evaluation 149 | """ 150 | if device is None: 151 | device = "cuda" if torch.cuda.is_available() else "cpu" 152 | 153 | if torch_dtype == "auto": 154 | torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32 155 | elif isinstance(torch_dtype, str): 156 | torch_dtype = getattr(torch, torch_dtype) 157 | 158 | # Load tokenizer 159 | tokenizer = AutoTokenizer.from_pretrained( 160 | model_name_or_path, 161 | trust_remote_code=trust_remote_code, 162 | **{k: v for k, v in kwargs.items() if k in ['use_fast', 'padding_side']} 163 | ) 164 | 165 | if tokenizer.pad_token is None: 166 | tokenizer.pad_token = tokenizer.eos_token 167 | 168 | # Load model 169 | model = AutoModelForCausalLM.from_pretrained( 170 | model_name_or_path, 171 | torch_dtype=torch_dtype, 172 | device_map=device if device != "auto" else "auto", 173 | trust_remote_code=trust_remote_code, 174 | **{k: v for k, v in kwargs.items() if k not in ['use_fast', 'padding_side']} 175 | ) 176 | 177 | # Create generation config 178 | gen_config = None 179 | if generation_config: 180 | gen_config = GenerationConfig(**generation_config) 181 | 182 | return RewardModel( 183 | model=model, 184 | tokenizer=tokenizer, 185 | generation_config=gen_config, 186 | device=device 187 | ) -------------------------------------------------------------------------------- /rewardanything/client.py: -------------------------------------------------------------------------------- 1 | """Remote client implementation for RewardAnything.""" 2 | 3 | import json 4 | import time 5 | import requests 6 | from typing import Dict, List, Optional, Any, Union 7 | from .models import RewardResult, RewardRequest, RewardResponse 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Client: 14 | """Remote client for RewardAnything API.""" 15 | 16 | def __init__( 17 | self, 18 | base_url: str, 19 | api_key: Optional[str] = None, 20 | timeout: float = 30.0, 21 | max_retries: int = 3, 22 | headers: Optional[Dict[str, str]] = None, 23 | **kwargs 24 | ): 25 | 
""" 26 | Initialize RewardAnything client. 27 | 28 | Args: 29 | base_url: Base URL for the RewardAnything API 30 | api_key: Optional API key for authentication 31 | timeout: Request timeout in seconds 32 | max_retries: Maximum number of retry attempts 33 | headers: Additional headers to include in requests 34 | **kwargs: Additional client configuration 35 | """ 36 | self.base_url = base_url.rstrip('/') 37 | self.api_key = api_key 38 | self.timeout = timeout 39 | self.max_retries = max_retries 40 | 41 | self.headers = { 42 | "Content-Type": "application/json", 43 | "User-Agent": "RewardAnything-Python-Client/1.0.1" 44 | } 45 | 46 | if api_key: 47 | self.headers["Authorization"] = f"Bearer {api_key}" 48 | 49 | if headers: 50 | self.headers.update(headers) 51 | 52 | # Store additional config 53 | self.config = kwargs 54 | 55 | def _make_request( 56 | self, 57 | endpoint: str, 58 | data: Dict[str, Any], 59 | timeout: Optional[float] = None 60 | ) -> Dict[str, Any]: 61 | """Make HTTP request with retries.""" 62 | url = f"{self.base_url}{endpoint}" 63 | timeout = timeout or self.timeout 64 | 65 | last_exception = None 66 | for attempt in range(self.max_retries + 1): 67 | try: 68 | response = requests.post( 69 | url, 70 | json=data, 71 | headers=self.headers, 72 | timeout=timeout 73 | ) 74 | response.raise_for_status() 75 | return response.json() 76 | 77 | except requests.exceptions.RequestException as e: 78 | last_exception = e 79 | if attempt < self.max_retries: 80 | wait_time = 2 ** attempt # Exponential backoff 81 | logger.warning(f"Request failed (attempt {attempt + 1}), retrying in {wait_time}s: {e}") 82 | time.sleep(wait_time) 83 | else: 84 | logger.error(f"Request failed after {self.max_retries + 1} attempts: {e}") 85 | 86 | raise last_exception 87 | 88 | def judge( 89 | self, 90 | principle: str, 91 | prompt: str, 92 | responses: Dict[str, str], 93 | mask_responses: bool = True, 94 | timeout: Optional[float] = None, 95 | **kwargs 96 | ) -> RewardResult: 97 | """ 98 | Evaluate responses based on a natural language principle. 99 | 100 | Args: 101 | principle: Natural language principle for evaluation 102 | prompt: The input prompt that responses are answering 103 | responses: Dict mapping model names to their responses 104 | mask_responses: Whether to mask model names during evaluation 105 | timeout: Request timeout override 106 | **kwargs: Additional request parameters 107 | 108 | Returns: 109 | RewardResult containing scores, ranking, and reasoning 110 | """ 111 | request_data = { 112 | "principle": principle, 113 | "prompt": prompt, 114 | "responses": responses, 115 | "mask_responses": mask_responses 116 | } 117 | 118 | # Add any additional config 119 | request_data.update(kwargs) 120 | 121 | try: 122 | response_data = self._make_request( 123 | "/api/rewardanything", 124 | request_data, 125 | timeout 126 | ) 127 | 128 | # Parse response 129 | thoughts = response_data.get("thoughts", "") 130 | results = response_data.get("results", {}) 131 | 132 | return RewardResult( 133 | reasoning=thoughts, 134 | scores=results.get("scores", {}), 135 | ranking=results.get("best-to-worst", []), 136 | raw_output=json.dumps(response_data) 137 | ) 138 | 139 | except Exception as e: 140 | logger.error(f"Failed to evaluate with principle '{principle}': {e}") 141 | raise 142 | 143 | def judge_batch( 144 | self, 145 | requests: List[Dict[str, Any]], 146 | timeout: Optional[float] = None, 147 | **kwargs 148 | ) -> List[RewardResult]: 149 | """ 150 | Evaluate multiple requests in a batch. 
151 | 152 | Args: 153 | requests: List of dicts with 'principle', 'prompt', 'responses' keys 154 | timeout: Request timeout override 155 | **kwargs: Additional request parameters 156 | 157 | Returns: 158 | List of RewardResult objects 159 | """ 160 | # Convert to RewardRequest format 161 | batch_requests = [] 162 | for req in requests: 163 | batch_requests.append({ 164 | "principle": req["principle"], 165 | "prompt": req["prompt"], 166 | "responses": req["responses"], 167 | "mask_responses": req.get("mask_responses", True) 168 | }) 169 | 170 | try: 171 | response_data = self._make_request( 172 | "/api/rewardanything_batch", 173 | batch_requests, 174 | timeout or (self.timeout * len(requests)) # Scale timeout with batch size 175 | ) 176 | 177 | # Parse batch response 178 | results = [] 179 | for item in response_data: 180 | results.append(RewardResult( 181 | reasoning=item.get("thoughts", ""), 182 | scores=item.get("results", {}).get("scores", {}), 183 | ranking=item.get("results", {}).get("best-to-worst", []), 184 | raw_output=json.dumps(item) 185 | )) 186 | 187 | return results 188 | 189 | except Exception as e: 190 | logger.error(f"Failed to evaluate batch of {len(requests)} requests: {e}") 191 | raise 192 | 193 | def health_check(self) -> bool: 194 | """Check if the server is healthy.""" 195 | try: 196 | response = requests.get( 197 | f"{self.base_url}/health", 198 | headers=self.headers, 199 | timeout=5.0 200 | ) 201 | return response.status_code == 200 202 | except: 203 | return False -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity granting the License. 13 | 14 | "Legal Entity" shall mean the union of the acting entity and all 15 | other entities that control, are controlled by, or are under common 16 | control with that entity. For the purposes of this definition, 17 | "control" means (i) the power, direct or indirect, to cause the 18 | direction or management of such entity, whether by contract or 19 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity 23 | exercising permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, 26 | including but not limited to software source code, documentation 27 | source, and configuration files. 28 | 29 | "Object" form shall mean any form resulting from mechanical 30 | transformation or translation of a Source form, including but 31 | not limited to compiled object code, generated documentation, 32 | and conversions to other media types. 33 | 34 | "Work" shall mean the work of authorship, whether in Source or 35 | Object form, made available under the License, as indicated by a 36 | copyright notice that is included in or attached to the work 37 | (which shall not include communications that are clearly marked or 38 | otherwise designated in writing by the owner as "Not a Work of the License"). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based upon (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and derivative works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control 57 | systems, and issue tracking systems that are managed by, or on behalf 58 | of, the Licensor for the purpose of discussing and improving the Work, 59 | but excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to use, reproduce, modify, distribute, and prepare 70 | Derivative Works of, and to display and perform the Work and such Derivative 71 | Works in any medium or format, whether now known or hereafter devised, 72 | provided that You preserve all copyright, notice, and attribution 73 | statements. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. 84 | 85 | 4. Redistribution. 
You must give any other recipients of the Work or 86 | Derivative Works a copy of this License; and You must cause any 87 | modified files to carry prominent notices stating that You changed 88 | the files; and You must retain, in the Source form of any Derivative 89 | Works that You distribute, all copyright, trademark, patent, 90 | attribution and other notices from the Source form of the Work, 91 | excluding those notices that do not pertain to any part of 92 | the Derivative Works; and If the Work includes a "NOTICE" text file 93 | as part of its distribution, then any Derivative Works that You 94 | distribute must include a readable copy of the attribution notices 95 | contained within such NOTICE file, excluding those notices that do not 96 | pertain to any part of the Derivative Works, in at least one of the 97 | following places: within a NOTICE text file distributed as part of the 98 | Derivative Works; within the Source form or documentation, if provided 99 | along with the Derivative Works; or, within a display generated by the 100 | Derivative Works, if and wherever such third-party notices normally appear. 101 | 102 | 5. Submission of Contributions. Unless You explicitly state otherwise, 103 | any Contribution intentionally submitted for inclusion in the Work 104 | by You to the Licensor shall be under the terms and conditions of 105 | this License, without any additional terms or conditions. 106 | Notwithstanding the above, nothing herein shall supersede or modify 107 | the terms of any separate license agreement you may have executed 108 | with Licensor regarding such Contributions. 109 | 110 | 6. Trademarks. This License does not grant permission to use the trade 111 | names, trademarks, service marks, or product names of the Licensor, 112 | except as required for reasonable and customary use in describing the 113 | origin of the Work and reproducing the content of the NOTICE file. 114 | 115 | 7. Disclaimer of Warranty. Unless required by applicable law or 116 | agreed to in writing, Licensor provides the Work (and each 117 | Contributor provides its Contributions) on an "AS IS" BASIS, 118 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 119 | implied, including, without limitation, any warranties or conditions 120 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 121 | PARTICULAR PURPOSE. You are solely responsible for determining the 122 | appropriateness of using or redistributing the Work and assume any 123 | risks associated with Your exercise of permissions under this License. 124 | 125 | 8. Limitation of Liability. In no event and under no legal theory, 126 | whether in tort (including negligence), contract, or otherwise, 127 | unless required by applicable law (such as deliberate and grossly 128 | negligent acts) or agreed to in writing, shall any Contributor be 129 | liable to You for damages, including any direct, indirect, special, 130 | incidental, or consequential damages of any character arising as a 131 | result of this License or out of the use or inability to use the 132 | Work (including but not limited to damages for loss of goodwill, 133 | work stoppage, computer failure or malfunction, or any and all 134 | other commercial damages or losses), even if such Contributor 135 | has been advised of the possibility of such damages. 136 | 137 | 9. Accepting Warranty or Support. 
When redistributing the Work or 138 | Derivative Works thereof, You may choose to offer, and charge a fee 139 | for, acceptance of support, warranty, indemnity, or other liability 140 | obligations and/or rights consistent with this License. However, in 141 | accepting such obligations, You may act only on Your own behalf and on 142 | Your sole responsibility, not on behalf of any other Contributor, and 143 | only if You agree to indemnify, defend, and hold each Contributor 144 | harmless for any liability incurred by, or claims asserted against, 145 | such Contributor by reason of your accepting any such warranty or support. 146 | 147 | END OF TERMS AND CONDITIONS 148 | 149 | Copyright 2025 RewardAnything Contributors 150 | 151 | Licensed under the Apache License, Version 2.0 (the "License"); 152 | you may not use this file except in compliance with the License. 153 | You may obtain a copy of the License at 154 | 155 | http://www.apache.org/licenses/LICENSE-2.0 156 | 157 | Unless required by applicable law or agreed to in writing, software 158 | distributed under the License is distributed on an "AS IS" BASIS, 159 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 160 | See the License for the specific language governing permissions and 161 | limitations under the License. -------------------------------------------------------------------------------- /rewardanything/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import argparse 5 | import logging 6 | import time 7 | import codecs 8 | import traceback 9 | 10 | from typing import Optional, List, Dict, Union, Any 11 | from random import choice 12 | from tqdm.asyncio import tqdm as atqdm 13 | from openai import AsyncOpenAI, APIStatusError 14 | 15 | 16 | class OpenAIClient: 17 | def __init__( 18 | self, 19 | output_path: str, 20 | api_model: str, 21 | api_key: Union[str, List[str]], 22 | api_base: Optional[Union[str, List[str]]] = None, 23 | api_proxy: Optional[Union[str, List[str]]] = None, 24 | api_timeout: Optional[float] = 60.0, 25 | api_max_retries: Optional[int] = 5, 26 | generation_config: Optional[Dict] = None, 27 | max_error_count: Optional[int] = 100, 28 | trial_run=False, 29 | dump_individual_rsp=True, 30 | ): 31 | self.output_path = output_path 32 | self.trial_run = trial_run 33 | self.max_error_count = max_error_count 34 | self.total_errors = 0 35 | self.logger = logging.getLogger(__name__) 36 | 37 | if isinstance(api_key, str): 38 | api_key = [api_key] 39 | 40 | if api_base is None: 41 | api_base = ["https://api.openai.com/v1"] * len(api_key) 42 | elif isinstance(api_base, str): 43 | api_base = [api_base] * len(api_key) 44 | 45 | assert len(api_key) == len( 46 | api_base 47 | ), "Number of api_key and api_base must match" 48 | 49 | if api_proxy is not None: 50 | if isinstance(api_proxy, str): 51 | api_proxy = [api_proxy] * len(api_key) 52 | else: 53 | assert len(api_proxy) == len( 54 | api_key 55 | ), "Number of api_key and api_proxy must match" 56 | self.clients = [ 57 | AsyncOpenAI( 58 | api_key=key, 59 | base_url=api_base, 60 | timeout=api_timeout, 61 | max_retries=api_max_retries, 62 | ) 63 | for key, api_base, proxy in zip( 64 | api_key, api_base, api_proxy 65 | ) 66 | ] 67 | else: 68 | self.clients = [ 69 | AsyncOpenAI( 70 | api_key=key, 71 | base_url=api_base, 72 | timeout=api_timeout, 73 | max_retries=api_max_retries, 74 | ) 75 | for key, api_base in zip(api_key, api_base) 76 | ] 77 | 78 | self.model 
= api_model 79 | 80 | self.response_queue = asyncio.Queue() 81 | self.dump_individual_rsp = dump_individual_rsp 82 | 83 | if generation_config is None: 84 | self.generation_config = { 85 | "frequency_penalty": 0, 86 | "max_tokens": 100, 87 | "n": 1, 88 | "presence_penalty": 0, 89 | "response_format": {"type": "text"}, 90 | "seed": 42, 91 | "stream": False, 92 | "temperature": 0.0, 93 | } 94 | else: 95 | self.generation_config = generation_config 96 | 97 | if dump_individual_rsp: 98 | os.makedirs(os.path.join(self.output_path, "responses"), exist_ok=True) 99 | 100 | async def query( 101 | self, 102 | request, 103 | num_retries=3, 104 | ): 105 | if isinstance(request, dict): 106 | request_dict = request 107 | else: 108 | request_dict = request.__dict__ 109 | 110 | assert "messages" in request_dict, "messages must be provided in request" 111 | assert "uuid" in request_dict, "uuid must be provided in request" 112 | 113 | if self.dump_individual_rsp: 114 | save_path = os.path.join( 115 | self.output_path, "responses", f'{request_dict["uuid"]}.json' 116 | ) 117 | 118 | if os.path.exists(save_path) and not self.trial_run: 119 | with codecs.open(save_path) as f: 120 | rsp_content = json.load(f) 121 | await self.response_queue.put(rsp_content) 122 | return f"Skipping {save_path}" 123 | 124 | if "generation_config" in request_dict and isinstance( 125 | request_dict["generation_config"], dict 126 | ): 127 | generation_config = self.generation_config.copy() 128 | generation_config.update(request_dict["generation_config"]) 129 | else: 130 | generation_config = self.generation_config 131 | 132 | response = None 133 | while num_retries > 0: 134 | num_retries -= 1 135 | try: 136 | client = choice(self.clients) 137 | response = await client.chat.completions.create( 138 | messages=request_dict["messages"], 139 | model=self.model, 140 | **generation_config, 141 | ) 142 | response = response.model_dump() 143 | break 144 | except APIStatusError as e: 145 | if self.max_error_count > self.total_errors: 146 | self.total_errors += 1 147 | self.logger.warning( 148 | f"OpenAI APIStatusError: {e}, total errors: {self.total_errors}, sleeping..." 149 | ) 150 | await asyncio.sleep(1.0) 151 | else: 152 | self.logger.error( 153 | f"OpenAI APIStatusError: {e}, max_error_count reached, exiting..." 
154 | ) 155 | raise e 156 | except: 157 | print(traceback.format_exc()) 158 | 159 | if response is None: 160 | raise Exception("Empty response from remote OpenAI API") 161 | 162 | try: 163 | response["generated_text"] = response["choices"][0]["message"]["content"] 164 | except: 165 | print(traceback.format_exc()) 166 | print(response) 167 | raise Exception("Empty response from remote OpenAI API") 168 | 169 | if self.dump_individual_rsp: 170 | with codecs.open(save_path, "w") as f: 171 | json.dump( 172 | {"request": request_dict, "response": response}, 173 | f, 174 | ensure_ascii=False, 175 | indent=2, 176 | ) 177 | 178 | await self.response_queue.put({"request": request_dict, "response": response}) 179 | 180 | return response["choices"][0]["message"]["content"] 181 | 182 | async def write_responses_to_file(self): 183 | save_path = os.path.join(self.output_path, "all_responses.jsonl") 184 | while True: 185 | response = await self.response_queue.get() 186 | with codecs.open(save_path, "a", encoding="utf-8") as f: 187 | f.write(json.dumps(response, ensure_ascii=False) + "\n") 188 | self.response_queue.task_done() 189 | 190 | 191 | class AsyncRateLimitThreadPool: 192 | def __init__(self, num_workers, num_requests, period): 193 | self.num_workers = num_workers 194 | self.num_requests = num_requests 195 | self.loop = asyncio.get_event_loop() 196 | self.semaphore = asyncio.Semaphore(num_workers) 197 | self.last_call_time = time.time() 198 | self.call_count = 0 199 | self.period = period 200 | 201 | async def __aenter__(self): 202 | return self 203 | 204 | async def __aexit__(self, exc_type, exc, tb): 205 | pass 206 | 207 | async def _rate_limited_call(self, func, *args, **kwargs): 208 | # Limit the number of calls to func per minute 209 | elapsed_time = time.time() - self.last_call_time 210 | if elapsed_time < self.period: 211 | self.call_count += 1 212 | if self.call_count > self.num_requests: 213 | sleep_time = self.period - elapsed_time 214 | # logging.info("Sleeping for {} seconds".format(sleep_time)) 215 | await asyncio.sleep(sleep_time) 216 | self.call_count = 0 217 | self.last_call_time = time.time() 218 | 219 | # Acquire a semaphore permit before calling func 220 | async with self.semaphore: 221 | result = await func(*args, **kwargs) 222 | 223 | return result 224 | 225 | async def map(self, func, *args_list): 226 | coroutines = [self._rate_limited_call(func, *args) for args in zip(*args_list)] 227 | 228 | # Use tqdm progress bar with coroutines 229 | results = [] 230 | for coroutine in atqdm.as_completed(coroutines): 231 | result = await coroutine 232 | results.append(result) 233 | 234 | return results 235 | 236 | 237 | async def run_pool(api, requests, num_workers, num_requests, period): 238 | pool = AsyncRateLimitThreadPool(num_workers, num_requests, period) 239 | writer_task = asyncio.create_task(api.write_responses_to_file()) 240 | 241 | results = await pool.map(api.query, requests) 242 | await api.response_queue.join() # Ensure all responses are written 243 | writer_task.cancel() 244 | 245 | return results 246 | 247 | 248 | def run_api_inference( 249 | requests: Union[ 250 | List[Dict], Any 251 | ], # can List[Dict] or list of any object with __dict__ attribute 252 | output_path: str, # path to save responses 253 | api_model: str, # openai model name 254 | api_key: Union[str, List[str]], 255 | api_base: Optional[Union[str, List[str]]] = None, 256 | api_proxy: Optional[Union[str, List[str]]] = None, 257 | api_timeout: Optional[float] = 30.0, 258 | api_max_retries: Optional[int] = 5, 
259 | generation_config: Optional[Dict] = None, 260 | num_workers: Optional[int] = 8, 261 | request_limit: Optional[int] = 100, 262 | request_limit_period: Optional[int] = 60, 263 | max_error_count: Optional[int] = 100, 264 | trial_run=False, 265 | dump_individual_rsp=True, 266 | ): 267 | logging.getLogger(__name__).info( 268 | f"num_requests: {len(requests)}, output_path: {output_path}" 269 | ) 270 | logging.getLogger("httpx").setLevel(logging.WARNING) 271 | 272 | os.makedirs(output_path, exist_ok=True) 273 | 274 | if dump_individual_rsp: 275 | os.makedirs(os.path.join(output_path, "responses"), exist_ok=True) 276 | 277 | if os.path.exists(os.path.join(output_path, "all_responses.jsonl")): 278 | os.remove(os.path.join(output_path, "all_responses.jsonl")) 279 | 280 | client = OpenAIClient( 281 | output_path=output_path, 282 | api_model=api_model, 283 | api_key=api_key, 284 | api_base=api_base, 285 | api_proxy=api_proxy, 286 | api_timeout=api_timeout, 287 | api_max_retries=api_max_retries, 288 | generation_config=generation_config, 289 | trial_run=trial_run, 290 | dump_individual_rsp=dump_individual_rsp, 291 | ) 292 | 293 | try: 294 | asyncio.run( 295 | run_pool( 296 | client, 297 | requests, 298 | num_workers=num_workers, 299 | num_requests=request_limit, 300 | period=request_limit_period, 301 | ) 302 | ) 303 | except KeyboardInterrupt: 304 | logging.getLogger(__name__).info("Interrupt received! Closing...") 305 | -------------------------------------------------------------------------------- /docs/PROJECT_DOCS.md: -------------------------------------------------------------------------------- 1 | # RewardAnything Project Documentation 2 | 3 | ## Overview 4 | 5 | RewardAnything is a revolutionary reward modeling framework that enables models to understand and follow explicit natural language principles instead of learning implicit preferences from fixed datasets. This enables dynamic adaptation to diverse evaluation criteria without costly retraining. 6 | 7 | ## Project Structure 8 | 9 | ``` 10 | rewardanything/ 11 | ├── __init__.py # Package initialization 12 | ├── models.py # Data models and result classes 13 | ├── local.py # Local inference implementation 14 | ├── client.py # Remote client implementation 15 | ├── serve.py # FastAPI server implementation 16 | ├── cli.py # Command-line interface 17 | ├── utils.py # Utility functions (OpenAI client, rate limiting) 18 | └── benchmarks.py # Benchmark evaluation tools (optional) 19 | 20 | configs/ 21 | └── server_config.json # Example server configuration 22 | 23 | docs/ 24 | ├── PROJECT_DOCS.md # This file 25 | ├── API_REFERENCE.md # Detailed API documentation 26 | └── DEPLOYMENT_GUIDE.md # Production deployment guide 27 | 28 | tests/ 29 | ├── test_local.py # Local inference tests 30 | ├── test_client.py # Remote client tests 31 | └── test_server.py # Server functionality tests 32 | 33 | examples/ 34 | ├── basic_usage.py # Basic usage examples 35 | ├── batch_evaluation.py # Batch processing examples 36 | └── custom_principles.py # Advanced principle examples 37 | ``` 38 | 39 | ## Core Components 40 | 41 | ### 1. 
Local Inference (`local.py`) 42 | 43 | The local inference module provides direct model loading and evaluation: 44 | 45 | ```python 46 | import rewardanything 47 | 48 | # Load model locally 49 | reward_model = rewardanything.from_pretrained( 50 | "RewardAnything/RewardAnything-8B", 51 | device="cuda", 52 | torch_dtype="auto" 53 | ) 54 | 55 | # Evaluate responses 56 | result = reward_model.judge( 57 | principle="Prefer concise, accurate responses", 58 | prompt="What is Python?", 59 | responses={ 60 | "model_a": "Python is a programming language...", 61 | "model_b": "Python is a snake." 62 | } 63 | ) 64 | ``` 65 | 66 | **Key Features:** 67 | - Direct model loading from HuggingFace 68 | - GPU/CPU support with automatic device detection 69 | - Batch processing capabilities 70 | - Customizable generation parameters 71 | - Response masking to prevent bias 72 | 73 | ### 2. Remote Client (`client.py`) 74 | 75 | The remote client enables interaction with RewardAnything servers: 76 | 77 | ```python 78 | import rewardanything 79 | 80 | # Connect to server 81 | client = rewardanything.Client( 82 | base_url="http://localhost:8000", 83 | api_key="your-api-key", # Optional 84 | timeout=30.0 85 | ) 86 | 87 | # Same API as local inference 88 | result = client.judge( 89 | principle="Prioritize safety and helpfulness", 90 | prompt="How to learn programming?", 91 | responses=responses 92 | ) 93 | ``` 94 | 95 | **Key Features:** 96 | - HTTP-based communication 97 | - Automatic retry with exponential backoff 98 | - Authentication support 99 | - Batch processing 100 | - Health check capabilities 101 | 102 | ### 3. Server Implementation (`serve.py`) 103 | 104 | The server provides a FastAPI-based REST API for RewardAnything: 105 | 106 | ```bash 107 | # Start server 108 | rewardanything-serve -c configs/server_config.json --port 8000 109 | ``` 110 | 111 | **API Endpoints:** 112 | - `POST /api/rewardanything` - Single evaluation 113 | - `POST /api/rewardanything_batch` - Batch evaluation 114 | - `POST /api/new_batch_request` - Async batch processing 115 | - `GET /api/fetch_results/{batch_id}` - Retrieve batch results 116 | - `GET /health` - Health check 117 | 118 | ### 4. Data Models (`models.py`) 119 | 120 | Core data structures for the framework: 121 | 122 | ```python 123 | @dataclass 124 | class RewardResult: 125 | reasoning: str # Model's reasoning process 126 | scores: Dict[str, float] # Model scores (1-5) 127 | ranking: List[str] # Best to worst ranking 128 | raw_output: Optional[str] = None # Raw model output 129 | 130 | class RewardRequest(BaseModel): 131 | principle: str # Evaluation principle 132 | prompt: str # Input prompt 133 | responses: Dict[str, str] # Model responses 134 | mask_responses: bool = True # Whether to mask model names 135 | ``` 136 | 137 | ## Installation and Setup 138 | 139 | ### Basic Installation 140 | 141 | ```bash 142 | pip install RewardAnything 143 | ``` 144 | 145 | ### Development Installation 146 | 147 | ```bash 148 | git clone https://github.com/zhuohaoyu/RewardAnything.git 149 | cd RewardAnything 150 | pip install -e ".[dev]" 151 | ``` 152 | 153 | ### Server Installation 154 | 155 | ```bash 156 | pip install "RewardAnything[server]" 157 | ``` 158 | 159 | ### Full Installation 160 | 161 | ```bash 162 | pip install "RewardAnything[all]" 163 | ``` 164 | 165 | ## Usage Patterns 166 | 167 | ### 1. 
Research and Experimentation 168 | 169 | For research use cases, local inference is recommended: 170 | 171 | ```python 172 | import rewardanything 173 | 174 | # Load model with specific configuration 175 | model = rewardanything.from_pretrained( 176 | "RewardAnything/RewardAnything-8B", 177 | device="cuda", 178 | torch_dtype="bfloat16", 179 | generation_config={ 180 | "temperature": 0.1, 181 | "max_new_tokens": 2048 182 | } 183 | ) 184 | 185 | # Evaluate with complex principles 186 | principle = """ 187 | Evaluate responses based on: 188 | 1. Factual accuracy (50% weight) 189 | 2. Clarity and structure (30% weight) 190 | 3. Engagement and tone (20% weight) 191 | """ 192 | 193 | result = model.judge(principle, prompt, responses) 194 | ``` 195 | 196 | ### 2. Production Deployment 197 | 198 | For production use cases, use the server: 199 | 200 | ```bash 201 | # Start server 202 | rewardanything-serve -c production_config.json --port 8000 203 | 204 | # Scale with load balancer and multiple instances 205 | # Use Docker for containerization 206 | ``` 207 | 208 | ```python 209 | # Client usage in production 210 | client = rewardanything.Client("https://api.yourservice.com/v1") 211 | results = client.judge_batch(evaluation_requests) 212 | ``` 213 | 214 | ### 3. RLHF Integration 215 | 216 | Integration with reinforcement learning from human feedback: 217 | 218 | ```python 219 | def reward_function(prompt, response): 220 | principle = "Reward helpful, harmless, and honest responses" 221 | result = reward_model.judge( 222 | principle=principle, 223 | prompt=prompt, 224 | responses={"candidate": response} 225 | ) 226 | return result.scores["candidate"] 227 | 228 | # Use in PPO/GRPO training loops 229 | ``` 230 | 231 | ## Configuration 232 | 233 | ### Local Model Configuration 234 | 235 | ```python 236 | model = rewardanything.from_pretrained( 237 | model_name_or_path="RewardAnything/RewardAnything-8B", 238 | device="cuda", # Device placement 239 | torch_dtype="auto", # Automatic dtype selection 240 | trust_remote_code=True, # Trust remote code 241 | generation_config={ # Generation parameters 242 | "max_new_tokens": 2048, 243 | "temperature": 0.1, 244 | "do_sample": True, 245 | "top_p": 0.9 246 | } 247 | ) 248 | ``` 249 | 250 | ### Server Configuration 251 | 252 | ```json 253 | { 254 | "api_model": "gpt-4-turbo-preview", 255 | "api_key": ["key1", "key2"], 256 | "api_base": ["https://api.openai.com/v1"], 257 | "generation_config": { 258 | "max_tokens": 2048, 259 | "temperature": 0.1, 260 | "frequency_penalty": 0, 261 | "presence_penalty": 0 262 | }, 263 | "num_workers": 8, 264 | "request_limit": 100, 265 | "request_limit_period": 60 266 | } 267 | ``` 268 | 269 | ## Advanced Features 270 | 271 | ### Response Masking 272 | 273 | RewardAnything automatically masks model names during evaluation to prevent bias: 274 | 275 | ```python 276 | result = model.judge( 277 | principle="Judge based on helpfulness", 278 | prompt="How to cook pasta?", 279 | responses={ 280 | "gpt4": "Boil water, add pasta, cook for 8-10 minutes...", 281 | "claude": "Start by bringing a large pot of salted water to boil..." 
282 | }, 283 | mask_responses=True # Default: True 284 | ) 285 | # Model sees "model-1", "model-2" instead of "gpt4", "claude" 286 | ``` 287 | 288 | ### Batch Processing 289 | 290 | ```python 291 | # Local batch processing 292 | requests = [ 293 | { 294 | "principle": "Prefer technical accuracy", 295 | "prompt": "Explain machine learning", 296 | "responses": {...} 297 | }, 298 | { 299 | "principle": "Favor practical examples", 300 | "prompt": "How to debug code?", 301 | "responses": {...} 302 | } 303 | ] 304 | 305 | results = model.judge_batch(requests, batch_size=4) 306 | 307 | # Remote batch processing 308 | results = client.judge_batch(requests) 309 | ``` 310 | 311 | ### Custom Principles 312 | 313 | RewardAnything excels with sophisticated, multi-criteria principles: 314 | 315 | ```python 316 | complex_principle = """ 317 | Evaluate responses using these criteria: 318 | 319 | 1. **Technical Accuracy** (40%): 320 | - Factual correctness 321 | - Up-to-date information 322 | - Proper terminology 323 | 324 | 2. **Clarity** (30%): 325 | - Clear explanations 326 | - Logical structure 327 | - Appropriate detail level 328 | 329 | 3. **Practical Value** (20%): 330 | - Actionable advice 331 | - Real-world applicability 332 | - Concrete examples 333 | 334 | 4. **Safety** (10%): 335 | - No harmful content 336 | - Appropriate disclaimers 337 | - Ethical considerations 338 | 339 | For conflicting criteria, prioritize safety > accuracy > clarity > practical value. 340 | """ 341 | 342 | result = model.judge(complex_principle, prompt, responses) 343 | ``` 344 | 345 | ## Testing 346 | 347 | Run the test suite: 348 | 349 | ```bash 350 | # All tests 351 | pytest 352 | 353 | # Specific test modules 354 | pytest tests/test_local.py -v 355 | pytest tests/test_client.py -v 356 | pytest tests/test_server.py -v 357 | 358 | # With coverage 359 | pytest --cov=rewardanything tests/ 360 | ``` 361 | 362 | ## Contributing 363 | 364 | See [CONTRIBUTING.md](../CONTRIBUTING.md) for development guidelines. 365 | 366 | ### Development Workflow 367 | 368 | 1. Fork the repository 369 | 2. Create a feature branch 370 | 3. Make changes with tests 371 | 4. Run tests and linting 372 | 5. Submit a pull request 373 | 374 | ```bash 375 | # Development setup 376 | git clone https://github.com/your-username/RewardAnything.git 377 | cd RewardAnything 378 | pip install -e ".[dev]" 379 | 380 | # Pre-commit hooks 381 | pre-commit install 382 | 383 | # Run tests 384 | pytest 385 | 386 | # Code formatting 387 | black rewardanything/ 388 | isort rewardanything/ 389 | 390 | # Type checking 391 | mypy rewardanything/ 392 | ``` 393 | 394 | ## Troubleshooting 395 | 396 | ### Common Issues 397 | 398 | 1. **CUDA Out of Memory** 399 | ```python 400 | # Use smaller model or CPU 401 | model = rewardanything.from_pretrained( 402 | "RewardAnything/RewardAnything-1B", # Smaller model 403 | device="cpu" # Or use CPU 404 | ) 405 | ``` 406 | 407 | 2. **Server Connection Issues** 408 | ```python 409 | # Check server health 410 | client = rewardanything.Client("http://localhost:8000") 411 | if not client.health_check(): 412 | print("Server is not responding") 413 | ``` 414 | 415 | 3. **Rate Limiting** 416 | ```python 417 | # Adjust client timeout and retries 418 | client = rewardanything.Client( 419 | base_url="http://localhost:8000", 420 | timeout=120.0, # Longer timeout 421 | max_retries=5 # More retries 422 | ) 423 | ``` 424 | 425 | ### Performance Optimization 426 | 427 | 1. 
**Use appropriate hardware** 428 | - GPU with sufficient VRAM for local inference 429 | - Multiple workers for server deployment 430 | 431 | 2. **Batch processing** 432 | - Use batch methods for multiple evaluations 433 | - Adjust batch size based on available memory 434 | 435 | 3. **Caching** 436 | - Server automatically caches responses 437 | - Use consistent request IDs for cache hits 438 | 439 | ## License 440 | 441 | Apache 2.0 License - see [LICENSE](../LICENSE) for details. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
6 | 31 | 1Peking University 32 | 2WeChat AI 33 | 3William & Mary 34 | 4Westlake University 35 |
36 |§Work done during Zhuohao's internship at Pattern Recognition Center, WeChat AI, Tencent Inc; †Corresponding author.
37 |
15 | 24 | Traditional reward models learn implicit preferences from fixed data, 25 | but human values are too nuanced for any single, static model. 26 |
27 |28 | We believe reward models, much like LLMs with instructions, must follow 29 | explicitly specified principles. 30 | This unlocks inference-time adaptability to diverse criteria—without costly retraining. 31 |
32 |83 | §Work done during internship at WeChat AI †Corresponding author 84 |
85 |95 | Current reward models face fundamental limitations in how they are trained and evaluated, hindering their ability to truly align with diverse human values. 96 |
97 |103 | Reward models are typically trained on vast datasets of (prompt, chosen response, rejected response) tuples. This teaches the model a single, implicit preference distribution. 104 |
105 |Even if the prompt and responses are identical, applying different evaluation principles (e.g., "be concise" vs. "be detailed") should lead to different rankings. Current RMs struggle to adapt this way without costly retraining for each new principle.
109 |Models learn what to prefer based on outcomes, but not why. This lack of explicit rationale leads to learning superficial patterns or spurious correlations (e.g., "longer is better," "lists are good") rather than the true underlying human intent, as shown below.
113 |Models learn "longer responses are better" from pairs where correctness correlates with length.
130 |"What are some species of bears that are now extinct?"
137 |145 | "Several species of bears have become extinct... Cave Bear (Ursus spelaeus): One of the best-known extinct bear species... Short-faced Bear (Arctodus simus): Once the largest..." 146 |
147 |157 | "Three species of bears that are now extinct are the woolly mammoth, the woolly rhinoceros, and the thylacine." 158 |
159 |A spurious correlation: "Longer responses are better." This preference is static, but what if the user actually preferred a brief, accurate answer?
168 |Models often prioritize familiar structures (e.g., lists) over equally valid, natural content.
183 |"What are some good browser alternatives to Chrome?"
190 |
198 | "There are several good browser alternatives to Chrome:
199 |
1. Mozilla Firefox: Known for strong privacy features, being open-source, and highly customizable.
200 |
2. Microsoft Edge: Now built on Chromium, offering good performance and compatibility."
201 |
212 | "Sure! For browser alternatives, you could check out Firefox – it's really good for privacy and you can customize it a lot. Microsoft Edge is another option; it's pretty fast now that it uses Chromium tech." 213 |
214 |"Structured, list-like responses are better." This overlooks that a natural, conversational style might be equally informative or even preferred by some users.
223 |232 | Existing Reward Model benchmarks primarily measure how well an RM aligns with a single, predefined preference distribution (often the one it was trained on or a similar one). 233 |
234 |Human preferences are complex, context-dependent, and multifaceted. A truly useful RM must adapt to any explicitly stated principle, not just echo a single, baked-in preference.
238 |This narrow evaluation fails to assess the critical capability of generalizing to diverse and novel principles at inference time, which is essential for robust and trustworthy AI systems.
242 |These fundamental issues in training and evaluation lead to several critical shortcomings:
250 | 251 |RMs master a single, fixed preference from training data, failing to grasp the multifaceted nature of human values or adapt to diverse contexts.
261 |Learning from outcomes alone (chosen/rejected pairs), RMs lack an explicit understanding of *why* a response is preferred, making their judgments uninterpretable black boxes.
272 |Implicit learning on biased data leads RMs to mistakenly learn superficial cues (e.g., length, format, specific keywords) as proxies for genuine quality.
283 |Due to overfit, static preferences and opaque reasoning, aligning RMs with new criteria or principles demands expensive data collection and full retraining cycles.
294 |306 | To overcome these limitations, we propose a paradigm shift towards reward models that explicitly understand and follow natural language principles. This approach enables dynamic adaptation to any evaluation criteria without costly retraining and is embodied by two key innovations: 307 |
308 |318 | Current benchmarks assess how well RMs fit a single, fixed preference. This is insufficient. We argue that, analogous to how Large Language Models (LLMs) are valued for their ability to follow diverse instructions, reward models must be evaluated on their capacity to follow diverse principles. 319 |
320 |321 | To this end, we introduce RABench (RewardAnything Benchmark). It is a comprehensive benchmark meticulously designed to assess the principle-following capabilities of RMs across various domains (chat, code, safety, math) and a wide array of explicit natural language criteria. 322 |
323 |324 | RABench moves beyond static preference matching, pushing for RMs that demonstrate true generalization in understanding and applying "goodness" based on varied, explicit guidance. 325 |
326 |335 | We develop RewardAnything, a novel reward model engineered to embody this principle-following paradigm. 336 |
337 |338 | Trained using advanced Reinforcement Learning (RL) techniques on principle-conditioned preference data, RewardAnything learns to robustly distinguish better responses from worse ones by directly conditioning on explicit natural language principles provided at inference time. This allows it to adapt its judgment dynamically without any retraining. 339 |
340 |341 | A key feature is its inference-time reasoning process. RewardAnything not only scores responses according to the given principle but can also articulate an explanation for its judgment, enhancing transparency and trustworthiness. 342 |
343 |
352 | 364 | For a comprehensive understanding of our methodology, technical innovations, detailed model architecture, training procedures, and full experimental setup, please refer to our full research paper. The paper provides an in-depth exploration of the concepts presented here. 365 |
366 | 368 | 📄 Read the Full Paper 369 | 370 |379 | RewardAnything offers three flexible deployment options to fit your workflow, from quick experimentation to production-scale evaluation. 380 |
381 |pip install rewardanything
389 | Perfect for quick experimentation and research
408 | 409 | 410 |Optimized for high-throughput and production
455 | 456 | 457 |Maximum flexibility for custom workflows
502 | 503 | 504 |Simple setup for quick testing and research
546 |import rewardanything
550 |
551 | # Load model locally (similar to HuggingFace)
552 | reward_model = rewardanything.from_pretrained("zhuohaoyu/RewardAnything-8B-v1", device="cuda")
553 |
554 | # Get comprehensive evaluation
555 | result = reward_model.judge(
556 | principle="I prefer clear, concise and helpful responses over long and detailed ones.",
557 | prompt="How do I learn Python programming effectively?",
558 | responses={ # responses keyed by name; keys are masked and shuffled before being passed to RewardAnything to prevent bias
559 | "response_a": "Start with Python.org\\'s tutorial, practice daily with small projects, and join r/learnpython for help. Focus on fundamentals first.",
560 | "response_b": "Here\\'s a comprehensive approach: 1) Start with Python basics including variables, data types, operators, control structures like if-statements, for-loops, while-loops, and functions, 2) Practice with small projects like calculators, text games, and data manipulation scripts, 3) Use interactive platforms like Codecademy, Python.org\\'s official tutorial, edX courses, Coursera specializations, and YouTube channels, 4) Join communities like r/learnpython, Stack Overflow, Python Discord servers, and local meetups for support and networking, 5) Build progressively complex projects including web scrapers, APIs, data analysis tools, and web applications, 6) Read books like \\'Automate the Boring Stuff\\', \\'Python Crash Course\\', and \\'Effective Python\\', 7) Dedicate 1-2 hours daily for consistent progress and track your learning journey.",
561 | "response_c": "Learn Python by coding."
562 | }
563 | )
564 |
565 | # Access results
566 | print(f"Scores: {result.scores}")
567 | print(f"Ranking: {result.ranking}")
568 | print(f"Reasoning: {result.reasoning}")
569 | 695 | Unlock the full potential of RewardAnything by leveraging sophisticated principles and seamlessly integrating it into your RLHF workflows. 696 |
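If you are running the RewardAnything server rather than loading the model locally, the same call goes through the remote `rewardanything.Client`. Below is a minimal sketch, assuming a server is already listening at a hypothetical `http://localhost:8000` and using illustrative response texts:

```python
import rewardanything

# Connect to a running RewardAnything server (address is an assumption; adjust to your deployment)
client = rewardanything.Client("http://localhost:8000")

# The client exposes the same judge() interface as the locally loaded model
result = client.judge(
    principle="I prefer clear, concise and helpful responses over long and detailed ones.",
    prompt="How do I learn Python programming effectively?",
    responses={
        "response_a": "Start with Python.org's tutorial and practice daily.",  # illustrative text
        "response_b": "Read every book you can find before writing any code.",  # illustrative text
    },
)

print(result.scores)   # per-response scores
print(result.ranking)  # ordered best to worst
```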
697 |707 | RewardAnything excels when provided with clear, structured principles, especially for nuanced tasks involving multiple, potentially conflicting objectives. Define criteria, assign weights (e.g., via textual emphasis or explicit percentages), and specify priorities to guide the model's judgment effectively. This allows for fine-grained control over the evaluation process. 708 |
709 |# Define a detailed, multi-faceted principle
711 | complex_principle = """
712 | Safety comes first but also be sure not to encourage
713 | overly sensitive rejections for safe or benignly
714 | borderline queries. Next, equally value warmth,
715 | appropriate humor (to deflect borderline harm),
716 | and genuine helpfulness. Remember, content and tone
717 | are more important than presentation style.
718 |
719 | """
720 |
721 | # Assume 'reward_model' is initialized
722 | # prompt = "Your specific prompt here"
723 | # responses = {"res_a": "...", "res_b": "..."}
724 | result = reward_model.judge(
725 | principle=complex_principle,
726 | prompt=prompt,
727 | responses=responses
728 | )
729 | 739 | Seamlessly integrate RewardAnything into your Reinforcement Learning from Human Feedback (RLHF) pipelines. It can serve as a dynamic, principle-driven reward function. RewardAnything is compatible with popular RL frameworks (e.g., TRL, veRL), allowing you to guide model generation based on explicit criteria rather than static preferences. 740 |
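Once the call returns, the result can be inspected to confirm that the principle actually drove the judgment. A small illustrative snippet over the `scores`, `ranking`, and `reasoning` fields used throughout this page:

```python
# ranking is ordered from best to worst; scores are keyed by response name
best_key = result.ranking[0]
print(f"Preferred under this principle: {best_key} (score {result.scores[best_key]})")

# The natural-language justification makes the judgment auditable
print(result.reasoning)
```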
741 |742 | Detailed integration examples and best practices can be found in our official repository. 743 |
744 |# Example: Use in a PPO-style training loop
746 | # Assume 'reward_model' is initialized
747 | # principle = "Your guiding principle"
748 | # prompt = "The input prompt"
749 |
750 | def reward_function(principle, prompt, response_text):
751 | eval_responses = {"generated": response_text}
752 | result = reward_model.judge(
753 | principle=principle,
754 | prompt=prompt,
755 | responses=eval_responses
756 | )
757 | return result.scores.get("generated", 0.0)
758 |
759 | # generated_responses = ["response1", "response2", ...]
760 | rewards = [reward_function(principle, prompt, resp)
761 | for resp in generated_responses]
762 | 774 | RewardAnything achieves excellent performance on both traditional benchmarks and our new principle-following evaluation. Below are highlights from RM-Bench and our proposed RABench. For full details, additional benchmarks, and ablation studies, please see our paper. 775 |
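For larger rollout batches, the per-response loop above can be folded into a single batched call. A minimal sketch, assuming `judge_batch` accepts the same request dictionaries shown in the batch-processing examples:

```python
# Batched variant of the reward function: score a group of rollouts in one call
def batch_reward_function(principle, prompt, response_texts):
    requests = [
        {"principle": principle, "prompt": prompt, "responses": {"generated": text}}
        for text in response_texts
    ]
    results = reward_model.judge_batch(requests)
    return [r.scores.get("generated", 0.0) for r in results]

# rewards = batch_reward_function(principle, prompt, generated_responses)
```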
776 |
784 |
791 | 802 | RewardAnything introduces novel techniques for principle-following reward modeling 803 |
804 |Advanced RL training that learns relative preferences within response groups
818 |Efficient ranking of multiple responses in a single forward pass
829 |Explicit reasoning process for transparent decision making
840 |Ground truth from 4 state-of-the-art LLMs with algorithmic consensus
853 |89% agreement rate with κ=0.57 for reliable evaluation standards
864 |873 | We introduce RABench, a comprehensive benchmark specifically designed to evaluate reward models' 874 | ability to follow explicit natural language principles across diverse domains and criteria. 875 |
876 |905 | Everything you need to understand and use RewardAnything for your research and applications 906 |
907 |Complete methodology, experiments, and theoretical foundations
914 | 915 | 916 | 917 |Comprehensive guide to using RewardAnything in your code
920 | 921 | 922 | 923 |Benchmark dataset for evaluating principle-following capabilities
926 | 927 | 928 | 929 |Pre-trained models ready for inference and fine-tuning
932 | 933 |943 | If you use RewardAnything in your research, please cite our paper 944 |
945 |@article{yu2025rewardanything,
950 | title={RewardAnything: Generalizable Principle-Following Reward Models},
951 | author={Yu, Zhuohao and Zeng, Jiali and Gu, Weizheng and Wang, Yidong and
952 | Wang, Jindong and Meng, Fandong and Zhou, Jie and Zhang, Yue and
953 | Zhang, Shikun and Ye, Wei},
954 | journal={arXiv preprint arXiv:2506.03637},
955 | year={2025}
956 | }
957 |