├── .github
│   └── ISSUE_TEMPLATE
│       └── add-results-to-leaderboard.yml
├── .gitignore
├── README.md
├── agent_reward_bench
│   ├── __init__.py
│   ├── benchmarks
│   │   ├── __init__.py
│   │   └── base.py
│   ├── data
│   │   ├── annotations.csv
│   │   ├── assistantbench.csv
│   │   ├── complete_task_ids.csv
│   │   ├── splits.csv
│   │   ├── visualwebarena.csv
│   │   ├── visualwebarena.task_ids.json
│   │   ├── webarena.csv
│   │   ├── webarena.task_ids.json
│   │   ├── workarena.csv
│   │   └── workarena_l2.task_ids.json
│   ├── envs
│   │   ├── assistantbench
│   │   │   ├── __init__.py
│   │   │   ├── evaluation
│   │   │   │   ├── evaluate_utils
│   │   │   │   │   ├── evaluate_dicts.py
│   │   │   │   │   ├── evaluate_factory.py
│   │   │   │   │   ├── evaluate_numbers.py
│   │   │   │   │   ├── evaluate_strings.py
│   │   │   │   │   └── utils.py
│   │   │   │   └── evaluator.py
│   │   │   ├── register.py
│   │   │   ├── task.py
│   │   │   └── utils.py
│   │   ├── visualwebarena
│   │   │   ├── __init__.py
│   │   │   ├── config.py
│   │   │   ├── instance.py
│   │   │   ├── register.py
│   │   │   ├── task.py
│   │   │   └── utils.py
│   │   └── webarena_utils
│   │       ├── __init__.py
│   │       ├── config.py
│   │       ├── instance.py
│   │       └── task.py
│   ├── eval
│   │   ├── __init__.py
│   │   ├── metrics.py
│   │   └── utils.py
│   ├── judge
│   │   ├── __init__.py
│   │   ├── args.py
│   │   ├── defaults.py
│   │   ├── existing
│   │   │   ├── __init__.py
│   │   │   ├── aer.py
│   │   │   └── nnetnav.py
│   │   └── utils.py
│   ├── modeling
│   │   └── __init__.py
│   ├── processing
│   │   ├── filter_webarena.py
│   │   └── filter_workarena.py
│   ├── trajectories.py
│   ├── utils.py
│   └── version.py
├── apps
│   ├── README.md
│   ├── annotations.csv
│   ├── demo.py
│   ├── leaderboard.py
│   ├── requirements-demo.txt
│   └── results.csv
├── assets
│   └── primary.png
├── requirements.txt
├── scripts
│   ├── clean_processed_trajectories.py
│   ├── convert_trajectories_to_json.py
│   ├── convert_trajectories_to_tars.py
│   ├── run_agent.py
│   ├── run_judge.py
│   └── score_judgments.py
├── setup.py
└── vars
    ├── servicenow.sh
    └── set_envs.sh

/.github/ISSUE_TEMPLATE/add-results-to-leaderboard.yml: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/.github/ISSUE_TEMPLATE/add-results-to-leaderboard.yml
/.gitignore: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/.gitignore
/README.md: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/README.md
/agent_reward_bench/__init__.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/__init__.py
/agent_reward_bench/benchmarks/__init__.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/benchmarks/__init__.py
/agent_reward_bench/benchmarks/base.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/benchmarks/base.py
/agent_reward_bench/data/annotations.csv: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/annotations.csv
/agent_reward_bench/data/assistantbench.csv: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/assistantbench.csv
/agent_reward_bench/data/complete_task_ids.csv: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/complete_task_ids.csv
/agent_reward_bench/data/splits.csv: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/splits.csv
/agent_reward_bench/data/visualwebarena.csv: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/visualwebarena.csv
/agent_reward_bench/data/visualwebarena.task_ids.json: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/visualwebarena.task_ids.json
/agent_reward_bench/data/webarena.csv: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/webarena.csv
/agent_reward_bench/data/webarena.task_ids.json: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/webarena.task_ids.json
/agent_reward_bench/data/workarena.csv: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/workarena.csv
/agent_reward_bench/data/workarena_l2.task_ids.json: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/data/workarena_l2.task_ids.json
/agent_reward_bench/envs/assistantbench/__init__.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/__init__.py
/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/evaluate_dicts.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/evaluate_dicts.py
/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/evaluate_factory.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/evaluate_factory.py
/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/evaluate_numbers.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/evaluate_numbers.py
/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/evaluate_strings.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/evaluate_strings.py
/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/utils.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/evaluation/evaluate_utils/utils.py
/agent_reward_bench/envs/assistantbench/evaluation/evaluator.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/evaluation/evaluator.py
/agent_reward_bench/envs/assistantbench/register.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/register.py
/agent_reward_bench/envs/assistantbench/task.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/task.py
/agent_reward_bench/envs/assistantbench/utils.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/assistantbench/utils.py
/agent_reward_bench/envs/visualwebarena/__init__.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/visualwebarena/__init__.py
/agent_reward_bench/envs/visualwebarena/config.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/visualwebarena/config.py
/agent_reward_bench/envs/visualwebarena/instance.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/visualwebarena/instance.py
/agent_reward_bench/envs/visualwebarena/register.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/visualwebarena/register.py
/agent_reward_bench/envs/visualwebarena/task.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/visualwebarena/task.py
/agent_reward_bench/envs/visualwebarena/utils.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/visualwebarena/utils.py
/agent_reward_bench/envs/webarena_utils/__init__.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/webarena_utils/__init__.py
/agent_reward_bench/envs/webarena_utils/config.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/webarena_utils/config.py
/agent_reward_bench/envs/webarena_utils/instance.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/webarena_utils/instance.py
/agent_reward_bench/envs/webarena_utils/task.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/envs/webarena_utils/task.py
/agent_reward_bench/eval/__init__.py:
    from . import metrics, utils
/agent_reward_bench/eval/metrics.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/eval/metrics.py
/agent_reward_bench/eval/utils.py: (empty)
/agent_reward_bench/judge/__init__.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/judge/__init__.py
/agent_reward_bench/judge/args.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/judge/args.py
/agent_reward_bench/judge/defaults.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/judge/defaults.py
/agent_reward_bench/judge/existing/__init__.py:
    from . import aer, nnetnav
/agent_reward_bench/judge/existing/aer.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/judge/existing/aer.py
/agent_reward_bench/judge/existing/nnetnav.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/judge/existing/nnetnav.py
/agent_reward_bench/judge/utils.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/judge/utils.py
/agent_reward_bench/modeling/__init__.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/modeling/__init__.py
/agent_reward_bench/processing/filter_webarena.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/processing/filter_webarena.py
/agent_reward_bench/processing/filter_workarena.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/processing/filter_workarena.py
/agent_reward_bench/trajectories.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/trajectories.py
/agent_reward_bench/utils.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/agent_reward_bench/utils.py
/agent_reward_bench/version.py:
    __version__ = "0.1.2"
/apps/README.md: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/apps/README.md
/apps/annotations.csv: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/apps/annotations.csv
/apps/demo.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/apps/demo.py
/apps/leaderboard.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/apps/leaderboard.py
/apps/requirements-demo.txt:
    tqdm
    orjson
    Pillow
    pyparsing
/apps/results.csv: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/apps/results.csv
/assets/primary.png: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/assets/primary.png
/requirements.txt: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/requirements.txt
/scripts/clean_processed_trajectories.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/scripts/clean_processed_trajectories.py
/scripts/convert_trajectories_to_json.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/scripts/convert_trajectories_to_json.py
/scripts/convert_trajectories_to_tars.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/scripts/convert_trajectories_to_tars.py
/scripts/run_agent.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/scripts/run_agent.py
/scripts/run_judge.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/scripts/run_judge.py
/scripts/score_judgments.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/scripts/score_judgments.py
/setup.py: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/setup.py
/vars/servicenow.sh: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/vars/servicenow.sh
/vars/set_envs.sh: https://raw.githubusercontent.com/McGill-NLP/agent-reward-bench/HEAD/vars/set_envs.sh
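The only non-empty file contents reproduced inline in this index are the two package initializers (agent_reward_bench/eval/__init__.py and agent_reward_bench/judge/existing/__init__.py), agent_reward_bench/version.py, and apps/requirements-demo.txt. Below is a minimal usage sketch based on just those snippets; the editable install step (pip install -e . from the repo root against setup.py, with the packages from requirements.txt available) is an assumption and not something this listing shows, while every imported module name is taken directly from the files above.

    # Sketch only: assumes `pip install -e .` was run from the repo root (setup.py)
    # and that the dependencies pinned in requirements.txt are installed.
    from agent_reward_bench import version
    from agent_reward_bench.eval import metrics, utils          # re-exported by eval/__init__.py
    from agent_reward_bench.judge.existing import aer, nnetnav  # re-exported by existing/__init__.py

    print(version.__version__)  # "0.1.2", per agent_reward_bench/version.py

The apps/ directory carries its own lighter dependency list (tqdm, orjson, Pillow, pyparsing in apps/requirements-demo.txt), so the demo and leaderboard apps can presumably be set up separately from the main requirements.txt; apps/README.md is the place to check for how apps/demo.py and apps/leaderboard.py are meant to be run.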