├── CLAUDE.md
├── src
├── art
│   ├── py.typed
│   ├── unsloth
│   │   └── __init__.py
│   ├── preprocessing
│   │   └── __init__.py
│   ├── torchtune
│   │   ├── __init__.py
│   │   ├── batch.py
│   │   └── train.sh
│   ├── transformers
│   │   ├── __init__.py
│   │   └── patches.py
│   ├── local
│   │   ├── __init__.py
│   │   ├── service.py
│   │   └── checkpoints.py
│   ├── skypilot
│   │   ├── __init__.py
│   │   └── stop_server.py
│   ├── serverless
│   │   └── __init__.py
│   ├── rewards
│   │   └── __init__.py
│   ├── langgraph
│   │   ├── __init__.py
│   │   └── logging.py
│   ├── utils
│   │   ├── benchmarking
│   │   │   ├── charts
│   │   │   │   └── __init__.py
│   │   │   ├── types.py
│   │   │   ├── filter_model_split.py
│   │   │   └── pull_model_trajectories.py
│   │   ├── old_benchmarking
│   │   │   ├── display_image_grid.py
│   │   │   ├── calculate_step_metrics.py
│   │   │   ├── generate_comparison_table.py
│   │   │   └── types.py
│   │   ├── __init__.py
│   │   ├── get_repo_root_path.py
│   │   ├── deployment
│   │   │   ├── legacy.py
│   │   │   └── __init__.py
│   │   ├── format_message.py
│   │   ├── limit_concurrency.py
│   │   ├── log_http_errors.py
│   │   ├── get_model_step.py
│   │   ├── benchmark_rollout.py
│   │   ├── logging.py
│   │   ├── output_dirs.py
│   │   ├── strip_logprobs.py
│   │   └── deploy_model.py
│   ├── mcp
│   │   ├── default_tools.py
│   │   └── __init__.py
│   ├── dev
│   │   ├── __init__.py
│   │   ├── train.py
│   │   └── torchtune.py
│   ├── types.py
│   ├── yield_trajectory.py
│   ├── vllm
│   │   └── __init__.py
│   ├── errors.py
│   ├── batches.py
│   └── __init__.py
└── mp_actors
│   ├── __init__.py
│   └── traceback.py
├── .python-version
├── dev
├── swebench
│   ├── __init__.py
│   ├── tools
│   │   ├── registry
│   │   │   ├── lib
│   │   │   │   ├── __init__.py
│   │   │   │   └── registry.py
│   │   │   ├── config.yaml
│   │   │   ├── install.sh
│   │   │   └── bin
│   │   │   │   ├── _write_env
│   │   │   │   └── _read_env
│   │   ├── review_on_submit_m
│   │   │   ├── install.sh
│   │   │   ├── README.md
│   │   │   ├── config.yaml
│   │   │   └── bin
│   │   │   │   └── submit
│   │   └── edit_anthropic
│   │   │   ├── install.sh
│   │   │   └── bin
│   │   │   └── _state_anthropic
│   ├── sandbox
│   │   ├── __init__.py
│   │   ├── daytona.py
│   │   ├── modal.py
│   │   └── new.py
│   ├── sandboxes.py
│   ├── run.py
│   ├── pyproject.toml
│   └── trl.ipynb
├── tau-bench
│   ├── .python-version
│   ├── tau_bench
│   │   ├── model_utils
│   │   │   ├── api
│   │   │   │   ├── __init__.py
│   │   │   │   ├── types.py
│   │   │   │   ├── _model_methods.py
│   │   │   │   ├── exception.py
│   │   │   │   └── logging.py
│   │   │   ├── model
│   │   │   │   ├── __init__.py
│   │   │   │   ├── exception.py
│   │   │   │   ├── vllm_utils.py
│   │   │   │   └── outlines_completion.py
│   │   │   ├── func_tools
│   │   │   │   ├── __init__.py
│   │   │   │   ├── filter.py
│   │   │   │   └── map.py
│   │   │   └── args.py
│   │   ├── agents
│   │   │   ├── __init__.py
│   │   │   └── base.py
│   │   ├── envs
│   │   │   ├── airline
│   │   │   │   ├── rules.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── wiki.py
│   │   │   │   ├── data
│   │   │   │   │   └── __init__.py
│   │   │   │   ├── tools
│   │   │   │   │   ├── think.py
│   │   │   │   │   ├── get_user_details.py
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── get_reservation_details.py
│   │   │   │   │   ├── transfer_to_human_agents.py
│   │   │   │   │   ├── calculate.py
│   │   │   │   │   ├── cancel_reservation.py
│   │   │   │   │   ├── list_all_airports.py
│   │   │   │   │   ├── send_certificate.py
│   │   │   │   │   ├── search_direct_flight.py
│   │   │   │   │   └── update_reservation_passengers.py
│   │   │   │   └── env.py
│   │   │   ├── retail
│   │   │   │   ├── __init__.py
│   │   │   │   ├── wiki.py
│   │   │   │   ├── data
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── readme.md
│   │   │   │   ├── rules.py
│   │   │   │   ├── tools
│   │   │   │   │   ├── get_user_details.py
│   │   │   │   │   ├── list_all_product_types.py
│   │   │   │   │   ├── get_order_details.py
│   │   │   │   │   ├── get_product_details.py
│   │   │   │   │   ├── find_user_id_by_email.py
│   │   │   │   │   ├── think.py
│   │   │   │   │   ├── transfer_to_human_agents.py
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── calculate.py
│   │   │   │   │   └── find_user_id_by_name_zip.py
│   │   │   │   └── env.py
│   │   │   ├── tool.py
│   │   │   └── __init__.py
│   │   └── __init__.py
│   ├── .gitignore
│   ├── MANIFEST.in
│   ├── packed_tensor_images
│   │   ├── packed_tensors_plot_1752190878.png
│   │   ├── packed_tensors_plot_1752193757.png
│   │   ├── packed_tensors_plot_1752196743.png
│   │   ├── packed_tensors_plot_1752199731.png
│   │   ├── packed_tensors_plot_1752202622.png
│   │   ├── packed_tensors_plot_1752205600.png
│   │   ├── packed_tensors_plot_1752208547.png
│   │   ├── packed_tensors_plot_1752211467.png
│   │   ├── packed_tensors_plot_1752214557.png
│   │   └── packed_tensors_plot_1752217461.png
│   ├── check.py
│   ├── setup.py
│   ├── pyproject.toml
│   └── LICENSE
├── playwright_agent
│   ├── pyproject.toml
│   └── job_desc_dataset.json
├── test_skypilot
│   ├── launch.py
│   ├── launch_tail.py
│   └── register_model.py
└── new_models
│   ├── prompts.json
│   ├── qwen3_try.py
│   └── gemma3.py
├── examples
├── 2048
│   ├── generate_benchmarks.py
│   └── train.py
├── mcp-rl
│   ├── README.md
│   ├── servers
│   │   └── python
│   │   │   ├── mcp_balldontlie
│   │   │   ├── __init__.py
│   │   │   ├── server_params.py
│   │   │   ├── scenarios
│   │   │   │   └── val.jsonl
│   │   │   └── README.md
│   │   │   ├── mcp_googlemaps
│   │   │   ├── __init__.py
│   │   │   ├── server_params.py
│   │   │   └── pyproject.toml
│   │   │   └── mcp_alphavantage
│   │   │   ├── __init__.py
│   │   │   ├── server_params.py
│   │   │   ├── README.md
│   │   │   └── scenarios
│   │   │   └── val.jsonl
│   ├── mcp_rl
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── pyproject.toml
│   └── all_experiments.py
├── just-the-facts
│   ├── README.md
│   ├── .gitignore
│   ├── just_the_facts
│   │   ├── __init__.py
│   │   ├── find_articles.py
│   │   └── experiments.py
│   ├── main.py
│   ├── test_scraper.py
│   └── pyproject.toml
├── roflbot
│   └── .gitignore
├── hn_title_generator
│   ├── skypilot.yaml
│   └── skypilot-reference-grpo-trainer.yaml
└── tic_tac_toe_self_play
│   ├── gather_trajectory_groups_by_index.py
│   └── deploy_step.py
├── docs
├── .gitignore
├── images
│   ├── forked-run.webp
│   ├── faq
│   │   └── art-loop.webp
│   ├── ruler-results.png
│   ├── site-assets
│   │   └── favicon.webp
│   └── open-deep-research-progress.png
├── package.json
├── analytics.js
├── README.md
├── style.css
├── getting-started
│   └── quick-start.mdx
├── resources
│   ├── glossary.mdx
│   └── models.mdx
├── experimental
│   └── gspo.mdx
├── fundamentals
│   └── training-loop.mdx
└── docs.json
├── assets
├── ART_logo.png
├── ART_pill.png
├── Discord.png
├── ART_E_pill.png
├── ART_header.png
├── Colab_pill.png
├── Train_pill.png
├── ART_E_graphs.png
├── Header_separator.png
├── Documentation_pill.png
└── benchmarks
│   └── codenames
│   └── win_rate_over_time.png
├── scripts
├── kill-gpu-processes.sh
├── publish.sh
├── setup.sh
├── launch-cluster.sh
├── migrate-s3-checkpoints.py
└── bump_version.py
├── .skyignore
├── .dockerignore
├── .gitignore
├── requirements
└── backend.vcs.txt
├── .env.example
├── .github
└── workflows
│   ├── ruff.yml
│   └── release.yml
├── AGENT.md
├── THIRD-PARTY-NOTICES
└── pyproject.toml

/CLAUDE.md:
--------------------------------------------------------------------------------
1 | AGENT.md
--------------------------------------------------------------------------------
/src/art/py.typed:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10
2 | 
--------------------------------------------------------------------------------
/dev/swebench/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/mcp-rl/README.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/art/unsloth/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/just-the-facts/README.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/art/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/art/torchtune/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/art/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dev/tau-bench/.python-version:
--------------------------------------------------------------------------------
1 | 3.11
2 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/registry/lib/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/registry/config.yaml:
--------------------------------------------------------------------------------
1 | tools: {}
--------------------------------------------------------------------------------
/dev/swebench/tools/review_on_submit_m/install.sh:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/just-the-facts/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/api/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/model/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | package-lock.json
--------------------------------------------------------------------------------
/examples/mcp-rl/servers/python/mcp_balldontlie/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/mcp-rl/servers/python/mcp_googlemaps/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/agents/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
--------------------------------------------------------------------------------
/examples/roflbot/.gitignore:
--------------------------------------------------------------------------------
1 | *.db
2 | /data/
3 | .env
4 | .venv/
5 | 
--------------------------------------------------------------------------------
/dev/tau-bench/.gitignore:
--------------------------------------------------------------------------------
1 | results/
2 | benchmark_results/
3 | error_analysis_results/
--------------------------------------------------------------------------------
/examples/just-the-facts/just_the_facts/__init__.py:
--------------------------------------------------------------------------------
1 | # Just the Facts package
2 | 
--------------------------------------------------------------------------------
/assets/ART_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_logo.png
--------------------------------------------------------------------------------
/assets/ART_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_pill.png
--------------------------------------------------------------------------------
/assets/Discord.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Discord.png
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/airline/rules.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | RULES = []
4 | 
--------------------------------------------------------------------------------
/assets/ART_E_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_E_pill.png
--------------------------------------------------------------------------------
/assets/ART_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_header.png
--------------------------------------------------------------------------------
/assets/Colab_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Colab_pill.png
--------------------------------------------------------------------------------
/assets/Train_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Train_pill.png
--------------------------------------------------------------------------------
/assets/ART_E_graphs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_E_graphs.png
--------------------------------------------------------------------------------
/assets/Header_separator.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Header_separator.png
--------------------------------------------------------------------------------
/dev/tau-bench/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include tau_bench *.json
2 | recursive-include tau_bench *.md
3 | 
--------------------------------------------------------------------------------
/docs/images/forked-run.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/forked-run.webp
--------------------------------------------------------------------------------
/examples/mcp-rl/servers/python/mcp_alphavantage/__init__.py:
--------------------------------------------------------------------------------
1 | """MCP AlphaVantage Python Server"""
2 | 
--------------------------------------------------------------------------------
/src/art/local/__init__.py:
--------------------------------------------------------------------------------
1 | from .backend import LocalBackend
2 | 
3 | __all__ = ["LocalBackend"]
4 | 
--------------------------------------------------------------------------------
/assets/Documentation_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Documentation_pill.png
--------------------------------------------------------------------------------
/docs/images/faq/art-loop.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/faq/art-loop.webp
--------------------------------------------------------------------------------
/docs/images/ruler-results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/ruler-results.png
--------------------------------------------------------------------------------
/src/art/skypilot/__init__.py:
--------------------------------------------------------------------------------
1 | from .backend import SkyPilotBackend
2 | 
3 | __all__ = ["SkyPilotBackend"]
4 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/api/types.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 | 
3 | PartialObj = dict[str, Any]
4 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/edit_anthropic/install.sh:
--------------------------------------------------------------------------------
1 | pip install 'tree-sitter==0.21.3'
2 | pip install 'tree-sitter-languages'
--------------------------------------------------------------------------------
/src/art/serverless/__init__.py:
--------------------------------------------------------------------------------
1 | from .backend import ServerlessBackend
2 | 
3 | __all__ = ["ServerlessBackend"]
4 | 
--------------------------------------------------------------------------------
/docs/images/site-assets/favicon.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/site-assets/favicon.webp
--------------------------------------------------------------------------------
/scripts/kill-gpu-processes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | nvidia-smi --query-compute-apps=pid --format=csv,noheader | xargs -r kill -9
--------------------------------------------------------------------------------
/src/art/rewards/__init__.py:
--------------------------------------------------------------------------------
1 | from .ruler import ruler, ruler_score_group
2 | 
3 | __all__ = ["ruler", "ruler_score_group"]
4 | 
--------------------------------------------------------------------------------
/docs/images/open-deep-research-progress.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/open-deep-research-progress.png
--------------------------------------------------------------------------------
/src/art/langgraph/__init__.py:
--------------------------------------------------------------------------------
1 | from .llm_wrapper import init_chat_model, wrap_rollout
2 | 
3 | __all__ = ["wrap_rollout", "init_chat_model"]
4 | 
--------------------------------------------------------------------------------
/src/mp_actors/__init__.py:
--------------------------------------------------------------------------------
1 | from .move import close_proxy, move_to_child_process
2 | 
3 | __all__ = ["close_proxy", "move_to_child_process"]
4 | 
--------------------------------------------------------------------------------
/assets/benchmarks/codenames/win_rate_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/benchmarks/codenames/win_rate_over_time.png
--------------------------------------------------------------------------------
/examples/just-the-facts/main.py:
--------------------------------------------------------------------------------
1 | def main():
2 |     print("Hello from just-the-facts!")
3 | 
4 | 
5 | if __name__ == "__main__":
6 |     main()
7 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | from tau_bench.agents.base import Agent as Agent
4 | from tau_bench.envs.base import Env as Env
5 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/retail/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | from tau_bench.envs.retail.env import MockRetailDomainEnv as MockRetailDomainEnv
4 | 
--------------------------------------------------------------------------------
/dev/swebench/sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | from .new import new_sandbox
2 | from .sandbox import Provider, Sandbox
3 | 
4 | __all__ = ["new_sandbox", "Provider", "Sandbox"]
5 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/airline/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | from tau_bench.envs.airline.env import MockAirlineDomainEnv as MockAirlineDomainEnv
4 | 
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752190878.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752190878.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752193757.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752193757.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752196743.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752196743.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752199731.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752199731.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752202622.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752202622.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752205600.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752205600.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752208547.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752208547.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752211467.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752211467.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752214557.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752214557.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752217461.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752217461.png
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/func_tools/__init__.py:
--------------------------------------------------------------------------------
1 | from tau_bench.model_utils.func_tools.filter import filter as filter
2 | from tau_bench.model_utils.func_tools.map import map as map
3 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/api/_model_methods.py:
--------------------------------------------------------------------------------
1 | MODEL_METHODS = [
2 |     "classify",
3 |     "binary_classify",
4 |     "parse",
5 |     "generate",
6 |     "parse_force",
7 |     "score",
8 | ]
9 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/airline/wiki.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import os
4 | 
5 | FOLDER_PATH = os.path.dirname(__file__)
6 | 
7 | with open(os.path.join(FOLDER_PATH, "wiki.md"), "r") as f:
8 |     WIKI = f.read()
9 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/retail/wiki.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import os
4 | 
5 | FOLDER_PATH = os.path.dirname(__file__)
6 | 
7 | with open(os.path.join(FOLDER_PATH, "wiki.md"), "r") as f:
8 |     WIKI = f.read()
9 | 
--------------------------------------------------------------------------------
/dev/playwright_agent/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "playwright-agent"
3 | version = "0.1.0"
4 | requires-python = ">=3.10"
5 | dependencies = [
6 |     "mcp>=1.13.1",
7 |     "openpipe>=5.0.0",
8 |     "panza>=0.1.0",
9 | ]
10 | 
--------------------------------------------------------------------------------
/examples/mcp-rl/mcp_rl/__init__.py:
--------------------------------------------------------------------------------
1 | """ART MCP package."""
2 | 
3 | from .mcp_server import AlphaMcpServer, McpServer
4 | from .rollout import McpScenario, rollout
5 | 
6 | __all__ = ["rollout", "McpScenario", "McpServer", "AlphaMcpServer"]
7 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/registry/install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # script_dir=$(dirname "$(readlink -f "$0")")
4 | bundle_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
5 | 
6 | export PYTHONPATH="$bundle_dir/lib":$PYTHONPATH
--------------------------------------------------------------------------------
/dev/swebench/tools/review_on_submit_m/README.md:
--------------------------------------------------------------------------------
1 | # Review on submit
2 | 
3 | Provides an alternative to `submit` that does not immediately submit, but asks the
4 | agent to perform additional reviewing steps.
5 | 
6 | Only `submit -f` will trigger the real submit.
--------------------------------------------------------------------------------
/dev/tau-bench/check.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from langfuse import Langfuse
4 | 
5 | # Read credentials from the environment; API keys should never be hardcoded
6 | # in committed source.
7 | langfuse = Langfuse(
8 |     secret_key=os.environ["LANGFUSE_SECRET_KEY"],
9 |     public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
10 |     host="https://us.cloud.langfuse.com",
11 | )
--------------------------------------------------------------------------------
/dev/swebench/tools/review_on_submit_m/config.yaml:
--------------------------------------------------------------------------------
1 | tools:
2 |   submit:
3 |     signature: "submit"
4 |     docstring: "submits the current file"
5 |     # Do not actually show the -f argument to the model, only
6 |     # use it from the agent for submission after error
7 | 
--------------------------------------------------------------------------------
/src/art/utils/benchmarking/charts/__init__.py:
--------------------------------------------------------------------------------
1 | from .percentage_comparison_bar_chart import percentage_comparison_bar_chart
2 | from .training_progress_chart import training_progress_chart
3 | 
4 | __all__ = ["percentage_comparison_bar_chart", "training_progress_chart"]
5 | 
--------------------------------------------------------------------------------
/.skyignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .art/
3 | # .env
4 | .venv/
5 | grpo_trainer_lora_model/
6 | logs/
7 | shared_cache.db
8 | streaming-chat-completions/
9 | unsloth_compiled_cache/
10 | wandb/
11 | docs/node_modules/
12 | dist/
13 | dev/art-e/data/
14 | replays/
15 | trajectories/
16 | .DS_Store
17 | # .local/
--------------------------------------------------------------------------------
/src/art/torchtune/batch.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | 
3 | from .. import dev, types
4 | from ..preprocessing.pack import DiskPackedTensors
5 | 
6 | 
7 | class Batch(BaseModel):
8 |     disk_packed_tensors: DiskPackedTensors
9 |     config: types.TrainConfig
10 |     dev_config: dev.TrainConfig
11 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/registry/bin/_write_env:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import sys
4 | 
5 | from registry import registry  # type: ignore
6 | 
7 | if __name__ == "__main__":
8 |     var_name = sys.argv[1]
9 |     var_value = sys.argv[2] if len(sys.argv) > 2 else ""
10 |     registry[var_name] = var_value
11 | 
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .art/
3 | # .env
4 | .venv/
5 | grpo_trainer_lora_model/
6 | logs/
7 | shared_cache.db
8 | streaming-chat-completions/
9 | unsloth_compiled_cache/
10 | wandb/
11 | docs/node_modules/
12 | dist/
13 | replays/
14 | trajectories/
15 | .DS_Store
16 | # .local/
17 | # .claude/
18 | .vscode/
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/tool.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from typing import Any
3 | 
4 | 
5 | class Tool(abc.ABC):
6 |     @staticmethod
7 |     def invoke(*args, **kwargs):
8 |         raise NotImplementedError
9 | 
10 |     @staticmethod
11 |     def get_info() -> dict[str, Any]:
12 |         raise NotImplementedError
13 | 
--------------------------------------------------------------------------------
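The `Tool` ABC above is the extension point for τ-bench environment tools. A minimal hypothetical subclass, sketched under the assumption that `get_info` returns an OpenAI-style function schema (the shape the concrete tools in `tau_bench/envs/*/tools/` use); the `Echo` tool itself is made up for illustration:

```python
from typing import Any

from tau_bench.envs.tool import Tool


class Echo(Tool):
    @staticmethod
    def invoke(text: str) -> str:
        # A real tool would act on environment data; this one just echoes.
        return text

    @staticmethod
    def get_info() -> dict[str, Any]:
        return {
            "type": "function",
            "function": {
                "name": "echo",
                "description": "Return the input text unchanged.",
                "parameters": {
                    "type": "object",
                    "properties": {"text": {"type": "string"}},
                    "required": ["text"],
                },
            },
        }


print(Echo.invoke(text="hello"))  # hello
```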
/dev/swebench/tools/registry/bin/_read_env:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import sys
4 | 
5 | from registry import registry  # type: ignore
6 | 
7 | if __name__ == "__main__":
8 |     var_name = sys.argv[1]
9 |     default_value = sys.argv[2] if len(sys.argv) > 2 else ""
10 |     print(registry.get(var_name, default_value))
11 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .art/
3 | .env
4 | .venv/
5 | grpo_trainer_lora_model/
6 | logs/
7 | shared_cache.db
8 | data/cache.db
9 | streaming-chat-completions/
10 | unsloth_compiled_cache/
11 | wandb/
12 | docs/node_modules/
13 | dist/
14 | replays/
15 | trajectories/
16 | .DS_Store
17 | .local/
18 | .claude/
19 | .vscode/
20 | .ruff_cache/
21 | !/src/art/wandb/
22 | !/src/art/wandb/**
23 | /src/art/wandb/__pycache__/
--------------------------------------------------------------------------------
/dev/swebench/sandboxes.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | 
3 | import modal
4 | 
5 | 
6 | async def terminate_sandboxes() -> None:
7 |     sandboxes: list[modal.Sandbox] = []
8 |     async for sandbox in modal.Sandbox.list.aio(
9 |         app_id=modal.App.lookup("swe-rex", create_if_missing=True).app_id
10 |     ):
11 |         sandboxes.append(sandbox)
12 |     _ = await asyncio.gather(*[sandbox.terminate.aio() for sandbox in sandboxes])
13 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/agents/base.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import abc
4 | from typing import Optional
5 | 
6 | from tau_bench.envs.base import Env
7 | from tau_bench.types import SolveResult
8 | 
9 | 
10 | class Agent(abc.ABC):
11 |     @abc.abstractmethod
12 |     async def solve(
13 |         self, env: Env, task_index: Optional[int] = None, max_num_steps: int = 30
14 |     ) -> SolveResult:
15 |         raise NotImplementedError
16 | 
--------------------------------------------------------------------------------
/src/art/utils/benchmarking/types.py:
--------------------------------------------------------------------------------
1 | class BenchmarkModelKey:
2 |     name: str
3 |     display_name: str
4 |     split: str
5 | 
6 |     def __init__(
7 |         self, name: str, display_name: str | None = None, split: str | None = None
8 |     ):
9 |         self.name = name
10 |         self.display_name = display_name or name
11 |         self.split = split or "val"
12 | 
13 |     def __str__(self):
14 |         return self.display_name
15 | 
--------------------------------------------------------------------------------
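A quick sketch of how `BenchmarkModelKey` resolves its defaults, grounded directly in the constructor above (the model name is made up):

```python
from art.utils.benchmarking.types import BenchmarkModelKey

key = BenchmarkModelKey("agent-001")
print(key.display_name, key.split)  # agent-001 val — both fall back to defaults

labeled = BenchmarkModelKey("agent-001", display_name="Agent v1", split="train")
print(str(labeled))  # Agent v1 — __str__ returns the display name
```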
/dev/tau-bench/tau_bench/model_utils/args.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from tau_bench.model_utils.model.model import Platform
4 | 
5 | 
6 | def api_parser() -> argparse.ArgumentParser:
7 |     parser = argparse.ArgumentParser()
8 |     parser.add_argument("--model", type=str)
9 |     parser.add_argument("--base-url", type=str)
10 |     parser.add_argument(
11 |         "--platform", type=str, required=True, choices=[e.value for e in Platform]
12 |     )
13 |     return parser
14 | 
--------------------------------------------------------------------------------
/docs/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "docs",
3 |   "version": "1.0.0",
4 |   "description": "",
5 |   "main": "index.js",
6 |   "scripts": {
7 |     "dev": "mintlify dev --port 3001",
8 |     "build": "mintlify build",
9 |     "generate:routes": "npx @mintlify/scraping@latest openapi-file ./openapi.json --outDir ./api-reference"
10 |   },
11 |   "keywords": [],
12 |   "author": "",
13 |   "license": "ISC",
14 |   "dependencies": {
15 |     "mintlify": "^4.0.433"
16 |   }
17 | }
18 | 
--------------------------------------------------------------------------------
/src/art/mcp/default_tools.py:
--------------------------------------------------------------------------------
1 | from art.mcp.types import MCPTool
2 | 
3 | complete_task_tool = MCPTool(
4 |     name="complete_task",
5 |     description="Complete a task",
6 |     parameters={
7 |         "type": "object",
8 |         "properties": {
9 |             "summary": {
10 |                 "type": "string",
11 |                 "description": "Summary of accomplishments",
12 |             }
13 |         },
14 |         "required": ["summary"],
15 |     },
16 | )
17 | 
--------------------------------------------------------------------------------
/src/art/utils/old_benchmarking/display_image_grid.py:
--------------------------------------------------------------------------------
1 | from IPython.display import HTML, display
2 | 
3 | 
4 | def display_image_grid(image_paths: list[str], images_per_row: int = 2):
5 |     # NOTE: the original HTML markup was stripped during extraction; the grid
6 |     # markup below is a plausible reconstruction.
7 |     html = f"""
8 |     <div style="display: grid; grid-template-columns: repeat({images_per_row}, 1fr); gap: 10px;">
9 |     """
10 |     for path in image_paths:
11 |         html += f'<img src="{path}" style="width: 100%;" />'
12 |     html += "</div>"
13 |     display(HTML(html))
14 | 
--------------------------------------------------------------------------------
" 11 | display(HTML(html)) 12 | -------------------------------------------------------------------------------- /examples/mcp-rl/servers/python/mcp_balldontlie/server_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from mcp import StdioServerParameters 5 | 6 | load_dotenv() 7 | 8 | server_params = StdioServerParameters( 9 | command="python", 10 | args=[ 11 | "servers/python/mcp_balldontlie/server.py", 12 | "--api-key", 13 | os.getenv("BALLDONTLIE_API_KEY", ""), 14 | ], 15 | env={"BALLDONTLIE_API_KEY": os.getenv("BALLDONTLIE_API_KEY")}, 16 | ) 17 | -------------------------------------------------------------------------------- /examples/mcp-rl/servers/python/mcp_googlemaps/server_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from mcp import StdioServerParameters 5 | 6 | load_dotenv() 7 | 8 | server_params = StdioServerParameters( 9 | command="python", 10 | args=[ 11 | "servers/python/mcp_googlemaps/server.py", 12 | "--api-key", 13 | os.getenv("GOOGLE_MAPS_API_KEY", ""), 14 | ], 15 | env={"GOOGLE_MAPS_API_KEY": os.getenv("GOOGLE_MAPS_API_KEY")}, 16 | ) 17 | -------------------------------------------------------------------------------- /examples/mcp-rl/servers/python/mcp_alphavantage/server_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from mcp import StdioServerParameters 5 | 6 | load_dotenv() 7 | 8 | server_params = StdioServerParameters( 9 | command="python", 10 | args=[ 11 | "servers/python/mcp_alphavantage/server.py", 12 | "--api-key", 13 | os.getenv("ALPHAVANTAGE_API_KEY", "demo"), 14 | ], 15 | env={"ALPHAVANTAGE_API_KEY": os.getenv("ALPHAVANTAGE_API_KEY")}, 16 | ) 17 | -------------------------------------------------------------------------------- /dev/swebench/tools/edit_anthropic/bin/_state_anthropic: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import os 5 | from pathlib import Path 6 | 7 | 8 | def main(): 9 | state_path = Path("/root/state.json") 10 | if state_path.exists(): 11 | state = json.loads(state_path.read_text()) 12 | else: 13 | state = {} 14 | 15 | state["working_dir"] = os.getcwd() 16 | 17 | state_path.write_text(json.dumps(state)) 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /scripts/publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Load the .env file 5 | set -o allexport 6 | source .env 7 | 8 | # Check if PYPI_ART_TOKEN is set 9 | if [[ -z "${PYPI_ART_TOKEN}" ]]; then 10 | echo "Error: PYPI_ART_TOKEN is not set." 11 | exit 1 12 | fi 13 | 14 | # Delete the dist directory 15 | rm -rf dist 16 | 17 | # Build the package 18 | uv run hatch build 19 | 20 | 21 | # If the token is set, proceed with publishing 22 | uv publish --username=__token__ --password=$PYPI_ART_TOKEN 23 | -------------------------------------------------------------------------------- /requirements/backend.vcs.txt: -------------------------------------------------------------------------------- 1 | # Pinned backend dependencies that must come from VCS (not allowed in PyPI metadata). 
/dev/swebench/tools/edit_anthropic/bin/_state_anthropic:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import json
4 | import os
5 | from pathlib import Path
6 | 
7 | 
8 | def main():
9 |     state_path = Path("/root/state.json")
10 |     if state_path.exists():
11 |         state = json.loads(state_path.read_text())
12 |     else:
13 |         state = {}
14 | 
15 |     state["working_dir"] = os.getcwd()
16 | 
17 |     state_path.write_text(json.dumps(state))
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     main()
22 | 
--------------------------------------------------------------------------------
/scripts/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | 
4 | # Load the .env file
5 | set -o allexport
6 | source .env
7 | 
8 | # Check if PYPI_ART_TOKEN is set
9 | if [[ -z "${PYPI_ART_TOKEN}" ]]; then
10 |   echo "Error: PYPI_ART_TOKEN is not set."
11 |   exit 1
12 | fi
13 | 
14 | # Delete the dist directory
15 | rm -rf dist
16 | 
17 | # Build the package
18 | uv run hatch build
19 | 
20 | 
21 | # If the token is set, proceed with publishing
22 | uv publish --username=__token__ --password=$PYPI_ART_TOKEN
23 | 
--------------------------------------------------------------------------------
/requirements/backend.vcs.txt:
--------------------------------------------------------------------------------
1 | # Pinned backend dependencies that must come from VCS (not allowed in PyPI metadata).
2 | # Install with:
3 | #   uv pip install -r requirements/backend.vcs.txt
4 | # or
5 | #   pip install -r requirements/backend.vcs.txt
6 | 
7 | # Torchtune pinned to known-good commit
8 | torchtune @ git+https://github.com/pytorch/torchtune.git@2344509cf83bd886538fe3e8263e5145d1afb5c2
9 | 
10 | # Unsloth Zoo pinned to known-good commit
11 | unsloth-zoo @ git+https://github.com/bradhilton/unsloth-zoo@323cf5e
12 | 
--------------------------------------------------------------------------------
/src/art/mcp/__init__.py:
--------------------------------------------------------------------------------
1 | """MCP utilities for Agent Reinforcement Training."""
2 | 
3 | from .default_tools import complete_task_tool
4 | from .generate_scenarios import generate_scenarios
5 | from .types import (
6 |     GeneratedScenario,
7 |     GeneratedScenarioCollection,
8 |     MCPResource,
9 |     MCPTool,
10 | )
11 | 
12 | __all__ = [
13 |     "MCPResource",
14 |     "MCPTool",
15 |     "GeneratedScenario",
16 |     "GeneratedScenarioCollection",
17 |     "complete_task_tool",
18 |     "generate_scenarios",
19 | ]
20 | 
--------------------------------------------------------------------------------
/src/art/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Import all utilities to maintain the same interface
2 | from .format_message import format_message
3 | from .get_model_step import get_model_step
4 | from .iterate_dataset import iterate_dataset
5 | from .limit_concurrency import limit_concurrency
6 | from .log_http_errors import log_http_errors
7 | from .retry import retry
8 | 
9 | __all__ = [
10 |     "format_message",
11 |     "retry",
12 |     "iterate_dataset",
13 |     "limit_concurrency",
14 |     "log_http_errors",
15 |     "get_model_step",
16 | ]
17 | 
--------------------------------------------------------------------------------
/src/art/utils/get_repo_root_path.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
4 | def get_repo_root_path() -> str:
5 |     try:
6 |         # search through parent directories until we find a .git directory
7 |         current_dir = os.path.dirname(os.path.abspath(__file__))
8 |         while not os.path.exists(os.path.join(current_dir, ".git")):
9 |             if current_dir == "/":
10 |                 raise Exception("Could not find .git directory")
11 |             current_dir = os.path.dirname(current_dir)
12 |         return current_dir
13 |     except Exception:
14 |         return "."
15 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/func_tools/filter.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Iterable, TypeVar
2 | 
3 | from tau_bench.model_utils.func_tools.map import map
4 | 
5 | T = TypeVar("T")
6 | 
7 | builtin_filter = filter
8 | 
9 | 
10 | def filter(
11 |     func: Callable[[T], bool],
12 |     iterable: Iterable[T],
13 |     max_concurrency: int | None = None,
14 | ) -> Iterable[T]:
15 |     assert max_concurrency is None or max_concurrency > 0
16 |     # Materialize the iterable first: it is consumed twice below (once by
17 |     # `map`, once by `zip`), which would silently drop items if a one-shot
18 |     # generator were passed in.
19 |     items = list(iterable)
20 |     bits = map(func, iterable=items, max_concurrency=max_concurrency)
21 |     return [x for x, y in zip(items, bits) if y]
22 | 
--------------------------------------------------------------------------------
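A usage sketch for the concurrent `filter` above; the predicate runs on a thread pool, so it mainly pays off for I/O-bound checks (the URLs here are made up):

```python
from tau_bench.model_utils.func_tools.filter import filter as concurrent_filter

urls = [f"https://example.com/item/{i}" for i in range(6)]
# Run the predicate across threads, at most 3 checks in flight at once.
even_items = concurrent_filter(
    lambda url: int(url.rsplit("/", 1)[1]) % 2 == 0,
    urls,
    max_concurrency=3,
)
print(even_items)  # items 0, 2, and 4
```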
/src/art/utils/deployment/legacy.py:
--------------------------------------------------------------------------------
1 | """Legacy exports for backwards compatibility."""
2 | 
3 | from enum import Enum
4 | 
5 | from pydantic import BaseModel
6 | 
7 | from .together import TogetherJobStatus
8 | 
9 | 
10 | class LoRADeploymentProvider(str, Enum):
11 |     """Legacy enum for deployment providers."""
12 | 
13 |     TOGETHER = "together"
14 |     WANDB = "wandb"
15 | 
16 | 
17 | class LoRADeploymentJob(BaseModel):
18 |     """Legacy result class for deployment jobs."""
19 | 
20 |     status: TogetherJobStatus
21 |     job_id: str
22 |     model_name: str
23 |     failure_reason: str | None
24 | 
--------------------------------------------------------------------------------
/examples/mcp-rl/servers/python/mcp_googlemaps/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "mcp-googlemaps"
3 | version = "0.1.0"
4 | description = "Google Maps MCP Server - Provides access to Google Maps APIs including Geocoding and Places"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 |     "aiohttp>=3.9.0",
9 |     "click>=8.1.0",
10 |     "mcp>=1.0.0",
11 |     "python-dotenv>=1.0.0",
12 |     "tenacity>=8.0.0",
13 | ]
14 | 
15 | [project.scripts]
16 | mcp-googlemaps = "mcp_googlemaps.server:main"
17 | 
18 | [build-system]
19 | requires = ["hatchling"]
20 | build-backend = "hatchling.build"
--------------------------------------------------------------------------------
/src/art/dev/__init__.py:
--------------------------------------------------------------------------------
1 | from .engine import EngineArgs
2 | from .model import (
3 |     InitArgs,
4 |     InternalModelConfig,
5 |     PeftArgs,
6 |     TrainerArgs,
7 | )
8 | from .openai_server import OpenAIServerConfig, ServerArgs, get_openai_server_config
9 | from .torchtune import TorchtuneArgs
10 | from .train import TrainConfig
11 | 
12 | __all__ = [
13 |     "EngineArgs",
14 |     "InternalModelConfig",
15 |     "InitArgs",
16 |     "PeftArgs",
17 |     "TrainerArgs",
18 |     "get_openai_server_config",
19 |     "OpenAIServerConfig",
20 |     "ServerArgs",
21 |     "TorchtuneArgs",
22 |     "TrainConfig",
23 | ]
24 | 
--------------------------------------------------------------------------------
/dev/swebench/run.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from concurrent.futures import ThreadPoolExecutor
3 | from functools import partial
4 | from typing import Callable, ParamSpec, TypeVar
5 | 
6 | executor = ThreadPoolExecutor(max_workers=1024)
7 | 
8 | P = ParamSpec("P")
9 | R = TypeVar("R")
10 | 
11 | 
12 | async def run(
13 |     func: Callable[P, R],
14 |     in_thread: bool,
15 |     *args: P.args,
16 |     **kwargs: P.kwargs,
17 | ) -> R:
18 |     if in_thread:
19 |         return await asyncio.get_running_loop().run_in_executor(
20 |             executor, partial(func, *args, **kwargs)
21 |         )
22 |     return func(*args, **kwargs)
23 | 
--------------------------------------------------------------------------------
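`run` either calls `func` inline or offloads it to the module-level thread pool. A small sketch, with a made-up blocking function standing in for the real sandbox work:

```python
import asyncio
import time

from run import run  # i.e. dev/swebench/run.py


def blocking_io(path: str) -> int:
    time.sleep(0.5)  # stand-in for a slow, blocking call
    return len(path)


async def main():
    # in_thread=True keeps the event loop free while blocking_io sleeps;
    # in_thread=False would run it inline and block the loop.
    lengths = await asyncio.gather(
        *(run(blocking_io, True, f"/tmp/file_{i}") for i in range(4))
    )
    print(lengths)


asyncio.run(main())
```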
/dev/tau-bench/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | from setuptools import find_packages, setup
4 | 
5 | setup(
6 |     name="tau_bench",
7 |     version="0.1.0",
8 |     description="The Tau-Bench package",
9 |     long_description=open("README.md").read(),
10 |     packages=find_packages(),
11 |     include_package_data=True,
12 |     install_requires=[
13 |         "openai>=1.13.3",
14 |         "mistralai>=0.4.0",
15 |         "anthropic>=0.26.1",
16 |         "google-generativeai>=0.5.4",
17 |         "tenacity>=8.3.0",
18 |         "termcolor>=2.4.0",
19 |         "numpy>=1.26.4",
20 |         "litellm>=1.41.0",
21 |     ],
22 | )
23 | 
--------------------------------------------------------------------------------
/dev/test_skypilot/launch.py:
--------------------------------------------------------------------------------
1 | """Smoke test: initialize a SkyPilot cluster backend."""
2 | 
3 | import asyncio
4 | 
5 | from dotenv import load_dotenv
6 | 
7 | from art.skypilot.backend import SkyPilotBackend
8 | 
9 | load_dotenv()
10 | 
11 | 
12 | async def launch():
13 |     backend = await SkyPilotBackend().initialize_cluster(
14 |         cluster_name="test-skypilot",
15 |         gpu="H100-SXM",
16 |         env_path=".env",
17 |         force_restart=True,
18 |     )
19 | 
20 |     print("successfully initialized skypilot server")
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     asyncio.run(launch())
25 | 
--------------------------------------------------------------------------------
/dev/swebench/sandbox/daytona.py:
--------------------------------------------------------------------------------
1 | import daytona_sdk
2 | 
3 | from .sandbox import Provider, Sandbox
4 | 
5 | 
6 | class DaytonaSandbox(Sandbox):
7 |     """
8 |     Daytona sandbox.
9 | 
10 |     Wraps a Daytona sandbox with the shared Sandbox interface.
11 |     """
12 | 
13 |     provider: Provider = "daytona"
14 | 
15 |     def __init__(self, sandbox: daytona_sdk.AsyncSandbox) -> None:
16 |         self._sandbox = sandbox
17 | 
18 |     async def exec(self, command: str, timeout: int) -> tuple[int, str]:
19 |         result = await self._sandbox.process.exec(command, timeout=timeout)
20 |         return int(result.exit_code), result.result
21 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/model/exception.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Generic, TypeVar
3 | 
4 | T = TypeVar("T")
5 | 
6 | 
7 | class ModelError(Exception):
8 |     def __init__(
9 |         self,
10 |         short_message: str,
11 |         prompt: str | list[dict[str, str]] | None = None,
12 |         response: str | None = None,
13 |     ) -> None:
14 |         super().__init__(short_message)
15 |         self.short_message = short_message
16 |         self.prompt = prompt
17 |         self.response = response
18 | 
19 | 
20 | @dataclass
21 | class Result(Generic[T]):
22 |     value: T | None
23 |     error: ModelError | None
24 | 
--------------------------------------------------------------------------------
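`Result` pairs a value with an optional `ModelError` instead of raising. A usage sketch with a hypothetical parser (the function is made up; the `Result`/`ModelError` shapes come straight from the definitions above):

```python
from tau_bench.model_utils.model.exception import ModelError, Result


def parse_int(raw: str) -> Result[int]:
    try:
        return Result(value=int(raw), error=None)
    except ValueError:
        return Result(
            value=None,
            error=ModelError(short_message=f"not an int: {raw!r}", response=raw),
        )


print(parse_int("7").value)                 # 7
print(parse_int("x").error.short_message)   # not an int: 'x'
```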
/examples/hn_title_generator/skypilot.yaml:
--------------------------------------------------------------------------------
1 | # To launch, run the following command from the root directory of the art repository:
2 | # `uv run sky launch examples/hn_title_generator/skypilot.yaml --cluster=kyle-hn-title-generator-001 --env-file=.env --yes --retry-until-up --down --idle-minutes-to-autostop 10`
3 | 
4 | workdir: .
5 | resources:
6 |   accelerators: ["H100-SXM:1"]
7 | envs:
8 |   HF_HUB_ENABLE_HF_TRANSFER: 1
9 | 
10 | setup: |
11 |   curl -LsSf https://astral.sh/uv/install.sh | sh
12 | 
13 |   source $HOME/.local/bin/env
14 | 
15 |   uv sync
16 | 
17 | run: |
18 |   echo "Running training script..."
19 |   uv run python examples/hn_title_generator/train.py
20 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/retail/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import json
4 | import os
5 | from typing import Any
6 | 
7 | FOLDER_PATH = os.path.dirname(__file__)
8 | 
9 | 
10 | def load_data() -> dict[str, Any]:
11 |     with open(os.path.join(FOLDER_PATH, "orders.json")) as f:
12 |         order_data = json.load(f)
13 |     with open(os.path.join(FOLDER_PATH, "products.json")) as f:
14 |         product_data = json.load(f)
15 |     with open(os.path.join(FOLDER_PATH, "users.json")) as f:
16 |         user_data = json.load(f)
17 |     return {
18 |         "orders": order_data,
19 |         "products": product_data,
20 |         "users": user_data,
21 |     }
22 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/airline/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import json
4 | import os
5 | from typing import Any
6 | 
7 | FOLDER_PATH = os.path.dirname(__file__)
8 | 
9 | 
10 | def load_data() -> dict[str, Any]:
11 |     with open(os.path.join(FOLDER_PATH, "flights.json")) as f:
12 |         flight_data = json.load(f)
13 |     with open(os.path.join(FOLDER_PATH, "reservations.json")) as f:
14 |         reservation_data = json.load(f)
15 |     with open(os.path.join(FOLDER_PATH, "users.json")) as f:
16 |         user_data = json.load(f)
17 |     return {
18 |         "flights": flight_data,
19 |         "reservations": reservation_data,
20 |         "users": user_data,
21 |     }
22 | 
--------------------------------------------------------------------------------
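Both domain packages expose the same `load_data` shape: one dict of JSON tables keyed by entity name. A consumption sketch, assuming the JSON data files are present on disk:

```python
from tau_bench.envs.airline.data import load_data

data = load_data()
print(sorted(data))          # ['flights', 'reservations', 'users']
print(len(data["flights"]))  # number of flight records in flights.json
```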
/dev/swebench/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "openpipe-art-swebench"
3 | version = "0.1.0"
4 | requires-python = ">=3.10"
5 | dependencies = [
6 |     "aiolimiter>=1.2.1",
7 |     "daytona-sdk>=0.21.5",
8 |     "langfuse>=2.60.7",
9 |     "modal>=1.0.1",
10 |     "openpipe-art",
11 |     "sweagent",
12 |     "swebench>=4.0.3",
13 | ]
14 | 
15 | [tool.uv.sources]
16 | openpipe-art = { path = "../../", editable = true }
17 | sweagent = { git = "https://github.com/bradhilton/SWE-agent" }
18 | 
19 | [dependency-groups]
20 | dev = [
21 |     "ipykernel>=6.29.5",
22 |     "ipywidgets>=8.1.7",
23 |     "pytest>=8.4.1",
24 |     "pytest-asyncio>=1.0.0",
25 |     "pytest-timeout>=2.4.0",
26 |     "pytest-xdist>=3.8.0",
27 | ]
28 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/func_tools/map.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ThreadPoolExecutor
2 | from typing import Callable, Iterable, TypeVar
3 | 
4 | T = TypeVar("T")
5 | U = TypeVar("U")
6 | 
7 | 
8 | def map(
9 |     func: Callable[[T], U],
10 |     iterable: Iterable[T],
11 |     max_concurrency: int | None = None,
12 |     use_tqdm: bool = False,
13 | ) -> Iterable[U]:
14 |     assert max_concurrency is None or max_concurrency > 0
15 |     with ThreadPoolExecutor(max_workers=max_concurrency) as executor:
16 |         if use_tqdm:
17 |             from tqdm import tqdm
18 | 
19 |             return list(tqdm(executor.map(func, iterable), total=len(iterable)))
20 |         return executor.map(func, iterable)
21 | 
--------------------------------------------------------------------------------
/dev/swebench/sandbox/modal.py:
--------------------------------------------------------------------------------
1 | import modal
2 | 
3 | from .sandbox import Provider, Sandbox
4 | 
5 | 
6 | class ModalSandbox(Sandbox):
7 |     """
8 |     Modal sandbox.
9 | 
10 |     Wraps a Modal sandbox with the shared Sandbox interface.
11 |     """
12 | 
13 |     provider: Provider = "modal"
14 | 
15 |     def __init__(self, sandbox: modal.Sandbox) -> None:
16 |         self._sandbox = sandbox
17 | 
18 |     async def exec(self, command: str, timeout: int) -> tuple[int, str]:
19 |         process = await self._sandbox.exec.aio(
20 |             "/bin/sh", "-c", command, timeout=timeout
21 |         )
22 |         exit_code = await process.wait.aio()
23 |         stdout = await process.stdout.read.aio()
24 |         return exit_code, stdout
25 | 
--------------------------------------------------------------------------------
/src/art/types.py:
--------------------------------------------------------------------------------
1 | from typing import Annotated, Literal
2 | 
3 | import pydantic
4 | from openai.types.chat.chat_completion import Choice
5 | from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
6 | from openai.types.chat.chat_completion_tool_param import ChatCompletionToolParam
7 | from pydantic import SkipValidation
8 | 
9 | Message = Annotated[ChatCompletionMessageParam, SkipValidation]
10 | MessageOrChoice = Message | Choice
11 | Messages = list[Message]
12 | MessagesAndChoices = list[MessageOrChoice]
13 | Tools = list[ChatCompletionToolParam]
14 | 
15 | 
16 | class TrainConfig(pydantic.BaseModel):
17 |     learning_rate: float = 5e-6
18 |     beta: float = 0.0
19 | 
20 | 
21 | Verbosity = Literal[0, 1, 2]
22 | 
--------------------------------------------------------------------------------
/src/art/utils/format_message.py:
--------------------------------------------------------------------------------
1 | from ..types import Message
2 | 
3 | 
4 | def format_message(message: Message) -> str:
5 |     """Format a message into a readable string."""
6 |     # Format the role and content
7 |     role = message["role"].capitalize()
8 |     content = message.get("content", message.get("refusal", "")) or ""
9 | 
10 |     # Format any tool calls
11 |     tool_calls_text = "\n" if content else ""
12 |     tool_calls_text += "\n".join(
13 |         f"{tool_call['function']['name']}({tool_call['function']['arguments']})"
14 |         for tool_call in message.get("tool_calls") or []
15 |     )
16 | 
17 |     # Combine all parts
18 |     formatted_message = f"{role}:\n{content}{tool_calls_text}"
19 |     return formatted_message
20 | 
--------------------------------------------------------------------------------
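Given the implementation above, `format_message` renders the role, the content (or refusal), and one line per tool call. For example (the message is made up):

```python
from art.utils import format_message

message = {
    "role": "assistant",
    "content": "Checking the weather now.",
    "tool_calls": [
        {"function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}}
    ],
}
print(format_message(message))
# Assistant:
# Checking the weather now.
# get_weather({"city": "Paris"})
```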
/examples/mcp-rl/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "art-mcp"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 |     "aiohttp>=3.12.14",
9 |     "asyncio>=3.4.3",
10 |     "click>=8.1.8",
11 |     "mcp>=1.11.0",
12 |     "openai>=1.74.0",
13 |     "openpipe-art[skypilot]",
14 |     "python-dotenv>=1.1.1",
15 |     "tenacity>=9.1.2",
16 |     "weave>=0.51.56",
17 | ]
18 | 
19 | 
20 | [tool.uv.sources]
21 | openpipe-art = { path = "../../", editable = true }
22 | 
23 | [dependency-groups]
24 | dev = [
25 |     "polars>=1.31.0",
26 |     "ipywidgets>=8.1.6",
27 |     "ipykernel>=6.29.5",
28 |     "matplotlib>=3.10.3",
29 |     "seaborn>=0.13.2",
30 | ]
31 | 
--------------------------------------------------------------------------------
/examples/just-the-facts/just_the_facts/find_articles.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | import feedparser
4 | 
5 | feeds = {
6 |     "NBC News Top Stories": "http://feeds.nbcnews.com/feeds/topstories",
7 |     "BBC News Top Stories": "https://feeds.bbci.co.uk/news/rss.xml",
8 |     "CBS News Top Stories": "http://www.cbsnews.com/latest/rss/main",
9 |     "Fox News Latest": "http://feeds.foxnews.com/foxnews/latest",
10 | }
11 | 
12 | all_urls = []
13 | 
14 | for name, url in feeds.items():
15 |     print(f"\n=== {name} ===")
16 |     feed = feedparser.parse(url)
17 | 
18 |     for entry in feed.entries[:25]:
19 |         print(entry.link)
20 |         all_urls.append(entry.link)
21 | 
22 | 
23 | # shuffle
24 | random.shuffle(all_urls)
25 | 
26 | print(all_urls)
27 | 
--------------------------------------------------------------------------------
/src/art/utils/limit_concurrency.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from functools import wraps
3 | from typing import Callable, Optional
4 | 
5 | 
6 | def limit_concurrency(n: int, derive_key: Optional[Callable[..., str]] = None):
7 |     semaphores = {}
8 | 
9 |     def decorator(func):
10 |         @wraps(func)
11 |         async def wrapper(*args, **kwargs):
12 |             if derive_key:
13 |                 key = derive_key(*args, **kwargs)
14 |             else:
15 |                 key = "default"
16 | 
17 |             if key not in semaphores:
18 |                 semaphores[key] = asyncio.Semaphore(n)
19 | 
20 |             async with semaphores[key]:
21 |                 return await func(*args, **kwargs)
22 | 
23 |         return wrapper
24 | 
25 |     return decorator
26 | 
--------------------------------------------------------------------------------
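`limit_concurrency` keeps one semaphore per key, so `derive_key` lets callers bound concurrency per resource rather than globally. A sketch; the host-based key and the URLs are illustrative:

```python
import asyncio

from art.utils import limit_concurrency


@limit_concurrency(2, derive_key=lambda url: url.split("/")[2])
async def fetch(url: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for a real HTTP request
    return url


async def main():
    urls = [f"https://example.com/item/{i}" for i in range(10)]
    # At most two in-flight fetches per host at any moment.
    print(await asyncio.gather(*(fetch(url) for url in urls)))


asyncio.run(main())
```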
torchtune; import os; print(os.path.dirname(torchtune.__file__))") 5 | uv run $TORCHTUNE_DIR/_cli/tune.py run \ 6 | --nproc-per-node 8 \ 7 | src/art/torchtune/recipe.py \ 8 | --config ./src/art/torchtune/config.yaml \ 9 | tokenizer.path=$MODEL_DIR/vocab.json \ 10 | tokenizer.merges_file=$MODEL_DIR/merges.txt \ 11 | checkpointer.checkpoint_dir=$MODEL_DIR \ 12 | checkpointer.checkpoint_files="[$(ls $MODEL_DIR/*.safetensors | xargs -n1 basename | sed 's/^/"/;s/$/",/' | tr '\n' ' ' | sed 's/, $//' )]" \ 13 | model._component_=torchtune.models.qwen3.qwen3_32b \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /src/art/utils/log_http_errors.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | import httpx 4 | 5 | 6 | def log_http_errors(func): 7 | @wraps(func) 8 | async def wrapper(*args, **kwargs): 9 | try: 10 | return await func(*args, **kwargs) 11 | except httpx.HTTPStatusError as e: 12 | # raise a new exception with the status code, url, and "detail" key if it exists 13 | try: 14 | detail = e.response.json().get("detail", None) 15 | except Exception: 16 | # if we can't parse the response as json, just raise the original exception 17 | raise e 18 | raise Exception( 19 | f"[HTTP {e.response.status_code}] {e.request.url} {detail}" 20 | ) from e 21 | 22 | return wrapper 23 | -------------------------------------------------------------------------------- /dev/tau-bench/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "tau-bench" 3 | version = "0.1.0" 4 | requires-python = ">=3.11" 5 | dependencies = [ 6 | "google-generativeai>=0.8.5", 7 | "langfuse>=2.60.8", 8 | "litellm>=1.72.6.post2", 9 | "mistralai>=1.8.2", 10 | "openpipe>=4.50.0", 11 | "openpipe-art", 12 | "skypilot-nightly[runpod,hyperbolic]==1.0.0.dev20250717", 13 | "tenacity>=9.1.2", 14 | "termcolor>=3.1.0", 15 | "openai>=1.74.0", 16 | "anthropic>=0.49.0", 17 | "accelerate==1.7.0", 18 | "vllm==0.9.1; sys_platform == 'linux'" 19 | ] 20 | 21 | [tool.uv] 22 | override-dependencies = ["vllm; sys_platform == 'linux'"] 23 | 24 | [tool.uv.sources] 25 | openpipe-art = { path = "../../", editable = true } 26 | 27 | [dependency-groups] 28 | dev = [ 29 | "ipykernel>=6.29.5", 30 | "ipywidgets>=8.1.7", 31 | ] 32 | -------------------------------------------------------------------------------- /examples/mcp-rl/mcp_rl/utils.py: -------------------------------------------------------------------------------- 1 | from mcp import types 2 | 3 | 4 | def get_content_text(result: types.CallToolResult) -> str: 5 | # Extract text content from MCP result 6 | if hasattr(result, "content") and result.content: 7 | if isinstance(result.content, list): 8 | # Handle list of content items 9 | content_text = "" 10 | for item in result.content: 11 | if isinstance(item, types.TextContent): 12 | content_text += item.text 13 | else: 14 | content_text += str(item) 15 | elif isinstance(result.content[0], types.TextContent): 16 | content_text = result.content[0].text 17 | else: 18 | content_text = str(result.content) 19 | else: 20 | content_text = str(result) 21 | 22 | return content_text 23 | -------------------------------------------------------------------------------- /src/art/utils/deployment/__init__.py: -------------------------------------------------------------------------------- 1 | """Deployment utilities for deploying trained models to inference endpoints.""" 2 | 3 | from 
.common import ( 4 | DeploymentConfig, 5 | DeploymentResult, 6 | Provider, 7 | deploy_model, 8 | ) 9 | 10 | # Legacy exports for backwards compatibility 11 | from .legacy import ( 12 | LoRADeploymentJob, 13 | LoRADeploymentProvider, 14 | ) 15 | from .together import ( 16 | TogetherDeploymentConfig, 17 | ) 18 | from .wandb import ( 19 | WandbDeploymentConfig, 20 | deploy_wandb, 21 | ) 22 | 23 | __all__ = [ 24 | # New API 25 | "DeploymentConfig", 26 | "DeploymentResult", 27 | "Provider", 28 | "TogetherDeploymentConfig", 29 | "WandbDeploymentConfig", 30 | "deploy_model", 31 | "deploy_wandb", 32 | # Legacy API 33 | "LoRADeploymentJob", 34 | "LoRADeploymentProvider", 35 | ] 36 | -------------------------------------------------------------------------------- /src/art/utils/get_model_step.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import TYPE_CHECKING 3 | 4 | from art.utils.output_dirs import get_model_dir 5 | 6 | if TYPE_CHECKING: 7 | from art.model import TrainableModel 8 | 9 | 10 | def get_step_from_dir(output_dir: str) -> int: 11 | os.makedirs(output_dir, exist_ok=True) 12 | checkpoint_dir = os.path.join(output_dir, "checkpoints") 13 | if not os.path.exists(checkpoint_dir): 14 | return 0 15 | 16 | return max( 17 | ( 18 | int(subdir) 19 | for subdir in os.listdir(checkpoint_dir) 20 | if os.path.isdir(os.path.join(checkpoint_dir, subdir)) and subdir.isdigit() 21 | ), 22 | default=0, 23 | ) 24 | 25 | 26 | def get_model_step(model: "TrainableModel", art_path: str) -> int: 27 | return get_step_from_dir(get_model_dir(model=model, art_path=art_path)) 28 | -------------------------------------------------------------------------------- /src/art/local/service.py: -------------------------------------------------------------------------------- 1 | from typing import AsyncIterator, Protocol, runtime_checkable 2 | 3 | from .. import dev, types 4 | from ..preprocessing.pack import DiskPackedTensors 5 | 6 | 7 | @runtime_checkable 8 | class ModelService(Protocol): 9 | def __init__( 10 | self, 11 | model_name: str, 12 | base_model: str, 13 | config: dev.InternalModelConfig, 14 | output_dir: str, 15 | ): 16 | pass 17 | 18 | async def start_openai_server( 19 | self, config: dev.OpenAIServerConfig | None 20 | ) -> None: ... 21 | 22 | async def vllm_engine_is_sleeping(self) -> bool: ... 23 | 24 | def train( 25 | self, 26 | disk_packed_tensors: DiskPackedTensors, 27 | config: types.TrainConfig, 28 | _config: dev.TrainConfig, 29 | verbose: bool = False, 30 | ) -> AsyncIterator[dict[str, float]]: ... 
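# NOTE (illustrative, not part of the protocol): `ModelService` is a
# @runtime_checkable Protocol, so `isinstance(obj, ModelService)` only checks
# that the listed methods exist on the object; signatures are not validated at
# runtime. A hedged sketch of a conforming stub -- the name `StubService` and
# its method bodies are hypothetical, not ART APIs:
#
#     class StubService:
#         def __init__(self, model_name, base_model, config, output_dir):
#             pass
#
#         async def start_openai_server(self, config):
#             pass
#
#         async def vllm_engine_is_sleeping(self) -> bool:
#             return False
#
#         async def train(self, disk_packed_tensors, config, _config, verbose=False):
#             yield {"loss": 0.0}  # calling an async generator returns an AsyncIterator
#
#     assert isinstance(StubService("name", "base", {}, "out"), ModelService)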
31 | -------------------------------------------------------------------------------- /examples/just-the-facts/test_scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import asyncio 4 | 5 | from just_the_facts.scenarios import train_urls, val_urls 6 | from just_the_facts.utils import scrape_article 7 | 8 | 9 | async def test_scraper(): 10 | """Test the scrape_article function with example URLs""" 11 | 12 | # Test URLs from different news sources (using homepage URLs that should exist) 13 | test_urls = train_urls + val_urls 14 | 15 | for url in test_urls: 16 | try: 17 | print(f"\nTesting URL: {url}") 18 | article_text = await scrape_article(url) 19 | print(f"Successfully scraped {len(article_text)} characters") 20 | print(f"First 200 characters: {article_text[:200]}...") 21 | except Exception as e: 22 | print(f"Failed to scrape {url}: {str(e)}") 23 | raise e 24 | 25 | 26 | if __name__ == "__main__": 27 | asyncio.run(test_scraper()) 28 | -------------------------------------------------------------------------------- /src/art/langgraph/logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | 5 | class FileLogger: 6 | def __init__(self, filepath): 7 | self.text_path = filepath 8 | self.pickle_path = filepath + ".pkl" 9 | 10 | def log(self, name, entry): 11 | # Log as readable text 12 | with open(self.text_path, "a") as f: 13 | f.write(f"{name}: {entry}\n") 14 | 15 | # Append to pickle log 16 | with open(self.pickle_path, "ab") as pf: 17 | pickle.dump((name, entry), pf) 18 | 19 | def load_logs(self): 20 | """Load all logs from the pickle file.""" 21 | if not os.path.exists(self.pickle_path): 22 | return [] 23 | logs = [] 24 | with open(self.pickle_path, "rb") as pf: 25 | try: 26 | while True: 27 | logs.append(pickle.load(pf)) 28 | except EOFError: 29 | pass 30 | return logs 31 | -------------------------------------------------------------------------------- /src/art/utils/benchmark_rollout.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Coroutine 2 | 3 | import art 4 | 5 | from ..trajectories import Trajectory, TrajectoryGroup 6 | 7 | 8 | async def benchmark_rollout( 9 | model: str, 10 | num_rollouts: int, 11 | rollout: Callable[[str, int, bool], Coroutine[Any, Any, Trajectory]], 12 | ) -> float: 13 | trajectory_groups = await art.gather_trajectory_groups( 14 | [TrajectoryGroup(rollout(model, i, False) for i in range(num_rollouts))], 15 | pbar_desc="Benchmarking rollout", 16 | ) 17 | 18 | trajectory_group_rewards = [] 19 | 20 | for group in trajectory_groups: 21 | total_reward = sum(trajectory.reward for trajectory in group) 22 | trajectory_group_rewards.append(total_reward / len(group)) 23 | 24 | average_reward = sum(trajectory_group_rewards) / len(trajectory_group_rewards) 25 | 26 | print(f"Average reward for {model}: {average_reward}") 27 | 28 | return average_reward 29 | -------------------------------------------------------------------------------- /src/art/utils/logging.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | # ---------- lightweight "nice print" helpers ---------- 5 | class _C: 6 | RESET = "\x1b[0m" 7 | DIM = "\x1b[2m" 8 | BOLD = "\x1b[1m" 9 | ITAL = "\x1b[3m" 10 | GRAY = "\x1b[90m" 11 | BLUE = "\x1b[34m" 12 | CYAN = "\x1b[36m" 13 | GREEN = "\x1b[32m" 14 | YELLOW = "\x1b[33m" 15 | RED = 
"\x1b[31m" 16 | MAGENTA = "\x1b[35m" 17 | 18 | 19 | def _ts(): 20 | return time.strftime("%H:%M:%S") 21 | 22 | 23 | def info(msg): 24 | print(f"[{_ts()}] {_C.BLUE}INFO{_C.RESET} {msg}") 25 | 26 | 27 | def step(msg): 28 | print(f"[{_ts()}] {_C.CYAN}STEP{_C.RESET} {msg}") 29 | 30 | 31 | def ok(msg): 32 | print(f"[{_ts()}] {_C.GREEN}OK{_C.RESET} {msg}") 33 | 34 | 35 | def warn(msg): 36 | print(f"[{_ts()}] {_C.YELLOW}WARN{_C.RESET} {msg}") 37 | 38 | 39 | def err(msg): 40 | print(f"[{_ts()}] {_C.RED}ERR{_C.RESET} {msg}") 41 | 42 | 43 | def dim(msg): 44 | print(f"{_C.DIM}{msg}{_C.RESET}") 45 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/rules.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | RULES = [ 4 | "You are a customer service representative for an online retail company. You are chatting with a customer, and you can call tools or respond to the user.", 5 | "The agent should always first confirm the user id by email or name+zip before proceeding with any task.", 6 | "The agent should not proceed with any task if the user id is not found.", 7 | "For any change to the backend database, e.g., address update, refund, or order cancellation, the agent must confirm the transaction details with the user and ask for permission, and get explicit authorization (yes) to proceed.", 8 | "The agent should solve the user task given the tools, without transferring to a human agent.", 9 | "The agent should not make up any information or knowledge not provided from the user or the tools.", 10 | "The agent should at most make one tool call at a time, and if the agent makes a tool call, it does not respond to the user at the same time.", 11 | ] 12 | -------------------------------------------------------------------------------- /docs/analytics.js: -------------------------------------------------------------------------------- 1 | !(function () { 2 | var reb2b = (window.reb2b = window.reb2b || []); 3 | if (reb2b.invoked) return; 4 | reb2b.invoked = true; 5 | reb2b.methods = ["identify", "collect"]; 6 | reb2b.factory = function (method) { 7 | return function () { 8 | var args = Array.prototype.slice.call(arguments); 9 | args.unshift(method); 10 | reb2b.push(args); 11 | return reb2b; 12 | }; 13 | }; 14 | for (var i = 0; i < reb2b.methods.length; i++) { 15 | var key = reb2b.methods[i]; 16 | reb2b[key] = reb2b.factory(key); 17 | } 18 | reb2b.load = function (key) { 19 | var script = document.createElement("script"); 20 | script.type = "text/javascript"; 21 | script.async = true; 22 | script.src = 23 | "https://s3-us-west-2.amazonaws.com/b2bjsstore/b/" + key + "/reb2b.js.gz"; 24 | var first = document.getElementsByTagName("script")[0]; 25 | first.parentNode.insertBefore(script, first); 26 | }; 27 | reb2b.SNIPPET_VERSION = "1.0.1"; 28 | reb2b.load("4O7Z0HMXYWNX"); 29 | })(); 30 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # I recommend setting your API key here if you're going to ssh into a new machine and use the local backend 2 | WANDB_API_KEY=YOUR_WANDB_API_KEY 3 | 4 | # Optional, git-related environment variables 5 | # You may need these if you want to make any git commits on a new machine 6 | GIT_USER_NAME="Your Name" 7 | GIT_USER_EMAIL=your.email@example.com 8 | # A GitHub token might be required for commiting to the private 
`agent-reinforcement-training` repository 9 | GITHUB_TOKEN=YOUR_GITHUB_TOKEN 10 | 11 | # HuggingFace Token (optional for most models, necessary for training gated models like Llama 3.1) 12 | HF_TOKEN=YOUR_HUGGINGFACE_TOKEN 13 | 14 | # Optional, OpenPipe API key 15 | OPENPIPE_API_KEY=YOUR_OPENPIPE_API_KEY 16 | # Optional, Together API key (used for deploying models to Together) 17 | TOGETHER_API_KEY=YOUR_TOGETHER_API_KEY 18 | 19 | # Optional, S3 configuration for log and model backups 20 | AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID 21 | AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY 22 | AWS_REGION=YOUR_AWS_REGION 23 | BACKUP_BUCKET=YOUR_BACKUP_BUCKET -------------------------------------------------------------------------------- /examples/just-the-facts/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "just-the-facts" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "aiohttp>=3.12.14", 9 | "asyncio>=3.4.3", 10 | "beautifulsoup4>=4.13.4", 11 | "click>=8.1.8", 12 | "feedparser>=6.0.11", 13 | "lxml>=6.0.0", 14 | "lxml-html-clean>=0.4.2", 15 | "mcp>=1.11.0", 16 | "newspaper3k>=0.2.8", 17 | "openai>=1.74.0", 18 | "openpipe-art[skypilot]", 19 | "python-dotenv>=1.1.1", 20 | "tenacity>=9.1.2", 21 | "weave>=0.51.56", 22 | ] 23 | 24 | [build-system] 25 | requires = ["setuptools>=61.0", "wheel"] 26 | build-backend = "setuptools.build_meta" 27 | 28 | [tool.setuptools.packages.find] 29 | where = ["."] 30 | include = ["just_the_facts*"] 31 | 32 | [tool.uv.sources] 33 | openpipe-art = { path = "../../", editable = true } 34 | 35 | [dependency-groups] 36 | dev = [ 37 | "polars>=1.31.0", 38 | "ipywidgets>=8.1.6", 39 | "ipykernel>=6.29.5", 40 | "matplotlib>=3.10.3", 41 | "seaborn>=0.13.2", 42 | ] 43 | -------------------------------------------------------------------------------- /src/art/utils/old_benchmarking/generate_comparison_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from .load_benchmarked_models import load_benchmarked_models 4 | from .types import BenchmarkedModelKey 5 | 6 | 7 | def generate_comparison_table( 8 | project: str, 9 | benchmark_keys: list[BenchmarkedModelKey], 10 | metrics: list[str] = ["reward"], 11 | api_path: str = "./.art", 12 | ) -> pd.DataFrame: 13 | benchmarked_models = load_benchmarked_models( 14 | project, benchmark_keys, metrics, api_path 15 | ) 16 | 17 | rows: list[dict[str, str]] = [] 18 | 19 | for benchmarked_model in benchmarked_models: 20 | for step in benchmarked_model.steps: 21 | row = { 22 | "Model": benchmarked_model.model_key.model, 23 | "Split": benchmarked_model.model_key.split, 24 | "Step": f"{step.index:04d}", 25 | } 26 | for metric in metrics: 27 | row[metric] = str(step.metrics.get(metric, "N/A")) 28 | rows.append(row) 29 | 30 | return pd.DataFrame(rows, columns=pd.Index(["Model", "Split", "Step"] + metrics)) 31 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/think.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class Think(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], thought: str) -> str: 11 | return "" 12 | 13 | @staticmethod 14 | def get_info() -> Dict[str, 
Any]: 15 | return { 16 | "type": "function", 17 | "function": { 18 | "name": "think", 19 | "description": "Use the tool to think about something. It will not obtain new information or change the database, but just append the thought to the log. Use it when complex reasoning is needed.", 20 | "parameters": { 21 | "type": "object", 22 | "properties": { 23 | "thought": { 24 | "type": "string", 25 | "description": "A thought to think about.", 26 | }, 27 | }, 28 | "required": ["thought"], 29 | }, 30 | }, 31 | } 32 | -------------------------------------------------------------------------------- /dev/tau-bench/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Sierra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /dev/new_models/prompts.json: -------------------------------------------------------------------------------- 1 | ["respond with 'yes', 'no', 'maybe'", "respond with 'maybe', 'yes', 'no'", "respond with 'no', 'yes', 'maybe'", "respond with 'yes', 'maybe', 'no'", "respond with yes or no", "respond with maybe or no", "respond with no or maybe", "respond with no or yes", "respond with yes or no", "respond with yes, no, maybe", "respond with maybe, yes, no", "respond with no, yes, maybe", "respond with yes, maybe, no", "respond with yes or no", "respond with maybe or no", "respond with no or maybe", "respond with no or yes", "respond with yes or no", "just respond with 'yes', 'no', 'maybe'", "just respond with 'maybe', 'yes', 'no'", "just respond with 'no', 'yes', 'maybe'", "just respond with 'yes', 'maybe', 'no'", "just respond with yes or no", "just respond with maybe or no", "just respond with no or maybe", "just respond with no or yes", "just respond with yes or no", "just respond with yes, no, maybe", "just respond with maybe, yes, no", "just respond with no, yes, maybe", "just respond with yes, maybe, no", "just respond with yes or no", "just respond with maybe or no", "just respond with no or maybe", "just respond with no or yes", "just respond with yes or no"] -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/model_utils/model/vllm_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import requests 4 | 5 | from tau_bench.model_utils.model.general_model import wrap_temperature 6 | 7 | 8 | def generate_request( 9 | url: str, 10 | prompt: str, 11 | temperature: float = 0.0, 12 | force_json: bool = False, 13 | **req_body_kwargs: Any, 14 | ) -> str: 15 | args = { 16 | "prompt": prompt, 17 | "temperature": wrap_temperature(temperature), 18 | "max_tokens": 4096, 19 | **req_body_kwargs, 20 | } 21 | if force_json: 22 | # the prompt will have a suffix of '```json\n' to indicate that the response should be a JSON object 23 | args["stop"] = ["```"] 24 | res = requests.post( 25 | url, 26 | json=args, 27 | ) 28 | res.raise_for_status() 29 | json_res = res.json() 30 | if "text" not in json_res: 31 | raise ValueError(f"Unexpected response: {json_res}") 32 | elif len(json_res["text"]) == 0: 33 | raise ValueError(f"Empty response: {json_res}") 34 | text = json_res["text"][0] 35 | assert isinstance(text, str) 36 | return text.removeprefix(prompt) 37 | -------------------------------------------------------------------------------- /src/art/yield_trajectory.py: -------------------------------------------------------------------------------- 1 | import contextvars 2 | from typing import Any, Coroutine 3 | 4 | from .trajectories import Trajectory 5 | 6 | 7 | def yield_trajectory(trajectory: Trajectory) -> None: 8 | yield_trajectory_context_var.get().trajectory = trajectory 9 | 10 | 11 | async def capture_yielded_trajectory(coroutine: Coroutine[Any, Any, Any]) -> Trajectory: 12 | with YieldTrajectoryContext(): 13 | await coroutine 14 | trajectory = yield_trajectory_context_var.get().trajectory 15 | if trajectory is None: 16 | raise RuntimeError("No trajectory yielded") 17 | return trajectory 18 | 19 | 20 | class YieldTrajectoryContext: 21 | def __init__(self) -> None: 22 | self.trajectory: Trajectory | None = None 23 | 24 | def __enter__(self) -> None: 25 | self.token = 
yield_trajectory_context_var.set(self) 26 | 27 | def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: 28 | yield_trajectory_context_var.reset(self.token) 29 | 30 | 31 | yield_trajectory_context_var: contextvars.ContextVar[YieldTrajectoryContext] = ( 32 | contextvars.ContextVar("yield_trajectory_context", default=YieldTrajectoryContext()) 33 | ) 34 | -------------------------------------------------------------------------------- /src/art/dev/train.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | from typing_extensions import TypedDict 4 | 5 | 6 | class TrainConfig(TypedDict, total=False): 7 | advantage_balance: float 8 | """Balance between negative and positive advantages in the range [-1.0, 1.0]. \ 9 | -1.0 means only training on negative advantages, 1.0 means only training on \ 10 | positive advantages. Defaults to 0.0 (perfectly balanced).""" 11 | allow_training_without_logprobs: bool 12 | epsilon: float # clip epsilon, using the same name as TRL 13 | epsilon_high: ( 14 | float | None 15 | ) # asymmetric clip upper bound. Defaults to epsilon when None 16 | importance_sampling_level: Literal[ 17 | "token", "sequence", "average", "geometric_average" 18 | ] 19 | kimi_k2_tau: float | None 20 | logprob_calculation_chunk_size: int 21 | mask_prob_ratio: bool 22 | max_negative_advantage_importance_sampling_weight: float 23 | num_trajectories_learning_rate_multiplier_power: float 24 | plot_tensors: bool 25 | ppo: bool 26 | precalculate_logprobs: bool 27 | scale_learning_rate_by_reward_std_dev: bool 28 | scale_rewards: bool 29 | truncated_importance_sampling: float | None 30 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/get_user_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetUserDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], user_id: str) -> str: 12 | users = data["users"] 13 | if user_id in users: 14 | return json.dumps(users[user_id]) 15 | return "Error: user not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_user_details", 23 | "description": "Get the details of a user, including their orders.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "user_id": { 28 | "type": "string", 29 | "description": "The user id, such as 'sara_doe_496'.", 30 | }, 31 | }, 32 | "required": ["user_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Code Quality Checks 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | push: 7 | branches: [ main ] 8 | 9 | jobs: 10 | quality-checks: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout code 15 | uses: actions/checkout@v4 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.10' 21 | 22 | - name: Install uv 23 | run: | 24 | curl -LsSf https://astral.sh/uv/install.sh | sh 25 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 26 | 27 | - name: Install dependencies 28 | run: | 29 | uv sync --all-extras 30 | 31 | - name: Run code 
quality checks 32 | run: | 33 | ./scripts/run_checks.sh --verbose-test-failure || { 34 | echo "" 35 | echo "❌ Code quality checks failed!" 36 | echo "" 37 | echo "To fix these issues locally, run:" 38 | echo " ./scripts/run_checks.sh --fix" 39 | echo "" 40 | echo "Then commit and push the changes." 41 | echo "" 42 | echo "For more details, see CONTRIBUTING.md" 43 | exit 1 44 | } -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/get_user_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetUserDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], user_id: str) -> str: 12 | users = data["users"] 13 | if user_id in users: 14 | return json.dumps(users[user_id]) 15 | return "Error: user not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_user_details", 23 | "description": "Get the details of a user, including their reservations.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "user_id": { 28 | "type": "string", 29 | "description": "The user id, such as 'sara_doe_496'.", 30 | }, 31 | }, 32 | "required": ["user_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/list_all_product_types.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class ListAllProductTypes(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any]) -> str: 12 | products = data["products"] 13 | product_dict = { 14 | product["name"]: product["product_id"] for product in products.values() 15 | } 16 | product_dict = dict(sorted(product_dict.items())) 17 | return json.dumps(product_dict) 18 | 19 | @staticmethod 20 | def get_info() -> Dict[str, Any]: 21 | return { 22 | "type": "function", 23 | "function": { 24 | "name": "list_all_product_types", 25 | "description": "List the name and product id of all product types. Each product type has a variety of different items with unique item ids and options. There are only 50 product types in the store.", 26 | "parameters": { 27 | "type": "object", 28 | "properties": {}, 29 | "required": [], 30 | }, 31 | }, 32 | } 33 | -------------------------------------------------------------------------------- /src/art/utils/benchmarking/filter_model_split.py: -------------------------------------------------------------------------------- 1 | try: 2 | import polars as pl 3 | except ImportError: 4 | raise ImportError( 5 | "Plotting dependencies are not installed. 
Please install them with: " 6 | "pip install openpipe-art[plotting]" 7 | ) 8 | 9 | from art.utils.benchmarking.types import BenchmarkModelKey 10 | 11 | 12 | def filter_rename_model_split( 13 | df: pl.DataFrame, models: list[BenchmarkModelKey] 14 | ) -> pl.DataFrame: 15 | # filter by combinations of name + split 16 | z = pl.fold( 17 | acc=pl.lit(False), 18 | function=lambda acc, expr: acc | expr, 19 | exprs=[ 20 | (pl.col("model") == model.name) & (pl.col("split") == model.split) 21 | for model in models 22 | ], 23 | ) 24 | 25 | df = df.filter(z) 26 | 27 | for model in models: 28 | if model.name != model.display_name: 29 | df = df.with_columns( 30 | pl.when( 31 | (pl.col("model") == model.name) & (pl.col("split") == model.split) 32 | ) 33 | .then(pl.lit(model.display_name)) 34 | .otherwise(pl.col("model")) 35 | .alias("model") 36 | ) 37 | 38 | return df 39 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Optional, Union 4 | 5 | from tau_bench.envs.base import Env 6 | from tau_bench.envs.user import UserStrategy 7 | 8 | 9 | def get_env( 10 | env_name: str, 11 | user_strategy: Union[str, UserStrategy], 12 | user_model: str, 13 | task_split: str, 14 | user_provider: Optional[str] = None, 15 | task_index: Optional[int] = None, 16 | ) -> Env: 17 | if env_name == "retail": 18 | from tau_bench.envs.retail import MockRetailDomainEnv 19 | 20 | return MockRetailDomainEnv( 21 | user_strategy=user_strategy, 22 | user_model=user_model, 23 | task_split=task_split, 24 | user_provider=user_provider, 25 | task_index=task_index, 26 | ) 27 | elif env_name == "airline": 28 | from tau_bench.envs.airline import MockAirlineDomainEnv 29 | 30 | return MockAirlineDomainEnv( 31 | user_strategy=user_strategy, 32 | user_model=user_model, 33 | task_split=task_split, 34 | user_provider=user_provider, 35 | task_index=task_index, 36 | ) 37 | else: 38 | raise ValueError(f"Unknown environment: {env_name}") 39 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from .book_reservation import BookReservation 4 | from .calculate import Calculate 5 | from .cancel_reservation import CancelReservation 6 | from .get_reservation_details import GetReservationDetails 7 | from .get_user_details import GetUserDetails 8 | from .list_all_airports import ListAllAirports 9 | from .search_direct_flight import SearchDirectFlight 10 | from .search_onestop_flight import SearchOnestopFlight 11 | from .send_certificate import SendCertificate 12 | from .think import Think 13 | from .transfer_to_human_agents import TransferToHumanAgents 14 | from .update_reservation_baggages import UpdateReservationBaggages 15 | from .update_reservation_flights import UpdateReservationFlights 16 | from .update_reservation_passengers import UpdateReservationPassengers 17 | 18 | ALL_TOOLS = [ 19 | BookReservation, 20 | Calculate, 21 | CancelReservation, 22 | GetReservationDetails, 23 | GetUserDetails, 24 | ListAllAirports, 25 | SearchDirectFlight, 26 | SearchOnestopFlight, 27 | SendCertificate, 28 | Think, 29 | TransferToHumanAgents, 30 | UpdateReservationBaggages, 31 | UpdateReservationFlights, 32 | UpdateReservationPassengers, 33 | ] 34 | 
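# NOTE (illustrative): each class in ALL_TOOLS implements the shared Tool
# interface -- a static invoke(data, ...) plus a static get_info() returning an
# OpenAI-style function schema. A hedged sketch of turning this registry into a
# chat-completions tool list; `openai_tools` and `tool_map` are names we made
# up for illustration:
#
#     openai_tools = [tool.get_info() for tool in ALL_TOOLS]
#     tool_map = {spec["function"]["name"]: tool
#                 for tool, spec in zip(ALL_TOOLS, openai_tools)}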
-------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # OpenPipe Documentation 2 | 3 | This repository contains the source for the ART documentation website hosted at [https://art.openpipe.ai](https://art.openpipe.ai). 4 | 5 | ## Prerequisites 6 | 7 | Ensure you have the following packages installed on your machine: 8 | 9 | - [pnpm](https://pnpm.io/installation) 10 | - [node](https://nodejs.org/en/download/) 11 | 12 | ## Contributing 13 | 14 | To edit the documentation, follow these steps: 15 | 16 | 1. Clone the repository 17 | 2. Navigate to the `docs` directory 18 | 3. Run `pnpm install` to install the dependencies 19 | 4. Run `pnpm dev` to start the development server 20 | 5. Edit the files in the `docs` directory 21 | 22 | Edits to files should be reflected immediately in the development server. 23 | 24 | ### Adding new pages 25 | 26 | 1. Create a new .mdx file in the `docs` directory 27 | 2. Navigate to the `docs.json` file and add the new page to the appropriate section of the `navigation` array, or create a new section. Ensure that the path to the new page is correct. 28 | 29 | ### Deploying changes 30 | 31 | To deploy changes to the hosted docs, commit your changes in a new git branch and create a pull request. Once the pull request is merged, the changes will be deployed to the hosted docs. 32 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/get_reservation_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetReservationDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], reservation_id: str) -> str: 12 | reservations = data["reservations"] 13 | if reservation_id in reservations: 14 | return json.dumps(reservations[reservation_id]) 15 | return "Error: reservation not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_reservation_details", 23 | "description": "Get the details of a reservation.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "reservation_id": { 28 | "type": "string", 29 | "description": "The reservation id, such as '8JX2WO'.", 30 | }, 31 | }, 32 | "required": ["reservation_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/get_order_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetOrderDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], order_id: str) -> str: 12 | orders = data["orders"] 13 | if order_id in orders: 14 | return json.dumps(orders[order_id]) 15 | return "Error: order not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_order_details", 23 | "description": "Get the status and details of an order.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "order_id": { 28 | "type": "string", 29 | "description": "The order id, such 
as '#W0000000'. Be careful there is a '#' symbol at the beginning of the order id.", 30 | }, 31 | }, 32 | "required": ["order_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /examples/mcp-rl/servers/python/mcp_alphavantage/README.md: -------------------------------------------------------------------------------- 1 | # MCP AlphaVantage Python Server 2 | 3 | A Python implementation of the MCP server for Alpha Vantage financial data API. 4 | 5 | ## Features 6 | 7 | - Real-time stock quotes 8 | - Daily time series data 9 | - Symbol search 10 | - Company overview/fundamentals 11 | - Technical indicators (SMA, RSI) 12 | 13 | ## Setup 14 | 15 | 1. Get an API key from [Alpha Vantage](https://www.alphavantage.co/support/#api-key) 16 | 2. Set the environment variable: 17 | ```bash 18 | export ALPHAVANTAGE_API_KEY=your_api_key_here 19 | ``` 20 | 21 | ## Usage 22 | 23 | ### Command Line 24 | ```bash 25 | python server.py --api-key YOUR_API_KEY 26 | ``` 27 | 28 | ### With Environment Variable 29 | ```bash 30 | export ALPHAVANTAGE_API_KEY=your_api_key 31 | python server.py 32 | ``` 33 | 34 | ### Available Tools 35 | 36 | - `get_stock_quote`: Get real-time stock quote 37 | - `get_time_series_daily`: Get daily stock data 38 | - `search_symbol`: Search for stock symbols 39 | - `get_company_overview`: Get company fundamentals 40 | - `get_sma`: Simple Moving Average indicator 41 | - `get_rsi`: Relative Strength Index indicator 42 | 43 | ## Transport Options 44 | 45 | - `stdio` (default): Standard input/output transport 46 | - `sse`: Server-sent events over HTTP -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/get_product_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetProductDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], product_id: str) -> str: 12 | products = data["products"] 13 | if product_id in products: 14 | return json.dumps(products[product_id]) 15 | return "Error: product not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_product_details", 23 | "description": "Get the inventory details of a product.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "product_id": { 28 | "type": "string", 29 | "description": "The product id, such as '6086499569'. 
Be careful the product id is different from the item id.", 30 | }, 31 | }, 32 | "required": ["product_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/find_user_id_by_email.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class FindUserIdByEmail(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], email: str) -> str: 11 | users = data["users"] 12 | for user_id, profile in users.items(): 13 | if profile["email"].lower() == email.lower(): 14 | return user_id 15 | return "Error: user not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "find_user_id_by_email", 23 | "description": "Find user id by email. If the user is not found, the function will return an error message.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "email": { 28 | "type": "string", 29 | "description": "The email of the user, such as 'something@example.com'.", 30 | }, 31 | }, 32 | "required": ["email"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /src/art/transformers/patches.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Optional, Union 2 | 3 | import torch 4 | from transformers import masking_utils 5 | from transformers.cache_utils import Cache 6 | from transformers.configuration_utils import PretrainedConfig 7 | 8 | if TYPE_CHECKING: 9 | from torch.nn.attention.flex_attention import BlockMask 10 | 11 | _preprocess_mask_arguments = masking_utils._preprocess_mask_arguments 12 | 13 | 14 | def _patched_preprocess_mask_arguments( 15 | config: PretrainedConfig, 16 | input_embeds: torch.Tensor, 17 | attention_mask: Optional[Union[torch.Tensor, "BlockMask"]], 18 | cache_position: torch.Tensor, 19 | past_key_values: Optional[Cache], 20 | position_ids: Optional[torch.Tensor], 21 | layer_idx: Optional[int], 22 | ) -> tuple[bool, Optional[Union[torch.Tensor, "BlockMask"]], int, int]: 23 | if position_ids is not None and len(position_ids.shape) == 3: 24 | position_ids = position_ids[0] 25 | return _preprocess_mask_arguments( 26 | config, 27 | input_embeds, 28 | attention_mask, 29 | cache_position, 30 | past_key_values, 31 | position_ids, 32 | layer_idx, 33 | ) 34 | 35 | 36 | def patch_preprocess_mask_arguments() -> None: 37 | masking_utils._preprocess_mask_arguments = _patched_preprocess_mask_arguments 38 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/transfer_to_human_agents.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class TransferToHumanAgents(Tool): 9 | @staticmethod 10 | def invoke( 11 | data: Dict[str, Any], 12 | summary: str, 13 | ) -> str: 14 | return "Transfer successful" 15 | 16 | @staticmethod 17 | def get_info() -> Dict[str, Any]: 18 | return { 19 | "type": "function", 20 | "function": { 21 | "name": "transfer_to_human_agents", 22 | "description": "Transfer the user to a human agent, with a summary of the user's issue. 
Only transfer if the user explicitly asks for a human agent, or if the user's issue cannot be resolved by the agent with the available tools.", 23 | "parameters": { 24 | "type": "object", 25 | "properties": { 26 | "summary": { 27 | "type": "string", 28 | "description": "A summary of the user's issue.", 29 | }, 30 | }, 31 | "required": [ 32 | "summary", 33 | ], 34 | }, 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Load environment variables from .env file if it exists 4 | if [ -f .env ]; then 5 | # Read .env file line by line, ignoring comments and empty lines 6 | while IFS= read -r line || [ -n "$line" ]; do 7 | # Skip comments and empty lines 8 | [[ $line =~ ^#.*$ ]] && continue 9 | [[ -z $line ]] && continue 10 | 11 | # Export the variable 12 | export "$line" 13 | done < .env 14 | fi 15 | 16 | # Configure git user name and email 17 | git config --global user.name "${GIT_USER_NAME}" 18 | git config --global user.email "${GIT_USER_EMAIL}" 19 | git config --global --add safe.directory /root/sky_workdir 20 | 21 | if [ "${GIT_RESET_CLEAN:-true}" = "true" ]; then 22 | # Reset any uncommitted changes to the last commit 23 | git reset --hard HEAD 24 | 25 | # Remove all untracked files and directories 26 | git clean -fd 27 | else 28 | echo "Skipping git reset/clean (GIT_RESET_CLEAN is not true). Preserving synced working tree." 29 | fi 30 | 31 | # Install astral-uv 32 | sudo snap install --classic astral-uv 33 | 34 | # Update uv 35 | uv self update 36 | 37 | # Install tmux 38 | apt install tmux -y 39 | 40 | # Sync the dependencies 41 | if [ "${INSTALL_EXTRAS:-false}" = "true" ]; then 42 | uv sync --all-extras 43 | else 44 | uv sync --extra backend 45 | fi -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/think.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class Think(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], thought: str) -> str: 11 | # This method does not change the state of the data; it simply returns an empty string. 12 | return "Thought Completed" 13 | 14 | @staticmethod 15 | def get_info() -> Dict[str, Any]: 16 | return { 17 | "type": "function", 18 | "function": { 19 | "name": "think", 20 | "description": ( 21 | "Use the tool to think about something. It will not obtain new information or change the database, " 22 | "but just append the thought to the log. Use it when complex reasoning or some cache memory is needed." 
23 | ), 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "thought": { 28 | "type": "string", 29 | "description": "A thought to think about.", 30 | }, 31 | }, 32 | "required": ["thought"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/test_skypilot/register_model.py: -------------------------------------------------------------------------------- 1 | """Register a trainable model with an ART server running on a SkyPilot cluster.""" 2 | 3 | import asyncio 4 | 5 | from dotenv import load_dotenv 6 | from pydantic import BaseModel 7 | 8 | import art 9 | from art.skypilot.backend import SkyPilotBackend 10 | 11 | load_dotenv() 12 | 13 | 14 | class ComplexModelConfig(BaseModel): 15 | max_turns: int = 5 16 | max_tokens: int = 2048 17 | 18 | base_model: str = "Qwen/Qwen2.5-14B-Instruct" 19 | # Random seed to control which subset of the training data is sampled 20 | training_dataset_seed: int | None = None 21 | 22 | # Training configuration 23 | num_epochs: int = 1 # declared here so the ComplexModelConfig(num_epochs=...) call below validates 24 | scale_rewards: bool = True 25 | 26 | 27 | async def register_model(): 28 | backend = await SkyPilotBackend().initialize_cluster( 29 | cluster_name="test-skypilot", 30 | gpu="H100-SXM", 31 | env_path=".env", 32 | # force_restart=True, 33 | ) 34 | 35 | model = art.TrainableModel( 36 | name="complex-model", 37 | project="test-skypilot", 38 | base_model="Qwen/Qwen2.5-14B-Instruct", 39 | config=ComplexModelConfig( 40 | num_epochs=160, 41 | ), 42 | ) 43 | 44 | await backend.register(model) 45 | 46 | print("model registered") 47 | 48 | 49 | if __name__ == "__main__": 50 | asyncio.run(register_model()) 51 | -------------------------------------------------------------------------------- /src/art/vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM integration module for art.""" 2 | 3 | # Engine and worker management 4 | # (server helpers are imported from .server below) 5 | from .engine import ( 6 | WorkerExtension, 7 | create_engine_pause_and_resume_functions, 8 | get_llm, 9 | get_worker, 10 | run_on_workers, 11 | ) 12 | 13 | # Patches - these are typically imported for their side effects 14 | from .patches import ( 15 | patch_allocator, 16 | patch_get_lora_tokenizer_async, 17 | patch_listen_for_disconnect, 18 | patch_lora_request, 19 | patch_multi_step_model_runner, 20 | patch_tool_parser_manager, 21 | subclass_chat_completion_request, 22 | ) 23 | from .server import ( 24 | get_uvicorn_logging_config, 25 | openai_server_task, 26 | set_vllm_log_file, 27 | ) 28 | 29 | __all__ = [ 30 | # Server 31 | "openai_server_task", 32 | "get_uvicorn_logging_config", 33 | "set_vllm_log_file", 34 | # Engine 35 | "get_llm", 36 | "create_engine_pause_and_resume_functions", 37 | "run_on_workers", 38 | "get_worker", 39 | "WorkerExtension", 40 | # Patches 41 | "patch_allocator", 42 | "subclass_chat_completion_request", 43 | "patch_lora_request", 44 | "patch_get_lora_tokenizer_async", 45 | "patch_listen_for_disconnect", 46 | "patch_tool_parser_manager", 47 | "patch_multi_step_model_runner", 48 | ] 49 | -------------------------------------------------------------------------------- /examples/hn_title_generator/skypilot-reference-grpo-trainer.yaml: -------------------------------------------------------------------------------- 1 | # To launch, run the following command from the root directory of the art repository: 2 | # `uv run sky launch examples/hn_title_generator/skypilot-reference-grpo-trainer.yaml --cluster=kyle-hn-title-generator-002 --env-file=.env --yes 
--retry-until-up --down --idle-minutes-to-autostop 60` 3 | 4 | resources: 5 | image_id: pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel 6 | cloud: runpod 7 | region: US 8 | accelerators: 9 | - "H100-SXM" 10 | 11 | workdir: . 12 | 13 | envs: 14 | HF_HUB_ENABLE_HF_TRANSFER: 1 15 | VLLM_CONFIGURE_LOGGING: 0 16 | 17 | setup: | 18 | apt-get update && apt-get install -y git 19 | 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | 22 | # Source the environment to make uv available 23 | source $HOME/.local/bin/env 24 | 25 | uv pip install --system \ 26 | unsloth==2025.3.19 \ 27 | vllm==0.8.2 \ 28 | bitsandbytes==0.45.4 \ 29 | datasets==3.3.2 \ 30 | s3fs==2024.12.0 \ 31 | hf-transfer==0.1.9 \ 32 | typer==0.15.2 \ 33 | fastapi==0.115.11 \ 34 | python-dotenv==1.0.1 \ 35 | polars==1.24.0 \ 36 | wandb==0.19.8 \ 37 | git+https://github.com/corbt/panza.git \ 38 | 39 | echo "Setup complete" 40 | 41 | run: | 42 | echo "Running train_grpo.py" 43 | uv run python examples/hn_title_generator/reference_grpo_trainer.py 44 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/model_utils/model/outlines_completion.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from pydantic import BaseModel 4 | 5 | from tau_bench.model_utils.api.datapoint import Datapoint 6 | from tau_bench.model_utils.model.vllm_completion import VLLMCompletionModel 7 | from tau_bench.model_utils.model.vllm_utils import generate_request 8 | 9 | 10 | class OutlinesCompletionModel(VLLMCompletionModel): 11 | def parse_force_from_prompt( 12 | self, prompt: str, typ: BaseModel, temperature: float | None = None 13 | ) -> dict[str, Any]: 14 | if temperature is None: 15 | temperature = self.temperature 16 | schema = typ.model_json_schema() 17 | res = generate_request( 18 | url=self.url, 19 | prompt=prompt, 20 | force_json=True, 21 | schema=schema, 22 | temperature=temperature, 23 | ) 24 | return self.handle_parse_force_response(prompt=prompt, content=res) 25 | 26 | def get_approx_cost(self, dp: Datapoint) -> float: 27 | return super().get_approx_cost(dp) 28 | 29 | def get_latency(self, dp: Datapoint) -> float: 30 | return super().get_latency(dp) 31 | 32 | def get_capability(self) -> float: 33 | return super().get_capability() 34 | 35 | def supports_dp(self, dp: Datapoint) -> bool: 36 | return super().supports_dp(dp) 37 | -------------------------------------------------------------------------------- /scripts/launch-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLUSTER_NAME="art" 4 | 5 | # Parse arguments 6 | ARGS=() 7 | PULL_LATEST=true 8 | while [[ $# -gt 0 ]]; do 9 | case "$1" in 10 | -c) 11 | CLUSTER_NAME="$2" 12 | shift 2 13 | ;; 14 | --no-pull) 15 | PULL_LATEST=false 16 | shift 1 17 | ;; 18 | *) 19 | ARGS+=("$1") 20 | shift 21 | ;; 22 | esac 23 | done 24 | 25 | # Check for unstaged changes 26 | if ! git diff --quiet; then 27 | echo "Warning: You have unstaged changes. Unstaged changes will be discarded from the cluster working directory." 28 | fi 29 | 30 | # Check for uncommitted changes 31 | if ! git diff --cached --quiet; then 32 | echo "Warning: You have uncommitted changes. Uncommitted changes will be discarded from the cluster working directory." 33 | fi 34 | 35 | if [[ "$PULL_LATEST" == true ]]; then 36 | echo "Pulling latest changes..." 37 | if ! git pull; then 38 | echo "Error: Failed to pull latest changes." 
39 | exit 1 40 | fi 41 | else 42 | echo "Skipping git pull (deploying current working tree). To pull latest, omit --no-pull." 43 | # Preserve synced working tree on remote by disabling reset/clean. 44 | ARGS+=(--env "GIT_RESET_CLEAN=false") 45 | fi 46 | 47 | # Launch the cluster 48 | uv run sky launch skypilot-config.yaml -c "$CLUSTER_NAME" --env-file .env -y "${ARGS[@]}" -------------------------------------------------------------------------------- /src/art/utils/output_dirs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from art.model import Model 4 | from art.utils.get_repo_root_path import get_repo_root_path 5 | 6 | 7 | def get_default_art_path() -> str: 8 | root_path = get_repo_root_path() 9 | return os.path.join(root_path, ".art") 10 | 11 | 12 | def get_models_dir(project_name: str, art_path: str | None = None) -> str: 13 | if art_path is None: 14 | art_path = get_default_art_path() 15 | return f"{art_path}/{project_name}/models" 16 | 17 | 18 | def get_model_dir(model: Model, art_path: str | None = None) -> str: 19 | if art_path is None: 20 | art_path = get_default_art_path() 21 | return f"{art_path}/{model.project}/models/{model.name}" 22 | 23 | 24 | def get_output_dir_from_model_properties( 25 | project: str, name: str, art_path: str | None = None 26 | ) -> str: 27 | if art_path is None: 28 | art_path = get_default_art_path() 29 | return f"{art_path}/{project}/models/{name}" 30 | 31 | 32 | def get_step_checkpoint_dir(model_output_dir: str, step: int) -> str: 33 | return f"{model_output_dir}/checkpoints/{step:04d}" 34 | 35 | 36 | def get_trajectories_dir(model_output_dir: str) -> str: 37 | return f"{model_output_dir}/trajectories" 38 | 39 | 40 | def get_trajectories_split_dir(model_output_dir: str, split: str) -> str: 41 | return f"{model_output_dir}/trajectories/{split}" 42 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/transfer_to_human_agents.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class TransferToHumanAgents(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], summary: str) -> str: 11 | # This method simulates the transfer to a human agent. 12 | return "Transfer successful" 13 | 14 | @staticmethod 15 | def get_info() -> Dict[str, Any]: 16 | return { 17 | "type": "function", 18 | "function": { 19 | "name": "transfer_to_human_agents", 20 | "description": ( 21 | "Transfer the user to a human agent, with a summary of the user's issue. " 22 | "Only transfer if the user explicitly asks for a human agent, or if the user's issue cannot be resolved by the agent with the available tools." 
23 | ), 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "summary": { 28 | "type": "string", 29 | "description": "A summary of the user's issue.", 30 | }, 31 | }, 32 | "required": ["summary"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/env.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Optional, Union 4 | 5 | from tau_bench.envs.airline.data import load_data 6 | from tau_bench.envs.airline.rules import RULES 7 | from tau_bench.envs.airline.tools import ALL_TOOLS 8 | from tau_bench.envs.airline.wiki import WIKI 9 | from tau_bench.envs.base import Env 10 | from tau_bench.envs.user import UserStrategy 11 | 12 | 13 | class MockAirlineDomainEnv(Env): 14 | def __init__( 15 | self, 16 | user_strategy: Union[str, UserStrategy] = UserStrategy.LLM, 17 | user_model: str = "gpt-4o", 18 | user_provider: Optional[str] = None, 19 | task_split: str = "test", 20 | task_index: Optional[int] = None, 21 | ): 22 | match task_split: 23 | case "test": 24 | from tau_bench.envs.airline.tasks_test import TASKS as tasks 25 | case _: 26 | raise ValueError(f"Unknown task split: {task_split}") 27 | super().__init__( 28 | data_load_func=load_data, 29 | tools=ALL_TOOLS, 30 | tasks=tasks, 31 | wiki=WIKI, 32 | rules=RULES, 33 | user_strategy=user_strategy, 34 | user_model=user_model, 35 | user_provider=user_provider, 36 | task_index=task_index, 37 | ) 38 | self.terminate_tools = ["transfer_to_human_agents"] 39 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/calculate.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class Calculate(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], expression: str) -> str: 11 | if not all(char in "0123456789+-*/(). " for char in expression): 12 | return "Error: invalid characters in expression" 13 | try: 14 | return str(round(float(eval(expression, {"__builtins__": None}, {})), 2)) 15 | except Exception as e: 16 | return f"Error: {e}" 17 | 18 | @staticmethod 19 | def get_info() -> Dict[str, Any]: 20 | return { 21 | "type": "function", 22 | "function": { 23 | "name": "calculate", 24 | "description": "Calculate the result of a mathematical expression.", 25 | "parameters": { 26 | "type": "object", 27 | "properties": { 28 | "expression": { 29 | "type": "string", 30 | "description": "The mathematical expression to calculate, such as '2 + 2'. 
The expression can contain numbers, operators (+, -, *, /), parentheses, and spaces.", 31 | }, 32 | }, 33 | "required": ["expression"], 34 | }, 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /src/art/skypilot/stop_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | 4 | import sky 5 | 6 | from art.skypilot.backend import SkyPilotBackend 7 | from art.skypilot.utils import is_task_created, to_thread_typed 8 | 9 | parser = argparse.ArgumentParser( 10 | description="Close the art server hosted on a skypilot cluster" 11 | ) 12 | parser.add_argument( 13 | "--cluster", 14 | type=str, 15 | required=True, 16 | help="The name of the skypilot cluster to close the art server on", 17 | ) 18 | args = parser.parse_args() 19 | 20 | 21 | async def stop_server() -> None: 22 | cluster_status = await to_thread_typed( 23 | lambda: sky.stream_and_get(sky.status(cluster_names=[args.cluster])) 24 | ) 25 | if len(cluster_status) == 0 or cluster_status[0]["status"] != sky.ClusterStatus.UP: 26 | raise ValueError(f"Cluster {args.cluster} is not running") 27 | 28 | if not await is_task_created(cluster_name=args.cluster, task_name="art_server"): 29 | raise ValueError(f"Art server task for cluster {args.cluster} is not running") 30 | 31 | backend = await SkyPilotBackend.initialize_cluster( 32 | cluster_name=args.cluster, art_version=".", env_path=".env", gpu="H100" 33 | ) 34 | await backend.close() 35 | 36 | # cancel the art server task 37 | await to_thread_typed(lambda: sky.cancel(cluster_name=args.cluster, all=True)) 38 | 39 | 40 | def main() -> None: 41 | asyncio.run(stop_server()) 42 | -------------------------------------------------------------------------------- /AGENT.md: -------------------------------------------------------------------------------- 1 | ## uv package manager by default 2 | 3 | This project uses the `uv` package manager. 4 | 5 | - To add a dependency, run `uv add `. 6 | - To run a script, run `uv run