├── CLAUDE.md
├── src
├── art
│   ├── py.typed
│   ├── unsloth
│   │   └── __init__.py
│   ├── preprocessing
│   │   └── __init__.py
│   ├── torchtune
│   │   ├── __init__.py
│   │   ├── batch.py
│   │   └── train.sh
│   ├── transformers
│   │   ├── __init__.py
│   │   └── patches.py
│   ├── local
│   │   ├── __init__.py
│   │   ├── service.py
│   │   └── checkpoints.py
│   ├── skypilot
│   │   ├── __init__.py
│   │   └── stop_server.py
│   ├── serverless
│   │   └── __init__.py
│   ├── rewards
│   │   └── __init__.py
│   ├── langgraph
│   │   ├── __init__.py
│   │   └── logging.py
│   ├── utils
│   │   ├── benchmarking
│   │   │   ├── charts
│   │   │   │   └── __init__.py
│   │   │   ├── types.py
│   │   │   ├── filter_model_split.py
│   │   │   └── pull_model_trajectories.py
│   │   ├── old_benchmarking
│   │   │   ├── display_image_grid.py
│   │   │   ├── calculate_step_metrics.py
│   │   │   ├── generate_comparison_table.py
│   │   │   └── types.py
│   │   ├── __init__.py
│   │   ├── get_repo_root_path.py
│   │   ├── deployment
│   │   │   ├── legacy.py
│   │   │   └── __init__.py
│   │   ├── format_message.py
│   │   ├── limit_concurrency.py
│   │   ├── log_http_errors.py
│   │   ├── get_model_step.py
│   │   ├── benchmark_rollout.py
│   │   ├── logging.py
│   │   ├── output_dirs.py
│   │   ├── strip_logprobs.py
│   │   └── deploy_model.py
│   ├── mcp
│   │   ├── default_tools.py
│   │   └── __init__.py
│   ├── dev
│   │   ├── __init__.py
│   │   ├── train.py
│   │   └── torchtune.py
│   ├── types.py
│   ├── yield_trajectory.py
│   ├── vllm
│   │   └── __init__.py
│   ├── errors.py
│   ├── batches.py
│   └── __init__.py
└── mp_actors
│   ├── __init__.py
│   └── traceback.py
├── .python-version
├── dev
├── swebench
│   ├── __init__.py
│   ├── tools
│   │   ├── registry
│   │   │   ├── lib
│   │   │   │   ├── __init__.py
│   │   │   │   └── registry.py
│   │   │   ├── config.yaml
│   │   │   ├── install.sh
│   │   │   └── bin
│   │   │   │   ├── _write_env
│   │   │   │   └── _read_env
│   │   ├── review_on_submit_m
│   │   │   ├── install.sh
│   │   │   ├── README.md
│   │   │   ├── config.yaml
│   │   │   └── bin
│   │   │   │   └── submit
│   │   └── edit_anthropic
│   │   │   ├── install.sh
│   │   │   └── bin
│   │   │   └── _state_anthropic
│   ├── sandbox
│   │   ├── __init__.py
│   │   ├── daytona.py
│   │   ├── modal.py
│   │   └── new.py
│   ├── sandboxes.py
│   ├── run.py
│   ├── pyproject.toml
│   └── trl.ipynb
├── tau-bench
│   ├── .python-version
│   ├── tau_bench
│   │   ├── model_utils
│   │   │   ├── api
│   │   │   │   ├── __init__.py
│   │   │   │   ├── types.py
│   │   │   │   ├── _model_methods.py
│   │   │   │   ├── exception.py
│   │   │   │   └── logging.py
│   │   │   ├── model
│   │   │   │   ├── __init__.py
│   │   │   │   ├── exception.py
│   │   │   │   ├── vllm_utils.py
│   │   │   │   └── outlines_completion.py
│   │   │   ├── func_tools
│   │   │   │   ├── __init__.py
│   │   │   │   ├── filter.py
│   │   │   │   └── map.py
│   │   │   └── args.py
│   │   ├── agents
│   │   │   ├── __init__.py
│   │   │   └── base.py
│   │   ├── envs
│   │   │   ├── airline
│   │   │   │   ├── rules.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── wiki.py
│   │   │   │   ├── data
│   │   │   │   │   └── __init__.py
│   │   │   │   ├── tools
│   │   │   │   │   ├── think.py
│   │   │   │   │   ├── get_user_details.py
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── get_reservation_details.py
│   │   │   │   │   ├── transfer_to_human_agents.py
│   │   │   │   │   ├── calculate.py
│   │   │   │   │   ├── cancel_reservation.py
│   │   │   │   │   ├── list_all_airports.py
│   │   │   │   │   ├── send_certificate.py
│   │   │   │   │   ├── search_direct_flight.py
│   │   │   │   │   └── update_reservation_passengers.py
│   │   │   │   └── env.py
│   │   │   ├── retail
│   │   │   │   ├── __init__.py
│   │   │   │   ├── wiki.py
│   │   │   │   ├── data
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── readme.md
│   │   │   │   ├── rules.py
│   │   │   │   ├── tools
│   │   │   │   │   ├── get_user_details.py
│   │   │   │   │   ├── list_all_product_types.py
│   │   │   │   │   ├── get_order_details.py
│   │   │   │   │   ├── get_product_details.py
│   │   │   │   │   ├── find_user_id_by_email.py
│   │   │   │   │   ├── think.py
│   │   │   │   │   ├── transfer_to_human_agents.py
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── calculate.py
│   │   │   │   │   └── find_user_id_by_name_zip.py
│   │   │   │   └── env.py
│   │   │   ├── tool.py
│   │   │   └── __init__.py
│   │   └── __init__.py
│   ├── .gitignore
│   ├── MANIFEST.in
│   ├── packed_tensor_images
│   │   ├── packed_tensors_plot_1752190878.png
│   │   ├── packed_tensors_plot_1752193757.png
│   │   ├── packed_tensors_plot_1752196743.png
│   │   ├── packed_tensors_plot_1752199731.png
│   │   ├── packed_tensors_plot_1752202622.png
│   │   ├── packed_tensors_plot_1752205600.png
│   │   ├── packed_tensors_plot_1752208547.png
│   │   ├── packed_tensors_plot_1752211467.png
│   │   ├── packed_tensors_plot_1752214557.png
│   │   └── packed_tensors_plot_1752217461.png
│   ├── check.py
│   ├── setup.py
│   ├── pyproject.toml
│   └── LICENSE
├── playwright_agent
│   ├── pyproject.toml
│   └── job_desc_dataset.json
├── test_skypilot
│   ├── launch.py
│   ├── launch_tail.py
│   └── register_model.py
└── new_models
│   ├── prompts.json
│   ├── qwen3_try.py
│   └── gemma3.py
├── examples
├── 2048
│   ├── generate_benchmarks.py
│   └── train.py
├── mcp-rl
│   ├── README.md
│   ├── servers
│   │   └── python
│   │   │   ├── mcp_balldontlie
│   │   │   ├── __init__.py
│   │   │   ├── server_params.py
│   │   │   ├── scenarios
│   │   │   │   └── val.jsonl
│   │   │   └── README.md
│   │   │   ├── mcp_googlemaps
│   │   │   ├── __init__.py
│   │   │   ├── server_params.py
│   │   │   └── pyproject.toml
│   │   │   └── mcp_alphavantage
│   │   │   ├── __init__.py
│   │   │   ├── server_params.py
│   │   │   ├── README.md
│   │   │   └── scenarios
│   │   │   └── val.jsonl
│   ├── mcp_rl
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── pyproject.toml
│   └── all_experiments.py
├── just-the-facts
│   ├── README.md
│   ├── .gitignore
│   ├── just_the_facts
│   │   ├── __init__.py
│   │   ├── find_articles.py
│   │   └── experiments.py
│   ├── main.py
│   ├── test_scraper.py
│   └── pyproject.toml
├── roflbot
│   └── .gitignore
├── hn_title_generator
│   ├── skypilot.yaml
│   └── skypilot-reference-grpo-trainer.yaml
└── tic_tac_toe_self_play
│   ├── gather_trajectory_groups_by_index.py
│   └── deploy_step.py
├── docs
├── .gitignore
├── images
│   ├── forked-run.webp
│   ├── faq
│   │   └── art-loop.webp
│   ├── ruler-results.png
│   ├── site-assets
│   │   └── favicon.webp
│   └── open-deep-research-progress.png
├── package.json
├── analytics.js
├── README.md
├── style.css
├── getting-started
│   └── quick-start.mdx
├── resources
│   ├── glossary.mdx
│   └── models.mdx
├── experimental
│   └── gspo.mdx
├── fundamentals
│   └── training-loop.mdx
└── docs.json
├── assets
├── ART_logo.png
├── ART_pill.png
├── Discord.png
├── ART_E_pill.png
├── ART_header.png
├── Colab_pill.png
├── Train_pill.png
├── ART_E_graphs.png
├── Header_separator.png
├── Documentation_pill.png
└── benchmarks
│   └── codenames
│   └── win_rate_over_time.png
├── scripts
├── kill-gpu-processes.sh
├── publish.sh
├── setup.sh
├── launch-cluster.sh
├── migrate-s3-checkpoints.py
└── bump_version.py
├── .skyignore
├── .dockerignore
├── .gitignore
├── requirements
└── backend.vcs.txt
├── .env.example
├── .github
└── workflows
│   ├── ruff.yml
│   └── release.yml
├── AGENT.md
├── THIRD-PARTY-NOTICES
└── pyproject.toml

/CLAUDE.md:
--------------------------------------------------------------------------------
1 | AGENT.md
--------------------------------------------------------------------------------
/src/art/py.typed:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10
2 | 
--------------------------------------------------------------------------------
/dev/swebench/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/mcp-rl/README.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/art/unsloth/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/just-the-facts/README.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/art/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/art/torchtune/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/art/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dev/tau-bench/.python-version:
--------------------------------------------------------------------------------
1 | 3.11
2 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/registry/lib/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/registry/config.yaml:
--------------------------------------------------------------------------------
1 | tools: {}
--------------------------------------------------------------------------------
/dev/swebench/tools/review_on_submit_m/install.sh:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/just-the-facts/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/api/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/model/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | package-lock.json
--------------------------------------------------------------------------------
/examples/mcp-rl/servers/python/mcp_balldontlie/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/mcp-rl/servers/python/mcp_googlemaps/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/agents/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
--------------------------------------------------------------------------------
/examples/roflbot/.gitignore:
--------------------------------------------------------------------------------
1 | *.db
2 | /data/
3 | .env
4 | .venv/
5 | 
--------------------------------------------------------------------------------
/dev/tau-bench/.gitignore:
--------------------------------------------------------------------------------
1 | results/
2 | benchmark_results/
3 | error_analysis_results/
--------------------------------------------------------------------------------
/examples/just-the-facts/just_the_facts/__init__.py:
--------------------------------------------------------------------------------
1 | # Just the Facts package
2 | 
--------------------------------------------------------------------------------
/assets/ART_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_logo.png
--------------------------------------------------------------------------------
/assets/ART_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_pill.png
--------------------------------------------------------------------------------
/assets/Discord.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Discord.png
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/airline/rules.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | RULES = []
4 | 
--------------------------------------------------------------------------------
/assets/ART_E_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_E_pill.png
--------------------------------------------------------------------------------
/assets/ART_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_header.png
--------------------------------------------------------------------------------
/assets/Colab_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Colab_pill.png
--------------------------------------------------------------------------------
/assets/Train_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Train_pill.png
--------------------------------------------------------------------------------
/assets/ART_E_graphs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/ART_E_graphs.png
--------------------------------------------------------------------------------
/assets/Header_separator.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Header_separator.png
--------------------------------------------------------------------------------
/dev/tau-bench/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include tau_bench *.json
2 | recursive-include tau_bench *.md
3 | 
--------------------------------------------------------------------------------
/docs/images/forked-run.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/forked-run.webp
--------------------------------------------------------------------------------
/examples/mcp-rl/servers/python/mcp_alphavantage/__init__.py:
--------------------------------------------------------------------------------
1 | """MCP AlphaVantage Python Server"""
2 | 
--------------------------------------------------------------------------------
/src/art/local/__init__.py:
--------------------------------------------------------------------------------
1 | from .backend import LocalBackend
2 | 
3 | __all__ = ["LocalBackend"]
4 | 
--------------------------------------------------------------------------------
/assets/Documentation_pill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/Documentation_pill.png
--------------------------------------------------------------------------------
/docs/images/faq/art-loop.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/faq/art-loop.webp
--------------------------------------------------------------------------------
/docs/images/ruler-results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/ruler-results.png
--------------------------------------------------------------------------------
/src/art/skypilot/__init__.py:
--------------------------------------------------------------------------------
1 | from .backend import SkyPilotBackend
2 | 
3 | __all__ = ["SkyPilotBackend"]
4 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/api/types.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 | 
3 | PartialObj = dict[str, Any]
4 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/edit_anthropic/install.sh:
--------------------------------------------------------------------------------
1 | pip install 'tree-sitter==0.21.3'
2 | pip install 'tree-sitter-languages'
--------------------------------------------------------------------------------
/src/art/serverless/__init__.py:
--------------------------------------------------------------------------------
1 | from .backend import ServerlessBackend
2 | 
3 | __all__ = ["ServerlessBackend"]
4 | 
--------------------------------------------------------------------------------
/docs/images/site-assets/favicon.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/site-assets/favicon.webp
--------------------------------------------------------------------------------
/scripts/kill-gpu-processes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | nvidia-smi --query-compute-apps=pid --format=csv,noheader | xargs -r kill -9
--------------------------------------------------------------------------------
/src/art/rewards/__init__.py:
--------------------------------------------------------------------------------
1 | from .ruler import ruler, ruler_score_group
2 | 
3 | __all__ = ["ruler", "ruler_score_group"]
4 | 
--------------------------------------------------------------------------------
/docs/images/open-deep-research-progress.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/docs/images/open-deep-research-progress.png
--------------------------------------------------------------------------------
/src/art/langgraph/__init__.py:
--------------------------------------------------------------------------------
1 | from .llm_wrapper import init_chat_model, wrap_rollout
2 | 
3 | __all__ = ["wrap_rollout", "init_chat_model"]
4 | 
--------------------------------------------------------------------------------
/src/mp_actors/__init__.py:
--------------------------------------------------------------------------------
1 | from .move import close_proxy, move_to_child_process
2 | 
3 | __all__ = ["close_proxy", "move_to_child_process"]
4 | 
--------------------------------------------------------------------------------
/assets/benchmarks/codenames/win_rate_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/assets/benchmarks/codenames/win_rate_over_time.png
--------------------------------------------------------------------------------
/examples/just-the-facts/main.py:
--------------------------------------------------------------------------------
1 | def main():
2 |     print("Hello from just-the-facts!")
3 | 
4 | 
5 | if __name__ == "__main__":
6 |     main()
7 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | from tau_bench.agents.base import Agent as Agent
4 | from tau_bench.envs.base import Env as Env
5 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/retail/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | from tau_bench.envs.retail.env import MockRetailDomainEnv as MockRetailDomainEnv
4 | 
--------------------------------------------------------------------------------
/dev/swebench/sandbox/__init__.py:
--------------------------------------------------------------------------------
1 | from .new import new_sandbox
2 | from .sandbox import Provider, Sandbox
3 | 
4 | __all__ = ["new_sandbox", "Provider", "Sandbox"]
5 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/airline/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | from tau_bench.envs.airline.env import MockAirlineDomainEnv as MockAirlineDomainEnv
4 | 
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752190878.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752190878.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752193757.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752193757.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752196743.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752196743.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752199731.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752199731.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752202622.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752202622.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752205600.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752205600.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752208547.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752208547.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752211467.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752211467.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752214557.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752214557.png
--------------------------------------------------------------------------------
/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752217461.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenPipe/ART/HEAD/dev/tau-bench/packed_tensor_images/packed_tensors_plot_1752217461.png
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/func_tools/__init__.py:
--------------------------------------------------------------------------------
1 | from tau_bench.model_utils.func_tools.filter import filter as filter
2 | from tau_bench.model_utils.func_tools.map import map as map
3 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/api/_model_methods.py:
--------------------------------------------------------------------------------
1 | MODEL_METHODS = [
2 |     "classify",
3 |     "binary_classify",
4 |     "parse",
5 |     "generate",
6 |     "parse_force",
7 |     "score",
8 | ]
9 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/airline/wiki.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import os
4 | 
5 | FOLDER_PATH = os.path.dirname(__file__)
6 | 
7 | with open(os.path.join(FOLDER_PATH, "wiki.md"), "r") as f:
8 |     WIKI = f.read()
9 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/retail/wiki.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import os
4 | 
5 | FOLDER_PATH = os.path.dirname(__file__)
6 | 
7 | with open(os.path.join(FOLDER_PATH, "wiki.md"), "r") as f:
8 |     WIKI = f.read()
9 | 
--------------------------------------------------------------------------------
/dev/playwright_agent/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "playwright-agent"
3 | version = "0.1.0"
4 | requires-python = ">=3.10"
5 | dependencies = [
6 |     "mcp>=1.13.1",
7 |     "openpipe>=5.0.0",
8 |     "panza>=0.1.0",
9 | ]
10 | 
--------------------------------------------------------------------------------
/examples/mcp-rl/mcp_rl/__init__.py:
--------------------------------------------------------------------------------
1 | """ART MCP package."""
2 | 
3 | from .mcp_server import AlphaMcpServer, McpServer
4 | from .rollout import McpScenario, rollout
5 | 
6 | __all__ = ["rollout", "McpScenario", "McpServer", "AlphaMcpServer"]
7 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/registry/install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # script_dir=$(dirname "$(readlink -f "$0")")
4 | bundle_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
5 | 
6 | export PYTHONPATH="$bundle_dir/lib":$PYTHONPATH
--------------------------------------------------------------------------------
/dev/swebench/tools/review_on_submit_m/README.md:
--------------------------------------------------------------------------------
1 | # Review on submit
2 | 
3 | Provides an alternative to `submit` that does not immediately submit, but asks the
4 | agent to perform additional reviewing steps.
5 | 
6 | Only `submit -f` will trigger the real submit.
--------------------------------------------------------------------------------
/dev/tau-bench/check.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from langfuse import Langfuse
4 | 
5 | # Read credentials from the environment; API keys should never be hardcoded
6 | # in committed source.
7 | langfuse = Langfuse(
8 |     secret_key=os.environ["LANGFUSE_SECRET_KEY"],
9 |     public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
10 |     host="https://us.cloud.langfuse.com",
11 | )
--------------------------------------------------------------------------------
/dev/swebench/tools/review_on_submit_m/config.yaml:
--------------------------------------------------------------------------------
1 | tools:
2 |   submit:
3 |     signature: "submit"
4 |     docstring: "submits the current file"
5 |     # Do not actually show the -f argument to the model, only
6 |     # use it from the agent for submission after error
7 | 
--------------------------------------------------------------------------------
/src/art/utils/benchmarking/charts/__init__.py:
--------------------------------------------------------------------------------
1 | from .percentage_comparison_bar_chart import percentage_comparison_bar_chart
2 | from .training_progress_chart import training_progress_chart
3 | 
4 | __all__ = ["percentage_comparison_bar_chart", "training_progress_chart"]
5 | 
--------------------------------------------------------------------------------
/.skyignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .art/
3 | # .env
4 | .venv/
5 | grpo_trainer_lora_model/
6 | logs/
7 | shared_cache.db
8 | streaming-chat-completions/
9 | unsloth_compiled_cache/
10 | wandb/
11 | docs/node_modules/
12 | dist/
13 | dev/art-e/data/
14 | replays/
15 | trajectories/
16 | .DS_Store
17 | # .local/
--------------------------------------------------------------------------------
/src/art/torchtune/batch.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | 
3 | from .. import dev, types
4 | from ..preprocessing.pack import DiskPackedTensors
5 | 
6 | 
7 | class Batch(BaseModel):
8 |     disk_packed_tensors: DiskPackedTensors
9 |     config: types.TrainConfig
10 |     dev_config: dev.TrainConfig
11 | 
--------------------------------------------------------------------------------
/dev/swebench/tools/registry/bin/_write_env:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import sys
4 | 
5 | from registry import registry  # type: ignore
6 | 
7 | if __name__ == "__main__":
8 |     var_name = sys.argv[1]
9 |     var_value = sys.argv[2] if len(sys.argv) > 2 else ""
10 |     registry[var_name] = var_value
11 | 
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .art/
3 | # .env
4 | .venv/
5 | grpo_trainer_lora_model/
6 | logs/
7 | shared_cache.db
8 | streaming-chat-completions/
9 | unsloth_compiled_cache/
10 | wandb/
11 | docs/node_modules/
12 | dist/
13 | replays/
14 | trajectories/
15 | .DS_Store
16 | # .local/
17 | # .claude/
18 | .vscode/
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/tool.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from typing import Any
3 | 
4 | 
5 | class Tool(abc.ABC):
6 |     @staticmethod
7 |     def invoke(*args, **kwargs):
8 |         raise NotImplementedError
9 | 
10 |     @staticmethod
11 |     def get_info() -> dict[str, Any]:
12 |         raise NotImplementedError
13 | 
--------------------------------------------------------------------------------
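The `Tool` ABC above is the extension point for τ-bench environment tools. A minimal hypothetical subclass, sketched under the assumption that `get_info` returns an OpenAI-style function schema (the shape the concrete tools in `tau_bench/envs/*/tools/` use); the `Echo` tool itself is made up for illustration:

```python
from typing import Any

from tau_bench.envs.tool import Tool


class Echo(Tool):
    @staticmethod
    def invoke(text: str) -> str:
        # A real tool would act on environment data; this one just echoes.
        return text

    @staticmethod
    def get_info() -> dict[str, Any]:
        return {
            "type": "function",
            "function": {
                "name": "echo",
                "description": "Return the input text unchanged.",
                "parameters": {
                    "type": "object",
                    "properties": {"text": {"type": "string"}},
                    "required": ["text"],
                },
            },
        }


print(Echo.invoke(text="hello"))  # hello
```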
/dev/swebench/tools/registry/bin/_read_env:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import sys
4 | 
5 | from registry import registry  # type: ignore
6 | 
7 | if __name__ == "__main__":
8 |     var_name = sys.argv[1]
9 |     default_value = sys.argv[2] if len(sys.argv) > 2 else ""
10 |     print(registry.get(var_name, default_value))
11 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .art/
3 | .env
4 | .venv/
5 | grpo_trainer_lora_model/
6 | logs/
7 | shared_cache.db
8 | data/cache.db
9 | streaming-chat-completions/
10 | unsloth_compiled_cache/
11 | wandb/
12 | docs/node_modules/
13 | dist/
14 | replays/
15 | trajectories/
16 | .DS_Store
17 | .local/
18 | .claude/
19 | .vscode/
20 | .ruff_cache/
21 | !/src/art/wandb/
22 | !/src/art/wandb/**
23 | /src/art/wandb/__pycache__/
--------------------------------------------------------------------------------
/dev/swebench/sandboxes.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | 
3 | import modal
4 | 
5 | 
6 | async def terminate_sandboxes() -> None:
7 |     sandboxes: list[modal.Sandbox] = []
8 |     async for sandbox in modal.Sandbox.list.aio(
9 |         app_id=modal.App.lookup("swe-rex", create_if_missing=True).app_id
10 |     ):
11 |         sandboxes.append(sandbox)
12 |     _ = await asyncio.gather(*[sandbox.terminate.aio() for sandbox in sandboxes])
13 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/agents/base.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import abc
4 | from typing import Optional
5 | 
6 | from tau_bench.envs.base import Env
7 | from tau_bench.types import SolveResult
8 | 
9 | 
10 | class Agent(abc.ABC):
11 |     @abc.abstractmethod
12 |     async def solve(
13 |         self, env: Env, task_index: Optional[int] = None, max_num_steps: int = 30
14 |     ) -> SolveResult:
15 |         raise NotImplementedError
16 | 
--------------------------------------------------------------------------------
/src/art/utils/benchmarking/types.py:
--------------------------------------------------------------------------------
1 | class BenchmarkModelKey:
2 |     name: str
3 |     display_name: str
4 |     split: str
5 | 
6 |     def __init__(
7 |         self, name: str, display_name: str | None = None, split: str | None = None
8 |     ):
9 |         self.name = name
10 |         self.display_name = display_name or name
11 |         self.split = split or "val"
12 | 
13 |     def __str__(self):
14 |         return self.display_name
15 | 
--------------------------------------------------------------------------------
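A quick sketch of how `BenchmarkModelKey` resolves its defaults, grounded directly in the constructor above (the model name is made up):

```python
from art.utils.benchmarking.types import BenchmarkModelKey

key = BenchmarkModelKey("agent-001")
print(key.display_name, key.split)  # agent-001 val — both fall back to defaults

labeled = BenchmarkModelKey("agent-001", display_name="Agent v1", split="train")
print(str(labeled))  # Agent v1 — __str__ returns the display name
```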
/dev/tau-bench/tau_bench/model_utils/args.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from tau_bench.model_utils.model.model import Platform
4 | 
5 | 
6 | def api_parser() -> argparse.ArgumentParser:
7 |     parser = argparse.ArgumentParser()
8 |     parser.add_argument("--model", type=str)
9 |     parser.add_argument("--base-url", type=str)
10 |     parser.add_argument(
11 |         "--platform", type=str, required=True, choices=[e.value for e in Platform]
12 |     )
13 |     return parser
14 | 
--------------------------------------------------------------------------------
/docs/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "docs",
3 |   "version": "1.0.0",
4 |   "description": "",
5 |   "main": "index.js",
6 |   "scripts": {
7 |     "dev": "mintlify dev --port 3001",
8 |     "build": "mintlify build",
9 |     "generate:routes": "npx @mintlify/scraping@latest openapi-file ./openapi.json --outDir ./api-reference"
10 |   },
11 |   "keywords": [],
12 |   "author": "",
13 |   "license": "ISC",
14 |   "dependencies": {
15 |     "mintlify": "^4.0.433"
16 |   }
17 | }
18 | 
--------------------------------------------------------------------------------
/src/art/mcp/default_tools.py:
--------------------------------------------------------------------------------
1 | from art.mcp.types import MCPTool
2 | 
3 | complete_task_tool = MCPTool(
4 |     name="complete_task",
5 |     description="Complete a task",
6 |     parameters={
7 |         "type": "object",
8 |         "properties": {
9 |             "summary": {
10 |                 "type": "string",
11 |                 "description": "Summary of accomplishments",
12 |             }
13 |         },
14 |         "required": ["summary"],
15 |     },
16 | )
17 | 
--------------------------------------------------------------------------------
/src/art/utils/old_benchmarking/display_image_grid.py:
--------------------------------------------------------------------------------
1 | from IPython.display import HTML, display
2 | 
3 | 
4 | def display_image_grid(image_paths: list[str], images_per_row: int = 2):
5 |     # NOTE: the original HTML markup was stripped during extraction; the grid
6 |     # markup below is a plausible reconstruction.
7 |     html = f"""
8 |     <div style="display: grid; grid-template-columns: repeat({images_per_row}, 1fr); gap: 10px;">
9 |     """
10 |     for path in image_paths:
11 |         html += f'<img src="{path}" style="width: 100%;" />'
12 |     html += "</div>"
13 |     display(HTML(html))
14 | 
--------------------------------------------------------------------------------
" 11 | display(HTML(html)) 12 | -------------------------------------------------------------------------------- /examples/mcp-rl/servers/python/mcp_balldontlie/server_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from mcp import StdioServerParameters 5 | 6 | load_dotenv() 7 | 8 | server_params = StdioServerParameters( 9 | command="python", 10 | args=[ 11 | "servers/python/mcp_balldontlie/server.py", 12 | "--api-key", 13 | os.getenv("BALLDONTLIE_API_KEY", ""), 14 | ], 15 | env={"BALLDONTLIE_API_KEY": os.getenv("BALLDONTLIE_API_KEY")}, 16 | ) 17 | -------------------------------------------------------------------------------- /examples/mcp-rl/servers/python/mcp_googlemaps/server_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from mcp import StdioServerParameters 5 | 6 | load_dotenv() 7 | 8 | server_params = StdioServerParameters( 9 | command="python", 10 | args=[ 11 | "servers/python/mcp_googlemaps/server.py", 12 | "--api-key", 13 | os.getenv("GOOGLE_MAPS_API_KEY", ""), 14 | ], 15 | env={"GOOGLE_MAPS_API_KEY": os.getenv("GOOGLE_MAPS_API_KEY")}, 16 | ) 17 | -------------------------------------------------------------------------------- /examples/mcp-rl/servers/python/mcp_alphavantage/server_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from mcp import StdioServerParameters 5 | 6 | load_dotenv() 7 | 8 | server_params = StdioServerParameters( 9 | command="python", 10 | args=[ 11 | "servers/python/mcp_alphavantage/server.py", 12 | "--api-key", 13 | os.getenv("ALPHAVANTAGE_API_KEY", "demo"), 14 | ], 15 | env={"ALPHAVANTAGE_API_KEY": os.getenv("ALPHAVANTAGE_API_KEY")}, 16 | ) 17 | -------------------------------------------------------------------------------- /dev/swebench/tools/edit_anthropic/bin/_state_anthropic: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import os 5 | from pathlib import Path 6 | 7 | 8 | def main(): 9 | state_path = Path("/root/state.json") 10 | if state_path.exists(): 11 | state = json.loads(state_path.read_text()) 12 | else: 13 | state = {} 14 | 15 | state["working_dir"] = os.getcwd() 16 | 17 | state_path.write_text(json.dumps(state)) 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /scripts/publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Load the .env file 5 | set -o allexport 6 | source .env 7 | 8 | # Check if PYPI_ART_TOKEN is set 9 | if [[ -z "${PYPI_ART_TOKEN}" ]]; then 10 | echo "Error: PYPI_ART_TOKEN is not set." 11 | exit 1 12 | fi 13 | 14 | # Delete the dist directory 15 | rm -rf dist 16 | 17 | # Build the package 18 | uv run hatch build 19 | 20 | 21 | # If the token is set, proceed with publishing 22 | uv publish --username=__token__ --password=$PYPI_ART_TOKEN 23 | -------------------------------------------------------------------------------- /requirements/backend.vcs.txt: -------------------------------------------------------------------------------- 1 | # Pinned backend dependencies that must come from VCS (not allowed in PyPI metadata). 
/dev/swebench/tools/edit_anthropic/bin/_state_anthropic:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import json
4 | import os
5 | from pathlib import Path
6 | 
7 | 
8 | def main():
9 |     state_path = Path("/root/state.json")
10 |     if state_path.exists():
11 |         state = json.loads(state_path.read_text())
12 |     else:
13 |         state = {}
14 | 
15 |     state["working_dir"] = os.getcwd()
16 | 
17 |     state_path.write_text(json.dumps(state))
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     main()
22 | 
--------------------------------------------------------------------------------
/scripts/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | 
4 | # Load the .env file
5 | set -o allexport
6 | source .env
7 | 
8 | # Check if PYPI_ART_TOKEN is set
9 | if [[ -z "${PYPI_ART_TOKEN}" ]]; then
10 |   echo "Error: PYPI_ART_TOKEN is not set."
11 |   exit 1
12 | fi
13 | 
14 | # Delete the dist directory
15 | rm -rf dist
16 | 
17 | # Build the package
18 | uv run hatch build
19 | 
20 | 
21 | # If the token is set, proceed with publishing
22 | uv publish --username=__token__ --password=$PYPI_ART_TOKEN
23 | 
--------------------------------------------------------------------------------
/requirements/backend.vcs.txt:
--------------------------------------------------------------------------------
1 | # Pinned backend dependencies that must come from VCS (not allowed in PyPI metadata).
2 | # Install with:
3 | #   uv pip install -r requirements/backend.vcs.txt
4 | # or
5 | #   pip install -r requirements/backend.vcs.txt
6 | 
7 | # Torchtune pinned to known-good commit
8 | torchtune @ git+https://github.com/pytorch/torchtune.git@2344509cf83bd886538fe3e8263e5145d1afb5c2
9 | 
10 | # Unsloth Zoo pinned to known-good commit
11 | unsloth-zoo @ git+https://github.com/bradhilton/unsloth-zoo@323cf5e
12 | 
--------------------------------------------------------------------------------
/src/art/mcp/__init__.py:
--------------------------------------------------------------------------------
1 | """MCP utilities for Agent Reinforcement Training."""
2 | 
3 | from .default_tools import complete_task_tool
4 | from .generate_scenarios import generate_scenarios
5 | from .types import (
6 |     GeneratedScenario,
7 |     GeneratedScenarioCollection,
8 |     MCPResource,
9 |     MCPTool,
10 | )
11 | 
12 | __all__ = [
13 |     "MCPResource",
14 |     "MCPTool",
15 |     "GeneratedScenario",
16 |     "GeneratedScenarioCollection",
17 |     "complete_task_tool",
18 |     "generate_scenarios",
19 | ]
20 | 
--------------------------------------------------------------------------------
/src/art/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Import all utilities to maintain the same interface
2 | from .format_message import format_message
3 | from .get_model_step import get_model_step
4 | from .iterate_dataset import iterate_dataset
5 | from .limit_concurrency import limit_concurrency
6 | from .log_http_errors import log_http_errors
7 | from .retry import retry
8 | 
9 | __all__ = [
10 |     "format_message",
11 |     "retry",
12 |     "iterate_dataset",
13 |     "limit_concurrency",
14 |     "log_http_errors",
15 |     "get_model_step",
16 | ]
17 | 
--------------------------------------------------------------------------------
/src/art/utils/get_repo_root_path.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
4 | def get_repo_root_path() -> str:
5 |     try:
6 |         # search through parent directories until we find a .git directory
7 |         current_dir = os.path.dirname(os.path.abspath(__file__))
8 |         while not os.path.exists(os.path.join(current_dir, ".git")):
9 |             if current_dir == "/":
10 |                 raise Exception("Could not find .git directory")
11 |             current_dir = os.path.dirname(current_dir)
12 |         return current_dir
13 |     except Exception:
14 |         return "."
15 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/func_tools/filter.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Iterable, TypeVar
2 | 
3 | from tau_bench.model_utils.func_tools.map import map
4 | 
5 | T = TypeVar("T")
6 | 
7 | builtin_filter = filter
8 | 
9 | 
10 | def filter(
11 |     func: Callable[[T], bool],
12 |     iterable: Iterable[T],
13 |     max_concurrency: int | None = None,
14 | ) -> Iterable[T]:
15 |     assert max_concurrency is None or max_concurrency > 0
16 |     # Materialize the iterable first: it is consumed twice below (once by
17 |     # `map`, once by `zip`), which would silently drop items if a one-shot
18 |     # generator were passed in.
19 |     items = list(iterable)
20 |     bits = map(func, iterable=items, max_concurrency=max_concurrency)
21 |     return [x for x, y in zip(items, bits) if y]
22 | 
--------------------------------------------------------------------------------
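A usage sketch for the concurrent `filter` above; the predicate runs on a thread pool, so it mainly pays off for I/O-bound checks (the URLs here are made up):

```python
from tau_bench.model_utils.func_tools.filter import filter as concurrent_filter

urls = [f"https://example.com/item/{i}" for i in range(6)]
# Run the predicate across threads, at most 3 checks in flight at once.
even_items = concurrent_filter(
    lambda url: int(url.rsplit("/", 1)[1]) % 2 == 0,
    urls,
    max_concurrency=3,
)
print(even_items)  # items 0, 2, and 4
```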
/src/art/utils/deployment/legacy.py:
--------------------------------------------------------------------------------
1 | """Legacy exports for backwards compatibility."""
2 | 
3 | from enum import Enum
4 | 
5 | from pydantic import BaseModel
6 | 
7 | from .together import TogetherJobStatus
8 | 
9 | 
10 | class LoRADeploymentProvider(str, Enum):
11 |     """Legacy enum for deployment providers."""
12 | 
13 |     TOGETHER = "together"
14 |     WANDB = "wandb"
15 | 
16 | 
17 | class LoRADeploymentJob(BaseModel):
18 |     """Legacy result class for deployment jobs."""
19 | 
20 |     status: TogetherJobStatus
21 |     job_id: str
22 |     model_name: str
23 |     failure_reason: str | None
24 | 
--------------------------------------------------------------------------------
/examples/mcp-rl/servers/python/mcp_googlemaps/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "mcp-googlemaps"
3 | version = "0.1.0"
4 | description = "Google Maps MCP Server - Provides access to Google Maps APIs including Geocoding and Places"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 |     "aiohttp>=3.9.0",
9 |     "click>=8.1.0",
10 |     "mcp>=1.0.0",
11 |     "python-dotenv>=1.0.0",
12 |     "tenacity>=8.0.0",
13 | ]
14 | 
15 | [project.scripts]
16 | mcp-googlemaps = "mcp_googlemaps.server:main"
17 | 
18 | [build-system]
19 | requires = ["hatchling"]
20 | build-backend = "hatchling.build"
--------------------------------------------------------------------------------
/src/art/dev/__init__.py:
--------------------------------------------------------------------------------
1 | from .engine import EngineArgs
2 | from .model import (
3 |     InitArgs,
4 |     InternalModelConfig,
5 |     PeftArgs,
6 |     TrainerArgs,
7 | )
8 | from .openai_server import OpenAIServerConfig, ServerArgs, get_openai_server_config
9 | from .torchtune import TorchtuneArgs
10 | from .train import TrainConfig
11 | 
12 | __all__ = [
13 |     "EngineArgs",
14 |     "InternalModelConfig",
15 |     "InitArgs",
16 |     "PeftArgs",
17 |     "TrainerArgs",
18 |     "get_openai_server_config",
19 |     "OpenAIServerConfig",
20 |     "ServerArgs",
21 |     "TorchtuneArgs",
22 |     "TrainConfig",
23 | ]
24 | 
--------------------------------------------------------------------------------
/dev/swebench/run.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from concurrent.futures import ThreadPoolExecutor
3 | from functools import partial
4 | from typing import Callable, ParamSpec, TypeVar
5 | 
6 | executor = ThreadPoolExecutor(max_workers=1024)
7 | 
8 | P = ParamSpec("P")
9 | R = TypeVar("R")
10 | 
11 | 
12 | async def run(
13 |     func: Callable[P, R],
14 |     in_thread: bool,
15 |     *args: P.args,
16 |     **kwargs: P.kwargs,
17 | ) -> R:
18 |     if in_thread:
19 |         return await asyncio.get_running_loop().run_in_executor(
20 |             executor, partial(func, *args, **kwargs)
21 |         )
22 |     return func(*args, **kwargs)
23 | 
--------------------------------------------------------------------------------
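`run` either calls `func` inline or offloads it to the module-level thread pool. A small sketch, with a made-up blocking function standing in for the real sandbox work:

```python
import asyncio
import time

from run import run  # i.e. dev/swebench/run.py


def blocking_io(path: str) -> int:
    time.sleep(0.5)  # stand-in for a slow, blocking call
    return len(path)


async def main():
    # in_thread=True keeps the event loop free while blocking_io sleeps;
    # in_thread=False would run it inline and block the loop.
    lengths = await asyncio.gather(
        *(run(blocking_io, True, f"/tmp/file_{i}") for i in range(4))
    )
    print(lengths)


asyncio.run(main())
```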
/dev/tau-bench/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | from setuptools import find_packages, setup
4 | 
5 | setup(
6 |     name="tau_bench",
7 |     version="0.1.0",
8 |     description="The Tau-Bench package",
9 |     long_description=open("README.md").read(),
10 |     packages=find_packages(),
11 |     include_package_data=True,
12 |     install_requires=[
13 |         "openai>=1.13.3",
14 |         "mistralai>=0.4.0",
15 |         "anthropic>=0.26.1",
16 |         "google-generativeai>=0.5.4",
17 |         "tenacity>=8.3.0",
18 |         "termcolor>=2.4.0",
19 |         "numpy>=1.26.4",
20 |         "litellm>=1.41.0",
21 |     ],
22 | )
23 | 
--------------------------------------------------------------------------------
/dev/test_skypilot/launch.py:
--------------------------------------------------------------------------------
1 | """Smoke test: initialize a SkyPilot cluster backend."""
2 | 
3 | import asyncio
4 | 
5 | from dotenv import load_dotenv
6 | 
7 | from art.skypilot.backend import SkyPilotBackend
8 | 
9 | load_dotenv()
10 | 
11 | 
12 | async def launch():
13 |     backend = await SkyPilotBackend().initialize_cluster(
14 |         cluster_name="test-skypilot",
15 |         gpu="H100-SXM",
16 |         env_path=".env",
17 |         force_restart=True,
18 |     )
19 | 
20 |     print("successfully initialized skypilot server")
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     asyncio.run(launch())
25 | 
--------------------------------------------------------------------------------
/dev/swebench/sandbox/daytona.py:
--------------------------------------------------------------------------------
1 | import daytona_sdk
2 | 
3 | from .sandbox import Provider, Sandbox
4 | 
5 | 
6 | class DaytonaSandbox(Sandbox):
7 |     """
8 |     Daytona sandbox.
9 | 
10 |     Wraps a Daytona sandbox with the shared Sandbox interface.
11 |     """
12 | 
13 |     provider: Provider = "daytona"
14 | 
15 |     def __init__(self, sandbox: daytona_sdk.AsyncSandbox) -> None:
16 |         self._sandbox = sandbox
17 | 
18 |     async def exec(self, command: str, timeout: int) -> tuple[int, str]:
19 |         result = await self._sandbox.process.exec(command, timeout=timeout)
20 |         return int(result.exit_code), result.result
21 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/model/exception.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Generic, TypeVar
3 | 
4 | T = TypeVar("T")
5 | 
6 | 
7 | class ModelError(Exception):
8 |     def __init__(
9 |         self,
10 |         short_message: str,
11 |         prompt: str | list[dict[str, str]] | None = None,
12 |         response: str | None = None,
13 |     ) -> None:
14 |         super().__init__(short_message)
15 |         self.short_message = short_message
16 |         self.prompt = prompt
17 |         self.response = response
18 | 
19 | 
20 | @dataclass
21 | class Result(Generic[T]):
22 |     value: T | None
23 |     error: ModelError | None
24 | 
--------------------------------------------------------------------------------
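`Result` pairs a value with an optional `ModelError` instead of raising. A usage sketch with a hypothetical parser (the function is made up; the `Result`/`ModelError` shapes come straight from the definitions above):

```python
from tau_bench.model_utils.model.exception import ModelError, Result


def parse_int(raw: str) -> Result[int]:
    try:
        return Result(value=int(raw), error=None)
    except ValueError:
        return Result(
            value=None,
            error=ModelError(short_message=f"not an int: {raw!r}", response=raw),
        )


print(parse_int("7").value)                 # 7
print(parse_int("x").error.short_message)   # not an int: 'x'
```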
/examples/hn_title_generator/skypilot.yaml:
--------------------------------------------------------------------------------
1 | # To launch, run the following command from the root directory of the art repository:
2 | # `uv run sky launch examples/hn_title_generator/skypilot.yaml --cluster=kyle-hn-title-generator-001 --env-file=.env --yes --retry-until-up --down --idle-minutes-to-autostop 10`
3 | 
4 | workdir: .
5 | resources:
6 |   accelerators: ["H100-SXM:1"]
7 | envs:
8 |   HF_HUB_ENABLE_HF_TRANSFER: 1
9 | 
10 | setup: |
11 |   curl -LsSf https://astral.sh/uv/install.sh | sh
12 | 
13 |   source $HOME/.local/bin/env
14 | 
15 |   uv sync
16 | 
17 | run: |
18 |   echo "Running training script..."
19 |   uv run python examples/hn_title_generator/train.py
20 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/retail/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import json
4 | import os
5 | from typing import Any
6 | 
7 | FOLDER_PATH = os.path.dirname(__file__)
8 | 
9 | 
10 | def load_data() -> dict[str, Any]:
11 |     with open(os.path.join(FOLDER_PATH, "orders.json")) as f:
12 |         order_data = json.load(f)
13 |     with open(os.path.join(FOLDER_PATH, "products.json")) as f:
14 |         product_data = json.load(f)
15 |     with open(os.path.join(FOLDER_PATH, "users.json")) as f:
16 |         user_data = json.load(f)
17 |     return {
18 |         "orders": order_data,
19 |         "products": product_data,
20 |         "users": user_data,
21 |     }
22 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/envs/airline/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Sierra
2 | 
3 | import json
4 | import os
5 | from typing import Any
6 | 
7 | FOLDER_PATH = os.path.dirname(__file__)
8 | 
9 | 
10 | def load_data() -> dict[str, Any]:
11 |     with open(os.path.join(FOLDER_PATH, "flights.json")) as f:
12 |         flight_data = json.load(f)
13 |     with open(os.path.join(FOLDER_PATH, "reservations.json")) as f:
14 |         reservation_data = json.load(f)
15 |     with open(os.path.join(FOLDER_PATH, "users.json")) as f:
16 |         user_data = json.load(f)
17 |     return {
18 |         "flights": flight_data,
19 |         "reservations": reservation_data,
20 |         "users": user_data,
21 |     }
22 | 
--------------------------------------------------------------------------------
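Both domain packages expose the same `load_data` shape: one dict of JSON tables keyed by entity name. A consumption sketch, assuming the JSON data files are present on disk:

```python
from tau_bench.envs.airline.data import load_data

data = load_data()
print(sorted(data))          # ['flights', 'reservations', 'users']
print(len(data["flights"]))  # number of flight records in flights.json
```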
/dev/swebench/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "openpipe-art-swebench"
3 | version = "0.1.0"
4 | requires-python = ">=3.10"
5 | dependencies = [
6 |     "aiolimiter>=1.2.1",
7 |     "daytona-sdk>=0.21.5",
8 |     "langfuse>=2.60.7",
9 |     "modal>=1.0.1",
10 |     "openpipe-art",
11 |     "sweagent",
12 |     "swebench>=4.0.3",
13 | ]
14 | 
15 | [tool.uv.sources]
16 | openpipe-art = { path = "../../", editable = true }
17 | sweagent = { git = "https://github.com/bradhilton/SWE-agent" }
18 | 
19 | [dependency-groups]
20 | dev = [
21 |     "ipykernel>=6.29.5",
22 |     "ipywidgets>=8.1.7",
23 |     "pytest>=8.4.1",
24 |     "pytest-asyncio>=1.0.0",
25 |     "pytest-timeout>=2.4.0",
26 |     "pytest-xdist>=3.8.0",
27 | ]
28 | 
--------------------------------------------------------------------------------
/dev/tau-bench/tau_bench/model_utils/func_tools/map.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ThreadPoolExecutor
2 | from typing import Callable, Iterable, TypeVar
3 | 
4 | T = TypeVar("T")
5 | U = TypeVar("U")
6 | 
7 | 
8 | def map(
9 |     func: Callable[[T], U],
10 |     iterable: Iterable[T],
11 |     max_concurrency: int | None = None,
12 |     use_tqdm: bool = False,
13 | ) -> Iterable[U]:
14 |     assert max_concurrency is None or max_concurrency > 0
15 |     with ThreadPoolExecutor(max_workers=max_concurrency) as executor:
16 |         if use_tqdm:
17 |             from tqdm import tqdm
18 | 
19 |             return list(tqdm(executor.map(func, iterable), total=len(iterable)))
20 |         return executor.map(func, iterable)
21 | 
--------------------------------------------------------------------------------
/dev/swebench/sandbox/modal.py:
--------------------------------------------------------------------------------
1 | import modal
2 | 
3 | from .sandbox import Provider, Sandbox
4 | 
5 | 
6 | class ModalSandbox(Sandbox):
7 |     """
8 |     Modal sandbox.
9 | 
10 |     Wraps a Modal sandbox with the shared Sandbox interface.
11 |     """
12 | 
13 |     provider: Provider = "modal"
14 | 
15 |     def __init__(self, sandbox: modal.Sandbox) -> None:
16 |         self._sandbox = sandbox
17 | 
18 |     async def exec(self, command: str, timeout: int) -> tuple[int, str]:
19 |         process = await self._sandbox.exec.aio(
20 |             "/bin/sh", "-c", command, timeout=timeout
21 |         )
22 |         exit_code = await process.wait.aio()
23 |         stdout = await process.stdout.read.aio()
24 |         return exit_code, stdout
25 | 
--------------------------------------------------------------------------------
/src/art/types.py:
--------------------------------------------------------------------------------
1 | from typing import Annotated, Literal
2 | 
3 | import pydantic
4 | from openai.types.chat.chat_completion import Choice
5 | from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
6 | from openai.types.chat.chat_completion_tool_param import ChatCompletionToolParam
7 | from pydantic import SkipValidation
8 | 
9 | Message = Annotated[ChatCompletionMessageParam, SkipValidation]
10 | MessageOrChoice = Message | Choice
11 | Messages = list[Message]
12 | MessagesAndChoices = list[MessageOrChoice]
13 | Tools = list[ChatCompletionToolParam]
14 | 
15 | 
16 | class TrainConfig(pydantic.BaseModel):
17 |     learning_rate: float = 5e-6
18 |     beta: float = 0.0
19 | 
20 | 
21 | Verbosity = Literal[0, 1, 2]
22 | 
--------------------------------------------------------------------------------
/src/art/utils/format_message.py:
--------------------------------------------------------------------------------
1 | from ..types import Message
2 | 
3 | 
4 | def format_message(message: Message) -> str:
5 |     """Format a message into a readable string."""
6 |     # Format the role and content
7 |     role = message["role"].capitalize()
8 |     content = message.get("content", message.get("refusal", "")) or ""
9 | 
10 |     # Format any tool calls
11 |     tool_calls_text = "\n" if content else ""
12 |     tool_calls_text += "\n".join(
13 |         f"{tool_call['function']['name']}({tool_call['function']['arguments']})"
14 |         for tool_call in message.get("tool_calls") or []
15 |     )
16 | 
17 |     # Combine all parts
18 |     formatted_message = f"{role}:\n{content}{tool_calls_text}"
19 |     return formatted_message
20 | 
--------------------------------------------------------------------------------
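Given the implementation above, `format_message` renders the role, the content (or refusal), and one line per tool call. For example (the message is made up):

```python
from art.utils import format_message

message = {
    "role": "assistant",
    "content": "Checking the weather now.",
    "tool_calls": [
        {"function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}}
    ],
}
print(format_message(message))
# Assistant:
# Checking the weather now.
# get_weather({"city": "Paris"})
```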
/examples/mcp-rl/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "art-mcp"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 |     "aiohttp>=3.12.14",
9 |     "asyncio>=3.4.3",
10 |     "click>=8.1.8",
11 |     "mcp>=1.11.0",
12 |     "openai>=1.74.0",
13 |     "openpipe-art[skypilot]",
14 |     "python-dotenv>=1.1.1",
15 |     "tenacity>=9.1.2",
16 |     "weave>=0.51.56",
17 | ]
18 | 
19 | 
20 | [tool.uv.sources]
21 | openpipe-art = { path = "../../", editable = true }
22 | 
23 | [dependency-groups]
24 | dev = [
25 |     "polars>=1.31.0",
26 |     "ipywidgets>=8.1.6",
27 |     "ipykernel>=6.29.5",
28 |     "matplotlib>=3.10.3",
29 |     "seaborn>=0.13.2",
30 | ]
31 | 
--------------------------------------------------------------------------------
/examples/just-the-facts/just_the_facts/find_articles.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | import feedparser
4 | 
5 | feeds = {
6 |     "NBC News Top Stories": "http://feeds.nbcnews.com/feeds/topstories",
7 |     "BBC News Top Stories": "https://feeds.bbci.co.uk/news/rss.xml",
8 |     "CBS News Top Stories": "http://www.cbsnews.com/latest/rss/main",
9 |     "Fox News Latest": "http://feeds.foxnews.com/foxnews/latest",
10 | }
11 | 
12 | all_urls = []
13 | 
14 | for name, url in feeds.items():
15 |     print(f"\n=== {name} ===")
16 |     feed = feedparser.parse(url)
17 | 
18 |     for entry in feed.entries[:25]:
19 |         print(entry.link)
20 |         all_urls.append(entry.link)
21 | 
22 | 
23 | # shuffle
24 | random.shuffle(all_urls)
25 | 
26 | print(all_urls)
27 | 
--------------------------------------------------------------------------------
/src/art/utils/limit_concurrency.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from functools import wraps
3 | from typing import Callable, Optional
4 | 
5 | 
6 | def limit_concurrency(n: int, derive_key: Optional[Callable[..., str]] = None):
7 |     semaphores = {}
8 | 
9 |     def decorator(func):
10 |         @wraps(func)
11 |         async def wrapper(*args, **kwargs):
12 |             if derive_key:
13 |                 key = derive_key(*args, **kwargs)
14 |             else:
15 |                 key = "default"
16 | 
17 |             if key not in semaphores:
18 |                 semaphores[key] = asyncio.Semaphore(n)
19 | 
20 |             async with semaphores[key]:
21 |                 return await func(*args, **kwargs)
22 | 
23 |         return wrapper
24 | 
25 |     return decorator
26 | 
--------------------------------------------------------------------------------
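`limit_concurrency` keeps one semaphore per key, so `derive_key` lets callers bound concurrency per resource rather than globally. A sketch; the host-based key and the URLs are illustrative:

```python
import asyncio

from art.utils import limit_concurrency


@limit_concurrency(2, derive_key=lambda url: url.split("/")[2])
async def fetch(url: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for a real HTTP request
    return url


async def main():
    urls = [f"https://example.com/item/{i}" for i in range(10)]
    # At most two in-flight fetches per host at any moment.
    print(await asyncio.gather(*(fetch(url) for url in urls)))


asyncio.run(main())
```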
torchtune; import os; print(os.path.dirname(torchtune.__file__))") 5 | uv run $TORCHTUNE_DIR/_cli/tune.py run \ 6 | --nproc-per-node 8 \ 7 | src/art/torchtune/recipe.py \ 8 | --config ./src/art/torchtune/config.yaml \ 9 | tokenizer.path=$MODEL_DIR/vocab.json \ 10 | tokenizer.merges_file=$MODEL_DIR/merges.txt \ 11 | checkpointer.checkpoint_dir=$MODEL_DIR \ 12 | checkpointer.checkpoint_files="[$(ls $MODEL_DIR/*.safetensors | xargs -n1 basename | sed 's/^/"/;s/$/",/' | tr '\n' ' ' | sed 's/, $//' )]" \ 13 | model._component_=torchtune.models.qwen3.qwen3_32b \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /src/art/utils/log_http_errors.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | import httpx 4 | 5 | 6 | def log_http_errors(func): 7 | @wraps(func) 8 | async def wrapper(*args, **kwargs): 9 | try: 10 | return await func(*args, **kwargs) 11 | except httpx.HTTPStatusError as e: 12 | # raise a new exception with the status code, url, and "detail" key if it exists 13 | try: 14 | detail = e.response.json().get("detail", None) 15 | except Exception: 16 | # if we can't parse the response as json, just raise the original exception 17 | raise e 18 | raise Exception( 19 | f"[HTTP {e.response.status_code}] {e.request.url} {detail}" 20 | ) from e 21 | 22 | return wrapper 23 | -------------------------------------------------------------------------------- /dev/tau-bench/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "tau-bench" 3 | version = "0.1.0" 4 | requires-python = ">=3.11" 5 | dependencies = [ 6 | "google-generativeai>=0.8.5", 7 | "langfuse>=2.60.8", 8 | "litellm>=1.72.6.post2", 9 | "mistralai>=1.8.2", 10 | "openpipe>=4.50.0", 11 | "openpipe-art", 12 | "skypilot-nightly[runpod,hyperbolic]==1.0.0.dev20250717", 13 | "tenacity>=9.1.2", 14 | "termcolor>=3.1.0", 15 | "openai>=1.74.0", 16 | "anthropic>=0.49.0", 17 | "accelerate==1.7.0", 18 | "vllm==0.9.1; sys_platform == 'linux'" 19 | ] 20 | 21 | [tool.uv] 22 | override-dependencies = ["vllm; sys_platform == 'linux'"] 23 | 24 | [tool.uv.sources] 25 | openpipe-art = { path = "../../", editable = true } 26 | 27 | [dependency-groups] 28 | dev = [ 29 | "ipykernel>=6.29.5", 30 | "ipywidgets>=8.1.7", 31 | ] 32 | -------------------------------------------------------------------------------- /examples/mcp-rl/mcp_rl/utils.py: -------------------------------------------------------------------------------- 1 | from mcp import types 2 | 3 | 4 | def get_content_text(result: types.CallToolResult) -> str: 5 | # Extract text content from MCP result 6 | if hasattr(result, "content") and result.content: 7 | if isinstance(result.content, list): 8 | # Handle list of content items 9 | content_text = "" 10 | for item in result.content: 11 | if isinstance(item, types.TextContent): 12 | content_text += item.text 13 | else: 14 | content_text += str(item) 15 | elif isinstance(result.content[0], types.TextContent): 16 | content_text = result.content[0].text 17 | else: 18 | content_text = str(result.content) 19 | else: 20 | content_text = str(result) 21 | 22 | return content_text 23 | -------------------------------------------------------------------------------- /src/art/utils/deployment/__init__.py: -------------------------------------------------------------------------------- 1 | """Deployment utilities for deploying trained models to inference endpoints.""" 2 | 3 | from 
.common import ( 4 | DeploymentConfig, 5 | DeploymentResult, 6 | Provider, 7 | deploy_model, 8 | ) 9 | 10 | # Legacy exports for backwards compatibility 11 | from .legacy import ( 12 | LoRADeploymentJob, 13 | LoRADeploymentProvider, 14 | ) 15 | from .together import ( 16 | TogetherDeploymentConfig, 17 | ) 18 | from .wandb import ( 19 | WandbDeploymentConfig, 20 | deploy_wandb, 21 | ) 22 | 23 | __all__ = [ 24 | # New API 25 | "DeploymentConfig", 26 | "DeploymentResult", 27 | "Provider", 28 | "TogetherDeploymentConfig", 29 | "WandbDeploymentConfig", 30 | "deploy_model", 31 | "deploy_wandb", 32 | # Legacy API 33 | "LoRADeploymentJob", 34 | "LoRADeploymentProvider", 35 | ] 36 | -------------------------------------------------------------------------------- /src/art/utils/get_model_step.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import TYPE_CHECKING 3 | 4 | from art.utils.output_dirs import get_model_dir 5 | 6 | if TYPE_CHECKING: 7 | from art.model import TrainableModel 8 | 9 | 10 | def get_step_from_dir(output_dir: str) -> int: 11 | os.makedirs(output_dir, exist_ok=True) 12 | checkpoint_dir = os.path.join(output_dir, "checkpoints") 13 | if not os.path.exists(checkpoint_dir): 14 | return 0 15 | 16 | return max( 17 | ( 18 | int(subdir) 19 | for subdir in os.listdir(checkpoint_dir) 20 | if os.path.isdir(os.path.join(checkpoint_dir, subdir)) and subdir.isdigit() 21 | ), 22 | default=0, 23 | ) 24 | 25 | 26 | def get_model_step(model: "TrainableModel", art_path: str) -> int: 27 | return get_step_from_dir(get_model_dir(model=model, art_path=art_path)) 28 | -------------------------------------------------------------------------------- /src/art/local/service.py: -------------------------------------------------------------------------------- 1 | from typing import AsyncIterator, Protocol, runtime_checkable 2 | 3 | from .. import dev, types 4 | from ..preprocessing.pack import DiskPackedTensors 5 | 6 | 7 | @runtime_checkable 8 | class ModelService(Protocol): 9 | def __init__( 10 | self, 11 | model_name: str, 12 | base_model: str, 13 | config: dev.InternalModelConfig, 14 | output_dir: str, 15 | ): 16 | pass 17 | 18 | async def start_openai_server( 19 | self, config: dev.OpenAIServerConfig | None 20 | ) -> None: ... 21 | 22 | async def vllm_engine_is_sleeping(self) -> bool: ... 23 | 24 | def train( 25 | self, 26 | disk_packed_tensors: DiskPackedTensors, 27 | config: types.TrainConfig, 28 | _config: dev.TrainConfig, 29 | verbose: bool = False, 30 | ) -> AsyncIterator[dict[str, float]]: ... 
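# NOTE (illustrative, not part of the protocol): `ModelService` is a
# @runtime_checkable Protocol, so `isinstance(obj, ModelService)` only checks
# that the listed methods exist on the object; signatures are not validated at
# runtime. A hedged sketch of a conforming stub -- the name `StubService` and
# its method bodies are hypothetical, not ART APIs:
#
#     class StubService:
#         def __init__(self, model_name, base_model, config, output_dir):
#             pass
#
#         async def start_openai_server(self, config):
#             pass
#
#         async def vllm_engine_is_sleeping(self) -> bool:
#             return False
#
#         async def train(self, disk_packed_tensors, config, _config, verbose=False):
#             yield {"loss": 0.0}  # calling an async generator returns an AsyncIterator
#
#     assert isinstance(StubService("name", "base", {}, "out"), ModelService)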
31 | -------------------------------------------------------------------------------- /examples/just-the-facts/test_scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import asyncio 4 | 5 | from just_the_facts.scenarios import train_urls, val_urls 6 | from just_the_facts.utils import scrape_article 7 | 8 | 9 | async def test_scraper(): 10 | """Test the scrape_article function with example URLs""" 11 | 12 | # Test URLs from different news sources (using homepage URLs that should exist) 13 | test_urls = train_urls + val_urls 14 | 15 | for url in test_urls: 16 | try: 17 | print(f"\nTesting URL: {url}") 18 | article_text = await scrape_article(url) 19 | print(f"Successfully scraped {len(article_text)} characters") 20 | print(f"First 200 characters: {article_text[:200]}...") 21 | except Exception as e: 22 | print(f"Failed to scrape {url}: {str(e)}") 23 | raise e 24 | 25 | 26 | if __name__ == "__main__": 27 | asyncio.run(test_scraper()) 28 | -------------------------------------------------------------------------------- /src/art/langgraph/logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | 5 | class FileLogger: 6 | def __init__(self, filepath): 7 | self.text_path = filepath 8 | self.pickle_path = filepath + ".pkl" 9 | 10 | def log(self, name, entry): 11 | # Log as readable text 12 | with open(self.text_path, "a") as f: 13 | f.write(f"{name}: {entry}\n") 14 | 15 | # Append to pickle log 16 | with open(self.pickle_path, "ab") as pf: 17 | pickle.dump((name, entry), pf) 18 | 19 | def load_logs(self): 20 | """Load all logs from the pickle file.""" 21 | if not os.path.exists(self.pickle_path): 22 | return [] 23 | logs = [] 24 | with open(self.pickle_path, "rb") as pf: 25 | try: 26 | while True: 27 | logs.append(pickle.load(pf)) 28 | except EOFError: 29 | pass 30 | return logs 31 | -------------------------------------------------------------------------------- /src/art/utils/benchmark_rollout.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Coroutine 2 | 3 | import art 4 | 5 | from ..trajectories import Trajectory, TrajectoryGroup 6 | 7 | 8 | async def benchmark_rollout( 9 | model: str, 10 | num_rollouts: int, 11 | rollout: Callable[[str, int, bool], Coroutine[Any, Any, Trajectory]], 12 | ) -> float: 13 | trajectory_groups = await art.gather_trajectory_groups( 14 | [TrajectoryGroup(rollout(model, i, False) for i in range(num_rollouts))], 15 | pbar_desc="Benchmarking rollout", 16 | ) 17 | 18 | trajectory_group_rewards = [] 19 | 20 | for group in trajectory_groups: 21 | total_reward = sum(trajectory.reward for trajectory in group) 22 | trajectory_group_rewards.append(total_reward / len(group)) 23 | 24 | average_reward = sum(trajectory_group_rewards) / len(trajectory_group_rewards) 25 | 26 | print(f"Average reward for {model}: {average_reward}") 27 | 28 | return average_reward 29 | -------------------------------------------------------------------------------- /src/art/utils/logging.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | # ---------- lightweight "nice print" helpers ---------- 5 | class _C: 6 | RESET = "\x1b[0m" 7 | DIM = "\x1b[2m" 8 | BOLD = "\x1b[1m" 9 | ITAL = "\x1b[3m" 10 | GRAY = "\x1b[90m" 11 | BLUE = "\x1b[34m" 12 | CYAN = "\x1b[36m" 13 | GREEN = "\x1b[32m" 14 | YELLOW = "\x1b[33m" 15 | RED = 
"\x1b[31m" 16 | MAGENTA = "\x1b[35m" 17 | 18 | 19 | def _ts(): 20 | return time.strftime("%H:%M:%S") 21 | 22 | 23 | def info(msg): 24 | print(f"[{_ts()}] {_C.BLUE}INFO{_C.RESET} {msg}") 25 | 26 | 27 | def step(msg): 28 | print(f"[{_ts()}] {_C.CYAN}STEP{_C.RESET} {msg}") 29 | 30 | 31 | def ok(msg): 32 | print(f"[{_ts()}] {_C.GREEN}OK{_C.RESET} {msg}") 33 | 34 | 35 | def warn(msg): 36 | print(f"[{_ts()}] {_C.YELLOW}WARN{_C.RESET} {msg}") 37 | 38 | 39 | def err(msg): 40 | print(f"[{_ts()}] {_C.RED}ERR{_C.RESET} {msg}") 41 | 42 | 43 | def dim(msg): 44 | print(f"{_C.DIM}{msg}{_C.RESET}") 45 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/rules.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | RULES = [ 4 | "You are a customer service representative for an online retail company. You are chatting with a customer, and you can call tools or respond to the user.", 5 | "The agent should always first confirm the user id by email or name+zip before proceeding with any task.", 6 | "The agent should not proceed with any task if the user id is not found.", 7 | "For any change to the backend database, e.g., address update, refund, or order cancellation, the agent must confirm the transaction details with the user and ask for permission, and get explicit authorization (yes) to proceed.", 8 | "The agent should solve the user task given the tools, without transferring to a human agent.", 9 | "The agent should not make up any information or knowledge not provided from the user or the tools.", 10 | "The agent should at most make one tool call at a time, and if the agent makes a tool call, it does not respond to the user at the same time.", 11 | ] 12 | -------------------------------------------------------------------------------- /docs/analytics.js: -------------------------------------------------------------------------------- 1 | !(function () { 2 | var reb2b = (window.reb2b = window.reb2b || []); 3 | if (reb2b.invoked) return; 4 | reb2b.invoked = true; 5 | reb2b.methods = ["identify", "collect"]; 6 | reb2b.factory = function (method) { 7 | return function () { 8 | var args = Array.prototype.slice.call(arguments); 9 | args.unshift(method); 10 | reb2b.push(args); 11 | return reb2b; 12 | }; 13 | }; 14 | for (var i = 0; i < reb2b.methods.length; i++) { 15 | var key = reb2b.methods[i]; 16 | reb2b[key] = reb2b.factory(key); 17 | } 18 | reb2b.load = function (key) { 19 | var script = document.createElement("script"); 20 | script.type = "text/javascript"; 21 | script.async = true; 22 | script.src = 23 | "https://s3-us-west-2.amazonaws.com/b2bjsstore/b/" + key + "/reb2b.js.gz"; 24 | var first = document.getElementsByTagName("script")[0]; 25 | first.parentNode.insertBefore(script, first); 26 | }; 27 | reb2b.SNIPPET_VERSION = "1.0.1"; 28 | reb2b.load("4O7Z0HMXYWNX"); 29 | })(); 30 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # I recommend setting your API key here if you're going to ssh into a new machine and use the local backend 2 | WANDB_API_KEY=YOUR_WANDB_API_KEY 3 | 4 | # Optional, git-related environment variables 5 | # You may need these if you want to make any git commits on a new machine 6 | GIT_USER_NAME="Your Name" 7 | GIT_USER_EMAIL=your.email@example.com 8 | # A GitHub token might be required for commiting to the private 
`agent-reinforcement-training` repository 9 | GITHUB_TOKEN=YOUR_GITHUB_TOKEN 10 | 11 | # HuggingFace Token (optional for most models, necessary for training gated models like Llama 3.1) 12 | HF_TOKEN=YOUR_HUGGINGFACE_TOKEN 13 | 14 | # Optional, OpenPipe API key 15 | OPENPIPE_API_KEY=YOUR_OPENPIPE_API_KEY 16 | # Optional, Together API key (used for deploying models to Together) 17 | TOGETHER_API_KEY=YOUR_TOGETHER_API_KEY 18 | 19 | # Optional, S3 configuration for log and model backups 20 | AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID 21 | AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY 22 | AWS_REGION=YOUR_AWS_REGION 23 | BACKUP_BUCKET=YOUR_BACKUP_BUCKET -------------------------------------------------------------------------------- /examples/just-the-facts/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "just-the-facts" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "aiohttp>=3.12.14", 9 | "asyncio>=3.4.3", 10 | "beautifulsoup4>=4.13.4", 11 | "click>=8.1.8", 12 | "feedparser>=6.0.11", 13 | "lxml>=6.0.0", 14 | "lxml-html-clean>=0.4.2", 15 | "mcp>=1.11.0", 16 | "newspaper3k>=0.2.8", 17 | "openai>=1.74.0", 18 | "openpipe-art[skypilot]", 19 | "python-dotenv>=1.1.1", 20 | "tenacity>=9.1.2", 21 | "weave>=0.51.56", 22 | ] 23 | 24 | [build-system] 25 | requires = ["setuptools>=61.0", "wheel"] 26 | build-backend = "setuptools.build_meta" 27 | 28 | [tool.setuptools.packages.find] 29 | where = ["."] 30 | include = ["just_the_facts*"] 31 | 32 | [tool.uv.sources] 33 | openpipe-art = { path = "../../", editable = true } 34 | 35 | [dependency-groups] 36 | dev = [ 37 | "polars>=1.31.0", 38 | "ipywidgets>=8.1.6", 39 | "ipykernel>=6.29.5", 40 | "matplotlib>=3.10.3", 41 | "seaborn>=0.13.2", 42 | ] 43 | -------------------------------------------------------------------------------- /src/art/utils/old_benchmarking/generate_comparison_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from .load_benchmarked_models import load_benchmarked_models 4 | from .types import BenchmarkedModelKey 5 | 6 | 7 | def generate_comparison_table( 8 | project: str, 9 | benchmark_keys: list[BenchmarkedModelKey], 10 | metrics: list[str] = ["reward"], 11 | api_path: str = "./.art", 12 | ) -> pd.DataFrame: 13 | benchmarked_models = load_benchmarked_models( 14 | project, benchmark_keys, metrics, api_path 15 | ) 16 | 17 | rows: list[dict[str, str]] = [] 18 | 19 | for benchmarked_model in benchmarked_models: 20 | for step in benchmarked_model.steps: 21 | row = { 22 | "Model": benchmarked_model.model_key.model, 23 | "Split": benchmarked_model.model_key.split, 24 | "Step": f"{step.index:04d}", 25 | } 26 | for metric in metrics: 27 | row[metric] = str(step.metrics.get(metric, "N/A")) 28 | rows.append(row) 29 | 30 | return pd.DataFrame(rows, columns=pd.Index(["Model", "Split", "Step"] + metrics)) 31 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/think.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class Think(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], thought: str) -> str: 11 | return "" 12 | 13 | @staticmethod 14 | def get_info() -> Dict[str, 
Any]: 15 | return { 16 | "type": "function", 17 | "function": { 18 | "name": "think", 19 | "description": "Use the tool to think about something. It will not obtain new information or change the database, but just append the thought to the log. Use it when complex reasoning is needed.", 20 | "parameters": { 21 | "type": "object", 22 | "properties": { 23 | "thought": { 24 | "type": "string", 25 | "description": "A thought to think about.", 26 | }, 27 | }, 28 | "required": ["thought"], 29 | }, 30 | }, 31 | } 32 | -------------------------------------------------------------------------------- /dev/tau-bench/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Sierra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /dev/new_models/prompts.json: -------------------------------------------------------------------------------- 1 | ["respond with 'yes', 'no', 'maybe'", "respond with 'maybe', 'yes', 'no'", "respond with 'no', 'yes', 'maybe'", "respond with 'yes', 'maybe', 'no'", "respond with yes or no", "respond with maybe or no", "respond with no or maybe", "respond with no or yes", "respond with yes or no", "respond with yes, no, maybe", "respond with maybe, yes, no", "respond with no, yes, maybe", "respond with yes, maybe, no", "respond with yes or no", "respond with maybe or no", "respond with no or maybe", "respond with no or yes", "respond with yes or no", "just respond with 'yes', 'no', 'maybe'", "just respond with 'maybe', 'yes', 'no'", "just respond with 'no', 'yes', 'maybe'", "just respond with 'yes', 'maybe', 'no'", "just respond with yes or no", "just respond with maybe or no", "just respond with no or maybe", "just respond with no or yes", "just respond with yes or no", "just respond with yes, no, maybe", "just respond with maybe, yes, no", "just respond with no, yes, maybe", "just respond with yes, maybe, no", "just respond with yes or no", "just respond with maybe or no", "just respond with no or maybe", "just respond with no or yes", "just respond with yes or no"] -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/model_utils/model/vllm_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import requests 4 | 5 | from tau_bench.model_utils.model.general_model import wrap_temperature 6 | 7 | 8 | def generate_request( 9 | url: str, 10 | prompt: str, 11 | temperature: float = 0.0, 12 | force_json: bool = False, 13 | **req_body_kwargs: Any, 14 | ) -> str: 15 | args = { 16 | "prompt": prompt, 17 | "temperature": wrap_temperature(temperature), 18 | "max_tokens": 4096, 19 | **req_body_kwargs, 20 | } 21 | if force_json: 22 | # the prompt will have a suffix of '```json\n' to indicate that the response should be a JSON object 23 | args["stop"] = ["```"] 24 | res = requests.post( 25 | url, 26 | json=args, 27 | ) 28 | res.raise_for_status() 29 | json_res = res.json() 30 | if "text" not in json_res: 31 | raise ValueError(f"Unexpected response: {json_res}") 32 | elif len(json_res["text"]) == 0: 33 | raise ValueError(f"Empty response: {json_res}") 34 | text = json_res["text"][0] 35 | assert isinstance(text, str) 36 | return text.removeprefix(prompt) 37 | -------------------------------------------------------------------------------- /src/art/yield_trajectory.py: -------------------------------------------------------------------------------- 1 | import contextvars 2 | from typing import Any, Coroutine 3 | 4 | from .trajectories import Trajectory 5 | 6 | 7 | def yield_trajectory(trajectory: Trajectory) -> None: 8 | yield_trajectory_context_var.get().trajectory = trajectory 9 | 10 | 11 | async def capture_yielded_trajectory(coroutine: Coroutine[Any, Any, Any]) -> Trajectory: 12 | with YieldTrajectoryContext(): 13 | await coroutine 14 | trajectory = yield_trajectory_context_var.get().trajectory 15 | if trajectory is None: 16 | raise RuntimeError("No trajectory yielded") 17 | return trajectory 18 | 19 | 20 | class YieldTrajectoryContext: 21 | def __init__(self) -> None: 22 | self.trajectory: Trajectory | None = None 23 | 24 | def __enter__(self) -> None: 25 | self.token = 
yield_trajectory_context_var.set(self) 26 | 27 | def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: 28 | yield_trajectory_context_var.reset(self.token) 29 | 30 | 31 | yield_trajectory_context_var: contextvars.ContextVar[YieldTrajectoryContext] = ( 32 | contextvars.ContextVar("yield_trajectory_context", default=YieldTrajectoryContext()) 33 | ) 34 | -------------------------------------------------------------------------------- /src/art/dev/train.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | from typing_extensions import TypedDict 4 | 5 | 6 | class TrainConfig(TypedDict, total=False): 7 | advantage_balance: float 8 | """Balance between negative and positive advantages in the range [-1.0, 1.0]. \ 9 | -1.0 means only training on negative advantages, 1.0 means only training on \ 10 | positive advantages. Defaults to 0.0 (perfectly balanced).""" 11 | allow_training_without_logprobs: bool 12 | epsilon: float # clip epsilon, using the same name as TRL 13 | epsilon_high: ( 14 | float | None 15 | ) # asymmetric clip upper bound. Defaults to epsilon when None 16 | importance_sampling_level: Literal[ 17 | "token", "sequence", "average", "geometric_average" 18 | ] 19 | kimi_k2_tau: float | None 20 | logprob_calculation_chunk_size: int 21 | mask_prob_ratio: bool 22 | max_negative_advantage_importance_sampling_weight: float 23 | num_trajectories_learning_rate_multiplier_power: float 24 | plot_tensors: bool 25 | ppo: bool 26 | precalculate_logprobs: bool 27 | scale_learning_rate_by_reward_std_dev: bool 28 | scale_rewards: bool 29 | truncated_importance_sampling: float | None 30 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/get_user_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetUserDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], user_id: str) -> str: 12 | users = data["users"] 13 | if user_id in users: 14 | return json.dumps(users[user_id]) 15 | return "Error: user not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_user_details", 23 | "description": "Get the details of a user, including their orders.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "user_id": { 28 | "type": "string", 29 | "description": "The user id, such as 'sara_doe_496'.", 30 | }, 31 | }, 32 | "required": ["user_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Code Quality Checks 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | push: 7 | branches: [ main ] 8 | 9 | jobs: 10 | quality-checks: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout code 15 | uses: actions/checkout@v4 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.10' 21 | 22 | - name: Install uv 23 | run: | 24 | curl -LsSf https://astral.sh/uv/install.sh | sh 25 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 26 | 27 | - name: Install dependencies 28 | run: | 29 | uv sync --all-extras 30 | 31 | - name: Run code 
quality checks 32 | run: | 33 | ./scripts/run_checks.sh --verbose-test-failure || { 34 | echo "" 35 | echo "❌ Code quality checks failed!" 36 | echo "" 37 | echo "To fix these issues locally, run:" 38 | echo " ./scripts/run_checks.sh --fix" 39 | echo "" 40 | echo "Then commit and push the changes." 41 | echo "" 42 | echo "For more details, see CONTRIBUTING.md" 43 | exit 1 44 | } -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/get_user_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetUserDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], user_id: str) -> str: 12 | users = data["users"] 13 | if user_id in users: 14 | return json.dumps(users[user_id]) 15 | return "Error: user not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_user_details", 23 | "description": "Get the details of a user, including their reservations.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "user_id": { 28 | "type": "string", 29 | "description": "The user id, such as 'sara_doe_496'.", 30 | }, 31 | }, 32 | "required": ["user_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/list_all_product_types.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class ListAllProductTypes(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any]) -> str: 12 | products = data["products"] 13 | product_dict = { 14 | product["name"]: product["product_id"] for product in products.values() 15 | } 16 | product_dict = dict(sorted(product_dict.items())) 17 | return json.dumps(product_dict) 18 | 19 | @staticmethod 20 | def get_info() -> Dict[str, Any]: 21 | return { 22 | "type": "function", 23 | "function": { 24 | "name": "list_all_product_types", 25 | "description": "List the name and product id of all product types. Each product type has a variety of different items with unique item ids and options. There are only 50 product types in the store.", 26 | "parameters": { 27 | "type": "object", 28 | "properties": {}, 29 | "required": [], 30 | }, 31 | }, 32 | } 33 | -------------------------------------------------------------------------------- /src/art/utils/benchmarking/filter_model_split.py: -------------------------------------------------------------------------------- 1 | try: 2 | import polars as pl 3 | except ImportError: 4 | raise ImportError( 5 | "Plotting dependencies are not installed. 
Please install them with: " 6 | "pip install openpipe-art[plotting]" 7 | ) 8 | 9 | from art.utils.benchmarking.types import BenchmarkModelKey 10 | 11 | 12 | def filter_rename_model_split( 13 | df: pl.DataFrame, models: list[BenchmarkModelKey] 14 | ) -> pl.DataFrame: 15 | # filter by combinations of name + split 16 | z = pl.fold( 17 | acc=pl.lit(False), 18 | function=lambda acc, expr: acc | expr, 19 | exprs=[ 20 | (pl.col("model") == model.name) & (pl.col("split") == model.split) 21 | for model in models 22 | ], 23 | ) 24 | 25 | df = df.filter(z) 26 | 27 | for model in models: 28 | if model.name != model.display_name: 29 | df = df.with_columns( 30 | pl.when( 31 | (pl.col("model") == model.name) & (pl.col("split") == model.split) 32 | ) 33 | .then(pl.lit(model.display_name)) 34 | .otherwise(pl.col("model")) 35 | .alias("model") 36 | ) 37 | 38 | return df 39 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Optional, Union 4 | 5 | from tau_bench.envs.base import Env 6 | from tau_bench.envs.user import UserStrategy 7 | 8 | 9 | def get_env( 10 | env_name: str, 11 | user_strategy: Union[str, UserStrategy], 12 | user_model: str, 13 | task_split: str, 14 | user_provider: Optional[str] = None, 15 | task_index: Optional[int] = None, 16 | ) -> Env: 17 | if env_name == "retail": 18 | from tau_bench.envs.retail import MockRetailDomainEnv 19 | 20 | return MockRetailDomainEnv( 21 | user_strategy=user_strategy, 22 | user_model=user_model, 23 | task_split=task_split, 24 | user_provider=user_provider, 25 | task_index=task_index, 26 | ) 27 | elif env_name == "airline": 28 | from tau_bench.envs.airline import MockAirlineDomainEnv 29 | 30 | return MockAirlineDomainEnv( 31 | user_strategy=user_strategy, 32 | user_model=user_model, 33 | task_split=task_split, 34 | user_provider=user_provider, 35 | task_index=task_index, 36 | ) 37 | else: 38 | raise ValueError(f"Unknown environment: {env_name}") 39 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from .book_reservation import BookReservation 4 | from .calculate import Calculate 5 | from .cancel_reservation import CancelReservation 6 | from .get_reservation_details import GetReservationDetails 7 | from .get_user_details import GetUserDetails 8 | from .list_all_airports import ListAllAirports 9 | from .search_direct_flight import SearchDirectFlight 10 | from .search_onestop_flight import SearchOnestopFlight 11 | from .send_certificate import SendCertificate 12 | from .think import Think 13 | from .transfer_to_human_agents import TransferToHumanAgents 14 | from .update_reservation_baggages import UpdateReservationBaggages 15 | from .update_reservation_flights import UpdateReservationFlights 16 | from .update_reservation_passengers import UpdateReservationPassengers 17 | 18 | ALL_TOOLS = [ 19 | BookReservation, 20 | Calculate, 21 | CancelReservation, 22 | GetReservationDetails, 23 | GetUserDetails, 24 | ListAllAirports, 25 | SearchDirectFlight, 26 | SearchOnestopFlight, 27 | SendCertificate, 28 | Think, 29 | TransferToHumanAgents, 30 | UpdateReservationBaggages, 31 | UpdateReservationFlights, 32 | UpdateReservationPassengers, 33 | ] 34 | 
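# NOTE (illustrative): each class in ALL_TOOLS implements the shared Tool
# interface -- a static invoke(data, ...) plus a static get_info() returning an
# OpenAI-style function schema. A hedged sketch of turning this registry into a
# chat-completions tool list; `openai_tools` and `tool_map` are names we made
# up for illustration:
#
#     openai_tools = [tool.get_info() for tool in ALL_TOOLS]
#     tool_map = {spec["function"]["name"]: tool
#                 for tool, spec in zip(ALL_TOOLS, openai_tools)}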
-------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # OpenPipe Documentation 2 | 3 | This repository contains the source for the ART documentation website hosted at [https://art.openpipe.ai](https://art.openpipe.ai). 4 | 5 | ## Prerequisites 6 | 7 | Ensure you have the following packages installed on your machine: 8 | 9 | - [pnpm](https://pnpm.io/installation) 10 | - [node](https://nodejs.org/en/download/) 11 | 12 | ## Contributing 13 | 14 | To edit the documentation, follow these steps: 15 | 16 | 1. Clone the repository 17 | 2. Navigate to the `docs` directory 18 | 3. Run `pnpm install` to install the dependencies 19 | 4. Run `pnpm dev` to start the development server 20 | 5. Edit the files in the `docs` directory 21 | 22 | Edits to files should be reflected immediately in the development server. 23 | 24 | ### Adding new pages 25 | 26 | 1. Create a new .mdx file in the `docs` directory 27 | 2. Navigate to the `docs.json` file and add the new page to the appropriate section of the `navigation` array, or create a new section. Ensure that the path to the new page is correct. 28 | 29 | ### Deploying changes 30 | 31 | To deploy changes to the hosted docs, commit your changes in a new git branch and create a pull request. Once the pull request is merged, the changes will be deployed to the hosted docs. 32 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/get_reservation_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetReservationDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], reservation_id: str) -> str: 12 | reservations = data["reservations"] 13 | if reservation_id in reservations: 14 | return json.dumps(reservations[reservation_id]) 15 | return "Error: reservation not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_reservation_details", 23 | "description": "Get the details of a reservation.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "reservation_id": { 28 | "type": "string", 29 | "description": "The reservation id, such as '8JX2WO'.", 30 | }, 31 | }, 32 | "required": ["reservation_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/get_order_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetOrderDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], order_id: str) -> str: 12 | orders = data["orders"] 13 | if order_id in orders: 14 | return json.dumps(orders[order_id]) 15 | return "Error: order not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_order_details", 23 | "description": "Get the status and details of an order.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "order_id": { 28 | "type": "string", 29 | "description": "The order id, such 
as '#W0000000'. Be careful there is a '#' symbol at the beginning of the order id.", 30 | }, 31 | }, 32 | "required": ["order_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /examples/mcp-rl/servers/python/mcp_alphavantage/README.md: -------------------------------------------------------------------------------- 1 | # MCP AlphaVantage Python Server 2 | 3 | A Python implementation of the MCP server for Alpha Vantage financial data API. 4 | 5 | ## Features 6 | 7 | - Real-time stock quotes 8 | - Daily time series data 9 | - Symbol search 10 | - Company overview/fundamentals 11 | - Technical indicators (SMA, RSI) 12 | 13 | ## Setup 14 | 15 | 1. Get an API key from [Alpha Vantage](https://www.alphavantage.co/support/#api-key) 16 | 2. Set the environment variable: 17 | ```bash 18 | export ALPHAVANTAGE_API_KEY=your_api_key_here 19 | ``` 20 | 21 | ## Usage 22 | 23 | ### Command Line 24 | ```bash 25 | python server.py --api-key YOUR_API_KEY 26 | ``` 27 | 28 | ### With Environment Variable 29 | ```bash 30 | export ALPHAVANTAGE_API_KEY=your_api_key 31 | python server.py 32 | ``` 33 | 34 | ### Available Tools 35 | 36 | - `get_stock_quote`: Get real-time stock quote 37 | - `get_time_series_daily`: Get daily stock data 38 | - `search_symbol`: Search for stock symbols 39 | - `get_company_overview`: Get company fundamentals 40 | - `get_sma`: Simple Moving Average indicator 41 | - `get_rsi`: Relative Strength Index indicator 42 | 43 | ## Transport Options 44 | 45 | - `stdio` (default): Standard input/output transport 46 | - `sse`: Server-sent events over HTTP -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/get_product_details.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | import json 4 | from typing import Any, Dict 5 | 6 | from tau_bench.envs.tool import Tool 7 | 8 | 9 | class GetProductDetails(Tool): 10 | @staticmethod 11 | def invoke(data: Dict[str, Any], product_id: str) -> str: 12 | products = data["products"] 13 | if product_id in products: 14 | return json.dumps(products[product_id]) 15 | return "Error: product not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "get_product_details", 23 | "description": "Get the inventory details of a product.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "product_id": { 28 | "type": "string", 29 | "description": "The product id, such as '6086499569'. 
Be careful the product id is different from the item id.", 30 | }, 31 | }, 32 | "required": ["product_id"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/find_user_id_by_email.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class FindUserIdByEmail(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], email: str) -> str: 11 | users = data["users"] 12 | for user_id, profile in users.items(): 13 | if profile["email"].lower() == email.lower(): 14 | return user_id 15 | return "Error: user not found" 16 | 17 | @staticmethod 18 | def get_info() -> Dict[str, Any]: 19 | return { 20 | "type": "function", 21 | "function": { 22 | "name": "find_user_id_by_email", 23 | "description": "Find user id by email. If the user is not found, the function will return an error message.", 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "email": { 28 | "type": "string", 29 | "description": "The email of the user, such as 'something@example.com'.", 30 | }, 31 | }, 32 | "required": ["email"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /src/art/transformers/patches.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Optional, Union 2 | 3 | import torch 4 | from transformers import masking_utils 5 | from transformers.cache_utils import Cache 6 | from transformers.configuration_utils import PretrainedConfig 7 | 8 | if TYPE_CHECKING: 9 | from torch.nn.attention.flex_attention import BlockMask 10 | 11 | _preprocess_mask_arguments = masking_utils._preprocess_mask_arguments 12 | 13 | 14 | def _patched_preprocess_mask_arguments( 15 | config: PretrainedConfig, 16 | input_embeds: torch.Tensor, 17 | attention_mask: Optional[Union[torch.Tensor, "BlockMask"]], 18 | cache_position: torch.Tensor, 19 | past_key_values: Optional[Cache], 20 | position_ids: Optional[torch.Tensor], 21 | layer_idx: Optional[int], 22 | ) -> tuple[bool, Optional[Union[torch.Tensor, "BlockMask"]], int, int]: 23 | if position_ids is not None and len(position_ids.shape) == 3: 24 | position_ids = position_ids[0] 25 | return _preprocess_mask_arguments( 26 | config, 27 | input_embeds, 28 | attention_mask, 29 | cache_position, 30 | past_key_values, 31 | position_ids, 32 | layer_idx, 33 | ) 34 | 35 | 36 | def patch_preprocess_mask_arguments() -> None: 37 | masking_utils._preprocess_mask_arguments = _patched_preprocess_mask_arguments 38 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/transfer_to_human_agents.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class TransferToHumanAgents(Tool): 9 | @staticmethod 10 | def invoke( 11 | data: Dict[str, Any], 12 | summary: str, 13 | ) -> str: 14 | return "Transfer successful" 15 | 16 | @staticmethod 17 | def get_info() -> Dict[str, Any]: 18 | return { 19 | "type": "function", 20 | "function": { 21 | "name": "transfer_to_human_agents", 22 | "description": "Transfer the user to a human agent, with a summary of the user's issue. 
Only transfer if the user explicitly asks for a human agent, or if the user's issue cannot be resolved by the agent with the available tools.", 23 | "parameters": { 24 | "type": "object", 25 | "properties": { 26 | "summary": { 27 | "type": "string", 28 | "description": "A summary of the user's issue.", 29 | }, 30 | }, 31 | "required": [ 32 | "summary", 33 | ], 34 | }, 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Load environment variables from .env file if it exists 4 | if [ -f .env ]; then 5 | # Read .env file line by line, ignoring comments and empty lines 6 | while IFS= read -r line || [ -n "$line" ]; do 7 | # Skip comments and empty lines 8 | [[ $line =~ ^#.*$ ]] && continue 9 | [[ -z $line ]] && continue 10 | 11 | # Export the variable 12 | export "$line" 13 | done < .env 14 | fi 15 | 16 | # Configure git user name and email 17 | git config --global user.name "${GIT_USER_NAME}" 18 | git config --global user.email "${GIT_USER_EMAIL}" 19 | git config --global --add safe.directory /root/sky_workdir 20 | 21 | if [ "${GIT_RESET_CLEAN:-true}" = "true" ]; then 22 | # Reset any uncommitted changes to the last commit 23 | git reset --hard HEAD 24 | 25 | # Remove all untracked files and directories 26 | git clean -fd 27 | else 28 | echo "Skipping git reset/clean (GIT_RESET_CLEAN is not true). Preserving synced working tree." 29 | fi 30 | 31 | # Install astral-uv 32 | sudo snap install --classic astral-uv 33 | 34 | # Update uv 35 | uv self update 36 | 37 | # Install tmux 38 | apt install tmux -y 39 | 40 | # Sync the dependencies 41 | if [ "${INSTALL_EXTRAS:-false}" = "true" ]; then 42 | uv sync --all-extras 43 | else 44 | uv sync --extra backend 45 | fi -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/think.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class Think(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], thought: str) -> str: 11 | # This method does not change the state of the data; it simply returns an empty string. 12 | return "Thought Completed" 13 | 14 | @staticmethod 15 | def get_info() -> Dict[str, Any]: 16 | return { 17 | "type": "function", 18 | "function": { 19 | "name": "think", 20 | "description": ( 21 | "Use the tool to think about something. It will not obtain new information or change the database, " 22 | "but just append the thought to the log. Use it when complex reasoning or some cache memory is needed." 
23 | ), 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "thought": { 28 | "type": "string", 29 | "description": "A thought to think about.", 30 | }, 31 | }, 32 | "required": ["thought"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/test_skypilot/register_model.py: -------------------------------------------------------------------------------- 1 | """Register a trainable model with an ART server running on a SkyPilot cluster.""" 2 | 3 | import asyncio 4 | 5 | from dotenv import load_dotenv 6 | from pydantic import BaseModel 7 | 8 | import art 9 | from art.skypilot.backend import SkyPilotBackend 10 | 11 | load_dotenv() 12 | 13 | 14 | class ComplexModelConfig(BaseModel): 15 | max_turns: int = 5 16 | max_tokens: int = 2048 17 | 18 | base_model: str = "Qwen/Qwen2.5-14B-Instruct" 19 | # Random seed to control which subset of the training data is sampled 20 | training_dataset_seed: int | None = None 21 | 22 | # Training configuration 23 | num_epochs: int = 1 # declared here so the ComplexModelConfig(num_epochs=...) call below validates 24 | scale_rewards: bool = True 25 | 26 | 27 | async def register_model(): 28 | backend = await SkyPilotBackend().initialize_cluster( 29 | cluster_name="test-skypilot", 30 | gpu="H100-SXM", 31 | env_path=".env", 32 | # force_restart=True, 33 | ) 34 | 35 | model = art.TrainableModel( 36 | name="complex-model", 37 | project="test-skypilot", 38 | base_model="Qwen/Qwen2.5-14B-Instruct", 39 | config=ComplexModelConfig( 40 | num_epochs=160, 41 | ), 42 | ) 43 | 44 | await backend.register(model) 45 | 46 | print("model registered") 47 | 48 | 49 | if __name__ == "__main__": 50 | asyncio.run(register_model()) 51 | -------------------------------------------------------------------------------- /src/art/vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM integration module for art.""" 2 | 3 | # Engine and worker management 4 | # (server helpers are imported from .server below) 5 | from .engine import ( 6 | WorkerExtension, 7 | create_engine_pause_and_resume_functions, 8 | get_llm, 9 | get_worker, 10 | run_on_workers, 11 | ) 12 | 13 | # Patches - these are typically imported for their side effects 14 | from .patches import ( 15 | patch_allocator, 16 | patch_get_lora_tokenizer_async, 17 | patch_listen_for_disconnect, 18 | patch_lora_request, 19 | patch_multi_step_model_runner, 20 | patch_tool_parser_manager, 21 | subclass_chat_completion_request, 22 | ) 23 | from .server import ( 24 | get_uvicorn_logging_config, 25 | openai_server_task, 26 | set_vllm_log_file, 27 | ) 28 | 29 | __all__ = [ 30 | # Server 31 | "openai_server_task", 32 | "get_uvicorn_logging_config", 33 | "set_vllm_log_file", 34 | # Engine 35 | "get_llm", 36 | "create_engine_pause_and_resume_functions", 37 | "run_on_workers", 38 | "get_worker", 39 | "WorkerExtension", 40 | # Patches 41 | "patch_allocator", 42 | "subclass_chat_completion_request", 43 | "patch_lora_request", 44 | "patch_get_lora_tokenizer_async", 45 | "patch_listen_for_disconnect", 46 | "patch_tool_parser_manager", 47 | "patch_multi_step_model_runner", 48 | ] 49 | -------------------------------------------------------------------------------- /examples/hn_title_generator/skypilot-reference-grpo-trainer.yaml: -------------------------------------------------------------------------------- 1 | # To launch, run the following command from the root directory of the art repository: 2 | # `uv run sky launch examples/hn_title_generator/skypilot-reference-grpo-trainer.yaml --cluster=kyle-hn-title-generator-002 --env-file=.env --yes 
--retry-until-up --down --idle-minutes-to-autostop 60` 3 | 4 | resources: 5 | image_id: pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel 6 | cloud: runpod 7 | region: US 8 | accelerators: 9 | - "H100-SXM" 10 | 11 | workdir: . 12 | 13 | envs: 14 | HF_HUB_ENABLE_HF_TRANSFER: 1 15 | VLLM_CONFIGURE_LOGGING: 0 16 | 17 | setup: | 18 | apt-get update && apt-get install -y git 19 | 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | 22 | # Source the environment to make uv available 23 | source $HOME/.local/bin/env 24 | 25 | uv pip install --system \ 26 | unsloth==2025.3.19 \ 27 | vllm==0.8.2 \ 28 | bitsandbytes==0.45.4 \ 29 | datasets==3.3.2 \ 30 | s3fs==2024.12.0 \ 31 | hf-transfer==0.1.9 \ 32 | typer==0.15.2 \ 33 | fastapi==0.115.11 \ 34 | python-dotenv==1.0.1 \ 35 | polars==1.24.0 \ 36 | wandb==0.19.8 \ 37 | git+https://github.com/corbt/panza.git \ 38 | 39 | echo "Setup complete" 40 | 41 | run: | 42 | echo "Running train_grpo.py" 43 | uv run python examples/hn_title_generator/reference_grpo_trainer.py 44 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/model_utils/model/outlines_completion.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from pydantic import BaseModel 4 | 5 | from tau_bench.model_utils.api.datapoint import Datapoint 6 | from tau_bench.model_utils.model.vllm_completion import VLLMCompletionModel 7 | from tau_bench.model_utils.model.vllm_utils import generate_request 8 | 9 | 10 | class OutlinesCompletionModel(VLLMCompletionModel): 11 | def parse_force_from_prompt( 12 | self, prompt: str, typ: BaseModel, temperature: float | None = None 13 | ) -> dict[str, Any]: 14 | if temperature is None: 15 | temperature = self.temperature 16 | schema = typ.model_json_schema() 17 | res = generate_request( 18 | url=self.url, 19 | prompt=prompt, 20 | force_json=True, 21 | schema=schema, 22 | temperature=temperature, 23 | ) 24 | return self.handle_parse_force_response(prompt=prompt, content=res) 25 | 26 | def get_approx_cost(self, dp: Datapoint) -> float: 27 | return super().get_approx_cost(dp) 28 | 29 | def get_latency(self, dp: Datapoint) -> float: 30 | return super().get_latency(dp) 31 | 32 | def get_capability(self) -> float: 33 | return super().get_capability() 34 | 35 | def supports_dp(self, dp: Datapoint) -> bool: 36 | return super().supports_dp(dp) 37 | -------------------------------------------------------------------------------- /scripts/launch-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CLUSTER_NAME="art" 4 | 5 | # Parse arguments 6 | ARGS=() 7 | PULL_LATEST=true 8 | while [[ $# -gt 0 ]]; do 9 | case "$1" in 10 | -c) 11 | CLUSTER_NAME="$2" 12 | shift 2 13 | ;; 14 | --no-pull) 15 | PULL_LATEST=false 16 | shift 1 17 | ;; 18 | *) 19 | ARGS+=("$1") 20 | shift 21 | ;; 22 | esac 23 | done 24 | 25 | # Check for unstaged changes 26 | if ! git diff --quiet; then 27 | echo "Warning: You have unstaged changes. Unstaged changes will be discarded from the cluster working directory." 28 | fi 29 | 30 | # Check for uncommitted changes 31 | if ! git diff --cached --quiet; then 32 | echo "Warning: You have uncommitted changes. Uncommitted changes will be discarded from the cluster working directory." 33 | fi 34 | 35 | if [[ "$PULL_LATEST" == true ]]; then 36 | echo "Pulling latest changes..." 37 | if ! git pull; then 38 | echo "Error: Failed to pull latest changes." 
39 | exit 1 40 | fi 41 | else 42 | echo "Skipping git pull (deploying current working tree). To pull latest, omit --no-pull." 43 | # Preserve synced working tree on remote by disabling reset/clean. 44 | ARGS+=(--env "GIT_RESET_CLEAN=false") 45 | fi 46 | 47 | # Launch the cluster 48 | uv run sky launch skypilot-config.yaml -c "$CLUSTER_NAME" --env-file .env -y "${ARGS[@]}" -------------------------------------------------------------------------------- /src/art/utils/output_dirs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from art.model import Model 4 | from art.utils.get_repo_root_path import get_repo_root_path 5 | 6 | 7 | def get_default_art_path() -> str: 8 | root_path = get_repo_root_path() 9 | return os.path.join(root_path, ".art") 10 | 11 | 12 | def get_models_dir(project_name: str, art_path: str | None = None) -> str: 13 | if art_path is None: 14 | art_path = get_default_art_path() 15 | return f"{art_path}/{project_name}/models" 16 | 17 | 18 | def get_model_dir(model: Model, art_path: str | None = None) -> str: 19 | if art_path is None: 20 | art_path = get_default_art_path() 21 | return f"{art_path}/{model.project}/models/{model.name}" 22 | 23 | 24 | def get_output_dir_from_model_properties( 25 | project: str, name: str, art_path: str | None = None 26 | ) -> str: 27 | if art_path is None: 28 | art_path = get_default_art_path() 29 | return f"{art_path}/{project}/models/{name}" 30 | 31 | 32 | def get_step_checkpoint_dir(model_output_dir: str, step: int) -> str: 33 | return f"{model_output_dir}/checkpoints/{step:04d}" 34 | 35 | 36 | def get_trajectories_dir(model_output_dir: str) -> str: 37 | return f"{model_output_dir}/trajectories" 38 | 39 | 40 | def get_trajectories_split_dir(model_output_dir: str, split: str) -> str: 41 | return f"{model_output_dir}/trajectories/{split}" 42 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/retail/tools/transfer_to_human_agents.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class TransferToHumanAgents(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], summary: str) -> str: 11 | # This method simulates the transfer to a human agent. 12 | return "Transfer successful" 13 | 14 | @staticmethod 15 | def get_info() -> Dict[str, Any]: 16 | return { 17 | "type": "function", 18 | "function": { 19 | "name": "transfer_to_human_agents", 20 | "description": ( 21 | "Transfer the user to a human agent, with a summary of the user's issue. " 22 | "Only transfer if the user explicitly asks for a human agent, or if the user's issue cannot be resolved by the agent with the available tools." 
23 | ), 24 | "parameters": { 25 | "type": "object", 26 | "properties": { 27 | "summary": { 28 | "type": "string", 29 | "description": "A summary of the user's issue.", 30 | }, 31 | }, 32 | "required": ["summary"], 33 | }, 34 | }, 35 | } 36 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/env.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Optional, Union 4 | 5 | from tau_bench.envs.airline.data import load_data 6 | from tau_bench.envs.airline.rules import RULES 7 | from tau_bench.envs.airline.tools import ALL_TOOLS 8 | from tau_bench.envs.airline.wiki import WIKI 9 | from tau_bench.envs.base import Env 10 | from tau_bench.envs.user import UserStrategy 11 | 12 | 13 | class MockAirlineDomainEnv(Env): 14 | def __init__( 15 | self, 16 | user_strategy: Union[str, UserStrategy] = UserStrategy.LLM, 17 | user_model: str = "gpt-4o", 18 | user_provider: Optional[str] = None, 19 | task_split: str = "test", 20 | task_index: Optional[int] = None, 21 | ): 22 | match task_split: 23 | case "test": 24 | from tau_bench.envs.airline.tasks_test import TASKS as tasks 25 | case _: 26 | raise ValueError(f"Unknown task split: {task_split}") 27 | super().__init__( 28 | data_load_func=load_data, 29 | tools=ALL_TOOLS, 30 | tasks=tasks, 31 | wiki=WIKI, 32 | rules=RULES, 33 | user_strategy=user_strategy, 34 | user_model=user_model, 35 | user_provider=user_provider, 36 | task_index=task_index, 37 | ) 38 | self.terminate_tools = ["transfer_to_human_agents"] 39 | -------------------------------------------------------------------------------- /dev/tau-bench/tau_bench/envs/airline/tools/calculate.py: -------------------------------------------------------------------------------- 1 | # Copyright Sierra 2 | 3 | from typing import Any, Dict 4 | 5 | from tau_bench.envs.tool import Tool 6 | 7 | 8 | class Calculate(Tool): 9 | @staticmethod 10 | def invoke(data: Dict[str, Any], expression: str) -> str: 11 | if not all(char in "0123456789+-*/(). " for char in expression): 12 | return "Error: invalid characters in expression" 13 | try: 14 | return str(round(float(eval(expression, {"__builtins__": None}, {})), 2)) 15 | except Exception as e: 16 | return f"Error: {e}" 17 | 18 | @staticmethod 19 | def get_info() -> Dict[str, Any]: 20 | return { 21 | "type": "function", 22 | "function": { 23 | "name": "calculate", 24 | "description": "Calculate the result of a mathematical expression.", 25 | "parameters": { 26 | "type": "object", 27 | "properties": { 28 | "expression": { 29 | "type": "string", 30 | "description": "The mathematical expression to calculate, such as '2 + 2'. 
The expression can contain numbers, operators (+, -, *, /), parentheses, and spaces.", 31 | }, 32 | }, 33 | "required": ["expression"], 34 | }, 35 | }, 36 | } 37 | -------------------------------------------------------------------------------- /src/art/skypilot/stop_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | 4 | import sky 5 | 6 | from art.skypilot.backend import SkyPilotBackend 7 | from art.skypilot.utils import is_task_created, to_thread_typed 8 | 9 | parser = argparse.ArgumentParser( 10 | description="Close the art server hosted on a skypilot cluster" 11 | ) 12 | parser.add_argument( 13 | "--cluster", 14 | type=str, 15 | required=True, 16 | help="The name of the skypilot cluster to close the art server on", 17 | ) 18 | args = parser.parse_args() 19 | 20 | 21 | async def stop_server() -> None: 22 | cluster_status = await to_thread_typed( 23 | lambda: sky.stream_and_get(sky.status(cluster_names=[args.cluster])) 24 | ) 25 | if len(cluster_status) == 0 or cluster_status[0]["status"] != sky.ClusterStatus.UP: 26 | raise ValueError(f"Cluster {args.cluster} is not running") 27 | 28 | if not await is_task_created(cluster_name=args.cluster, task_name="art_server"): 29 | raise ValueError(f"Art server task for cluster {args.cluster} is not running") 30 | 31 | backend = await SkyPilotBackend.initialize_cluster( 32 | cluster_name=args.cluster, art_version=".", env_path=".env", gpu="H100" 33 | ) 34 | await backend.close() 35 | 36 | # cancel the art server task 37 | await to_thread_typed(lambda: sky.cancel(cluster_name=args.cluster, all=True)) 38 | 39 | 40 | def main() -> None: 41 | asyncio.run(stop_server()) 42 | -------------------------------------------------------------------------------- /AGENT.md: -------------------------------------------------------------------------------- 1 | ## uv package manager by default 2 | 3 | This project uses the `uv` package manager. 4 | 5 | - To add a dependency, run `uv add `. 6 | - To run a script, run `uv run