├── .python-version ├── packages ├── osw-data │ ├── README.md │ ├── src │ │ └── osw_data │ │ │ ├── py.typed │ │ │ ├── __init__.py │ │ │ ├── metrics.py │ │ │ ├── dataset.py │ │ │ ├── trajectory.py │ │ │ ├── annotation.py │ │ │ └── utils.py │ ├── pyproject.toml │ └── tests │ │ ├── test_trajectory.py │ │ ├── test_dataset.py │ │ ├── test_annotation.py │ │ └── test_metrics.py └── autolibra-core │ ├── src │ └── autolibra_core │ │ ├── py.typed │ │ ├── templates │ │ ├── categorize_behavior_feedback.j2 │ │ ├── feedback_grounding.j2 │ │ ├── aspect_traits_match.j2 │ │ ├── generate_metrics_v2.j2 │ │ ├── coverage_evaluation.j2 │ │ ├── llm_as_a_judge_evaluator_v3.j2 │ │ ├── llm_as_a_judge_evaluator.j2 │ │ ├── behavior_clustering.j2 │ │ ├── behavior_extraction.j2 │ │ ├── generate_metrics.j2 │ │ ├── llm_as_a_judge_evaluator_v2.j2 │ │ └── coverage_evaluation_v2.j2 │ │ ├── datasets │ │ ├── care.py │ │ ├── base.py │ │ ├── nnetnav_live.py │ │ ├── sotopia.py │ │ ├── cogym.py │ │ ├── balrog_babaisai.py │ │ ├── balrog_mini.py │ │ ├── webarena_nnetnav.py │ │ ├── webarena.py │ │ ├── webvoyager_nnetnav_best.py │ │ └── webvoyager_nnetnav.py │ │ ├── data │ │ ├── __init__.py │ │ └── primitives.py │ │ ├── evaluators │ │ ├── __init__.py │ │ ├── llm_evaluator.py │ │ └── coverage_evaluator.py │ │ ├── operators │ │ ├── __init__.py │ │ ├── behavior_clustering.py │ │ └── feedback_grounding.py │ │ ├── __init__.py │ │ ├── configs │ │ └── __init__.py │ │ └── utils │ │ └── __init__.py │ ├── README.md │ ├── tests │ ├── positive_aspects_traits.pkl │ └── test_coverage.py │ └── pyproject.toml ├── .github └── workflows │ ├── pre-commit.yml │ └── mypy.yml ├── src ├── tools │ ├── count_annotations.py │ ├── count_score_frequency.py │ └── count_number_steps.py ├── training │ ├── extract_results.py │ ├── llm_as_a_judge.py │ ├── grounding.py │ ├── llm_eval.py │ └── iterative.py ├── plot │ └── meta-eval.py └── tty │ └── view_annotations.py ├── .pre-commit-config.yaml ├── pyproject.toml ├── README.md └── .gitignore /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /packages/osw-data/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/categorize_behavior_feedback.j2: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/autolibra-core/README.md: -------------------------------------------------------------------------------- 1 | This folder contains code for the dataset converter and evaluator. 
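
Each dataset under `src/autolibra_core/datasets/` (for example `care.py`) subclasses `BaseConverter` from `datasets/base.py` and is driven by `run_converter`. Below is a minimal, illustrative sketch of how a new converter would be wired up; `MyConverter` and the `.data/...` paths are hypothetical placeholders, not part of the package:

```python
# Illustrative sketch only; MyConverter and the .data/* paths are hypothetical.
from pathlib import Path

from autolibra_core.datasets.base import BaseConverter, run_converter


class MyConverter(BaseConverter):
    def download_data(self) -> None:
        # Fetch the raw trajectories into self.source_path
        ...

    def convert_to_dataset(self) -> None:
        # Write the converted osw_data dataset into self.output_path
        ...


# run_converter instantiates the converter, then calls
# download_data() followed by convert_to_dataset().
run_converter(
    converter_class=MyConverter,
    output_path=Path(".data/my_dataset"),
    source_path=Path(".data/raw/my_dataset"),
)
```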
2 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/care.py: -------------------------------------------------------------------------------- 1 | from .base import BaseConverter 2 | 3 | 4 | class CareConverter(BaseConverter): 5 | pass 6 | -------------------------------------------------------------------------------- /packages/autolibra-core/tests/positive_aspects_traits.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-Social-World/autolibra/HEAD/packages/autolibra-core/tests/positive_aspects_traits.pkl -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .primitives import Trait, Aspect, MetricTrainingInstance 2 | 3 | __all__ = ["Trait", "Aspect", "MetricTrainingInstance"] 4 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | from .llm_evaluator import run_llm_eval 2 | # from .coverage_evaluator_v2 import coverage_eval 3 | 4 | __all__ = [ 5 | "run_llm_eval", 6 | # "coverage_eval", 7 | ] 8 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from ..data import MetricTrainingInstance 2 | from .feedback_grounding import feedback_grounding 3 | from .behavior_clustering import behavior_clustering 4 | 5 | __all__ = [ 6 | "MetricTrainingInstance", 7 | "feedback_grounding", 8 | "behavior_clustering", 9 | ] 10 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.11 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: 3.11.2 17 | - uses: pre-commit/action@v3.0.0 18 | -------------------------------------------------------------------------------- /src/tools/count_annotations.py: -------------------------------------------------------------------------------- 1 | from osw_data import AnnotationSystem 2 | import rich 3 | 4 | if __name__ == "__main__": 5 | dataset_name = "sotopia" 6 | 7 | annotation_system = AnnotationSystem( 8 | base_path=f".data/annotations/{dataset_name}", 9 | ) 10 | 11 | rich.print( 12 | f"There are {len(annotation_system.get_all_annotations())} annotations in the dataset." 
13 | ) 14 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/__init__.py: -------------------------------------------------------------------------------- 1 | from .operators import ( 2 | feedback_grounding, 3 | behavior_clustering, 4 | ) 5 | 6 | from .evaluators import run_llm_eval 7 | from .data import Trait, Aspect, MetricTrainingInstance 8 | 9 | __all__ = [ 10 | "MetricTrainingInstance", 11 | "feedback_grounding", 12 | "behavior_clustering", 13 | "run_llm_eval", 14 | "Trait", 15 | "Aspect", 16 | # "coverage_eval", 17 | ] 18 | -------------------------------------------------------------------------------- /packages/osw-data/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "osw-data" 3 | version = "0.0.1" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10, <3.13" 7 | dependencies = [ 8 | "numpy>=1.9.3", 9 | "pydantic>=2.7", 10 | "pyyaml>=6.0.2", 11 | "rich>=13.9.4", 12 | "types-PyYAML>=6.0.2" 13 | ] 14 | 15 | 16 | [tool.hatch.build.targets.wheel] 17 | packages = ["src/osw_data"] 18 | 19 | 20 | [build-system] 21 | requires = ["hatchling"] 22 | build-backend = "hatchling.build" 23 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | from pydantic_settings import BaseSettings, SettingsConfigDict 3 | from pydantic import Field 4 | 5 | 6 | class AutoLibraEvalSettings(BaseSettings): 7 | model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") 8 | azure_api_key: str 9 | azure_endpoint: str 10 | github_personal_access_token: str 11 | azure_openai_4o_model: str | None = Field(default=None) 12 | azure_openai_o1_model: str | None = Field(default=None) 13 | azure_openai_o3_model: str | None = Field(default=None) 14 | reasoning_effort: Literal["low", "medium", "high"] = "medium" 15 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import MultiAgentDataset, AgentMetadata, DataInstance 2 | from .trajectory import SymmetricTrajectory, TrajectoryPoint, MediaType, PointType 3 | from .annotation import Annotation, AnnotationSpan, Annotator, AnnotationSystem 4 | from .metrics import Metric, MetricSetMetadata, MetricSet 5 | 6 | __all__ = [ 7 | "DataInstance", 8 | "MultiAgentDataset", 9 | "Annotation", 10 | "AnnotationSpan", 11 | "Annotator", 12 | "AnnotationSystem", 13 | "SymmetricTrajectory", 14 | "TrajectoryPoint", 15 | "AgentMetadata", 16 | "MediaType", 17 | "PointType", 18 | "Metric", 19 | "MetricSetMetadata", 20 | "MetricSet", 21 | ] 22 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/feedback_grounding.j2: -------------------------------------------------------------------------------- 1 | You are an expert analyst tasked with analyzing agent trajectories and human feedback. 2 | 3 | Context: 4 | Review the following trajectory of an AI agent, along with the corresponding human feedback for interaction. 5 | Think about which part of the trajectory the feedback is referring to. 
6 | 7 | Agent Trajectory: 8 | {{ instance.trajectory }} 9 | 10 | Human Feedback: 11 | {{ instance.feedback }} 12 | 13 | Instructions: 14 | 1. Analyze the agent trajectories and corresponding human feedback carefully 15 | 2. Break down the feedback into bulletpoints 16 | 3. For each bulletpoint, find the corresponding part of the trajectory that the feedback is referring to 17 | 18 | Output the following: 19 | -------------------------------------------------------------------------------- /.github/workflows/mypy.yml: -------------------------------------------------------------------------------- 1 | name: Mypy 2 | on: [push] 3 | 4 | jobs: 5 | Static-Type-Checking: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | max-parallel: 5 9 | matrix: 10 | python-version: ["3.10", "3.11", "3.12"] 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Python ${{ matrix.python-version }} 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Display Python version 19 | run: python -c "import sys; print(sys.version)" 20 | - name: Install dependencies 21 | run: | 22 | curl -LsSf https://astral.sh/uv/install.sh | sh 23 | - name: Type-checking package with mypy 24 | run: | 25 | uv run --all-extras mypy --strict . 26 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/data/primitives.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | from osw_data.trajectory import SymmetricTrajectory 3 | from pydantic import BaseModel, Field 4 | from osw_data import Metric 5 | 6 | 7 | class Aspect(BaseModel): 8 | feedback: str 9 | behavior: str 10 | is_positive: bool = Field( 11 | description="Whether the feedback is positive or negative." 12 | ) 13 | 14 | 15 | class Trait(BaseModel): 16 | metric: Metric 17 | rating: Literal[-1, 0, 1] 18 | 19 | 20 | class MetricTrainingInstance: 21 | def __init__( 22 | self, task: str, agent_id: str, trajectory: SymmetricTrajectory, feedback: str 23 | ): 24 | self.task = task 25 | self.agent_id = agent_id 26 | self.trajectory = trajectory 27 | self.feedback = feedback 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | - repo: https://github.com/pre-commit/mirrors-prettier 10 | rev: v3.0.1 # Use the sha / tag you want to point at 11 | hooks: 12 | - id: prettier 13 | types_or: [html] 14 | - repo: https://github.com/astral-sh/ruff-pre-commit 15 | # Ruff version. 16 | rev: v0.3.5 17 | hooks: 18 | # Run the linter. 19 | - id: ruff 20 | types_or: [ python, pyi, jupyter ] 21 | args: [ --fix ] 22 | # Run the formatter. 
23 | - id: ruff-format 24 | types_or: [ python, pyi, jupyter ] 25 | - repo: https://github.com/kynan/nbstripout 26 | rev: 0.6.0 27 | hooks: 28 | - id: nbstripout 29 | -------------------------------------------------------------------------------- /src/tools/count_score_frequency.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | scores = {} 4 | for row in open("llm_eval_results.jsonl"): 5 | data = json.loads(row) 6 | for key, value in data.items(): 7 | # Add all new keys to the dictionary 8 | if key not in scores: 9 | scores[key] = [0, 0, 0] 10 | # Increment the corresponding value 11 | if value == -1: 12 | scores[key][0] += 1 13 | elif value == 0: 14 | scores[key][1] += 1 15 | elif value == 1: 16 | scores[key][2] += 1 17 | 18 | # Remove [0,0,0] entries, as these are purely text 19 | scores = {key: val for key, val in scores.items() if sum(val) > 0} 20 | 21 | for key, val in scores.items(): 22 | print(key, val) 23 | 24 | # Print number of total scores 25 | total_number_of_dps = sum([sum(val) for val in scores.values()]) 26 | print("Total number of datapoints:", total_number_of_dps) 27 | -------------------------------------------------------------------------------- /packages/autolibra-core/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "autolibra-core" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "jinja2>=3.1.5", 9 | "numpy>=1.9.3", 10 | "polars>=1.19.0", 11 | "pydantic-ai>=0.0.18", 12 | "pydantic>=2.7", 13 | "pyyaml>=6.0.2", 14 | "requests>=2.32.3", 15 | "rich>=13.9.4", 16 | "types-requests>=2.32.0.20241016", 17 | "osw-data>=0.0.1", 18 | ] 19 | 20 | [project.optional-dependencies] 21 | webarena = [ 22 | "gdown>=5.2.0", 23 | "pillow>=11.1.0", 24 | ] 25 | sotopia = [ 26 | "huggingface-hub>=0.27.1", 27 | ] 28 | 29 | [tool.hatch.build] 30 | include = ["src/autolibra_core/templates/*.j2"] 31 | 32 | [tool.hatch.build.targets.wheel] 33 | packages = ["src/autolibra_core"] 34 | 35 | [build-system] 36 | requires = ["hatchling"] 37 | build-backend = "hatchling.build" 38 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/aspect_traits_match.j2: -------------------------------------------------------------------------------- 1 | You are an expert agent behavior analyst. You are tasked with analyzing the behaviors and feedback to identify relevant traits. 2 | 3 | Here are the behaviors and feedbacks you need to analyze: 4 | 5 | {% for behavior_feedback in behavior_feedback_list %} 6 | 7 | Aspect {{ loop.index }} 8 | Behavior: {{ behavior_feedback.behavior }} 9 | Feedback: {{ behavior_feedback.feedback }} 10 | 11 | {% endfor %} 12 | 13 | Here are the traits you need to analyze: 14 | 15 | {% for metric in metric_list %} 16 | 17 | Trait {{ loop.index }} 18 | Trait name: {{ metric.name }} 19 | Description: {{ metric.description }} 20 | Good behavior: {{ metric.good_behaviors }} 21 | Bad behavior: {{ metric.bad_behaviors }} 22 | 23 | {% endfor %} 24 | 25 | Output the following: 26 | For each of the aspects, please try to match it with a relevant trait, and provide a reasoning for your choice. 27 | If you think the aspect is not relevant to any of the traits, please output "None of the traits matches the aspect."
28 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/base.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import logging 3 | from typing import Any 4 | 5 | 6 | class BaseConverter(object): 7 | def __init__(self, output_path: Path, source_path: Path) -> None: 8 | self.output_path = output_path 9 | self.source_path = source_path 10 | 11 | # Setup logging 12 | logging.basicConfig(level=logging.INFO) 13 | self.logger = logging.getLogger(type(self).__name__) 14 | 15 | self._setup_constants() 16 | 17 | def _setup_constants(self) -> None: 18 | pass 19 | 20 | def download_data(self) -> None: 21 | raise NotImplementedError() 22 | 23 | def convert_to_dataset(self) -> None: 24 | raise NotImplementedError() 25 | 26 | 27 | def run_converter( 28 | converter_class: type[BaseConverter], 29 | output_path: Path, 30 | source_path: Path, 31 | **kwargs: Any, 32 | ) -> None: 33 | converter = converter_class( 34 | output_path=output_path, source_path=source_path, **kwargs 35 | ) 36 | converter.download_data() 37 | converter.convert_to_dataset() 38 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/generate_metrics_v2.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following interactions between an AI agent and a human, along with the corresponding human feedback for each interaction. 5 | 6 | {% for instance in instances %} 7 | Instance {{ loop.index }}: 8 | 9 | Agent Trajectory: 10 | {{ instance.trajectory }} 11 | 12 | Human Feedback: 13 | {{ instance.feedback }} 14 | 15 | {% endfor %} 16 | 17 | Instructions: 18 | 1. Analyze the agent trajectories and corresponding human feedback carefully 19 | 2. Identify key aspects of performance that emerge from the interactions and feedback 20 | 3. Define a comprehensive set of metrics that capture these aspects 21 | 4. For each metric provide: 22 | - A clear name 23 | - A detailed description of what the metric measures 24 | - Why this metric is important based on the observed interactions 25 | - Example behaviors from the trajectories that would score high or low on this metric 26 | 27 | Output the following: 28 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/coverage_evaluation.j2: -------------------------------------------------------------------------------- 1 | You are an expert in thematic analysis trying to understand the differences between quantitative and qualitative evaluation metrics. 2 | 3 | Context: 4 | Review the following task metadata, agent metadata, the humans' free-form feedback, and the quantitative metrics. 5 | 6 | Task Metadata: 7 | {{ instance.task_metadata }} 8 | 9 | Agent Metadata: 10 | {{ instance.agent_metadata }} 11 | 12 | Humans' feedback: 13 | {{ instance.feedback }} 14 | 15 | Quantitative metrics and their corresponding ratings: 16 | {{ instance.metric }} 17 | 18 | 19 | Instructions: 20 | 1. Analyze the task and humans' feedback carefully 21 | 2. Break the humans' feedback into bullet points. 22 | 3. Understand the metrics provided, and think about what part of the feedback is not captured by the metrics. 23 | 4.
For each bullet point, provide which metric you think covers it, or if it is not covered by any metric. 24 | 5. After that, please rate the coverage of the metrics on the feedback from 0 to 5, where 0 means the metrics do not cover the feedback at all, and 5 means the metrics cover the feedback perfectly. 25 | 26 | Output the following: 27 | 28 | It should be a string (the reasoning) and an integer (the 0-5 rating) in a json format 29 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "autolibra-eval" 7 | version = "0.1.0" 8 | description = "Add your description here" 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | dependencies = [ 12 | "fastapi>=0.115.6", 13 | "autolibra-core[webarena,sotopia]", 14 | "pydantic-settings>=2.7.1", 15 | "rich>=13.9.4", 16 | "typer>=0.15.1", 17 | "uvicorn>=0.34.0", 18 | "streamlit>=1.43", 19 | "pandas>=2.2", 20 | "logfire>=3.7.1", 21 | "matplotlib>=3.10.1", 22 | ] 23 | 24 | [tool.uv] 25 | dev-dependencies = [ 26 | "ipykernel>=6.29.5", 27 | "mypy>=1.14.1", 28 | "pre-commit>=4.0.1", 29 | "pytest-cov>=6.0.0", 30 | "pytest>=8.3.4", 31 | "pytest-asyncio>=0.25.2", 32 | "pandas-stubs>=2.2", 33 | ] 34 | 35 | [project.optional-dependencies] 36 | notebook = ["marimo>=0.10"] 37 | 38 | [tool.uv.sources] 39 | autolibra-core = { workspace = true } 40 | osw-data = { workspace = true } 41 | 42 | [tool.uv.workspace] 43 | members = ["packages/*"] 44 | 45 | [tool.mypy] 46 | strict = true 47 | plugins = ["pydantic.mypy"] 48 | mypy_path = "stubs" 49 | 50 | [tool.hatch.build.targets.wheel] 51 | packages = ["src/osw_data"] 52 | 53 | [tool.pylsp-mypy] 54 | enabled = true 55 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib import resources 2 | import jinja2 3 | from osw_data.dataset import DataInstance 4 | from osw_data.trajectory import SymmetricTrajectory 5 | from autolibra_core.data.primitives import MetricTrainingInstance 6 | 7 | 8 | def load_prompt_template(jinja_file: str) -> jinja2.Template: 9 | with resources.files("autolibra_core.templates").joinpath(jinja_file).open( 10 | "r" 11 | ) as f: 12 | return jinja2.Template(f.read()) 13 | 14 | 15 | def render_webarena_trajectory( 16 | trajectory: SymmetricTrajectory, metadata: DataInstance | None = None 17 | ) -> str: 18 | return "\n".join( 19 | ( 20 | [metadata.model_dump_json()] 21 | if metadata 22 | else [] 23 | ) 24 | + [ 25 | f"{'Observation' if p.point_type == 'observation' else 'Action'}: {trajectory.get_data_at(i)}" 26 | for i, p in enumerate(trajectory.points) 27 | ] 28 | ) 29 | 30 | 31 | def render_training_instance(training_instance: MetricTrainingInstance) -> str: 32 | return "\n".join( 33 | [ 34 | f"The task is {training_instance.task}", 35 | ] 36 | + [ 37 | f"{'Observation' if p.point_type == 'observation' else 'Action'}: {str(training_instance.trajectory.get_data_at(i))[:8000]}" 38 | for i, p in enumerate(training_instance.trajectory.points) 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /src/training/extract_results.py: -------------------------------------------------------------------------------- 1 | import json 2 |
import polars as pl 3 | 4 | 5 | def convert_jsonl_to_table(file_path: str) -> pl.DataFrame: 6 | # Read records into a list 7 | records = [] 8 | with open(file_path, "r") as file: 9 | for line in file: 10 | if line.strip(): # Skip empty lines 11 | record = json.loads(line) 12 | records.append(record) 13 | 14 | # Convert to Polars DataFrame 15 | df = pl.DataFrame(records) 16 | 17 | # Reorder columns to group reasoning and scores together 18 | reasoning_columns = [col for col in df.columns if col.endswith("_reasoning")] 19 | score_columns = [col for col in df.columns if not col.endswith("_reasoning")] 20 | 21 | # Combine columns in desired order 22 | df = df.select(reasoning_columns + score_columns) 23 | 24 | return df 25 | 26 | 27 | if __name__ == "__main__": 28 | # Replace 'your_file.jsonl' with your actual file path 29 | file_path = "llm_eval_results.jsonl" 30 | try: 31 | df = convert_jsonl_to_table(file_path) 32 | 33 | # Save to CSV 34 | df.write_csv("converted_table.csv") 35 | print("\nTable has been saved to 'converted_table.csv'") 36 | 37 | except FileNotFoundError: 38 | print(f"Error: File '{file_path}' not found") 39 | except json.JSONDecodeError: 40 | print("Error: Invalid JSON format in file") 41 | except Exception as e: 42 | print(f"An error occurred: {str(e)}") 43 | -------------------------------------------------------------------------------- /src/tools/count_number_steps.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Balrog Converter") 6 | parser.add_argument( 7 | "--filename", 8 | type=str, 9 | required=True, 10 | help="The name of the folder containing the Balrog data for the given run in raw", 11 | ) 12 | 13 | filename = parser.parse_args().filename 14 | 15 | file_path = f".data/raw/{filename}" 16 | 17 | scoresteps = {} 18 | 19 | for root, dirs, files in os.walk(file_path): 20 | json_files = [f for f in files if f.endswith(".json") and "summary" not in f] 21 | 22 | for ind_file in json_files: 23 | with open(os.path.join(root, ind_file), "r") as f: 24 | data = json.load(f) 25 | task = data["task"] 26 | 27 | if task not in scoresteps: 28 | scoresteps[task] = [data["num_steps"], data["episode_return"], 1] 29 | else: 30 | scoresteps[task][0] += data["num_steps"] 31 | scoresteps[task][1] += data["episode_return"] 32 | scoresteps[task][2] += 1 33 | 34 | net_avg_steps = 0 35 | net_avg_return = 0 36 | 37 | for key, val in scoresteps.items(): 38 | avg_steps = val[0] / val[2] 39 | avg_return = val[1] / val[2] 40 | 41 | print(key, avg_steps, avg_return) 42 | net_avg_steps += avg_steps 43 | net_avg_return += avg_return 44 | 45 | print("Net average steps:", net_avg_steps / len(scoresteps)) 46 | print("Net average return:", net_avg_return / len(scoresteps)) 47 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/llm_as_a_judge_evaluator_v3.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following task metadata, agent metadata, and its trajectory, and several metrics that you should use to evaluate the agent's performance. 
5 | 6 | Agent Trajectory: 7 | {{ trajectory }} 8 | 9 | Metrics: 10 | {% for metric in metrics %} 11 | 12 | Metric name: {{ metric.name }} 13 | Metric description: {{ metric.explanation }} 14 | Metric good behaviors: {{ metric.good_behaviors }} 15 | Metric bad behaviors: {{ metric.bad_behaviors }} 16 | 17 | 18 | {% endfor %} 19 | 20 | 21 | Instructions: 22 | 1. Analyze the agent task and trajectory carefully 23 | 2. Understand the metrics provided, and figure out which part of the trajectory they might be relevant to 24 | 3. Output two things for each metric: 25 | - A reasoning for why you think the agent either did PERFECTLY well or not on the metric 26 | - If you think the agent did PERFECTLY well, provide the behavior of the agent that led you to that conclusion 27 | - If you think the agent did SOMETHING WRONG on this metric, provide the behavior of the agent that led you to that conclusion 28 | - If you think the metric is not applicable to the agent, provide a reasoning for that 29 | - A string indicating whether you think the agent did well or poorly on the metric 30 | - "1" indicates the agent did PERFECTLY well 31 | - "0" indicates the metric is not applicable to the agent 32 | - "-1" indicates the agent did SOMETHING WRONG 33 | 34 | Output the following: 35 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/llm_as_a_judge_evaluator.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following task metadata, agent metadata, and its trajectory, and a metric that you should use to evaluate the agent's performance. 5 | 6 | Task Metadata: 7 | {{ instance.task_metadata }} 8 | 9 | Agent Metadata: 10 | {{ instance.agent_metadata }} 11 | 12 | Agent Trajectory: 13 | {{ instance.trajectory }} 14 | 15 | Metric: 16 | {{ instance.metric }} 17 | 18 | 19 | Instructions: 20 | 1. Analyze the agent task and trajectory carefully 21 | 2. Understand the metric provided, and figure out which part of the trajectory it might be relevant to 22 | 3. 
Output two things: 23 | - A reasoning for why you think the agent either did well or poorly on the metric 24 | - If you think the agent did well, provide the behavior of the agent that led you to that conclusion 25 | - If you think the agent did poorly, provide the behavior of the agent that led you to that conclusion 26 | - A binary integer indicating whether you think the agent did well or poorly on the metric 27 | - 1 indicates the agent did well or the metric is not applicable to the agent 28 | - 0 indicates the agent did poorly 29 | 30 | Output the following: 31 | 32 | 33 | It should be a string (the reasoning) and an integer (the binary rating) in a json format: 34 | 35 | {'properties': {'reasoning': {'title': 'Reasoning', 'type': 'string'}, 'rating': {'title': 'Rating', 'type': 'integer'}}, 'required': ['reasoning', 'rating'], 'title': 'EvaluationResult', 'type': 'object'} 36 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/behavior_clustering.j2: -------------------------------------------------------------------------------- 1 | You are an expert in agent behavior analysis and are tasked with analyzing agent behaviors and human feedback 2 | to group similar positive and negative behaviors together, so that you can extract the metrics to evaluate the agents. 3 | 4 | Here are the behaviors and feedbacks you need to analyze: 5 | 6 | {% for behavior_feedback in behavior_feedback_list %} 7 | 8 | {{ loop.index }} 9 | Behavior: {{ behavior_feedback.behavior }} 10 | Feedback: {{ behavior_feedback.feedback }} 11 | 12 | {% endfor %} 13 | 14 | Instructions: 15 | 1. Analyze the behaviors and corresponding human feedback carefully 16 | 2. Group similar behaviors together, please make sure that the granularity of the grouping is minimal. Only very similar behaviors should be grouped together. 17 | 3. Output a list of metrics, where each metric is composed of 18 | - A list of good behaviors (sentences describing the behavior) 19 | - A list of bad behaviors (sentences describing the behavior) 20 | - An explanation of the metric's meaning and how the positive behaviors should be determined in novel agent behaviors 21 | - A name for the metric 22 | 4. Make sure to include each behavior in at least one metric 23 | 24 | N.B. 25 | The granularity of the grouping should be minimal, only very similar behaviors should be grouped together. But don't limit to one particular website: 26 | - Good name: Location Query Correctness 27 | - Bad name: Mapping Site Usage Correctness 28 | - Bad name: Arxiv Categories Correctness 29 | - DO NOT add the website name in the metric name including huggingface, arxiv, dictionary, ESPN, Apple, Amazon, BBC, etc. 30 | Also, don't limit to one particular character: 31 | - Good name: Don't settle for less in negotiations 32 | - Bad name: Oliver's negotiation strategy 33 | 34 | Output 4 metrics. 
35 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/nnetnav_live.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import TypedDict 3 | 4 | 5 | class Message(TypedDict): 6 | content: str 7 | 8 | 9 | class NNetNavStepData(TypedDict): 10 | messages: list[Message] 11 | 12 | 13 | def _get_objective(data: NNetNavStepData) -> str: 14 | return data["messages"][1]["content"].split("OBJECTIVE: ")[1].split("\n")[0] 15 | 16 | 17 | def _get_action(data: NNetNavStepData) -> str: 18 | try: 19 | return data["messages"][2]["content"].split("```")[1] 20 | except IndexError: 21 | return "none" 22 | 23 | 24 | def _get_observation(data: NNetNavStepData) -> str: 25 | return data["messages"][1]["content"].split("OBSERVATION: ")[0] 26 | 27 | 28 | if __name__ == "__main__": 29 | objectives: list[str] = [] 30 | with open(".data/raw/nnetnav-live-00/train.jsonl") as f: 31 | for line in f.readlines(): 32 | data = json.loads(line) 33 | objectives.append(_get_objective(data)) 34 | 35 | unique_objectives: list[str] = [] 36 | 37 | instance_starts = [0] 38 | 39 | for line_no, (this_obj, next_obj) in enumerate(zip(objectives, objectives[1:])): 40 | if this_obj != next_obj: 41 | instance_starts.append(line_no + 1) 42 | 43 | instance_ends = instance_starts[1:] + [len(objectives)] 44 | 45 | lines: list[str] = [] 46 | with open(".data/raw/nnetnav-live-00/train.jsonl") as f: 47 | lines = f.readlines() 48 | 49 | for start, end in zip(instance_starts, instance_ends): 50 | instance_lines = lines[start:end] 51 | instance_data = [json.loads(line) for line in instance_lines] 52 | 53 | for data in instance_data: 54 | print(_get_objective(data)) 55 | print(_get_action(data)) 56 | print(_get_observation(data)) 57 | print() 58 | 59 | break 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoLibra ⚖️ Metric Induction for Agents from Open-Ended Human Feedback 2 | 3 | ## Introduction 4 | 5 | AutoLibra is designed to facilitate the evaluation of agents through metrics derived from human feedback. This document outlines the steps for contributors to prepare data, annotate it, and run experiments. 6 | 7 | ## Contributor doc 8 | 9 | ### Prepare the data 10 | 11 | Install git lfs if you haven't already. This is required to download the large files in the dataset. 12 | 13 | #### From scratch 14 | For contributors, it is best to use our shared data repo on Hugging Face: `open-social-world/autolibra`. Upload new datasets to this shared repo. 15 | 16 | ```bash 17 | # Download and preprocess 18 | uv run python -m autolibra_core.datasets.<dataset_name>
19 | ``` 20 | 21 | #### Download from huggingface 22 | 23 | ```bash 24 | git clone https://huggingface.co/datasets/open-social-world/autolibra .data 25 | ``` 26 | 27 | #### Upload your data to huggingface 28 | 29 | ```bash 30 | # cd into .data 31 | # git add your data 32 | # git commit -m "Add <dataset_name>" 33 | git push 34 | ``` 35 | 36 | ### Annotation 37 | ```bash 38 | uv run python src/tty/tty_annotation.py .data/webarena .data/annotations/webarena --annotator-id <annotator_id> 39 | ``` 40 | 41 | ### Annotation Web Interface with Streamlit 42 | ```bash 43 | uv run streamlit run src/tty/tty_annotation.py .data/sotopia .data/annotations/sotopia -- --annotator-id <annotator_id> --use-streamlit 44 | ``` 45 | 46 | ### View Annotations with Streamlit 47 | ```bash 48 | streamlit run src/tty/view_annotations.py -- .data/annotations/sotopia/annotations 49 | ``` 50 | 51 | ### To run metric extraction 52 | ```bash 53 | uv run python -m autolibra_core.gen_eval.generator 54 | ``` 55 | ### Run experiments 56 | Test environments (BALROG, etc.) are included as submodules under .gitmodules. Documentation for using these environments is included within each environment repo. 57 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/behavior_extraction.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following trajectory of an AI agent, along with the corresponding human feedback for each interaction. 5 | 6 | {% for instance in instances %} 7 | Instance {{ loop.index }}: 8 | 9 | Agent Trajectory: 10 | {{ instance.trajectory }} 11 | 12 | Human Feedback: 13 | {{ instance.feedback }} 14 | 15 | {% endfor %} 16 | 17 | Instructions: 18 | 1. Analyze the agent trajectories and human feedback carefully 19 | 2. Identify key aspects of performance that emerge from the interactions and feedback 20 | 3. Break down these aspects into specific agent behaviors and corresponding human feedback 21 | 4. The behavior should be a specific action, a series of actions, or a decision made by the agent 22 | 5. The feedback should be a summarization of the corresponding human feedback 23 | 6.
Please avoid any repetitive or redundant behaviors and feedback 24 | 25 | Output the following: 26 | 27 | It should be list of behavior and feedback, each in the following format: 28 | 29 | {'properties': {'instance_id': {'description': 'The ID of the instance the behavior was observed in', 'title': 'Instance Id', 'type': 'string'}, 'start_step': {'description': 'The step in the trajectory where the behavior started', 'title': 'Start Step', 'type': 'integer'}, 'end_step': {'description': 'The step in the trajectory where the behavior ended', 'title': 'End Step', 'type': 'integer'}, 'behavior': {'description': 'The behavior observed in the trajectory', 'title': 'Behavior', 'type': 'string'}, 'feedback': {'description': 'Summary of human comments on the behavior observed', 'title': 'Feedback', 'type': 'string'}}, 'required': ['instance_id', 'start_step', 'end_step', 'behavior', 'feedback'], 'title': 'BehaviorFeedback', 'type': 'object'} 30 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/operators/behavior_clustering.py: -------------------------------------------------------------------------------- 1 | from importlib import resources 2 | import jinja2 3 | from openai import AsyncAzureOpenAI 4 | from autolibra_core.configs import AutoLibraEvalSettings 5 | from pydantic import BaseModel, ValidationError 6 | from ..data import Aspect 7 | from osw_data import Metric 8 | 9 | 10 | def _load_behavior_clustering_template() -> jinja2.Template: 11 | with resources.files("autolibra_core.templates").joinpath( 12 | "behavior_clustering.j2" 13 | ).open("r") as f: 14 | return jinja2.Template(f.read()) 15 | 16 | 17 | class BehaviorClusteringOutput(BaseModel): 18 | metrics: list[Metric] 19 | 20 | 21 | async def behavior_clustering( 22 | aspects: list[Aspect], 23 | client: AsyncAzureOpenAI, 24 | ) -> BehaviorClusteringOutput: 25 | prompt = _load_behavior_clustering_template().render( 26 | behavior_feedback_list=aspects, 27 | ) 28 | 29 | settings = AutoLibraEvalSettings() 30 | 31 | model = settings.azure_openai_o3_model 32 | assert model is not None 33 | 34 | while True: 35 | try: 36 | completion = await client.beta.chat.completions.parse( 37 | model=model, 38 | messages=[ 39 | # {"role": "system", "content": "Cluster the behaviors."}, 40 | {"role": "user", "content": prompt}, 41 | ], 42 | response_format=BehaviorClusteringOutput, 43 | reasoning_effort="high", 44 | ) 45 | break 46 | except ValidationError as e: 47 | # In rare cases, the response may not be parsed correctly. 48 | # Retry the request. 49 | print(f"Validation error: {e}") 50 | 51 | if not completion.choices[0].message.parsed: 52 | raise ValueError("Failed to parse the response.") 53 | else: 54 | return completion.choices[0].message.parsed 55 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/generate_metrics.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following interactions between an AI agent and a human, along with the corresponding human feedback for each interaction. 
5 | 6 | {% for instance in instances %} 7 | Instance {{ loop.index }}: 8 | 9 | Agent Trajectory: 10 | {{ instance.trajectory }} 11 | 12 | Human Feedback: 13 | {{ instance.feedback }} 14 | 15 | {% endfor %} 16 | 17 | Instructions: 18 | 1. Analyze the agent trajectories and corresponding human feedback carefully 19 | 2. Identify key aspects of performance that emerge from the interactions and feedback 20 | 3. Define a comprehensive set of metrics that capture these aspects 21 | 4. For each metric provide: 22 | - A clear name 23 | - A detailed description of what the metric measures 24 | - Why this metric is important based on the observed interactions 25 | - Example behaviors from the trajectories that would score high or low on this metric 26 | 27 | Output the following: 28 | 29 | Identified Metrics: 30 | [List each metric in the following format] 31 | 32 | 1. Metric Name: [Concise, descriptive name] 33 | Description: [Clear explanation of what this metric measures] 34 | Importance: [Why this metric matters based on the trajectories and feedback] 35 | Examples: 36 | - High Score Example: [Behavior that would exemplify high performance] 37 | - Low Score Example: [Behavior that would indicate poor performance] 38 | 39 | 2. [Continue for each identified metric...] 40 | 41 | Justification: 42 | [Explain how these metrics together provide comprehensive coverage of the important aspects of performance demonstrated in the trajectories and highlighted by the human feedback] 43 | 44 | Additional Considerations: 45 | [Note any context-specific factors that might affect metric applicability or interpretation] 46 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/llm_as_a_judge_evaluator_v2.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following task metadata, agent metadata, and its trajectory, and several metrics that you should use to evaluate the agent's performance. 5 | 6 | Task Metadata: 7 | {{ instance.task_metadata }} 8 | 9 | Agent Metadata: 10 | {{ instance.agent_metadata }} 11 | 12 | Agent Trajectory: 13 | {{ instance.trajectory }} 14 | 15 | Metrics: 16 | {% for metric in instance.metrics %} 17 | 18 | 19 | Metric name: {{ metric.name }} 20 | Metric description: {{ metric.explanation }} 21 | Metric good behaviors: {{ metric.good_behaviors }} 22 | Metric bad behaviors: {{ metric.bad_behaviors }} 23 | 24 | 25 | {% endfor %} 26 | 27 | 28 | Instructions: 29 | 1. Analyze the agent task and trajectory carefully 30 | 2. Understand the metrics provided, and figure out which part of the trajectory they might be relevant to 31 | 3. 
Output two things for each metric: 32 | - A reasoning for why you think the agent either did well or poorly on the metric 33 | - If you think the agent did well, provide the behavior of the agent that led you to that conclusion 34 | - If you think the agent did poorly, provide the behavior of the agent that led you to that conclusion 35 | - If you think the metric is not applicable to the agent, provide a reasoning for that 36 | - A string indicating whether you think the agent did well or poorly on the metric 37 | - "positive" indicates the agent did well 38 | - "N/A" indicates the metric is not applicable to the agent 39 | - "negative" indicates the agent did poorly 40 | 41 | Output the following: 42 | 43 | 44 | It should be a list of string (the reasoning) and an string (judgement) in a json format, please make sure the number of reasoning and judgement is the same as the number of metrics provided. 45 | -------------------------------------------------------------------------------- /packages/osw-data/tests/test_trajectory.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from osw_data.trajectory import ( 3 | SymmetricTrajectory, 4 | MediaType, 5 | PointType, 6 | render_trajectory, 7 | ) 8 | 9 | from datetime import datetime 10 | import numpy as np 11 | 12 | 13 | def test_mixed_trajectory() -> None: 14 | trajectory = SymmetricTrajectory( 15 | trajectory_id="robot_1", storage_path=Path("/tmp/data/trajectories") 16 | ) 17 | 18 | # Add an observation with image data 19 | image = np.random.rand(480, 640, 3) 20 | trajectory.add_point( 21 | timestamp=datetime.now(), 22 | agent_id="robot_1", 23 | point_type=PointType.OBSERVATION, 24 | data=image, 25 | media_type=MediaType.IMAGE, 26 | metadata={"camera_id": "cam_1"}, 27 | ) 28 | 29 | # Add an action with JSON data 30 | action = {"command": "move", "parameters": {"direction": "forward", "speed": 1.0}} 31 | trajectory.add_point( 32 | timestamp=datetime.now(), 33 | agent_id="robot_1", 34 | point_type=PointType.ACTION, 35 | data=action, 36 | media_type=MediaType.JSON, 37 | metadata={"priority": "high"}, 38 | ) 39 | 40 | # Add an observation with JSON data 41 | json_obs = {"position": [1.0, 2.0, 3.0], "orientation": [0.0, 0.0, 1.0]} 42 | trajectory.add_point( 43 | timestamp=datetime.now(), 44 | agent_id="robot_1", 45 | point_type=PointType.OBSERVATION, 46 | data=json_obs, 47 | media_type=MediaType.JSON, 48 | ) 49 | 50 | # Add an action with audio data 51 | audio_command = np.random.rand(16000) # 1 second of audio at 16kHz 52 | trajectory.add_point( 53 | timestamp=datetime.now(), 54 | agent_id="robot_1", 55 | point_type=PointType.ACTION, 56 | data=audio_command, 57 | media_type=MediaType.AUDIO, 58 | metadata={"sample_rate": 16000}, 59 | ) 60 | 61 | render_trajectory(trajectory) 62 | 63 | trajectory.close() 64 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/operators/feedback_grounding.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from importlib import resources 3 | import jinja2 4 | from openai import AsyncAzureOpenAI, RateLimitError 5 | from autolibra_core.configs import AutoLibraEvalSettings 6 | from pydantic import BaseModel 7 | from ..utils import render_training_instance 8 | from ..data import MetricTrainingInstance, Aspect 9 | 10 | 11 | class FeedbackGroundingOutput(BaseModel): 12 | bullet_points: list[Aspect] 13 | 14 | 15 | def 
_load_feedback_grounding_template() -> jinja2.Template: 16 | with resources.files("autolibra_core.templates").joinpath( 17 | "feedback_grounding.j2" 18 | ).open("r") as f: 19 | return jinja2.Template(f.read()) 20 | 21 | 22 | async def feedback_grounding( 23 | instance: MetricTrainingInstance, 24 | client: AsyncAzureOpenAI, 25 | ) -> list[Aspect]: 26 | settings = AutoLibraEvalSettings() 27 | 28 | template = _load_feedback_grounding_template() 29 | 30 | prompt = template.render( 31 | instance=dict( 32 | trajectory=render_training_instance(instance), feedback=instance.feedback 33 | ) 34 | ) 35 | 36 | model = settings.azure_openai_4o_model 37 | assert model 38 | 39 | wait_time = 1 40 | while True: 41 | try: 42 | completion = await client.beta.chat.completions.parse( 43 | model=model, 44 | messages=[ 45 | { 46 | "role": "system", 47 | "content": "Ground the feedback in the behavior.", 48 | }, 49 | {"role": "user", "content": prompt}, 50 | ], 51 | response_format=FeedbackGroundingOutput, 52 | ) 53 | break 54 | except RateLimitError as e: 55 | print(f"Rate limit error: {e}") 56 | await asyncio.sleep(wait_time) 57 | wait_time *= 2 58 | 59 | if not completion.choices[0].message.parsed: 60 | raise ValueError("Failed to parse the response.") 61 | else: 62 | return completion.choices[0].message.parsed.bullet_points 63 | -------------------------------------------------------------------------------- /packages/osw-data/tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 2 | from datetime import datetime 3 | import numpy as np 4 | from pathlib import Path 5 | 6 | 7 | def test_dataset() -> None: 8 | Path("/tmp/data/robot_dataset").mkdir(parents=True, exist_ok=True) 9 | 10 | # Create a new dataset 11 | dataset = MultiAgentDataset( 12 | name="Robot Interaction Dataset", 13 | base_path=Path("/tmp/data/robot_dataset"), 14 | description="Multi-agent robot interaction scenarios", 15 | version="1.0", 16 | ) 17 | 18 | # Define agents for an instance 19 | agents_metadata = { 20 | "robot_1": AgentMetadata( 21 | agent_id="robot_1", 22 | agent_type="manipulator", 23 | capabilities=["grasp", "move"], 24 | parameters={"max_speed": 1.0}, 25 | ), 26 | "robot_2": AgentMetadata( 27 | agent_id="robot_2", 28 | agent_type="mobile_base", 29 | capabilities=["navigate"], 30 | parameters={"max_velocity": 0.5}, 31 | ), 32 | } 33 | 34 | # Create an instance 35 | instance_id = dataset.create_instance( 36 | agents_metadata=agents_metadata, 37 | instance_metadata={"scenario": "collaborative_assembly"}, 38 | ) 39 | 40 | # Add data points for each agent 41 | timestamp = datetime.now() 42 | 43 | # Add observation for robot_1 44 | image_data = np.random.rand(480, 640, 3) 45 | dataset.add_data_point( 46 | instance_id=instance_id, 47 | agent_id="robot_1", 48 | timestamp=timestamp, 49 | point_type=PointType.OBSERVATION, 50 | data=image_data, 51 | media_type=MediaType.IMAGE, 52 | metadata={"camera_id": "cam_1"}, 53 | ) 54 | 55 | # Add action for robot_2 56 | action_data = {"command": "move_to", "position": [1.0, 2.0, 0.0]} 57 | dataset.add_data_point( 58 | instance_id=instance_id, 59 | agent_id="robot_2", 60 | timestamp=timestamp, 61 | point_type=PointType.ACTION, 62 | data=action_data, 63 | media_type=MediaType.JSON, 64 | ) 65 | 66 | # Close the dataset 67 | dataset.close() 68 | -------------------------------------------------------------------------------- /packages/osw-data/tests/test_annotation.py: 
-------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from osw_data.annotation import AnnotationSpan, AnnotationSystem 3 | from pathlib import Path 4 | 5 | 6 | def test_annotation_system() -> None: 7 | # Initialize annotation system 8 | annotation_system = AnnotationSystem( 9 | base_path=Path("/tmp/data/annotations"), 10 | project_name="Robot Behavior Analysis", 11 | description="Annotating robot behaviors and interactions", 12 | annotation_schema={ 13 | "behavior_type": ["cooperative", "competitive", "neutral"], 14 | "success_rating": {"type": "float", "min": 0, "max": 1}, 15 | "comments": "string", 16 | }, 17 | ) 18 | 19 | # Add annotators 20 | annotation_system.add_annotator( 21 | annotator_id="expert1", 22 | name="Dr. Smith", 23 | role="robotics_expert", 24 | expertise_level="expert", 25 | ) 26 | 27 | annotation_system.add_annotator( 28 | annotator_id="expert2", 29 | name="Dr. Jones", 30 | role="hri_researcher", 31 | expertise_level="expert", 32 | ) 33 | 34 | # Add annotations 35 | instance_id = "instance_001" 36 | agent_id = "robot_1" 37 | 38 | # Expert 1's annotation 39 | annotation_system.add_annotation( 40 | instance_id=instance_id, 41 | agent_id=agent_id, 42 | annotator_id="expert1", 43 | content={ 44 | "behavior_type": "cooperative", 45 | "success_rating": 0.85, 46 | "comments": "Robot showed good adaptation to human partner", 47 | }, 48 | span=AnnotationSpan(start_time=datetime.now(), end_time=datetime.now()), 49 | confidence=0.9, 50 | ) 51 | 52 | # Expert 2's annotation 53 | annotation_system.add_annotation( 54 | instance_id=instance_id, 55 | agent_id=agent_id, 56 | annotator_id="expert2", 57 | content={ 58 | "behavior_type": "cooperative", 59 | "success_rating": 0.78, 60 | "comments": "Good cooperation but some delays in responses", 61 | }, 62 | span=AnnotationSpan(start_time=datetime.now(), end_time=datetime.now()), 63 | confidence=0.85, 64 | ) 65 | -------------------------------------------------------------------------------- /src/training/llm_as_a_judge.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from openai import AsyncAzureOpenAI 3 | from osw_data import MultiAgentDataset 4 | from osw_data.metrics import MetricSet 5 | from autolibra_core import ( 6 | run_llm_eval, 7 | ) 8 | from autolibra_core.data import MetricTrainingInstance 9 | from autolibra_core.configs import AutoLibraEvalSettings 10 | 11 | 12 | async def main(dataset_name: str, metric_path: str) -> None: 13 | dataset = MultiAgentDataset( 14 | name="dataset", 15 | base_path=f".data/{dataset_name}", 16 | ) 17 | 18 | metric_set = MetricSet( 19 | name="", 20 | base_path=metric_path, 21 | induced_from=dataset_name, 22 | ) 23 | 24 | settings = AutoLibraEvalSettings() 25 | 26 | client = AsyncAzureOpenAI( 27 | api_key=settings.azure_api_key, 28 | api_version="2024-12-01-preview", 29 | azure_endpoint=settings.azure_endpoint, 30 | ) 31 | 32 | metric_training_instances: list[MetricTrainingInstance] = [] 33 | 34 | for instances in dataset.list_instances(): 35 | instance = dataset.get_instance_metadata(instances) 36 | for agent_id in instance.agents: 37 | metric_training_instances.append( 38 | MetricTrainingInstance( 39 | task=instance.metadata["task"] 40 | if "task" in instance.metadata 41 | else "Task is described in the trajectory observation", 42 | agent_id=agent_id, 43 | trajectory=dataset.get_trajectory(instances, agent_id), 44 | feedback="", 45 | ) 46 | ) 47 | 48 | eval_results = await 
run_llm_eval( 49 | metric_training_instances, list(metric_set.metrics.values()), client=client 50 | ) 51 | 52 | with open("llm_eval_results.jsonl", "w") as f: 53 | for eval_result in eval_results: 54 | f.write(eval_result.model_dump_json()) 55 | f.write("\n") 56 | 57 | 58 | if __name__ == "__main__": 59 | import argparse 60 | 61 | parser = argparse.ArgumentParser(description="LLM-as-a-judge evaluation") 62 | parser.add_argument( 63 | "--filename", 64 | type=str, 65 | required=True, 66 | help="The name of the folder containing the data for the given run, including the date subfolder", 67 | ) 68 | 69 | filename = parser.parse_args().filename 70 | filename_no_date = filename.split("/")[0] 71 | 72 | asyncio.run( 73 | main( 74 | dataset_name=filename_no_date, 75 | metric_path=f".data/metrics/{filename}", 76 | ), 77 | ) 78 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/coverage_evaluation_v2.j2: -------------------------------------------------------------------------------- 1 | You are an expert in matching quantitative metrics to qualitative feedback. You are asked to evaluate the coverage of the metrics on the feedback. 2 | 3 | Context: 4 | Review the following metrics and agent behaviors. Match the corresponding bullet points to the metrics. 5 | 6 | Positive metrics: 7 | {% for metric in positive_metrics%} 8 | 9 | Metric {{ loop.index }} 10 | {{ metric.name}}: {{ metric.explanation }} 11 | 12 | {% endfor %} 13 | 14 | Negative metrics: 15 | 16 | {% for metric in negative_metrics%} 17 | 18 | Metric {{ loop.index }} 19 | {{ metric.name}}: {{ metric.explanation }} 20 | 21 | {% endfor %} 22 | 23 | Behaviors and feedback bullet points: 24 | 25 | {% for behavior_feedback in behavior_feedback_list %} 26 | 27 | Bullet point: {{ loop.index }} 28 | Behavior: {{ behavior_feedback.behavior }} 29 | Feedback: {{ behavior_feedback.feedback }} 30 | 31 | {% endfor %} 32 | 33 | 34 | Instructions: 35 | 1. Please match at most ONE bullet point to each METRIC. To perform a match, the bullet point must EXACTLY match the meaning of the metric. 36 | 2. Do NOT match the same bullet point to multiple metrics. 37 | 3. Here are examples of VALID matches: 38 | - Negative metrics 39 | - UI Element or Page Existence Correctness 40 | - This metric measures whether the agent selects valid user-interface elements or navigates to valid pages. Good examples show the agent accurately identifying existing UI elements or correct URLs. Bad examples show attempts to click or load elements/pages that do not exist. 41 | - Behavior 42 | - The agent tried to click an element that is not visible after scrolling 43 | - Positive metrics 44 | - Location Query Correctness 45 | - This metric measures whether the agent correctly identifies the location of a query. Good examples show the agent accurately identifying the location of a query. Bad examples show the agent incorrectly identifying the location of a query. 46 | - Behavior 47 | - The agent input the correct location in the search bar 48 | 4. If there are no valid matches, please EXCLUDE the metric from the list. 49 | 5. Please provide a 1-sentence RATIONALE for your decision for any matches. 50 | 6. Please respond with a list of each metric and the item it matches. 51 | 7. Note that the bullet point IDs should be smaller than {{ behavior_feedback_list | length }}. 52 | 8. Note that for positive metrics (`is_positive: True`), the metric id should be smaller than {{ positive_metrics | length }}. 53 | 9.
Note that for negative metrics (`is_positive: False`), the metric id should be smaller than {{ negative_metrics | length }}. 54 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/evaluators/llm_evaluator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from importlib import resources 3 | import jinja2 4 | from openai import AsyncAzureOpenAI, RateLimitError 5 | from osw_data.metrics import Metric 6 | from autolibra_core.configs import AutoLibraEvalSettings 7 | from ..data import MetricTrainingInstance 8 | from ..utils import render_training_instance 9 | from pydantic import BaseModel, ValidationError, create_model, Field 10 | from typing import Literal 11 | 12 | 13 | def _make_snake_case(name: str) -> str: 14 | return name.lower().replace(" ", "_") 15 | 16 | 17 | def _make_evaluation_result_class(metrics: list[Metric]) -> type[BaseModel]: 18 | eval_result = create_model( # type: ignore[call-overload] 19 | "EvaluationResult", 20 | **{ 21 | _make_snake_case(metric.name) + "_reasoning": ( 22 | str, 23 | Field(description=metric.explanation, alias=metric.name + " Reasoning"), 24 | ) 25 | for metric in metrics 26 | }, 27 | **{ 28 | _make_snake_case(metric.name): ( 29 | Literal[-1, 0, 1], 30 | Field(description=metric.explanation, alias=metric.name), 31 | ) 32 | for metric in metrics 33 | }, 34 | ) 35 | return eval_result # type: ignore[no-any-return] 36 | 37 | 38 | def _load_llm_eval_template() -> jinja2.Template: 39 | with resources.files("autolibra_core.templates").joinpath( 40 | "llm_as_a_judge_evaluator_v3.j2" 41 | ).open("r") as f: 42 | return jinja2.Template(f.read()) 43 | 44 | 45 | semaphore = asyncio.Semaphore(20)  # Limit to 20 concurrent tasks 46 | 47 | 48 | async def eval_instance( 49 | instance: MetricTrainingInstance, metrics: list[Metric], client: AsyncAzureOpenAI 50 | ) -> BaseModel: 51 | settings = AutoLibraEvalSettings() 52 | template = _load_llm_eval_template() 53 | 54 | prompt = template.render( 55 | trajectory=render_training_instance(instance), 56 | metrics=metrics, 57 | ) 58 | 59 | model = settings.azure_openai_o3_model 60 | assert model 61 | 62 | async with semaphore: 63 | wait_time = 1 64 | while True: 65 | try: 66 | completion = await client.beta.chat.completions.parse( 67 | model=model, 68 | messages=[ 69 | {"role": "system", "content": "Evaluate the trajectory."}, 70 | {"role": "user", "content": prompt}, 71 | ], 72 | response_format=_make_evaluation_result_class(metrics), 73 | reasoning_effort="high", 74 | ) 75 | 76 | if not completion.choices[0].message.parsed: 77 | print("Failed to parse the response. 
Retrying.") 78 | await asyncio.sleep(wait_time) 79 | continue 80 | break 81 | except (ValidationError, RateLimitError) as e: 82 | print(e) 83 | await asyncio.sleep(wait_time) 84 | wait_time *= 2 85 | 86 | if not completion.choices[0].message.parsed: 87 | raise ValueError("Failed to parse the response.") 88 | else: 89 | return completion.choices[0].message.parsed 90 | 91 | 92 | async def run_llm_eval( 93 | instances: list[MetricTrainingInstance], 94 | metrics: list[Metric], 95 | client: AsyncAzureOpenAI, 96 | ) -> list[BaseModel]: 97 | eval_results = await asyncio.gather( 98 | *[eval_instance(instance, metrics, client) for instance in instances] 99 | ) 100 | 101 | return eval_results 102 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/metrics.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | from typing import Annotated 4 | from pydantic import AfterValidator, BaseModel, Field 5 | 6 | 7 | class MetricSetMetadata(BaseModel): 8 | created_at: datetime = Field(default_factory=datetime.now) 9 | name: str 10 | metric_names: list[str] = Field(default_factory=list) 11 | induced_from: str | None = Field(default_factory=lambda: None) 12 | version: str | None = Field(default_factory=lambda: None) 13 | 14 | 15 | class Metric(BaseModel): 16 | good_behaviors: list[str] = Field(default_factory=list) 17 | bad_behaviors: list[str] = Field(default_factory=list) 18 | explanation: str 19 | name: Annotated[str, AfterValidator(lambda x: x.replace("/", "_"))] 20 | 21 | 22 | class MetricSet: 23 | """ 24 | A set of metrics for evaluating trajectories 25 | """ 26 | 27 | def __init__( 28 | self, 29 | name: str, 30 | base_path: Path | str, 31 | induced_from: str, 32 | version: str | None = None, 33 | ): 34 | self.base_path = Path(base_path) 35 | self.metrics_path = self.base_path / "metrics" 36 | self.metadata_path = self.base_path / "metadata.json" 37 | 38 | # Initialize directory structure 39 | self.base_path.mkdir(parents=True, exist_ok=True) 40 | self.metrics_path.mkdir(exist_ok=True) 41 | self.metrics: dict[str, Metric] = {} 42 | 43 | # Initialize or load dataset metadata 44 | self.metadata = self._init_metadata(name, induced_from, version) 45 | self.load_metrics() 46 | 47 | def _init_metadata( 48 | self, name: str, induced_from: str, version: str | None 49 | ) -> MetricSetMetadata: 50 | """Initialize or load dataset metadata""" 51 | if self.metadata_path.exists(): 52 | with open(self.metadata_path, "r") as f: 53 | return MetricSetMetadata.model_validate_json(f.read()) 54 | else: 55 | metadata = MetricSetMetadata( 56 | name=name, induced_from=induced_from, version=version 57 | ) 58 | self._save_metadata(metadata) 59 | return metadata 60 | 61 | def _save_metadata(self, metadata: MetricSetMetadata) -> None: 62 | with open(self.metadata_path, "w") as f: 63 | f.write(metadata.model_dump_json(indent=2)) 64 | 65 | def _save_metrics( 66 | self, 67 | ) -> None: 68 | for name, metric in self.metrics.items(): 69 | metric_path = self.metrics_path / f"{name}.json" 70 | with open(metric_path, "w") as f: 71 | f.write(metric.model_dump_json(indent=2)) 72 | 73 | def load_metrics(self) -> None: 74 | for metric in self.metadata.metric_names: 75 | metric_path = self.metrics_path / f"{metric}.json" 76 | with open(metric_path, "r") as f: 77 | self.metrics[metric] = Metric.model_validate_json(f.read()) 78 | 79 | def add_metrics(self, metrics: list[Metric]) -> None: 80 
| for metric in metrics: 81 | if metric.name in self.metrics: 82 | raise ValueError(f"Metric with name {metric.name} already exists") 83 | self.metrics[metric.name] = metric 84 | metric_path = self.metrics_path / f"{metric.name}.json" 85 | with open(metric_path, "w") as f: 86 | f.write(metric.model_dump_json(indent=2)) 87 | 88 | self.metadata.metric_names = list(self.metrics.keys()) 89 | self._save_metadata(self.metadata) 90 | 91 | def get_metric(self, name: str) -> Metric: 92 | if name not in self.metrics: 93 | raise ValueError(f"Metric with name {name} does not exist") 94 | metric_path = self.metrics_path / f"{name}.json" 95 | with open(metric_path, "r") as f: 96 | return Metric.model_validate_json(f.read()) 97 | -------------------------------------------------------------------------------- /src/training/grounding.py: -------------------------------------------------------------------------------- 1 | # Iterative Metric Creation 2 | # Input: instances, trajectories, agents, and feedbacks 3 | # Output: metrics 4 | # Algorithm: 5 | # metrics = propose_metrics(train_trajectories, train_feedbacks) 6 | # while coverage improves 7 | # eval_results = llm_evaluator(train_trajectories, metrics) 8 | # uncovered_feedbacks, coverage = missing_points_detection(train_trajectories, eval_results) 9 | # new_metrics = propose_metrics(train_trajectories, uncovered_feedbacks) 10 | # metrics += new_metrics 11 | 12 | import asyncio 13 | from datetime import datetime 14 | from openai import AsyncAzureOpenAI 15 | from osw_data import MultiAgentDataset 16 | from osw_data.annotation import AnnotationSystem 17 | from osw_data.metrics import MetricSet 18 | from autolibra_core import ( 19 | MetricTrainingInstance, 20 | feedback_grounding, 21 | behavior_clustering, 22 | ) 23 | from autolibra_core.configs import AutoLibraEvalSettings 24 | 25 | 26 | async def main(dataset_name: str) -> None: 27 | settings = AutoLibraEvalSettings() 28 | 29 | client = AsyncAzureOpenAI( 30 | api_key=settings.azure_api_key, 31 | api_version="2024-12-01-preview", 32 | azure_endpoint=settings.azure_endpoint, 33 | ) 34 | 35 | dataset = MultiAgentDataset( 36 | name="dataset", 37 | base_path=f".data/{dataset_name}", 38 | ) 39 | 40 | annotation_system = AnnotationSystem( 41 | base_path=f".data/annotations/{dataset_name}", 42 | ) 43 | 44 | metric_training_instances: list[MetricTrainingInstance] = [] 45 | 46 | for instances in dataset.list_instances(): 47 | instance = dataset.get_instance_metadata(instances) 48 | for agent_id in instance.agents: 49 | trajectory_annotations = annotation_system.get_trajectory_annotations( 50 | instance_id=instances, agent_id=agent_id 51 | ) 52 | for annotation in trajectory_annotations.annotations: 53 | metric_training_instances.append( 54 | MetricTrainingInstance( 55 | task=instance.metadata["task"] 56 | if "task" in instance.metadata 57 | else "Task is described in the trajectory observation", 58 | agent_id=agent_id, 59 | trajectory=dataset.get_trajectory(instances, agent_id), 60 | feedback=annotation.content["feedback"], 61 | ) 62 | ) 63 | 64 | feedback_grounding_results = await asyncio.gather( 65 | *[ 66 | feedback_grounding(instance, client=client) 67 | for instance in metric_training_instances 68 | ] 69 | ) 70 | 71 | with open("feedback_grounding_results.jsonl", "w") as f: 72 | for feedback_grounding_result in feedback_grounding_results: 73 | for aspect in feedback_grounding_result: 74 | f.write(aspect.model_dump_json(indent=2)) 75 | f.write("\n") 76 | f.write("\n") 77 | 78 | aspects = sum( 79 | [ 80 | 
feedback_grounding_result 81 | for feedback_grounding_result in feedback_grounding_results 82 | ], 83 | [], 84 | ) 85 | 86 | behavior_clustering_results = await behavior_clustering( 87 | aspects=aspects, client=client 88 | ) 89 | 90 | metric_set = MetricSet( 91 | name="Derived Metrics", 92 | base_path=f".data/metrics/{dataset_name}/{datetime.now().strftime('%m_%d_%H_%M')}", 93 | induced_from=dataset_name, 94 | version="0.1", 95 | ) 96 | 97 | metric_set.add_metrics(behavior_clustering_results.metrics) 98 | 99 | 100 | if __name__ == "__main__": 101 | import argparse 102 | 103 | parser = argparse.ArgumentParser(description="Balrog Converter") 104 | parser.add_argument( 105 | "--filename", 106 | type=str, 107 | required=True, 108 | help="The name of the folder containing the data for the given run", 109 | ) 110 | 111 | filename = parser.parse_args().filename 112 | 113 | asyncio.run(main(filename)) 114 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | 173 | .data 174 | .vscode 175 | -------------------------------------------------------------------------------- /src/training/llm_eval.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from openai import AsyncAzureOpenAI 3 | from osw_data import MultiAgentDataset 4 | from osw_data.annotation import AnnotationSystem 5 | from osw_data.metrics import MetricSet 6 | from autolibra_core import ( 7 | run_llm_eval, 8 | ) 9 | from autolibra_core.data import MetricTrainingInstance 10 | from autolibra_core.configs import AutoLibraEvalSettings 11 | from autolibra_core.data.primitives import Trait 12 | from autolibra_core.evaluators.coverage_evaluator import run_coverage_eval 13 | from autolibra_core.evaluators.llm_evaluator import _make_snake_case 14 | 15 | 16 | async def main(dataset_name: str, metric_path: str) -> None: 17 | dataset = MultiAgentDataset( 18 | name="dataset", 19 | base_path=f".data/{dataset_name}", 20 | ) 21 | 22 | annotation_system = AnnotationSystem( 23 | base_path=f".data/annotations/{dataset_name}", 24 | ) 25 | 26 | metric_set = MetricSet( 27 | name="", 28 | base_path=metric_path, 29 | induced_from=dataset_name, 30 | ) 31 | 32 | settings = AutoLibraEvalSettings() 33 | 34 | client = AsyncAzureOpenAI( 35 | api_key=settings.azure_api_key, 36 | api_version="2024-12-01-preview", 37 | azure_endpoint=settings.azure_endpoint, 38 | ) 39 | 40 | metric_training_instances: list[MetricTrainingInstance] = [] 41 | 42 | for instances in dataset.list_instances(): 43 | instance = dataset.get_instance_metadata(instances) 44 | for agent_id in instance.agents: 45 | trajectory_annotations = annotation_system.get_trajectory_annotations( 46 | instance_id=instances, agent_id=agent_id 47 | ) 48 | for annotation in trajectory_annotations.annotations: 49 | metric_training_instances.append( 50 | MetricTrainingInstance( 51 | task=instance.metadata["task"] 52 | if "task" in instance.metadata 53 | else "Task is described in the trajectory observation", 54 | agent_id=agent_id, 55 | trajectory=dataset.get_trajectory(instances, agent_id), 56 | feedback=str(annotation.content), 57 | ) 58 | ) 59 | 60 | eval_results = await run_llm_eval( 61 | metric_training_instances, list(metric_set.metrics.values()), client=client 62 | ) 63 | 64 | eval_scoring = [ 65 | [ 66 | int(getattr(eval_result, _make_snake_case(metric.name), 0)) 67 | for metric in metric_set.metrics.values() 68 | ] 69 | for eval_result in eval_results 70 | ] 71 | 72 | with open("llm_eval_results.jsonl", "w") as f: 73 | for eval_result in eval_results: 74 | f.write(eval_result.model_dump_json()) 75 | f.write("\n") 76 | 77 | traits = [ 78 | [ 79 | Trait( 80 | metric=metric, 81 | rating=score, 82 | ) 83 | for metric, score in zip( 84 | metric_set.metrics.values(), eval_scoring_for_instance 85 | ) 86 | ] 87 | for eval_scoring_for_instance in eval_scoring 88 | ] 89 | 90 | coverage_results = await run_coverage_eval( 91 | instance_traits=traits, 92 | instances=metric_training_instances, 93 | client=client, 94 | ) 95 | 96 | covered, total = 0, 0 97 | redundant, total_traits = 0, 0 98 | 99 | for coverage_result in coverage_results: 100 | covered += coverage_result[0] 101 | total += coverage_result[1] 102 | redundant += coverage_result[2] 103 | total_traits += coverage_result[3] 104 | 105 | print(f"Coverage: {covered}/{total}") 106 | print(f"Redundancy: {redundant}/{total_traits}") 107 | 108 | 109 | if __name__ == "__main__": 110 | import 
argparse 111 | 112 | parser = argparse.ArgumentParser(description="Balrog Converter") 113 | parser.add_argument( 114 | "--filename", 115 | type=str, 116 | required=True, 117 | help="The name of the folder containing the data for the given run, including the date subfolder", 118 | ) 119 | 120 | filename = parser.parse_args().filename 121 | filename_no_date = filename.split("/")[0] 122 | 123 | asyncio.run( 124 | main( 125 | dataset_name=filename_no_date, 126 | metric_path=f".data/metrics/{filename}", 127 | ), 128 | ) 129 | -------------------------------------------------------------------------------- /packages/autolibra-core/tests/test_coverage.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from openai import AsyncAzureOpenAI 3 | from osw_data.metrics import Metric 4 | from autolibra_core.configs import AutoLibraEvalSettings 5 | from autolibra_core.evaluators.coverage_evaluator import ( 6 | match_aspects_and_traits, 7 | ) 8 | from autolibra_core.data import Aspect 9 | import pytest 10 | 11 | """ 12 | {"feedback":"\"The agent initially tried to find the contributors through the Commit history or Graph tab which are reasonable guesses, but the Contributor tab is the most straightforward choice.","behavior":"\"The agent went step by step through different sections (Commits, Graph) before navigating to the Contributors tab.\"","is_positive":true} 13 | {"feedback":"\"Anyway, the agent made the right choice in the end which is good although wasting a few more steps.\"","behavior":"\"The agent ultimately navigated to the Contributors tab and successfully identified the top contributor.\"","is_positive":true} 14 | """ 15 | 16 | """ 17 | {"good_behaviors":["Behavior #3: The agent typed 'white desk' into the correct search box (ID 172) and pressed Search.","Behavior #4: The agent clicked the 'Add to Wish List' button (ID 5919) for the correct item.","Behavior #39: The agent typed 'white desk' into the correct search bar (ID 1585).","Behavior #40: The agent clicked the 'Add to Wish List' button (ID 6684) for the correct product."],"bad_behaviors":["Behavior #1: The agent clicked on a nonexistent element (ID 1488).","Behavior #6: The agent tried typing into the wrong text areas (IDs 2169 and 3421) instead of the actual textbox.","Behavior #8: The agent clicked on a nonexistent element (ID 1605)."],"explanation":"Measures whether the agent targets valid page elements for its actions. 
Good behaviors involve using the correct element ID or selector for the intended action, whereas bad behaviors show the agent clicking or typing into the wrong or nonexistent elements.","name":"Element Interaction Correctness"} 18 | {"good_behaviors":["Behavior #20: The agent found the website URL of the Carnegie Museum of Art successfully.","Behavior #22: The agent identified the correct zip code (06516) confirming the Yale University location.","Behavior #24: The agent began searching for the Carnegie Museum of Art in Pittsburgh and proceeded correctly.","Behavior #25: The final action where the agent indeed outputs the museum’s website at the end of the trajectory.","Behavior #37: The agent identified the top contributor’s email address accurately."],"bad_behaviors":["Behavior #14: The agent mistakenly reported the rating of the wrong product (iPhone Cable) instead of 'Lightning to 3.5mm Adapter.'","Behavior #15: The agent did not succeed in locating the intended 'Canon Photo Printer' listing."],"explanation":"Covers how precisely the agent retrieves or pinpoints the correct item or piece of data requested. Positive instances yield exactly the needed information, whereas negative ones incorrectly match or fail to locate the target.","name":"Information Discovery Accuracy"} 19 | """ 20 | 21 | _aspects = [ 22 | Aspect( 23 | feedback="The agent initially tried to find the contributors through the Commit history or Graph tab which are reasonable guesses, but the Contributor tab is the most straightforward choice.", 24 | behavior="The agent went step by step through different sections (Commits, Graph) before navigating to the Contributors tab.", 25 | is_positive=True, 26 | ), 27 | Aspect( 28 | feedback="Anyway, the agent made the right choice in the end which is good although wasting a few more steps.", 29 | behavior="The agent ultimately navigated to the Contributors tab and successfully identified the top contributor.", 30 | is_positive=True, 31 | ), 32 | ] 33 | 34 | _traits = [ 35 | Metric( 36 | name="Element Interaction Correctness", 37 | explanation="Measures whether the agent targets valid page elements for its actions. Good behaviors involve using the correct element ID or selector for the intended action, whereas bad behaviors show the agent clicking or typing into the wrong or nonexistent elements.", 38 | ), 39 | Metric( 40 | name="Information Discovery Accuracy", 41 | explanation="Covers how precisely the agent retrieves or pinpoints the correct item or piece of data requested. 
Positive instances yield exactly the needed information, whereas negative ones incorrectly match or fail to locate the target.", 42 | ), 43 | ] 44 | 45 | 46 | @pytest.mark.asyncio 47 | async def test_match_aspects_and_traits() -> None: 48 | settings = AutoLibraEvalSettings() 49 | 50 | (_aspects, _traits) = pickle.load( 51 | open( 52 | "/Users/hao/autolibra-eval/packages/autolibra-core/tests/positive_aspects_traits.pkl", 53 | "rb", 54 | ) 55 | ) 56 | 57 | client = AsyncAzureOpenAI( 58 | api_key=settings.azure_api_key, 59 | api_version="2024-10-21", 60 | azure_endpoint=settings.azure_endpoint, 61 | ) 62 | 63 | _ = await match_aspects_and_traits( 64 | client=client, 65 | aspects=_aspects, 66 | traits=_traits, 67 | ) 68 | -------------------------------------------------------------------------------- /src/plot/meta-eval.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from matplotlib.gridspec import GridSpec 4 | 5 | # Example data for all four datasets 6 | # You should replace these with your actual data for each dataset 7 | cogym_data = [ 8 | [0.47, 0.50, 2], 9 | [0.59, 0.63, 3], 10 | [0.76, 0.83, 4], 11 | [0.74, 0.80, 4], 12 | [0.70, 0.67, 4], 13 | [0.77, 0.85, 5], 14 | [0.74, 0.77, 6], 15 | [0.75, 0.83, 7], 16 | [0.72, 0.85, 8], 17 | [0.73, 0.85, 9], 18 | [0.70, 0.87, 10], 19 | [0.68, 0.88, 11], 20 | [0.70, 0.88, 12], 21 | ] 22 | 23 | # Example data for the other datasets - replace with your actual data 24 | sotopia_data = [ 25 | [0.31, 0.40, 2], 26 | [0.41, 0.62, 3], 27 | [0.47, 0.66, 4], 28 | [0.50, 0.70, 5], 29 | [0.57, 0.79, 6], 30 | [0.57, 0.80, 7], 31 | [0.58, 0.81, 8], 32 | [0.60, 0.85, 9], 33 | [0.60, 0.92, 10], 34 | [0.58, 0.91, 11], 35 | [0.54, 0.86, 12], 36 | ] 37 | 38 | webarena_data = [ 39 | [0.45, 0.48, 2], 40 | [0.58, 0.60, 3], 41 | [0.67, 0.72, 4], 42 | [0.85, 0.66, 5], 43 | [0.90, 0.70, 6], 44 | [0.93, 0.74, 6], 45 | [0.91, 0.73, 7], 46 | [0.88, 0.77, 8], 47 | [0.87, 0.75, 9], 48 | [0.84, 0.72, 10], 49 | [0.89, 0.80, 11], 50 | [0.86, 0.82, 12], 51 | ] 52 | 53 | webvoyager_data = [ 54 | [0.32, 0.30, 2], 55 | [0.51, 0.47, 3], 56 | [0.60, 0.50, 4], 57 | [0.76, 0.78, 5], 58 | [0.88, 0.72, 6], 59 | [0.91, 0.72, 7], 60 | [0.92, 0.77, 8], 61 | [0.91, 0.78, 9], 62 | [0.92, 0.80, 10], 63 | [0.93, 0.83, 11], 64 | [0.93, 0.85, 12], 65 | ] 66 | 67 | # Convert to numpy arrays 68 | datasets = { 69 | "CoGym": np.array(cogym_data), 70 | "Sotopia": np.array(sotopia_data), 71 | "WebArena": np.array(webarena_data), 72 | "WebVoyager": np.array(webvoyager_data), 73 | } 74 | 75 | # Create the figure and grid 76 | fig = plt.figure(figsize=(6, 5)) 77 | gs = GridSpec(2, 2, figure=fig, wspace=0.3, hspace=0.3) 78 | 79 | # Determine global min and max values for consistent axes across all plots 80 | all_coverage = np.concatenate([d[:, 0] for d in datasets.values()]) 81 | all_redundancy = np.concatenate([d[:, 1] for d in datasets.values()]) 82 | all_n_metrics = np.concatenate([d[:, 2] for d in datasets.values()]) 83 | 84 | min_coverage, max_coverage = np.min(all_coverage) - 0.05, np.max(all_coverage) + 0.05 85 | min_redundancy, max_redundancy = ( 86 | np.min(all_redundancy) - 0.05, 87 | np.max(all_redundancy) + 0.05, 88 | ) 89 | min_n_metrics, max_n_metrics = np.min(all_n_metrics), np.max(all_n_metrics) 90 | 91 | # Create a scatter plot for each dataset 92 | axes = [] 93 | scatters = [] 94 | 95 | stars = { 96 | "CoGym": [0.75, 0.72], 97 | "Sotopia": [0.58, 0.85], 98 | "WebArena": [0.82, 0.8], 99 | 
"WebVoyager": [0.83, 0.76], 100 | } 101 | 102 | square = { 103 | "CoGym": [0.47, 0.84], 104 | "Sotopia": [0.53, 0.79], 105 | "WebArena": [0.75, 0.88], 106 | "WebVoyager": [0.76, 0.91], 107 | } 108 | 109 | positions = [(0, 0), (0, 1), (1, 0), (1, 1)] 110 | for i, (dataset_name, dataset) in enumerate(datasets.items()): 111 | row, col = positions[i] 112 | ax = fig.add_subplot(gs[row, col]) 113 | axes.append(ax) 114 | 115 | coverage = dataset[:, 0] 116 | redundancy = dataset[:, 1] 117 | n_metrics = dataset[:, 2] 118 | 119 | scatter = ax.scatter( 120 | redundancy, 121 | coverage, 122 | c=n_metrics, 123 | cmap="PiYG", 124 | s=100, 125 | alpha=0.8, 126 | edgecolors="k", 127 | vmin=min_n_metrics, 128 | vmax=max_n_metrics, 129 | ) 130 | 131 | star_y, star_x = stars[dataset_name] 132 | ax.plot( 133 | star_x, 134 | star_y, 135 | "*", 136 | color="yellow", 137 | markersize=15, 138 | markeredgecolor="black", 139 | markeredgewidth=1.0, 140 | ) 141 | scatters.append(scatter) 142 | 143 | square_y, square_x = square[dataset_name] 144 | ax.plot( 145 | square_x, 146 | square_y, 147 | "s", 148 | color="blue", 149 | markersize=10, 150 | markeredgecolor="black", 151 | markeredgewidth=1.0, 152 | ) 153 | 154 | # Set consistent axis limits for all plots 155 | ax.set_xlim(min_redundancy, max_redundancy) 156 | ax.set_ylim(min_coverage, max_coverage) 157 | 158 | # Add labels and title, but remove redundancy label from first row 159 | # and coverage label from second column 160 | if row == 1: # Only add x-axis label for bottom row 161 | ax.set_xlabel("Redundancy", fontsize=12) 162 | if col == 0: # Only add y-axis label for left column 163 | ax.set_ylabel("Coverage", fontsize=12) 164 | ax.set_title(dataset_name, fontsize=14) 165 | 166 | # Add grid 167 | ax.grid(True, linestyle="--", alpha=0.7) 168 | 169 | # Adjust layout before adding colorbar 170 | plt.tight_layout(rect=(0, 0, 0.9, 1)) # Make room for the colorbar 171 | 172 | # Add a common colorbar to the figure 173 | cbar_ax = fig.add_axes((0.92, 0.15, 0.02, 0.7)) # [left, bottom, width, height] 174 | cbar = fig.colorbar(scatters[0], cax=cbar_ax) 175 | 176 | # Position N label under the colorbar 177 | cbar.ax.set_xlabel("$N$", labelpad=5) 178 | cbar.ax.xaxis.set_label_position("bottom") 179 | 180 | # Save the figure 181 | plt.savefig("four_datasets_grid.pdf", bbox_inches="tight") 182 | plt.savefig("four_datasets_grid.png", dpi=300, bbox_inches="tight") 183 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/sotopia.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pathlib import Path 3 | 4 | from pydantic import BaseModel, Field 5 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 6 | from .base import BaseConverter, run_converter 7 | 8 | from huggingface_hub import hf_hub_download 9 | 10 | 11 | class TwoAgentEpisodeWithScenarioBackgroundGoals(BaseModel): 12 | episode_id: str = Field() 13 | environment_id: str = Field() 14 | agent_ids: list[str] = Field() 15 | experiment_tag: str = Field() 16 | experiment_model_name_pairs: list[str] = Field() 17 | raw_messages: list[list[tuple[str, str, str]]] = Field() 18 | raw_rewards: list[tuple[float, dict[str, float]] | float] = Field() 19 | raw_rewards_prompt: str = Field() 20 | scenario: str = Field() 21 | codename: str = Field() 22 | agents_background: dict[str, str] = Field() 23 | social_goals: dict[str, str] = Field() 24 | social_interactions: str = 
Field() 25 | reasoning: str = Field() 26 | rewards: list[dict[str, float]] = Field() 27 | 28 | 29 | class SotopiaConverter(BaseConverter): 30 | def __init__(self, output_path: Path, source_path: Path): 31 | super().__init__(output_path, source_path) 32 | 33 | def download_data(self) -> None: 34 | self.source_path.mkdir(parents=True, exist_ok=True) 35 | 36 | # Download trajectory file 37 | if not (self.source_path / "sotopia_episodes_v1_hf.jsonl").exists(): 38 | hf_hub_download( 39 | repo_id="cmu-lti/sotopia", 40 | filename="sotopia_episodes_v1_hf.jsonl", 41 | repo_type="dataset", 42 | local_dir=self.source_path, 43 | ) 44 | 45 | def convert_to_dataset(self) -> None: 46 | """Convert entire Sotopia dataset""" 47 | self.logger.info("Converting Sotopia dataset") 48 | 49 | dataset = MultiAgentDataset( 50 | name="Sotopia Interaction", 51 | base_path=self.output_path, 52 | description="Sotopia dialog interactions", 53 | ) 54 | 55 | with open(self.source_path / "sotopia_episodes_v1.jsonl", "r") as f: 56 | for line in f: 57 | episode = ( 58 | TwoAgentEpisodeWithScenarioBackgroundGoals.model_validate_json(line) 59 | ) 60 | 61 | agent_names = episode.agents_background.keys() 62 | agent_backgrounds = episode.agents_background 63 | models = episode.experiment_model_name_pairs 64 | agents_metadata = {} 65 | for agent_name in agent_names: 66 | agents_metadata[agent_name] = AgentMetadata( 67 | agent_id=agent_name, 68 | agent_type="sotopia_agent", 69 | capabilities=[ 70 | "speak", 71 | "non-verbal communication", 72 | "physical actions", 73 | ], 74 | parameters={"background": agent_backgrounds[agent_name]}, 75 | ) 76 | instance_id = episode.episode_id 77 | 78 | instance_metadata = { 79 | "scenario": episode.scenario, 80 | "experiment_tag": episode.experiment_tag, 81 | "models": models, 82 | "rewards": episode.rewards, 83 | } 84 | if models != ["gpt-4", "gpt-4", "gpt-4"]: 85 | self.logger.info( 86 | f"Skipping instance {instance_id} because of model mismatch" 87 | ) 88 | continue 89 | 90 | if any( 91 | rewards["overall_score"] < 1.6 for rewards in episode.rewards 92 | ): 93 | self.logger.info( 94 | f"Skipping instance {instance_id} because of low reward" 95 | ) 96 | continue 97 | 98 | instance_id = dataset.create_instance( 99 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 100 | ) 101 | 102 | for turn in episode.raw_messages: 103 | for from_agent, to_agent, message in turn: 104 | timestamp = datetime.datetime.now() 105 | action_timestamp = datetime.datetime.now() 106 | if from_agent == "Environment": 107 | dataset.add_data_point( 108 | instance_id=instance_id, 109 | agent_id=to_agent, 110 | point_type=PointType.OBSERVATION, 111 | data={"content": message}, 112 | media_type=MediaType.JSON, 113 | timestamp=timestamp, 114 | ) 115 | elif message != "did nothing": 116 | dataset.add_data_point( 117 | instance_id=instance_id, 118 | agent_id=from_agent, 119 | point_type=PointType.ACTION, 120 | data={"content": message}, 121 | media_type=MediaType.JSON, 122 | timestamp=action_timestamp, 123 | ) 124 | 125 | dataset.close() 126 | 127 | 128 | if __name__ == "__main__": 129 | source_path = Path(".data/raw/sotopia") 130 | output_path = Path(".data/sotopia") 131 | 132 | run_converter(SotopiaConverter, output_path, source_path) 133 | -------------------------------------------------------------------------------- /src/training/iterative.py: -------------------------------------------------------------------------------- 1 | # Iterative Metric Creation 2 | # Input: instances, 
trajectories, agents, and feedbacks 3 | # Output: metrics 4 | # Algorithm: 5 | # metrics = propose_metrics(train_trajectories, train_feedbacks) 6 | # while coverage improves 7 | # eval_results = llm_evaluator(train_trajectories, metrics) 8 | # uncovered_feedbacks, coverage = missing_points_detection(train_trajectories, eval_results) 9 | # new_metrics = propose_metrics(train_trajectories, uncovered_feedbacks) 10 | # metrics += new_metrics 11 | 12 | import asyncio 13 | from datetime import datetime 14 | from openai import AsyncAzureOpenAI 15 | from osw_data import Metric, MultiAgentDataset, MetricSet 16 | from osw_data.annotation import AnnotationSystem 17 | from autolibra_core import run_llm_eval, behavior_clustering, feedback_grounding 18 | from autolibra_core.data import MetricTrainingInstance, Trait 19 | from autolibra_core.configs import AutoLibraEvalSettings 20 | from autolibra_core.evaluators.coverage_evaluator import run_coverage_eval 21 | from autolibra_core.evaluators.llm_evaluator import _make_snake_case 22 | import logfire 23 | 24 | 25 | async def iterative_metric_creation(dataset_name: str) -> list[Metric]: 26 | settings = AutoLibraEvalSettings() 27 | 28 | dataset = MultiAgentDataset( 29 | name="dataset", 30 | base_path=f".data/{dataset_name}", 31 | ) 32 | 33 | annotation_system = AnnotationSystem( 34 | base_path=f".data/annotations/{dataset_name}", 35 | ) 36 | 37 | metric_training_instances: list[MetricTrainingInstance] = [] 38 | 39 | for instances in dataset.list_instances(): 40 | instance = dataset.get_instance_metadata(instances) 41 | for agent_id in instance.agents: 42 | trajectory_annotations = annotation_system.get_trajectory_annotations( 43 | instance_id=instances, agent_id=agent_id 44 | ) 45 | for annotation in trajectory_annotations.annotations: 46 | metric_training_instances.append( 47 | MetricTrainingInstance( 48 | task=instance.metadata["task"] 49 | if "task" in instance.metadata 50 | else "Task is described in the trajectory observation", 51 | agent_id=agent_id, 52 | trajectory=dataset.get_trajectory(instances, agent_id), 53 | feedback=annotation.content["feedback"], 54 | ) 55 | ) 56 | 57 | # initial state of metrics 58 | prev_coverage_rate: float = 0 59 | curr_coverage_rate: float = 0 60 | prev_metrics: list[Metric] = [] 61 | curr_metrics: list[Metric] = [] 62 | 63 | client = AsyncAzureOpenAI( 64 | api_key=settings.azure_api_key, 65 | api_version="2024-12-01-preview", 66 | azure_endpoint=settings.azure_endpoint, 67 | ) 68 | 69 | logfire.instrument_openai(client) 70 | 71 | # initial aspects 72 | feedback_grounding_results = await asyncio.gather( 73 | *[ 74 | feedback_grounding(instance, client) 75 | for instance in metric_training_instances 76 | ] 77 | ) 78 | 79 | aspects = sum( 80 | feedback_grounding_results, 81 | [], 82 | ) 83 | 84 | while curr_coverage_rate >= prev_coverage_rate: 85 | logfire.info(f"Current coverage rate: {curr_coverage_rate}") 86 | logfire.info(f"Previous coverage rate: {prev_coverage_rate}") 87 | prev_metrics = curr_metrics 88 | prev_coverage_rate = curr_coverage_rate 89 | 90 | curr_metrics = ( 91 | prev_metrics + (await behavior_clustering(aspects, client)).metrics 92 | ) 93 | 94 | eval_results = await run_llm_eval( 95 | metric_training_instances, metrics=curr_metrics, client=client 96 | ) 97 | 98 | eval_scoring = [ 99 | [ 100 | int(getattr(eval_result, _make_snake_case(metric.name), 0)) 101 | for metric in curr_metrics 102 | ] 103 | for eval_result in eval_results 104 | ] 105 | 106 | traits = [ 107 | [ 108 | Trait( 109 | metric=metric, 110 
| rating=score, 111 | ) 112 | for metric, score in zip(curr_metrics, eval_scoring_for_instance) 113 | ] 114 | for eval_scoring_for_instance in eval_scoring 115 | ] 116 | 117 | coverage_eval_results = await run_coverage_eval( 118 | instance_traits=traits, 119 | instances=metric_training_instances, 120 | client=client, 121 | ) 122 | 123 | covered_aspects = sum([result[0] for result in coverage_eval_results]) 124 | total_aspects = sum([result[1] for result in coverage_eval_results]) 125 | _covered_traits = sum([result[2] for result in coverage_eval_results]) 126 | _total_traits = sum([result[3] for result in coverage_eval_results]) 127 | uncovered_aspects = sum([result[4] for result in coverage_eval_results], []) 128 | 129 | curr_coverage_rate = covered_aspects / total_aspects 130 | aspects = uncovered_aspects 131 | 132 | return prev_metrics 133 | 134 | 135 | def save_metrics(metrics: list[Metric], path: str) -> None: 136 | metric_set = MetricSet( 137 | name="Metrics derived from webarena dataset", 138 | base_path=path, 139 | induced_from="webarena", 140 | version="0.1", 141 | ) 142 | metric_set.add_metrics(metrics) 143 | 144 | 145 | async def main() -> None: 146 | metrics = await iterative_metric_creation("sotopia") 147 | 148 | metric_set = MetricSet( 149 | name="Metrics derived from sotopia dataset", 150 | base_path=f".data/metrics/sotopia/{datetime.now().strftime('%m_%d_%H_%M')}", 151 | induced_from="sotopia", 152 | version="0.1", 153 | ) 154 | 155 | metric_set.add_metrics(metrics) 156 | 157 | 158 | if __name__ == "__main__": 159 | logfire.configure() 160 | asyncio.run(main()) 161 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/cogym.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pathlib import Path 3 | 4 | from pydantic import BaseModel, Field 5 | from osw_data import ( 6 | MultiAgentDataset, 7 | AgentMetadata, 8 | MediaType, 9 | PointType, 10 | AnnotationSystem, 11 | ) 12 | from .base import BaseConverter, run_converter 13 | 14 | 15 | class HumanEvalFinal(BaseModel): 16 | env_id: str 17 | user_id: str 18 | agent_rating: int 19 | outcome_preference: str 20 | outcome_rating: int 21 | feedback: str 22 | 23 | 24 | class HumanEval(BaseModel): 25 | final: HumanEvalFinal | None = Field(default=None) 26 | 27 | 28 | class Action(BaseModel): 29 | role: str 30 | action: str 31 | action_status: str 32 | timestamp: str 33 | 34 | 35 | class CoGymTrajectory(BaseModel): 36 | trajectory: list[Action] 37 | task: str 38 | human_eval: HumanEval 39 | 40 | 41 | class CoGymConverter(BaseConverter): 42 | def __init__( 43 | self, output_path: Path, source_path: Path, annotation_path: Path | None = None 44 | ) -> None: 45 | super().__init__(output_path, source_path) 46 | self.annotation_path = annotation_path 47 | 48 | def download_data(self) -> None: 49 | trajectory_path = self.source_path / "trajectory" 50 | 51 | if not trajectory_path.exists(): 52 | raise FileNotFoundError(f"Trajectory path {trajectory_path} does not exist") 53 | 54 | def convert_to_dataset(self) -> None: 55 | """Convert entire Sotopia dataset""" 56 | self.logger.info("Converting CoGym dataset") 57 | 58 | dataset = MultiAgentDataset( 59 | name="CoGym Interaction", 60 | base_path=self.output_path, 61 | description="CoGym dialog interactions", 62 | ) 63 | 64 | if self.annotation_path is not None: 65 | annotation_system = AnnotationSystem( 66 | base_path=self.annotation_path, 67 | 
project_name="CoGym Annotations", 68 | annotation_schema={ 69 | "feedback": { 70 | "type": "string", 71 | "description": "Free-form text feedback on the trajectory", 72 | } 73 | }, 74 | ) 75 | 76 | annotator_id = "Original CoGym Annotators" 77 | 78 | if annotator_id not in annotation_system.project.annotators: 79 | annotation_system.add_annotator( 80 | annotator_id=annotator_id, 81 | name=annotator_id, # Using ID as name for simplicity 82 | ) 83 | 84 | trajectory_path = self.source_path / "trajectory" 85 | 86 | for trajectory_file in trajectory_path.glob("*.json"): 87 | trajectory = CoGymTrajectory.model_validate_json( 88 | trajectory_file.read_text() 89 | ) 90 | 91 | roles = list(set([action.role for action in trajectory.trajectory])) 92 | if len(roles) == 1: 93 | self.logger.warning( 94 | f"Skipping trajectory with only one role: {trajectory_file}" 95 | ) 96 | continue 97 | 98 | assert len(roles) == 2, f"Expected 2 roles, got {roles}" 99 | the_other_role = { 100 | roles[0]: roles[1], 101 | roles[1]: roles[0], 102 | } 103 | agent_role: str | None = None 104 | 105 | agents_metadata = {} 106 | for role in roles: 107 | if "user" in role: 108 | agents_metadata[role] = AgentMetadata( 109 | agent_id=role, 110 | agent_type="human", 111 | capabilities=["dialog"], 112 | ) 113 | agent_role = the_other_role[role] 114 | else: 115 | agents_metadata[role] = AgentMetadata( 116 | agent_id=role, 117 | agent_type="agent", 118 | capabilities=["dialog", "code_generation"], 119 | ) 120 | 121 | assert agent_role 122 | 123 | instance_metadata = { 124 | "task": trajectory.task, 125 | } 126 | 127 | instance_id = dataset.create_instance( 128 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 129 | ) 130 | 131 | for action in trajectory.trajectory: 132 | dataset.add_data_point( 133 | instance_id=instance_id, 134 | agent_id=action.role, 135 | point_type=PointType.ACTION, 136 | media_type=MediaType.JSON, 137 | data=action.action, 138 | timestamp=datetime.datetime.fromisoformat(action.timestamp), 139 | ) 140 | dataset.add_data_point( 141 | instance_id=instance_id, 142 | agent_id=the_other_role[action.role], 143 | point_type=PointType.OBSERVATION, 144 | media_type=MediaType.JSON, 145 | data=action.action_status, 146 | timestamp=datetime.datetime.fromisoformat(action.timestamp), 147 | ) 148 | 149 | if ( 150 | self.annotation_path is not None 151 | and trajectory.human_eval.final is not None 152 | ): 153 | annotation_system.add_annotation( 154 | instance_id=instance_id, 155 | agent_id=agent_role, 156 | annotator_id=annotator_id, 157 | content={"feedback": trajectory.human_eval.final.feedback}, 158 | ) 159 | 160 | dataset.close() 161 | 162 | 163 | if __name__ == "__main__": 164 | source_path = Path(".data/raw/cogym") 165 | output_path = Path(".data/cogym") 166 | annotation_path = Path(".data/annotations/cogym") 167 | 168 | run_converter( 169 | CoGymConverter, output_path, source_path, annotation_path=annotation_path 170 | ) 171 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/balrog_babaisai.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from datetime import datetime, timedelta 5 | import csv 6 | import shutil 7 | 8 | # Import our dataset classes 9 | from osw_data import MultiAgentDataset, AgentMetadata, PointType, MediaType 10 | 11 | from .base import BaseConverter, run_converter 12 | from osw_data.utils import 
file_pairs_list 13 | 14 | 15 | class BalrogConverter(BaseConverter): 16 | """Handles downloading and converting Balrog data to our dataset format""" 17 | 18 | def __init__(self, output_path: Path, source_path: Path): 19 | super().__init__(output_path, source_path) 20 | 21 | def download_data(self) -> None: 22 | """Download Balrog dataset files""" 23 | self.source_path.mkdir(parents=True, exist_ok=True) 24 | 25 | # This only exists to satisfy the BaseConverter class 26 | 27 | @staticmethod 28 | def clean_csv_file(file_path: Path) -> None: 29 | """Remove NUL characters from a CSV file.""" 30 | with open(file_path, "rb") as f: 31 | content = f.read() 32 | 33 | # Remove NUL characters 34 | content = content.replace(b"\x00", b"") 35 | 36 | with open(file_path, "wb") as f: 37 | f.write(content) 38 | 39 | def convert_to_dataset(self) -> None: 40 | """Convert Balrog data to autolibra dataset format""" 41 | self.logger.info("Creating Balrog dataset...") 42 | 43 | ref_time = datetime.now() # Used for step_id 44 | 45 | # Obtain task from folder name 46 | task = self.source_path.name.split("_")[0].split("-")[-1] 47 | task = task[0].upper() + task[1:] 48 | 49 | # Initialize dataset 50 | dataset = MultiAgentDataset( 51 | name=f"{task}-Balrog", 52 | base_path=self.output_path, 53 | description=f"{task} trajectories from Balrog dataset", 54 | ) 55 | 56 | # Get list of all directories within self.source_path 57 | subtasks: list[str] = [ 58 | f.name for f in os.scandir(self.source_path) if f.is_dir() 59 | ] 60 | 61 | # Read trajectories (for given task type, exists n subdirs for task, each subdir has a trajectory file) 62 | 63 | # Iterate over folders in task_dir 64 | for subtask in subtasks: 65 | subtask_dir = self.source_path / subtask 66 | 67 | fpl = file_pairs_list(subtask_dir) 68 | 69 | for traj_file, jf in fpl: 70 | # Get pair of files in subtask_dir 71 | episode_number = str(str(jf).split("_")[-1].split(".")[0]) 72 | # Clean the CSV file before processing 73 | csv_path = subtask_dir / f"{subtask}_run_{episode_number}.csv" 74 | self.clean_csv_file(csv_path) # Call the clean_csv_file method 75 | # Load json file 76 | json_file = json.load(open(jf)) 77 | 78 | prompt_data = json_file["prompt"] 79 | 80 | # Create agent metadata (this does not change within a subtask) 81 | agents_metadata = { 82 | "agent": AgentMetadata( 83 | agent_id="agent", 84 | agent_type="game_agent", 85 | capabilities=["navigation", "interaction"], 86 | ) 87 | } 88 | 89 | # Create instance metadata (this does not change within a subtask) 90 | instance_metadata = { 91 | "task": json_file["task"], 92 | "source_model": json_file["client"]["model_id"], 93 | "prompt": prompt_data, 94 | } 95 | 96 | instance_id = dataset.create_instance( 97 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 98 | ) 99 | self.logger.info( 100 | f"Created instance {instance_id} for episode number {episode_number}" 101 | ) 102 | 103 | gif_path = subtask_dir / f"episode_{episode_number}.gif" 104 | # Copy gif to output path 105 | gif_out_path = ( 106 | self.output_path 107 | / "instances" 108 | / instance_id 109 | / f"episode_{episode_number}.gif" 110 | ) 111 | shutil.copy(gif_path, gif_out_path) 112 | 113 | # Update instance_id with gif_path 114 | add_gif = {"gif_path": gif_out_path} 115 | dataset.update_instance_metadata( 116 | instance_id=instance_id, new_meta=add_gif 117 | ) 118 | 119 | with open(traj_file, newline="") as f: 120 | reader = csv.reader(f, quotechar='"', quoting=csv.QUOTE_MINIMAL) 121 | # Skip header 122 | next(reader) 123 | 
for line in ( 124 | reader 125 | ): # Format of Step,Action,Reasoning,Observation,Reward,Done 126 | line = [ 127 | field.replace("\n", " ").replace("\r", "") for field in line 128 | ] 129 | 130 | # Convert to datetime by adding to now 131 | step_id = ref_time + timedelta(seconds=int(line[0])) 132 | actions = line[1] 133 | reasoning = line[2] 134 | observations = line[3] 135 | # Make new glyphs by running self.gm.glyph_id_to_rgb on each element of glyphs_raw in vectorized form 136 | 137 | # step_id should be the same to allow reconstruction of the trajectory, but if this 138 | # causes issues, should be fixed 139 | act_obj = {"reasoning": reasoning, "text": actions} 140 | 141 | obs_obj = {"observations": observations} 142 | 143 | dataset.add_data_point( 144 | instance_id=instance_id, 145 | agent_id="agent", 146 | timestamp=step_id, 147 | point_type=PointType.OBSERVATION, 148 | data=obs_obj, 149 | media_type=MediaType.JSON, 150 | ) 151 | 152 | dataset.add_data_point( 153 | instance_id=instance_id, 154 | agent_id="agent", 155 | timestamp=step_id, # Using step_id as timestamp 156 | point_type=PointType.ACTION, 157 | data=act_obj, 158 | media_type=MediaType.JSON, 159 | ) 160 | 161 | self.logger.info(f"Dataset conversion complete for {task}") 162 | dataset.close() 163 | 164 | 165 | if __name__ == "__main__": 166 | import argparse 167 | 168 | parser = argparse.ArgumentParser(description="Balrog Converter") 169 | parser.add_argument( 170 | "--filename", 171 | type=str, 172 | required=True, 173 | help="The name of the folder containing the Balrog-babaisai data for the given run", 174 | ) 175 | 176 | filename = parser.parse_args().filename 177 | 178 | source_path = Path(f".data/raw/{filename}") # Handle all balrog data in one folder 179 | output_path = Path( 180 | f".data/{filename.split('-')[-1]}" 181 | ) # Handle all balrog data in one folder 182 | 183 | run_converter(BalrogConverter, output_path, source_path) 184 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/balrog_mini.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from datetime import datetime, timedelta 5 | import csv 6 | import shutil 7 | 8 | 9 | # Import our dataset classes 10 | from osw_data import MultiAgentDataset, AgentMetadata, PointType, MediaType 11 | 12 | from .base import BaseConverter, run_converter 13 | 14 | 15 | class BalrogConverter(BaseConverter): 16 | """Handles downloading and converting Balrog data to our dataset format""" 17 | 18 | def __init__(self, output_path: Path, source_path: Path): 19 | super().__init__(output_path, source_path) 20 | 21 | def download_data(self) -> None: 22 | """Download Balrog dataset files""" 23 | self.source_path.mkdir(parents=True, exist_ok=True) 24 | 25 | # This only exists to satisfy the BaseConverter class 26 | 27 | @staticmethod 28 | def clean_csv_file(file_path: Path) -> None: 29 | """Remove NUL characters from a CSV file.""" 30 | with open(file_path, "rb") as f: 31 | content = f.read() 32 | 33 | # Remove NUL characters 34 | content = content.replace(b"\x00", b"") 35 | 36 | with open(file_path, "wb") as f: 37 | f.write(content) 38 | 39 | def convert_to_dataset(self) -> None: 40 | """Convert Balrog data to autolibra dataset format""" 41 | self.logger.info("Creating Balrog dataset...") 42 | 43 | ref_time = datetime.now() # Used for step_id 44 | 45 | # Obtain task from folder name 46 | task = 
self.source_path.name.split("_")[0].split("-")[-1] 47 | task = task[0].upper() + task[1:] 48 | 49 | # Initialize dataset 50 | dataset = MultiAgentDataset( 51 | name=f"{task}-Balrog", 52 | base_path=self.output_path, 53 | description=f"{task} trajectories from Balrog dataset", 54 | ) 55 | 56 | # Get list of all directories within self.source_path 57 | subtasks: list[str] = [ 58 | f.name for f in os.scandir(self.source_path) if f.is_dir() 59 | ] 60 | 61 | # Iterate over folders in task_dir 62 | for subtask in subtasks: 63 | subtask_dir = self.source_path / subtask 64 | 65 | # Find all files with matching suffixes (00, 01, 02) 66 | for suffix in ["00", "01", "02"]: 67 | # Construct file paths for the current group 68 | gif_path = subtask_dir / f"episode_{suffix}.gif" 69 | csv_path = ( 70 | subtask_dir / f"{subtask}_run_{suffix}.csv" 71 | ) # Use subtask name + _run_ + suffix 72 | json_path = ( 73 | subtask_dir / f"{subtask}_run_{suffix}.json" 74 | ) # Use subtask name + _run_ + suffix 75 | pkl_path = ( 76 | subtask_dir / f"{subtask}_run_{suffix}.pkl" 77 | ) # Use subtask name + _run_ + suffix 78 | 79 | # Check if all required files exist for this group 80 | if not ( 81 | gif_path.exists() 82 | and csv_path.exists() 83 | and json_path.exists() 84 | and pkl_path.exists() 85 | ): 86 | self.logger.warning( 87 | f"Missing files for suffix {suffix} in {subtask_dir}" 88 | ) 89 | continue 90 | 91 | # Clean the CSV file before processing 92 | self.clean_csv_file(csv_path) # Call the clean_csv_file method 93 | 94 | # Load json file 95 | json_file = json.load(open(json_path)) 96 | 97 | # Create agent metadata (this does not change within a subtask) 98 | agents_metadata = { 99 | "agent": AgentMetadata( 100 | agent_id="agent", 101 | agent_type="game_agent", 102 | capabilities=["navigation", "interaction"], 103 | ) 104 | } 105 | 106 | # Create instance metadata (this does not change within a subtask) 107 | instance_metadata = { 108 | "task": json_file["task"], 109 | "source_model": json_file["client"]["model_id"], 110 | } 111 | 112 | # Create a unique instance ID for this group 113 | instance_id = dataset.create_instance( 114 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 115 | ) 116 | self.logger.info(f"Created instance {instance_id} for suffix {suffix}") 117 | 118 | # Copy GIF to output path 119 | gif_out_path = ( 120 | self.output_path 121 | / "instances" 122 | / instance_id 123 | / f"episode_{suffix}.gif" 124 | ) 125 | shutil.copy(gif_path, gif_out_path) 126 | 127 | # Update instance metadata with GIF path 128 | add_gif = {"gif_path": gif_out_path} 129 | dataset.update_instance_metadata( 130 | instance_id=instance_id, new_meta=add_gif 131 | ) 132 | 133 | # Process the CSV file 134 | with open(csv_path, newline="") as f: 135 | reader = csv.reader(f, quotechar='"', quoting=csv.QUOTE_MINIMAL) 136 | next(reader) # Skip header 137 | for line in reader: 138 | line = [ 139 | field.replace("\n", " ").replace("\r", "") for field in line 140 | ] 141 | 142 | step_id = ref_time + timedelta(seconds=int(line[0])) 143 | actions = line[1] 144 | reasoning = line[2] 145 | observations = line[3] 146 | 147 | act_obj = {"reasoning": reasoning, "text": actions} 148 | 149 | obs_obj = {"observations": observations} 150 | 151 | dataset.add_data_point( 152 | instance_id=instance_id, 153 | agent_id="agent", 154 | timestamp=step_id, 155 | point_type=PointType.OBSERVATION, 156 | data=obs_obj, 157 | media_type=MediaType.JSON, 158 | ) 159 | 160 | dataset.add_data_point( 161 | instance_id=instance_id, 162 | 
agent_id="agent", 163 | timestamp=step_id, 164 | point_type=PointType.ACTION, 165 | data=act_obj, 166 | media_type=MediaType.JSON, 167 | ) 168 | 169 | self.logger.info(f"Dataset conversion complete for {task}") 170 | dataset.close() 171 | 172 | 173 | if __name__ == "__main__": 174 | import argparse 175 | 176 | parser = argparse.ArgumentParser(description="Balrog Converter") 177 | parser.add_argument( 178 | "--filename", 179 | type=str, 180 | required=True, 181 | help="The name of the folder containing the Balrog-minihack data for the given run", 182 | ) 183 | 184 | filename = parser.parse_args().filename 185 | 186 | source_path = Path(f".data/raw/{filename}") # Handle all balrog data in one folder 187 | output_path = Path( 188 | f".data/{filename.split('-')[-1]}" 189 | ) # Handle all balrog data in one folder 190 | 191 | run_converter(BalrogConverter, output_path, source_path) 192 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/evaluators/coverage_evaluator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pickle 3 | from typing import Literal 4 | import logfire 5 | from openai import AsyncAzureOpenAI, RateLimitError 6 | import openai 7 | from osw_data.metrics import Metric 8 | from autolibra_core.configs import AutoLibraEvalSettings 9 | from autolibra_core.data import Aspect 10 | from autolibra_core.data.primitives import Trait 11 | from ..data import MetricTrainingInstance 12 | from ..operators import feedback_grounding 13 | from pydantic import BaseModel, Field, ValidationError, create_model 14 | from pydantic.fields import FieldInfo 15 | from autolibra_core.utils import load_prompt_template 16 | 17 | 18 | def _sanitize_string(s: str) -> str: 19 | return ( 20 | s.replace("\\'", "") 21 | .replace('\\"', "") 22 | .replace("\\n", " ") 23 | .replace("'", "") 24 | .replace('"', "") 25 | .replace("\n", " ") 26 | .replace("\\", " ") 27 | .replace("ℹ", " ") 28 | ) 29 | 30 | 31 | async def create_aspect_traits_match_pydantic_model( 32 | aspects: list[Aspect], traits: list[Metric] 33 | ) -> type[BaseModel]: 34 | fields: dict[str, tuple[type[Literal[str]], FieldInfo]] = {} # type: ignore[valid-type] 35 | for i in range(len(aspects)): 36 | fields[f"aspect_{i}"] = ( # type: ignore[assignment] 37 | Literal[ 38 | _sanitize_string(aspects[i].feedback) 39 | + ": " 40 | + _sanitize_string(aspects[i].behavior) 41 | ], 42 | Field(title=f"Aspect {i}"), 43 | ) 44 | 45 | fields[f"trait_{i}"] = ( # type: ignore[assignment] 46 | Literal[ 47 | tuple( 48 | _sanitize_string(trait.name) 49 | + ": " 50 | + _sanitize_string(trait.explanation) 51 | for trait in traits 52 | ) 53 | + ("None of the traits matches the aspect.",) 54 | ], 55 | Field(title=f"Trait {i}"), 56 | ) 57 | 58 | return create_model("AspectTraitsMatch", **fields) # type: ignore[no-any-return, call-overload] 59 | 60 | 61 | async def match_aspects_and_traits( 62 | client: AsyncAzureOpenAI, aspects: list[Aspect], traits: list[Metric] 63 | ) -> dict[str, str]: 64 | settings = AutoLibraEvalSettings() 65 | results: list[BaseModel] = [] 66 | for aspect in aspects: 67 | aspect_traits_model = await create_aspect_traits_match_pydantic_model( 68 | [aspect], traits 69 | ) 70 | 71 | template = load_prompt_template("coverage_evaluation_v2.j2") 72 | prompt = template.render( 73 | aspects=[aspect], 74 | traits=traits, 75 | ) 76 | 77 | model = settings.azure_openai_4o_model 78 | assert model 79 | 80 | while True: 81 | 
wait_time = 1 82 | try: 83 | completion = await client.beta.chat.completions.parse( 84 | model=model, 85 | messages=[ 86 | { 87 | "role": "system", 88 | "content": "Match the aspects with the traits.", 89 | }, 90 | {"role": "user", "content": prompt}, 91 | ], 92 | response_format=aspect_traits_model, 93 | ) 94 | break 95 | except ValidationError as e: 96 | print(e) 97 | print(aspect_traits_model.model_json_schema()) 98 | except RateLimitError as e: 99 | print(e) 100 | wait_time *= 2 101 | await asyncio.sleep(wait_time) 102 | except openai.BadRequestError as e: 103 | print(aspect_traits_model.model_json_schema()) 104 | logfire.warning(f"Schema error: {e}") 105 | raise e 106 | 107 | result_or_none = completion.choices[0].message.parsed 108 | assert result_or_none and isinstance(result_or_none, aspect_traits_model) 109 | results.append(result_or_none) 110 | 111 | result_dict: dict[str, str] = {} 112 | for i, result in enumerate(results): 113 | result_dict[f"aspect_{i}"] = result.model_dump()["aspect_0"] 114 | result_dict[f"trait_{i}"] = result.model_dump()["trait_0"] 115 | 116 | return result_dict 117 | 118 | 119 | async def run_instance_coverage_eval( 120 | client: AsyncAzureOpenAI, 121 | aspects: list[Aspect], 122 | traits: list[Trait], 123 | ) -> tuple[int, int, int, int, list[Aspect]]: 124 | positive_aspects = [aspect for aspect in aspects if aspect.is_positive] 125 | negative_aspects = [aspect for aspect in aspects if not aspect.is_positive] 126 | positive_traits = [trait.metric for trait in traits if trait.rating == 1] 127 | negative_traits = [trait.metric for trait in traits if trait.rating == -1] 128 | 129 | # Coverage on positive aspects 130 | try: 131 | positive_match_results = await match_aspects_and_traits( 132 | client, positive_aspects, positive_traits 133 | ) 134 | except openai.BadRequestError as e: 135 | pickle.dump( 136 | (positive_aspects, positive_traits), 137 | open("positive_aspects_traits.pkl", "wb"), 138 | ) 139 | raise e 140 | 141 | # Coverage on negative aspects 142 | negative_match_results = await match_aspects_and_traits( 143 | client, negative_aspects, negative_traits 144 | ) 145 | 146 | number_of_total_aspects = len(aspects) 147 | number_of_not_matched_aspects = 0 148 | unmatch_aspects: list[Aspect] = [] 149 | 150 | for i in range(len(positive_aspects)): 151 | if ( 152 | positive_match_results[f"trait_{i}"] 153 | == "None of the traits matches the aspect." 154 | ): 155 | number_of_not_matched_aspects += 1 156 | unmatch_aspects.append(positive_aspects[i]) 157 | 158 | for i in range(len(negative_aspects)): 159 | if ( 160 | negative_match_results[f"trait_{i}"] 161 | == "None of the traits matches the aspect." 162 | ): 163 | number_of_not_matched_aspects += 1 164 | unmatch_aspects.append(negative_aspects[i]) 165 | 166 | used_traits = set() 167 | 168 | for i in range(len(positive_aspects)): 169 | if ( 170 | positive_match_results[f"trait_{i}"] 171 | != "None of the traits matches the aspect." 172 | ): 173 | used_traits.add(positive_match_results[f"trait_{i}"]) 174 | 175 | for i in range(len(negative_aspects)): 176 | if ( 177 | negative_match_results[f"trait_{i}"] 178 | != "None of the traits matches the aspect." 
179 | ): 180 | used_traits.add(negative_match_results[f"trait_{i}"]) 181 | 182 | return ( 183 | number_of_total_aspects - number_of_not_matched_aspects, 184 | number_of_total_aspects, 185 | len(traits) - len(used_traits), 186 | len(traits), 187 | unmatch_aspects, 188 | ) 189 | 190 | 191 | async def run_coverage_eval( 192 | instance_traits: list[list[Trait]], 193 | instances: list[MetricTrainingInstance], 194 | client: AsyncAzureOpenAI, 195 | ) -> list[tuple[int, int, int, int, list[Aspect]]]: 196 | instance_aspects = await asyncio.gather( 197 | *[feedback_grounding(instance, client) for instance in instances] 198 | ) 199 | 200 | with open("feedback_grounding_results.jsonl", "w") as f: 201 | for feedback_grounding_result in instance_aspects: 202 | for aspect in feedback_grounding_result: 203 | f.write(aspect.model_dump_json(indent=2)) 204 | f.write("\n") 205 | f.write("\n") 206 | 207 | coverage_results = await asyncio.gather( 208 | *[ 209 | run_instance_coverage_eval(client, aspects, traits) 210 | for aspects, traits in zip(instance_aspects, instance_traits) 211 | ] 212 | ) 213 | 214 | return coverage_results 215 | -------------------------------------------------------------------------------- /src/tty/view_annotations.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from pathlib import Path 3 | import pandas as pd 4 | from datetime import datetime 5 | import typer 6 | from osw_data.annotation import AnnotationSystem 7 | from rich.console import Console 8 | from typing import List, Dict, Any 9 | 10 | console = Console() 11 | app = typer.Typer() 12 | 13 | 14 | def sanitize_text(text: str) -> str: 15 | """Clean text to avoid display issues and escape markdown characters.""" 16 | # Remove any extra quotes around the text 17 | text = text.strip("'\"") 18 | 19 | # Replace problematic unicode characters with their closest ASCII equivalents 20 | replacements = {'"': '"', "'": "'", "–": "-", "—": "-", "…": "...", "' '": " "} 21 | for old, new in replacements.items(): 22 | text = text.replace(old, new) 23 | 24 | # Remove any remaining single quotes between characters 25 | text = "".join( 26 | c 27 | for i, c in enumerate(text) 28 | if c != "'" 29 | or ( 30 | i > 0 31 | and i < len(text) - 1 32 | and text[i - 1].isalpha() 33 | and text[i + 1].isalpha() 34 | ) 35 | ) 36 | 37 | # Escape markdown special characters 38 | markdown_chars: List[str] = [ 39 | "*", 40 | "_", 41 | "`", 42 | "#", 43 | "~", 44 | ">", 45 | "<", 46 | "[", 47 | "]", 48 | "(", 49 | ")", 50 | "|", 51 | "$", 52 | ] 53 | for char in markdown_chars: 54 | text = text.replace(char, "\\" + char) 55 | 56 | return text 57 | 58 | 59 | def load_annotations(annotations_dir: Path) -> List[Dict[str, Any]]: 60 | """Load all annotations using AnnotationSystem.""" 61 | annotations: List[Dict[str, Any]] = [] 62 | 63 | # Initialize AnnotationSystem 64 | annotation_system = AnnotationSystem( 65 | base_path=annotations_dir.parent, # Navigate up to where project.yaml is 66 | project_name="Annotation Viewer", 67 | description="View annotations from different projects", 68 | ) 69 | 70 | # Get all annotation files in the directory 71 | annotation_files = list(annotations_dir.glob("*.json")) 72 | 73 | # Extract instance IDs from filenames 74 | for file_path in annotation_files: 75 | # Filename format: instance_id_agent_id.json 76 | instance_id, agent_id = file_path.stem.rsplit("_", 1) 77 | 78 | trajectory_annotations = annotation_system.get_trajectory_annotations( 79 | 
instance_id=instance_id, agent_id=agent_id 80 | ) 81 | 82 | for annotation in trajectory_annotations.annotations: 83 | # Parse the ISO format timestamp string into a datetime object 84 | # Handle both string and datetime objects 85 | if isinstance(annotation.created_at, str): 86 | created_dt = datetime.fromisoformat( 87 | annotation.created_at.replace("Z", "+00:00") 88 | ) 89 | else: 90 | created_dt = annotation.created_at 91 | 92 | # Handle null start/end times 93 | start_time = ( 94 | annotation.span.start_time 95 | if annotation.span and annotation.span.start_time 96 | else "N/A" 97 | ) 98 | end_time = ( 99 | annotation.span.end_time 100 | if annotation.span and annotation.span.end_time 101 | else "N/A" 102 | ) 103 | 104 | annotations.append( 105 | { 106 | "instance_id": instance_id, 107 | "agent_id": agent_id, 108 | "annotator_id": annotation.annotator_id, 109 | "feedback": sanitize_text(annotation.content["feedback"]), 110 | "start_time": start_time, 111 | "end_time": end_time, 112 | "created_at": created_dt.strftime("%Y-%m-%d %H:%M:%S"), 113 | "created_dt": created_dt, 114 | } 115 | ) 116 | 117 | return annotations 118 | 119 | 120 | @app.command() 121 | def main( 122 | annotations_dir: Path = typer.Argument( 123 | ..., 124 | help="Path to the annotations directory (e.g., .data/annotations/sotopia/annotations)", 125 | exists=True, 126 | dir_okay=True, 127 | file_okay=False, 128 | ), 129 | ) -> None: 130 | """View annotations from the specified directory.""" 131 | streamlit_main(annotations_dir) 132 | 133 | 134 | def streamlit_main(annotations_dir: Path) -> None: 135 | """Main Streamlit interface.""" 136 | st.title("🔍 Annotation Viewer") 137 | 138 | # Convert to absolute path and resolve any relative path components 139 | annotations_dir = annotations_dir.absolute().resolve() 140 | 141 | if not annotations_dir.exists(): 142 | st.error(f"Annotations directory not found: {annotations_dir}") 143 | st.info( 144 | "Please provide the full path to the annotations directory. 
For example:\n\n" 145 | "```bash\n" 146 | "autolibra-eval view-annotations .data/annotations/sotopia/annotations\n" 147 | "```" 148 | ) 149 | return 150 | 151 | # Load annotations 152 | annotations = load_annotations(annotations_dir) 153 | 154 | if not annotations: 155 | st.warning("No annotations found.") 156 | return 157 | 158 | # Convert to DataFrame for easier manipulation 159 | df = pd.DataFrame(annotations) 160 | 161 | # Remove duplicate annotations, keeping only the most recent one 162 | df = df.sort_values("created_dt", ascending=False).drop_duplicates( 163 | subset=["instance_id", "agent_id", "annotator_id", "feedback"], keep="first" 164 | ) 165 | 166 | # Display summary statistics 167 | st.header("📊 Summary Statistics") 168 | col1, col2, col3 = st.columns(3) 169 | 170 | with col1: 171 | st.metric("Total Annotations", len(df)) 172 | with col2: 173 | st.metric("Unique Instances", df["instance_id"].nunique()) 174 | with col3: 175 | st.metric("Unique Annotators", df["annotator_id"].nunique()) 176 | 177 | # Filters 178 | st.header("🔎 Filters") 179 | col1, col2 = st.columns(2) 180 | 181 | with col1: 182 | selected_annotator = st.selectbox( 183 | "Select Annotator", 184 | options=["All"] + sorted(df["annotator_id"].unique().tolist()), 185 | ) 186 | 187 | # Filter instance options based on selected annotator 188 | instance_options = df["instance_id"].unique().tolist() 189 | if selected_annotator != "All": 190 | instance_options = ( 191 | df[df["annotator_id"] == selected_annotator]["instance_id"] 192 | .unique() 193 | .tolist() 194 | ) 195 | 196 | with col2: 197 | selected_instance = st.selectbox( 198 | "Select Instance", options=["All"] + sorted(instance_options) 199 | ) 200 | 201 | # Apply filters 202 | filtered_df = df.copy() 203 | if selected_annotator != "All": 204 | filtered_df = filtered_df[filtered_df["annotator_id"] == selected_annotator] 205 | if selected_instance != "All": 206 | filtered_df = filtered_df[filtered_df["instance_id"] == selected_instance] 207 | 208 | # Display annotations 209 | st.header("📝 Annotations") 210 | 211 | # Sort by timestamp in descending order 212 | filtered_df = filtered_df.sort_values("created_dt", ascending=False) 213 | 214 | for _, row in filtered_df.iterrows(): 215 | with st.expander( 216 | f"Instance: {row['instance_id']} | Agent: {row['agent_id']} | {row['created_at']}", 217 | expanded=False, 218 | ): 219 | st.markdown(f"**Annotator:** {row['annotator_id']}") 220 | st.markdown("**Feedback:**") 221 | st.info(row["feedback"]) 222 | if row["start_time"] != "N/A" or row["end_time"] != "N/A": 223 | st.markdown(f"**Time Range:** {row['start_time']} to {row['end_time']}") 224 | 225 | 226 | if __name__ == "__main__": 227 | app() 228 | -------------------------------------------------------------------------------- /packages/osw-data/tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pathlib import Path 3 | import json 4 | from typing import Generator 5 | 6 | # Import the classes to test 7 | from osw_data.metrics import MetricSet, Metric, MetricSetMetadata 8 | 9 | 10 | @pytest.fixture 11 | def sample_metric() -> Metric: 12 | """Fixture that returns a sample metric""" 13 | return Metric( 14 | name="test_metric", 15 | explanation="A test metric", 16 | good_behaviors=["good1", "good2"], 17 | bad_behaviors=["bad1", "bad2"], 18 | ) 19 | 20 | 21 | @pytest.fixture 22 | def sample_metrics() -> list[Metric]: 23 | """Fixture that returns a list of sample metrics""" 24 | return [ 25 | 
Metric( 26 | name="metric1", 27 | explanation="First test metric", 28 | good_behaviors=["good1"], 29 | bad_behaviors=["bad1"], 30 | ), 31 | Metric( 32 | name="metric2", 33 | explanation="Second test metric", 34 | good_behaviors=["good2"], 35 | bad_behaviors=["bad2"], 36 | ), 37 | ] 38 | 39 | 40 | @pytest.fixture 41 | def metric_set(tmp_path: Path) -> Generator[MetricSet, None, None]: 42 | """Fixture that creates a MetricSet instance with a temporary directory""" 43 | ms = MetricSet( 44 | name="test_set", base_path=tmp_path, induced_from="test_source", version="1.0.0" 45 | ) 46 | yield ms 47 | 48 | 49 | class TestMetricSetInitialization: 50 | def test_basic_initialization(self, tmp_path: Path) -> None: 51 | """Test basic initialization of MetricSet""" 52 | ms = MetricSet(name="test", base_path=tmp_path, induced_from="source") 53 | 54 | assert ms.base_path == tmp_path 55 | assert ms.metrics_path == tmp_path / "metrics" 56 | assert ms.metadata_path == tmp_path / "metadata.json" 57 | assert ms.metrics_path.exists() 58 | assert ms.base_path.exists() 59 | 60 | def test_initialization_with_existing_metadata(self, tmp_path: Path) -> None: 61 | """Test initialization when metadata file already exists""" 62 | # Create existing metadata 63 | metadata = MetricSetMetadata( 64 | name="existing", 65 | metric_names=["metric1"], 66 | induced_from="source", 67 | version="1.0", 68 | ) 69 | metadata_path = tmp_path / "metadata.json" 70 | metadata_path.parent.mkdir(parents=True, exist_ok=True) 71 | with open(metadata_path, "w") as f: 72 | f.write(metadata.model_dump_json(indent=2)) 73 | 74 | metric1 = Metric( 75 | name="metric1", 76 | explanation="First test metric", 77 | good_behaviors=["good1"], 78 | bad_behaviors=["bad1"], 79 | ) 80 | 81 | # Create existing metric file 82 | metric_path = tmp_path / "metrics" / "metric1.json" 83 | metric_path.parent.mkdir(parents=True, exist_ok=True) 84 | with open(metric_path, "w") as f: 85 | f.write(metric1.model_dump_json(indent=2)) 86 | 87 | # Initialize MetricSet with existing metadata 88 | ms = MetricSet(name="new_name", base_path=tmp_path, induced_from="new_source") 89 | 90 | # Should load existing metadata instead of creating new 91 | assert ms.metadata.name == "existing" 92 | assert ms.metadata.metric_names == ["metric1"] 93 | 94 | def test_initialization_with_invalid_metadata(self, tmp_path: Path) -> None: 95 | """Test initialization with corrupted metadata file""" 96 | metadata_path = tmp_path / "metadata.json" 97 | metadata_path.parent.mkdir(parents=True, exist_ok=True) 98 | with open(metadata_path, "w") as f: 99 | f.write("invalid json") 100 | 101 | with pytest.raises(Exception): 102 | MetricSet(name="test", base_path=tmp_path, induced_from="source") 103 | 104 | 105 | class TestMetricOperations: 106 | def test_add_single_metric( 107 | self, metric_set: MetricSet, sample_metric: Metric 108 | ) -> None: 109 | """Test adding a single metric""" 110 | metric_set.add_metrics([sample_metric]) 111 | 112 | # Check if metric was added to internal dict 113 | assert sample_metric.name in metric_set.metrics 114 | assert metric_set.metrics[sample_metric.name] == sample_metric 115 | 116 | # Check if metric file was created 117 | metric_path = metric_set.metrics_path / f"{sample_metric.name}.json" 118 | assert metric_path.exists() 119 | 120 | def test_add_multiple_metrics( 121 | self, metric_set: MetricSet, sample_metrics: list[Metric] 122 | ) -> None: 123 | """Test adding multiple metrics at once""" 124 | metric_set.add_metrics(sample_metrics) 125 | 126 | for metric in 
sample_metrics: 127 | assert metric.name in metric_set.metrics 128 | metric_path = metric_set.metrics_path / f"{metric.name}.json" 129 | assert metric_path.exists() 130 | 131 | def test_add_duplicate_metric( 132 | self, metric_set: MetricSet, sample_metric: Metric 133 | ) -> None: 134 | """Test adding a metric with a name that already exists""" 135 | metric_set.add_metrics([sample_metric]) 136 | 137 | with pytest.raises( 138 | ValueError, match=f"Metric with name {sample_metric.name} already exists" 139 | ): 140 | metric_set.add_metrics([sample_metric]) 141 | 142 | def test_get_existing_metric( 143 | self, metric_set: MetricSet, sample_metric: Metric 144 | ) -> None: 145 | """Test retrieving an existing metric""" 146 | metric_set.add_metrics([sample_metric]) 147 | 148 | retrieved_metric = metric_set.get_metric(sample_metric.name) 149 | assert retrieved_metric.model_dump() == sample_metric.model_dump() 150 | 151 | def test_get_nonexistent_metric(self, metric_set: MetricSet) -> None: 152 | """Test attempting to retrieve a metric that doesn't exist""" 153 | with pytest.raises( 154 | ValueError, match="Metric with name nonexistent does not exist" 155 | ): 156 | metric_set.get_metric("nonexistent") 157 | 158 | def test_get_metric_with_corrupted_file( 159 | self, metric_set: MetricSet, sample_metric: Metric 160 | ) -> None: 161 | """Test getting a metric when its file is corrupted""" 162 | metric_set.add_metrics([sample_metric]) 163 | 164 | # Corrupt the metric file 165 | metric_path = metric_set.metrics_path / f"{sample_metric.name}.json" 166 | with open(metric_path, "w") as f: 167 | f.write("invalid json") 168 | 169 | with pytest.raises(Exception): 170 | metric_set.get_metric(sample_metric.name) 171 | 172 | 173 | class TestMetadataOperations: 174 | def test_save_metadata(self, metric_set: MetricSet) -> None: 175 | """Test saving metadata to file""" 176 | new_metadata = MetricSetMetadata( 177 | name="new_name", 178 | metric_names=["metric1", "metric2"], 179 | induced_from="new_source", 180 | version="2.0.0", 181 | ) 182 | 183 | metric_set._save_metadata(new_metadata) 184 | 185 | # Verify file contents 186 | with open(metric_set.metadata_path, "r") as f: 187 | saved_data = json.loads(f.read()) 188 | assert saved_data["name"] == "new_name" 189 | assert saved_data["metric_names"] == ["metric1", "metric2"] 190 | 191 | def test_save_metrics( 192 | self, metric_set: MetricSet, sample_metrics: list[Metric] 193 | ) -> None: 194 | """Test saving all metrics to files""" 195 | metric_set.metrics = {metric.name: metric for metric in sample_metrics} 196 | metric_set._save_metrics() 197 | 198 | for metric in sample_metrics: 199 | metric_path = metric_set.metrics_path / f"{metric.name}.json" 200 | assert metric_path.exists() 201 | 202 | with open(metric_path, "r") as f: 203 | saved_data = json.loads(f.read()) 204 | assert saved_data["name"] == metric.name 205 | assert saved_data["explanation"] == metric.explanation 206 | 207 | def test_initialization_with_none_path(self) -> None: 208 | """Test initialization with None as path""" 209 | with pytest.raises(TypeError): 210 | MetricSet(name="test", base_path=None, induced_from="source") # type: ignore[arg-type] 211 | 212 | def test_file_permission_errors(self, tmp_path: Path) -> None: 213 | """Test handling of file permission errors""" 214 | # Create directory with no write permissions 215 | no_write_dir = tmp_path / "no_write" 216 | no_write_dir.mkdir() 217 | no_write_dir.chmod(0o444) # Read-only 218 | 219 | with pytest.raises(Exception): 220 | 
MetricSet(name="test", base_path=no_write_dir, induced_from="source") 221 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/dataset.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import Optional, Any 3 | from typing_extensions import Self 4 | from pathlib import Path 5 | import json 6 | import yaml 7 | from datetime import datetime 8 | from uuid import uuid4 9 | 10 | import numpy.typing as npt 11 | 12 | # Assume we're importing from the previous trajectory implementation 13 | from .trajectory import SymmetricTrajectory, PointType, MediaType 14 | 15 | 16 | class AgentMetadata(BaseModel): 17 | """Metadata for an individual agent""" 18 | 19 | agent_id: str 20 | agent_type: str 21 | capabilities: list[str] = Field(default_factory=list) 22 | parameters: dict[str, Any] = Field(default_factory=dict) 23 | additional_info: dict[str, Any] = Field(default_factory=dict) 24 | 25 | 26 | class DataInstance(BaseModel): 27 | """ 28 | A single instance in the dataset, containing multiple agent trajectories 29 | """ 30 | 31 | instance_id: str 32 | timestamp: datetime 33 | agents: dict[str, AgentMetadata] 34 | metadata: dict[str, Any] = Field(default_factory=dict) 35 | 36 | 37 | class DatasetMetadata(BaseModel): 38 | """Metadata for the entire dataset""" 39 | 40 | name: str 41 | version: str 42 | description: str 43 | created_at: datetime = Field(default_factory=datetime.now) 44 | updated_at: datetime = Field(default_factory=datetime.now) 45 | total_instances: int = 0 46 | agent_types: list[str] = Field(default_factory=list) 47 | schema_version: str = "1.0" 48 | additional_info: dict[str, Any] = Field(default_factory=dict) 49 | 50 | 51 | class MultiAgentDataset: 52 | """ 53 | Dataset managing multiple instances of multi-agent trajectories 54 | """ 55 | 56 | def __init__( 57 | self, 58 | name: str, 59 | base_path: Path | str, 60 | description: str = "", 61 | version: str = "1.0", 62 | ): 63 | self.base_path = Path(base_path) 64 | self.instances_path = self.base_path / "instances" 65 | self.metadata_path = self.base_path / "metadata.yaml" 66 | 67 | # Initialize directory structure 68 | self.base_path.mkdir(parents=True, exist_ok=True) 69 | self.instances_path.mkdir(exist_ok=True) 70 | 71 | # Initialize or load dataset metadata 72 | self.metadata = self._init_metadata(name, description, version) 73 | 74 | # Cache for open trajectories 75 | self._trajectory_cache: dict[str, dict[str, SymmetricTrajectory]] = {} 76 | 77 | def _init_metadata( 78 | self, name: str, description: str, version: str 79 | ) -> DatasetMetadata: 80 | """Initialize or load dataset metadata""" 81 | if self.metadata_path.exists(): 82 | with open(self.metadata_path, "r") as f: 83 | metadata_dict = yaml.safe_load(f) 84 | return DatasetMetadata(**metadata_dict) 85 | else: 86 | metadata = DatasetMetadata( 87 | name=name, version=version, description=description 88 | ) 89 | self._save_metadata(metadata) 90 | return metadata 91 | 92 | def _save_metadata(self, metadata: DatasetMetadata) -> None: 93 | """Save dataset metadata to disk""" 94 | with open(self.metadata_path, "w") as f: 95 | yaml.dump(json.loads(metadata.model_dump_json()), f) 96 | 97 | def create_instance( 98 | self, 99 | agents_metadata: dict[str, AgentMetadata], 100 | instance_metadata: Optional[dict[str, Any]] = None, 101 | ) -> str: 102 | """ 103 | Create a new instance in the dataset 104 | 105 | Args: 106 | agents_metadata: 
dictionary mapping agent_id to their metadata 107 | instance_metadata: Optional metadata for the instance 108 | 109 | Returns: 110 | instance_id: Unique identifier for the created instance 111 | """ 112 | instance_id = str(uuid4()) 113 | instance_path = self.instances_path / instance_id 114 | instance_path.mkdir(exist_ok=True) 115 | 116 | # Create instance metadata 117 | instance = DataInstance( 118 | instance_id=instance_id, 119 | timestamp=datetime.now(), 120 | agents=agents_metadata, 121 | metadata=instance_metadata or {}, 122 | ) 123 | 124 | # Save instance metadata 125 | with open(instance_path / "metadata.json", "w") as f: 126 | f.write(instance.model_dump_json()) 127 | 128 | # Initialize trajectories for each agent 129 | for agent_id in agents_metadata: 130 | trajectory = SymmetricTrajectory( 131 | trajectory_id=f"{instance_id}_{agent_id}", 132 | storage_path=instance_path / agent_id, 133 | ) 134 | if instance_id not in self._trajectory_cache: 135 | self._trajectory_cache[instance_id] = {} 136 | self._trajectory_cache[instance_id][agent_id] = trajectory 137 | 138 | # Update dataset metadata 139 | self.metadata.total_instances += 1 140 | self.metadata.agent_types = list( 141 | set( 142 | self.metadata.agent_types 143 | + [am.agent_type for am in agents_metadata.values()] 144 | ) 145 | ) 146 | self.metadata.updated_at = datetime.now() 147 | self._save_metadata(self.metadata) 148 | 149 | return instance_id 150 | 151 | def get_trajectory(self, instance_id: str, agent_id: str) -> SymmetricTrajectory: 152 | """Get trajectory for a specific agent in an instance""" 153 | if instance_id not in self._trajectory_cache: 154 | self._trajectory_cache[instance_id] = {} 155 | 156 | if agent_id not in self._trajectory_cache[instance_id]: 157 | instance_path = self.instances_path / instance_id 158 | if not instance_path.exists(): 159 | raise ValueError(f"Instance {instance_id} does not exist") 160 | 161 | self._trajectory_cache[instance_id][agent_id] = SymmetricTrajectory( 162 | trajectory_id=f"{instance_id}_{agent_id}", 163 | storage_path=instance_path / agent_id, 164 | ) 165 | 166 | return self._trajectory_cache[instance_id][agent_id] 167 | 168 | def get_instance_metadata(self, instance_id: str) -> DataInstance: 169 | """Get metadata for a specific instance""" 170 | instance_path = self.instances_path / instance_id 171 | if not instance_path.exists(): 172 | raise ValueError(f"Instance {instance_id} does not exist") 173 | 174 | with open(instance_path / "metadata.json", "r") as f: 175 | return DataInstance.model_validate_json(f.read()) 176 | 177 | def update_instance_metadata( 178 | self, instance_id: str, new_meta: dict[str, Any] 179 | ) -> None: 180 | """Update metadata for a specific instance""" 181 | inst = self.get_instance_metadata(instance_id) 182 | inst.metadata.update(new_meta) 183 | with open(self.instances_path / instance_id / "metadata.json", "w") as f: 184 | f.write(inst.model_dump_json()) 185 | 186 | def list_instances(self) -> list[str]: 187 | """list all instance IDs in the dataset""" 188 | return [p.name for p in self.instances_path.iterdir() if p.is_dir()] 189 | 190 | def get_instances_by_agent_type(self, agent_type: str) -> list[str]: 191 | """Get all instances that contain an agent of the specified type""" 192 | matching_instances = [] 193 | for instance_id in self.list_instances(): 194 | instance = self.get_instance_metadata(instance_id) 195 | if any( 196 | agent.agent_type == agent_type for agent in instance.agents.values() 197 | ): 198 | matching_instances.append(instance_id) 
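# --- Illustrative usage sketch (not part of the original module; kept as a
# comment so the module itself is unchanged). It shows how the
# MultiAgentDataset API defined above is typically driven by the converters
# in autolibra_core.datasets: create an instance, append observation points,
# then read a trajectory back. The base_path and payload values are
# hypothetical placeholders.
#
#   from datetime import datetime
#   from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType
#
#   with MultiAgentDataset(
#       name="demo", base_path=".data/demo", description="toy example"
#   ) as ds:
#       instance_id = ds.create_instance(
#           agents_metadata={
#               "agent": AgentMetadata(agent_id="agent", agent_type="web_agent")
#           },
#           instance_metadata={"task": "example task"},
#       )
#       ds.add_data_point(
#           instance_id=instance_id,
#           agent_id="agent",
#           timestamp=datetime.now(),
#           point_type=PointType.OBSERVATION,
#           data={"observations": "hello"},
#           media_type=MediaType.JSON,
#       )
#       first_point = ds.get_trajectory(instance_id, "agent").get_data_at(0)
#
# The context-manager form relies on the __enter__/__exit__ methods defined
# further down in this class, which close all cached trajectories on exit.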
199 | return matching_instances 200 | 201 | def add_data_point( 202 | self, 203 | instance_id: str, 204 | agent_id: str, 205 | timestamp: datetime, 206 | point_type: PointType, 207 | data: npt.NDArray[Any] | dict[str, Any] | str, 208 | media_type: MediaType, 209 | metadata: dict[str, Any] | None = None, 210 | ) -> None: 211 | """Add a data point to a specific agent's trajectory""" 212 | trajectory = self.get_trajectory(instance_id, agent_id) 213 | trajectory.add_point( 214 | timestamp=timestamp, 215 | agent_id=agent_id, 216 | point_type=point_type, 217 | data=data, 218 | media_type=media_type, 219 | metadata=metadata, 220 | ) 221 | 222 | def close(self) -> None: 223 | """Close all open trajectories""" 224 | for instance_trajectories in self._trajectory_cache.values(): 225 | for trajectory in instance_trajectories.values(): 226 | trajectory.close() 227 | self._trajectory_cache.clear() 228 | 229 | def __enter__(self) -> Self: 230 | return self 231 | 232 | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: 233 | self.close() 234 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/webarena_nnetnav.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import json 3 | from pathlib import Path 4 | from datetime import datetime 5 | from typing import Any 6 | 7 | # Import our dataset classes 8 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 9 | 10 | from .base import BaseConverter, run_converter 11 | 12 | 13 | class WebArenaConverter(BaseConverter): 14 | """Handles downloading and converting WebArena data to our dataset format""" 15 | 16 | def __init__(self, output_path: Path, source_path: Path): 17 | super().__init__(output_path, source_path) 18 | self.screenshots_path = self.source_path / "screenshots" 19 | 20 | def _setup_constants(self) -> None: 21 | """Setup WebArena-specific constants""" 22 | self.SPECIAL_KEYS = [ 23 | "Enter", 24 | "Tab", 25 | "Control", 26 | "Shift", 27 | "Meta", 28 | "Backspace", 29 | "Delete", 30 | "Escape", 31 | "ArrowUp", 32 | "ArrowDown", 33 | "ArrowLeft", 34 | "ArrowRight", 35 | "PageDown", 36 | "PageUp", 37 | "Meta+a", 38 | ] 39 | self.ASCII_CHARSET = "".join(chr(x) for x in range(32, 128)) 40 | self.FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 1000)) 41 | self._id2key = ( 42 | self.SPECIAL_KEYS 43 | + list(self.ASCII_CHARSET) 44 | + list(self.FREQ_UNICODE_CHARSET) 45 | + ["\n"] 46 | ) 47 | 48 | def download_data(self) -> None: 49 | """Download WebArena dataset files""" 50 | self.source_path.mkdir(parents=True, exist_ok=True) 51 | 52 | # Download trajectory file 53 | if not (self.source_path / "trajectories.jsonl").exists(): 54 | self.logger.info("Downloading trajectory file...") 55 | traj_id = "1Ipuw32ea2B2jJ8EVCYkOW5oY0oaDXGQd" 56 | subprocess.run( 57 | ["gdown", traj_id, "-O", str(self.source_path / "trajectories.jsonl")], 58 | check=True, 59 | ) 60 | 61 | def _convert_action( 62 | self, action: dict[str, Any], metadata: dict[str, Any] 63 | ) -> dict[str, Any]: 64 | """Convert WebArena action to our format""" 65 | function = action["action_name"] 66 | kwargs = {} 67 | 68 | if function == "stop": 69 | kwargs["answer"] = action.get("answer", "") 70 | elif function == "type": 71 | # text_indices = action["text"] 72 | # kwargs["text"] = ''.join([ 73 | # self._id2key[i] 74 | # for i in text_indices 75 | # if isinstance(i, int) and i < len(self._id2key) and i >= 
len(self.SPECIAL_KEYS) 76 | # ]) 77 | kwargs["text"] = action["text"] 78 | kwargs["element_id"] = action["element_id"] 79 | elif function in ["hover", "click"]: 80 | kwargs["element_id"] = action["element_id"] 81 | elif function == "scroll": 82 | kwargs["dx"] = 0 83 | kwargs["dy"] = 100 if action["direction"].lower() == "down" else -100 84 | elif function in ["key_press", "press"]: 85 | kwargs["key_comb"] = action["key_comb"] 86 | function = "press" 87 | elif function in ["new_tab", "goto", "goto_url"]: 88 | kwargs["url"] = action["url"] 89 | function = "goto" if function == "goto_url" else function 90 | elif function in ["tab_focus", "page_focus"]: 91 | kwargs["page_number"] = action["page_number"] 92 | function = "tab_focus" 93 | elif function in ["go_back", "page_close", "go_forward"]: 94 | function = "tab_close" if function == "page_close" else function 95 | else: 96 | raise ValueError(f"Unknown function: {function}") 97 | 98 | return { 99 | "function": function, 100 | "kwargs": kwargs, 101 | "description": metadata.get("cot", ""), 102 | } 103 | 104 | def convert_to_dataset(self) -> None: 105 | """Convert WebArena data to our dataset format""" 106 | self.logger.info("Creating dataset...") 107 | 108 | # Initialize dataset 109 | dataset = MultiAgentDataset( 110 | name="WebArena Interactions", 111 | base_path=self.output_path, 112 | description="Web interaction trajectories from WebArena dataset", 113 | ) 114 | 115 | # Read trajectories 116 | with open(self.source_path / "trajectories.jsonl", "r") as f: 117 | for line in f: 118 | raw_traj = json.loads(line) 119 | 120 | # Skip blacklisted sources 121 | if raw_traj["source"] in ["SteP"]: 122 | continue 123 | 124 | # Create agent metadata 125 | agents_metadata = { 126 | "agent": AgentMetadata( 127 | agent_id="agent", 128 | agent_type="web_agent", 129 | capabilities=["navigation", "interaction"], 130 | parameters={"viewport_size": (1280, 720)}, 131 | ), 132 | "user": AgentMetadata( 133 | agent_id="user", 134 | agent_type="human", 135 | capabilities=["instruction"], 136 | ), 137 | } 138 | 139 | # Create instance 140 | instance_id = str(raw_traj["task_id"]) 141 | instance_metadata = { 142 | "task": raw_traj["intent"], 143 | "source_model": raw_traj["source"], 144 | } 145 | 146 | instance_id = dataset.create_instance( 147 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 148 | ) 149 | 150 | # Add initial task observation 151 | dataset.add_data_point( 152 | instance_id=instance_id, 153 | agent_id="user", 154 | timestamp=datetime.now(), # Using current time as original times not available 155 | point_type=PointType.ACTION, 156 | data={"text": raw_traj["intent"]}, 157 | media_type=MediaType.JSON, 158 | ) 159 | 160 | # Process trajectory elements 161 | for element in raw_traj["trajectory"]: 162 | timestamp = ( 163 | datetime.now() 164 | ) # Using current time as original times not available 165 | 166 | if "action" in element: 167 | # Convert action 168 | action_data = self._convert_action( 169 | element["action"], element.get("metadata", {}) 170 | ) 171 | 172 | dataset.add_data_point( 173 | instance_id=instance_id, 174 | agent_id="agent", 175 | timestamp=timestamp, 176 | point_type=PointType.ACTION, 177 | data=action_data, 178 | media_type=MediaType.JSON, 179 | ) 180 | 181 | elif "url" in element: 182 | # Add URL and HTML observation 183 | web_data = {"url": element["url"], "html": element["axtree"]} 184 | dataset.add_data_point( 185 | instance_id=instance_id, 186 | agent_id="agent", 187 | timestamp=timestamp, 188 | 
point_type=PointType.OBSERVATION, 189 | data=web_data, 190 | media_type=MediaType.JSON, 191 | ) 192 | 193 | # Add screenshot observation 194 | # screenshot_path = element["screenshot_path"].replace( 195 | # "demo_trajs/images/", str(self.screenshots_path) 196 | # ) 197 | # if os.path.exists(screenshot_path): 198 | # # Load and convert image to numpy array 199 | # image = Image.open(screenshot_path) 200 | # image_array = np.array(image) 201 | 202 | # dataset.add_data_point( 203 | # instance_id=instance_id, 204 | # agent_id="agent", 205 | # timestamp=timestamp, 206 | # point_type=PointType.OBSERVATION, 207 | # data=image_array, 208 | # media_type=MediaType.IMAGE, 209 | # metadata={"original_path": screenshot_path}, 210 | # ) 211 | else: 212 | self.logger.warning( 213 | f"Unknown element type in trajectory: {element}" 214 | ) 215 | 216 | self.logger.info("Dataset conversion complete!") 217 | dataset.close() 218 | 219 | 220 | if __name__ == "__main__": 221 | source_path = Path(".data/raw/webarena-nnetnav") 222 | output_path = Path(".data/webarena-nnetnav") 223 | 224 | run_converter(WebArenaConverter, output_path, source_path) 225 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/trajectory.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ConfigDict, Field 2 | from datetime import datetime 3 | from typing import Any, Optional, Union 4 | from typing_extensions import Self 5 | from enum import Enum 6 | from pathlib import Path 7 | import numpy as np 8 | import numpy.typing as npt 9 | import json 10 | 11 | 12 | class MediaType(str, Enum): 13 | """Types of media data supported""" 14 | 15 | IMAGE = "image" 16 | AUDIO = "audio" 17 | VIDEO = "video" 18 | TEXT = "text" 19 | NUMPY = "numpy" 20 | JSON = "json" 21 | 22 | 23 | class PointType(str, Enum): 24 | """Type of trajectory point""" 25 | 26 | OBSERVATION = "observation" 27 | ACTION = "action" 28 | 29 | 30 | class MediaReference(BaseModel): 31 | """Reference to media data stored on disk""" 32 | 33 | media_type: MediaType 34 | file_path: Path 35 | shape: tuple[int, ...] 
| None = None # Optional for JSON data 36 | dtype: Optional[str] = None # Optional for JSON data 37 | metadata: dict[str, Any] | None = None 38 | 39 | 40 | class MediaStorage: 41 | """Handles storage and retrieval of both media data and JSON content""" 42 | 43 | def __init__(self, base_path: Path): 44 | self.base_path = base_path 45 | self.base_path.mkdir(parents=True, exist_ok=True) 46 | self._json_path = self.base_path / "json_data" 47 | self._json_path.mkdir(exist_ok=True) 48 | self._numpy_path = self.base_path / "numpy_data" 49 | self._numpy_path.mkdir(exist_ok=True) 50 | 51 | def store_data( 52 | self, 53 | data: Union[npt.ArrayLike, dict[str, Any], str], 54 | media_type: MediaType, 55 | trajectory_id: str, 56 | timestamp: str, 57 | point_type: PointType, 58 | ) -> MediaReference: 59 | """Store either media data or JSON content""" 60 | if media_type == MediaType.JSON: 61 | assert isinstance( 62 | data, (dict, str) 63 | ), "JSON data must be a dictionary or string" 64 | return self._store_json(data, trajectory_id, timestamp, point_type) 65 | else: 66 | assert isinstance(data, np.ndarray), "Media data must be a NumPy" 67 | return self._store_numpy( 68 | data, media_type, trajectory_id, timestamp, point_type 69 | ) 70 | 71 | def _store_numpy( 72 | self, 73 | data: npt.NDArray[Any], 74 | media_type: MediaType, 75 | trajectory_id: str, 76 | timestamp: str, 77 | point_type: PointType, 78 | ) -> MediaReference: 79 | """Store media data in HDF5""" 80 | data_path = self._numpy_path / f"{trajectory_id}_{point_type}_{timestamp}.npy" 81 | 82 | np.save(data_path, data) 83 | 84 | return MediaReference( 85 | media_type=media_type, 86 | file_path=data_path.relative_to(self.base_path), 87 | shape=data.shape, 88 | dtype=str(data.dtype), 89 | ) 90 | 91 | def _store_json( 92 | self, 93 | data: dict[str, Any] | str, 94 | trajectory_id: str, 95 | timestamp: str, 96 | point_type: PointType, 97 | ) -> MediaReference: 98 | """Store JSON data""" 99 | json_file = self._json_path / f"{trajectory_id}_{point_type}_{timestamp}.json" 100 | 101 | with open(json_file, "w") as f: 102 | json.dump(data, f) 103 | 104 | return MediaReference( 105 | media_type=MediaType.JSON, 106 | file_path=json_file.relative_to(self.base_path), 107 | metadata={"timestamp": timestamp}, 108 | ) 109 | 110 | def load_data( 111 | self, reference: MediaReference 112 | ) -> npt.NDArray[Any] | dict[str, Any] | str: 113 | """Load either media or JSON data from reference""" 114 | if reference.media_type == MediaType.JSON: 115 | with open(self.base_path / reference.file_path, "r") as f: 116 | json_data = json.load(f) 117 | assert isinstance(json_data, (dict, str)), "Invalid JSON data" 118 | return json_data 119 | else: 120 | data_path = self.base_path / reference.file_path 121 | data = np.load(data_path) 122 | assert isinstance(data, np.ndarray), "Invalid NumPy data" 123 | return data 124 | 125 | def close(self) -> None: 126 | pass 127 | 128 | 129 | class TrajectoryPoint(BaseModel): 130 | """ 131 | Single point in a trajectory that can be either observation or action 132 | """ 133 | 134 | model_config = ConfigDict(arbitrary_types_allowed=True) 135 | 136 | timestamp: datetime 137 | agent_id: str 138 | point_type: PointType 139 | data_reference: MediaReference 140 | metadata: dict[str, Any] = Field(default_factory=dict) 141 | 142 | 143 | class SymmetricTrajectory: 144 | """Trajectory with symmetric handling of observations and actions""" 145 | 146 | def __init__(self, trajectory_id: str, storage_path: Path): 147 | self.trajectory_id = trajectory_id 
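# --- Illustrative note and usage sketch (not part of the original module;
# kept as a comment so the module itself is unchanged). MediaStorage, defined
# above, writes each payload to disk and returns a MediaReference pointing at
# it: JSON payloads land under <storage_path>/json_data/ and NumPy arrays
# under <storage_path>/numpy_data/, keyed by trajectory id, point type, and
# timestamp. A minimal, hypothetical round-trip looks like this:
#
#   from pathlib import Path
#
#   storage = MediaStorage(Path(".data/demo_storage"))
#   ref = storage.store_data(
#       data={"text": "click #submit"},
#       media_type=MediaType.JSON,
#       trajectory_id="demo_agent",
#       timestamp="2024-01-01T00:00:00",
#       point_type=PointType.ACTION,
#   )
#   assert storage.load_data(ref) == {"text": "click #submit"}
#
# SymmetricTrajectory below builds on exactly this mechanism: every
# add_point() call stores the payload through MediaStorage and records the
# returned MediaReference in points.json.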
148 | self.media_storage = MediaStorage(storage_path) 149 | self.points: list[TrajectoryPoint] = [] 150 | self.points_file = storage_path / "points.json" 151 | 152 | # Load points if they exist 153 | self._load_points() 154 | 155 | def _load_points(self) -> None: 156 | """Load trajectory points from disk""" 157 | if self.points_file.exists(): 158 | try: 159 | with open(self.points_file, "r") as f: 160 | points_data = json.load(f) 161 | self.points = [ 162 | TrajectoryPoint( 163 | timestamp=datetime.fromisoformat(p["timestamp"]), 164 | agent_id=p["agent_id"], 165 | point_type=PointType(p["point_type"]), 166 | data_reference=MediaReference.model_validate_json( 167 | p["data_reference"] 168 | ), 169 | metadata=p.get("metadata", {}), 170 | ) 171 | for p in points_data 172 | ] 173 | except Exception as e: 174 | print(f"Error loading points: {e}") 175 | 176 | def _save_points(self) -> None: 177 | """Save trajectory points to disk""" 178 | points_data = [ 179 | { 180 | "timestamp": p.timestamp.isoformat(), 181 | "agent_id": p.agent_id, 182 | "point_type": p.point_type.value, 183 | "data_reference": p.data_reference.model_dump_json(), 184 | "metadata": p.metadata, 185 | } 186 | for p in self.points 187 | ] 188 | 189 | with open(self.points_file, "w") as f: 190 | json.dump(points_data, f, indent=2) 191 | 192 | def add_point( 193 | self, 194 | timestamp: datetime, 195 | agent_id: str, 196 | point_type: PointType, 197 | data: npt.NDArray[Any] | dict[str, Any] | str, 198 | media_type: MediaType, 199 | metadata: dict[str, Any] | None = None, 200 | ) -> None: 201 | """Add either observation or action point""" 202 | data_reference = self.media_storage.store_data( 203 | data=data, 204 | media_type=media_type, 205 | trajectory_id=self.trajectory_id, 206 | timestamp=timestamp.isoformat(), 207 | point_type=point_type, 208 | ) 209 | 210 | point = TrajectoryPoint( 211 | timestamp=timestamp, 212 | agent_id=agent_id, 213 | point_type=point_type, 214 | data_reference=data_reference, 215 | metadata=metadata or {}, 216 | ) 217 | 218 | self.points.append(point) 219 | self._save_points() # Save after each addition 220 | 221 | def get_data_at(self, index: int) -> npt.NDArray[Any] | dict[str, Any] | str: 222 | """Load data for a specific trajectory point""" 223 | point = self.points[index] 224 | return self.media_storage.load_data(point.data_reference) 225 | 226 | def get_points_by_type(self, point_type: PointType) -> list[TrajectoryPoint]: 227 | """Get all points of a specific type""" 228 | return [p for p in self.points if p.point_type == point_type] 229 | 230 | def get_points_by_agent(self, agent_id: str) -> list[TrajectoryPoint]: 231 | """Get all points for a specific agent""" 232 | return [p for p in self.points if p.agent_id == agent_id] 233 | 234 | def get_points_in_timerange( 235 | self, start_time: datetime, end_time: datetime 236 | ) -> list[TrajectoryPoint]: 237 | """Get points within a time range""" 238 | return [p for p in self.points if start_time <= p.timestamp <= end_time] 239 | 240 | def close(self) -> None: 241 | """Close media storage and ensure points are saved""" 242 | self._save_points() 243 | self.media_storage.close() 244 | 245 | def __enter__(self) -> Self: 246 | return self 247 | 248 | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: 249 | self.close() 250 | 251 | 252 | def render_trajectory(trajectory: SymmetricTrajectory) -> list[dict[str, Any]]: 253 | """Render a trajectory as a list of dictionaries""" 254 | return [ 255 | { 256 | "timestamp": p.timestamp.isoformat(), 
257 | "agent_id": p.agent_id, 258 | "point_type": p.point_type, 259 | "data": trajectory.get_data_at(i), 260 | "metadata": p.metadata, 261 | } 262 | for i, p in enumerate(trajectory.points) 263 | ] 264 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/annotation.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pydantic import BaseModel, Field 3 | from typing import Optional, Any 4 | from pathlib import Path 5 | from datetime import datetime 6 | import yaml 7 | from uuid import uuid4 8 | 9 | 10 | class Annotator(BaseModel): 11 | """Information about an annotator""" 12 | 13 | annotator_id: str 14 | name: str 15 | role: Optional[str] = None 16 | expertise_level: Optional[str] = None 17 | metadata: dict[str, Any] = Field(default_factory=dict) 18 | 19 | 20 | class AnnotationSpan(BaseModel): 21 | """Represents a span of trajectory points being annotated""" 22 | 23 | start_time: datetime 24 | end_time: Optional[datetime] = None 25 | point_indices: Optional[list[int]] = None 26 | 27 | 28 | class Annotation(BaseModel): 29 | """Single annotation entry""" 30 | 31 | annotation_id: str = Field(default_factory=lambda: str(uuid4())) 32 | annotator_id: str 33 | created_at: datetime = Field(default_factory=datetime.now) 34 | updated_at: datetime = Field(default_factory=datetime.now) 35 | content: dict[str, Any] # Flexible annotation content 36 | span: Optional[AnnotationSpan] = None 37 | confidence: Optional[float] = None 38 | metadata: dict[str, Any] = Field(default_factory=dict) 39 | 40 | 41 | class TrajectoryAnnotations(BaseModel): 42 | """Collection of annotations for a specific trajectory""" 43 | 44 | instance_id: str 45 | agent_id: str 46 | annotations: list[Annotation] = Field(default_factory=list) 47 | metadata: dict[str, Any] = Field(default_factory=dict) 48 | 49 | 50 | class AnnotationProject(BaseModel): 51 | """Metadata for an annotation project""" 52 | 53 | project_id: str 54 | name: str 55 | description: str 56 | annotation_schema: dict[str, Any] # Defines the expected annotation structure 57 | guidelines: Optional[str] = None 58 | created_at: datetime = Field(default_factory=datetime.now) 59 | updated_at: datetime = Field(default_factory=datetime.now) 60 | annotators: dict[str, Annotator] = Field(default_factory=dict) 61 | metadata: dict[str, Any] = Field(default_factory=dict) 62 | 63 | 64 | class AnnotationSystem: 65 | """ 66 | System for managing annotations separate from but linked to the dataset 67 | """ 68 | 69 | def __init__( 70 | self, 71 | base_path: Path | str, 72 | project_name: str | None = None, 73 | description: str = "", 74 | annotation_schema: Optional[dict[str, Any]] = None, 75 | ): 76 | self.base_path = Path(base_path) 77 | self.annotations_path = self.base_path / "annotations" 78 | self.project_path = self.base_path / "project.yaml" 79 | 80 | # Initialize directory structure 81 | self.base_path.mkdir(parents=True, exist_ok=True) 82 | self.annotations_path.mkdir(exist_ok=True) 83 | 84 | # Initialize or load project metadata 85 | self.project = self._init_project( 86 | project_name, description, annotation_schema or {} 87 | ) 88 | 89 | def _init_project( 90 | self, 91 | name: str | None, 92 | description: str, 93 | annotation_schema: dict[str, Any], 94 | ) -> AnnotationProject: 95 | """Initialize or load project metadata""" 96 | if self.project_path.exists(): 97 | with open(self.project_path, "r") as f: 98 | project_dict = yaml.safe_load(f) 99 | return 
AnnotationProject(**project_dict) 100 | else: 101 | if not name: 102 | raise ValueError("Project name is required") 103 | project = AnnotationProject( 104 | project_id=str(uuid4()), 105 | name=name, 106 | description=description, 107 | annotation_schema=annotation_schema, 108 | ) 109 | self._save_project(project) 110 | return project 111 | 112 | def _save_project(self, project: AnnotationProject) -> None: 113 | """Save project metadata to disk""" 114 | with open(self.project_path, "w") as f: 115 | yaml.dump(json.loads(project.model_dump_json()), f) 116 | 117 | def add_annotator( 118 | self, 119 | annotator_id: str, 120 | name: str, 121 | role: str | None = None, 122 | expertise_level: str | None = None, 123 | metadata: dict[str, Any] | None = None, 124 | ) -> None: 125 | """Register a new annotator""" 126 | annotator = Annotator( 127 | annotator_id=annotator_id, 128 | name=name, 129 | role=role, 130 | expertise_level=expertise_level, 131 | metadata=metadata or {}, 132 | ) 133 | self.project.annotators[annotator_id] = annotator 134 | self._save_project(self.project) 135 | 136 | def _get_trajectory_annotation_path(self, instance_id: str, agent_id: str) -> Path: 137 | """Get path for trajectory annotations""" 138 | return self.annotations_path / f"{instance_id}_{agent_id}.json" 139 | 140 | def get_trajectory_annotations( 141 | self, instance_id: str, agent_id: str 142 | ) -> TrajectoryAnnotations: 143 | """Get all annotations for a specific trajectory""" 144 | annotation_path = self._get_trajectory_annotation_path(instance_id, agent_id) 145 | if annotation_path.exists(): 146 | with open(annotation_path, "r") as f: 147 | return TrajectoryAnnotations.model_validate_json(f.read()) 148 | return TrajectoryAnnotations(instance_id=instance_id, agent_id=agent_id) 149 | 150 | def add_annotation( 151 | self, 152 | instance_id: str, 153 | agent_id: str, 154 | annotator_id: str, 155 | content: dict[str, Any], 156 | span: Optional[AnnotationSpan] = None, 157 | confidence: Optional[float] = None, 158 | metadata: Optional[dict[str, Any]] = None, 159 | ) -> str: 160 | """ 161 | Add a new annotation to a trajectory 162 | 163 | Args: 164 | instance_id: ID of the dataset instance 165 | agent_id: ID of the agent 166 | annotator_id: ID of the annotator 167 | content: The annotation content 168 | span: Optional time span or point indices being annotated 169 | confidence: Optional confidence score 170 | metadata: Optional additional metadata 171 | 172 | Returns: 173 | annotation_id: ID of the created annotation 174 | """ 175 | if annotator_id not in self.project.annotators: 176 | raise ValueError(f"Unknown annotator: {annotator_id}") 177 | 178 | # Create new annotation 179 | annotation = Annotation( 180 | annotator_id=annotator_id, 181 | content=content, 182 | span=span, 183 | confidence=confidence, 184 | metadata=metadata or {}, 185 | ) 186 | 187 | # Add to trajectory annotations 188 | trajectory_annotations = self.get_trajectory_annotations(instance_id, agent_id) 189 | trajectory_annotations.annotations.append(annotation) 190 | 191 | # Save to disk 192 | annotation_path = self._get_trajectory_annotation_path(instance_id, agent_id) 193 | with open(annotation_path, "w") as f: 194 | f.write(trajectory_annotations.model_dump_json()) 195 | 196 | return annotation.annotation_id 197 | 198 | def get_annotator_annotations( 199 | self, annotator_id: str 200 | ) -> dict[str, list[Annotation]]: 201 | """Get all annotations by a specific annotator""" 202 | annotations = {} 203 | for annotation_file in 
self.annotations_path.glob("*.json"): 204 | with open(annotation_file, "r") as f: 205 | trajectory_annotations = TrajectoryAnnotations.model_validate_json( 206 | f.read() 207 | ) 208 | 209 | # Filter annotations by annotator 210 | annotator_anns = [ 211 | ann 212 | for ann in trajectory_annotations.annotations 213 | if ann.annotator_id == annotator_id 214 | ] 215 | 216 | if annotator_anns: 217 | key = f"{trajectory_annotations.instance_id}_{trajectory_annotations.agent_id}" 218 | annotations[key] = annotator_anns 219 | 220 | return annotations 221 | 222 | def get_annotations_by_time( 223 | self, start_time: datetime, end_time: Optional[datetime] = None 224 | ) -> dict[str, list[Annotation]]: 225 | """Get annotations within a time range""" 226 | annotations = {} 227 | for annotation_file in self.annotations_path.glob("*.json"): 228 | with open(annotation_file, "r") as f: 229 | trajectory_annotations = TrajectoryAnnotations.model_validate_json( 230 | f.read() 231 | ) 232 | 233 | # Filter annotations by time 234 | time_anns = [ 235 | ann 236 | for ann in trajectory_annotations.annotations 237 | if ann.span 238 | and (not start_time or ann.span.start_time >= start_time) 239 | and ( 240 | not end_time 241 | or not ann.span.end_time 242 | or ann.span.end_time <= end_time 243 | ) 244 | ] 245 | 246 | if time_anns: 247 | key = f"{trajectory_annotations.instance_id}_{trajectory_annotations.agent_id}" 248 | annotations[key] = time_anns 249 | 250 | return annotations 251 | 252 | def get_all_annotations(self) -> dict[str, list[Annotation]]: 253 | """Get all annotations""" 254 | annotations = {} 255 | for annotation_file in self.annotations_path.glob("*.json"): 256 | with open(annotation_file, "r") as f: 257 | trajectory_annotations = TrajectoryAnnotations.model_validate_json( 258 | f.read() 259 | ) 260 | 261 | key = f"{trajectory_annotations.instance_id}_{trajectory_annotations.agent_id}" 262 | annotations[key] = trajectory_annotations.annotations 263 | 264 | return annotations 265 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/webarena.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import zipfile 3 | import json 4 | import os 5 | from pathlib import Path 6 | from datetime import datetime 7 | from typing import Any 8 | import numpy as np 9 | from PIL import Image 10 | 11 | # Import our dataset classes 12 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 13 | 14 | from .base import BaseConverter, run_converter 15 | 16 | 17 | class WebArenaConverter(BaseConverter): 18 | """Handles downloading and converting WebArena data to our dataset format""" 19 | 20 | def __init__(self, output_path: Path, source_path: Path): 21 | super().__init__(output_path, source_path) 22 | self.screenshots_path = self.source_path / "screenshots" 23 | 24 | def _setup_constants(self) -> None: 25 | """Setup WebArena-specific constants""" 26 | self.SPECIAL_KEYS = [ 27 | "Enter", 28 | "Tab", 29 | "Control", 30 | "Shift", 31 | "Meta", 32 | "Backspace", 33 | "Delete", 34 | "Escape", 35 | "ArrowUp", 36 | "ArrowDown", 37 | "ArrowLeft", 38 | "ArrowRight", 39 | "PageDown", 40 | "PageUp", 41 | "Meta+a", 42 | ] 43 | self.ASCII_CHARSET = "".join(chr(x) for x in range(32, 128)) 44 | self.FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 1000)) 45 | self._id2key = ( 46 | self.SPECIAL_KEYS 47 | + list(self.ASCII_CHARSET) 48 | + list(self.FREQ_UNICODE_CHARSET) 
49 | + ["\n"] 50 | ) 51 | 52 | def download_data(self) -> None: 53 | """Download WebArena dataset files""" 54 | self.source_path.mkdir(parents=True, exist_ok=True) 55 | 56 | # Download trajectory file 57 | if not (self.source_path / "trajectories.jsonl").exists(): 58 | self.logger.info("Downloading trajectory file...") 59 | traj_id = "1tvnaklsdSLx4Sp9Uc1spopcFpLktStO8" 60 | subprocess.run( 61 | ["gdown", traj_id, "-O", str(self.source_path / "trajectories.jsonl")], 62 | check=True, 63 | ) 64 | 65 | # Download and extract screenshots 66 | if not self.screenshots_path.exists(): 67 | self.logger.info("Downloading screenshots...") 68 | screenshots_id = "1TNfhApmiEIxiOcUqi4duvVWBaH5_m3By" 69 | zip_path = self.source_path / "screenshots.zip" 70 | 71 | subprocess.run(["gdown", screenshots_id, "-O", str(zip_path)], check=True) 72 | 73 | self.logger.info("Extracting screenshots...") 74 | with zipfile.ZipFile(zip_path, "r") as zip_ref: 75 | zip_ref.extractall(self.source_path) 76 | 77 | # Rename images directory to screenshots 78 | images_path = self.source_path / "images" 79 | if images_path.exists(): 80 | images_path.rename(self.screenshots_path) 81 | 82 | # Cleanup zip file 83 | zip_path.unlink() 84 | 85 | def _convert_action( 86 | self, action: dict[str, Any], metadata: dict[str, Any] 87 | ) -> dict[str, Any]: 88 | """Convert WebArena action to our format""" 89 | function = action["action_name"] 90 | kwargs = {} 91 | 92 | if function == "stop": 93 | kwargs["answer"] = action.get("answer", "") 94 | elif function == "type": 95 | # text_indices = action["text"] 96 | # kwargs["text"] = ''.join([ 97 | # self._id2key[i] 98 | # for i in text_indices 99 | # if isinstance(i, int) and i < len(self._id2key) and i >= len(self.SPECIAL_KEYS) 100 | # ]) 101 | kwargs["text"] = action["text"] 102 | kwargs["element_id"] = action["element_id"] 103 | elif function in ["hover", "click"]: 104 | kwargs["element_id"] = action["element_id"] 105 | elif function == "scroll": 106 | kwargs["dx"] = 0 107 | kwargs["dy"] = 100 if action["direction"].lower() == "down" else -100 108 | elif function in ["key_press", "press"]: 109 | kwargs["key_comb"] = action["key_comb"] 110 | function = "press" 111 | elif function in ["new_tab", "goto", "goto_url"]: 112 | kwargs["url"] = action["url"] 113 | function = "goto" if function == "goto_url" else function 114 | elif function in ["tab_focus", "page_focus"]: 115 | kwargs["page_number"] = action["page_number"] 116 | function = "tab_focus" 117 | elif function in ["go_back", "page_close", "go_forward"]: 118 | function = "tab_close" if function == "page_close" else function 119 | else: 120 | raise ValueError(f"Unknown function: {function}") 121 | 122 | return { 123 | "function": function, 124 | "kwargs": kwargs, 125 | "description": metadata.get("cot", ""), 126 | } 127 | 128 | def convert_to_dataset(self) -> None: 129 | """Convert WebArena data to our dataset format""" 130 | self.logger.info("Creating dataset...") 131 | 132 | # Initialize dataset 133 | dataset = MultiAgentDataset( 134 | name="WebArena Interactions", 135 | base_path=self.output_path, 136 | description="Web interaction trajectories from WebArena dataset", 137 | ) 138 | 139 | # Read trajectories 140 | with open(self.source_path / "trajectories.jsonl", "r") as f: 141 | for line in f: 142 | raw_traj = json.loads(line) 143 | 144 | # Skip blacklisted sources 145 | if raw_traj["source"] in ["SteP"]: 146 | continue 147 | 148 | # Create agent metadata 149 | agents_metadata = { 150 | "agent": AgentMetadata( 151 | agent_id="agent", 152 | 
agent_type="web_agent", 153 | capabilities=["navigation", "interaction"], 154 | parameters={"viewport_size": (1280, 720)}, 155 | ), 156 | "user": AgentMetadata( 157 | agent_id="user", 158 | agent_type="human", 159 | capabilities=["instruction"], 160 | ), 161 | } 162 | 163 | # Create instance 164 | instance_id = str(raw_traj["task_id"]) 165 | instance_metadata = { 166 | "task": raw_traj["intent"], 167 | "source_model": raw_traj["source"], 168 | } 169 | 170 | instance_id = dataset.create_instance( 171 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 172 | ) 173 | 174 | # Add initial task observation 175 | dataset.add_data_point( 176 | instance_id=instance_id, 177 | agent_id="user", 178 | timestamp=datetime.now(), # Using current time as original times not available 179 | point_type=PointType.ACTION, 180 | data={"text": raw_traj["intent"]}, 181 | media_type=MediaType.JSON, 182 | ) 183 | 184 | # Process trajectory elements 185 | for element in raw_traj["trajectory"]: 186 | timestamp = ( 187 | datetime.now() 188 | ) # Using current time as original times not available 189 | 190 | if "action" in element: 191 | # Convert action 192 | action_data = self._convert_action( 193 | element["action"], element.get("metadata", {}) 194 | ) 195 | 196 | dataset.add_data_point( 197 | instance_id=instance_id, 198 | agent_id="agent", 199 | timestamp=timestamp, 200 | point_type=PointType.ACTION, 201 | data=action_data, 202 | media_type=MediaType.JSON, 203 | ) 204 | 205 | elif "url" in element: 206 | # Add URL and HTML observation 207 | web_data = {"url": element["url"], "html": element["axtree"]} 208 | dataset.add_data_point( 209 | instance_id=instance_id, 210 | agent_id="agent", 211 | timestamp=timestamp, 212 | point_type=PointType.OBSERVATION, 213 | data=web_data, 214 | media_type=MediaType.JSON, 215 | ) 216 | 217 | # Add screenshot observation 218 | screenshot_path = element["screenshot_path"].replace( 219 | "demo_trajs/images/", str(self.screenshots_path) 220 | ) 221 | if os.path.exists(screenshot_path): 222 | # Load and convert image to numpy array 223 | image = Image.open(screenshot_path) 224 | image_array = np.array(image) 225 | 226 | dataset.add_data_point( 227 | instance_id=instance_id, 228 | agent_id="agent", 229 | timestamp=timestamp, 230 | point_type=PointType.OBSERVATION, 231 | data=image_array, 232 | media_type=MediaType.IMAGE, 233 | metadata={"original_path": screenshot_path}, 234 | ) 235 | else: 236 | self.logger.warning( 237 | f"Unknown element type in trajectory: {element}" 238 | ) 239 | 240 | self.logger.info("Dataset conversion complete!") 241 | dataset.close() 242 | 243 | 244 | if __name__ == "__main__": 245 | source_path = Path(".data/raw/webarena") 246 | output_path = Path(".data/webarena") 247 | 248 | run_converter(WebArenaConverter, output_path, source_path) 249 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/webvoyager_nnetnav_best.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from datetime import datetime 5 | from typing import Any 6 | import numpy as np 7 | from PIL import Image 8 | 9 | # Import our dataset classes 10 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 11 | from osw_data.annotation import AnnotationSystem 12 | 13 | from .base import BaseConverter, run_converter 14 | 15 | 16 | class WebVoyagerNNetNavConverter(BaseConverter): 17 | 
"""Handles downloading and converting WebArena data to our dataset format""" 18 | 19 | def __init__( 20 | self, output_path: Path, source_path: Path, annotation_path: Path | None = None 21 | ) -> None: 22 | super().__init__(output_path, source_path) 23 | self.screenshots_path = self.source_path / "screenshots" 24 | self.annotation_path = annotation_path 25 | 26 | def _setup_constants(self) -> None: 27 | """Setup WebArena-specific constants""" 28 | self.SPECIAL_KEYS = [ 29 | "Enter", 30 | "Tab", 31 | "Control", 32 | "Shift", 33 | "Meta", 34 | "Backspace", 35 | "Delete", 36 | "Escape", 37 | "ArrowUp", 38 | "ArrowDown", 39 | "ArrowLeft", 40 | "ArrowRight", 41 | "PageDown", 42 | "PageUp", 43 | "Meta+a", 44 | ] 45 | self.ASCII_CHARSET = "".join(chr(x) for x in range(32, 128)) 46 | self.FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 1000)) 47 | self._id2key = ( 48 | self.SPECIAL_KEYS 49 | + list(self.ASCII_CHARSET) 50 | + list(self.FREQ_UNICODE_CHARSET) 51 | + ["\n"] 52 | ) 53 | 54 | def download_data(self) -> None: 55 | """Download WebArena dataset files""" 56 | pass 57 | 58 | def _convert_action( 59 | self, action: dict[str, Any], metadata: dict[str, Any] 60 | ) -> dict[str, Any]: 61 | """Convert WebArena action to our format""" 62 | function = action["action_name"] 63 | kwargs = {} 64 | 65 | if function == "stop": 66 | kwargs["answer"] = action.get("answer", "") 67 | elif function == "type": 68 | # text_indices = action["text"] 69 | # kwargs["text"] = ''.join([ 70 | # self._id2key[i] 71 | # for i in text_indices 72 | # if isinstance(i, int) and i < len(self._id2key) and i >= len(self.SPECIAL_KEYS) 73 | # ]) 74 | kwargs["text"] = action["text"] 75 | kwargs["element_id"] = action["element_id"] 76 | elif function in ["hover", "click"]: 77 | kwargs["element_id"] = action["element_id"] 78 | elif function == "scroll": 79 | kwargs["dx"] = 0 80 | kwargs["dy"] = 100 if action["direction"].lower() == "down" else -100 81 | elif function in ["key_press", "press"]: 82 | kwargs["key_comb"] = action["key_comb"] 83 | function = "press" 84 | elif function in ["new_tab", "goto", "goto_url"]: 85 | kwargs["url"] = action["url"] 86 | function = "goto" if function == "goto_url" else function 87 | elif function in ["tab_focus", "page_focus"]: 88 | kwargs["page_number"] = action["page_number"] 89 | function = "tab_focus" 90 | elif function in ["go_back", "page_close", "go_forward"]: 91 | function = "tab_close" if function == "page_close" else function 92 | else: 93 | raise ValueError(f"Unknown function: {function}") 94 | 95 | return { 96 | "function": function, 97 | "kwargs": kwargs, 98 | "description": metadata.get("cot", ""), 99 | } 100 | 101 | def convert_to_dataset(self) -> None: 102 | """Convert WebArena data to our dataset format""" 103 | self.logger.info("Creating dataset...") 104 | 105 | # Initialize dataset 106 | dataset = MultiAgentDataset( 107 | name="WebArena Interactions", 108 | base_path=self.output_path, 109 | description="Web interaction trajectories from WebArena dataset", 110 | ) 111 | 112 | if self.annotation_path: 113 | annotation_system = AnnotationSystem( 114 | base_path=self.annotation_path, 115 | project_name="WebVoyager Annotations", 116 | description="Free-form text annotations of agent trajectories for WebVoyager", 117 | annotation_schema={ 118 | "feedback": { 119 | "type": "string", 120 | "description": "Free-form text feedback on the trajectory", 121 | } 122 | }, 123 | ) 124 | 125 | task_id2instance_id: dict[str, str] = {} 126 | 127 | # Read trajectories 128 | with 
open(self.source_path / "trajectories.jsonl", "r") as f: 129 | for line in f: 130 | raw_traj = json.loads(line) 131 | 132 | # Skip blacklisted sources 133 | if raw_traj["source"] in ["SteP"]: 134 | continue 135 | 136 | # Create agent metadata 137 | agents_metadata = { 138 | "agent": AgentMetadata( 139 | agent_id="agent", 140 | agent_type="web_agent", 141 | capabilities=["navigation", "interaction"], 142 | parameters={"viewport_size": (1280, 720)}, 143 | ), 144 | "user": AgentMetadata( 145 | agent_id="user", 146 | agent_type="human", 147 | capabilities=["instruction"], 148 | ), 149 | } 150 | 151 | # Create instance 152 | instance_id = str(raw_traj["task_id"]) 153 | instance_metadata = { 154 | "task": raw_traj["intent"], 155 | "source_model": raw_traj["source"], 156 | } 157 | 158 | instance_id = dataset.create_instance( 159 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 160 | ) 161 | 162 | task_id2instance_id[raw_traj["task_id"]] = instance_id 163 | 164 | # Add initial task observation 165 | dataset.add_data_point( 166 | instance_id=instance_id, 167 | agent_id="user", 168 | timestamp=datetime.now(), # Using current time as original times not available 169 | point_type=PointType.ACTION, 170 | data={"text": raw_traj["intent"]}, 171 | media_type=MediaType.JSON, 172 | ) 173 | 174 | # Process trajectory elements 175 | for element in raw_traj["trajectory"]: 176 | timestamp = ( 177 | datetime.now() 178 | ) # Using current time as original times not available 179 | 180 | if "action" in element: 181 | # Convert action 182 | action_data = self._convert_action( 183 | element["action"], element.get("metadata", {}) 184 | ) 185 | 186 | dataset.add_data_point( 187 | instance_id=instance_id, 188 | agent_id="agent", 189 | timestamp=timestamp, 190 | point_type=PointType.ACTION, 191 | data=action_data, 192 | media_type=MediaType.JSON, 193 | ) 194 | 195 | elif "url" in element: 196 | # Add URL and HTML observation 197 | web_data = {"url": element["url"], "html": element["axtree"]} 198 | dataset.add_data_point( 199 | instance_id=instance_id, 200 | agent_id="agent", 201 | timestamp=timestamp, 202 | point_type=PointType.OBSERVATION, 203 | data=web_data, 204 | media_type=MediaType.JSON, 205 | ) 206 | 207 | # Add screenshot observation 208 | screenshot_path = element["screenshot_path"].replace( 209 | "demo_trajs/images/", str(self.screenshots_path) 210 | ) 211 | if os.path.exists(screenshot_path): 212 | # Load and convert image to numpy array 213 | image = Image.open(screenshot_path) 214 | image_array = np.array(image) 215 | 216 | dataset.add_data_point( 217 | instance_id=instance_id, 218 | agent_id="agent", 219 | timestamp=timestamp, 220 | point_type=PointType.OBSERVATION, 221 | data=image_array, 222 | media_type=MediaType.IMAGE, 223 | metadata={"original_path": screenshot_path}, 224 | ) 225 | else: 226 | self.logger.warning( 227 | f"Unknown element type in trajectory: {element}" 228 | ) 229 | 230 | if self.annotation_path: 231 | annotation_system.add_annotator( 232 | annotator_id="Shikhar", 233 | name="Shikhar Murty", 234 | ) 235 | with open(self.source_path / "feedback.json", "r") as f: 236 | task_id2feedback = json.load(f) 237 | for task_id in task_id2feedback: 238 | instance_id_or_none = task_id2instance_id.get(task_id) 239 | if instance_id_or_none: 240 | annotation_system.add_annotation( 241 | instance_id=instance_id_or_none, 242 | agent_id="agent", 243 | content={"feedback": task_id2feedback[task_id]}, 244 | annotator_id="Shikhar", 245 | ) 246 | self.logger.info("Dataset conversion 
complete!") 247 | dataset.close() 248 | 249 | 250 | if __name__ == "__main__": 251 | source_path = Path(".data/raw/webvoyager-nnetnav-best") 252 | output_path = Path(".data/webvoyager-nnetnav-best") 253 | 254 | run_converter( 255 | WebVoyagerNNetNavConverter, 256 | output_path, 257 | source_path, 258 | ) 259 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/webvoyager_nnetnav.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from datetime import datetime 5 | from typing import Any 6 | import numpy as np 7 | from PIL import Image 8 | 9 | # Import our dataset classes 10 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 11 | from osw_data.annotation import AnnotationSystem 12 | 13 | from .base import BaseConverter, run_converter 14 | 15 | 16 | class WebVoyagerNNetNavConverter(BaseConverter): 17 | """Handles downloading and converting WebArena data to our dataset format""" 18 | 19 | def __init__( 20 | self, output_path: Path, source_path: Path, annotation_path: Path | None = None 21 | ) -> None: 22 | super().__init__(output_path, source_path) 23 | self.screenshots_path = self.source_path / "screenshots" 24 | self.annotation_path = annotation_path 25 | 26 | def _setup_constants(self) -> None: 27 | """Setup WebArena-specific constants""" 28 | self.SPECIAL_KEYS = [ 29 | "Enter", 30 | "Tab", 31 | "Control", 32 | "Shift", 33 | "Meta", 34 | "Backspace", 35 | "Delete", 36 | "Escape", 37 | "ArrowUp", 38 | "ArrowDown", 39 | "ArrowLeft", 40 | "ArrowRight", 41 | "PageDown", 42 | "PageUp", 43 | "Meta+a", 44 | ] 45 | self.ASCII_CHARSET = "".join(chr(x) for x in range(32, 128)) 46 | self.FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 1000)) 47 | self._id2key = ( 48 | self.SPECIAL_KEYS 49 | + list(self.ASCII_CHARSET) 50 | + list(self.FREQ_UNICODE_CHARSET) 51 | + ["\n"] 52 | ) 53 | 54 | def download_data(self) -> None: 55 | """Download WebArena dataset files""" 56 | pass 57 | 58 | def _convert_action( 59 | self, action: dict[str, Any], metadata: dict[str, Any] 60 | ) -> dict[str, Any]: 61 | """Convert WebArena action to our format""" 62 | function = action["action_name"] 63 | kwargs = {} 64 | 65 | if function == "stop": 66 | kwargs["answer"] = action.get("answer", "") 67 | elif function == "type": 68 | # text_indices = action["text"] 69 | # kwargs["text"] = ''.join([ 70 | # self._id2key[i] 71 | # for i in text_indices 72 | # if isinstance(i, int) and i < len(self._id2key) and i >= len(self.SPECIAL_KEYS) 73 | # ]) 74 | kwargs["text"] = action["text"] 75 | kwargs["element_id"] = action["element_id"] 76 | elif function in ["hover", "click"]: 77 | kwargs["element_id"] = action["element_id"] 78 | elif function == "scroll": 79 | kwargs["dx"] = 0 80 | kwargs["dy"] = 100 if action["direction"].lower() == "down" else -100 81 | elif function in ["key_press", "press"]: 82 | kwargs["key_comb"] = action["key_comb"] 83 | function = "press" 84 | elif function in ["new_tab", "goto", "goto_url"]: 85 | kwargs["url"] = action["url"] 86 | function = "goto" if function == "goto_url" else function 87 | elif function in ["tab_focus", "page_focus"]: 88 | kwargs["page_number"] = action["page_number"] 89 | function = "tab_focus" 90 | elif function in ["go_back", "page_close", "go_forward"]: 91 | function = "tab_close" if function == "page_close" else function 92 | else: 93 | raise ValueError(f"Unknown function: 
{function}") 94 | 95 | return { 96 | "function": function, 97 | "kwargs": kwargs, 98 | "description": metadata.get("cot", ""), 99 | } 100 | 101 | def convert_to_dataset(self) -> None: 102 | """Convert WebArena data to our dataset format""" 103 | self.logger.info("Creating dataset...") 104 | 105 | # Initialize dataset 106 | dataset = MultiAgentDataset( 107 | name="WebArena Interactions", 108 | base_path=self.output_path, 109 | description="Web interaction trajectories from WebArena dataset", 110 | ) 111 | 112 | task_id2instance_id: dict[str, str] = {} 113 | 114 | # Read trajectories 115 | with open(self.source_path / "trajectories.jsonl", "r") as f: 116 | for line in f: 117 | raw_traj = json.loads(line) 118 | 119 | # Skip blacklisted sources 120 | if raw_traj["source"] in ["SteP"]: 121 | continue 122 | 123 | # Create agent metadata 124 | agents_metadata = { 125 | "agent": AgentMetadata( 126 | agent_id="agent", 127 | agent_type="web_agent", 128 | capabilities=["navigation", "interaction"], 129 | parameters={"viewport_size": (1280, 720)}, 130 | ), 131 | "user": AgentMetadata( 132 | agent_id="user", 133 | agent_type="human", 134 | capabilities=["instruction"], 135 | ), 136 | } 137 | 138 | # Create instance 139 | instance_id = str(raw_traj["task_id"]) 140 | instance_metadata = { 141 | "task": raw_traj["intent"], 142 | "source_model": raw_traj["source"], 143 | } 144 | 145 | instance_id = dataset.create_instance( 146 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 147 | ) 148 | 149 | task_id2instance_id[raw_traj["task_id"]] = instance_id 150 | 151 | # Add initial task observation 152 | dataset.add_data_point( 153 | instance_id=instance_id, 154 | agent_id="user", 155 | timestamp=datetime.now(), # Using current time as original times not available 156 | point_type=PointType.ACTION, 157 | data={"text": raw_traj["intent"]}, 158 | media_type=MediaType.JSON, 159 | ) 160 | 161 | # Process trajectory elements 162 | for element in raw_traj["trajectory"]: 163 | timestamp = ( 164 | datetime.now() 165 | ) # Using current time as original times not available 166 | 167 | if "action" in element: 168 | # Convert action 169 | action_data = self._convert_action( 170 | element["action"], element.get("metadata", {}) 171 | ) 172 | 173 | dataset.add_data_point( 174 | instance_id=instance_id, 175 | agent_id="agent", 176 | timestamp=timestamp, 177 | point_type=PointType.ACTION, 178 | data=action_data, 179 | media_type=MediaType.JSON, 180 | ) 181 | 182 | elif "url" in element: 183 | # Add URL and HTML observation 184 | web_data = {"url": element["url"], "html": element["axtree"]} 185 | dataset.add_data_point( 186 | instance_id=instance_id, 187 | agent_id="agent", 188 | timestamp=timestamp, 189 | point_type=PointType.OBSERVATION, 190 | data=web_data, 191 | media_type=MediaType.JSON, 192 | ) 193 | 194 | # Add screenshot observation 195 | screenshot_path = element["screenshot_path"].replace( 196 | "demo_trajs/images/", str(self.screenshots_path) 197 | ) 198 | if os.path.exists(screenshot_path): 199 | # Load and convert image to numpy array 200 | image = Image.open(screenshot_path) 201 | image_array = np.array(image) 202 | 203 | dataset.add_data_point( 204 | instance_id=instance_id, 205 | agent_id="agent", 206 | timestamp=timestamp, 207 | point_type=PointType.OBSERVATION, 208 | data=image_array, 209 | media_type=MediaType.IMAGE, 210 | metadata={"original_path": screenshot_path}, 211 | ) 212 | else: 213 | self.logger.warning( 214 | f"Unknown element type in trajectory: {element}" 215 | ) 216 | 217 | if 
self.annotation_path: 218 | annotation_system = AnnotationSystem( 219 | base_path=self.annotation_path, 220 | project_name="WebVoyager Annotations", 221 | description="Free-form text annotations of agent trajectories for WebVoyager", 222 | annotation_schema={ 223 | "feedback": { 224 | "type": "string", 225 | "description": "Free-form text feedback on the trajectory", 226 | } 227 | }, 228 | ) 229 | 230 | annotation_system.add_annotator( 231 | annotator_id="Shikhar", 232 | name="Shikhar Murty", 233 | ) 234 | with open(self.source_path / "feedback.json", "r") as f: 235 | task_id2feedback = json.load(f) 236 | for task_id in task_id2feedback: 237 | instance_id_or_none = task_id2instance_id.get(task_id) 238 | if instance_id_or_none: 239 | annotation_system.add_annotation( 240 | instance_id=instance_id_or_none, 241 | agent_id="agent", 242 | content={"feedback": task_id2feedback[task_id]}, 243 | annotator_id="Shikhar", 244 | ) 245 | self.logger.info("Dataset conversion complete!") 246 | dataset.close() 247 | 248 | 249 | if __name__ == "__main__": 250 | source_path = Path(".data/raw/webvoyager-nnetnav") 251 | output_path = Path(".data/webvoyager-nnetnav") 252 | annotation_path = Path(".data/annotations/webvoyager-nnetnav") 253 | 254 | run_converter( 255 | WebVoyagerNNetNavConverter, 256 | output_path, 257 | source_path, 258 | annotation_path=annotation_path, 259 | ) 260 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/utils.py: -------------------------------------------------------------------------------- 1 | # Dataset-specific utils 2 | # In this file, we will include the utility functions to download datasets into files 3 | 4 | # balrog 5 | import requests 6 | import os 7 | from urllib.parse import quote 8 | 9 | from pathlib import Path 10 | from typing import Tuple, Generator 11 | 12 | from rich.console import Console 13 | from rich.table import Table 14 | from rich.panel import Panel 15 | from rich.text import Text 16 | from rich import box 17 | 18 | 19 | def download_github_folder( 20 | owner: str, repo: str, path: str, save_path: str, token: str | None = None 21 | ) -> None: 22 | """ 23 | Recursively download a folder from GitHub 24 | 25 | Parameters: 26 | - owner: repository owner 27 | - repo: repository name 28 | - path: path to folder in repository 29 | - save_path: local path to save files 30 | - token: GitHub personal access token (optional) 31 | """ 32 | headers = {} 33 | if token or (token := os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN")): 34 | headers["Authorization"] = f"token {token}" 35 | 36 | api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{quote(path)}" 37 | response = requests.get(api_url, headers=headers) 38 | if response.status_code != 200: 39 | raise Exception(f"Failed to get content: {response.status_code}") 40 | 41 | for item in response.json(): 42 | local_path = Path(save_path) / item["name"] 43 | 44 | if item["type"] == "dir": 45 | # If it's a directory, create it and recurse 46 | local_path.mkdir(parents=True, exist_ok=True) 47 | 48 | download_github_folder(owner, repo, item["path"], local_path, token) 49 | print(f"Processed directory: {item['path']}") 50 | 51 | elif item["type"] == "file": 52 | # Skip if file already exists and has same size 53 | if local_path.exists(): 54 | # Get local file size 55 | local_size = local_path.stat().st_size 56 | # Get GitHub file size 57 | github_size = item["size"] 58 | 59 | if local_size == github_size: 60 | print(f"Skipping existing file: {item['path']}") 
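# NOTE: byte-size equality is the only freshness check here; a modified file whose
# size matches the GitHub-reported size will not be re-downloaded.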
61 |                     continue
62 |                 else:
63 |                     print(f"Size mismatch, re-downloading: {item['path']}")
64 |
65 |             # Create parent directories if they don't exist
66 |             local_path.parent.mkdir(parents=True, exist_ok=True)
67 |
68 |             # Download file content
69 |             download_url = item["download_url"]
70 |             file_response = requests.get(download_url, headers=headers)
71 |
72 |             # Save the file
73 |             with open(local_path, "wb") as f:
74 |                 f.write(file_response.content)
75 |             print(f"Downloaded file: {item['path']}")
76 |
77 |
78 | def file_pairs(folder_path: str) -> Generator[Tuple[Path, Path], None, None]:
79 |     """
80 |     Generate pairs of CSV and JSON files with matching names from a folder.
81 |
82 |     Args:
83 |         folder_path: Path to the folder to search in
84 |
85 |     Yields:
86 |         Tuple of (csv_path, json_path) for matching files
87 |     """
88 |     path = Path(folder_path)
89 |
90 |     # Find all CSV files and check for JSON pairs
91 |     for csv_file in path.rglob("*.csv"):
92 |         json_file = csv_file.with_suffix(".json")
93 |         if json_file.exists():
94 |             yield csv_file, json_file
95 |
96 |
97 | def file_pairs_list(folder_path: Path) -> list[tuple[Path, Path]]:
98 |     """
99 |     Collect pairs of CSV and JSON files with matching names from a folder.
100 |
101 |     Args:
102 |         folder_path: Path to the folder to search in
103 |
104 |     Returns:
105 |         List of (csv_path, json_path) tuples, one per CSV file found
106 |     """
107 |     path = folder_path
108 |
109 |     json_folder = []
110 |
111 |     # Find all CSV files and require a matching JSON file for each
112 |     for csv_file in path.rglob("*.csv"):
113 |         json_file = csv_file.with_suffix(".json")
114 |         if not json_file.exists():
115 |             raise FileNotFoundError(f"JSON file not found for {csv_file}")
116 |         json_folder.append((csv_file, json_file))
117 |
118 |     return json_folder
119 |
120 |
121 | def file_triplets(folder_path: str) -> Generator[Tuple[Path, Path, Path], None, None]:
122 |     """
123 |     Generate matching CSV, JSON, and PKL file triplets from a folder.
124 |
125 |     Args:
126 |         folder_path: Path to the folder to search in
127 |
128 |     Yields:
129 |         Tuple of (csv_path, json_path, pkl_path) for matching files
130 |     """
131 |     path = Path(folder_path)
132 |
133 |     # Find all CSV files and check for matching JSON and PKL files
134 |     for csv_file in path.rglob("*.csv"):
135 |         json_file = csv_file.with_suffix(".json")
136 |         pkl_file = csv_file.with_suffix(".pkl")
137 |         if json_file.exists() and pkl_file.exists():
138 |             yield csv_file, json_file, pkl_file
139 |
140 |
141 | def parse_text_description(text_description: str) -> list[tuple[tuple[int, int], str]]:
142 |     """
143 |     Parse a text description of object positions relative to a reference point
144 |     and convert it back into a list of relative positions and object names.
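    Directions follow screen coordinates: "right" gives a positive x offset, "left"
    a negative one, "down" a positive y offset, and "up" a negative one.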
145 | 146 | Args: 147 | text_description (str): Multi-line string describing object positions 148 | 149 | Returns: 150 | list: List of tuples ((x, y), name) where x and y are relative coordinates 151 | and name is the object name/type 152 | """ 153 | relative_positions = [] 154 | 155 | # Split the description into individual lines 156 | lines = text_description.strip().split("\n") 157 | 158 | for line in lines: 159 | if not line.strip(): 160 | continue 161 | 162 | # Initialize position values 163 | x_offset = 0 164 | y_offset = 0 165 | 166 | # Split the line into parts 167 | parts = line.split() 168 | 169 | # Extract the object name (everything before the first number) 170 | name_parts = [] 171 | i = 0 172 | while i < len(parts) and not parts[i][0].isdigit(): 173 | name_parts.append(parts[i]) 174 | i += 1 175 | name = " ".join(name_parts) 176 | 177 | # Process the remaining parts for directions 178 | while i < len(parts): 179 | # Get the number of steps 180 | try: 181 | steps = int(parts[i]) 182 | except ValueError: 183 | print(line) 184 | i += 1 185 | 186 | # Skip 'step' or 'steps' 187 | i += 1 188 | 189 | # Process direction 190 | if i < len(parts): 191 | if parts[i] == "to" and i + 1 < len(parts): 192 | i += 1 # skip 'to' 193 | if parts[i] == "the": 194 | i += 1 # skip 'the' 195 | 196 | if parts[i] == "right": 197 | x_offset = steps 198 | elif parts[i] == "left": 199 | x_offset = -steps 200 | i += 1 201 | 202 | elif parts[i] == "up": 203 | y_offset = -steps 204 | i += 1 205 | elif parts[i] == "down": 206 | y_offset = steps 207 | i += 1 208 | 209 | # Skip 'and' if present 210 | if i < len(parts) and parts[i] == "and": 211 | i += 1 212 | 213 | relative_positions.append(((x_offset, y_offset), name)) 214 | 215 | return relative_positions 216 | 217 | 218 | def visualize_map( 219 | relative_positions: list[tuple[tuple[int, int], str]], reference_char: str = "@" 220 | ) -> None: 221 | """ 222 | Visualize the game map using rich library for prettier output. 
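    The reference point (the player) is drawn at grid cell (0, 0), and the map is
    printed together with a legend and the grid's coordinate ranges.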
223 | 224 | Args: 225 | relative_positions: List of ((x, y), name) tuples 226 | reference_char: Character to represent the reference point (player) 227 | """ 228 | console = Console() 229 | 230 | # Find the dimensions of the map 231 | min_x = min(x for (x, y), _ in relative_positions) 232 | max_x = max(x for (x, y), _ in relative_positions) 233 | min_y = min(y for (x, y), _ in relative_positions) 234 | max_y = max(y for (x, y), _ in relative_positions) 235 | 236 | # Add padding and account for reference point at (0,0) 237 | min_x = min(min_x, 0) - 1 238 | max_x = max(max_x, 0) + 1 239 | min_y = min(min_y, 0) - 1 240 | max_y = max(max_y, 0) + 1 241 | 242 | # Create a rich Table for the game grid 243 | table = Table( 244 | box=box.SQUARE, 245 | padding=0, 246 | show_header=True, 247 | header_style="bold cyan", 248 | show_edge=True, 249 | ) 250 | 251 | # Add columns with X-coordinates as headers 252 | COLUMN_WIDTH = 10 # Fixed width for all columns 253 | 254 | # Add Y-coordinates column with the same width 255 | table.add_column(" ", style="bold cyan", width=COLUMN_WIDTH, justify="center") 256 | 257 | # Add other columns with consistent width 258 | for x in range(min_x, max_x + 1): 259 | table.add_column( 260 | str(x), 261 | justify="center", 262 | width=COLUMN_WIDTH, 263 | min_width=COLUMN_WIDTH, 264 | max_width=COLUMN_WIDTH, 265 | ) 266 | 267 | # Helper function to get styled symbol for object 268 | def get_styled_symbol(name: str) -> Text: 269 | name = name.lower() 270 | if name.startswith("rule"): 271 | if "`" in name: 272 | rule_text = name.split("`")[1].split("`")[0] 273 | return Text(f"[{rule_text}]", style="bold yellow") 274 | return Text("[rule]", style="yellow") 275 | elif "wall" in name: 276 | return Text("#", style="red") 277 | elif "ball" in name: 278 | return Text("o", style="green") 279 | elif "key" in name: 280 | return Text("k", style="blue") 281 | else: 282 | return Text("*", style="white") 283 | 284 | # Create the grid with objects 285 | for y in range(min_y, max_y + 1): 286 | row: list[str | Text] = [str(y)] # Y-coordinate 287 | for x in range(min_x, max_x + 1): 288 | cell_content = Text(" ") # Empty cell by default 289 | 290 | # Check if this is the reference point (0,0) 291 | if x == 0 and y == 0: 292 | # Center the reference character 293 | padding = (COLUMN_WIDTH - 1) // 2 # -1 for single character 294 | cell_content = Text( 295 | " " * padding + reference_char, style="bold magenta" 296 | ) 297 | 298 | # Check if there's an object at this position 299 | for (obj_x, obj_y), name in relative_positions: 300 | if obj_x == x and obj_y == y: 301 | symbol = get_styled_symbol(name) 302 | # Center the symbol in the column width 303 | padding = (COLUMN_WIDTH - len(str(symbol))) // 2 304 | cell_content = Text(" " * padding) + symbol 305 | break 306 | 307 | row.append(cell_content) 308 | table.add_row(*row) 309 | 310 | # Create legend panel 311 | legend_text = [ 312 | Text("Legend:", style="bold"), 313 | Text(f"\n{reference_char} ", style="bold magenta") 314 | + Text("- Player (reference point)"), 315 | Text("\n# ", style="red") + Text("- Wall"), 316 | Text("\no ", style="green") + Text("- Ball"), 317 | Text("\nk ", style="blue") + Text("- Key"), 318 | Text("\n[text] ", style="bold yellow") + Text("- Rule"), 319 | Text("\n* ", style="white") + Text("- Other objects"), 320 | ] 321 | legend = Text.assemble(*legend_text) 322 | 323 | # Create coordinates panel 324 | coord_text = Text.assemble( 325 | Text("Grid coordinates:", style="bold"), 326 | Text(f"\nX: {min_x} to {max_x}"), 327 | 
Text(f"\nY: {min_y} to {max_y}"), 328 | ) 329 | 330 | # Print everything 331 | console.print(Panel(table, title="Game Map", border_style="cyan")) 332 | console.print(Panel(legend, title="Legend", border_style="green")) 333 | console.print(Panel(coord_text, title="Coordinates", border_style="blue")) 334 | --------------------------------------------------------------------------------