├── .python-version ├── packages ├── osw-data │ ├── README.md │ ├── src │ │ └── osw_data │ │ │ ├── py.typed │ │ │ ├── __init__.py │ │ │ ├── metrics.py │ │ │ ├── dataset.py │ │ │ ├── trajectory.py │ │ │ ├── annotation.py │ │ │ └── utils.py │ ├── pyproject.toml │ └── tests │ │ ├── test_trajectory.py │ │ ├── test_dataset.py │ │ ├── test_annotation.py │ │ └── test_metrics.py └── autolibra-core │ ├── src │ └── autolibra_core │ │ ├── py.typed │ │ ├── templates │ │ ├── categorize_behavior_feedback.j2 │ │ ├── feedback_grounding.j2 │ │ ├── aspect_traits_match.j2 │ │ ├── generate_metrics_v2.j2 │ │ ├── coverage_evaluation.j2 │ │ ├── llm_as_a_judge_evaluator_v3.j2 │ │ ├── llm_as_a_judge_evaluator.j2 │ │ ├── behavior_clustering.j2 │ │ ├── behavior_extraction.j2 │ │ ├── generate_metrics.j2 │ │ ├── llm_as_a_judge_evaluator_v2.j2 │ │ └── coverage_evaluation_v2.j2 │ │ ├── datasets │ │ ├── care.py │ │ ├── base.py │ │ ├── nnetnav_live.py │ │ ├── sotopia.py │ │ ├── cogym.py │ │ ├── balrog_babaisai.py │ │ ├── balrog_mini.py │ │ ├── webarena_nnetnav.py │ │ ├── webarena.py │ │ ├── webvoyager_nnetnav_best.py │ │ └── webvoyager_nnetnav.py │ │ ├── data │ │ ├── __init__.py │ │ └── primitives.py │ │ ├── evaluators │ │ ├── __init__.py │ │ ├── llm_evaluator.py │ │ └── coverage_evaluator.py │ │ ├── operators │ │ ├── __init__.py │ │ ├── behavior_clustering.py │ │ └── feedback_grounding.py │ │ ├── __init__.py │ │ ├── configs │ │ └── __init__.py │ │ └── utils │ │ └── __init__.py │ ├── README.md │ ├── tests │ ├── positive_aspects_traits.pkl │ └── test_coverage.py │ └── pyproject.toml ├── .github └── workflows │ ├── pre-commit.yml │ └── mypy.yml ├── src ├── tools │ ├── count_annotations.py │ ├── count_score_frequency.py │ └── count_number_steps.py ├── training │ ├── extract_results.py │ ├── llm_as_a_judge.py │ ├── grounding.py │ ├── llm_eval.py │ └── iterative.py ├── plot │ └── meta-eval.py └── tty │ └── view_annotations.py ├── .pre-commit-config.yaml ├── pyproject.toml ├── README.md └── .gitignore /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /packages/osw-data/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/categorize_behavior_feedback.j2: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packages/autolibra-core/README.md: -------------------------------------------------------------------------------- 1 | This folder contains code for the dataset converter and evaluator. 
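
Each dataset under `src/autolibra_core/datasets/` (for example `care.py`) subclasses `BaseConverter` from `datasets/base.py` and is driven by `run_converter`. Below is a minimal, illustrative sketch of how a new converter would be wired up; `MyConverter` and the `.data/...` paths are hypothetical placeholders, not part of the package:

```python
# Illustrative sketch only; MyConverter and the .data/* paths are hypothetical.
from pathlib import Path

from autolibra_core.datasets.base import BaseConverter, run_converter


class MyConverter(BaseConverter):
    def download_data(self) -> None:
        # Fetch the raw trajectories into self.source_path
        ...

    def convert_to_dataset(self) -> None:
        # Write the converted osw_data dataset into self.output_path
        ...


# run_converter instantiates the converter, then calls
# download_data() followed by convert_to_dataset().
run_converter(
    converter_class=MyConverter,
    output_path=Path(".data/my_dataset"),
    source_path=Path(".data/raw/my_dataset"),
)
```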
2 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/care.py: -------------------------------------------------------------------------------- 1 | from .base import BaseConverter 2 | 3 | 4 | class CareConverter(BaseConverter): 5 | pass 6 | -------------------------------------------------------------------------------- /packages/autolibra-core/tests/positive_aspects_traits.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Open-Social-World/autolibra/HEAD/packages/autolibra-core/tests/positive_aspects_traits.pkl -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .primitives import Trait, Aspect, MetricTrainingInstance 2 | 3 | __all__ = ["Trait", "Aspect", "MetricTrainingInstance"] 4 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | from .llm_evaluator import run_llm_eval 2 | # from .coverage_evaluator_v2 import coverage_eval 3 | 4 | __all__ = [ 5 | "run_llm_eval", 6 | # "coverage_eval", 7 | ] 8 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from ..data import MetricTrainingInstance 2 | from .feedback_grounding import feedback_grounding 3 | from .behavior_clustering import behavior_clustering 4 | 5 | __all__ = [ 6 | "MetricTrainingInstance", 7 | "feedback_grounding", 8 | "behavior_clustering", 9 | ] 10 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.11 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: 3.11.2 17 | - uses: pre-commit/action@v3.0.0 18 | -------------------------------------------------------------------------------- /src/tools/count_annotations.py: -------------------------------------------------------------------------------- 1 | from osw_data import AnnotationSystem 2 | import rich 3 | 4 | if __name__ == "__main__": 5 | dataset_name = "sotopia" 6 | 7 | annotation_system = AnnotationSystem( 8 | base_path=f".data/annotations/{dataset_name}", 9 | ) 10 | 11 | rich.print( 12 | f"There are {len(annotation_system.get_all_annotations())} annotations in the dataset." 
13 | ) 14 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/__init__.py: -------------------------------------------------------------------------------- 1 | from .operators import ( 2 | feedback_grounding, 3 | behavior_clustering, 4 | ) 5 | 6 | from .evaluators import run_llm_eval 7 | from .data import Trait, Aspect, MetricTrainingInstance 8 | 9 | __all__ = [ 10 | "MetricTrainingInstance", 11 | "feedback_grounding", 12 | "behavior_clustering", 13 | "run_llm_eval", 14 | "Trait", 15 | "Aspect", 16 | # "coverage_eval", 17 | ] 18 | -------------------------------------------------------------------------------- /packages/osw-data/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "osw-data" 3 | version = "0.0.1" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10, <3.13" 7 | dependencies = [ 8 | "numpy>=1.9.3", 9 | "pydantic>=2.7", 10 | "pyyaml>=6.0.2", 11 | "rich>=13.9.4", 12 | "types-PyYAML>=6.0.2" 13 | ] 14 | 15 | 16 | [tool.hatch.build.targets.wheel] 17 | packages = ["src/osw_data"] 18 | 19 | 20 | [build-system] 21 | requires = ["hatchling"] 22 | build-backend = "hatchling.build" 23 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | from pydantic_settings import BaseSettings, SettingsConfigDict 3 | from pydantic import Field 4 | 5 | 6 | class AutoLibraEvalSettings(BaseSettings): 7 | model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") 8 | azure_api_key: str 9 | azure_endpoint: str 10 | github_personal_access_token: str 11 | azure_openai_4o_model: str | None = Field(default=None) 12 | azure_openai_o1_model: str | None = Field(default=None) 13 | azure_openai_o3_model: str | None = Field(default=None) 14 | reasoning_effort: Literal["low", "medium", "high"] = "medium" 15 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import MultiAgentDataset, AgentMetadata, DataInstance 2 | from .trajectory import SymmetricTrajectory, TrajectoryPoint, MediaType, PointType 3 | from .annotation import Annotation, AnnotationSpan, Annotator, AnnotationSystem 4 | from .metrics import Metric, MetricSetMetadata, MetricSet 5 | 6 | __all__ = [ 7 | "DataInstance", 8 | "MultiAgentDataset", 9 | "Annotation", 10 | "AnnotationSpan", 11 | "Annotator", 12 | "AnnotationSystem", 13 | "SymmetricTrajectory", 14 | "TrajectoryPoint", 15 | "AgentMetadata", 16 | "MediaType", 17 | "PointType", 18 | "Metric", 19 | "MetricSetMetadata", 20 | "MetricSet", 21 | ] 22 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/feedback_grounding.j2: -------------------------------------------------------------------------------- 1 | You are an expert analyst tasked with analyzing agent trajectories and human feedback. 2 | 3 | Context: 4 | Review the following trajectory of an AI agent, along with the corresponding human feedback for interaction. 5 | Think about which part of the trajectory the feedback is referring to. 
6 | 7 | Agent Trajectory: 8 | {{ instance.trajectory }} 9 | 10 | Human Feedback: 11 | {{ instance.feedback }} 12 | 13 | Instructions: 14 | 1. Analyze the agent trajectories and corresponding human feedback carefully 15 | 2. Break down the feedback into bulletpoints 16 | 3. For each bulletpoint, find the corresponding part of the trajectory that the feedback is referring to 17 | 18 | Output the following: 19 | -------------------------------------------------------------------------------- /.github/workflows/mypy.yml: -------------------------------------------------------------------------------- 1 | name: Mypy 2 | on: [push] 3 | 4 | jobs: 5 | Static-Type-Checking: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | max-parallel: 5 9 | matrix: 10 | python-version: ["3.10", "3.11", "3.12"] 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Python ${{ matrix.python-version }} 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Display Python version 19 | run: python -c "import sys; print(sys.version)" 20 | - name: Install dependencies 21 | run: | 22 | curl -LsSf https://astral.sh/uv/install.sh | sh 23 | - name: Type-checking package with mypy 24 | run: | 25 | uv run --all-extras mypy --strict . 26 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/data/primitives.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | from osw_data.trajectory import SymmetricTrajectory 3 | from pydantic import BaseModel, Field 4 | from osw_data import Metric 5 | 6 | 7 | class Aspect(BaseModel): 8 | feedback: str 9 | behavior: str 10 | is_positive: bool = Field( 11 | description="Whether the feedback is positive or negative." 12 | ) 13 | 14 | 15 | class Trait(BaseModel): 16 | metric: Metric 17 | rating: Literal[-1, 0, 1] 18 | 19 | 20 | class MetricTrainingInstance: 21 | def __init__( 22 | self, task: str, agent_id: str, trajectory: SymmetricTrajectory, feedback: str 23 | ): 24 | self.task = task 25 | self.agent_id = agent_id 26 | self.trajectory = trajectory 27 | self.feedback = feedback 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | - repo: https://github.com/pre-commit/mirrors-prettier 10 | rev: v3.0.1 # Use the sha / tag you want to point at 11 | hooks: 12 | - id: prettier 13 | types_or: [html] 14 | - repo: https://github.com/astral-sh/ruff-pre-commit 15 | # Ruff version. 16 | rev: v0.3.5 17 | hooks: 18 | # Run the linter. 19 | - id: ruff 20 | types_or: [ python, pyi, jupyter ] 21 | args: [ --fix ] 22 | # Run the formatter. 
23 | - id: ruff-format 24 | types_or: [ python, pyi, jupyter ] 25 | - repo: https://github.com/kynan/nbstripout 26 | rev: 0.6.0 27 | hooks: 28 | - id: nbstripout 29 | -------------------------------------------------------------------------------- /src/tools/count_score_frequency.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | scores = {} 4 | for row in open("llm_eval_results.jsonl"): 5 | data = json.loads(row) 6 | for key, value in data.items(): 7 | # Add all new keys to the dictionary 8 | if key not in scores: 9 | scores[key] = [0, 0, 0] 10 | # Increment the corresponding value 11 | if value == -1: 12 | scores[key][0] += 1 13 | elif value == 0: 14 | scores[key][1] += 1 15 | elif value == 1: 16 | scores[key][2] += 1 17 | 18 | # Remove [0,0,0] entries, as these are purely text 19 | scores = {key: val for key, val in scores.items() if sum(val) > 0} 20 | 21 | for key, val in scores.items(): 22 | print(key, val) 23 | 24 | # Print number of total scores 25 | total_number_of_dps = sum([sum(val) for val in scores.values()]) 26 | print("Total number of datapoints:", total_number_of_dps) 27 | -------------------------------------------------------------------------------- /packages/autolibra-core/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "autolibra-core" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "jinja2>=3.1.5", 9 | "numpy>=1.9.3", 10 | "polars>=1.19.0", 11 | "pydantic-ai>=0.0.18", 12 | "pydantic>=2.7", 13 | "pyyaml>=6.0.2", 14 | "requests>=2.32.3", 15 | "rich>=13.9.4", 16 | "types-requests>=2.32.0.20241016", 17 | "osw-data>=0.0.1", 18 | ] 19 | 20 | [project.optional-dependencies] 21 | webarena = [ 22 | "gdown>=5.2.0", 23 | "pillow>=11.1.0", 24 | ] 25 | sotopia = [ 26 | "huggingface-hub>=0.27.1", 27 | ] 28 | 29 | [tool.hatch.build] 30 | include = ["src/autolibra_core/templates/*.j2"] 31 | 32 | [tool.hatch.build.targets.wheel] 33 | packages = ["src/autolibra_core"] 34 | 35 | [build-system] 36 | requires = ["hatchling"] 37 | build-backend = "hatchling.build" 38 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/aspect_traits_match.j2: -------------------------------------------------------------------------------- 1 | You are an expert agent behavior analyst. You are tasked with analyzing the behaviors and feedback to identify relevant traits. 2 | 3 | Here are the behaviors and feedbacks you need to analyze: 4 | 5 | {% for behavior_feedback in behavior_feedback_list %} 6 | 7 | Aspect {{ loop.index }} 8 | Behavior: {{ behavior_feedback.behavior }} 9 | Feedback: {{ behavior_feedback.feedback }} 10 | 11 | {% endfor %} 12 | 13 | Here are the traits you need to analyze: 14 | 15 | {% for metric in metric_list %} 16 | 17 | Trait {{ loop.index }} 18 | Trait name: {{ metric.name }} 19 | Description: {{ metric.description }} 20 | Good behavior: {{ metric.good_behaviors }} 21 | Bad behavior: {{ metric.bad_behaviors }} 22 | 23 | {% endfor %} 24 | 25 | Output the following: 26 | For each of the aspects, please try to match it with a relevant trait, and provide a reasoning for your choice. 27 | If you think the aspect is not relevant to any of the traits, please output "None of the traits matches the aspect."
28 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/base.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import logging 3 | from typing import Any 4 | 5 | 6 | class BaseConverter(object): 7 | def __init__(self, output_path: Path, source_path: Path) -> None: 8 | self.output_path = output_path 9 | self.source_path = source_path 10 | 11 | # Setup logging 12 | logging.basicConfig(level=logging.INFO) 13 | self.logger = logging.getLogger(type(self).__name__) 14 | 15 | self._setup_constants() 16 | 17 | def _setup_constants(self) -> None: 18 | pass 19 | 20 | def download_data(self) -> None: 21 | raise NotImplementedError() 22 | 23 | def convert_to_dataset(self) -> None: 24 | raise NotImplementedError() 25 | 26 | 27 | def run_converter( 28 | converter_class: type[BaseConverter], 29 | output_path: Path, 30 | source_path: Path, 31 | **kwargs: Any, 32 | ) -> None: 33 | converter = converter_class( 34 | output_path=output_path, source_path=source_path, **kwargs 35 | ) 36 | converter.download_data() 37 | converter.convert_to_dataset() 38 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/generate_metrics_v2.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following interactions between an AI agent and a human, along with the corresponding human feedback for each interaction. 5 | 6 | {% for instance in instances %} 7 | Instance {{ loop.index }}: 8 | 9 | Agent Trajectory: 10 | {{ instance.trajectory }} 11 | 12 | Human Feedback: 13 | {{ instance.feedback }} 14 | 15 | {% endfor %} 16 | 17 | Instructions: 18 | 1. Analyze the agent trajectories and corresponding human feedback carefully 19 | 2. Identify key aspects of performance that emerge from the interactions and feedback 20 | 3. Define a comprehensive set of metrics that capture these aspects 21 | 4. For each metric provide: 22 | - A clear name 23 | - A detailed description of what the metric measures 24 | - Why this metric is important based on the observed interactions 25 | - Example behaviors from the trajectories that would score high or low on this metric 26 | 27 | Output the following: 28 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/coverage_evaluation.j2: -------------------------------------------------------------------------------- 1 | You are an expert in thematic analysis trying to understand the differences between quantitative and qualitative evaluation metrics. 2 | 3 | Context: 4 | Review the following task metadata, agent metadata, the humans' free-form feedback, and the quantitative metrics. 5 | 6 | Task Metadata: 7 | {{ instance.task_metadata }} 8 | 9 | Agent Metadata: 10 | {{ instance.agent_metadata }} 11 | 12 | Humans' feedback: 13 | {{ instance.feedback }} 14 | 15 | Quantitative metrics and their corresponding ratings: 16 | {{ instance.metric }} 17 | 18 | 19 | Instructions: 20 | 1. Analyze the task and humans' feedback carefully 21 | 2. Break the humans' feedback into bullet points. 22 | 3. Understand the metrics provided, and think about what part of the feedback is not captured by the metrics. 23 | 4.
For each bullet point, provide which metric you think covers it, or if it is not covered by any metric. 24 | 5. After that, please rate the coverage of the metrics on the feedback from 0 to 5, where 0 means the metrics do not cover the feedback at all, and 5 means the metrics cover the feedback perfectly. 25 | 26 | Output the following: 27 | 28 | It should be a string (the reasoning) and an integer (the 0-5 rating) in a json format 29 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "autolibra-eval" 7 | version = "0.1.0" 8 | description = "Add your description here" 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | dependencies = [ 12 | "fastapi>=0.115.6", 13 | "autolibra-core[webarena,sotopia]", 14 | "pydantic-settings>=2.7.1", 15 | "rich>=13.9.4", 16 | "typer>=0.15.1", 17 | "uvicorn>=0.34.0", 18 | "streamlit>=1.43", 19 | "pandas>=2.2", 20 | "logfire>=3.7.1", 21 | "matplotlib>=3.10.1", 22 | ] 23 | 24 | [tool.uv] 25 | dev-dependencies = [ 26 | "ipykernel>=6.29.5", 27 | "mypy>=1.14.1", 28 | "pre-commit>=4.0.1", 29 | "pytest-cov>=6.0.0", 30 | "pytest>=8.3.4", 31 | "pytest-asyncio>=0.25.2", 32 | "pandas-stubs>=2.2", 33 | ] 34 | 35 | [project.optional-dependencies] 36 | notebook = ["marimo>=0.10"] 37 | 38 | [tool.uv.sources] 39 | autolibra-core = { workspace = true } 40 | osw-data = { workspace = true } 41 | 42 | [tool.uv.workspace] 43 | members = ["packages/*"] 44 | 45 | [tool.mypy] 46 | strict = true 47 | plugins = ["pydantic.mypy"] 48 | mypy_path = "stubs" 49 | 50 | [tool.hatch.build.targets.wheel] 51 | packages = ["src/osw_data"] 52 | 53 | [tool.pylsp-mypy] 54 | enabled = true 55 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib import resources 2 | import jinja2 3 | from osw_data.dataset import DataInstance 4 | from osw_data.trajectory import SymmetricTrajectory 5 | from autolibra_core.data.primitives import MetricTrainingInstance 6 | 7 | 8 | def load_prompt_template(jinja_file: str) -> jinja2.Template: 9 | with resources.files("autolibra_core.templates").joinpath(jinja_file).open( 10 | "r" 11 | ) as f: 12 | return jinja2.Template(f.read()) 13 | 14 | 15 | def render_webarena_trajectory( 16 | trajectory: SymmetricTrajectory, metadata: DataInstance | None = None 17 | ) -> str: 18 | return "\n".join( 19 | ( 20 | [metadata.model_dump_json()] 21 | if metadata 22 | else [] 23 | ) 24 | + [ 25 | f"{'Observation' if p.point_type == 'observation' else 'Action'}: {trajectory.get_data_at(i)}" 26 | for i, p in enumerate(trajectory.points) 27 | ] 28 | ) 29 | 30 | 31 | def render_training_instance(training_instance: MetricTrainingInstance) -> str: 32 | return "\n".join( 33 | [ 34 | f"The task is {training_instance.task}", 35 | ] 36 | + [ 37 | f"{'Observation' if p.point_type == 'observation' else 'Action'}: {str(training_instance.trajectory.get_data_at(i))[:8000]}" 38 | for i, p in enumerate(training_instance.trajectory.points) 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /src/training/extract_results.py: -------------------------------------------------------------------------------- 1 | import json 2 |
import polars as pl 3 | 4 | 5 | def convert_jsonl_to_table(file_path: str) -> pl.DataFrame: 6 | # Read records into a list 7 | records = [] 8 | with open(file_path, "r") as file: 9 | for line in file: 10 | if line.strip(): # Skip empty lines 11 | record = json.loads(line) 12 | records.append(record) 13 | 14 | # Convert to Polars DataFrame 15 | df = pl.DataFrame(records) 16 | 17 | # Reorder columns to group reasoning and scores together 18 | reasoning_columns = [col for col in df.columns if col.endswith("_reasoning")] 19 | score_columns = [col for col in df.columns if not col.endswith("_reasoning")] 20 | 21 | # Combine columns in desired order 22 | df = df.select(reasoning_columns + score_columns) 23 | 24 | return df 25 | 26 | 27 | if __name__ == "__main__": 28 | # Replace 'your_file.jsonl' with your actual file path 29 | file_path = "llm_eval_results.jsonl" 30 | try: 31 | df = convert_jsonl_to_table(file_path) 32 | 33 | # Save to CSV 34 | df.write_csv("converted_table.csv") 35 | print("\nTable has been saved to 'converted_table.csv'") 36 | 37 | except FileNotFoundError: 38 | print(f"Error: File '{file_path}' not found") 39 | except json.JSONDecodeError: 40 | print("Error: Invalid JSON format in file") 41 | except Exception as e: 42 | print(f"An error occurred: {str(e)}") 43 | -------------------------------------------------------------------------------- /src/tools/count_number_steps.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Balrog Converter") 6 | parser.add_argument( 7 | "--filename", 8 | type=str, 9 | required=True, 10 | help="The name of the folder containing the Balrog data for the given run in raw", 11 | ) 12 | 13 | filename = parser.parse_args().filename 14 | 15 | file_path = f".data/raw/{filename}" 16 | 17 | scoresteps = {} 18 | 19 | for root, dirs, files in os.walk(file_path): 20 | json_files = [f for f in files if f.endswith(".json") and "summary" not in f] 21 | 22 | for ind_file in json_files: 23 | with open(os.path.join(root, ind_file), "r") as f: 24 | data = json.load(f) 25 | task = data["task"] 26 | 27 | if task not in scoresteps: 28 | scoresteps[task] = [data["num_steps"], data["episode_return"], 1] 29 | else: 30 | scoresteps[task][0] += data["num_steps"] 31 | scoresteps[task][1] += data["episode_return"] 32 | scoresteps[task][2] += 1 33 | 34 | net_avg_steps = 0 35 | net_avg_return = 0 36 | 37 | for key, val in scoresteps.items(): 38 | avg_steps = val[0] / val[2] 39 | avg_return = val[1] / val[2] 40 | 41 | print(key, avg_steps, avg_return) 42 | net_avg_steps += avg_steps 43 | net_avg_return += avg_return 44 | 45 | print("Net average steps:", net_avg_steps / len(scoresteps)) 46 | print("Net average return:", net_avg_return / len(scoresteps)) 47 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/llm_as_a_judge_evaluator_v3.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following task metadata, agent metadata, and its trajectory, and several metrics that you should use to evaluate the agent's performance. 
5 | 6 | Agent Trajectory: 7 | {{ trajectory }} 8 | 9 | Metrics: 10 | {% for metric in metrics %} 11 | 12 | Metric name: {{ metric.name }} 13 | Metric description: {{ metric.explanation }} 14 | Metric good behaviors: {{ metric.good_behaviors }} 15 | Metric bad behaviors: {{ metric.bad_behaviors }} 16 | 17 | 18 | {% endfor %} 19 | 20 | 21 | Instructions: 22 | 1. Analyze the agent task and trajectory carefully 23 | 2. Understand the metrics provided, and figure out which part of the trajectory they might be relevant to 24 | 3. Output two things for each metric: 25 | - A reasoning for why you think the agent either did PERFECTLY well or not on the metric 26 | - If you think the agent did PERFECTLY well, provide the behavior of the agent that led you to that conclusion 27 | - If you think the agent did SOMETHING WRONG on this metric, provide the behavior of the agent that led you to that conclusion 28 | - If you think the metric is not applicable to the agent, provide a reasoning for that 29 | - A string indicating whether you think the agent did well or poorly on the metric 30 | - "1" indicates the agent did PERFECTLY well 31 | - "0" indicates the metric is not applicable to the agent 32 | - "-1" indicates the agent did SOMETHING WRONG 33 | 34 | Output the following: 35 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/llm_as_a_judge_evaluator.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following task metadata, agent metadata, and its trajectory, and a metric that you should use to evaluate the agent's performance. 5 | 6 | Task Metadata: 7 | {{ instance.task_metadata }} 8 | 9 | Agent Metadata: 10 | {{ instance.agent_metadata }} 11 | 12 | Agent Trajectory: 13 | {{ instance.trajectory }} 14 | 15 | Metric: 16 | {{ instance.metric }} 17 | 18 | 19 | Instructions: 20 | 1. Analyze the agent task and trajectory carefully 21 | 2. Understand the metric provided, and figure out which part of the trajectory it might be relevant to 22 | 3. 
Output two things: 23 | - A reasoning for why you think the agent either did well or poorly on the metric 24 | - If you think the agent did well, provide the behavior of the agent that led you to that conclusion 25 | - If you think the agent did poorly, provide the behavior of the agent that led you to that conclusion 26 | - A binary integer indicating whether you think the agent did well or poorly on the metric 27 | - 1 indicates the agent did well or the metric is not applicable to the agent 28 | - 0 indicates the agent did poorly 29 | 30 | Output the following: 31 | 32 | 33 | It should be a string (the reasoning) and an integer (the binary rating) in a json format: 34 | 35 | {'properties': {'reasoning': {'title': 'Reasoning', 'type': 'string'}, 'rating': {'title': 'Rating', 'type': 'integer'}}, 'required': ['reasoning', 'rating'], 'title': 'EvaluationResult', 'type': 'object'} 36 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/behavior_clustering.j2: -------------------------------------------------------------------------------- 1 | You are an expert in agent behavior analysis and are tasked with analyzing agent behaviors and human feedback 2 | to group similar positive and negative behaviors together, so that you can extract the metrics to evaluate the agents. 3 | 4 | Here are the behaviors and feedbacks you need to analyze: 5 | 6 | {% for behavior_feedback in behavior_feedback_list %} 7 | 8 | {{ loop.index }} 9 | Behavior: {{ behavior_feedback.behavior }} 10 | Feedback: {{ behavior_feedback.feedback }} 11 | 12 | {% endfor %} 13 | 14 | Instructions: 15 | 1. Analyze the behaviors and corresponding human feedback carefully 16 | 2. Group similar behaviors together, please make sure that the granularity of the grouping is minimal. Only very similar behaviors should be grouped together. 17 | 3. Output a list of metrics, where each metric is composed of 18 | - A list of good behaviors (sentences describing the behavior) 19 | - A list of bad behaviors (sentences describing the behavior) 20 | - An explanation of the metric's meaning and how the positive behaviors should be determined in novel agent behaviors 21 | - A name for the metric 22 | 4. Make sure to include each behavior in at least one metric 23 | 24 | N.B. 25 | The granularity of the grouping should be minimal, only very similar behaviors should be grouped together. But don't limit to one particular website: 26 | - Good name: Location Query Correctness 27 | - Bad name: Mapping Site Usage Correctness 28 | - Bad name: Arxiv Categories Correctness 29 | - DO NOT add the website name in the metric name including huggingface, arxiv, dictionary, ESPN, Apple, Amazon, BBC, etc. 30 | Also, don't limit to one particular character: 31 | - Good name: Don't settle for less in negotiations 32 | - Bad name: Oliver's negotiation strategy 33 | 34 | Output 4 metrics. 
35 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/nnetnav_live.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import TypedDict 3 | 4 | 5 | class Message(TypedDict): 6 | content: str 7 | 8 | 9 | class NNetNavStepData(TypedDict): 10 | messages: list[Message] 11 | 12 | 13 | def _get_objective(data: NNetNavStepData) -> str: 14 | return data["messages"][1]["content"].split("OBJECTIVE: ")[1].split("\n")[0] 15 | 16 | 17 | def _get_action(data: NNetNavStepData) -> str: 18 | try: 19 | return data["messages"][2]["content"].split("```")[1] 20 | except IndexError: 21 | return "none" 22 | 23 | 24 | def _get_observation(data: NNetNavStepData) -> str: 25 | return data["messages"][1]["content"].split("OBSERVATION: ")[0] 26 | 27 | 28 | if __name__ == "__main__": 29 | objectives: list[str] = [] 30 | with open(".data/raw/nnetnav-live-00/train.jsonl") as f: 31 | for line in f.readlines(): 32 | data = json.loads(line) 33 | objectives.append(_get_objective(data)) 34 | 35 | unique_objectives: list[str] = [] 36 | 37 | instance_starts = [0] 38 | 39 | for line_no, (this_obj, next_obj) in enumerate(zip(objectives, objectives[1:])): 40 | if this_obj != next_obj: 41 | instance_starts.append(line_no + 1) 42 | 43 | instance_ends = instance_starts[1:] + [len(objectives)] 44 | 45 | lines: list[str] = [] 46 | with open(".data/raw/nnetnav-live-00/train.jsonl") as f: 47 | lines = f.readlines() 48 | 49 | for start, end in zip(instance_starts, instance_ends): 50 | instance_lines = lines[start:end] 51 | instance_data = [json.loads(line) for line in instance_lines] 52 | 53 | for data in instance_data: 54 | print(_get_objective(data)) 55 | print(_get_action(data)) 56 | print(_get_observation(data)) 57 | print() 58 | 59 | break 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoLibra ⚖️ Metric Induction for Agents from Open-Ended Human Feedback 2 | 3 | ## Introduction 4 | 5 | AutoLibra is designed to facilitate the evaluation of agents through metrics derived from human feedback. This document outlines the steps for contributors to prepare data, annotate it, and run experiments. 6 | 7 | ## Contributor doc 8 | 9 | ### Prepare the data 10 | 11 | Install git lfs if you haven't already. This is required to download the large files in the dataset. 12 | 13 | #### From scratch 14 | For contributors, it is best to use our shared data repo on Hugging Face: `open-social-world/autolibra`. Upload new datasets to this shared repo. 15 | 16 | ```bash 17 | # Download and preprocess 18 | uv run python -m autolibra_core.datasets.<dataset_name>
19 | ``` 20 | 21 | #### Download from huggingface 22 | 23 | ```bash 24 | git clone https://huggingface.co/datasets/open-social-world/autolibra .data 25 | ``` 26 | 27 | #### Upload your data to huggingface 28 | 29 | ```bash 30 | # cd into .data 31 | # git add your data 32 | # git commit -m "Add <dataset_name>" 33 | git push 34 | ``` 35 | 36 | ### Annotation 37 | ```bash 38 | uv run python src/tty/tty_annotation.py .data/webarena .data/annotations/webarena --annotator-id <annotator_id> 39 | ``` 40 | 41 | ### Annotation Web Interface with Streamlit 42 | ```bash 43 | uv run streamlit run src/tty/tty_annotation.py .data/sotopia .data/annotations/sotopia -- --annotator-id <annotator_id> --use-streamlit 44 | ``` 45 | 46 | ### View Annotations with Streamlit 47 | ```bash 48 | streamlit run src/tty/view_annotations.py -- .data/annotations/sotopia/annotations 49 | ``` 50 | 51 | ### To run metric extraction 52 | ```bash 53 | uv run python -m autolibra_core.gen_eval.generator 54 | ``` 55 | ### Run experiments 56 | Test environments (BALROG, etc.) are included as submodules under .gitmodules. Documentation for using these environments is included within each environment repo. 57 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/behavior_extraction.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following trajectory of an AI agent, along with the corresponding human feedback for each interaction. 5 | 6 | {% for instance in instances %} 7 | Instance {{ loop.index }}: 8 | 9 | Agent Trajectory: 10 | {{ instance.trajectory }} 11 | 12 | Human Feedback: 13 | {{ instance.feedback }} 14 | 15 | {% endfor %} 16 | 17 | Instructions: 18 | 1. Analyze the agent trajectories and human feedback carefully 19 | 2. Identify key aspects of performance that emerge from the interactions and feedback 20 | 3. Break down these aspects into specific agent behaviors and corresponding human feedback 21 | 4. The behavior should be a specific action, a series of actions, or a decision made by the agent 22 | 5. The feedback should be a summarization of the corresponding human feedback 23 | 6.
Please avoid any repetitive or redundant behaviors and feedback 24 | 25 | Output the following: 26 | 27 | It should be list of behavior and feedback, each in the following format: 28 | 29 | {'properties': {'instance_id': {'description': 'The ID of the instance the behavior was observed in', 'title': 'Instance Id', 'type': 'string'}, 'start_step': {'description': 'The step in the trajectory where the behavior started', 'title': 'Start Step', 'type': 'integer'}, 'end_step': {'description': 'The step in the trajectory where the behavior ended', 'title': 'End Step', 'type': 'integer'}, 'behavior': {'description': 'The behavior observed in the trajectory', 'title': 'Behavior', 'type': 'string'}, 'feedback': {'description': 'Summary of human comments on the behavior observed', 'title': 'Feedback', 'type': 'string'}}, 'required': ['instance_id', 'start_step', 'end_step', 'behavior', 'feedback'], 'title': 'BehaviorFeedback', 'type': 'object'} 30 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/operators/behavior_clustering.py: -------------------------------------------------------------------------------- 1 | from importlib import resources 2 | import jinja2 3 | from openai import AsyncAzureOpenAI 4 | from autolibra_core.configs import AutoLibraEvalSettings 5 | from pydantic import BaseModel, ValidationError 6 | from ..data import Aspect 7 | from osw_data import Metric 8 | 9 | 10 | def _load_behavior_clustering_template() -> jinja2.Template: 11 | with resources.files("autolibra_core.templates").joinpath( 12 | "behavior_clustering.j2" 13 | ).open("r") as f: 14 | return jinja2.Template(f.read()) 15 | 16 | 17 | class BehaviorClusteringOutput(BaseModel): 18 | metrics: list[Metric] 19 | 20 | 21 | async def behavior_clustering( 22 | aspects: list[Aspect], 23 | client: AsyncAzureOpenAI, 24 | ) -> BehaviorClusteringOutput: 25 | prompt = _load_behavior_clustering_template().render( 26 | behavior_feedback_list=aspects, 27 | ) 28 | 29 | settings = AutoLibraEvalSettings() 30 | 31 | model = settings.azure_openai_o3_model 32 | assert model is not None 33 | 34 | while True: 35 | try: 36 | completion = await client.beta.chat.completions.parse( 37 | model=model, 38 | messages=[ 39 | # {"role": "system", "content": "Cluster the behaviors."}, 40 | {"role": "user", "content": prompt}, 41 | ], 42 | response_format=BehaviorClusteringOutput, 43 | reasoning_effort="high", 44 | ) 45 | break 46 | except ValidationError as e: 47 | # In rare cases, the response may not be parsed correctly. 48 | # Retry the request. 49 | print(f"Validation error: {e}") 50 | 51 | if not completion.choices[0].message.parsed: 52 | raise ValueError("Failed to parse the response.") 53 | else: 54 | return completion.choices[0].message.parsed 55 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/generate_metrics.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following interactions between an AI agent and a human, along with the corresponding human feedback for each interaction. 
5 | 6 | {% for instance in instances %} 7 | Instance {{ loop.index }}: 8 | 9 | Agent Trajectory: 10 | {{ instance.trajectory }} 11 | 12 | Human Feedback: 13 | {{ instance.feedback }} 14 | 15 | {% endfor %} 16 | 17 | Instructions: 18 | 1. Analyze the agent trajectories and corresponding human feedback carefully 19 | 2. Identify key aspects of performance that emerge from the interactions and feedback 20 | 3. Define a comprehensive set of metrics that capture these aspects 21 | 4. For each metric provide: 22 | - A clear name 23 | - A detailed description of what the metric measures 24 | - Why this metric is important based on the observed interactions 25 | - Example behaviors from the trajectories that would score high or low on this metric 26 | 27 | Output the following: 28 | 29 | Identified Metrics: 30 | [List each metric in the following format] 31 | 32 | 1. Metric Name: [Concise, descriptive name] 33 | Description: [Clear explanation of what this metric measures] 34 | Importance: [Why this metric matters based on the trajectories and feedback] 35 | Examples: 36 | - High Score Example: [Behavior that would exemplify high performance] 37 | - Low Score Example: [Behavior that would indicate poor performance] 38 | 39 | 2. [Continue for each identified metric...] 40 | 41 | Justification: 42 | [Explain how these metrics together provide comprehensive coverage of the important aspects of performance demonstrated in the trajectories and highlighted by the human feedback] 43 | 44 | Additional Considerations: 45 | [Note any context-specific factors that might affect metric applicability or interpretation] 46 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/llm_as_a_judge_evaluator_v2.j2: -------------------------------------------------------------------------------- 1 | You are an expert evaluator tasked with analyzing agent trajectories and human feedback to identify and define relevant evaluation metrics. 2 | 3 | Context: 4 | Review the following task metadata, agent metadata, and its trajectory, and several metrics that you should use to evaluate the agent's performance. 5 | 6 | Task Metadata: 7 | {{ instance.task_metadata }} 8 | 9 | Agent Metadata: 10 | {{ instance.agent_metadata }} 11 | 12 | Agent Trajectory: 13 | {{ instance.trajectory }} 14 | 15 | Metrics: 16 | {% for metric in instance.metrics %} 17 | 18 | 19 | Metric name: {{ metric.name }} 20 | Metric description: {{ metric.explanation }} 21 | Metric good behaviors: {{ metric.good_behaviors }} 22 | Metric bad behaviors: {{ metric.bad_behaviors }} 23 | 24 | 25 | {% endfor %} 26 | 27 | 28 | Instructions: 29 | 1. Analyze the agent task and trajectory carefully 30 | 2. Understand the metrics provided, and figure out which part of the trajectory they might be relevant to 31 | 3. 
Output two things for each metric: 32 | - A reasoning for why you think the agent either did well or poorly on the metric 33 | - If you think the agent did well, provide the behavior of the agent that led you to that conclusion 34 | - If you think the agent did poorly, provide the behavior of the agent that led you to that conclusion 35 | - If you think the metric is not applicable to the agent, provide a reasoning for that 36 | - A string indicating whether you think the agent did well or poorly on the metric 37 | - "positive" indicates the agent did well 38 | - "N/A" indicates the metric is not applicable to the agent 39 | - "negative" indicates the agent did poorly 40 | 41 | Output the following: 42 | 43 | 44 | It should be a list of string (the reasoning) and an string (judgement) in a json format, please make sure the number of reasoning and judgement is the same as the number of metrics provided. 45 | -------------------------------------------------------------------------------- /packages/osw-data/tests/test_trajectory.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from osw_data.trajectory import ( 3 | SymmetricTrajectory, 4 | MediaType, 5 | PointType, 6 | render_trajectory, 7 | ) 8 | 9 | from datetime import datetime 10 | import numpy as np 11 | 12 | 13 | def test_mixed_trajectory() -> None: 14 | trajectory = SymmetricTrajectory( 15 | trajectory_id="robot_1", storage_path=Path("/tmp/data/trajectories") 16 | ) 17 | 18 | # Add an observation with image data 19 | image = np.random.rand(480, 640, 3) 20 | trajectory.add_point( 21 | timestamp=datetime.now(), 22 | agent_id="robot_1", 23 | point_type=PointType.OBSERVATION, 24 | data=image, 25 | media_type=MediaType.IMAGE, 26 | metadata={"camera_id": "cam_1"}, 27 | ) 28 | 29 | # Add an action with JSON data 30 | action = {"command": "move", "parameters": {"direction": "forward", "speed": 1.0}} 31 | trajectory.add_point( 32 | timestamp=datetime.now(), 33 | agent_id="robot_1", 34 | point_type=PointType.ACTION, 35 | data=action, 36 | media_type=MediaType.JSON, 37 | metadata={"priority": "high"}, 38 | ) 39 | 40 | # Add an observation with JSON data 41 | json_obs = {"position": [1.0, 2.0, 3.0], "orientation": [0.0, 0.0, 1.0]} 42 | trajectory.add_point( 43 | timestamp=datetime.now(), 44 | agent_id="robot_1", 45 | point_type=PointType.OBSERVATION, 46 | data=json_obs, 47 | media_type=MediaType.JSON, 48 | ) 49 | 50 | # Add an action with audio data 51 | audio_command = np.random.rand(16000) # 1 second of audio at 16kHz 52 | trajectory.add_point( 53 | timestamp=datetime.now(), 54 | agent_id="robot_1", 55 | point_type=PointType.ACTION, 56 | data=audio_command, 57 | media_type=MediaType.AUDIO, 58 | metadata={"sample_rate": 16000}, 59 | ) 60 | 61 | render_trajectory(trajectory) 62 | 63 | trajectory.close() 64 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/operators/feedback_grounding.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from importlib import resources 3 | import jinja2 4 | from openai import AsyncAzureOpenAI, RateLimitError 5 | from autolibra_core.configs import AutoLibraEvalSettings 6 | from pydantic import BaseModel 7 | from ..utils import render_training_instance 8 | from ..data import MetricTrainingInstance, Aspect 9 | 10 | 11 | class FeedbackGroundingOutput(BaseModel): 12 | bullet_points: list[Aspect] 13 | 14 | 15 | def 
_load_feedback_grounding_template() -> jinja2.Template: 16 | with resources.files("autolibra_core.templates").joinpath( 17 | "feedback_grounding.j2" 18 | ).open("r") as f: 19 | return jinja2.Template(f.read()) 20 | 21 | 22 | async def feedback_grounding( 23 | instance: MetricTrainingInstance, 24 | client: AsyncAzureOpenAI, 25 | ) -> list[Aspect]: 26 | settings = AutoLibraEvalSettings() 27 | 28 | template = _load_feedback_grounding_template() 29 | 30 | prompt = template.render( 31 | instance=dict( 32 | trajectory=render_training_instance(instance), feedback=instance.feedback 33 | ) 34 | ) 35 | 36 | model = settings.azure_openai_4o_model 37 | assert model 38 | 39 | wait_time = 1 40 | while True: 41 | try: 42 | completion = await client.beta.chat.completions.parse( 43 | model=model, 44 | messages=[ 45 | { 46 | "role": "system", 47 | "content": "Ground the feedback in the behavior.", 48 | }, 49 | {"role": "user", "content": prompt}, 50 | ], 51 | response_format=FeedbackGroundingOutput, 52 | ) 53 | break 54 | except RateLimitError as e: 55 | print(f"Rate limit error: {e}") 56 | await asyncio.sleep(wait_time) 57 | wait_time *= 2 58 | 59 | if not completion.choices[0].message.parsed: 60 | raise ValueError("Failed to parse the response.") 61 | else: 62 | return completion.choices[0].message.parsed.bullet_points 63 | -------------------------------------------------------------------------------- /packages/osw-data/tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 2 | from datetime import datetime 3 | import numpy as np 4 | from pathlib import Path 5 | 6 | 7 | def test_dataset() -> None: 8 | Path("/tmp/data/robot_dataset").mkdir(parents=True, exist_ok=True) 9 | 10 | # Create a new dataset 11 | dataset = MultiAgentDataset( 12 | name="Robot Interaction Dataset", 13 | base_path=Path("/tmp/data/robot_dataset"), 14 | description="Multi-agent robot interaction scenarios", 15 | version="1.0", 16 | ) 17 | 18 | # Define agents for an instance 19 | agents_metadata = { 20 | "robot_1": AgentMetadata( 21 | agent_id="robot_1", 22 | agent_type="manipulator", 23 | capabilities=["grasp", "move"], 24 | parameters={"max_speed": 1.0}, 25 | ), 26 | "robot_2": AgentMetadata( 27 | agent_id="robot_2", 28 | agent_type="mobile_base", 29 | capabilities=["navigate"], 30 | parameters={"max_velocity": 0.5}, 31 | ), 32 | } 33 | 34 | # Create an instance 35 | instance_id = dataset.create_instance( 36 | agents_metadata=agents_metadata, 37 | instance_metadata={"scenario": "collaborative_assembly"}, 38 | ) 39 | 40 | # Add data points for each agent 41 | timestamp = datetime.now() 42 | 43 | # Add observation for robot_1 44 | image_data = np.random.rand(480, 640, 3) 45 | dataset.add_data_point( 46 | instance_id=instance_id, 47 | agent_id="robot_1", 48 | timestamp=timestamp, 49 | point_type=PointType.OBSERVATION, 50 | data=image_data, 51 | media_type=MediaType.IMAGE, 52 | metadata={"camera_id": "cam_1"}, 53 | ) 54 | 55 | # Add action for robot_2 56 | action_data = {"command": "move_to", "position": [1.0, 2.0, 0.0]} 57 | dataset.add_data_point( 58 | instance_id=instance_id, 59 | agent_id="robot_2", 60 | timestamp=timestamp, 61 | point_type=PointType.ACTION, 62 | data=action_data, 63 | media_type=MediaType.JSON, 64 | ) 65 | 66 | # Close the dataset 67 | dataset.close() 68 | -------------------------------------------------------------------------------- /packages/osw-data/tests/test_annotation.py: 
-------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from osw_data.annotation import AnnotationSpan, AnnotationSystem 3 | from pathlib import Path 4 | 5 | 6 | def test_annotation_system() -> None: 7 | # Initialize annotation system 8 | annotation_system = AnnotationSystem( 9 | base_path=Path("/tmp/data/annotations"), 10 | project_name="Robot Behavior Analysis", 11 | description="Annotating robot behaviors and interactions", 12 | annotation_schema={ 13 | "behavior_type": ["cooperative", "competitive", "neutral"], 14 | "success_rating": {"type": "float", "min": 0, "max": 1}, 15 | "comments": "string", 16 | }, 17 | ) 18 | 19 | # Add annotators 20 | annotation_system.add_annotator( 21 | annotator_id="expert1", 22 | name="Dr. Smith", 23 | role="robotics_expert", 24 | expertise_level="expert", 25 | ) 26 | 27 | annotation_system.add_annotator( 28 | annotator_id="expert2", 29 | name="Dr. Jones", 30 | role="hri_researcher", 31 | expertise_level="expert", 32 | ) 33 | 34 | # Add annotations 35 | instance_id = "instance_001" 36 | agent_id = "robot_1" 37 | 38 | # Expert 1's annotation 39 | annotation_system.add_annotation( 40 | instance_id=instance_id, 41 | agent_id=agent_id, 42 | annotator_id="expert1", 43 | content={ 44 | "behavior_type": "cooperative", 45 | "success_rating": 0.85, 46 | "comments": "Robot showed good adaptation to human partner", 47 | }, 48 | span=AnnotationSpan(start_time=datetime.now(), end_time=datetime.now()), 49 | confidence=0.9, 50 | ) 51 | 52 | # Expert 2's annotation 53 | annotation_system.add_annotation( 54 | instance_id=instance_id, 55 | agent_id=agent_id, 56 | annotator_id="expert2", 57 | content={ 58 | "behavior_type": "cooperative", 59 | "success_rating": 0.78, 60 | "comments": "Good cooperation but some delays in responses", 61 | }, 62 | span=AnnotationSpan(start_time=datetime.now(), end_time=datetime.now()), 63 | confidence=0.85, 64 | ) 65 | -------------------------------------------------------------------------------- /src/training/llm_as_a_judge.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from openai import AsyncAzureOpenAI 3 | from osw_data import MultiAgentDataset 4 | from osw_data.metrics import MetricSet 5 | from autolibra_core import ( 6 | run_llm_eval, 7 | ) 8 | from autolibra_core.data import MetricTrainingInstance 9 | from autolibra_core.configs import AutoLibraEvalSettings 10 | 11 | 12 | async def main(dataset_name: str, metric_path: str) -> None: 13 | dataset = MultiAgentDataset( 14 | name="dataset", 15 | base_path=f".data/{dataset_name}", 16 | ) 17 | 18 | metric_set = MetricSet( 19 | name="", 20 | base_path=metric_path, 21 | induced_from=dataset_name, 22 | ) 23 | 24 | settings = AutoLibraEvalSettings() 25 | 26 | client = AsyncAzureOpenAI( 27 | api_key=settings.azure_api_key, 28 | api_version="2024-12-01-preview", 29 | azure_endpoint=settings.azure_endpoint, 30 | ) 31 | 32 | metric_training_instances: list[MetricTrainingInstance] = [] 33 | 34 | for instances in dataset.list_instances(): 35 | instance = dataset.get_instance_metadata(instances) 36 | for agent_id in instance.agents: 37 | metric_training_instances.append( 38 | MetricTrainingInstance( 39 | task=instance.metadata["task"] 40 | if "task" in instance.metadata 41 | else "Task is described in the trajectory observation", 42 | agent_id=agent_id, 43 | trajectory=dataset.get_trajectory(instances, agent_id), 44 | feedback="", 45 | ) 46 | ) 47 | 48 | eval_results = await 
run_llm_eval( 49 | metric_training_instances, list(metric_set.metrics.values()), client=client 50 | ) 51 | 52 | with open("llm_eval_results.jsonl", "w") as f: 53 | for eval_result in eval_results: 54 | f.write(eval_result.model_dump_json()) 55 | f.write("\n") 56 | 57 | 58 | if __name__ == "__main__": 59 | import argparse 60 | 61 | parser = argparse.ArgumentParser(description="LLM-as-a-judge evaluation") 62 | parser.add_argument( 63 | "--filename", 64 | type=str, 65 | required=True, 66 | help="The name of the folder containing the data for the given run, including the date subfolder", 67 | ) 68 | 69 | filename = parser.parse_args().filename 70 | filename_no_date = filename.split("/")[0] 71 | 72 | asyncio.run( 73 | main( 74 | dataset_name=filename_no_date, 75 | metric_path=f".data/metrics/{filename}", 76 | ), 77 | ) 78 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/templates/coverage_evaluation_v2.j2: -------------------------------------------------------------------------------- 1 | You are an expert in matching quantitative metrics to qualitative feedback. You are asked to evaluate the coverage of the metrics on the feedback. 2 | 3 | Context: 4 | Review the following metrics and agent behaviors. Match the corresponding bullet points to the metrics. 5 | 6 | Positive metrics: 7 | {% for metric in positive_metrics%} 8 | 9 | Metric {{ loop.index }} 10 | {{ metric.name}}: {{ metric.explanation }} 11 | 12 | {% endfor %} 13 | 14 | Negative metrics: 15 | 16 | {% for metric in negative_metrics%} 17 | 18 | Metric {{ loop.index }} 19 | {{ metric.name}}: {{ metric.explanation }} 20 | 21 | {% endfor %} 22 | 23 | Behaviors and feedback bullet points: 24 | 25 | {% for behavior_feedback in behavior_feedback_list %} 26 | 27 | Bullet point: {{ loop.index }} 28 | Behavior: {{ behavior_feedback.behavior }} 29 | Feedback: {{ behavior_feedback.feedback }} 30 | 31 | {% endfor %} 32 | 33 | 34 | Instructions: 35 | 1. Please match at most ONE bullet point to each METRIC. To perform a match, the bullet point must EXACTLY match the meaning of the metric. 36 | 2. Do NOT match the same bullet point to multiple metrics. 37 | 3. Here are examples of VALID matches: 38 | - Negative metrics 39 | - UI Element or Page Existence Correctness 40 | - This metric measures whether the agent selects valid user-interface elements or navigates to valid pages. Good examples show the agent accurately identifying existing UI elements or correct URLs. Bad examples show attempts to click or load elements/pages that do not exist. 41 | - Behavior 42 | - The agent tried to click an element that is not visible after scrolling 43 | - Positive metrics 44 | - Location Query Correctness 45 | - This metric measures whether the agent correctly identifies the location of a query. Good examples show the agent accurately identifying the location of a query. Bad examples show the agent incorrectly identifying the location of a query. 46 | - Behavior 47 | - The agent input the correct location in the search bar 48 | 4. If there are no valid matches, please EXCLUDE the metric from the list. 49 | 5. Please provide a 1-sentence RATIONALE for your decision for any matches. 50 | 6. Please respond with a list of each metric and the item it matches. 51 | 7. Note that the bullet point IDs should be smaller than {{ behavior_feedback_list | length }}. 52 | 8. Note that for positive metrics (`is_positive: True`), the metric id should be smaller than {{ positive_metrics | length }}. 53 | 9.
Note that for negative metrics (`is_positive: False`), the metric id should be smaller than {{ negative_metrics | length }}. 54 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/evaluators/llm_evaluator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from importlib import resources 3 | import jinja2 4 | from openai import AsyncAzureOpenAI, RateLimitError 5 | from osw_data.metrics import Metric 6 | from autolibra_core.configs import AutoLibraEvalSettings 7 | from ..data import MetricTrainingInstance 8 | from ..utils import render_training_instance 9 | from pydantic import BaseModel, ValidationError, create_model, Field 10 | from typing import Literal 11 | 12 | 13 | def _make_snake_case(name: str) -> str: 14 | return name.lower().replace(" ", "_") 15 | 16 | 17 | def _make_evaluation_result_class(metrics: list[Metric]) -> type[BaseModel]: 18 | eval_result = create_model( # type: ignore[call-overload] 19 | "EvaluationResult", 20 | **{ 21 | _make_snake_case(metric.name) + "_reasoning": ( 22 | str, 23 | Field(description=metric.explanation, alias=metric.name + " Reasoning"), 24 | ) 25 | for metric in metrics 26 | }, 27 | **{ 28 | _make_snake_case(metric.name): ( 29 | Literal[-1, 0, 1], 30 | Field(description=metric.explanation, alias=metric.name), 31 | ) 32 | for metric in metrics 33 | }, 34 | ) 35 | return eval_result # type: ignore[no-any-return] 36 | 37 | 38 | def _load_llm_eval_template() -> jinja2.Template: 39 | with resources.files("autolibra_core.templates").joinpath( 40 | "llm_as_a_judge_evaluator_v3.j2" 41 | ).open("r") as f: 42 | return jinja2.Template(f.read()) 43 | 44 | 45 | semaphore = asyncio.Semaphore(20)  # Limit to 20 concurrent tasks 46 | 47 | 48 | async def eval_instance( 49 | instance: MetricTrainingInstance, metrics: list[Metric], client: AsyncAzureOpenAI 50 | ) -> BaseModel: 51 | settings = AutoLibraEvalSettings() 52 | template = _load_llm_eval_template() 53 | 54 | prompt = template.render( 55 | trajectory=render_training_instance(instance), 56 | metrics=metrics, 57 | ) 58 | 59 | model = settings.azure_openai_o3_model 60 | assert model 61 | 62 | async with semaphore: 63 | wait_time = 1 64 | while True: 65 | try: 66 | completion = await client.beta.chat.completions.parse( 67 | model=model, 68 | messages=[ 69 | {"role": "system", "content": "Evaluate the trajectory."}, 70 | {"role": "user", "content": prompt}, 71 | ], 72 | response_format=_make_evaluation_result_class(metrics), 73 | reasoning_effort="high", 74 | ) 75 | 76 | if not completion.choices[0].message.parsed: 77 | print("Failed to parse the response. 
Retrying.") 78 | await asyncio.sleep(wait_time) 79 | continue 80 | break 81 | except (ValidationError, RateLimitError) as e: 82 | print(e) 83 | await asyncio.sleep(wait_time) 84 | wait_time *= 2 85 | 86 | if not completion.choices[0].message.parsed: 87 | raise ValueError("Failed to parse the response.") 88 | else: 89 | return completion.choices[0].message.parsed 90 | 91 | 92 | async def run_llm_eval( 93 | instances: list[MetricTrainingInstance], 94 | metrics: list[Metric], 95 | client: AsyncAzureOpenAI, 96 | ) -> list[BaseModel]: 97 | eval_results = await asyncio.gather( 98 | *[eval_instance(instance, metrics, client) for instance in instances] 99 | ) 100 | 101 | return eval_results 102 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/metrics.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pathlib import Path 3 | from typing import Annotated 4 | from pydantic import AfterValidator, BaseModel, Field 5 | 6 | 7 | class MetricSetMetadata(BaseModel): 8 | created_at: datetime = Field(default_factory=datetime.now) 9 | name: str 10 | metric_names: list[str] = Field(default_factory=list) 11 | induced_from: str | None = Field(default_factory=lambda: None) 12 | version: str | None = Field(default_factory=lambda: None) 13 | 14 | 15 | class Metric(BaseModel): 16 | good_behaviors: list[str] = Field(default_factory=list) 17 | bad_behaviors: list[str] = Field(default_factory=list) 18 | explanation: str 19 | name: Annotated[str, AfterValidator(lambda x: x.replace("/", "_"))] 20 | 21 | 22 | class MetricSet: 23 | """ 24 | A set of metrics for evaluating trajectories 25 | """ 26 | 27 | def __init__( 28 | self, 29 | name: str, 30 | base_path: Path | str, 31 | induced_from: str, 32 | version: str | None = None, 33 | ): 34 | self.base_path = Path(base_path) 35 | self.metrics_path = self.base_path / "metrics" 36 | self.metadata_path = self.base_path / "metadata.json" 37 | 38 | # Initialize directory structure 39 | self.base_path.mkdir(parents=True, exist_ok=True) 40 | self.metrics_path.mkdir(exist_ok=True) 41 | self.metrics: dict[str, Metric] = {} 42 | 43 | # Initialize or load dataset metadata 44 | self.metadata = self._init_metadata(name, induced_from, version) 45 | self.load_metrics() 46 | 47 | def _init_metadata( 48 | self, name: str, induced_from: str, version: str | None 49 | ) -> MetricSetMetadata: 50 | """Initialize or load dataset metadata""" 51 | if self.metadata_path.exists(): 52 | with open(self.metadata_path, "r") as f: 53 | return MetricSetMetadata.model_validate_json(f.read()) 54 | else: 55 | metadata = MetricSetMetadata( 56 | name=name, induced_from=induced_from, version=version 57 | ) 58 | self._save_metadata(metadata) 59 | return metadata 60 | 61 | def _save_metadata(self, metadata: MetricSetMetadata) -> None: 62 | with open(self.metadata_path, "w") as f: 63 | f.write(metadata.model_dump_json(indent=2)) 64 | 65 | def _save_metrics( 66 | self, 67 | ) -> None: 68 | for name, metric in self.metrics.items(): 69 | metric_path = self.metrics_path / f"{name}.json" 70 | with open(metric_path, "w") as f: 71 | f.write(metric.model_dump_json(indent=2)) 72 | 73 | def load_metrics(self) -> None: 74 | for metric in self.metadata.metric_names: 75 | metric_path = self.metrics_path / f"{metric}.json" 76 | with open(metric_path, "r") as f: 77 | self.metrics[metric] = Metric.model_validate_json(f.read()) 78 | 79 | def add_metrics(self, metrics: list[Metric]) -> None: 80 
| for metric in metrics: 81 | if metric.name in self.metrics: 82 | raise ValueError(f"Metric with name {metric.name} already exists") 83 | self.metrics[metric.name] = metric 84 | metric_path = self.metrics_path / f"{metric.name}.json" 85 | with open(metric_path, "w") as f: 86 | f.write(metric.model_dump_json(indent=2)) 87 | 88 | self.metadata.metric_names = list(self.metrics.keys()) 89 | self._save_metadata(self.metadata) 90 | 91 | def get_metric(self, name: str) -> Metric: 92 | if name not in self.metrics: 93 | raise ValueError(f"Metric with name {name} does not exist") 94 | metric_path = self.metrics_path / f"{name}.json" 95 | with open(metric_path, "r") as f: 96 | return Metric.model_validate_json(f.read()) 97 | -------------------------------------------------------------------------------- /src/training/grounding.py: -------------------------------------------------------------------------------- 1 | # Iterative Metric Creation 2 | # Input: instances, trajectories, agents, and feedbacks 3 | # Output: metrics 4 | # Algorithm: 5 | # metrics = propose_metrics(train_trajectories, train_feedbacks) 6 | # while coverage improves 7 | # eval_results = llm_evaluator(train_trajectories, metrics) 8 | # uncovered_feedbacks, coverage = missing_points_detection(train_trajectories, eval_results) 9 | # new_metrics = propose_metrics(train_trajectories, uncovered_feedbacks) 10 | # metrics += new_metrics 11 | 12 | import asyncio 13 | from datetime import datetime 14 | from openai import AsyncAzureOpenAI 15 | from osw_data import MultiAgentDataset 16 | from osw_data.annotation import AnnotationSystem 17 | from osw_data.metrics import MetricSet 18 | from autolibra_core import ( 19 | MetricTrainingInstance, 20 | feedback_grounding, 21 | behavior_clustering, 22 | ) 23 | from autolibra_core.configs import AutoLibraEvalSettings 24 | 25 | 26 | async def main(dataset_name: str) -> None: 27 | settings = AutoLibraEvalSettings() 28 | 29 | client = AsyncAzureOpenAI( 30 | api_key=settings.azure_api_key, 31 | api_version="2024-12-01-preview", 32 | azure_endpoint=settings.azure_endpoint, 33 | ) 34 | 35 | dataset = MultiAgentDataset( 36 | name="dataset", 37 | base_path=f".data/{dataset_name}", 38 | ) 39 | 40 | annotation_system = AnnotationSystem( 41 | base_path=f".data/annotations/{dataset_name}", 42 | ) 43 | 44 | metric_training_instances: list[MetricTrainingInstance] = [] 45 | 46 | for instances in dataset.list_instances(): 47 | instance = dataset.get_instance_metadata(instances) 48 | for agent_id in instance.agents: 49 | trajectory_annotations = annotation_system.get_trajectory_annotations( 50 | instance_id=instances, agent_id=agent_id 51 | ) 52 | for annotation in trajectory_annotations.annotations: 53 | metric_training_instances.append( 54 | MetricTrainingInstance( 55 | task=instance.metadata["task"] 56 | if "task" in instance.metadata 57 | else "Task is described in the trajectory observation", 58 | agent_id=agent_id, 59 | trajectory=dataset.get_trajectory(instances, agent_id), 60 | feedback=annotation.content["feedback"], 61 | ) 62 | ) 63 | 64 | feedback_grounding_results = await asyncio.gather( 65 | *[ 66 | feedback_grounding(instance, client=client) 67 | for instance in metric_training_instances 68 | ] 69 | ) 70 | 71 | with open("feedback_grounding_results.jsonl", "w") as f: 72 | for feedback_grounding_result in feedback_grounding_results: 73 | for aspect in feedback_grounding_result: 74 | f.write(aspect.model_dump_json(indent=2)) 75 | f.write("\n") 76 | f.write("\n") 77 | 78 | aspects = sum( 79 | [ 80 | 
feedback_grounding_result 81 | for feedback_grounding_result in feedback_grounding_results 82 | ], 83 | [], 84 | ) 85 | 86 | behavior_clustering_results = await behavior_clustering( 87 | aspects=aspects, client=client 88 | ) 89 | 90 | metric_set = MetricSet( 91 | name="Derived Metrics", 92 | base_path=f".data/metrics/{dataset_name}/{datetime.now().strftime('%m_%d_%H_%M')}", 93 | induced_from=dataset_name, 94 | version="0.1", 95 | ) 96 | 97 | metric_set.add_metrics(behavior_clustering_results.metrics) 98 | 99 | 100 | if __name__ == "__main__": 101 | import argparse 102 | 103 | parser = argparse.ArgumentParser(description="Balrog Converter") 104 | parser.add_argument( 105 | "--filename", 106 | type=str, 107 | required=True, 108 | help="The name of the folder containing the data for the given run", 109 | ) 110 | 111 | filename = parser.parse_args().filename 112 | 113 | asyncio.run(main(filename)) 114 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | 173 | .data 174 | .vscode 175 | -------------------------------------------------------------------------------- /src/training/llm_eval.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from openai import AsyncAzureOpenAI 3 | from osw_data import MultiAgentDataset 4 | from osw_data.annotation import AnnotationSystem 5 | from osw_data.metrics import MetricSet 6 | from autolibra_core import ( 7 | run_llm_eval, 8 | ) 9 | from autolibra_core.data import MetricTrainingInstance 10 | from autolibra_core.configs import AutoLibraEvalSettings 11 | from autolibra_core.data.primitives import Trait 12 | from autolibra_core.evaluators.coverage_evaluator import run_coverage_eval 13 | from autolibra_core.evaluators.llm_evaluator import _make_snake_case 14 | 15 | 16 | async def main(dataset_name: str, metric_path: str) -> None: 17 | dataset = MultiAgentDataset( 18 | name="dataset", 19 | base_path=f".data/{dataset_name}", 20 | ) 21 | 22 | annotation_system = AnnotationSystem( 23 | base_path=f".data/annotations/{dataset_name}", 24 | ) 25 | 26 | metric_set = MetricSet( 27 | name="", 28 | base_path=metric_path, 29 | induced_from=dataset_name, 30 | ) 31 | 32 | settings = AutoLibraEvalSettings() 33 | 34 | client = AsyncAzureOpenAI( 35 | api_key=settings.azure_api_key, 36 | api_version="2024-12-01-preview", 37 | azure_endpoint=settings.azure_endpoint, 38 | ) 39 | 40 | metric_training_instances: list[MetricTrainingInstance] = [] 41 | 42 | for instances in dataset.list_instances(): 43 | instance = dataset.get_instance_metadata(instances) 44 | for agent_id in instance.agents: 45 | trajectory_annotations = annotation_system.get_trajectory_annotations( 46 | instance_id=instances, agent_id=agent_id 47 | ) 48 | for annotation in trajectory_annotations.annotations: 49 | metric_training_instances.append( 50 | MetricTrainingInstance( 51 | task=instance.metadata["task"] 52 | if "task" in instance.metadata 53 | else "Task is described in the trajectory observation", 54 | agent_id=agent_id, 55 | trajectory=dataset.get_trajectory(instances, agent_id), 56 | feedback=str(annotation.content), 57 | ) 58 | ) 59 | 60 | eval_results = await run_llm_eval( 61 | metric_training_instances, list(metric_set.metrics.values()), client=client 62 | ) 63 | 64 | eval_scoring = [ 65 | [ 66 | int(getattr(eval_result, _make_snake_case(metric.name), 0)) 67 | for metric in metric_set.metrics.values() 68 | ] 69 | for eval_result in eval_results 70 | ] 71 | 72 | with open("llm_eval_results.jsonl", "w") as f: 73 | for eval_result in eval_results: 74 | f.write(eval_result.model_dump_json()) 75 | f.write("\n") 76 | 77 | traits = [ 78 | [ 79 | Trait( 80 | metric=metric, 81 | rating=score, 82 | ) 83 | for metric, score in zip( 84 | metric_set.metrics.values(), eval_scoring_for_instance 85 | ) 86 | ] 87 | for eval_scoring_for_instance in eval_scoring 88 | ] 89 | 90 | coverage_results = await run_coverage_eval( 91 | instance_traits=traits, 92 | instances=metric_training_instances, 93 | client=client, 94 | ) 95 | 96 | covered, total = 0, 0 97 | redundant, total_traits = 0, 0 98 | 99 | for coverage_result in coverage_results: 100 | covered += coverage_result[0] 101 | total += coverage_result[1] 102 | redundant += coverage_result[2] 103 | total_traits += coverage_result[3] 104 | 105 | print(f"Coverage: {covered}/{total}") 106 | print(f"Redundancy: {redundant}/{total_traits}") 107 | 108 | 109 | if __name__ == "__main__": 110 | import 
argparse 111 | 112 | parser = argparse.ArgumentParser(description="Balrog Converter") 113 | parser.add_argument( 114 | "--filename", 115 | type=str, 116 | required=True, 117 | help="The name of the folder containing the data for the given run, including the date subfolder", 118 | ) 119 | 120 | filename = parser.parse_args().filename 121 | filename_no_date = filename.split("/")[0] 122 | 123 | asyncio.run( 124 | main( 125 | dataset_name=filename_no_date, 126 | metric_path=f".data/metrics/{filename}", 127 | ), 128 | ) 129 | -------------------------------------------------------------------------------- /packages/autolibra-core/tests/test_coverage.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from openai import AsyncAzureOpenAI 3 | from osw_data.metrics import Metric 4 | from autolibra_core.configs import AutoLibraEvalSettings 5 | from autolibra_core.evaluators.coverage_evaluator import ( 6 | match_aspects_and_traits, 7 | ) 8 | from autolibra_core.data import Aspect 9 | import pytest 10 | 11 | """ 12 | {"feedback":"\"The agent initially tried to find the contributors through the Commit history or Graph tab which are reasonable guesses, but the Contributor tab is the most straightforward choice.","behavior":"\"The agent went step by step through different sections (Commits, Graph) before navigating to the Contributors tab.\"","is_positive":true} 13 | {"feedback":"\"Anyway, the agent made the right choice in the end which is good although wasting a few more steps.\"","behavior":"\"The agent ultimately navigated to the Contributors tab and successfully identified the top contributor.\"","is_positive":true} 14 | """ 15 | 16 | """ 17 | {"good_behaviors":["Behavior #3: The agent typed 'white desk' into the correct search box (ID 172) and pressed Search.","Behavior #4: The agent clicked the 'Add to Wish List' button (ID 5919) for the correct item.","Behavior #39: The agent typed 'white desk' into the correct search bar (ID 1585).","Behavior #40: The agent clicked the 'Add to Wish List' button (ID 6684) for the correct product."],"bad_behaviors":["Behavior #1: The agent clicked on a nonexistent element (ID 1488).","Behavior #6: The agent tried typing into the wrong text areas (IDs 2169 and 3421) instead of the actual textbox.","Behavior #8: The agent clicked on a nonexistent element (ID 1605)."],"explanation":"Measures whether the agent targets valid page elements for its actions. 
Good behaviors involve using the correct element ID or selector for the intended action, whereas bad behaviors show the agent clicking or typing into the wrong or nonexistent elements.","name":"Element Interaction Correctness"} 18 | {"good_behaviors":["Behavior #20: The agent found the website URL of the Carnegie Museum of Art successfully.","Behavior #22: The agent identified the correct zip code (06516) confirming the Yale University location.","Behavior #24: The agent began searching for the Carnegie Museum of Art in Pittsburgh and proceeded correctly.","Behavior #25: The final action where the agent indeed outputs the museum’s website at the end of the trajectory.","Behavior #37: The agent identified the top contributor’s email address accurately."],"bad_behaviors":["Behavior #14: The agent mistakenly reported the rating of the wrong product (iPhone Cable) instead of 'Lightning to 3.5mm Adapter.'","Behavior #15: The agent did not succeed in locating the intended 'Canon Photo Printer' listing."],"explanation":"Covers how precisely the agent retrieves or pinpoints the correct item or piece of data requested. Positive instances yield exactly the needed information, whereas negative ones incorrectly match or fail to locate the target.","name":"Information Discovery Accuracy"} 19 | """ 20 | 21 | _aspects = [ 22 | Aspect( 23 | feedback="The agent initially tried to find the contributors through the Commit history or Graph tab which are reasonable guesses, but the Contributor tab is the most straightforward choice.", 24 | behavior="The agent went step by step through different sections (Commits, Graph) before navigating to the Contributors tab.", 25 | is_positive=True, 26 | ), 27 | Aspect( 28 | feedback="Anyway, the agent made the right choice in the end which is good although wasting a few more steps.", 29 | behavior="The agent ultimately navigated to the Contributors tab and successfully identified the top contributor.", 30 | is_positive=True, 31 | ), 32 | ] 33 | 34 | _traits = [ 35 | Metric( 36 | name="Element Interaction Correctness", 37 | explanation="Measures whether the agent targets valid page elements for its actions. Good behaviors involve using the correct element ID or selector for the intended action, whereas bad behaviors show the agent clicking or typing into the wrong or nonexistent elements.", 38 | ), 39 | Metric( 40 | name="Information Discovery Accuracy", 41 | explanation="Covers how precisely the agent retrieves or pinpoints the correct item or piece of data requested. 
Positive instances yield exactly the needed information, whereas negative ones incorrectly match or fail to locate the target.", 42 | ), 43 | ] 44 | 45 | 46 | @pytest.mark.asyncio 47 | async def test_match_aspects_and_traits() -> None: 48 | settings = AutoLibraEvalSettings() 49 | 50 | (_aspects, _traits) = pickle.load( 51 | open( 52 | "/Users/hao/autolibra-eval/packages/autolibra-core/tests/positive_aspects_traits.pkl", 53 | "rb", 54 | ) 55 | ) 56 | 57 | client = AsyncAzureOpenAI( 58 | api_key=settings.azure_api_key, 59 | api_version="2024-10-21", 60 | azure_endpoint=settings.azure_endpoint, 61 | ) 62 | 63 | _ = await match_aspects_and_traits( 64 | client=client, 65 | aspects=_aspects, 66 | traits=_traits, 67 | ) 68 | -------------------------------------------------------------------------------- /src/plot/meta-eval.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from matplotlib.gridspec import GridSpec 4 | 5 | # Example data for all four datasets 6 | # You should replace these with your actual data for each dataset 7 | cogym_data = [ 8 | [0.47, 0.50, 2], 9 | [0.59, 0.63, 3], 10 | [0.76, 0.83, 4], 11 | [0.74, 0.80, 4], 12 | [0.70, 0.67, 4], 13 | [0.77, 0.85, 5], 14 | [0.74, 0.77, 6], 15 | [0.75, 0.83, 7], 16 | [0.72, 0.85, 8], 17 | [0.73, 0.85, 9], 18 | [0.70, 0.87, 10], 19 | [0.68, 0.88, 11], 20 | [0.70, 0.88, 12], 21 | ] 22 | 23 | # Example data for the other datasets - replace with your actual data 24 | sotopia_data = [ 25 | [0.31, 0.40, 2], 26 | [0.41, 0.62, 3], 27 | [0.47, 0.66, 4], 28 | [0.50, 0.70, 5], 29 | [0.57, 0.79, 6], 30 | [0.57, 0.80, 7], 31 | [0.58, 0.81, 8], 32 | [0.60, 0.85, 9], 33 | [0.60, 0.92, 10], 34 | [0.58, 0.91, 11], 35 | [0.54, 0.86, 12], 36 | ] 37 | 38 | webarena_data = [ 39 | [0.45, 0.48, 2], 40 | [0.58, 0.60, 3], 41 | [0.67, 0.72, 4], 42 | [0.85, 0.66, 5], 43 | [0.90, 0.70, 6], 44 | [0.93, 0.74, 6], 45 | [0.91, 0.73, 7], 46 | [0.88, 0.77, 8], 47 | [0.87, 0.75, 9], 48 | [0.84, 0.72, 10], 49 | [0.89, 0.80, 11], 50 | [0.86, 0.82, 12], 51 | ] 52 | 53 | webvoyager_data = [ 54 | [0.32, 0.30, 2], 55 | [0.51, 0.47, 3], 56 | [0.60, 0.50, 4], 57 | [0.76, 0.78, 5], 58 | [0.88, 0.72, 6], 59 | [0.91, 0.72, 7], 60 | [0.92, 0.77, 8], 61 | [0.91, 0.78, 9], 62 | [0.92, 0.80, 10], 63 | [0.93, 0.83, 11], 64 | [0.93, 0.85, 12], 65 | ] 66 | 67 | # Convert to numpy arrays 68 | datasets = { 69 | "CoGym": np.array(cogym_data), 70 | "Sotopia": np.array(sotopia_data), 71 | "WebArena": np.array(webarena_data), 72 | "WebVoyager": np.array(webvoyager_data), 73 | } 74 | 75 | # Create the figure and grid 76 | fig = plt.figure(figsize=(6, 5)) 77 | gs = GridSpec(2, 2, figure=fig, wspace=0.3, hspace=0.3) 78 | 79 | # Determine global min and max values for consistent axes across all plots 80 | all_coverage = np.concatenate([d[:, 0] for d in datasets.values()]) 81 | all_redundancy = np.concatenate([d[:, 1] for d in datasets.values()]) 82 | all_n_metrics = np.concatenate([d[:, 2] for d in datasets.values()]) 83 | 84 | min_coverage, max_coverage = np.min(all_coverage) - 0.05, np.max(all_coverage) + 0.05 85 | min_redundancy, max_redundancy = ( 86 | np.min(all_redundancy) - 0.05, 87 | np.max(all_redundancy) + 0.05, 88 | ) 89 | min_n_metrics, max_n_metrics = np.min(all_n_metrics), np.max(all_n_metrics) 90 | 91 | # Create a scatter plot for each dataset 92 | axes = [] 93 | scatters = [] 94 | 95 | stars = { 96 | "CoGym": [0.75, 0.72], 97 | "Sotopia": [0.58, 0.85], 98 | "WebArena": [0.82, 0.8], 99 | 
"WebVoyager": [0.83, 0.76], 100 | } 101 | 102 | square = { 103 | "CoGym": [0.47, 0.84], 104 | "Sotopia": [0.53, 0.79], 105 | "WebArena": [0.75, 0.88], 106 | "WebVoyager": [0.76, 0.91], 107 | } 108 | 109 | positions = [(0, 0), (0, 1), (1, 0), (1, 1)] 110 | for i, (dataset_name, dataset) in enumerate(datasets.items()): 111 | row, col = positions[i] 112 | ax = fig.add_subplot(gs[row, col]) 113 | axes.append(ax) 114 | 115 | coverage = dataset[:, 0] 116 | redundancy = dataset[:, 1] 117 | n_metrics = dataset[:, 2] 118 | 119 | scatter = ax.scatter( 120 | redundancy, 121 | coverage, 122 | c=n_metrics, 123 | cmap="PiYG", 124 | s=100, 125 | alpha=0.8, 126 | edgecolors="k", 127 | vmin=min_n_metrics, 128 | vmax=max_n_metrics, 129 | ) 130 | 131 | star_y, star_x = stars[dataset_name] 132 | ax.plot( 133 | star_x, 134 | star_y, 135 | "*", 136 | color="yellow", 137 | markersize=15, 138 | markeredgecolor="black", 139 | markeredgewidth=1.0, 140 | ) 141 | scatters.append(scatter) 142 | 143 | square_y, square_x = square[dataset_name] 144 | ax.plot( 145 | square_x, 146 | square_y, 147 | "s", 148 | color="blue", 149 | markersize=10, 150 | markeredgecolor="black", 151 | markeredgewidth=1.0, 152 | ) 153 | 154 | # Set consistent axis limits for all plots 155 | ax.set_xlim(min_redundancy, max_redundancy) 156 | ax.set_ylim(min_coverage, max_coverage) 157 | 158 | # Add labels and title, but remove redundancy label from first row 159 | # and coverage label from second column 160 | if row == 1: # Only add x-axis label for bottom row 161 | ax.set_xlabel("Redundancy", fontsize=12) 162 | if col == 0: # Only add y-axis label for left column 163 | ax.set_ylabel("Coverage", fontsize=12) 164 | ax.set_title(dataset_name, fontsize=14) 165 | 166 | # Add grid 167 | ax.grid(True, linestyle="--", alpha=0.7) 168 | 169 | # Adjust layout before adding colorbar 170 | plt.tight_layout(rect=(0, 0, 0.9, 1)) # Make room for the colorbar 171 | 172 | # Add a common colorbar to the figure 173 | cbar_ax = fig.add_axes((0.92, 0.15, 0.02, 0.7)) # [left, bottom, width, height] 174 | cbar = fig.colorbar(scatters[0], cax=cbar_ax) 175 | 176 | # Position N label under the colorbar 177 | cbar.ax.set_xlabel("$N$", labelpad=5) 178 | cbar.ax.xaxis.set_label_position("bottom") 179 | 180 | # Save the figure 181 | plt.savefig("four_datasets_grid.pdf", bbox_inches="tight") 182 | plt.savefig("four_datasets_grid.png", dpi=300, bbox_inches="tight") 183 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/sotopia.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pathlib import Path 3 | 4 | from pydantic import BaseModel, Field 5 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 6 | from .base import BaseConverter, run_converter 7 | 8 | from huggingface_hub import hf_hub_download 9 | 10 | 11 | class TwoAgentEpisodeWithScenarioBackgroundGoals(BaseModel): 12 | episode_id: str = Field() 13 | environment_id: str = Field() 14 | agent_ids: list[str] = Field() 15 | experiment_tag: str = Field() 16 | experiment_model_name_pairs: list[str] = Field() 17 | raw_messages: list[list[tuple[str, str, str]]] = Field() 18 | raw_rewards: list[tuple[float, dict[str, float]] | float] = Field() 19 | raw_rewards_prompt: str = Field() 20 | scenario: str = Field() 21 | codename: str = Field() 22 | agents_background: dict[str, str] = Field() 23 | social_goals: dict[str, str] = Field() 24 | social_interactions: str = 
Field() 25 | reasoning: str = Field() 26 | rewards: list[dict[str, float]] = Field() 27 | 28 | 29 | class SotopiaConverter(BaseConverter): 30 | def __init__(self, output_path: Path, source_path: Path): 31 | super().__init__(output_path, source_path) 32 | 33 | def download_data(self) -> None: 34 | self.source_path.mkdir(parents=True, exist_ok=True) 35 | 36 | # Download trajectory file 37 | if not (self.source_path / "sotopia_episodes_v1_hf.jsonl").exists(): 38 | hf_hub_download( 39 | repo_id="cmu-lti/sotopia", 40 | filename="sotopia_episodes_v1_hf.jsonl", 41 | repo_type="dataset", 42 | local_dir=self.source_path, 43 | ) 44 | 45 | def convert_to_dataset(self) -> None: 46 | """Convert entire Sotopia dataset""" 47 | self.logger.info("Converting Sotopia dataset") 48 | 49 | dataset = MultiAgentDataset( 50 | name="Sotopia Interaction", 51 | base_path=self.output_path, 52 | description="Sotopia dialog interactions", 53 | ) 54 | 55 | with open(self.source_path / "sotopia_episodes_v1.jsonl", "r") as f: 56 | for line in f: 57 | episode = ( 58 | TwoAgentEpisodeWithScenarioBackgroundGoals.model_validate_json(line) 59 | ) 60 | 61 | agent_names = episode.agents_background.keys() 62 | agent_backgrounds = episode.agents_background 63 | models = episode.experiment_model_name_pairs 64 | agents_metadata = {} 65 | for agent_name in agent_names: 66 | agents_metadata[agent_name] = AgentMetadata( 67 | agent_id=agent_name, 68 | agent_type="sotopia_agent", 69 | capabilities=[ 70 | "speak", 71 | "non-verbal communication", 72 | "physical actions", 73 | ], 74 | parameters={"background": agent_backgrounds[agent_name]}, 75 | ) 76 | instance_id = episode.episode_id 77 | 78 | instance_metadata = { 79 | "scenario": episode.scenario, 80 | "experiment_tag": episode.experiment_tag, 81 | "models": models, 82 | "rewards": episode.rewards, 83 | } 84 | if models != ["gpt-4", "gpt-4", "gpt-4"]: 85 | self.logger.info( 86 | f"Skipping instance {instance_id} because of model mismatch" 87 | ) 88 | continue 89 | 90 | if any( 91 | rewards["overall_score"] < 1.6 for rewards in episode.rewards 92 | ): 93 | self.logger.info( 94 | f"Skipping instance {instance_id} because of low reward" 95 | ) 96 | continue 97 | 98 | instance_id = dataset.create_instance( 99 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 100 | ) 101 | 102 | for turn in episode.raw_messages: 103 | for from_agent, to_agent, message in turn: 104 | timestamp = datetime.datetime.now() 105 | action_timestamp = datetime.datetime.now() 106 | if from_agent == "Environment": 107 | dataset.add_data_point( 108 | instance_id=instance_id, 109 | agent_id=to_agent, 110 | point_type=PointType.OBSERVATION, 111 | data={"content": message}, 112 | media_type=MediaType.JSON, 113 | timestamp=timestamp, 114 | ) 115 | elif message != "did nothing": 116 | dataset.add_data_point( 117 | instance_id=instance_id, 118 | agent_id=from_agent, 119 | point_type=PointType.ACTION, 120 | data={"content": message}, 121 | media_type=MediaType.JSON, 122 | timestamp=action_timestamp, 123 | ) 124 | 125 | dataset.close() 126 | 127 | 128 | if __name__ == "__main__": 129 | source_path = Path(".data/raw/sotopia") 130 | output_path = Path(".data/sotopia") 131 | 132 | run_converter(SotopiaConverter, output_path, source_path) 133 | -------------------------------------------------------------------------------- /src/training/iterative.py: -------------------------------------------------------------------------------- 1 | # Iterative Metric Creation 2 | # Input: instances, 
trajectories, agents, and feedbacks 3 | # Output: metrics 4 | # Algorithm: 5 | # metrics = propose_metrics(train_trajectories, train_feedbacks) 6 | # while coverage improves 7 | # eval_results = llm_evaluator(train_trajectories, metrics) 8 | # uncovered_feedbacks, coverage = missing_points_detection(train_trajectories, eval_results) 9 | # new_metrics = propose_metrics(train_trajectories, uncovered_feedbacks) 10 | # metrics += new_metrics 11 | 12 | import asyncio 13 | from datetime import datetime 14 | from openai import AsyncAzureOpenAI 15 | from osw_data import Metric, MultiAgentDataset, MetricSet 16 | from osw_data.annotation import AnnotationSystem 17 | from autolibra_core import run_llm_eval, behavior_clustering, feedback_grounding 18 | from autolibra_core.data import MetricTrainingInstance, Trait 19 | from autolibra_core.configs import AutoLibraEvalSettings 20 | from autolibra_core.evaluators.coverage_evaluator import run_coverage_eval 21 | from autolibra_core.evaluators.llm_evaluator import _make_snake_case 22 | import logfire 23 | 24 | 25 | async def iterative_metric_creation(dataset_name: str) -> list[Metric]: 26 | settings = AutoLibraEvalSettings() 27 | 28 | dataset = MultiAgentDataset( 29 | name="dataset", 30 | base_path=f".data/{dataset_name}", 31 | ) 32 | 33 | annotation_system = AnnotationSystem( 34 | base_path=f".data/annotations/{dataset_name}", 35 | ) 36 | 37 | metric_training_instances: list[MetricTrainingInstance] = [] 38 | 39 | for instances in dataset.list_instances(): 40 | instance = dataset.get_instance_metadata(instances) 41 | for agent_id in instance.agents: 42 | trajectory_annotations = annotation_system.get_trajectory_annotations( 43 | instance_id=instances, agent_id=agent_id 44 | ) 45 | for annotation in trajectory_annotations.annotations: 46 | metric_training_instances.append( 47 | MetricTrainingInstance( 48 | task=instance.metadata["task"] 49 | if "task" in instance.metadata 50 | else "Task is described in the trajectory observation", 51 | agent_id=agent_id, 52 | trajectory=dataset.get_trajectory(instances, agent_id), 53 | feedback=annotation.content["feedback"], 54 | ) 55 | ) 56 | 57 | # initial state of metrics 58 | prev_coverage_rate: float = 0 59 | curr_coverage_rate: float = 0 60 | prev_metrics: list[Metric] = [] 61 | curr_metrics: list[Metric] = [] 62 | 63 | client = AsyncAzureOpenAI( 64 | api_key=settings.azure_api_key, 65 | api_version="2024-12-01-preview", 66 | azure_endpoint=settings.azure_endpoint, 67 | ) 68 | 69 | logfire.instrument_openai(client) 70 | 71 | # initial aspects 72 | feedback_grounding_results = await asyncio.gather( 73 | *[ 74 | feedback_grounding(instance, client) 75 | for instance in metric_training_instances 76 | ] 77 | ) 78 | 79 | aspects = sum( 80 | feedback_grounding_results, 81 | [], 82 | ) 83 | 84 | while curr_coverage_rate >= prev_coverage_rate: 85 | logfire.info(f"Current coverage rate: {curr_coverage_rate}") 86 | logfire.info(f"Previous coverage rate: {prev_coverage_rate}") 87 | prev_metrics = curr_metrics 88 | prev_coverage_rate = curr_coverage_rate 89 | 90 | curr_metrics = ( 91 | prev_metrics + (await behavior_clustering(aspects, client)).metrics 92 | ) 93 | 94 | eval_results = await run_llm_eval( 95 | metric_training_instances, metrics=curr_metrics, client=client 96 | ) 97 | 98 | eval_scoring = [ 99 | [ 100 | int(getattr(eval_result, _make_snake_case(metric.name), 0)) 101 | for metric in curr_metrics 102 | ] 103 | for eval_result in eval_results 104 | ] 105 | 106 | traits = [ 107 | [ 108 | Trait( 109 | metric=metric, 110 
| rating=score, 111 | ) 112 | for metric, score in zip(curr_metrics, eval_scoring_for_instance) 113 | ] 114 | for eval_scoring_for_instance in eval_scoring 115 | ] 116 | 117 | coverage_eval_results = await run_coverage_eval( 118 | instance_traits=traits, 119 | instances=metric_training_instances, 120 | client=client, 121 | ) 122 | 123 | covered_aspects = sum([result[0] for result in coverage_eval_results]) 124 | total_aspects = sum([result[1] for result in coverage_eval_results]) 125 | _covered_traits = sum([result[2] for result in coverage_eval_results]) 126 | _total_traits = sum([result[3] for result in coverage_eval_results]) 127 | uncovered_aspects = sum([result[4] for result in coverage_eval_results], []) 128 | 129 | curr_coverage_rate = covered_aspects / total_aspects 130 | aspects = uncovered_aspects 131 | 132 | return prev_metrics 133 | 134 | 135 | def save_metrics(metrics: list[Metric], path: str) -> None: 136 | metric_set = MetricSet( 137 | name="Metrics derived from webarena dataset", 138 | base_path=path, 139 | induced_from="webarena", 140 | version="0.1", 141 | ) 142 | metric_set.add_metrics(metrics) 143 | 144 | 145 | async def main() -> None: 146 | metrics = await iterative_metric_creation("sotopia") 147 | 148 | metric_set = MetricSet( 149 | name="Metrics derived from sotopia dataset", 150 | base_path=f".data/metrics/sotopia/{datetime.now().strftime('%m_%d_%H_%M')}", 151 | induced_from="sotopia", 152 | version="0.1", 153 | ) 154 | 155 | metric_set.add_metrics(metrics) 156 | 157 | 158 | if __name__ == "__main__": 159 | logfire.configure() 160 | asyncio.run(main()) 161 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/cogym.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pathlib import Path 3 | 4 | from pydantic import BaseModel, Field 5 | from osw_data import ( 6 | MultiAgentDataset, 7 | AgentMetadata, 8 | MediaType, 9 | PointType, 10 | AnnotationSystem, 11 | ) 12 | from .base import BaseConverter, run_converter 13 | 14 | 15 | class HumanEvalFinal(BaseModel): 16 | env_id: str 17 | user_id: str 18 | agent_rating: int 19 | outcome_preference: str 20 | outcome_rating: int 21 | feedback: str 22 | 23 | 24 | class HumanEval(BaseModel): 25 | final: HumanEvalFinal | None = Field(default=None) 26 | 27 | 28 | class Action(BaseModel): 29 | role: str 30 | action: str 31 | action_status: str 32 | timestamp: str 33 | 34 | 35 | class CoGymTrajectory(BaseModel): 36 | trajectory: list[Action] 37 | task: str 38 | human_eval: HumanEval 39 | 40 | 41 | class CoGymConverter(BaseConverter): 42 | def __init__( 43 | self, output_path: Path, source_path: Path, annotation_path: Path | None = None 44 | ) -> None: 45 | super().__init__(output_path, source_path) 46 | self.annotation_path = annotation_path 47 | 48 | def download_data(self) -> None: 49 | trajectory_path = self.source_path / "trajectory" 50 | 51 | if not trajectory_path.exists(): 52 | raise FileNotFoundError(f"Trajectory path {trajectory_path} does not exist") 53 | 54 | def convert_to_dataset(self) -> None: 55 | """Convert entire Sotopia dataset""" 56 | self.logger.info("Converting CoGym dataset") 57 | 58 | dataset = MultiAgentDataset( 59 | name="CoGym Interaction", 60 | base_path=self.output_path, 61 | description="CoGym dialog interactions", 62 | ) 63 | 64 | if self.annotation_path is not None: 65 | annotation_system = AnnotationSystem( 66 | base_path=self.annotation_path, 67 | 
project_name="CoGym Annotations", 68 | annotation_schema={ 69 | "feedback": { 70 | "type": "string", 71 | "description": "Free-form text feedback on the trajectory", 72 | } 73 | }, 74 | ) 75 | 76 | annotator_id = "Original CoGym Annotators" 77 | 78 | if annotator_id not in annotation_system.project.annotators: 79 | annotation_system.add_annotator( 80 | annotator_id=annotator_id, 81 | name=annotator_id, # Using ID as name for simplicity 82 | ) 83 | 84 | trajectory_path = self.source_path / "trajectory" 85 | 86 | for trajectory_file in trajectory_path.glob("*.json"): 87 | trajectory = CoGymTrajectory.model_validate_json( 88 | trajectory_file.read_text() 89 | ) 90 | 91 | roles = list(set([action.role for action in trajectory.trajectory])) 92 | if len(roles) == 1: 93 | self.logger.warning( 94 | f"Skipping trajectory with only one role: {trajectory_file}" 95 | ) 96 | continue 97 | 98 | assert len(roles) == 2, f"Expected 2 roles, got {roles}" 99 | the_other_role = { 100 | roles[0]: roles[1], 101 | roles[1]: roles[0], 102 | } 103 | agent_role: str | None = None 104 | 105 | agents_metadata = {} 106 | for role in roles: 107 | if "user" in role: 108 | agents_metadata[role] = AgentMetadata( 109 | agent_id=role, 110 | agent_type="human", 111 | capabilities=["dialog"], 112 | ) 113 | agent_role = the_other_role[role] 114 | else: 115 | agents_metadata[role] = AgentMetadata( 116 | agent_id=role, 117 | agent_type="agent", 118 | capabilities=["dialog", "code_generation"], 119 | ) 120 | 121 | assert agent_role 122 | 123 | instance_metadata = { 124 | "task": trajectory.task, 125 | } 126 | 127 | instance_id = dataset.create_instance( 128 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 129 | ) 130 | 131 | for action in trajectory.trajectory: 132 | dataset.add_data_point( 133 | instance_id=instance_id, 134 | agent_id=action.role, 135 | point_type=PointType.ACTION, 136 | media_type=MediaType.JSON, 137 | data=action.action, 138 | timestamp=datetime.datetime.fromisoformat(action.timestamp), 139 | ) 140 | dataset.add_data_point( 141 | instance_id=instance_id, 142 | agent_id=the_other_role[action.role], 143 | point_type=PointType.OBSERVATION, 144 | media_type=MediaType.JSON, 145 | data=action.action_status, 146 | timestamp=datetime.datetime.fromisoformat(action.timestamp), 147 | ) 148 | 149 | if ( 150 | self.annotation_path is not None 151 | and trajectory.human_eval.final is not None 152 | ): 153 | annotation_system.add_annotation( 154 | instance_id=instance_id, 155 | agent_id=agent_role, 156 | annotator_id=annotator_id, 157 | content={"feedback": trajectory.human_eval.final.feedback}, 158 | ) 159 | 160 | dataset.close() 161 | 162 | 163 | if __name__ == "__main__": 164 | source_path = Path(".data/raw/cogym") 165 | output_path = Path(".data/cogym") 166 | annotation_path = Path(".data/annotations/cogym") 167 | 168 | run_converter( 169 | CoGymConverter, output_path, source_path, annotation_path=annotation_path 170 | ) 171 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/balrog_babaisai.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from datetime import datetime, timedelta 5 | import csv 6 | import shutil 7 | 8 | # Import our dataset classes 9 | from osw_data import MultiAgentDataset, AgentMetadata, PointType, MediaType 10 | 11 | from .base import BaseConverter, run_converter 12 | from osw_data.utils import 
file_pairs_list 13 | 14 | 15 | class BalrogConverter(BaseConverter): 16 | """Handles downloading and converting Balrog data to our dataset format""" 17 | 18 | def __init__(self, output_path: Path, source_path: Path): 19 | super().__init__(output_path, source_path) 20 | 21 | def download_data(self) -> None: 22 | """Download Balrog dataset files""" 23 | self.source_path.mkdir(parents=True, exist_ok=True) 24 | 25 | # This only exists to satisfy the BaseConverter class 26 | 27 | @staticmethod 28 | def clean_csv_file(file_path: Path) -> None: 29 | """Remove NUL characters from a CSV file.""" 30 | with open(file_path, "rb") as f: 31 | content = f.read() 32 | 33 | # Remove NUL characters 34 | content = content.replace(b"\x00", b"") 35 | 36 | with open(file_path, "wb") as f: 37 | f.write(content) 38 | 39 | def convert_to_dataset(self) -> None: 40 | """Convert Balrog data to autolibra dataset format""" 41 | self.logger.info("Creating Balrog dataset...") 42 | 43 | ref_time = datetime.now() # Used for step_id 44 | 45 | # Obtain task from folder name 46 | task = self.source_path.name.split("_")[0].split("-")[-1] 47 | task = task[0].upper() + task[1:] 48 | 49 | # Initialize dataset 50 | dataset = MultiAgentDataset( 51 | name=f"{task}-Balrog", 52 | base_path=self.output_path, 53 | description=f"{task} trajectories from Balrog dataset", 54 | ) 55 | 56 | # Get list of all directories within self.source_path 57 | subtasks: list[str] = [ 58 | f.name for f in os.scandir(self.source_path) if f.is_dir() 59 | ] 60 | 61 | # Read trajectories (for given task type, exists n subdirs for task, each subdir has a trajectory file) 62 | 63 | # Iterate over folders in task_dir 64 | for subtask in subtasks: 65 | subtask_dir = self.source_path / subtask 66 | 67 | fpl = file_pairs_list(subtask_dir) 68 | 69 | for traj_file, jf in fpl: 70 | # Get pair of files in subtask_dir 71 | episode_number = str(str(jf).split("_")[-1].split(".")[0]) 72 | # Clean the CSV file before processing 73 | csv_path = subtask_dir / f"{subtask}_run_{episode_number}.csv" 74 | self.clean_csv_file(csv_path) # Call the clean_csv_file method 75 | # Load json file 76 | json_file = json.load(open(jf)) 77 | 78 | prompt_data = json_file["prompt"] 79 | 80 | # Create agent metadata (this does not change within a subtask) 81 | agents_metadata = { 82 | "agent": AgentMetadata( 83 | agent_id="agent", 84 | agent_type="game_agent", 85 | capabilities=["navigation", "interaction"], 86 | ) 87 | } 88 | 89 | # Create instance metadata (this does not change within a subtask) 90 | instance_metadata = { 91 | "task": json_file["task"], 92 | "source_model": json_file["client"]["model_id"], 93 | "prompt": prompt_data, 94 | } 95 | 96 | instance_id = dataset.create_instance( 97 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 98 | ) 99 | self.logger.info( 100 | f"Created instance {instance_id} for episode number {episode_number}" 101 | ) 102 | 103 | gif_path = subtask_dir / f"episode_{episode_number}.gif" 104 | # Copy gif to output path 105 | gif_out_path = ( 106 | self.output_path 107 | / "instances" 108 | / instance_id 109 | / f"episode_{episode_number}.gif" 110 | ) 111 | shutil.copy(gif_path, gif_out_path) 112 | 113 | # Update instance_id with gif_path 114 | add_gif = {"gif_path": gif_out_path} 115 | dataset.update_instance_metadata( 116 | instance_id=instance_id, new_meta=add_gif 117 | ) 118 | 119 | with open(traj_file, newline="") as f: 120 | reader = csv.reader(f, quotechar='"', quoting=csv.QUOTE_MINIMAL) 121 | # Skip header 122 | next(reader) 123 | 
for line in ( 124 | reader 125 | ): # Format of Step,Action,Reasoning,Observation,Reward,Done 126 | line = [ 127 | field.replace("\n", " ").replace("\r", "") for field in line 128 | ] 129 | 130 | # Convert to datetime by adding to now 131 | step_id = ref_time + timedelta(seconds=int(line[0])) 132 | actions = line[1] 133 | reasoning = line[2] 134 | observations = line[3] 135 | # Make new glyphs by running self.gm.glyph_id_to_rgb on each element of glyphs_raw in vectorized form 136 | 137 | # step_id should be the same to allow reconstruction of the trajectory, but if this 138 | # causes issues, should be fixed 139 | act_obj = {"reasoning": reasoning, "text": actions} 140 | 141 | obs_obj = {"observations": observations} 142 | 143 | dataset.add_data_point( 144 | instance_id=instance_id, 145 | agent_id="agent", 146 | timestamp=step_id, 147 | point_type=PointType.OBSERVATION, 148 | data=obs_obj, 149 | media_type=MediaType.JSON, 150 | ) 151 | 152 | dataset.add_data_point( 153 | instance_id=instance_id, 154 | agent_id="agent", 155 | timestamp=step_id, # Using step_id as timestamp 156 | point_type=PointType.ACTION, 157 | data=act_obj, 158 | media_type=MediaType.JSON, 159 | ) 160 | 161 | self.logger.info(f"Dataset conversion complete for {task}") 162 | dataset.close() 163 | 164 | 165 | if __name__ == "__main__": 166 | import argparse 167 | 168 | parser = argparse.ArgumentParser(description="Balrog Converter") 169 | parser.add_argument( 170 | "--filename", 171 | type=str, 172 | required=True, 173 | help="The name of the folder containing the Balrog-babaisai data for the given run", 174 | ) 175 | 176 | filename = parser.parse_args().filename 177 | 178 | source_path = Path(f".data/raw/{filename}") # Handle all balrog data in one folder 179 | output_path = Path( 180 | f".data/{filename.split('-')[-1]}" 181 | ) # Handle all balrog data in one folder 182 | 183 | run_converter(BalrogConverter, output_path, source_path) 184 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/balrog_mini.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from datetime import datetime, timedelta 5 | import csv 6 | import shutil 7 | 8 | 9 | # Import our dataset classes 10 | from osw_data import MultiAgentDataset, AgentMetadata, PointType, MediaType 11 | 12 | from .base import BaseConverter, run_converter 13 | 14 | 15 | class BalrogConverter(BaseConverter): 16 | """Handles downloading and converting Balrog data to our dataset format""" 17 | 18 | def __init__(self, output_path: Path, source_path: Path): 19 | super().__init__(output_path, source_path) 20 | 21 | def download_data(self) -> None: 22 | """Download Balrog dataset files""" 23 | self.source_path.mkdir(parents=True, exist_ok=True) 24 | 25 | # This only exists to satisfy the BaseConverter class 26 | 27 | @staticmethod 28 | def clean_csv_file(file_path: Path) -> None: 29 | """Remove NUL characters from a CSV file.""" 30 | with open(file_path, "rb") as f: 31 | content = f.read() 32 | 33 | # Remove NUL characters 34 | content = content.replace(b"\x00", b"") 35 | 36 | with open(file_path, "wb") as f: 37 | f.write(content) 38 | 39 | def convert_to_dataset(self) -> None: 40 | """Convert Balrog data to autolibra dataset format""" 41 | self.logger.info("Creating Balrog dataset...") 42 | 43 | ref_time = datetime.now() # Used for step_id 44 | 45 | # Obtain task from folder name 46 | task = 
self.source_path.name.split("_")[0].split("-")[-1] 47 | task = task[0].upper() + task[1:] 48 | 49 | # Initialize dataset 50 | dataset = MultiAgentDataset( 51 | name=f"{task}-Balrog", 52 | base_path=self.output_path, 53 | description=f"{task} trajectories from Balrog dataset", 54 | ) 55 | 56 | # Get list of all directories within self.source_path 57 | subtasks: list[str] = [ 58 | f.name for f in os.scandir(self.source_path) if f.is_dir() 59 | ] 60 | 61 | # Iterate over folders in task_dir 62 | for subtask in subtasks: 63 | subtask_dir = self.source_path / subtask 64 | 65 | # Find all files with matching suffixes (00, 01, 02) 66 | for suffix in ["00", "01", "02"]: 67 | # Construct file paths for the current group 68 | gif_path = subtask_dir / f"episode_{suffix}.gif" 69 | csv_path = ( 70 | subtask_dir / f"{subtask}_run_{suffix}.csv" 71 | ) # Use subtask name + _run_ + suffix 72 | json_path = ( 73 | subtask_dir / f"{subtask}_run_{suffix}.json" 74 | ) # Use subtask name + _run_ + suffix 75 | pkl_path = ( 76 | subtask_dir / f"{subtask}_run_{suffix}.pkl" 77 | ) # Use subtask name + _run_ + suffix 78 | 79 | # Check if all required files exist for this group 80 | if not ( 81 | gif_path.exists() 82 | and csv_path.exists() 83 | and json_path.exists() 84 | and pkl_path.exists() 85 | ): 86 | self.logger.warning( 87 | f"Missing files for suffix {suffix} in {subtask_dir}" 88 | ) 89 | continue 90 | 91 | # Clean the CSV file before processing 92 | self.clean_csv_file(csv_path) # Call the clean_csv_file method 93 | 94 | # Load json file 95 | json_file = json.load(open(json_path)) 96 | 97 | # Create agent metadata (this does not change within a subtask) 98 | agents_metadata = { 99 | "agent": AgentMetadata( 100 | agent_id="agent", 101 | agent_type="game_agent", 102 | capabilities=["navigation", "interaction"], 103 | ) 104 | } 105 | 106 | # Create instance metadata (this does not change within a subtask) 107 | instance_metadata = { 108 | "task": json_file["task"], 109 | "source_model": json_file["client"]["model_id"], 110 | } 111 | 112 | # Create a unique instance ID for this group 113 | instance_id = dataset.create_instance( 114 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 115 | ) 116 | self.logger.info(f"Created instance {instance_id} for suffix {suffix}") 117 | 118 | # Copy GIF to output path 119 | gif_out_path = ( 120 | self.output_path 121 | / "instances" 122 | / instance_id 123 | / f"episode_{suffix}.gif" 124 | ) 125 | shutil.copy(gif_path, gif_out_path) 126 | 127 | # Update instance metadata with GIF path 128 | add_gif = {"gif_path": gif_out_path} 129 | dataset.update_instance_metadata( 130 | instance_id=instance_id, new_meta=add_gif 131 | ) 132 | 133 | # Process the CSV file 134 | with open(csv_path, newline="") as f: 135 | reader = csv.reader(f, quotechar='"', quoting=csv.QUOTE_MINIMAL) 136 | next(reader) # Skip header 137 | for line in reader: 138 | line = [ 139 | field.replace("\n", " ").replace("\r", "") for field in line 140 | ] 141 | 142 | step_id = ref_time + timedelta(seconds=int(line[0])) 143 | actions = line[1] 144 | reasoning = line[2] 145 | observations = line[3] 146 | 147 | act_obj = {"reasoning": reasoning, "text": actions} 148 | 149 | obs_obj = {"observations": observations} 150 | 151 | dataset.add_data_point( 152 | instance_id=instance_id, 153 | agent_id="agent", 154 | timestamp=step_id, 155 | point_type=PointType.OBSERVATION, 156 | data=obs_obj, 157 | media_type=MediaType.JSON, 158 | ) 159 | 160 | dataset.add_data_point( 161 | instance_id=instance_id, 162 | 
agent_id="agent", 163 | timestamp=step_id, 164 | point_type=PointType.ACTION, 165 | data=act_obj, 166 | media_type=MediaType.JSON, 167 | ) 168 | 169 | self.logger.info(f"Dataset conversion complete for {task}") 170 | dataset.close() 171 | 172 | 173 | if __name__ == "__main__": 174 | import argparse 175 | 176 | parser = argparse.ArgumentParser(description="Balrog Converter") 177 | parser.add_argument( 178 | "--filename", 179 | type=str, 180 | required=True, 181 | help="The name of the folder containing the Balrog-minihack data for the given run", 182 | ) 183 | 184 | filename = parser.parse_args().filename 185 | 186 | source_path = Path(f".data/raw/{filename}") # Handle all balrog data in one folder 187 | output_path = Path( 188 | f".data/{filename.split('-')[-1]}" 189 | ) # Handle all balrog data in one folder 190 | 191 | run_converter(BalrogConverter, output_path, source_path) 192 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/evaluators/coverage_evaluator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pickle 3 | from typing import Literal 4 | import logfire 5 | from openai import AsyncAzureOpenAI, RateLimitError 6 | import openai 7 | from osw_data.metrics import Metric 8 | from autolibra_core.configs import AutoLibraEvalSettings 9 | from autolibra_core.data import Aspect 10 | from autolibra_core.data.primitives import Trait 11 | from ..data import MetricTrainingInstance 12 | from ..operators import feedback_grounding 13 | from pydantic import BaseModel, Field, ValidationError, create_model 14 | from pydantic.fields import FieldInfo 15 | from autolibra_core.utils import load_prompt_template 16 | 17 | 18 | def _sanitize_string(s: str) -> str: 19 | return ( 20 | s.replace("\\'", "") 21 | .replace('\\"', "") 22 | .replace("\\n", " ") 23 | .replace("'", "") 24 | .replace('"', "") 25 | .replace("\n", " ") 26 | .replace("\\", " ") 27 | .replace("ℹ", " ") 28 | ) 29 | 30 | 31 | async def create_aspect_traits_match_pydantic_model( 32 | aspects: list[Aspect], traits: list[Metric] 33 | ) -> type[BaseModel]: 34 | fields: dict[str, tuple[type[Literal[str]], FieldInfo]] = {} # type: ignore[valid-type] 35 | for i in range(len(aspects)): 36 | fields[f"aspect_{i}"] = ( # type: ignore[assignment] 37 | Literal[ 38 | _sanitize_string(aspects[i].feedback) 39 | + ": " 40 | + _sanitize_string(aspects[i].behavior) 41 | ], 42 | Field(title=f"Aspect {i}"), 43 | ) 44 | 45 | fields[f"trait_{i}"] = ( # type: ignore[assignment] 46 | Literal[ 47 | tuple( 48 | _sanitize_string(trait.name) 49 | + ": " 50 | + _sanitize_string(trait.explanation) 51 | for trait in traits 52 | ) 53 | + ("None of the traits matches the aspect.",) 54 | ], 55 | Field(title=f"Trait {i}"), 56 | ) 57 | 58 | return create_model("AspectTraitsMatch", **fields) # type: ignore[no-any-return, call-overload] 59 | 60 | 61 | async def match_aspects_and_traits( 62 | client: AsyncAzureOpenAI, aspects: list[Aspect], traits: list[Metric] 63 | ) -> dict[str, str]: 64 | settings = AutoLibraEvalSettings() 65 | results: list[BaseModel] = [] 66 | for aspect in aspects: 67 | aspect_traits_model = await create_aspect_traits_match_pydantic_model( 68 | [aspect], traits 69 | ) 70 | 71 | template = load_prompt_template("coverage_evaluation_v2.j2") 72 | prompt = template.render( 73 | aspects=[aspect], 74 | traits=traits, 75 | ) 76 | 77 | model = settings.azure_openai_4o_model 78 | assert model 79 | 80 | while True: 81 | 
wait_time = 1 82 | try: 83 | completion = await client.beta.chat.completions.parse( 84 | model=model, 85 | messages=[ 86 | { 87 | "role": "system", 88 | "content": "Match the aspects with the traits.", 89 | }, 90 | {"role": "user", "content": prompt}, 91 | ], 92 | response_format=aspect_traits_model, 93 | ) 94 | break 95 | except ValidationError as e: 96 | print(e) 97 | print(aspect_traits_model.model_json_schema()) 98 | except RateLimitError as e: 99 | print(e) 100 | wait_time *= 2 101 | await asyncio.sleep(wait_time) 102 | except openai.BadRequestError as e: 103 | print(aspect_traits_model.model_json_schema()) 104 | logfire.warning(f"Schema error: {e}") 105 | raise e 106 | 107 | result_or_none = completion.choices[0].message.parsed 108 | assert result_or_none and isinstance(result_or_none, aspect_traits_model) 109 | results.append(result_or_none) 110 | 111 | result_dict: dict[str, str] = {} 112 | for i, result in enumerate(results): 113 | result_dict[f"aspect_{i}"] = result.model_dump()["aspect_0"] 114 | result_dict[f"trait_{i}"] = result.model_dump()["trait_0"] 115 | 116 | return result_dict 117 | 118 | 119 | async def run_instance_coverage_eval( 120 | client: AsyncAzureOpenAI, 121 | aspects: list[Aspect], 122 | traits: list[Trait], 123 | ) -> tuple[int, int, int, int, list[Aspect]]: 124 | positive_aspects = [aspect for aspect in aspects if aspect.is_positive] 125 | negative_aspects = [aspect for aspect in aspects if not aspect.is_positive] 126 | positive_traits = [trait.metric for trait in traits if trait.rating == 1] 127 | negative_traits = [trait.metric for trait in traits if trait.rating == -1] 128 | 129 | # Coverage on positive aspects 130 | try: 131 | positive_match_results = await match_aspects_and_traits( 132 | client, positive_aspects, positive_traits 133 | ) 134 | except openai.BadRequestError as e: 135 | pickle.dump( 136 | (positive_aspects, positive_traits), 137 | open("positive_aspects_traits.pkl", "wb"), 138 | ) 139 | raise e 140 | 141 | # Coverage on negative aspects 142 | negative_match_results = await match_aspects_and_traits( 143 | client, negative_aspects, negative_traits 144 | ) 145 | 146 | number_of_total_aspects = len(aspects) 147 | number_of_not_matched_aspects = 0 148 | unmatch_aspects: list[Aspect] = [] 149 | 150 | for i in range(len(positive_aspects)): 151 | if ( 152 | positive_match_results[f"trait_{i}"] 153 | == "None of the traits matches the aspect." 154 | ): 155 | number_of_not_matched_aspects += 1 156 | unmatch_aspects.append(positive_aspects[i]) 157 | 158 | for i in range(len(negative_aspects)): 159 | if ( 160 | negative_match_results[f"trait_{i}"] 161 | == "None of the traits matches the aspect." 162 | ): 163 | number_of_not_matched_aspects += 1 164 | unmatch_aspects.append(negative_aspects[i]) 165 | 166 | used_traits = set() 167 | 168 | for i in range(len(positive_aspects)): 169 | if ( 170 | positive_match_results[f"trait_{i}"] 171 | != "None of the traits matches the aspect." 172 | ): 173 | used_traits.add(positive_match_results[f"trait_{i}"]) 174 | 175 | for i in range(len(negative_aspects)): 176 | if ( 177 | negative_match_results[f"trait_{i}"] 178 | != "None of the traits matches the aspect." 
179 | ): 180 | used_traits.add(negative_match_results[f"trait_{i}"]) 181 | 182 | return ( 183 | number_of_total_aspects - number_of_not_matched_aspects, 184 | number_of_total_aspects, 185 | len(traits) - len(used_traits), 186 | len(traits), 187 | unmatch_aspects, 188 | ) 189 | 190 | 191 | async def run_coverage_eval( 192 | instance_traits: list[list[Trait]], 193 | instances: list[MetricTrainingInstance], 194 | client: AsyncAzureOpenAI, 195 | ) -> list[tuple[int, int, int, int, list[Aspect]]]: 196 | instance_aspects = await asyncio.gather( 197 | *[feedback_grounding(instance, client) for instance in instances] 198 | ) 199 | 200 | with open("feedback_grounding_results.jsonl", "w") as f: 201 | for feedback_grounding_result in instance_aspects: 202 | for aspect in feedback_grounding_result: 203 | f.write(aspect.model_dump_json(indent=2)) 204 | f.write("\n") 205 | f.write("\n") 206 | 207 | coverage_results = await asyncio.gather( 208 | *[ 209 | run_instance_coverage_eval(client, aspects, traits) 210 | for aspects, traits in zip(instance_aspects, instance_traits) 211 | ] 212 | ) 213 | 214 | return coverage_results 215 | -------------------------------------------------------------------------------- /src/tty/view_annotations.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from pathlib import Path 3 | import pandas as pd 4 | from datetime import datetime 5 | import typer 6 | from osw_data.annotation import AnnotationSystem 7 | from rich.console import Console 8 | from typing import List, Dict, Any 9 | 10 | console = Console() 11 | app = typer.Typer() 12 | 13 | 14 | def sanitize_text(text: str) -> str: 15 | """Clean text to avoid display issues and escape markdown characters.""" 16 | # Remove any extra quotes around the text 17 | text = text.strip("'\"") 18 | 19 | # Replace problematic unicode characters with their closest ASCII equivalents 20 | replacements = {'"': '"', "'": "'", "–": "-", "—": "-", "…": "...", "' '": " "} 21 | for old, new in replacements.items(): 22 | text = text.replace(old, new) 23 | 24 | # Remove any remaining single quotes between characters 25 | text = "".join( 26 | c 27 | for i, c in enumerate(text) 28 | if c != "'" 29 | or ( 30 | i > 0 31 | and i < len(text) - 1 32 | and text[i - 1].isalpha() 33 | and text[i + 1].isalpha() 34 | ) 35 | ) 36 | 37 | # Escape markdown special characters 38 | markdown_chars: List[str] = [ 39 | "*", 40 | "_", 41 | "`", 42 | "#", 43 | "~", 44 | ">", 45 | "<", 46 | "[", 47 | "]", 48 | "(", 49 | ")", 50 | "|", 51 | "$", 52 | ] 53 | for char in markdown_chars: 54 | text = text.replace(char, "\\" + char) 55 | 56 | return text 57 | 58 | 59 | def load_annotations(annotations_dir: Path) -> List[Dict[str, Any]]: 60 | """Load all annotations using AnnotationSystem.""" 61 | annotations: List[Dict[str, Any]] = [] 62 | 63 | # Initialize AnnotationSystem 64 | annotation_system = AnnotationSystem( 65 | base_path=annotations_dir.parent, # Navigate up to where project.yaml is 66 | project_name="Annotation Viewer", 67 | description="View annotations from different projects", 68 | ) 69 | 70 | # Get all annotation files in the directory 71 | annotation_files = list(annotations_dir.glob("*.json")) 72 | 73 | # Extract instance IDs from filenames 74 | for file_path in annotation_files: 75 | # Filename format: instance_id_agent_id.json 76 | instance_id, agent_id = file_path.stem.rsplit("_", 1) 77 | 78 | trajectory_annotations = annotation_system.get_trajectory_annotations( 79 | 
instance_id=instance_id, agent_id=agent_id 80 | ) 81 | 82 | for annotation in trajectory_annotations.annotations: 83 | # Parse the ISO format timestamp string into a datetime object 84 | # Handle both string and datetime objects 85 | if isinstance(annotation.created_at, str): 86 | created_dt = datetime.fromisoformat( 87 | annotation.created_at.replace("Z", "+00:00") 88 | ) 89 | else: 90 | created_dt = annotation.created_at 91 | 92 | # Handle null start/end times 93 | start_time = ( 94 | annotation.span.start_time 95 | if annotation.span and annotation.span.start_time 96 | else "N/A" 97 | ) 98 | end_time = ( 99 | annotation.span.end_time 100 | if annotation.span and annotation.span.end_time 101 | else "N/A" 102 | ) 103 | 104 | annotations.append( 105 | { 106 | "instance_id": instance_id, 107 | "agent_id": agent_id, 108 | "annotator_id": annotation.annotator_id, 109 | "feedback": sanitize_text(annotation.content["feedback"]), 110 | "start_time": start_time, 111 | "end_time": end_time, 112 | "created_at": created_dt.strftime("%Y-%m-%d %H:%M:%S"), 113 | "created_dt": created_dt, 114 | } 115 | ) 116 | 117 | return annotations 118 | 119 | 120 | @app.command() 121 | def main( 122 | annotations_dir: Path = typer.Argument( 123 | ..., 124 | help="Path to the annotations directory (e.g., .data/annotations/sotopia/annotations)", 125 | exists=True, 126 | dir_okay=True, 127 | file_okay=False, 128 | ), 129 | ) -> None: 130 | """View annotations from the specified directory.""" 131 | streamlit_main(annotations_dir) 132 | 133 | 134 | def streamlit_main(annotations_dir: Path) -> None: 135 | """Main Streamlit interface.""" 136 | st.title("🔍 Annotation Viewer") 137 | 138 | # Convert to absolute path and resolve any relative path components 139 | annotations_dir = annotations_dir.absolute().resolve() 140 | 141 | if not annotations_dir.exists(): 142 | st.error(f"Annotations directory not found: {annotations_dir}") 143 | st.info( 144 | "Please provide the full path to the annotations directory. 
For example:\n\n" 145 | "```bash\n" 146 | "autolibra-eval view-annotations .data/annotations/sotopia/annotations\n" 147 | "```" 148 | ) 149 | return 150 | 151 | # Load annotations 152 | annotations = load_annotations(annotations_dir) 153 | 154 | if not annotations: 155 | st.warning("No annotations found.") 156 | return 157 | 158 | # Convert to DataFrame for easier manipulation 159 | df = pd.DataFrame(annotations) 160 | 161 | # Remove duplicate annotations, keeping only the most recent one 162 | df = df.sort_values("created_dt", ascending=False).drop_duplicates( 163 | subset=["instance_id", "agent_id", "annotator_id", "feedback"], keep="first" 164 | ) 165 | 166 | # Display summary statistics 167 | st.header("📊 Summary Statistics") 168 | col1, col2, col3 = st.columns(3) 169 | 170 | with col1: 171 | st.metric("Total Annotations", len(df)) 172 | with col2: 173 | st.metric("Unique Instances", df["instance_id"].nunique()) 174 | with col3: 175 | st.metric("Unique Annotators", df["annotator_id"].nunique()) 176 | 177 | # Filters 178 | st.header("🔎 Filters") 179 | col1, col2 = st.columns(2) 180 | 181 | with col1: 182 | selected_annotator = st.selectbox( 183 | "Select Annotator", 184 | options=["All"] + sorted(df["annotator_id"].unique().tolist()), 185 | ) 186 | 187 | # Filter instance options based on selected annotator 188 | instance_options = df["instance_id"].unique().tolist() 189 | if selected_annotator != "All": 190 | instance_options = ( 191 | df[df["annotator_id"] == selected_annotator]["instance_id"] 192 | .unique() 193 | .tolist() 194 | ) 195 | 196 | with col2: 197 | selected_instance = st.selectbox( 198 | "Select Instance", options=["All"] + sorted(instance_options) 199 | ) 200 | 201 | # Apply filters 202 | filtered_df = df.copy() 203 | if selected_annotator != "All": 204 | filtered_df = filtered_df[filtered_df["annotator_id"] == selected_annotator] 205 | if selected_instance != "All": 206 | filtered_df = filtered_df[filtered_df["instance_id"] == selected_instance] 207 | 208 | # Display annotations 209 | st.header("📝 Annotations") 210 | 211 | # Sort by timestamp in descending order 212 | filtered_df = filtered_df.sort_values("created_dt", ascending=False) 213 | 214 | for _, row in filtered_df.iterrows(): 215 | with st.expander( 216 | f"Instance: {row['instance_id']} | Agent: {row['agent_id']} | {row['created_at']}", 217 | expanded=False, 218 | ): 219 | st.markdown(f"**Annotator:** {row['annotator_id']}") 220 | st.markdown("**Feedback:**") 221 | st.info(row["feedback"]) 222 | if row["start_time"] != "N/A" or row["end_time"] != "N/A": 223 | st.markdown(f"**Time Range:** {row['start_time']} to {row['end_time']}") 224 | 225 | 226 | if __name__ == "__main__": 227 | app() 228 | -------------------------------------------------------------------------------- /packages/osw-data/tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pathlib import Path 3 | import json 4 | from typing import Generator 5 | 6 | # Import the classes to test 7 | from osw_data.metrics import MetricSet, Metric, MetricSetMetadata 8 | 9 | 10 | @pytest.fixture 11 | def sample_metric() -> Metric: 12 | """Fixture that returns a sample metric""" 13 | return Metric( 14 | name="test_metric", 15 | explanation="A test metric", 16 | good_behaviors=["good1", "good2"], 17 | bad_behaviors=["bad1", "bad2"], 18 | ) 19 | 20 | 21 | @pytest.fixture 22 | def sample_metrics() -> list[Metric]: 23 | """Fixture that returns a list of sample metrics""" 24 | return [ 25 | 
Metric( 26 | name="metric1", 27 | explanation="First test metric", 28 | good_behaviors=["good1"], 29 | bad_behaviors=["bad1"], 30 | ), 31 | Metric( 32 | name="metric2", 33 | explanation="Second test metric", 34 | good_behaviors=["good2"], 35 | bad_behaviors=["bad2"], 36 | ), 37 | ] 38 | 39 | 40 | @pytest.fixture 41 | def metric_set(tmp_path: Path) -> Generator[MetricSet, None, None]: 42 | """Fixture that creates a MetricSet instance with a temporary directory""" 43 | ms = MetricSet( 44 | name="test_set", base_path=tmp_path, induced_from="test_source", version="1.0.0" 45 | ) 46 | yield ms 47 | 48 | 49 | class TestMetricSetInitialization: 50 | def test_basic_initialization(self, tmp_path: Path) -> None: 51 | """Test basic initialization of MetricSet""" 52 | ms = MetricSet(name="test", base_path=tmp_path, induced_from="source") 53 | 54 | assert ms.base_path == tmp_path 55 | assert ms.metrics_path == tmp_path / "metrics" 56 | assert ms.metadata_path == tmp_path / "metadata.json" 57 | assert ms.metrics_path.exists() 58 | assert ms.base_path.exists() 59 | 60 | def test_initialization_with_existing_metadata(self, tmp_path: Path) -> None: 61 | """Test initialization when metadata file already exists""" 62 | # Create existing metadata 63 | metadata = MetricSetMetadata( 64 | name="existing", 65 | metric_names=["metric1"], 66 | induced_from="source", 67 | version="1.0", 68 | ) 69 | metadata_path = tmp_path / "metadata.json" 70 | metadata_path.parent.mkdir(parents=True, exist_ok=True) 71 | with open(metadata_path, "w") as f: 72 | f.write(metadata.model_dump_json(indent=2)) 73 | 74 | metric1 = Metric( 75 | name="metric1", 76 | explanation="First test metric", 77 | good_behaviors=["good1"], 78 | bad_behaviors=["bad1"], 79 | ) 80 | 81 | # Create existing metric file 82 | metric_path = tmp_path / "metrics" / "metric1.json" 83 | metric_path.parent.mkdir(parents=True, exist_ok=True) 84 | with open(metric_path, "w") as f: 85 | f.write(metric1.model_dump_json(indent=2)) 86 | 87 | # Initialize MetricSet with existing metadata 88 | ms = MetricSet(name="new_name", base_path=tmp_path, induced_from="new_source") 89 | 90 | # Should load existing metadata instead of creating new 91 | assert ms.metadata.name == "existing" 92 | assert ms.metadata.metric_names == ["metric1"] 93 | 94 | def test_initialization_with_invalid_metadata(self, tmp_path: Path) -> None: 95 | """Test initialization with corrupted metadata file""" 96 | metadata_path = tmp_path / "metadata.json" 97 | metadata_path.parent.mkdir(parents=True, exist_ok=True) 98 | with open(metadata_path, "w") as f: 99 | f.write("invalid json") 100 | 101 | with pytest.raises(Exception): 102 | MetricSet(name="test", base_path=tmp_path, induced_from="source") 103 | 104 | 105 | class TestMetricOperations: 106 | def test_add_single_metric( 107 | self, metric_set: MetricSet, sample_metric: Metric 108 | ) -> None: 109 | """Test adding a single metric""" 110 | metric_set.add_metrics([sample_metric]) 111 | 112 | # Check if metric was added to internal dict 113 | assert sample_metric.name in metric_set.metrics 114 | assert metric_set.metrics[sample_metric.name] == sample_metric 115 | 116 | # Check if metric file was created 117 | metric_path = metric_set.metrics_path / f"{sample_metric.name}.json" 118 | assert metric_path.exists() 119 | 120 | def test_add_multiple_metrics( 121 | self, metric_set: MetricSet, sample_metrics: list[Metric] 122 | ) -> None: 123 | """Test adding multiple metrics at once""" 124 | metric_set.add_metrics(sample_metrics) 125 | 126 | for metric in 
sample_metrics: 127 | assert metric.name in metric_set.metrics 128 | metric_path = metric_set.metrics_path / f"{metric.name}.json" 129 | assert metric_path.exists() 130 | 131 | def test_add_duplicate_metric( 132 | self, metric_set: MetricSet, sample_metric: Metric 133 | ) -> None: 134 | """Test adding a metric with a name that already exists""" 135 | metric_set.add_metrics([sample_metric]) 136 | 137 | with pytest.raises( 138 | ValueError, match=f"Metric with name {sample_metric.name} already exists" 139 | ): 140 | metric_set.add_metrics([sample_metric]) 141 | 142 | def test_get_existing_metric( 143 | self, metric_set: MetricSet, sample_metric: Metric 144 | ) -> None: 145 | """Test retrieving an existing metric""" 146 | metric_set.add_metrics([sample_metric]) 147 | 148 | retrieved_metric = metric_set.get_metric(sample_metric.name) 149 | assert retrieved_metric.model_dump() == sample_metric.model_dump() 150 | 151 | def test_get_nonexistent_metric(self, metric_set: MetricSet) -> None: 152 | """Test attempting to retrieve a metric that doesn't exist""" 153 | with pytest.raises( 154 | ValueError, match="Metric with name nonexistent does not exist" 155 | ): 156 | metric_set.get_metric("nonexistent") 157 | 158 | def test_get_metric_with_corrupted_file( 159 | self, metric_set: MetricSet, sample_metric: Metric 160 | ) -> None: 161 | """Test getting a metric when its file is corrupted""" 162 | metric_set.add_metrics([sample_metric]) 163 | 164 | # Corrupt the metric file 165 | metric_path = metric_set.metrics_path / f"{sample_metric.name}.json" 166 | with open(metric_path, "w") as f: 167 | f.write("invalid json") 168 | 169 | with pytest.raises(Exception): 170 | metric_set.get_metric(sample_metric.name) 171 | 172 | 173 | class TestMetadataOperations: 174 | def test_save_metadata(self, metric_set: MetricSet) -> None: 175 | """Test saving metadata to file""" 176 | new_metadata = MetricSetMetadata( 177 | name="new_name", 178 | metric_names=["metric1", "metric2"], 179 | induced_from="new_source", 180 | version="2.0.0", 181 | ) 182 | 183 | metric_set._save_metadata(new_metadata) 184 | 185 | # Verify file contents 186 | with open(metric_set.metadata_path, "r") as f: 187 | saved_data = json.loads(f.read()) 188 | assert saved_data["name"] == "new_name" 189 | assert saved_data["metric_names"] == ["metric1", "metric2"] 190 | 191 | def test_save_metrics( 192 | self, metric_set: MetricSet, sample_metrics: list[Metric] 193 | ) -> None: 194 | """Test saving all metrics to files""" 195 | metric_set.metrics = {metric.name: metric for metric in sample_metrics} 196 | metric_set._save_metrics() 197 | 198 | for metric in sample_metrics: 199 | metric_path = metric_set.metrics_path / f"{metric.name}.json" 200 | assert metric_path.exists() 201 | 202 | with open(metric_path, "r") as f: 203 | saved_data = json.loads(f.read()) 204 | assert saved_data["name"] == metric.name 205 | assert saved_data["explanation"] == metric.explanation 206 | 207 | def test_initialization_with_none_path(self) -> None: 208 | """Test initialization with None as path""" 209 | with pytest.raises(TypeError): 210 | MetricSet(name="test", base_path=None, induced_from="source") # type: ignore[arg-type] 211 | 212 | def test_file_permission_errors(self, tmp_path: Path) -> None: 213 | """Test handling of file permission errors""" 214 | # Create directory with no write permissions 215 | no_write_dir = tmp_path / "no_write" 216 | no_write_dir.mkdir() 217 | no_write_dir.chmod(0o444) # Read-only 218 | 219 | with pytest.raises(Exception): 220 | 
MetricSet(name="test", base_path=no_write_dir, induced_from="source") 221 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/dataset.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import Optional, Any 3 | from typing_extensions import Self 4 | from pathlib import Path 5 | import json 6 | import yaml 7 | from datetime import datetime 8 | from uuid import uuid4 9 | 10 | import numpy.typing as npt 11 | 12 | # Assume we're importing from the previous trajectory implementation 13 | from .trajectory import SymmetricTrajectory, PointType, MediaType 14 | 15 | 16 | class AgentMetadata(BaseModel): 17 | """Metadata for an individual agent""" 18 | 19 | agent_id: str 20 | agent_type: str 21 | capabilities: list[str] = Field(default_factory=list) 22 | parameters: dict[str, Any] = Field(default_factory=dict) 23 | additional_info: dict[str, Any] = Field(default_factory=dict) 24 | 25 | 26 | class DataInstance(BaseModel): 27 | """ 28 | A single instance in the dataset, containing multiple agent trajectories 29 | """ 30 | 31 | instance_id: str 32 | timestamp: datetime 33 | agents: dict[str, AgentMetadata] 34 | metadata: dict[str, Any] = Field(default_factory=dict) 35 | 36 | 37 | class DatasetMetadata(BaseModel): 38 | """Metadata for the entire dataset""" 39 | 40 | name: str 41 | version: str 42 | description: str 43 | created_at: datetime = Field(default_factory=datetime.now) 44 | updated_at: datetime = Field(default_factory=datetime.now) 45 | total_instances: int = 0 46 | agent_types: list[str] = Field(default_factory=list) 47 | schema_version: str = "1.0" 48 | additional_info: dict[str, Any] = Field(default_factory=dict) 49 | 50 | 51 | class MultiAgentDataset: 52 | """ 53 | Dataset managing multiple instances of multi-agent trajectories 54 | """ 55 | 56 | def __init__( 57 | self, 58 | name: str, 59 | base_path: Path | str, 60 | description: str = "", 61 | version: str = "1.0", 62 | ): 63 | self.base_path = Path(base_path) 64 | self.instances_path = self.base_path / "instances" 65 | self.metadata_path = self.base_path / "metadata.yaml" 66 | 67 | # Initialize directory structure 68 | self.base_path.mkdir(parents=True, exist_ok=True) 69 | self.instances_path.mkdir(exist_ok=True) 70 | 71 | # Initialize or load dataset metadata 72 | self.metadata = self._init_metadata(name, description, version) 73 | 74 | # Cache for open trajectories 75 | self._trajectory_cache: dict[str, dict[str, SymmetricTrajectory]] = {} 76 | 77 | def _init_metadata( 78 | self, name: str, description: str, version: str 79 | ) -> DatasetMetadata: 80 | """Initialize or load dataset metadata""" 81 | if self.metadata_path.exists(): 82 | with open(self.metadata_path, "r") as f: 83 | metadata_dict = yaml.safe_load(f) 84 | return DatasetMetadata(**metadata_dict) 85 | else: 86 | metadata = DatasetMetadata( 87 | name=name, version=version, description=description 88 | ) 89 | self._save_metadata(metadata) 90 | return metadata 91 | 92 | def _save_metadata(self, metadata: DatasetMetadata) -> None: 93 | """Save dataset metadata to disk""" 94 | with open(self.metadata_path, "w") as f: 95 | yaml.dump(json.loads(metadata.model_dump_json()), f) 96 | 97 | def create_instance( 98 | self, 99 | agents_metadata: dict[str, AgentMetadata], 100 | instance_metadata: Optional[dict[str, Any]] = None, 101 | ) -> str: 102 | """ 103 | Create a new instance in the dataset 104 | 105 | Args: 106 | agents_metadata: 
dictionary mapping agent_id to their metadata 107 | instance_metadata: Optional metadata for the instance 108 | 109 | Returns: 110 | instance_id: Unique identifier for the created instance 111 | """ 112 | instance_id = str(uuid4()) 113 | instance_path = self.instances_path / instance_id 114 | instance_path.mkdir(exist_ok=True) 115 | 116 | # Create instance metadata 117 | instance = DataInstance( 118 | instance_id=instance_id, 119 | timestamp=datetime.now(), 120 | agents=agents_metadata, 121 | metadata=instance_metadata or {}, 122 | ) 123 | 124 | # Save instance metadata 125 | with open(instance_path / "metadata.json", "w") as f: 126 | f.write(instance.model_dump_json()) 127 | 128 | # Initialize trajectories for each agent 129 | for agent_id in agents_metadata: 130 | trajectory = SymmetricTrajectory( 131 | trajectory_id=f"{instance_id}_{agent_id}", 132 | storage_path=instance_path / agent_id, 133 | ) 134 | if instance_id not in self._trajectory_cache: 135 | self._trajectory_cache[instance_id] = {} 136 | self._trajectory_cache[instance_id][agent_id] = trajectory 137 | 138 | # Update dataset metadata 139 | self.metadata.total_instances += 1 140 | self.metadata.agent_types = list( 141 | set( 142 | self.metadata.agent_types 143 | + [am.agent_type for am in agents_metadata.values()] 144 | ) 145 | ) 146 | self.metadata.updated_at = datetime.now() 147 | self._save_metadata(self.metadata) 148 | 149 | return instance_id 150 | 151 | def get_trajectory(self, instance_id: str, agent_id: str) -> SymmetricTrajectory: 152 | """Get trajectory for a specific agent in an instance""" 153 | if instance_id not in self._trajectory_cache: 154 | self._trajectory_cache[instance_id] = {} 155 | 156 | if agent_id not in self._trajectory_cache[instance_id]: 157 | instance_path = self.instances_path / instance_id 158 | if not instance_path.exists(): 159 | raise ValueError(f"Instance {instance_id} does not exist") 160 | 161 | self._trajectory_cache[instance_id][agent_id] = SymmetricTrajectory( 162 | trajectory_id=f"{instance_id}_{agent_id}", 163 | storage_path=instance_path / agent_id, 164 | ) 165 | 166 | return self._trajectory_cache[instance_id][agent_id] 167 | 168 | def get_instance_metadata(self, instance_id: str) -> DataInstance: 169 | """Get metadata for a specific instance""" 170 | instance_path = self.instances_path / instance_id 171 | if not instance_path.exists(): 172 | raise ValueError(f"Instance {instance_id} does not exist") 173 | 174 | with open(instance_path / "metadata.json", "r") as f: 175 | return DataInstance.model_validate_json(f.read()) 176 | 177 | def update_instance_metadata( 178 | self, instance_id: str, new_meta: dict[str, Any] 179 | ) -> None: 180 | """Update metadata for a specific instance""" 181 | inst = self.get_instance_metadata(instance_id) 182 | inst.metadata.update(new_meta) 183 | with open(self.instances_path / instance_id / "metadata.json", "w") as f: 184 | f.write(inst.model_dump_json()) 185 | 186 | def list_instances(self) -> list[str]: 187 | """list all instance IDs in the dataset""" 188 | return [p.name for p in self.instances_path.iterdir() if p.is_dir()] 189 | 190 | def get_instances_by_agent_type(self, agent_type: str) -> list[str]: 191 | """Get all instances that contain an agent of the specified type""" 192 | matching_instances = [] 193 | for instance_id in self.list_instances(): 194 | instance = self.get_instance_metadata(instance_id) 195 | if any( 196 | agent.agent_type == agent_type for agent in instance.agents.values() 197 | ): 198 | matching_instances.append(instance_id) 
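# --- Illustrative usage sketch (not part of the original module; kept as a
# comment so the module itself is unchanged). It shows how the
# MultiAgentDataset API defined above is typically driven by the converters
# in autolibra_core.datasets: create an instance, append observation points,
# then read a trajectory back. The base_path and payload values are
# hypothetical placeholders.
#
#   from datetime import datetime
#   from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType
#
#   with MultiAgentDataset(
#       name="demo", base_path=".data/demo", description="toy example"
#   ) as ds:
#       instance_id = ds.create_instance(
#           agents_metadata={
#               "agent": AgentMetadata(agent_id="agent", agent_type="web_agent")
#           },
#           instance_metadata={"task": "example task"},
#       )
#       ds.add_data_point(
#           instance_id=instance_id,
#           agent_id="agent",
#           timestamp=datetime.now(),
#           point_type=PointType.OBSERVATION,
#           data={"observations": "hello"},
#           media_type=MediaType.JSON,
#       )
#       first_point = ds.get_trajectory(instance_id, "agent").get_data_at(0)
#
# The context-manager form relies on the __enter__/__exit__ methods defined
# further down in this class, which close all cached trajectories on exit.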
199 | return matching_instances 200 | 201 | def add_data_point( 202 | self, 203 | instance_id: str, 204 | agent_id: str, 205 | timestamp: datetime, 206 | point_type: PointType, 207 | data: npt.NDArray[Any] | dict[str, Any] | str, 208 | media_type: MediaType, 209 | metadata: dict[str, Any] | None = None, 210 | ) -> None: 211 | """Add a data point to a specific agent's trajectory""" 212 | trajectory = self.get_trajectory(instance_id, agent_id) 213 | trajectory.add_point( 214 | timestamp=timestamp, 215 | agent_id=agent_id, 216 | point_type=point_type, 217 | data=data, 218 | media_type=media_type, 219 | metadata=metadata, 220 | ) 221 | 222 | def close(self) -> None: 223 | """Close all open trajectories""" 224 | for instance_trajectories in self._trajectory_cache.values(): 225 | for trajectory in instance_trajectories.values(): 226 | trajectory.close() 227 | self._trajectory_cache.clear() 228 | 229 | def __enter__(self) -> Self: 230 | return self 231 | 232 | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: 233 | self.close() 234 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/webarena_nnetnav.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import json 3 | from pathlib import Path 4 | from datetime import datetime 5 | from typing import Any 6 | 7 | # Import our dataset classes 8 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 9 | 10 | from .base import BaseConverter, run_converter 11 | 12 | 13 | class WebArenaConverter(BaseConverter): 14 | """Handles downloading and converting WebArena data to our dataset format""" 15 | 16 | def __init__(self, output_path: Path, source_path: Path): 17 | super().__init__(output_path, source_path) 18 | self.screenshots_path = self.source_path / "screenshots" 19 | 20 | def _setup_constants(self) -> None: 21 | """Setup WebArena-specific constants""" 22 | self.SPECIAL_KEYS = [ 23 | "Enter", 24 | "Tab", 25 | "Control", 26 | "Shift", 27 | "Meta", 28 | "Backspace", 29 | "Delete", 30 | "Escape", 31 | "ArrowUp", 32 | "ArrowDown", 33 | "ArrowLeft", 34 | "ArrowRight", 35 | "PageDown", 36 | "PageUp", 37 | "Meta+a", 38 | ] 39 | self.ASCII_CHARSET = "".join(chr(x) for x in range(32, 128)) 40 | self.FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 1000)) 41 | self._id2key = ( 42 | self.SPECIAL_KEYS 43 | + list(self.ASCII_CHARSET) 44 | + list(self.FREQ_UNICODE_CHARSET) 45 | + ["\n"] 46 | ) 47 | 48 | def download_data(self) -> None: 49 | """Download WebArena dataset files""" 50 | self.source_path.mkdir(parents=True, exist_ok=True) 51 | 52 | # Download trajectory file 53 | if not (self.source_path / "trajectories.jsonl").exists(): 54 | self.logger.info("Downloading trajectory file...") 55 | traj_id = "1Ipuw32ea2B2jJ8EVCYkOW5oY0oaDXGQd" 56 | subprocess.run( 57 | ["gdown", traj_id, "-O", str(self.source_path / "trajectories.jsonl")], 58 | check=True, 59 | ) 60 | 61 | def _convert_action( 62 | self, action: dict[str, Any], metadata: dict[str, Any] 63 | ) -> dict[str, Any]: 64 | """Convert WebArena action to our format""" 65 | function = action["action_name"] 66 | kwargs = {} 67 | 68 | if function == "stop": 69 | kwargs["answer"] = action.get("answer", "") 70 | elif function == "type": 71 | # text_indices = action["text"] 72 | # kwargs["text"] = ''.join([ 73 | # self._id2key[i] 74 | # for i in text_indices 75 | # if isinstance(i, int) and i < len(self._id2key) and i >= 
len(self.SPECIAL_KEYS) 76 | # ]) 77 | kwargs["text"] = action["text"] 78 | kwargs["element_id"] = action["element_id"] 79 | elif function in ["hover", "click"]: 80 | kwargs["element_id"] = action["element_id"] 81 | elif function == "scroll": 82 | kwargs["dx"] = 0 83 | kwargs["dy"] = 100 if action["direction"].lower() == "down" else -100 84 | elif function in ["key_press", "press"]: 85 | kwargs["key_comb"] = action["key_comb"] 86 | function = "press" 87 | elif function in ["new_tab", "goto", "goto_url"]: 88 | kwargs["url"] = action["url"] 89 | function = "goto" if function == "goto_url" else function 90 | elif function in ["tab_focus", "page_focus"]: 91 | kwargs["page_number"] = action["page_number"] 92 | function = "tab_focus" 93 | elif function in ["go_back", "page_close", "go_forward"]: 94 | function = "tab_close" if function == "page_close" else function 95 | else: 96 | raise ValueError(f"Unknown function: {function}") 97 | 98 | return { 99 | "function": function, 100 | "kwargs": kwargs, 101 | "description": metadata.get("cot", ""), 102 | } 103 | 104 | def convert_to_dataset(self) -> None: 105 | """Convert WebArena data to our dataset format""" 106 | self.logger.info("Creating dataset...") 107 | 108 | # Initialize dataset 109 | dataset = MultiAgentDataset( 110 | name="WebArena Interactions", 111 | base_path=self.output_path, 112 | description="Web interaction trajectories from WebArena dataset", 113 | ) 114 | 115 | # Read trajectories 116 | with open(self.source_path / "trajectories.jsonl", "r") as f: 117 | for line in f: 118 | raw_traj = json.loads(line) 119 | 120 | # Skip blacklisted sources 121 | if raw_traj["source"] in ["SteP"]: 122 | continue 123 | 124 | # Create agent metadata 125 | agents_metadata = { 126 | "agent": AgentMetadata( 127 | agent_id="agent", 128 | agent_type="web_agent", 129 | capabilities=["navigation", "interaction"], 130 | parameters={"viewport_size": (1280, 720)}, 131 | ), 132 | "user": AgentMetadata( 133 | agent_id="user", 134 | agent_type="human", 135 | capabilities=["instruction"], 136 | ), 137 | } 138 | 139 | # Create instance 140 | instance_id = str(raw_traj["task_id"]) 141 | instance_metadata = { 142 | "task": raw_traj["intent"], 143 | "source_model": raw_traj["source"], 144 | } 145 | 146 | instance_id = dataset.create_instance( 147 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 148 | ) 149 | 150 | # Add initial task observation 151 | dataset.add_data_point( 152 | instance_id=instance_id, 153 | agent_id="user", 154 | timestamp=datetime.now(), # Using current time as original times not available 155 | point_type=PointType.ACTION, 156 | data={"text": raw_traj["intent"]}, 157 | media_type=MediaType.JSON, 158 | ) 159 | 160 | # Process trajectory elements 161 | for element in raw_traj["trajectory"]: 162 | timestamp = ( 163 | datetime.now() 164 | ) # Using current time as original times not available 165 | 166 | if "action" in element: 167 | # Convert action 168 | action_data = self._convert_action( 169 | element["action"], element.get("metadata", {}) 170 | ) 171 | 172 | dataset.add_data_point( 173 | instance_id=instance_id, 174 | agent_id="agent", 175 | timestamp=timestamp, 176 | point_type=PointType.ACTION, 177 | data=action_data, 178 | media_type=MediaType.JSON, 179 | ) 180 | 181 | elif "url" in element: 182 | # Add URL and HTML observation 183 | web_data = {"url": element["url"], "html": element["axtree"]} 184 | dataset.add_data_point( 185 | instance_id=instance_id, 186 | agent_id="agent", 187 | timestamp=timestamp, 188 | 
point_type=PointType.OBSERVATION, 189 | data=web_data, 190 | media_type=MediaType.JSON, 191 | ) 192 | 193 | # Add screenshot observation 194 | # screenshot_path = element["screenshot_path"].replace( 195 | # "demo_trajs/images/", str(self.screenshots_path) 196 | # ) 197 | # if os.path.exists(screenshot_path): 198 | # # Load and convert image to numpy array 199 | # image = Image.open(screenshot_path) 200 | # image_array = np.array(image) 201 | 202 | # dataset.add_data_point( 203 | # instance_id=instance_id, 204 | # agent_id="agent", 205 | # timestamp=timestamp, 206 | # point_type=PointType.OBSERVATION, 207 | # data=image_array, 208 | # media_type=MediaType.IMAGE, 209 | # metadata={"original_path": screenshot_path}, 210 | # ) 211 | else: 212 | self.logger.warning( 213 | f"Unknown element type in trajectory: {element}" 214 | ) 215 | 216 | self.logger.info("Dataset conversion complete!") 217 | dataset.close() 218 | 219 | 220 | if __name__ == "__main__": 221 | source_path = Path(".data/raw/webarena-nnetnav") 222 | output_path = Path(".data/webarena-nnetnav") 223 | 224 | run_converter(WebArenaConverter, output_path, source_path) 225 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/trajectory.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ConfigDict, Field 2 | from datetime import datetime 3 | from typing import Any, Optional, Union 4 | from typing_extensions import Self 5 | from enum import Enum 6 | from pathlib import Path 7 | import numpy as np 8 | import numpy.typing as npt 9 | import json 10 | 11 | 12 | class MediaType(str, Enum): 13 | """Types of media data supported""" 14 | 15 | IMAGE = "image" 16 | AUDIO = "audio" 17 | VIDEO = "video" 18 | TEXT = "text" 19 | NUMPY = "numpy" 20 | JSON = "json" 21 | 22 | 23 | class PointType(str, Enum): 24 | """Type of trajectory point""" 25 | 26 | OBSERVATION = "observation" 27 | ACTION = "action" 28 | 29 | 30 | class MediaReference(BaseModel): 31 | """Reference to media data stored on disk""" 32 | 33 | media_type: MediaType 34 | file_path: Path 35 | shape: tuple[int, ...] 
| None = None # Optional for JSON data 36 | dtype: Optional[str] = None # Optional for JSON data 37 | metadata: dict[str, Any] | None = None 38 | 39 | 40 | class MediaStorage: 41 | """Handles storage and retrieval of both media data and JSON content""" 42 | 43 | def __init__(self, base_path: Path): 44 | self.base_path = base_path 45 | self.base_path.mkdir(parents=True, exist_ok=True) 46 | self._json_path = self.base_path / "json_data" 47 | self._json_path.mkdir(exist_ok=True) 48 | self._numpy_path = self.base_path / "numpy_data" 49 | self._numpy_path.mkdir(exist_ok=True) 50 | 51 | def store_data( 52 | self, 53 | data: Union[npt.ArrayLike, dict[str, Any], str], 54 | media_type: MediaType, 55 | trajectory_id: str, 56 | timestamp: str, 57 | point_type: PointType, 58 | ) -> MediaReference: 59 | """Store either media data or JSON content""" 60 | if media_type == MediaType.JSON: 61 | assert isinstance( 62 | data, (dict, str) 63 | ), "JSON data must be a dictionary or string" 64 | return self._store_json(data, trajectory_id, timestamp, point_type) 65 | else: 66 | assert isinstance(data, np.ndarray), "Media data must be a NumPy" 67 | return self._store_numpy( 68 | data, media_type, trajectory_id, timestamp, point_type 69 | ) 70 | 71 | def _store_numpy( 72 | self, 73 | data: npt.NDArray[Any], 74 | media_type: MediaType, 75 | trajectory_id: str, 76 | timestamp: str, 77 | point_type: PointType, 78 | ) -> MediaReference: 79 | """Store media data in HDF5""" 80 | data_path = self._numpy_path / f"{trajectory_id}_{point_type}_{timestamp}.npy" 81 | 82 | np.save(data_path, data) 83 | 84 | return MediaReference( 85 | media_type=media_type, 86 | file_path=data_path.relative_to(self.base_path), 87 | shape=data.shape, 88 | dtype=str(data.dtype), 89 | ) 90 | 91 | def _store_json( 92 | self, 93 | data: dict[str, Any] | str, 94 | trajectory_id: str, 95 | timestamp: str, 96 | point_type: PointType, 97 | ) -> MediaReference: 98 | """Store JSON data""" 99 | json_file = self._json_path / f"{trajectory_id}_{point_type}_{timestamp}.json" 100 | 101 | with open(json_file, "w") as f: 102 | json.dump(data, f) 103 | 104 | return MediaReference( 105 | media_type=MediaType.JSON, 106 | file_path=json_file.relative_to(self.base_path), 107 | metadata={"timestamp": timestamp}, 108 | ) 109 | 110 | def load_data( 111 | self, reference: MediaReference 112 | ) -> npt.NDArray[Any] | dict[str, Any] | str: 113 | """Load either media or JSON data from reference""" 114 | if reference.media_type == MediaType.JSON: 115 | with open(self.base_path / reference.file_path, "r") as f: 116 | json_data = json.load(f) 117 | assert isinstance(json_data, (dict, str)), "Invalid JSON data" 118 | return json_data 119 | else: 120 | data_path = self.base_path / reference.file_path 121 | data = np.load(data_path) 122 | assert isinstance(data, np.ndarray), "Invalid NumPy data" 123 | return data 124 | 125 | def close(self) -> None: 126 | pass 127 | 128 | 129 | class TrajectoryPoint(BaseModel): 130 | """ 131 | Single point in a trajectory that can be either observation or action 132 | """ 133 | 134 | model_config = ConfigDict(arbitrary_types_allowed=True) 135 | 136 | timestamp: datetime 137 | agent_id: str 138 | point_type: PointType 139 | data_reference: MediaReference 140 | metadata: dict[str, Any] = Field(default_factory=dict) 141 | 142 | 143 | class SymmetricTrajectory: 144 | """Trajectory with symmetric handling of observations and actions""" 145 | 146 | def __init__(self, trajectory_id: str, storage_path: Path): 147 | self.trajectory_id = trajectory_id 
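# --- Illustrative note and usage sketch (not part of the original module;
# kept as a comment so the module itself is unchanged). MediaStorage, defined
# above, writes each payload to disk and returns a MediaReference pointing at
# it: JSON payloads land under <storage_path>/json_data/ and NumPy arrays
# under <storage_path>/numpy_data/, keyed by trajectory id, point type, and
# timestamp. A minimal, hypothetical round-trip looks like this:
#
#   from pathlib import Path
#
#   storage = MediaStorage(Path(".data/demo_storage"))
#   ref = storage.store_data(
#       data={"text": "click #submit"},
#       media_type=MediaType.JSON,
#       trajectory_id="demo_agent",
#       timestamp="2024-01-01T00:00:00",
#       point_type=PointType.ACTION,
#   )
#   assert storage.load_data(ref) == {"text": "click #submit"}
#
# SymmetricTrajectory below builds on exactly this mechanism: every
# add_point() call stores the payload through MediaStorage and records the
# returned MediaReference in points.json.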
148 | self.media_storage = MediaStorage(storage_path) 149 | self.points: list[TrajectoryPoint] = [] 150 | self.points_file = storage_path / "points.json" 151 | 152 | # Load points if they exist 153 | self._load_points() 154 | 155 | def _load_points(self) -> None: 156 | """Load trajectory points from disk""" 157 | if self.points_file.exists(): 158 | try: 159 | with open(self.points_file, "r") as f: 160 | points_data = json.load(f) 161 | self.points = [ 162 | TrajectoryPoint( 163 | timestamp=datetime.fromisoformat(p["timestamp"]), 164 | agent_id=p["agent_id"], 165 | point_type=PointType(p["point_type"]), 166 | data_reference=MediaReference.model_validate_json( 167 | p["data_reference"] 168 | ), 169 | metadata=p.get("metadata", {}), 170 | ) 171 | for p in points_data 172 | ] 173 | except Exception as e: 174 | print(f"Error loading points: {e}") 175 | 176 | def _save_points(self) -> None: 177 | """Save trajectory points to disk""" 178 | points_data = [ 179 | { 180 | "timestamp": p.timestamp.isoformat(), 181 | "agent_id": p.agent_id, 182 | "point_type": p.point_type.value, 183 | "data_reference": p.data_reference.model_dump_json(), 184 | "metadata": p.metadata, 185 | } 186 | for p in self.points 187 | ] 188 | 189 | with open(self.points_file, "w") as f: 190 | json.dump(points_data, f, indent=2) 191 | 192 | def add_point( 193 | self, 194 | timestamp: datetime, 195 | agent_id: str, 196 | point_type: PointType, 197 | data: npt.NDArray[Any] | dict[str, Any] | str, 198 | media_type: MediaType, 199 | metadata: dict[str, Any] | None = None, 200 | ) -> None: 201 | """Add either observation or action point""" 202 | data_reference = self.media_storage.store_data( 203 | data=data, 204 | media_type=media_type, 205 | trajectory_id=self.trajectory_id, 206 | timestamp=timestamp.isoformat(), 207 | point_type=point_type, 208 | ) 209 | 210 | point = TrajectoryPoint( 211 | timestamp=timestamp, 212 | agent_id=agent_id, 213 | point_type=point_type, 214 | data_reference=data_reference, 215 | metadata=metadata or {}, 216 | ) 217 | 218 | self.points.append(point) 219 | self._save_points() # Save after each addition 220 | 221 | def get_data_at(self, index: int) -> npt.NDArray[Any] | dict[str, Any] | str: 222 | """Load data for a specific trajectory point""" 223 | point = self.points[index] 224 | return self.media_storage.load_data(point.data_reference) 225 | 226 | def get_points_by_type(self, point_type: PointType) -> list[TrajectoryPoint]: 227 | """Get all points of a specific type""" 228 | return [p for p in self.points if p.point_type == point_type] 229 | 230 | def get_points_by_agent(self, agent_id: str) -> list[TrajectoryPoint]: 231 | """Get all points for a specific agent""" 232 | return [p for p in self.points if p.agent_id == agent_id] 233 | 234 | def get_points_in_timerange( 235 | self, start_time: datetime, end_time: datetime 236 | ) -> list[TrajectoryPoint]: 237 | """Get points within a time range""" 238 | return [p for p in self.points if start_time <= p.timestamp <= end_time] 239 | 240 | def close(self) -> None: 241 | """Close media storage and ensure points are saved""" 242 | self._save_points() 243 | self.media_storage.close() 244 | 245 | def __enter__(self) -> Self: 246 | return self 247 | 248 | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: 249 | self.close() 250 | 251 | 252 | def render_trajectory(trajectory: SymmetricTrajectory) -> list[dict[str, Any]]: 253 | """Render a trajectory as a list of dictionaries""" 254 | return [ 255 | { 256 | "timestamp": p.timestamp.isoformat(), 
257 | "agent_id": p.agent_id, 258 | "point_type": p.point_type, 259 | "data": trajectory.get_data_at(i), 260 | "metadata": p.metadata, 261 | } 262 | for i, p in enumerate(trajectory.points) 263 | ] 264 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/annotation.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pydantic import BaseModel, Field 3 | from typing import Optional, Any 4 | from pathlib import Path 5 | from datetime import datetime 6 | import yaml 7 | from uuid import uuid4 8 | 9 | 10 | class Annotator(BaseModel): 11 | """Information about an annotator""" 12 | 13 | annotator_id: str 14 | name: str 15 | role: Optional[str] = None 16 | expertise_level: Optional[str] = None 17 | metadata: dict[str, Any] = Field(default_factory=dict) 18 | 19 | 20 | class AnnotationSpan(BaseModel): 21 | """Represents a span of trajectory points being annotated""" 22 | 23 | start_time: datetime 24 | end_time: Optional[datetime] = None 25 | point_indices: Optional[list[int]] = None 26 | 27 | 28 | class Annotation(BaseModel): 29 | """Single annotation entry""" 30 | 31 | annotation_id: str = Field(default_factory=lambda: str(uuid4())) 32 | annotator_id: str 33 | created_at: datetime = Field(default_factory=datetime.now) 34 | updated_at: datetime = Field(default_factory=datetime.now) 35 | content: dict[str, Any] # Flexible annotation content 36 | span: Optional[AnnotationSpan] = None 37 | confidence: Optional[float] = None 38 | metadata: dict[str, Any] = Field(default_factory=dict) 39 | 40 | 41 | class TrajectoryAnnotations(BaseModel): 42 | """Collection of annotations for a specific trajectory""" 43 | 44 | instance_id: str 45 | agent_id: str 46 | annotations: list[Annotation] = Field(default_factory=list) 47 | metadata: dict[str, Any] = Field(default_factory=dict) 48 | 49 | 50 | class AnnotationProject(BaseModel): 51 | """Metadata for an annotation project""" 52 | 53 | project_id: str 54 | name: str 55 | description: str 56 | annotation_schema: dict[str, Any] # Defines the expected annotation structure 57 | guidelines: Optional[str] = None 58 | created_at: datetime = Field(default_factory=datetime.now) 59 | updated_at: datetime = Field(default_factory=datetime.now) 60 | annotators: dict[str, Annotator] = Field(default_factory=dict) 61 | metadata: dict[str, Any] = Field(default_factory=dict) 62 | 63 | 64 | class AnnotationSystem: 65 | """ 66 | System for managing annotations separate from but linked to the dataset 67 | """ 68 | 69 | def __init__( 70 | self, 71 | base_path: Path | str, 72 | project_name: str | None = None, 73 | description: str = "", 74 | annotation_schema: Optional[dict[str, Any]] = None, 75 | ): 76 | self.base_path = Path(base_path) 77 | self.annotations_path = self.base_path / "annotations" 78 | self.project_path = self.base_path / "project.yaml" 79 | 80 | # Initialize directory structure 81 | self.base_path.mkdir(parents=True, exist_ok=True) 82 | self.annotations_path.mkdir(exist_ok=True) 83 | 84 | # Initialize or load project metadata 85 | self.project = self._init_project( 86 | project_name, description, annotation_schema or {} 87 | ) 88 | 89 | def _init_project( 90 | self, 91 | name: str | None, 92 | description: str, 93 | annotation_schema: dict[str, Any], 94 | ) -> AnnotationProject: 95 | """Initialize or load project metadata""" 96 | if self.project_path.exists(): 97 | with open(self.project_path, "r") as f: 98 | project_dict = yaml.safe_load(f) 99 | return 
AnnotationProject(**project_dict) 100 | else: 101 | if not name: 102 | raise ValueError("Project name is required") 103 | project = AnnotationProject( 104 | project_id=str(uuid4()), 105 | name=name, 106 | description=description, 107 | annotation_schema=annotation_schema, 108 | ) 109 | self._save_project(project) 110 | return project 111 | 112 | def _save_project(self, project: AnnotationProject) -> None: 113 | """Save project metadata to disk""" 114 | with open(self.project_path, "w") as f: 115 | yaml.dump(json.loads(project.model_dump_json()), f) 116 | 117 | def add_annotator( 118 | self, 119 | annotator_id: str, 120 | name: str, 121 | role: str | None = None, 122 | expertise_level: str | None = None, 123 | metadata: dict[str, Any] | None = None, 124 | ) -> None: 125 | """Register a new annotator""" 126 | annotator = Annotator( 127 | annotator_id=annotator_id, 128 | name=name, 129 | role=role, 130 | expertise_level=expertise_level, 131 | metadata=metadata or {}, 132 | ) 133 | self.project.annotators[annotator_id] = annotator 134 | self._save_project(self.project) 135 | 136 | def _get_trajectory_annotation_path(self, instance_id: str, agent_id: str) -> Path: 137 | """Get path for trajectory annotations""" 138 | return self.annotations_path / f"{instance_id}_{agent_id}.json" 139 | 140 | def get_trajectory_annotations( 141 | self, instance_id: str, agent_id: str 142 | ) -> TrajectoryAnnotations: 143 | """Get all annotations for a specific trajectory""" 144 | annotation_path = self._get_trajectory_annotation_path(instance_id, agent_id) 145 | if annotation_path.exists(): 146 | with open(annotation_path, "r") as f: 147 | return TrajectoryAnnotations.model_validate_json(f.read()) 148 | return TrajectoryAnnotations(instance_id=instance_id, agent_id=agent_id) 149 | 150 | def add_annotation( 151 | self, 152 | instance_id: str, 153 | agent_id: str, 154 | annotator_id: str, 155 | content: dict[str, Any], 156 | span: Optional[AnnotationSpan] = None, 157 | confidence: Optional[float] = None, 158 | metadata: Optional[dict[str, Any]] = None, 159 | ) -> str: 160 | """ 161 | Add a new annotation to a trajectory 162 | 163 | Args: 164 | instance_id: ID of the dataset instance 165 | agent_id: ID of the agent 166 | annotator_id: ID of the annotator 167 | content: The annotation content 168 | span: Optional time span or point indices being annotated 169 | confidence: Optional confidence score 170 | metadata: Optional additional metadata 171 | 172 | Returns: 173 | annotation_id: ID of the created annotation 174 | """ 175 | if annotator_id not in self.project.annotators: 176 | raise ValueError(f"Unknown annotator: {annotator_id}") 177 | 178 | # Create new annotation 179 | annotation = Annotation( 180 | annotator_id=annotator_id, 181 | content=content, 182 | span=span, 183 | confidence=confidence, 184 | metadata=metadata or {}, 185 | ) 186 | 187 | # Add to trajectory annotations 188 | trajectory_annotations = self.get_trajectory_annotations(instance_id, agent_id) 189 | trajectory_annotations.annotations.append(annotation) 190 | 191 | # Save to disk 192 | annotation_path = self._get_trajectory_annotation_path(instance_id, agent_id) 193 | with open(annotation_path, "w") as f: 194 | f.write(trajectory_annotations.model_dump_json()) 195 | 196 | return annotation.annotation_id 197 | 198 | def get_annotator_annotations( 199 | self, annotator_id: str 200 | ) -> dict[str, list[Annotation]]: 201 | """Get all annotations by a specific annotator""" 202 | annotations = {} 203 | for annotation_file in 
self.annotations_path.glob("*.json"): 204 | with open(annotation_file, "r") as f: 205 | trajectory_annotations = TrajectoryAnnotations.model_validate_json( 206 | f.read() 207 | ) 208 | 209 | # Filter annotations by annotator 210 | annotator_anns = [ 211 | ann 212 | for ann in trajectory_annotations.annotations 213 | if ann.annotator_id == annotator_id 214 | ] 215 | 216 | if annotator_anns: 217 | key = f"{trajectory_annotations.instance_id}_{trajectory_annotations.agent_id}" 218 | annotations[key] = annotator_anns 219 | 220 | return annotations 221 | 222 | def get_annotations_by_time( 223 | self, start_time: datetime, end_time: Optional[datetime] = None 224 | ) -> dict[str, list[Annotation]]: 225 | """Get annotations within a time range""" 226 | annotations = {} 227 | for annotation_file in self.annotations_path.glob("*.json"): 228 | with open(annotation_file, "r") as f: 229 | trajectory_annotations = TrajectoryAnnotations.model_validate_json( 230 | f.read() 231 | ) 232 | 233 | # Filter annotations by time 234 | time_anns = [ 235 | ann 236 | for ann in trajectory_annotations.annotations 237 | if ann.span 238 | and (not start_time or ann.span.start_time >= start_time) 239 | and ( 240 | not end_time 241 | or not ann.span.end_time 242 | or ann.span.end_time <= end_time 243 | ) 244 | ] 245 | 246 | if time_anns: 247 | key = f"{trajectory_annotations.instance_id}_{trajectory_annotations.agent_id}" 248 | annotations[key] = time_anns 249 | 250 | return annotations 251 | 252 | def get_all_annotations(self) -> dict[str, list[Annotation]]: 253 | """Get all annotations""" 254 | annotations = {} 255 | for annotation_file in self.annotations_path.glob("*.json"): 256 | with open(annotation_file, "r") as f: 257 | trajectory_annotations = TrajectoryAnnotations.model_validate_json( 258 | f.read() 259 | ) 260 | 261 | key = f"{trajectory_annotations.instance_id}_{trajectory_annotations.agent_id}" 262 | annotations[key] = trajectory_annotations.annotations 263 | 264 | return annotations 265 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/webarena.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import zipfile 3 | import json 4 | import os 5 | from pathlib import Path 6 | from datetime import datetime 7 | from typing import Any 8 | import numpy as np 9 | from PIL import Image 10 | 11 | # Import our dataset classes 12 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 13 | 14 | from .base import BaseConverter, run_converter 15 | 16 | 17 | class WebArenaConverter(BaseConverter): 18 | """Handles downloading and converting WebArena data to our dataset format""" 19 | 20 | def __init__(self, output_path: Path, source_path: Path): 21 | super().__init__(output_path, source_path) 22 | self.screenshots_path = self.source_path / "screenshots" 23 | 24 | def _setup_constants(self) -> None: 25 | """Setup WebArena-specific constants""" 26 | self.SPECIAL_KEYS = [ 27 | "Enter", 28 | "Tab", 29 | "Control", 30 | "Shift", 31 | "Meta", 32 | "Backspace", 33 | "Delete", 34 | "Escape", 35 | "ArrowUp", 36 | "ArrowDown", 37 | "ArrowLeft", 38 | "ArrowRight", 39 | "PageDown", 40 | "PageUp", 41 | "Meta+a", 42 | ] 43 | self.ASCII_CHARSET = "".join(chr(x) for x in range(32, 128)) 44 | self.FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 1000)) 45 | self._id2key = ( 46 | self.SPECIAL_KEYS 47 | + list(self.ASCII_CHARSET) 48 | + list(self.FREQ_UNICODE_CHARSET) 
49 | + ["\n"] 50 | ) 51 | 52 | def download_data(self) -> None: 53 | """Download WebArena dataset files""" 54 | self.source_path.mkdir(parents=True, exist_ok=True) 55 | 56 | # Download trajectory file 57 | if not (self.source_path / "trajectories.jsonl").exists(): 58 | self.logger.info("Downloading trajectory file...") 59 | traj_id = "1tvnaklsdSLx4Sp9Uc1spopcFpLktStO8" 60 | subprocess.run( 61 | ["gdown", traj_id, "-O", str(self.source_path / "trajectories.jsonl")], 62 | check=True, 63 | ) 64 | 65 | # Download and extract screenshots 66 | if not self.screenshots_path.exists(): 67 | self.logger.info("Downloading screenshots...") 68 | screenshots_id = "1TNfhApmiEIxiOcUqi4duvVWBaH5_m3By" 69 | zip_path = self.source_path / "screenshots.zip" 70 | 71 | subprocess.run(["gdown", screenshots_id, "-O", str(zip_path)], check=True) 72 | 73 | self.logger.info("Extracting screenshots...") 74 | with zipfile.ZipFile(zip_path, "r") as zip_ref: 75 | zip_ref.extractall(self.source_path) 76 | 77 | # Rename images directory to screenshots 78 | images_path = self.source_path / "images" 79 | if images_path.exists(): 80 | images_path.rename(self.screenshots_path) 81 | 82 | # Cleanup zip file 83 | zip_path.unlink() 84 | 85 | def _convert_action( 86 | self, action: dict[str, Any], metadata: dict[str, Any] 87 | ) -> dict[str, Any]: 88 | """Convert WebArena action to our format""" 89 | function = action["action_name"] 90 | kwargs = {} 91 | 92 | if function == "stop": 93 | kwargs["answer"] = action.get("answer", "") 94 | elif function == "type": 95 | # text_indices = action["text"] 96 | # kwargs["text"] = ''.join([ 97 | # self._id2key[i] 98 | # for i in text_indices 99 | # if isinstance(i, int) and i < len(self._id2key) and i >= len(self.SPECIAL_KEYS) 100 | # ]) 101 | kwargs["text"] = action["text"] 102 | kwargs["element_id"] = action["element_id"] 103 | elif function in ["hover", "click"]: 104 | kwargs["element_id"] = action["element_id"] 105 | elif function == "scroll": 106 | kwargs["dx"] = 0 107 | kwargs["dy"] = 100 if action["direction"].lower() == "down" else -100 108 | elif function in ["key_press", "press"]: 109 | kwargs["key_comb"] = action["key_comb"] 110 | function = "press" 111 | elif function in ["new_tab", "goto", "goto_url"]: 112 | kwargs["url"] = action["url"] 113 | function = "goto" if function == "goto_url" else function 114 | elif function in ["tab_focus", "page_focus"]: 115 | kwargs["page_number"] = action["page_number"] 116 | function = "tab_focus" 117 | elif function in ["go_back", "page_close", "go_forward"]: 118 | function = "tab_close" if function == "page_close" else function 119 | else: 120 | raise ValueError(f"Unknown function: {function}") 121 | 122 | return { 123 | "function": function, 124 | "kwargs": kwargs, 125 | "description": metadata.get("cot", ""), 126 | } 127 | 128 | def convert_to_dataset(self) -> None: 129 | """Convert WebArena data to our dataset format""" 130 | self.logger.info("Creating dataset...") 131 | 132 | # Initialize dataset 133 | dataset = MultiAgentDataset( 134 | name="WebArena Interactions", 135 | base_path=self.output_path, 136 | description="Web interaction trajectories from WebArena dataset", 137 | ) 138 | 139 | # Read trajectories 140 | with open(self.source_path / "trajectories.jsonl", "r") as f: 141 | for line in f: 142 | raw_traj = json.loads(line) 143 | 144 | # Skip blacklisted sources 145 | if raw_traj["source"] in ["SteP"]: 146 | continue 147 | 148 | # Create agent metadata 149 | agents_metadata = { 150 | "agent": AgentMetadata( 151 | agent_id="agent", 152 | 
agent_type="web_agent", 153 | capabilities=["navigation", "interaction"], 154 | parameters={"viewport_size": (1280, 720)}, 155 | ), 156 | "user": AgentMetadata( 157 | agent_id="user", 158 | agent_type="human", 159 | capabilities=["instruction"], 160 | ), 161 | } 162 | 163 | # Create instance 164 | instance_id = str(raw_traj["task_id"]) 165 | instance_metadata = { 166 | "task": raw_traj["intent"], 167 | "source_model": raw_traj["source"], 168 | } 169 | 170 | instance_id = dataset.create_instance( 171 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 172 | ) 173 | 174 | # Add initial task observation 175 | dataset.add_data_point( 176 | instance_id=instance_id, 177 | agent_id="user", 178 | timestamp=datetime.now(), # Using current time as original times not available 179 | point_type=PointType.ACTION, 180 | data={"text": raw_traj["intent"]}, 181 | media_type=MediaType.JSON, 182 | ) 183 | 184 | # Process trajectory elements 185 | for element in raw_traj["trajectory"]: 186 | timestamp = ( 187 | datetime.now() 188 | ) # Using current time as original times not available 189 | 190 | if "action" in element: 191 | # Convert action 192 | action_data = self._convert_action( 193 | element["action"], element.get("metadata", {}) 194 | ) 195 | 196 | dataset.add_data_point( 197 | instance_id=instance_id, 198 | agent_id="agent", 199 | timestamp=timestamp, 200 | point_type=PointType.ACTION, 201 | data=action_data, 202 | media_type=MediaType.JSON, 203 | ) 204 | 205 | elif "url" in element: 206 | # Add URL and HTML observation 207 | web_data = {"url": element["url"], "html": element["axtree"]} 208 | dataset.add_data_point( 209 | instance_id=instance_id, 210 | agent_id="agent", 211 | timestamp=timestamp, 212 | point_type=PointType.OBSERVATION, 213 | data=web_data, 214 | media_type=MediaType.JSON, 215 | ) 216 | 217 | # Add screenshot observation 218 | screenshot_path = element["screenshot_path"].replace( 219 | "demo_trajs/images/", str(self.screenshots_path) 220 | ) 221 | if os.path.exists(screenshot_path): 222 | # Load and convert image to numpy array 223 | image = Image.open(screenshot_path) 224 | image_array = np.array(image) 225 | 226 | dataset.add_data_point( 227 | instance_id=instance_id, 228 | agent_id="agent", 229 | timestamp=timestamp, 230 | point_type=PointType.OBSERVATION, 231 | data=image_array, 232 | media_type=MediaType.IMAGE, 233 | metadata={"original_path": screenshot_path}, 234 | ) 235 | else: 236 | self.logger.warning( 237 | f"Unknown element type in trajectory: {element}" 238 | ) 239 | 240 | self.logger.info("Dataset conversion complete!") 241 | dataset.close() 242 | 243 | 244 | if __name__ == "__main__": 245 | source_path = Path(".data/raw/webarena") 246 | output_path = Path(".data/webarena") 247 | 248 | run_converter(WebArenaConverter, output_path, source_path) 249 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/webvoyager_nnetnav_best.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from datetime import datetime 5 | from typing import Any 6 | import numpy as np 7 | from PIL import Image 8 | 9 | # Import our dataset classes 10 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 11 | from osw_data.annotation import AnnotationSystem 12 | 13 | from .base import BaseConverter, run_converter 14 | 15 | 16 | class WebVoyagerNNetNavConverter(BaseConverter): 17 | 
"""Handles downloading and converting WebArena data to our dataset format""" 18 | 19 | def __init__( 20 | self, output_path: Path, source_path: Path, annotation_path: Path | None = None 21 | ) -> None: 22 | super().__init__(output_path, source_path) 23 | self.screenshots_path = self.source_path / "screenshots" 24 | self.annotation_path = annotation_path 25 | 26 | def _setup_constants(self) -> None: 27 | """Setup WebArena-specific constants""" 28 | self.SPECIAL_KEYS = [ 29 | "Enter", 30 | "Tab", 31 | "Control", 32 | "Shift", 33 | "Meta", 34 | "Backspace", 35 | "Delete", 36 | "Escape", 37 | "ArrowUp", 38 | "ArrowDown", 39 | "ArrowLeft", 40 | "ArrowRight", 41 | "PageDown", 42 | "PageUp", 43 | "Meta+a", 44 | ] 45 | self.ASCII_CHARSET = "".join(chr(x) for x in range(32, 128)) 46 | self.FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 1000)) 47 | self._id2key = ( 48 | self.SPECIAL_KEYS 49 | + list(self.ASCII_CHARSET) 50 | + list(self.FREQ_UNICODE_CHARSET) 51 | + ["\n"] 52 | ) 53 | 54 | def download_data(self) -> None: 55 | """Download WebArena dataset files""" 56 | pass 57 | 58 | def _convert_action( 59 | self, action: dict[str, Any], metadata: dict[str, Any] 60 | ) -> dict[str, Any]: 61 | """Convert WebArena action to our format""" 62 | function = action["action_name"] 63 | kwargs = {} 64 | 65 | if function == "stop": 66 | kwargs["answer"] = action.get("answer", "") 67 | elif function == "type": 68 | # text_indices = action["text"] 69 | # kwargs["text"] = ''.join([ 70 | # self._id2key[i] 71 | # for i in text_indices 72 | # if isinstance(i, int) and i < len(self._id2key) and i >= len(self.SPECIAL_KEYS) 73 | # ]) 74 | kwargs["text"] = action["text"] 75 | kwargs["element_id"] = action["element_id"] 76 | elif function in ["hover", "click"]: 77 | kwargs["element_id"] = action["element_id"] 78 | elif function == "scroll": 79 | kwargs["dx"] = 0 80 | kwargs["dy"] = 100 if action["direction"].lower() == "down" else -100 81 | elif function in ["key_press", "press"]: 82 | kwargs["key_comb"] = action["key_comb"] 83 | function = "press" 84 | elif function in ["new_tab", "goto", "goto_url"]: 85 | kwargs["url"] = action["url"] 86 | function = "goto" if function == "goto_url" else function 87 | elif function in ["tab_focus", "page_focus"]: 88 | kwargs["page_number"] = action["page_number"] 89 | function = "tab_focus" 90 | elif function in ["go_back", "page_close", "go_forward"]: 91 | function = "tab_close" if function == "page_close" else function 92 | else: 93 | raise ValueError(f"Unknown function: {function}") 94 | 95 | return { 96 | "function": function, 97 | "kwargs": kwargs, 98 | "description": metadata.get("cot", ""), 99 | } 100 | 101 | def convert_to_dataset(self) -> None: 102 | """Convert WebArena data to our dataset format""" 103 | self.logger.info("Creating dataset...") 104 | 105 | # Initialize dataset 106 | dataset = MultiAgentDataset( 107 | name="WebArena Interactions", 108 | base_path=self.output_path, 109 | description="Web interaction trajectories from WebArena dataset", 110 | ) 111 | 112 | if self.annotation_path: 113 | annotation_system = AnnotationSystem( 114 | base_path=self.annotation_path, 115 | project_name="WebVoyager Annotations", 116 | description="Free-form text annotations of agent trajectories for WebVoyager", 117 | annotation_schema={ 118 | "feedback": { 119 | "type": "string", 120 | "description": "Free-form text feedback on the trajectory", 121 | } 122 | }, 123 | ) 124 | 125 | task_id2instance_id: dict[str, str] = {} 126 | 127 | # Read trajectories 128 | with 
open(self.source_path / "trajectories.jsonl", "r") as f: 129 | for line in f: 130 | raw_traj = json.loads(line) 131 | 132 | # Skip blacklisted sources 133 | if raw_traj["source"] in ["SteP"]: 134 | continue 135 | 136 | # Create agent metadata 137 | agents_metadata = { 138 | "agent": AgentMetadata( 139 | agent_id="agent", 140 | agent_type="web_agent", 141 | capabilities=["navigation", "interaction"], 142 | parameters={"viewport_size": (1280, 720)}, 143 | ), 144 | "user": AgentMetadata( 145 | agent_id="user", 146 | agent_type="human", 147 | capabilities=["instruction"], 148 | ), 149 | } 150 | 151 | # Create instance 152 | instance_id = str(raw_traj["task_id"]) 153 | instance_metadata = { 154 | "task": raw_traj["intent"], 155 | "source_model": raw_traj["source"], 156 | } 157 | 158 | instance_id = dataset.create_instance( 159 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 160 | ) 161 | 162 | task_id2instance_id[raw_traj["task_id"]] = instance_id 163 | 164 | # Add initial task observation 165 | dataset.add_data_point( 166 | instance_id=instance_id, 167 | agent_id="user", 168 | timestamp=datetime.now(), # Using current time as original times not available 169 | point_type=PointType.ACTION, 170 | data={"text": raw_traj["intent"]}, 171 | media_type=MediaType.JSON, 172 | ) 173 | 174 | # Process trajectory elements 175 | for element in raw_traj["trajectory"]: 176 | timestamp = ( 177 | datetime.now() 178 | ) # Using current time as original times not available 179 | 180 | if "action" in element: 181 | # Convert action 182 | action_data = self._convert_action( 183 | element["action"], element.get("metadata", {}) 184 | ) 185 | 186 | dataset.add_data_point( 187 | instance_id=instance_id, 188 | agent_id="agent", 189 | timestamp=timestamp, 190 | point_type=PointType.ACTION, 191 | data=action_data, 192 | media_type=MediaType.JSON, 193 | ) 194 | 195 | elif "url" in element: 196 | # Add URL and HTML observation 197 | web_data = {"url": element["url"], "html": element["axtree"]} 198 | dataset.add_data_point( 199 | instance_id=instance_id, 200 | agent_id="agent", 201 | timestamp=timestamp, 202 | point_type=PointType.OBSERVATION, 203 | data=web_data, 204 | media_type=MediaType.JSON, 205 | ) 206 | 207 | # Add screenshot observation 208 | screenshot_path = element["screenshot_path"].replace( 209 | "demo_trajs/images/", str(self.screenshots_path) 210 | ) 211 | if os.path.exists(screenshot_path): 212 | # Load and convert image to numpy array 213 | image = Image.open(screenshot_path) 214 | image_array = np.array(image) 215 | 216 | dataset.add_data_point( 217 | instance_id=instance_id, 218 | agent_id="agent", 219 | timestamp=timestamp, 220 | point_type=PointType.OBSERVATION, 221 | data=image_array, 222 | media_type=MediaType.IMAGE, 223 | metadata={"original_path": screenshot_path}, 224 | ) 225 | else: 226 | self.logger.warning( 227 | f"Unknown element type in trajectory: {element}" 228 | ) 229 | 230 | if self.annotation_path: 231 | annotation_system.add_annotator( 232 | annotator_id="Shikhar", 233 | name="Shikhar Murty", 234 | ) 235 | with open(self.source_path / "feedback.json", "r") as f: 236 | task_id2feedback = json.load(f) 237 | for task_id in task_id2feedback: 238 | instance_id_or_none = task_id2instance_id.get(task_id) 239 | if instance_id_or_none: 240 | annotation_system.add_annotation( 241 | instance_id=instance_id_or_none, 242 | agent_id="agent", 243 | content={"feedback": task_id2feedback[task_id]}, 244 | annotator_id="Shikhar", 245 | ) 246 | self.logger.info("Dataset conversion 
complete!") 247 | dataset.close() 248 | 249 | 250 | if __name__ == "__main__": 251 | source_path = Path(".data/raw/webvoyager-nnetnav-best") 252 | output_path = Path(".data/webvoyager-nnetnav-best") 253 | 254 | run_converter( 255 | WebVoyagerNNetNavConverter, 256 | output_path, 257 | source_path, 258 | ) 259 | -------------------------------------------------------------------------------- /packages/autolibra-core/src/autolibra_core/datasets/webvoyager_nnetnav.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from datetime import datetime 5 | from typing import Any 6 | import numpy as np 7 | from PIL import Image 8 | 9 | # Import our dataset classes 10 | from osw_data import MultiAgentDataset, AgentMetadata, MediaType, PointType 11 | from osw_data.annotation import AnnotationSystem 12 | 13 | from .base import BaseConverter, run_converter 14 | 15 | 16 | class WebVoyagerNNetNavConverter(BaseConverter): 17 | """Handles downloading and converting WebArena data to our dataset format""" 18 | 19 | def __init__( 20 | self, output_path: Path, source_path: Path, annotation_path: Path | None = None 21 | ) -> None: 22 | super().__init__(output_path, source_path) 23 | self.screenshots_path = self.source_path / "screenshots" 24 | self.annotation_path = annotation_path 25 | 26 | def _setup_constants(self) -> None: 27 | """Setup WebArena-specific constants""" 28 | self.SPECIAL_KEYS = [ 29 | "Enter", 30 | "Tab", 31 | "Control", 32 | "Shift", 33 | "Meta", 34 | "Backspace", 35 | "Delete", 36 | "Escape", 37 | "ArrowUp", 38 | "ArrowDown", 39 | "ArrowLeft", 40 | "ArrowRight", 41 | "PageDown", 42 | "PageUp", 43 | "Meta+a", 44 | ] 45 | self.ASCII_CHARSET = "".join(chr(x) for x in range(32, 128)) 46 | self.FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 1000)) 47 | self._id2key = ( 48 | self.SPECIAL_KEYS 49 | + list(self.ASCII_CHARSET) 50 | + list(self.FREQ_UNICODE_CHARSET) 51 | + ["\n"] 52 | ) 53 | 54 | def download_data(self) -> None: 55 | """Download WebArena dataset files""" 56 | pass 57 | 58 | def _convert_action( 59 | self, action: dict[str, Any], metadata: dict[str, Any] 60 | ) -> dict[str, Any]: 61 | """Convert WebArena action to our format""" 62 | function = action["action_name"] 63 | kwargs = {} 64 | 65 | if function == "stop": 66 | kwargs["answer"] = action.get("answer", "") 67 | elif function == "type": 68 | # text_indices = action["text"] 69 | # kwargs["text"] = ''.join([ 70 | # self._id2key[i] 71 | # for i in text_indices 72 | # if isinstance(i, int) and i < len(self._id2key) and i >= len(self.SPECIAL_KEYS) 73 | # ]) 74 | kwargs["text"] = action["text"] 75 | kwargs["element_id"] = action["element_id"] 76 | elif function in ["hover", "click"]: 77 | kwargs["element_id"] = action["element_id"] 78 | elif function == "scroll": 79 | kwargs["dx"] = 0 80 | kwargs["dy"] = 100 if action["direction"].lower() == "down" else -100 81 | elif function in ["key_press", "press"]: 82 | kwargs["key_comb"] = action["key_comb"] 83 | function = "press" 84 | elif function in ["new_tab", "goto", "goto_url"]: 85 | kwargs["url"] = action["url"] 86 | function = "goto" if function == "goto_url" else function 87 | elif function in ["tab_focus", "page_focus"]: 88 | kwargs["page_number"] = action["page_number"] 89 | function = "tab_focus" 90 | elif function in ["go_back", "page_close", "go_forward"]: 91 | function = "tab_close" if function == "page_close" else function 92 | else: 93 | raise ValueError(f"Unknown function: 
{function}") 94 | 95 | return { 96 | "function": function, 97 | "kwargs": kwargs, 98 | "description": metadata.get("cot", ""), 99 | } 100 | 101 | def convert_to_dataset(self) -> None: 102 | """Convert WebArena data to our dataset format""" 103 | self.logger.info("Creating dataset...") 104 | 105 | # Initialize dataset 106 | dataset = MultiAgentDataset( 107 | name="WebArena Interactions", 108 | base_path=self.output_path, 109 | description="Web interaction trajectories from WebArena dataset", 110 | ) 111 | 112 | task_id2instance_id: dict[str, str] = {} 113 | 114 | # Read trajectories 115 | with open(self.source_path / "trajectories.jsonl", "r") as f: 116 | for line in f: 117 | raw_traj = json.loads(line) 118 | 119 | # Skip blacklisted sources 120 | if raw_traj["source"] in ["SteP"]: 121 | continue 122 | 123 | # Create agent metadata 124 | agents_metadata = { 125 | "agent": AgentMetadata( 126 | agent_id="agent", 127 | agent_type="web_agent", 128 | capabilities=["navigation", "interaction"], 129 | parameters={"viewport_size": (1280, 720)}, 130 | ), 131 | "user": AgentMetadata( 132 | agent_id="user", 133 | agent_type="human", 134 | capabilities=["instruction"], 135 | ), 136 | } 137 | 138 | # Create instance 139 | instance_id = str(raw_traj["task_id"]) 140 | instance_metadata = { 141 | "task": raw_traj["intent"], 142 | "source_model": raw_traj["source"], 143 | } 144 | 145 | instance_id = dataset.create_instance( 146 | agents_metadata=agents_metadata, instance_metadata=instance_metadata 147 | ) 148 | 149 | task_id2instance_id[raw_traj["task_id"]] = instance_id 150 | 151 | # Add initial task observation 152 | dataset.add_data_point( 153 | instance_id=instance_id, 154 | agent_id="user", 155 | timestamp=datetime.now(), # Using current time as original times not available 156 | point_type=PointType.ACTION, 157 | data={"text": raw_traj["intent"]}, 158 | media_type=MediaType.JSON, 159 | ) 160 | 161 | # Process trajectory elements 162 | for element in raw_traj["trajectory"]: 163 | timestamp = ( 164 | datetime.now() 165 | ) # Using current time as original times not available 166 | 167 | if "action" in element: 168 | # Convert action 169 | action_data = self._convert_action( 170 | element["action"], element.get("metadata", {}) 171 | ) 172 | 173 | dataset.add_data_point( 174 | instance_id=instance_id, 175 | agent_id="agent", 176 | timestamp=timestamp, 177 | point_type=PointType.ACTION, 178 | data=action_data, 179 | media_type=MediaType.JSON, 180 | ) 181 | 182 | elif "url" in element: 183 | # Add URL and HTML observation 184 | web_data = {"url": element["url"], "html": element["axtree"]} 185 | dataset.add_data_point( 186 | instance_id=instance_id, 187 | agent_id="agent", 188 | timestamp=timestamp, 189 | point_type=PointType.OBSERVATION, 190 | data=web_data, 191 | media_type=MediaType.JSON, 192 | ) 193 | 194 | # Add screenshot observation 195 | screenshot_path = element["screenshot_path"].replace( 196 | "demo_trajs/images/", str(self.screenshots_path) 197 | ) 198 | if os.path.exists(screenshot_path): 199 | # Load and convert image to numpy array 200 | image = Image.open(screenshot_path) 201 | image_array = np.array(image) 202 | 203 | dataset.add_data_point( 204 | instance_id=instance_id, 205 | agent_id="agent", 206 | timestamp=timestamp, 207 | point_type=PointType.OBSERVATION, 208 | data=image_array, 209 | media_type=MediaType.IMAGE, 210 | metadata={"original_path": screenshot_path}, 211 | ) 212 | else: 213 | self.logger.warning( 214 | f"Unknown element type in trajectory: {element}" 215 | ) 216 | 217 | if 
self.annotation_path: 218 | annotation_system = AnnotationSystem( 219 | base_path=self.annotation_path, 220 | project_name="WebVoyager Annotations", 221 | description="Free-form text annotations of agent trajectories for WebVoyager", 222 | annotation_schema={ 223 | "feedback": { 224 | "type": "string", 225 | "description": "Free-form text feedback on the trajectory", 226 | } 227 | }, 228 | ) 229 | 230 | annotation_system.add_annotator( 231 | annotator_id="Shikhar", 232 | name="Shikhar Murty", 233 | ) 234 | with open(self.source_path / "feedback.json", "r") as f: 235 | task_id2feedback = json.load(f) 236 | for task_id in task_id2feedback: 237 | instance_id_or_none = task_id2instance_id.get(task_id) 238 | if instance_id_or_none: 239 | annotation_system.add_annotation( 240 | instance_id=instance_id_or_none, 241 | agent_id="agent", 242 | content={"feedback": task_id2feedback[task_id]}, 243 | annotator_id="Shikhar", 244 | ) 245 | self.logger.info("Dataset conversion complete!") 246 | dataset.close() 247 | 248 | 249 | if __name__ == "__main__": 250 | source_path = Path(".data/raw/webvoyager-nnetnav") 251 | output_path = Path(".data/webvoyager-nnetnav") 252 | annotation_path = Path(".data/annotations/webvoyager-nnetnav") 253 | 254 | run_converter( 255 | WebVoyagerNNetNavConverter, 256 | output_path, 257 | source_path, 258 | annotation_path=annotation_path, 259 | ) 260 | -------------------------------------------------------------------------------- /packages/osw-data/src/osw_data/utils.py: -------------------------------------------------------------------------------- 1 | # Dataset-specific utils 2 | # In this file, we will include the utility functions to download datasets into files 3 | 4 | # balrog 5 | import requests 6 | import os 7 | from urllib.parse import quote 8 | 9 | from pathlib import Path 10 | from typing import Tuple, Generator 11 | 12 | from rich.console import Console 13 | from rich.table import Table 14 | from rich.panel import Panel 15 | from rich.text import Text 16 | from rich import box 17 | 18 | 19 | def download_github_folder( 20 | owner: str, repo: str, path: str, save_path: str, token: str | None = None 21 | ) -> None: 22 | """ 23 | Recursively download a folder from GitHub 24 | 25 | Parameters: 26 | - owner: repository owner 27 | - repo: repository name 28 | - path: path to folder in repository 29 | - save_path: local path to save files 30 | - token: GitHub personal access token (optional) 31 | """ 32 | headers = {} 33 | if token or (token := os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN")): 34 | headers["Authorization"] = f"token {token}" 35 | 36 | api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{quote(path)}" 37 | response = requests.get(api_url, headers=headers) 38 | if response.status_code != 200: 39 | raise Exception(f"Failed to get content: {response.status_code}") 40 | 41 | for item in response.json(): 42 | local_path = Path(save_path) / item["name"] 43 | 44 | if item["type"] == "dir": 45 | # If it's a directory, create it and recurse 46 | local_path.mkdir(parents=True, exist_ok=True) 47 | 48 | download_github_folder(owner, repo, item["path"], local_path, token) 49 | print(f"Processed directory: {item['path']}") 50 | 51 | elif item["type"] == "file": 52 | # Skip if file already exists and has same size 53 | if local_path.exists(): 54 | # Get local file size 55 | local_size = local_path.stat().st_size 56 | # Get GitHub file size 57 | github_size = item["size"] 58 | 59 | if local_size == github_size: 60 | print(f"Skipping existing file: {item['path']}") 
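# NOTE: byte-size equality is the only freshness check here; a modified file whose
# size matches the GitHub-reported size will not be re-downloaded.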
61 |                     continue
62 |                 else:
63 |                     print(f"Size mismatch, re-downloading: {item['path']}")
64 |
65 |             # Create parent directories if they don't exist
66 |             local_path.parent.mkdir(parents=True, exist_ok=True)
67 |
68 |             # Download file content
69 |             download_url = item["download_url"]
70 |             file_response = requests.get(download_url, headers=headers)
71 |
72 |             # Save the file
73 |             with open(local_path, "wb") as f:
74 |                 f.write(file_response.content)
75 |             print(f"Downloaded file: {item['path']}")
76 |
77 |
78 | def file_pairs(folder_path: str) -> Generator[Tuple[Path, Path], None, None]:
79 |     """
80 |     Generate pairs of CSV and JSON files with matching names from a folder.
81 |
82 |     Args:
83 |         folder_path: Path to the folder to search in
84 |
85 |     Yields:
86 |         Tuple of (csv_path, json_path) for matching files
87 |     """
88 |     path = Path(folder_path)
89 |
90 |     # Find all CSV files and check for JSON pairs
91 |     for csv_file in path.rglob("*.csv"):
92 |         json_file = csv_file.with_suffix(".json")
93 |         if json_file.exists():
94 |             yield csv_file, json_file
95 |
96 |
97 | def file_pairs_list(folder_path: Path) -> list[tuple[Path, Path]]:
98 |     """
99 |     Collect pairs of CSV and JSON files with matching names from a folder.
100 |
101 |     Args:
102 |         folder_path: Path to the folder to search in
103 |
104 |     Returns:
105 |         List of (csv_path, json_path) tuples, one per CSV file found
106 |     """
107 |     path = folder_path
108 |
109 |     json_folder = []
110 |
111 |     # Find all CSV files and require a matching JSON file for each
112 |     for csv_file in path.rglob("*.csv"):
113 |         json_file = csv_file.with_suffix(".json")
114 |         if not json_file.exists():
115 |             raise FileNotFoundError(f"JSON file not found for {csv_file}")
116 |         json_folder.append((csv_file, json_file))
117 |
118 |     return json_folder
119 |
120 |
121 | def file_triplets(folder_path: str) -> Generator[Tuple[Path, Path, Path], None, None]:
122 |     """
123 |     Generate matching CSV, JSON, and PKL file triplets from a folder.
124 |
125 |     Args:
126 |         folder_path: Path to the folder to search in
127 |
128 |     Yields:
129 |         Tuple of (csv_path, json_path, pkl_path) for matching files
130 |     """
131 |     path = Path(folder_path)
132 |
133 |     # Find all CSV files and check for matching JSON and PKL files
134 |     for csv_file in path.rglob("*.csv"):
135 |         json_file = csv_file.with_suffix(".json")
136 |         pkl_file = csv_file.with_suffix(".pkl")
137 |         if json_file.exists() and pkl_file.exists():
138 |             yield csv_file, json_file, pkl_file
139 |
140 |
141 | def parse_text_description(text_description: str) -> list[tuple[tuple[int, int], str]]:
142 |     """
143 |     Parse a text description of object positions relative to a reference point
144 |     and convert it back into a list of relative positions and object names.
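    Directions follow screen coordinates: "right" gives a positive x offset, "left"
    a negative one, "down" a positive y offset, and "up" a negative one.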
145 | 146 | Args: 147 | text_description (str): Multi-line string describing object positions 148 | 149 | Returns: 150 | list: List of tuples ((x, y), name) where x and y are relative coordinates 151 | and name is the object name/type 152 | """ 153 | relative_positions = [] 154 | 155 | # Split the description into individual lines 156 | lines = text_description.strip().split("\n") 157 | 158 | for line in lines: 159 | if not line.strip(): 160 | continue 161 | 162 | # Initialize position values 163 | x_offset = 0 164 | y_offset = 0 165 | 166 | # Split the line into parts 167 | parts = line.split() 168 | 169 | # Extract the object name (everything before the first number) 170 | name_parts = [] 171 | i = 0 172 | while i < len(parts) and not parts[i][0].isdigit(): 173 | name_parts.append(parts[i]) 174 | i += 1 175 | name = " ".join(name_parts) 176 | 177 | # Process the remaining parts for directions 178 | while i < len(parts): 179 | # Get the number of steps 180 | try: 181 | steps = int(parts[i]) 182 | except ValueError: 183 | print(line) 184 | i += 1 185 | 186 | # Skip 'step' or 'steps' 187 | i += 1 188 | 189 | # Process direction 190 | if i < len(parts): 191 | if parts[i] == "to" and i + 1 < len(parts): 192 | i += 1 # skip 'to' 193 | if parts[i] == "the": 194 | i += 1 # skip 'the' 195 | 196 | if parts[i] == "right": 197 | x_offset = steps 198 | elif parts[i] == "left": 199 | x_offset = -steps 200 | i += 1 201 | 202 | elif parts[i] == "up": 203 | y_offset = -steps 204 | i += 1 205 | elif parts[i] == "down": 206 | y_offset = steps 207 | i += 1 208 | 209 | # Skip 'and' if present 210 | if i < len(parts) and parts[i] == "and": 211 | i += 1 212 | 213 | relative_positions.append(((x_offset, y_offset), name)) 214 | 215 | return relative_positions 216 | 217 | 218 | def visualize_map( 219 | relative_positions: list[tuple[tuple[int, int], str]], reference_char: str = "@" 220 | ) -> None: 221 | """ 222 | Visualize the game map using rich library for prettier output. 
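    The reference point (the player) is drawn at grid cell (0, 0), and the map is
    printed together with a legend and the grid's coordinate ranges.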
223 | 224 | Args: 225 | relative_positions: List of ((x, y), name) tuples 226 | reference_char: Character to represent the reference point (player) 227 | """ 228 | console = Console() 229 | 230 | # Find the dimensions of the map 231 | min_x = min(x for (x, y), _ in relative_positions) 232 | max_x = max(x for (x, y), _ in relative_positions) 233 | min_y = min(y for (x, y), _ in relative_positions) 234 | max_y = max(y for (x, y), _ in relative_positions) 235 | 236 | # Add padding and account for reference point at (0,0) 237 | min_x = min(min_x, 0) - 1 238 | max_x = max(max_x, 0) + 1 239 | min_y = min(min_y, 0) - 1 240 | max_y = max(max_y, 0) + 1 241 | 242 | # Create a rich Table for the game grid 243 | table = Table( 244 | box=box.SQUARE, 245 | padding=0, 246 | show_header=True, 247 | header_style="bold cyan", 248 | show_edge=True, 249 | ) 250 | 251 | # Add columns with X-coordinates as headers 252 | COLUMN_WIDTH = 10 # Fixed width for all columns 253 | 254 | # Add Y-coordinates column with the same width 255 | table.add_column(" ", style="bold cyan", width=COLUMN_WIDTH, justify="center") 256 | 257 | # Add other columns with consistent width 258 | for x in range(min_x, max_x + 1): 259 | table.add_column( 260 | str(x), 261 | justify="center", 262 | width=COLUMN_WIDTH, 263 | min_width=COLUMN_WIDTH, 264 | max_width=COLUMN_WIDTH, 265 | ) 266 | 267 | # Helper function to get styled symbol for object 268 | def get_styled_symbol(name: str) -> Text: 269 | name = name.lower() 270 | if name.startswith("rule"): 271 | if "`" in name: 272 | rule_text = name.split("`")[1].split("`")[0] 273 | return Text(f"[{rule_text}]", style="bold yellow") 274 | return Text("[rule]", style="yellow") 275 | elif "wall" in name: 276 | return Text("#", style="red") 277 | elif "ball" in name: 278 | return Text("o", style="green") 279 | elif "key" in name: 280 | return Text("k", style="blue") 281 | else: 282 | return Text("*", style="white") 283 | 284 | # Create the grid with objects 285 | for y in range(min_y, max_y + 1): 286 | row: list[str | Text] = [str(y)] # Y-coordinate 287 | for x in range(min_x, max_x + 1): 288 | cell_content = Text(" ") # Empty cell by default 289 | 290 | # Check if this is the reference point (0,0) 291 | if x == 0 and y == 0: 292 | # Center the reference character 293 | padding = (COLUMN_WIDTH - 1) // 2 # -1 for single character 294 | cell_content = Text( 295 | " " * padding + reference_char, style="bold magenta" 296 | ) 297 | 298 | # Check if there's an object at this position 299 | for (obj_x, obj_y), name in relative_positions: 300 | if obj_x == x and obj_y == y: 301 | symbol = get_styled_symbol(name) 302 | # Center the symbol in the column width 303 | padding = (COLUMN_WIDTH - len(str(symbol))) // 2 304 | cell_content = Text(" " * padding) + symbol 305 | break 306 | 307 | row.append(cell_content) 308 | table.add_row(*row) 309 | 310 | # Create legend panel 311 | legend_text = [ 312 | Text("Legend:", style="bold"), 313 | Text(f"\n{reference_char} ", style="bold magenta") 314 | + Text("- Player (reference point)"), 315 | Text("\n# ", style="red") + Text("- Wall"), 316 | Text("\no ", style="green") + Text("- Ball"), 317 | Text("\nk ", style="blue") + Text("- Key"), 318 | Text("\n[text] ", style="bold yellow") + Text("- Rule"), 319 | Text("\n* ", style="white") + Text("- Other objects"), 320 | ] 321 | legend = Text.assemble(*legend_text) 322 | 323 | # Create coordinates panel 324 | coord_text = Text.assemble( 325 | Text("Grid coordinates:", style="bold"), 326 | Text(f"\nX: {min_x} to {max_x}"), 327 | 
Text(f"\nY: {min_y} to {max_y}"), 328 | ) 329 | 330 | # Print everything 331 | console.print(Panel(table, title="Game Map", border_style="cyan")) 332 | console.print(Panel(legend, title="Legend", border_style="green")) 333 | console.print(Panel(coord_text, title="Coordinates", border_style="blue")) 334 | --------------------------------------------------------------------------------