├── tests ├── __init__.py ├── assistantbench │ ├── __init__.py │ ├── test_env_general.py │ └── test_evaluation.py ├── core │ ├── data │ │ ├── screenshot.png │ │ ├── input_type │ │ │ ├── img_submit.gif │ │ │ ├── button_input.html │ │ │ ├── file_input.html │ │ │ ├── submit_nn_input.html │ │ │ ├── email_input.html │ │ │ ├── search_input.html │ │ │ ├── number_step_input.html │ │ │ ├── url_input.html │ │ │ ├── week_input.html │ │ │ ├── range_input.html │ │ │ ├── radio_input.html │ │ │ ├── date_input.html │ │ │ ├── hidden_field_input.html │ │ │ ├── number_input.html │ │ │ ├── telephone_input.html │ │ │ ├── color_picker_input.html │ │ │ ├── date_time_local_input.html │ │ │ ├── image_input.html │ │ │ ├── password_input.html │ │ │ ├── time_input.html │ │ │ ├── text_input.html │ │ │ ├── checkbox_input.html │ │ │ ├── submit_input.html │ │ │ ├── date_min_max_input.html │ │ │ └── reset_input.html │ │ ├── hover.html │ │ ├── textbox.html │ │ ├── basic_shadow_iframe_site │ │ │ ├── inner-iframe.html │ │ │ ├── basic_iframe_2.html │ │ │ ├── basic_iframe.html │ │ │ └── outer-iframe.html │ │ ├── basic_iframe_site │ │ │ ├── basic_iframe_2.html │ │ │ ├── inner-iframe.html │ │ │ ├── outer-iframe.html │ │ │ └── basic_iframe.html │ │ ├── dblclick.html │ │ ├── lots_of_iframes.html │ │ ├── basic_shadow_dom_site │ │ │ ├── simple_shadow_dom.html │ │ │ └── basic_shadow_dom.html │ │ ├── test_page.html │ │ ├── test_page_2.html │ │ ├── example.html │ │ ├── obstructed_checkbox_page.html │ │ └── long_page.html │ ├── __init__.py │ ├── test_actions_python.py │ ├── test_registration.py │ └── test_task.py ├── experiments │ ├── __init__.py │ ├── test_bgym.py │ └── test_exp_loop.py ├── miniwob │ ├── __init__.py │ ├── test_use-colorwheel-2.py │ ├── test_click-scroll-list.py │ ├── test_click-menu-2.py │ └── test_base.py ├── webarena │ ├── __init__.py │ ├── test_instance.py │ ├── test_env_general.py │ └── test_infeasible.py ├── visualwebarena │ ├── __init__.py │ ├── test_vwa_domains.py │ ├── 
test_vwa_tasks_with_reset.py │ └── test_vwa_tasks_without_reset.py └── utils.py ├── demo_agent ├── requirements.txt ├── README.md ├── environment.yml └── run_demo.py ├── browsergym ├── miniwob │ ├── requirements.txt │ ├── pyproject.toml │ ├── README.md │ └── src │ │ └── browsergym │ │ └── miniwob │ │ └── __init__.py ├── webarena │ ├── src │ │ └── browsergym │ │ │ └── webarena │ │ │ ├── config.py │ │ │ └── __init__.py │ ├── requirements.txt │ ├── pyproject.toml │ └── README.md ├── assistantbench │ ├── requirements.txt │ ├── src │ │ └── browsergym │ │ │ └── assistantbench │ │ │ ├── evaluation │ │ │ ├── evaluate_utils │ │ │ │ ├── evaluate_factory.py │ │ │ │ ├── evaluate_numbers.py │ │ │ │ ├── utils.py │ │ │ │ └── evaluate_dicts.py │ │ │ └── evaluator.py │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ └── task.py │ ├── README.md │ └── pyproject.toml ├── experiments │ ├── requirements.txt │ ├── src │ │ ├── browsergym │ │ │ └── experiments │ │ │ │ ├── benchmark │ │ │ │ ├── __init__.py │ │ │ │ └── metadata │ │ │ │ │ └── scripts.py │ │ │ │ ├── __init__.py │ │ │ │ ├── utils.py │ │ │ │ └── agent.py │ │ └── bgym │ │ │ └── __init__.py │ ├── README.md │ └── pyproject.toml ├── webarenalite │ ├── requirements.txt │ ├── src │ │ └── browsergym │ │ │ └── webarenalite │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ └── task.py │ └── pyproject.toml ├── visualwebarena │ ├── requirements.txt │ ├── src │ │ └── browsergym │ │ │ └── visualwebarena │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── utils.py │ │ │ └── instance.py │ ├── pyproject.toml │ └── README.md ├── core │ ├── src │ │ └── browsergym │ │ │ └── core │ │ │ ├── chat_files │ │ │ ├── assistant.png │ │ │ └── img │ │ │ │ └── send.svg │ │ │ ├── constants.py │ │ │ ├── action │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── parsers.py │ │ │ └── python.py │ │ │ ├── __init__.py │ │ │ ├── javascript │ │ │ └── frame_unmark_elements.js │ │ │ ├── registration.py │ │ │ ├── task.py │ │ │ ├── chat.py │ │ │ └── spaces.py │ ├── 
requirements.txt │ ├── README.md │ └── pyproject.toml └── pyproject.toml ├── docs ├── src │ ├── core │ │ ├── observation_space.rst │ │ └── core.rst │ ├── api.rst │ ├── tutorials.rst │ ├── index.rst │ ├── examples │ │ ├── walkthrough.rst │ │ └── create_custom_task.rst │ ├── environments │ │ ├── webarena.rst │ │ ├── miniwob.rst │ │ └── workarena.rst │ ├── usage.rst │ └── conf.py ├── requirements.txt └── Makefile ├── dev ├── environment.yaml └── requirements.txt ├── .github ├── actions │ └── setup-python-uv │ │ └── action.yml └── workflows │ └── pypi.yml ├── LICENSE ├── pyproject.toml ├── .readthedocs.yaml ├── .pre-commit-config.yaml ├── Makefile └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/assistantbench/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo_agent/requirements.txt: -------------------------------------------------------------------------------- 1 | browsergym 2 | openai 3 | -------------------------------------------------------------------------------- /browsergym/miniwob/requirements.txt: -------------------------------------------------------------------------------- 1 | browsergym-core==0.14.3.dev1 2 | -------------------------------------------------------------------------------- /browsergym/webarena/src/browsergym/webarena/config.py: -------------------------------------------------------------------------------- 1 | TASK_IDS = range(812) 2 | -------------------------------------------------------------------------------- /browsergym/webarena/requirements.txt: -------------------------------------------------------------------------------- 1 | browsergym-core==0.14.3.dev1 2 | libwebarena==0.0.4 3 | 
-------------------------------------------------------------------------------- /browsergym/assistantbench/requirements.txt: -------------------------------------------------------------------------------- 1 | browsergym-core==0.14.3.dev1 2 | datasets 3 | scipy 4 | numpy 5 | -------------------------------------------------------------------------------- /browsergym/experiments/requirements.txt: -------------------------------------------------------------------------------- 1 | browsergym-core==0.14.3.dev1 2 | tiktoken>=0.4 3 | dataclasses-json 4 | -------------------------------------------------------------------------------- /tests/core/data/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ServiceNow/BrowserGym/HEAD/tests/core/data/screenshot.png -------------------------------------------------------------------------------- /browsergym/webarenalite/requirements.txt: -------------------------------------------------------------------------------- 1 | browsergym-core==0.14.3.dev1 2 | browsergym-webarena==0.14.3.dev1 3 | libwebarena==0.0.4 4 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | # bugfix: use same playwright instance in browsergym and pytest 2 | from ..utils import setup_playwright 3 | -------------------------------------------------------------------------------- /tests/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | # bugfix: use same playwright instance in browsergym and pytest 2 | from ..utils import setup_playwright 3 | -------------------------------------------------------------------------------- /tests/miniwob/__init__.py: -------------------------------------------------------------------------------- 1 | # bugfix: use same playwright instance in browsergym 
and pytest 2 | from ..utils import setup_playwright 3 | -------------------------------------------------------------------------------- /tests/webarena/__init__.py: -------------------------------------------------------------------------------- 1 | # bugfix: use same playwright instance in browsergym and pytest 2 | from ..utils import setup_playwright 3 | -------------------------------------------------------------------------------- /tests/core/data/input_type/img_submit.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ServiceNow/BrowserGym/HEAD/tests/core/data/input_type/img_submit.gif -------------------------------------------------------------------------------- /tests/visualwebarena/__init__.py: -------------------------------------------------------------------------------- 1 | # bugfix: use same playwright instance in browsergym and pytest 2 | from ..utils import setup_playwright 3 | -------------------------------------------------------------------------------- /browsergym/visualwebarena/requirements.txt: -------------------------------------------------------------------------------- 1 | browsergym-core==0.14.3.dev1 2 | browsergym-webarena 3 | libvisualwebarena==0.0.15 4 | requests 5 | torch 6 | -------------------------------------------------------------------------------- /demo_agent/README.md: -------------------------------------------------------------------------------- 1 | ## Install demo-agent 2 | 3 | conda env create -f environment.yml 4 | conda activate demo-agent 5 | playwright install chromium 6 | -------------------------------------------------------------------------------- /browsergym/experiments/src/browsergym/experiments/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Benchmark, HighLevelActionSetArgs 2 | from .configs import DEFAULT_BENCHMARKS 3 | 
-------------------------------------------------------------------------------- /docs/src/core/observation_space.rst: -------------------------------------------------------------------------------- 1 | Observation space 2 | _________________ 3 | 4 | For more details refer to the `WorkArena paper `_. 5 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/chat_files/assistant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ServiceNow/BrowserGym/HEAD/browsergym/core/src/browsergym/core/chat_files/assistant.png -------------------------------------------------------------------------------- /browsergym/experiments/src/browsergym/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent, AgentInfo 2 | from .loop import AbstractAgentArgs, EnvArgs, ExpArgs, get_exp_result 3 | -------------------------------------------------------------------------------- /browsergym/core/requirements.txt: -------------------------------------------------------------------------------- 1 | playwright==1.44 2 | gymnasium>=0.27 3 | numpy>=1.14 4 | pyparsing>=3 5 | Pillow>=10.1 6 | beautifulsoup4>=4.12 7 | lxml>=4.9,<6.0.0 8 | mcp[cli]>=1.6.0 9 | -------------------------------------------------------------------------------- /dev/environment.yaml: -------------------------------------------------------------------------------- 1 | name: browsergym-dev 2 | 3 | channels: 4 | - huggingface 5 | - conda-forge 6 | - defaults 7 | 8 | dependencies: 9 | - python>=3.10 10 | - pip 11 | 12 | - pip: 13 | - -r requirements.txt -------------------------------------------------------------------------------- /tests/core/data/input_type/button_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Input Button

6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /demo_agent/environment.yml: -------------------------------------------------------------------------------- 1 | name: demo-agent 2 | 3 | channels: 4 | - huggingface 5 | - conda-forge 6 | - defaults 7 | 8 | dependencies: 9 | - python>=3.10 10 | - pip 11 | 12 | - pip: 13 | - -r requirements.txt 14 | -------------------------------------------------------------------------------- /tests/core/data/hover.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/experiments/test_bgym.py: -------------------------------------------------------------------------------- 1 | import bgym 2 | import pytest 3 | 4 | 5 | def test_classes(): 6 | bgym.EnvArgs(task_name="something") 7 | bgym.HighLevelActionSet() 8 | with pytest.raises(TypeError): 9 | bgym.Agent() 10 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==7.3.7 2 | # sphinx-rtd-theme==2.0.0 3 | pydata-sphinx-theme == 0.15.3 4 | sphinx_design==0.6.0 5 | -e browsergym/core/ 6 | -e browsergym/miniwob/ 7 | -e browsergym/webarena/ 8 | -e browsergym/visualwebarena/ 9 | -e browsergym/experiments/ 10 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/constants.py: -------------------------------------------------------------------------------- 1 | BROWSERGYM_ID_ATTRIBUTE = "bid" # Playwright's default is "data-testid" 2 | BROWSERGYM_VISIBILITY_ATTRIBUTE = "browsergym_visibility_ratio" 3 | BROWSERGYM_SETOFMARKS_ATTRIBUTE = "browsergym_set_of_marks" 4 | 5 | EXTRACT_OBS_MAX_TRIES = 5 6 | 
-------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/action/__init__.py: -------------------------------------------------------------------------------- 1 | _DEMO_MODE = False 2 | 3 | 4 | def set_global_demo_mode(demo_mode: bool): 5 | global _DEMO_MODE 6 | _DEMO_MODE = demo_mode 7 | 8 | 9 | def get_global_demo_mode(): 10 | global _DEMO_MODE 11 | return _DEMO_MODE 12 | -------------------------------------------------------------------------------- /tests/core/data/textbox.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Simple HTML Page 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /browsergym/core/README.md: -------------------------------------------------------------------------------- 1 | # BrowserGym core 2 | 3 | This package provides `browsergym.core`, which provides the core functionalities of [BrowserGym](https://github.com/ServiceNow/BrowserGym). 4 | 5 | ## Setup 6 | 7 | 1. Install the package 8 | ```sh 9 | pip install browsergym-core 10 | ``` 11 | -------------------------------------------------------------------------------- /tests/core/data/basic_shadow_iframe_site/inner-iframe.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Inner Iframe 5 | 6 | 7 |

Iframe Level 2

8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /browsergym/experiments/README.md: -------------------------------------------------------------------------------- 1 | # BrowserGym experiments 2 | 3 | This package provides `browsergym.experiments`, a suite of experimentation tools for [BrowserGym](https://github.com/ServiceNow/BrowserGym). 4 | 5 | As a convenience namespace, it also provides `bgym`. 6 | 7 | ## Setup 8 | 9 | 1. Install the package 10 | ```sh 11 | pip install browsergym-experiments 12 | ``` 13 | -------------------------------------------------------------------------------- /tests/core/data/basic_shadow_iframe_site/basic_iframe_2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Simple Website 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/src/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | Core 5 | ____ 6 | 7 | .. toctree:: 8 | core/core.rst 9 | 10 | 11 | Action and Observation Spaces 12 | _____________________________ 13 | 14 | .. toctree:: 15 | core/action_space.rst 16 | core/observation_space.rst 17 | 18 | Environments 19 | ____________ 20 | 21 | .. 
toctree:: 22 | environments/miniwob.rst 23 | environments/webarena.rst 24 | environments/workarena.rst 25 | -------------------------------------------------------------------------------- /tests/core/data/basic_iframe_site/basic_iframe_2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Simple Website 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/src/core/core.rst: -------------------------------------------------------------------------------- 1 | BrowserGym API 2 | ^^^^^^^^^^^^^^ 3 | 4 | BrowserEnv 5 | """""""""" 6 | 7 | .. autoclass:: browsergym.core.env.BrowserEnv 8 | :members: 9 | :show-inheritance: 10 | 11 | Chat 12 | """" 13 | 14 | .. autoclass:: browsergym.core.env.Chat 15 | :members: 16 | :show-inheritance: 17 | 18 | Task 19 | """" 20 | 21 | .. autoclass:: browsergym.core.task.AbstractBrowserTask 22 | :members: 23 | :show-inheritance: 24 | 25 | -------------------------------------------------------------------------------- /tests/core/data/input_type/file_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

File upload

6 | 7 |

Show a file-select field which allows a file to be chosen for upload:

8 |
9 | 10 |

11 | 12 |
13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/core/data/input_type/submit_nn_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |
7 |
8 |
9 |

10 | 11 |
12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/core/data/dblclick.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/core/data/input_type/email_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Email Field

6 | 7 |

The input type="email" is used for input fields that should contain an e-mail address:

8 | 9 |
10 | 11 | 12 | 13 |
14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/core/data/input_type/search_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Search Field

6 |

The input type="search" is used for search fields (behaves like a regular text field):

7 | 8 |
9 | 10 | 11 | 12 |
13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/core/data/input_type/number_step_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Numeric Steps

6 | 7 |

Depending on browser support: Fixed steps will apply in the input field.

8 | 9 |
10 | 11 | 12 | 13 |
14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/core/data/input_type/url_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Display a URL Input Field

6 | 7 |

The input type="url" is used for input fields that should contain a URL address:

8 | 9 |
10 | 11 | 12 | 13 |
14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/core/data/input_type/week_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Display a URL Input Field

6 | 7 |

The input type="url" is used for input fields that should contain a URL address:

8 | 9 |
10 | 11 | 12 | 13 |
14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/core/data/lots_of_iframes.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lots of Iframes 6 | 7 | 8 | 9 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /tests/core/data/input_type/range_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Range Field

6 | 7 |

Depending on browser support: The input type "range" can be displayed as a slider control.

8 | 9 |
10 | 11 | 12 | 13 |
14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/core/data/input_type/radio_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Radio Buttons

6 | 7 |

The input type="radio" defines a radio button:

8 | 9 |

Choose your favorite Web language:

10 |
11 | 12 |   13 |  
14 | 15 | 16 |
17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /dev/requirements.txt: -------------------------------------------------------------------------------- 1 | black[jupyter]==24.2.0 2 | blacken-docs 3 | pre-commit 4 | pytest==7.3.2 5 | pytest-xdist 6 | pytest-playwright 7 | tenacity 8 | -e ../browsergym/core # local package 9 | -e ../browsergym/miniwob # local package 10 | -e ../browsergym/webarena # local package 11 | -e ../browsergym/visualwebarena # local package 12 | -e ../browsergym/experiments # local package 13 | -e ../browsergym/assistantbench # local package 14 | -e ../browsergym/webarenalite # local package 15 | browsergym-workarena 16 | weblinx_browsergym 17 | -------------------------------------------------------------------------------- /browsergym/experiments/src/bgym/__init__.py: -------------------------------------------------------------------------------- 1 | from browsergym.core.action.base import AbstractActionSet 2 | from browsergym.core.action.highlevel import HighLevelActionSet 3 | from browsergym.core.action.python import PythonActionSet 4 | from browsergym.experiments.agent import Agent, AgentInfo 5 | from browsergym.experiments.benchmark import DEFAULT_BENCHMARKS, Benchmark, HighLevelActionSetArgs 6 | from browsergym.experiments.loop import ( 7 | AbstractAgentArgs, 8 | EnvArgs, 9 | ExpArgs, 10 | ExpResult, 11 | StepInfo, 12 | StepTimestamps, 13 | ) 14 | -------------------------------------------------------------------------------- /tests/core/data/input_type/date_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Date Field

6 | 7 |

The input type="date" is used for input fields that should contain a date.

8 | 9 |
10 | 11 | 12 | 13 |
14 | 15 |

Note: type="date" is not supported in Internet Explorer 11 or prior Safari 14.1.

16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import browsergym.core 2 | import logging 3 | import playwright.sync_api 4 | import pytest 5 | 6 | 7 | # setup code, executed ahead of first test 8 | @pytest.fixture(scope="session", autouse=True) 9 | def setup_playwright(playwright: playwright.sync_api.Playwright): 10 | # bugfix: re-use pytest-playwright's playwright instance in browsergym 11 | # https://github.com/microsoft/playwright-python/issues/2053 12 | browsergym.core._set_global_playwright(playwright) 13 | logging.info("Browsergym is using the playwright instance provided by pytest-playwright.") 14 | -------------------------------------------------------------------------------- /tests/core/data/input_type/hidden_field_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

A Hidden Field (look in source code)

6 | 7 |
8 | 9 |

10 | 11 | 12 |
13 | 14 |

Note: The hidden field is not shown to the user, but the data is sent when the form is submitted.

15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/core/data/input_type/number_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Number Field

6 | 7 |

The input type="number" defines a numeric input field.

8 | 9 |

You can use the min and max attributes to add numeric restrictions in the input field:

10 | 11 |
12 | 13 | 14 | 15 |
16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /.github/actions/setup-python-uv/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Setup Python and uv' 2 | description: 'Setup Python 3.12 and install uv package manager' 3 | runs: 4 | using: 'composite' 5 | steps: 6 | - name: Set up Python 7 | uses: actions/setup-python@v5 8 | with: 9 | python-version: "3.12" 10 | 11 | - name: Install uv 12 | uses: astral-sh/setup-uv@v7 13 | with: 14 | # Install a specific version of uv. 15 | version: "0.9.17" 16 | enable-cache: true 17 | 18 | - name: Create virtual environment 19 | shell: bash 20 | run: uv venv 21 | -------------------------------------------------------------------------------- /docs/src/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | This section provides tutorials to help build new environments and tasks. 5 | 6 | .. grid:: 2 7 | :gutter: 2 8 | 9 | .. grid-item-card:: Walkthrough 10 | :link: examples/walkthrough.html 11 | 12 | :bdg-primary:`Getting started` 13 | 14 | .. grid-item-card:: Create a custom task 15 | :link: examples/create_custom_task.html 16 | 17 | :bdg-primary:`Custom task` 18 | 19 | 20 | 21 | .. toctree:: 22 | :maxdepth: 1 23 | :hidden: 24 | 25 | examples/walkthrough.rst 26 | examples/create_custom_task.rst 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 ServiceNow 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/core/data/input_type/telephone_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Telephone Field

6 | 7 |

The input type="tel" is used for input fields that should contain a telephone number:

8 | 9 |
10 |

11 |

12 | Format: 123-45-678

13 | 14 |
15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/core/data/input_type/color_picker_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Show a Color Picker

6 | 7 |

The input type="color" is used for input fields that should contain a color.

8 | 9 |
10 | 11 | 12 | 13 |
14 | 15 |

Note: type="color" is not supported in Internet Explorer 11 or Safari 9.1 (or earlier).

16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docs/src/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to BrowserGym's documentation! 2 | ====================================== 3 | 4 | **BrowserGym** is a Python library that provides a `gym environment `_ 5 | for web task automation in the Chromium browser. It includes the following benchmarks by default: 6 | `MiniWob++ `_, `WebArena `_, `WorkArena `_. 7 | 8 | .. note:: 9 | 10 | This project is under active development. 11 | 12 | Contents 13 | -------- 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | 18 | usage 19 | api 20 | tutorials 21 | -------------------------------------------------------------------------------- /tests/core/data/input_type/date_time_local_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Local Date Field

6 | 7 |

The input type="datetime-local" specifies a date and time input field, with no time zone.

8 | 9 |
10 | 11 | 12 | 13 |
14 | 15 |

Note: type="datetime-local" is not supported in Internet Explorer 11 or prior Safari 14.1.

16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/core/data/input_type/image_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Display an Image as the Submit button

6 | 7 |
8 | 9 |

10 | 11 |

12 | 13 |
14 | 15 |

Note: The input type="image" sends the X and Y coordinates of the click that activated the image button.

16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/core/data/input_type/password_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Password field

6 | 7 |

The input type="password" defines a password field:

8 | 9 |
10 |
11 |
12 |
13 |

14 | 15 |
16 | 17 |

The characters in a password field are masked (shown as asterisks or circles).

18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /tests/core/data/basic_iframe_site/inner-iframe.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Inner Iframe 6 | 12 | 13 | 14 | 15 |

Iframe Level 2

16 | 17 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /docs/src/examples/walkthrough.rst: -------------------------------------------------------------------------------- 1 | Walkthrough 2 | ___________ 3 | 4 | 5 | Boilerplate code to run an agent on an interactive, open-ended task: 6 | 7 | .. code-block:: python 8 | 9 | import gymnasium as gym 10 | import browsergym.core # register the openended task as a gym environment 11 | 12 | env = gym.make( 13 | "browsergym/openended", task_kwargs={"start_url": "https://www.google.com/"}, wait_for_user_message=True 14 | ) 15 | 16 | obs, info = env.reset() 17 | done = False 18 | while not done: 19 | action = ... # implement your agent here 20 | obs, reward, terminated, truncated, info = env.step(action) 21 | done = terminated or truncated -------------------------------------------------------------------------------- /tests/core/data/input_type/time_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Show a Time Input Control

6 | 7 |

The input type="time" allows the user to select a time (no time zone):

8 | 9 |

If the browser supports it, a time picker pops up when entering the input field.

10 | 11 |
12 | 13 | 14 | 15 |
16 | 17 |

Note: type="time" is not supported in Internet Explorer 11 or prior Safari 14.1.

18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/chat_files/img/send.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /tests/core/data/input_type/text_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Text field

6 |

The input type="text" defines a one-line text input field:

7 | 8 |
9 |
10 |
11 |
12 |

13 | 14 |
15 | 16 |

Note that the form itself is not visible.

17 |

Also note that the default width of a text field is 20 characters.

18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = src 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /tests/core/data/input_type/checkbox_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Checkboxes

6 |

The input type="checkbox" defines a checkbox:

7 | 8 |
9 | 10 |
11 | 12 |
13 | 14 |

15 | 16 |
__version__ = "0.14.3.dev1"

import playwright.sync_api

# A single Playwright driver object is shared by the whole process.
_PLAYWRIGHT = None


def _set_global_playwright(pw: playwright.sync_api.Playwright):
    """Install `pw` as the process-wide shared Playwright instance."""
    global _PLAYWRIGHT
    _PLAYWRIGHT = pw


def _get_global_playwright():
    """Return the shared Playwright instance, starting one lazily on first use."""
    global _PLAYWRIGHT
    if not _PLAYWRIGHT:
        _set_global_playwright(playwright.sync_api.sync_playwright().start())
    return _PLAYWRIGHT


# register the open-ended task as a gym environment (import side effect)
from .registration import register_task
from .task import OpenEndedTask

register_task(OpenEndedTask.get_task_id(), OpenEndedTask)
8 | 9 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/core/data/input_type/submit_input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Submit Button

6 | 7 |

The input type="submit" defines a button for submitting form data to a form-handler:

8 | 9 |
10 |
11 |
12 |
13 |

14 | 15 |
16 | 17 |

If you click "Submit", the form-data will be sent to a page called "https://www.w3schools.com/action_page.php".

import nltk

from browsergym.core.registration import register_task

from . import config, task

# download necessary tokenizer resources
# note: deprecated punkt -> punkt_tab https://github.com/nltk/nltk/issues/3293
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    # Fix: catch only LookupError (what nltk.data.find raises when the resource
    # is missing) instead of a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit and masked unrelated errors.
    nltk.download("punkt_tab", quiet=True, raise_on_error=True)

# gym ids of every registered WebArena task
ALL_WEBARENA_TASK_IDS = []

# register all WebArena benchmark tasks
for task_id in config.TASK_IDS:
    gym_id = f"webarena.{task_id}"
    register_task(
        gym_id,
        task.GenericWebArenaTask,
        task_kwargs={"task_id": task_id},
    )
    ALL_WEBARENA_TASK_IDS.append(gym_id)

Date Field Restrictions

6 | 7 |

Use the min and max attributes to add restrictions to dates:

8 | 9 |
10 | 11 |

12 | 13 | 14 |

15 | 16 | 17 |
18 | 19 |

Note: type="date" is not supported in Internet Explorer 11 or prior Safari 14.1.

import nltk

from browsergym.core.registration import register_task

from . import config, task

# download necessary tokenizer resources
# note: deprecated punkt -> punkt_tab https://github.com/nltk/nltk/issues/3293
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    # Fix: catch only LookupError (what nltk.data.find raises when the resource
    # is missing) instead of a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit and masked unrelated errors.
    nltk.download("punkt_tab", quiet=True, raise_on_error=True)

# gym ids of every registered WebArena-Lite task
ALL_WEBARENA_TASK_IDS = []

# register all WebArena-Lite benchmark tasks
for task_id in config.TASK_IDS:
    gym_id = f"webarenalite.{task_id}"
    register_task(
        gym_id,
        task.WebArenaLiteTask,
        task_kwargs={"task_id": task_id},
    )
    ALL_WEBARENA_TASK_IDS.append(gym_id)

Reset Button

6 | 7 |

The input type="reset" defines a reset button that resets all form values to their default values:

8 | 9 |
10 |
11 |
12 |
13 |

14 | 15 | 16 |
17 | 18 |

If you change the input values and then click the "Reset" button, the form-data will be reset to the default values.

19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /tests/webarena/test_instance.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import playwright.sync_api 3 | 4 | from browsergym.webarena.instance import WebArenaInstance 5 | 6 | 7 | def test_is_reachable(): 8 | # default URLs 9 | instance = WebArenaInstance() 10 | instance.check_status() 11 | 12 | # unreacheable URL 13 | with pytest.raises(RuntimeError): 14 | instance = WebArenaInstance() 15 | instance.urls["reddit"] = "https://invalid.url" 16 | instance.check_status() 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "site", ["reddit", "shopping", "shopping_admin", "gitlab", "wikipedia", "map"] 21 | ) 22 | def test_credentials(page: playwright.sync_api.Page, site: str): 23 | # default URLs and credentials 24 | instance = WebArenaInstance() 25 | instance.ui_login(site=site, page=page) 26 | 27 | # TODO: test this more thoroughly 28 | -------------------------------------------------------------------------------- /tests/visualwebarena/test_vwa_domains.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import playwright.sync_api 3 | 4 | from browsergym.visualwebarena.instance import VisualWebArenaInstance 5 | 6 | 7 | def test_is_reachable(): 8 | # default URLs 9 | instance = VisualWebArenaInstance() 10 | instance.check_status() 11 | 12 | # unreacheable URL 13 | with pytest.raises(RuntimeError): 14 | instance = VisualWebArenaInstance() 15 | instance.urls["reddit"] = "https://invalid.url" 16 | instance.check_status() 17 | 18 | 19 | @pytest.mark.parametrize("site", ["reddit", "shopping", "wikipedia", "classifieds"]) 20 | def test_credentials(page: playwright.sync_api.Page, site: str): 21 | # default URLs and credentials 22 | instance = VisualWebArenaInstance() 23 | instance.ui_login(site=site, page=page) 24 | 25 | # TODO: test this more thoroughly 26 | 
from typing import Union

from .evaluate_dicts import evaluate_dicts
from .evaluate_numbers import evaluate_numbers
from .evaluate_strings import evaluate_strings

# Dispatch table: answer-type label (from task metadata) -> evaluator function.
EvaluatorFactory = {
    "string": evaluate_strings,
    "number": evaluate_numbers,
    "json": evaluate_dicts,
    "string list": evaluate_strings,
}

# Dispatch table: Python type of the gold answer -> evaluator function.
EvaluatorFactoryFromType = {
    str: evaluate_strings,
    int: evaluate_numbers,
    float: evaluate_numbers,
    bool: evaluate_strings,
    list: evaluate_strings,
}


def get_evaluator(evaluator: str):
    """Return the evaluator registered for the given answer-type label.

    Raises:
        KeyError: if `evaluator` is not one of the known labels.
    """
    return EvaluatorFactory[evaluator]


def get_evaluator_from_gold_answer(gold_answer: Union[str, int, float]):
    """Return the evaluator matching the gold answer's Python type.

    Fix/robustness: `EvaluatorFactoryFromType` is keyed by *types*, but the
    signature documents a *value* argument; looking up the raw argument raised
    KeyError (or TypeError for unhashable values such as lists). Accept both
    calling conventions — a type object is used as-is, any other value is
    mapped through ``type()`` — so existing callers keep working either way.

    Raises:
        KeyError: if the (derived) type has no registered evaluator.
    """
    key = gold_answer if isinstance(gold_answer, type) else type(gold_answer)
    return EvaluatorFactoryFromType[key]
29 | ] 30 | markers = [ 31 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 32 | "serial: mark test to be run sequantially (deselect with '-m \"not serial\"')" 33 | ] 34 | -------------------------------------------------------------------------------- /tests/core/data/test_page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Simple Form 5 | 6 | 7 |

Simple Form

8 | 9 |
10 | 11 |

12 | 13 | 14 |

15 | 16 | 17 |

18 | 19 |
20 |

21 | 22 | 23 |

24 | 25 | 26 | 27 |
28 | 29 | 30 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.12" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: docs/src/conf.py 21 | 22 | # Optionally build your docs in additional formats such as PDF and ePub 23 | # formats: 24 | # - pdf 25 | # - epub 26 | 27 | # Optional but recommended, declare the Python requirements required 28 | # to build your documentation 29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 30 | python: 31 | install: 32 | - requirements: docs/requirements.txt 33 | -------------------------------------------------------------------------------- /tests/core/test_actions_python.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from browsergym.core.action.python import PythonActionSet 4 | 5 | 6 | ACTIONS_TO_TEST = [ 7 | ( 8 | """\ 9 | a = 0 10 | """, 11 | """\ 12 | a = 0 13 | """, 14 | ), 15 | ( 16 | """\ 17 | ``` 18 | a = 0 19 | ``` 20 | """, 21 | """\ 22 | a = 0 23 | """, 24 | ), 25 | ( 26 | """\ 27 | ```python 28 | a = 0 29 | ``` 30 | """, 31 | """\ 32 | a = 0 33 | """, 34 | ), 35 | ( 36 | """\ 37 | ```python 38 | a = 0 39 | ``` 40 | This is an explanation 41 | ```python 42 | b = 3 43 | ``` 44 | More explanations 45 | """, 46 | """\ 47 | a = 0 48 | 49 | b = 3 50 | """, 51 | ), 52 | ] 53 | 54 | 55 | @pytest.mark.parametrize("action,expected_code", 
from typing import Union

import numpy as np


# Renamed calc_z function to distance_function_log
def distance_function_log(pred: float, gold: float):
    """Symmetric log-ratio closeness score, clipped to be non-negative.

    Returns 1 when pred == gold, decays towards 0 as the ratio between the
    two numbers grows, and hits 0 once the larger is >= e times the smaller.
    Exact zeros are clamped to 1e-4 to avoid log(0)/division by zero.
    """
    if pred == gold == 0:
        return 1
    if pred == 0:
        pred = 1e-4
    if gold == 0:
        gold = 1e-4
    # always take log of (larger / smaller) so the score is symmetric
    if pred > gold:
        return max(0, 1 - np.log(pred / gold))
    else:
        return max(0, 1 - np.log(gold / pred))


def evaluate_numbers(pred: Union[float, str], gold: float):
    """Score a predicted number against the gold number.

    Non-numeric inputs are coerced with float(); predictions (or gold values)
    that cannot be parsed score 0. Fix: also catch TypeError, since
    float(None) — a common model output — raises TypeError, not ValueError,
    which previously crashed the evaluator instead of scoring 0.
    """
    res = None
    if type(pred) != float and type(pred) != int:
        try:
            pred = float(pred)
        except (ValueError, TypeError):
            res = 0
    if type(gold) != float and type(gold) != int:
        try:
            gold = float(gold)
        except (ValueError, TypeError):
            res = 0
    if res is None:
        res = distance_function_log(pred, gold)
    return res
8 | 9 | ## Setting up 10 | 11 | - Install the package (this is still a wip) 12 | ``` 13 | pip install browsergym-assistantbench 14 | ``` 15 | 16 | - Run inference, e.g., run the following commands for demo on a simple toy task 17 | ``` 18 | python demo_agent/run_demo.py --task_name assistantbench.validation.3 19 | ``` 20 | 21 | - Test set predictions will be saved to `./assistantbench-predictions-test.jsonl`. To evaluate on the official test set, upload these predictions to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard). 22 | -------------------------------------------------------------------------------- /browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Set 2 | 3 | import numpy as np 4 | from scipy.optimize import linear_sum_assignment 5 | 6 | 7 | def _align_bags( 8 | predicted: List[Set[str]], 9 | gold: List[Set[str]], 10 | method: Callable[[object, object], float], 11 | ) -> List[float]: 12 | """ 13 | Takes gold and predicted answer sets and first finds the optimal 1-1 alignment 14 | between them and gets maximum metric values over all the answers. 
15 | """ 16 | scores = np.zeros([len(gold), len(predicted)]) 17 | for gold_index, gold_item in enumerate(gold): 18 | for pred_index, pred_item in enumerate(predicted): 19 | scores[gold_index, pred_index] = method(pred_item, gold_item) 20 | row_ind, col_ind = linear_sum_assignment(-scores) 21 | 22 | max_scores = np.zeros([max(len(gold), len(predicted))]) 23 | for row, column in zip(row_ind, col_ind): 24 | max_scores[row] = max(max_scores[row], scores[row, column]) 25 | return max_scores 26 | -------------------------------------------------------------------------------- /docs/src/environments/webarena.rst: -------------------------------------------------------------------------------- 1 | WebArena 2 | ^^^^^^^^ 3 | 4 | `BrowserGym` integrates `WebArena` enviroment. For more information about this enviroment, please refer to the `WebArena `_ official documentation. 5 | 6 | 7 | BrowserGym API 8 | """""""""""""" 9 | 10 | .. currentmodule:: browsergym 11 | 12 | .. autosummary:: 13 | :recursive: 14 | :toctree: generated 15 | :caption: WebArena 16 | 17 | webarena 18 | 19 | 20 | Usage 21 | """"" 22 | 23 | Before running the sample code, install `WebArena` by following the steps in the `docs `_. 24 | 25 | .. code-block:: python 26 | 27 | import gym 28 | import browsergym.webarena 29 | 30 | env = gym.make('browsergym/webarena.10') 31 | obs, info = env.reset() 32 | done = False 33 | 34 | while not done: 35 | action = "noop()" 36 | obs, reward, terminated, truncated, info = env.step(action) 37 | print(f"Reward: {reward}, Done: {done}, Info: {info}") 38 | 39 | env.close() 40 | 41 | -------------------------------------------------------------------------------- /browsergym/visualwebarena/src/browsergym/visualwebarena/__init__.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | from browsergym.core.registration import register_task 4 | 5 | from . 
import config, task 6 | 7 | # download necessary tokenizer resources 8 | # note: deprecated punkt -> punkt_tab https://github.com/nltk/nltk/issues/3293 9 | try: 10 | nltk.data.find("tokenizers/punkt_tab") 11 | except: 12 | nltk.download("punkt_tab", quiet=True, raise_on_error=True) 13 | 14 | ALL_VISUALWEBARENA_TASK_IDS = [] 15 | VISUALWEBARENA_TASK_IDS_WITH_RESET = [] 16 | VISUALWEBARENA_TASK_IDS_WITHOUT_RESET = [] 17 | 18 | # register all VisualWebArena tasks 19 | for task_id in config.TASK_IDS: 20 | gym_id = f"visualwebarena.{task_id}" 21 | register_task( 22 | gym_id, 23 | task.GenericVisualWebArenaTask, 24 | task_kwargs={"task_id": task_id}, 25 | ) 26 | ALL_VISUALWEBARENA_TASK_IDS.append(gym_id) 27 | if task_id in config.TASK_IDS_WITH_RESET: 28 | VISUALWEBARENA_TASK_IDS_WITH_RESET.append(gym_id) 29 | else: 30 | VISUALWEBARENA_TASK_IDS_WITHOUT_RESET.append(gym_id) 31 | -------------------------------------------------------------------------------- /docs/src/environments/miniwob.rst: -------------------------------------------------------------------------------- 1 | MiniWoB++ 2 | ^^^^^^^^^ 3 | 4 | `BrowserGym` integrates the `MiniWoB++` environment. For more information about this environment, please refer to the `MiniWoB++ `_ official documentation. 5 | 6 | 7 | BrowserGym API 8 | """""""""""""" 9 | 10 | .. currentmodule:: browsergym 11 | 12 | .. autosummary:: 13 | :recursive: 14 | :toctree: generated 15 | :caption: MiniWoB++ 16 | 17 | miniwob 18 | 19 | 20 | Usage 21 | """"" 22 | 23 | Before running the sample code, install `MiniWoB++` by following the steps in the `docs `_. 24 | 25 | ..
code-block:: python 26 | 27 | import gym 28 | import browsergym.miniwob 29 | 30 | env = gym.make('browsergym/miniwob.book-flight') 31 | obs, info = env.reset() 32 | done = False 33 | 34 | while not done: 35 | action = "noop()" 36 | obs, reward, terminated, truncated, info = env.step(action) 37 | print(f"Reward: {reward}, Done: {done}, Info: {info}") 38 | done = terminated or truncated 39 | env.close() 40 | 41 | -------------------------------------------------------------------------------- /browsergym/webarenalite/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling", "hatch-requirements-txt"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "browsergym-webarenalite" 7 | description = "WebArena Lite benchmark for BrowserGym" 8 | authors = [ 9 | {name = "Aman Jaiswal"}, 10 | {name = "Léo Boisvert"}, 11 | ] 12 | requires-python = ">3.7" 13 | license = {text = "Apache-2.0"} 14 | classifiers = [ 15 | "Development Status :: 3 - Alpha", 16 | "Programming Language :: Python :: 3", 17 | "Operating System :: OS Independent", 18 | "Intended Audience :: Science/Research", 19 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 20 | "License :: OSI Approved :: Apache Software License", 21 | ] 22 | dynamic = ["dependencies", "version"] 23 | 24 | [project.urls] 25 | homepage = "https://github.com/ServiceNow/BrowserGym" 26 | 27 | [tool.hatch.version] 28 | path = "../core/src/browsergym/core/__init__.py" 29 | 30 | [tool.hatch.metadata.hooks.requirements_txt] 31 | files = ["requirements.txt"] 32 | 33 | [tool.hatch.build.targets.wheel] 34 | packages = ["src/browsergym"] 35 | -------------------------------------------------------------------------------- /browsergym/webarena/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling", "hatch-requirements-txt"] 3 | build-backend = "hatchling.build" 4 | 5 |
[project] 6 | name = "browsergym-webarena" 7 | description = "WebArena benchmark for BrowserGym" 8 | authors = [ 9 | {name = "Maxime Gasse"}, 10 | {name = "Tom Marty"}, 11 | ] 12 | readme = "README.md" 13 | requires-python = ">3.7" 14 | license = {text = "Apache-2.0"} 15 | classifiers = [ 16 | "Development Status :: 3 - Alpha", 17 | "Programming Language :: Python :: 3", 18 | "Operating System :: OS Independent", 19 | "Intended Audience :: Science/Research", 20 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 21 | "License :: OSI Approved :: Apache Software License", 22 | ] 23 | dynamic = ["dependencies", "version"] 24 | 25 | [project.urls] 26 | homepage = "https://github.com/ServiceNow/BrowserGym" 27 | 28 | [tool.hatch.version] 29 | path = "../core/src/browsergym/core/__init__.py" 30 | 31 | [tool.hatch.metadata.hooks.requirements_txt] 32 | files = ["requirements.txt"] 33 | 34 | [tool.hatch.build.targets.wheel] 35 | packages = ["src/browsergym"] 36 | -------------------------------------------------------------------------------- /docs/src/usage.rst: -------------------------------------------------------------------------------- 1 | Usage 2 | ===== 3 | 4 | .. _installation: 5 | 6 | Installation 7 | ------------ 8 | 9 | To use BrowserGym, first install it using pip: 10 | 11 | .. code-block:: console 12 | 13 | pip install browsergym 14 | 15 | Then, a required step is to setup playwright by running 16 | 17 | .. code-block:: console 18 | 19 | playwright install chromium 20 | 21 | Example code 22 | ------------ 23 | 24 | Boilerplate code to run an agent on an interactive, open-ended task: 25 | 26 | .. 
code-block:: python 27 | 28 | import gymnasium as gym 29 | import browsergym.core # register the openended task as a gym environment 30 | 31 | env = gym.make( 32 | "browsergym/openended", 33 | task_kwargs={"start_url": "https://www.google.com/"}, # starting URL 34 | wait_for_user_message=True, # wait for a user message after each agent message sent to the chat 35 | ) 36 | 37 | obs, info = env.reset() 38 | done = False 39 | while not done: 40 | action = ... # implement your agent here 41 | obs, reward, terminated, truncated, info = env.step(action) 42 | done = terminated or truncated 43 | -------------------------------------------------------------------------------- /tests/webarena/test_env_general.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import logging 3 | import os 4 | import playwright.sync_api 5 | import pytest 6 | import random 7 | 8 | from tenacity import retry, stop_after_attempt, retry_if_exception_type 9 | 10 | # register gym environments 11 | import browsergym.webarena 12 | 13 | 14 | __SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None 15 | __HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True 16 | 17 | 18 | from browsergym.webarena import ALL_WEBARENA_TASK_IDS 19 | 20 | rng = random.Random(1) 21 | task_ids = rng.sample(ALL_WEBARENA_TASK_IDS, 25) 22 | 23 | 24 | @retry( 25 | stop=stop_after_attempt(5), 26 | retry=retry_if_exception_type(playwright.sync_api.TimeoutError), 27 | reraise=True, 28 | before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), 29 | ) 30 | @pytest.mark.parametrize("task_id", task_ids) 31 | @pytest.mark.slow 32 | def test_env_generic(task_id): 33 | env = gym.make( 34 | f"browsergym/{task_id}", 35 | headless=__HEADLESS, 36 | slow_mo=__SLOW_MO, 37 | ) 38 | obs, info = env.reset() 39 | 40 | env.close() 41 | -------------------------------------------------------------------------------- 
/browsergym/assistantbench/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling", "hatch-requirements-txt"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "browsergym-assistantbench" 7 | description = "AssistantBench benchmark for BrowserGym" 8 | authors = [ 9 | {name = "Ori Yoran"}, 10 | {name = "Maxime Gasse"}, 11 | ] 12 | readme = "README.md" 13 | requires-python = ">3.7" 14 | license = {text = "Apache-2.0"} 15 | classifiers = [ 16 | "Development Status :: 3 - Alpha", 17 | "Programming Language :: Python :: 3", 18 | "Operating System :: OS Independent", 19 | "Intended Audience :: Science/Research", 20 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 21 | "License :: OSI Approved :: Apache Software License", 22 | ] 23 | dynamic = ["dependencies", "version"] 24 | 25 | [project.urls] 26 | homepage = "https://github.com/ServiceNow/BrowserGym" 27 | 28 | [tool.hatch.version] 29 | path = "../core/src/browsergym/core/__init__.py" 30 | 31 | [tool.hatch.metadata.hooks.requirements_txt] 32 | files = ["requirements.txt"] 33 | 34 | [tool.hatch.build.targets.wheel] 35 | packages = ["src/browsergym"] 36 | -------------------------------------------------------------------------------- /browsergym/visualwebarena/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling", "hatch-requirements-txt"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "browsergym-visualwebarena" 7 | description = "VisualWebArena benchmark for BrowserGym" 8 | authors = [ 9 | {name = "Lawrence Jang"}, 10 | {name = "Maxime Gasse"}, 11 | ] 12 | readme = "README.md" 13 | requires-python = ">3.7" 14 | license = {text = "Apache-2.0"} 15 | classifiers = [ 16 | "Development Status :: 3 - Alpha", 17 | "Programming Language :: Python :: 3", 18 | "Operating System :: OS Independent", 19 | 
"Intended Audience :: Science/Research", 20 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 21 | "License :: OSI Approved :: Apache Software License", 22 | ] 23 | dynamic = ["dependencies", "version"] 24 | 25 | [project.urls] 26 | homepage = "https://github.com/ServiceNow/BrowserGym" 27 | 28 | [tool.hatch.version] 29 | path = "../core/src/browsergym/core/__init__.py" 30 | 31 | [tool.hatch.metadata.hooks.requirements_txt] 32 | files = ["requirements.txt"] 33 | 34 | [tool.hatch.build.targets.wheel] 35 | packages = ["src/browsergym"] 36 | -------------------------------------------------------------------------------- /browsergym/miniwob/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling", "hatch-requirements-txt"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "browsergym-miniwob" 7 | description = "MiniWoB++ benchmark for BrowserGym" 8 | authors = [ 9 | {name = "Rim Assouel"}, 10 | {name = "Maxime Gasse"}, 11 | {name = "Tom Marty"}, 12 | {name = "Alexandre Lacoste"}, 13 | ] 14 | readme = "README.md" 15 | requires-python = ">3.7" 16 | license = {text = "Apache-2.0"} 17 | classifiers = [ 18 | "Development Status :: 3 - Alpha", 19 | "Programming Language :: Python :: 3", 20 | "Operating System :: OS Independent", 21 | "Intended Audience :: Science/Research", 22 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 23 | "License :: OSI Approved :: Apache Software License", 24 | ] 25 | dynamic = ["dependencies", "version"] 26 | 27 | [project.urls] 28 | homepage = "https://github.com/ServiceNow/BrowserGym" 29 | 30 | [tool.hatch.version] 31 | path = "../core/src/browsergym/core/__init__.py" 32 | 33 | [tool.hatch.metadata.hooks.requirements_txt] 34 | files = ["requirements.txt"] 35 | 36 | [tool.hatch.build.targets.wheel] 37 | packages = ["src/browsergym"] 38 | 
-------------------------------------------------------------------------------- /tests/miniwob/test_use-colorwheel-2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gymnasium as gym 3 | import re 4 | import pytest 5 | 6 | # register gym environments 7 | import browsergym.miniwob 8 | 9 | __SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None 10 | __HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True 11 | 12 | 13 | @pytest.mark.parametrize("seed", range(5)) 14 | def test_cheat(seed): 15 | env = gym.make( 16 | "browsergym/miniwob.use-colorwheel-2", 17 | headless=__HEADLESS, 18 | slow_mo=__SLOW_MO, 19 | action_mapping=None, 20 | ) 21 | obs, info = env.reset(seed=42) 22 | 23 | assert obs["last_action_error"] == "" 24 | 25 | match = re.match( 26 | "Select the following color #(.+) with the color picker and hit Submit.", obs["goal"] 27 | ) 28 | 29 | assert match 30 | 31 | color = match.groups()[0].upper() 32 | 33 | obs, reward, term, trunc, info = env.step( 34 | f"""\ 35 | page.locator("#col").fill("{color}") 36 | page.get_by_role("button", name="Submit").click() 37 | """ 38 | ) 39 | 40 | assert obs["last_action_error"] == "" 41 | assert reward == 1 42 | assert term == True 43 | 44 | env.close() 45 | -------------------------------------------------------------------------------- /tests/miniwob/test_click-scroll-list.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gymnasium as gym 3 | import re 4 | import pytest 5 | 6 | # register gym environments 7 | import browsergym.miniwob 8 | 9 | __SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None 10 | __HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True 11 | 12 | 13 | @pytest.mark.parametrize("seed", range(5)) 14 | def test_cheat(seed): 15 | env = gym.make( 16 | "browsergym/miniwob.click-scroll-list", 17 | headless=__HEADLESS, 18 | slow_mo=__SLOW_MO, 19 | 
action_mapping=None, 20 | ) 21 | obs, info = env.reset(seed=seed) 22 | 23 | assert obs["last_action_error"] == "" 24 | 25 | match = re.match("Select (.+) from the scroll list and click Submit.", obs["goal"]) 26 | 27 | assert match 28 | 29 | options = match.groups()[0].split(", ") 30 | options = '", "'.join(options) 31 | action = f"""\ 32 | page.locator("#options").select_option(["{options}"]) 33 | page.get_by_role("button", name="Submit").click() 34 | """ 35 | 36 | obs, reward, term, trunc, info = env.step(action) 37 | 38 | assert obs["last_action_error"] == "" 39 | assert reward == 1 40 | assert term == True 41 | 42 | env.close() 43 | -------------------------------------------------------------------------------- /tests/core/data/basic_iframe_site/outer-iframe.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Shadow DOM Example 5 | 22 | 23 | 24 |
25 |
26 |
27 | 28 |
29 | 30 | 31 | -------------------------------------------------------------------------------- /tests/visualwebarena/test_vwa_tasks_with_reset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | 5 | import gymnasium as gym 6 | import playwright.sync_api 7 | import pytest 8 | from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed 9 | 10 | # register gym environments 11 | import browsergym.visualwebarena 12 | 13 | __SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None 14 | __HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True 15 | 16 | 17 | from browsergym.visualwebarena import VISUALWEBARENA_TASK_IDS_WITH_RESET 18 | 19 | rng = random.Random(1) 20 | task_ids = rng.sample(VISUALWEBARENA_TASK_IDS_WITH_RESET, 10) 21 | 22 | 23 | @retry( 24 | stop=stop_after_attempt(5), 25 | retry=retry_if_exception_type(playwright.sync_api.TimeoutError), 26 | wait=wait_fixed(2), 27 | reraise=True, 28 | before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), 29 | ) 30 | @pytest.mark.parametrize("task_id", task_ids) 31 | @pytest.mark.slow 32 | @pytest.mark.serial 33 | def test_env_generic(task_id): 34 | env = gym.make( 35 | f"browsergym/{task_id}", 36 | headless=__HEADLESS, 37 | slow_mo=__SLOW_MO, 38 | ) 39 | obs, info = env.reset() 40 | env.close() 41 | -------------------------------------------------------------------------------- /browsergym/visualwebarena/src/browsergym/visualwebarena/config.py: -------------------------------------------------------------------------------- 1 | TASK_IDS = range(910) 2 | 3 | # import visualwebarena 4 | # import importlib.resources 5 | # import json 6 | # all_configs_str = importlib.resources.files(visualwebarena).joinpath("test_raw.json").read_text() 7 | # all_configs = json.loads(all_configs_str) 8 | # task_ids_with_reset = [task["task_id"] for task in all_configs if task["require_reset"] == True] 9 
| TASK_IDS_WITH_RESET = [ 10 | 4, 11 | 5, 12 | 8, 13 | 9, 14 | 28, 15 | 29, 16 | 30, 17 | 31, 18 | 57, 19 | 76, 20 | 77, 21 | 143, 22 | 144, 23 | 145, 24 | 159, 25 | 160, 26 | 203, 27 | 205, 28 | 208, 29 | 213, 30 | 217, 31 | 223, 32 | 392, 33 | 393, 34 | 394, 35 | 402, 36 | 404, 37 | 405, 38 | 406, 39 | 407, 40 | 408, 41 | 410, 42 | 411, 43 | 412, 44 | 416, 45 | 422, 46 | 423, 47 | 424, 48 | 425, 49 | 426, 50 | 441, 51 | 442, 52 | 443, 53 | 668, 54 | 669, 55 | 670, 56 | 671, 57 | 672, 58 | 673, 59 | 688, 60 | 689, 61 | 711, 62 | 712, 63 | 713, 64 | 714, 65 | 715, 66 | 716, 67 | 717, 68 | 733, 69 | 764, 70 | 765, 71 | 766, 72 | ] 73 | -------------------------------------------------------------------------------- /tests/core/data/basic_iframe_site/basic_iframe.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Iframe Example 5 | 6 | 7 | 8 | 9 | 22 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /tests/core/data/basic_shadow_iframe_site/basic_iframe.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Iframe Example 5 | 6 | 7 | 8 | 9 | 22 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | fail_fast: false 2 | 3 | default_language_version: 4 | python: python3 5 | 6 | repos: 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v4.2.0 9 | hooks: 10 | - id: trailing-whitespace 11 | exclude: ^(.*)\.md$ 12 | - id: end-of-file-fixer 13 | - id: check-yaml 14 | exclude: ^(.circleci/recipe|recipe) # conda build recipes are templated 15 | - id: check-added-large-files 16 | - repo: https://github.com/pocc/pre-commit-hooks 17 | rev: v1.1.1 18 | hooks: 19 | - id: clang-format 20 | args: [--style=file, -i] 21 | - id: clang-tidy 22 | args: [--fix, 
--fix-errors] 23 | - repo: https://github.com/psf/black 24 | rev: 24.2.0 25 | hooks: 26 | - id: black 27 | args: [--config=./pyproject.toml] 28 | - repo: https://github.com/asottile/blacken-docs 29 | rev: v1.12.1 30 | hooks: 31 | - id: blacken-docs 32 | args: [ '--line-length', '100' ] 33 | additional_dependencies: [black] 34 | - repo: https://github.com/Lucas-C/pre-commit-hooks 35 | rev: v1.5.5 36 | hooks: 37 | - id: forbid-crlf 38 | - id: remove-crlf 39 | # Black does not clear tabs in docstrings 40 | - id: forbid-tabs 41 | files: '.*\.py$' 42 | - id: remove-tabs 43 | files: '.*\.py$' 44 | args: [ '--whitespaces-count', '4' ] -------------------------------------------------------------------------------- /browsergym/core/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling", "hatch-requirements-txt"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "browsergym-core" 7 | description = "BrowserGym: a gym environment for web task automation in the Chromium browser" 8 | authors = [ 9 | {name = "Rim Assouel"}, 10 | {name = "Léo Boisvert"}, 11 | {name = "Massimo Caccia"}, 12 | {name = "Alex Drouin"}, 13 | {name = "Maxime Gasse"}, 14 | {name = "Imene Kerboua"}, 15 | {name = "Alex Lacoste"}, 16 | {name = "Thibault Le Sellier De Chezelles"}, 17 | {name = "Tom Marty"}, 18 | ] 19 | readme = "README.md" 20 | requires-python = ">3.9" 21 | license = {text = "Apache-2.0"} 22 | classifiers = [ 23 | "Development Status :: 3 - Alpha", 24 | "Programming Language :: Python :: 3", 25 | "Operating System :: OS Independent", 26 | "Intended Audience :: Science/Research", 27 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 28 | "License :: OSI Approved :: Apache Software License", 29 | ] 30 | dynamic = ["dependencies", "version"] 31 | 32 | [project.urls] 33 | homepage = "https://github.com/ServiceNow/BrowserGym" 34 | 35 | [tool.hatch.version] 36 | path = 
"src/browsergym/core/__init__.py" 37 | 38 | [tool.hatch.metadata.hooks.requirements_txt] 39 | files = ["requirements.txt"] 40 | 41 | [tool.hatch.build.targets.wheel] 42 | packages = ["src/browsergym"] 43 | -------------------------------------------------------------------------------- /browsergym/miniwob/README.md: -------------------------------------------------------------------------------- 1 | # Miniwob benchmark for BrowserGym 2 | 3 | This package provides `browsergym.miniwob`, which is an unofficial port of the [MiniWoB++](https://miniwob.farama.org/) benchmark for BrowserGym. 4 | 5 | ## Setup 6 | 7 | ### Option 1: Automated setup (Recommended) 8 | 9 | If you're working from the BrowserGym root directory, you can use the Makefile for automated setup: 10 | 11 | ```sh 12 | make setup-miniwob 13 | ``` 14 | 15 | This will: 16 | 17 | - Clone the MiniWoB++ repository 18 | - Reset to the specific commit for reproducibility 19 | - Add the `MINIWOB_URL` to your `.env` file 20 | 21 | Then load the environment variables: 22 | 23 | ```sh 24 | source .env 25 | ``` 26 | 27 | ### Option 2: Manual setup 28 | 29 | 1. Install the package 30 | 31 | ```sh 32 | pip install browsergym-miniwob 33 | ``` 34 | 35 | 1. Clone miniwob (use a specific frozen commit for reproducibility) 36 | 37 | ```sh 38 | git clone git@github.com:Farama-Foundation/miniwob-plusplus.git 39 | git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838 40 | ``` 41 | 42 | 1. Setup Miniwob URL (change `PATH_TO_MINIWOB_CLONED_REPO` here to the absolute path to your `miniwob-plusplus` folder) 43 | 44 | ```sh 45 | export MINIWOB_URL="file:///miniwob/html/miniwob/" 46 | ``` 47 | 48 | Alternatively, one can [setup a simple HTTP server](https://miniwob.farama.org/content/viewing/) and use a proper URL. 
49 | -------------------------------------------------------------------------------- /tests/core/data/basic_shadow_iframe_site/outer-iframe.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Shadow DOM Example 5 | 6 | 7 |
8 | 9 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /tests/webarena/test_infeasible.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import logging 3 | import os 4 | import playwright.sync_api 5 | import pytest 6 | 7 | from tenacity import retry, stop_after_attempt, retry_if_exception_type 8 | 9 | # register gym environments 10 | import browsergym.webarena 11 | 12 | 13 | __SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None 14 | __HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True 15 | 16 | INFEAS_TASK_IDS = [101, 115, 166] 17 | FEAS_TASK_IDS = [165, 187, 199] 18 | 19 | 20 | @retry( 21 | stop=stop_after_attempt(5), 22 | retry=retry_if_exception_type(playwright.sync_api.TimeoutError), 23 | reraise=True, 24 | before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), 25 | ) 26 | @pytest.mark.parametrize( 27 | "task_id,infeasible", 28 | [(task_id, True) for task_id in INFEAS_TASK_IDS] 29 | + [(task_id, False) for task_id in FEAS_TASK_IDS], 30 | ) 31 | @pytest.mark.slow 32 | def test_infeasible(task_id, infeasible): 33 | env = gym.make( 34 | f"browsergym/webarena.{task_id}", 35 | headless=__HEADLESS, 36 | slow_mo=__SLOW_MO, 37 | ) 38 | obs, info = env.reset() 39 | 40 | action = 'report_infeasible("Unachievable task.")' 41 | 42 | obs, reward, term, trunc, info = env.step(action) 43 | 44 | if infeasible: 45 | assert term == True and reward == 1.0 46 | 47 | else: 48 | assert term == True and reward == 0.0 49 | 50 | env.close() 51 | -------------------------------------------------------------------------------- /tests/core/data/test_page_2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Simple Form 6 | 7 | 8 | 9 |

Simple Form

10 | 11 |
12 | 13 |

14 | 15 | 16 |

17 | 18 | 19 |

20 | 21 |
22 |

23 | 24 | 25 |

26 | 27 | 28 | 29 |
30 | 31 | 32 | Text within a non-html tag 33 | 34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |

Text that should not be visible

61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /browsergym/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | requires = ["setuptools"] 4 | 5 | [project] 6 | name = "browsergym" 7 | description = "BrowserGym: a gym environment for web task automation in the Chromium browser" 8 | authors = [ 9 | {name = "Rim Assouel"}, 10 | {name = "Léo Boisvert"}, 11 | {name = "Massimo Caccia"}, 12 | {name = "Alex Drouin"}, 13 | {name = "Maxime Gasse"}, 14 | {name = "Imene Kerboua"}, 15 | {name = "Alex Lacoste"}, 16 | {name = "Thibault Le Sellier De Chezelles"}, 17 | {name = "Tom Marty"}, 18 | {name = "Aman Jaiswal"}, 19 | ] 20 | readme = "README.md" 21 | requires-python = ">3.10" 22 | license = {text = "Apache-2.0"} 23 | classifiers = [ 24 | "Development Status :: 3 - Alpha", 25 | "Programming Language :: Python :: 3", 26 | "Operating System :: OS Independent", 27 | "Intended Audience :: Science/Research", 28 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 29 | "License :: OSI Approved :: Apache Software License", 30 | ] 31 | version="0.14.3.dev1" 32 | dependencies = [ 33 | "browsergym-core==0.14.3.dev1", 34 | "browsergym-miniwob==0.14.3.dev1", 35 | "browsergym-webarena==0.14.3.dev1", 36 | "browsergym-visualwebarena==0.14.3.dev1", 37 | "browsergym-assistantbench==0.14.3.dev1", 38 | "browsergym-experiments==0.14.3.dev1", 39 | "browsergym-workarena>=0.4.1", 40 | "weblinx-browsergym>=0.0.2", 41 | "browsergym-webarenalite==0.14.3.dev1" 42 | ] 43 | 44 | [tool.setuptools] 45 | packages = [] # meta distribution, packages are included as dependencies 46 | -------------------------------------------------------------------------------- /docs/src/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | 3 | # -- Project information 4 | 5 | project = "BrowserGym" 6 | copyright = "2024, ServiceNow Research" 7 | author = "ServiceNow Research" 8 | 9 | version = "0.14.3.dev1" 10 | release = version 11 | 12 | # -- General configuration 13 | 14 | extensions = [ 15 | "sphinx.ext.duration", 16 | "sphinx.ext.doctest", 17 | "sphinx.ext.autodoc", 18 | "sphinx.ext.autosummary", 19 | "sphinx.ext.intersphinx", 20 | "sphinx_design", 21 | ] 22 | 23 | intersphinx_mapping = { 24 | "python": ("https://docs.python.org/3/", None), 25 | "sphinx": ("https://www.sphinx-doc.org/en/master/", None), 26 | } 27 | intersphinx_disabled_domains = ["std"] 28 | 29 | templates_path = ["_templates"] 30 | fixed_sidebar = True 31 | 32 | # -- Options for HTML output 33 | 34 | # Automatically extract typehints when specified and place them in 35 | # descriptions of the relevant function/method. 36 | # autodoc_typehints = "description" 37 | 38 | # Don't show class signature with the class' name. 39 | # autodoc_class_signature = "separated" 40 | 41 | html_theme = "pydata_sphinx_theme" 42 | 43 | html_theme_options = { 44 | "show_nav_level": 2, 45 | "navigation_depth": 2, 46 | "show_toc_level": 2, 47 | "icon_links": [ 48 | { 49 | "name": "GitHub", 50 | "url": "https://github.com/ServiceNow/BrowserGym", 51 | "icon": "fa-brands fa-square-github", 52 | "type": "fontawesome", 53 | } 54 | ], 55 | } 56 | 57 | # -- Options for EPUB output 58 | epub_show_urls = "footnote" 59 | -------------------------------------------------------------------------------- /tests/core/data/example.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Example Domain 6 | 7 | 8 | 9 | 10 | 41 | 42 | 43 | 44 |
45 |

Example Domain

46 |

This domain is for use in illustrative examples in documents. You may use this 47 | domain in literature without prior coordination or asking for permission.

48 |

More information...

49 |
50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /tests/assistantbench/test_env_general.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | 5 | import gymnasium as gym 6 | import playwright.sync_api 7 | import pytest 8 | from tenacity import retry, retry_if_exception_type, stop_after_attempt 9 | 10 | # register gym environments 11 | import browsergym.assistantbench 12 | 13 | __SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None 14 | __HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True 15 | 16 | 17 | from browsergym.assistantbench import TEST_AB_TASK_IDS, VALID_AB_TASK_IDS 18 | 19 | rng = random.Random(1) 20 | valid_task_ids = rng.sample(VALID_AB_TASK_IDS, 10) 21 | test_task_ids = rng.sample(TEST_AB_TASK_IDS, 10) 22 | 23 | 24 | @retry( 25 | stop=stop_after_attempt(5), 26 | retry=retry_if_exception_type(playwright.sync_api.TimeoutError), 27 | reraise=True, 28 | before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), 29 | ) 30 | @pytest.mark.parametrize("task_id", valid_task_ids + test_task_ids) 31 | @pytest.mark.slow 32 | def test_valid_env(task_id): 33 | env = gym.make( 34 | f"browsergym/{task_id}", 35 | headless=__HEADLESS, 36 | slow_mo=__SLOW_MO, 37 | ) 38 | obs, info = env.reset() 39 | assert not obs["last_action_error"] 40 | 41 | obs, reward, terminated, truncated, info = env.step("noop(0)") 42 | assert not obs["last_action_error"] 43 | assert not (terminated or truncated) 44 | 45 | obs, reward, terminated, truncated, info = env.step('send_msg_to_user("something")') 46 | assert not obs["last_action_error"] 47 | assert terminated 48 | 49 | env.close() 50 | -------------------------------------------------------------------------------- /browsergym/assistantbench/src/browsergym/assistantbench/__init__.py: -------------------------------------------------------------------------------- 1 | 
from browsergym.core.registration import register_task 2 | 3 | from . import task 4 | 5 | TOY_AB_TASK_IDS = [] 6 | VALID_AB_TASK_IDS = [] 7 | TEST_AB_TASK_IDS = [] 8 | 9 | 10 | # register a toy easy task for testing implementation 11 | gym_id = f"assistantbench.imp.0" 12 | register_task( 13 | gym_id, 14 | task.AssistantBenchTask, 15 | task_kwargs={ 16 | "task_id": f"imp.0", 17 | }, 18 | default_task_kwargs={ 19 | "save_predictions": False, # can be overriden 20 | }, 21 | ) 22 | TOY_AB_TASK_IDS.append(gym_id) 23 | 24 | # register the AssistantBench dev set 25 | for task_id in range(33): 26 | gym_id = f"assistantbench.validation.{task_id}" 27 | register_task( 28 | gym_id, 29 | task.AssistantBenchTask, 30 | task_kwargs={ 31 | "task_id": f"validation.{task_id}", 32 | }, 33 | default_task_kwargs={ 34 | "save_predictions": False, # can be overriden 35 | }, 36 | ) 37 | VALID_AB_TASK_IDS.append(gym_id) 38 | 39 | # register the AssistantBench test set 40 | for task_id in range(181): 41 | gym_id = f"assistantbench.test.{task_id}" 42 | register_task( 43 | gym_id, 44 | task.AssistantBenchTask, 45 | task_kwargs={ 46 | "task_id": f"test.{task_id}", 47 | }, 48 | default_task_kwargs={ 49 | "save_predictions": True, # can be overriden 50 | }, 51 | ) 52 | TEST_AB_TASK_IDS.append(gym_id) 53 | 54 | ALL_AB_TASK_IDS = TOY_AB_TASK_IDS + VALID_AB_TASK_IDS + TEST_AB_TASK_IDS 55 | -------------------------------------------------------------------------------- /browsergym/visualwebarena/src/browsergym/visualwebarena/utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import PIL.Image 4 | import requests 5 | 6 | from typing import Literal 7 | 8 | 9 | def image_url_to_pil_image(image_url: str) -> PIL.Image: 10 | if not image_url.startswith("http"): 11 | raise ValueError(f"Unexpected image URL: {image_url}") 12 | response = requests.get(image_url, stream=True) 13 | if response.status_code != 200: 14 | raise ValueError( 
15 | f"Could not download image from url {image_url} (status code {response.status_code})" 16 | ) 17 | img = PIL.Image.open(io.BytesIO(response.content)) 18 | return img 19 | 20 | 21 | def data_uri_to_pil_image(data_uri: str) -> PIL.Image: 22 | if data_uri.startswith("data:image/png;base64,"): 23 | image_data = base64.b64decode(data_uri.removeprefix("data:image/png;base64,")) 24 | elif data_uri.startswith("data:image/jpeg;base64,"): 25 | image_data = base64.b64decode(data_uri.removeprefix("data:image/jpeg;base64,")) 26 | else: 27 | raise ValueError(f"Unexpected image encoding: {data_uri}") 28 | img = PIL.Image.open(io.BytesIO(image_data)) 29 | return img 30 | 31 | 32 | def pil_image_to_data_uri(image: PIL.Image, format: Literal["png", "jpeg"] = "png") -> str: 33 | assert format in ("png", "jpeg") 34 | with io.BytesIO() as image_buffer: 35 | image.save(image_buffer, format=format.upper()) 36 | byte_data = image_buffer.getvalue() 37 | image_b64 = base64.b64encode(byte_data).decode("utf-8") 38 | image_b64 = f"data:image/{format};base64," + image_b64 39 | return image_b64 40 | -------------------------------------------------------------------------------- /browsergym/experiments/src/browsergym/experiments/utils.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | 3 | 4 | def count_tokens(text, model="gpt-4"): 5 | """Count the number of tokens in a text.""" 6 | 7 | return len(tiktoken.encoding_for_model(model).encode(text)) 8 | 9 | 10 | def count_messages_token(messages, model="gpt-4"): 11 | """Count the number of tokens in a list of messages. 12 | 13 | Args: 14 | messages (list): a list of messages, each message can be a string or a 15 | list of dicts or an object with a content attribute. 16 | model (str): the model to use for tokenization. 17 | 18 | Returns: 19 | int: the number of tokens. 
20 | """ 21 | token_count = 0 22 | for message in messages: 23 | if hasattr(message, "content"): 24 | message = message.content 25 | elif isinstance(message, dict) and "content" in message: 26 | message = message["content"] 27 | 28 | if isinstance(message, str): 29 | token_count += count_tokens(message, model) 30 | # handles messages with image content 31 | elif isinstance(message, (list, tuple)): 32 | for part in message: 33 | if not isinstance(part, dict): 34 | raise ValueError( 35 | f"The message is expected to be a list of dicts, but got list of {type(message)}" 36 | ) 37 | if part["type"] == "text": 38 | token_count += count_tokens(part["text"], model) 39 | else: 40 | raise ValueError( 41 | f"The message is expected to be a string or a list of dicts, but got {type(message)}" 42 | ) 43 | return token_count 44 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/javascript/frame_unmark_elements.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Go through all DOM elements in the frame (including shadowDOMs), 3 | * and cleanup previously stored data in ARIA attributes. 4 | */ 5 | () => { 6 | // get all DOM elements in the current frame (does not include elements in shadowDOMs) 7 | let elements = Array.from(document.querySelectorAll('*')); 8 | let i = 0; 9 | while (i < elements.length) { 10 | const elem = elements[i]; 11 | // add shadowDOM elements to the elements array, in such a way that order is preserved 12 | // TODO: do we really need the order preserved? 
13 | if (elem.shadowRoot !== null) { 14 | elements = new Array( 15 | ...Array.prototype.slice.call(elements, 0, i + 1), 16 | ...Array.from(elem.shadowRoot.querySelectorAll("*")), 17 | ...Array.prototype.slice.call(elements, i + 1) 18 | ); 19 | } 20 | i++; 21 | // Hack: remove custom data stored in ARIA attributes 22 | // - elem_global_id: global browsergym identifier 23 | pop_bid_from_attribute(elem, "aria-description"); 24 | pop_bid_from_attribute(elem, "aria-roledescription"); // fallback for generic nodes 25 | } 26 | } 27 | 28 | function pop_bid_from_attribute(elem, attr) { 29 | let bid_regex = /^browsergym_id[^\s]*\s/; 30 | if (elem.hasAttribute(attr)) { 31 | let content = elem.getAttribute(attr); 32 | let original_content = content.replace(bid_regex, ''); 33 | if (original_content) { 34 | elem.setAttribute(attr, original_content); 35 | } 36 | else { 37 | elem.removeAttribute(attr); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /browsergym/experiments/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling", "hatch-requirements-txt"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "browsergym-experiments" 7 | description = "Experimentation tools for BrowserGym" 8 | authors = [ 9 | {name = "Massimo Caccia"}, 10 | {name = "Alex Lacoste"}, 11 | {name = "Thibault Le Sellier De Chezelles"}, 12 | {name = "Maxime Gasse"}, 13 | ] 14 | readme = "README.md" 15 | requires-python = ">3.7" 16 | license = {text = "Apache-2.0"} 17 | classifiers = [ 18 | "Development Status :: 3 - Alpha", 19 | "Programming Language :: Python :: 3", 20 | "Operating System :: OS Independent", 21 | "Intended Audience :: Science/Research", 22 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 23 | "License :: OSI Approved :: Apache Software License", 24 | ] 25 | dynamic = ["dependencies", "version"] 26 | 27 | 
[project.optional-dependencies] 28 | miniwob = [ 29 | "browsergym-miniwob", 30 | ] 31 | workarena = [ 32 | "browsergym-workarena", 33 | ] 34 | webarena = [ 35 | "browsergym-webarena", 36 | ] 37 | visualwebarena = [ 38 | "browsergym-visualwebarena", 39 | ] 40 | assistantbench = [ 41 | "browsergym-assistantbench", 42 | ] 43 | weblinx = [ 44 | "weblinx_browsergym", 45 | ] 46 | all = [ 47 | "browsergym-experiments[miniwob]", 48 | "browsergym-experiments[workarena]", 49 | "browsergym-experiments[webarena]", 50 | "browsergym-experiments[visualwebarena]", 51 | "browsergym-experiments[assistantbench]", 52 | "browsergym-experiments[weblinx]", 53 | ] 54 | 55 | [project.urls] 56 | homepage = "https://github.com/ServiceNow/BrowserGym" 57 | 58 | [tool.hatch.version] 59 | path = "../core/src/browsergym/core/__init__.py" 60 | 61 | [tool.hatch.metadata.hooks.requirements_txt] 62 | files = ["requirements.txt"] 63 | 64 | [tool.hatch.build.targets.wheel] 65 | packages = ["src/browsergym", "src/bgym"] 66 | -------------------------------------------------------------------------------- /browsergym/visualwebarena/README.md: -------------------------------------------------------------------------------- 1 | # VisualWebArena benchmark for BrowserGym 2 | 3 | This package provides `browsergym.visualwebarena`, which is an unofficial port of the [VisualWebArena](https://jykoh.com/vwa) benchmark for BrowserGym. 4 | 5 | Note: the original VisualWebArena codebase has been slightly adapted to ensure compatibility.
6 | 7 | 8 | ## Server installation 9 | 10 | You have two options to setup your webarena instance: 11 | - option 1: follow the official [visualwebarena README](https://github.com/web-arena-x/visualwebarena/blob/main/environment_docker/README.md) 12 | - option 2: use our [unofficial setup scripts](https://github.com/gasse/webarena-setup/tree/main/visualwebarena) 13 | 14 | We recommend **option 2** as it allows you to easily customize the ports of each webarena domain, and offers a reset functionality that allows browsergym to trigger a full instance reset remotely. 15 | 16 | ## Setup 17 | 18 | 1. Install the package 19 | ```sh 20 | pip install browsergym-visualwebarena 21 | ``` 22 | 23 | 2. Download tokenizer resources 24 | ```sh 25 | python -c "import nltk; nltk.download('punkt_tab')" 26 | ``` 27 | 28 | 3. Setup the URLs as environment variables. The ports for each domain here should correspond to those you used when setting up your webarena instance. Note also the `VWA_` prefix which is specific to browsergym. 29 | ```sh 30 | BASE_URL= # example: "http://myazuremachine.eastus.cloudapp.azure.com" 31 | 32 | # visualwebarena environment variables (change ports as needed) 33 | export VWA_CLASSIFIEDS="$BASE_URL:8083" 34 | export VWA_CLASSIFIEDS_RESET_TOKEN="4b61655535e7ed388f0d40a93600254c" 35 | export VWA_SHOPPING="$BASE_URL:8082" 36 | export VWA_REDDIT="$BASE_URL:8080" 37 | export VWA_WIKIPEDIA="$BASE_URL:8081" 38 | export VWA_HOMEPAGE="$BASE_URL:80" 39 | 40 | # if your webarena instance offers the FULL_RESET feature (optional) 41 | export VWA_FULL_RESET="$BASE_URL:7565" 42 | 43 | # otherwise, be sure to NOT set VWA_FULL_RESET, or set it to an empty string 44 | export VWA_FULL_RESET="" 45 | ``` 46 | 47 | 4. Setup an OpenAI API key 48 | 49 | ```sh 50 | export OPENAI_API_KEY=...
51 | ``` 52 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | @echo "--- 🚀 Installing project dependencies ---" 3 | uv pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/webarenalite -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/ 4 | uv run playwright install chromium 5 | 6 | install-demo: 7 | @echo "--- 🚀 Installing demo dependencies ---" 8 | uv pip install -r demo_agent/requirements.txt 9 | uv run playwright install chromium 10 | 11 | demo: 12 | @echo "--- 🚀 Running demo agent ---" 13 | (set -x && cd demo_agent && python run_demo.py) 14 | 15 | setup-miniwob: 16 | @echo "--- 🤖 Setting up MiniWoB++ ---" 17 | @if [ ! -d "miniwob-plusplus" ]; then \ 18 | echo "Cloning MiniWoB++ repository..."; \ 19 | git clone https://github.com/Farama-Foundation/miniwob-plusplus.git; \ 20 | else \ 21 | echo "MiniWoB++ repository already exists, skipping clone..."; \ 22 | fi 23 | @echo "Resetting to specific commit for reproducibility..." 24 | git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838 25 | @echo "Adding MINIWOB_URL to .env file..." 26 | @echo "MINIWOB_URL=\"file://$(shell pwd)/miniwob-plusplus/miniwob/html/miniwob/\"" >> .env 27 | @echo "✅ MiniWoB++ setup complete!" 28 | @echo "💡 To use MiniWoB++, load the environment variables:" 29 | @echo " source .env" 30 | 31 | test-core: 32 | @echo "--- 🧪 Running tests ---" 33 | uv run pytest -n auto ./tests/core 34 | clean-miniwob: 35 | @echo "--- 🧹 Cleaning MiniWoB++ installation ---" 36 | rm -rf miniwob-plusplus 37 | @echo "✅ MiniWoB++ installation cleaned!" 
38 | 39 | help: 40 | @echo "Available targets:" 41 | @echo " install - Install project dependencies" 42 | @echo " setup-miniwob - Setup MiniWoB++ dependencies" 43 | @echo " install-demo - Install demo dependencies" 44 | @echo " demo - Run demo agent" 45 | @echo " test-core - Run core tests" 46 | @echo " clean-miniwob - Remove MiniWoB++ directory" 47 | @echo " help - Show this help message" 48 | 49 | .PHONY: install setup-miniwob install-demo demo test-core clean-miniwob help 50 | -------------------------------------------------------------------------------- /tests/core/data/basic_shadow_dom_site/basic_shadow_dom.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Unit Test with Complex Nested Shadow DOM 5 | 6 | 7 |
8 |
9 | 10 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /browsergym/webarena/README.md: -------------------------------------------------------------------------------- 1 | # WebArena benchmark for BrowserGym 2 | 3 | This package provides `browsergym.webarena`, which is an unofficial port of the [WebArena](https://webarena.dev/) benchmark for BrowserGym. 4 | 5 | Note: the original WebArena codebase has been slightly adapted to ensure compatibility. 6 | 7 | ## Server installation 8 | 9 | You have two options to setup your webarena instance: 10 | - option 1: follow the official [webarena README](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) 11 | - option 2: use our [unofficial setup scripts](https://github.com/gasse/webarena-setup/tree/main/webarena) 12 | 13 | We recommend **option 2** as it allows you to easily customize the ports of each webarena domain, and offers a reset functionality that allows browsergym to trigger a full instance reset remotely. 14 | 15 | ## Setup 16 | 17 | 1. Install the package 18 | ```sh 19 | pip install browsergym-webarena 20 | ``` 21 | 22 | 2. Download tokenizer resources 23 | ```sh 24 | python -c "import nltk; nltk.download('punkt_tab')" 25 | ``` 26 | 27 | 3. Setup the URLs as environment variables. The ports for each domain here should correspond to those you used when setting up your webarena instance. Note also the `WA_` prefix which is specific to browsergym. 
28 | ```sh 29 | BASE_URL= # example: "http://myazuremachine.eastus.cloudapp.azure.com" 30 | 31 | # webarena environment variables (change ports as needed) 32 | export WA_SHOPPING="$BASE_URL:8082/" 33 | export WA_SHOPPING_ADMIN="$BASE_URL:8083/admin" 34 | export WA_REDDIT="$BASE_URL:8080" 35 | export WA_GITLAB="$BASE_URL:9001" 36 | export WA_WIKIPEDIA="$BASE_URL:8081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" 37 | export WA_MAP="$BASE_URL:443" 38 | export WA_HOMEPAGE="$BASE_URL:80" 39 | 40 | # if your webarena instance offers the FULL_RESET feature (optional) 41 | export WA_FULL_RESET="$BASE_URL:7565" 42 | 43 | # otherwise, be sure to NOT set WA_FULL_RESET, or set it to an empty string 44 | export WA_FULL_RESET="" 45 | ``` 46 | 47 | 4. Setup an OpenAI API key 48 | 49 | ```sh 50 | export OPENAI_API_KEY=... 51 | ``` 52 | 53 | > **_NOTE:_** be mindful of costs, as WebArena will call GPT4 for certain evaluations ([llm_fuzzy_match](https://github.com/web-arena-x/webarena/blob/1469b7c9d8eaec3177855b3131569751f43a40d6/evaluation_harness/helper_functions.py#L146C5-L146C20)). 
54 | -------------------------------------------------------------------------------- /tests/miniwob/test_click-menu-2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gymnasium as gym 3 | import re 4 | import pytest 5 | 6 | # register gym environments 7 | import browsergym.miniwob 8 | 9 | __SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None 10 | __HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True 11 | 12 | 13 | @pytest.mark.parametrize("seed", range(5)) 14 | def test_cheat(seed): 15 | env = gym.make( 16 | "browsergym/miniwob.click-menu-2", 17 | headless=__HEADLESS, 18 | slow_mo=__SLOW_MO, 19 | action_mapping=None, 20 | ) 21 | obs, info = env.reset(seed=seed) 22 | 23 | assert obs["last_action_error"] == "" 24 | 25 | match1 = re.match( 26 | 'Click the "Menu" button, and then find and click on the item labeled "(.+)".', obs["goal"] 27 | ) 28 | match2 = re.match( 29 | 'Click the "Menu" button, and then find and click on the item with the "(.+)" icon.', 30 | obs["goal"], 31 | ) 32 | 33 | assert match1 or match2 34 | 35 | if match1: 36 | item_label = match1.groups()[0] 37 | item_classname = { 38 | "Save": "ui-icon-disk", 39 | "Prev": "ui-icon-seek-start", 40 | "Stop": "ui-icon-stop", 41 | "Play": "ui-icon-play", 42 | "Next": "ui-icon-seek-end", 43 | "Zoom In": "ui-icon-zoomin", 44 | "Zoom Out": "ui-icon-zoomout", 45 | }[item_label] 46 | else: 47 | item_classname = match2.groups()[0] 48 | 49 | action = f"""\ 50 | page.get_by_text("Menu").click() 51 | """ 52 | 53 | obs, reward, term, trunc, info = env.step(action) 54 | 55 | assert obs["last_action_error"] == "" 56 | assert reward == 0 57 | assert term == False 58 | 59 | if item_classname in ("ui-icon-seek-start", "ui-icon-stop", "ui-icon-play", "ui-icon-seek-end"): 60 | 61 | action = f"""\ 62 | page.get_by_text("Playback").click() 63 | """ 64 | 65 | obs, reward, term, trunc, info = env.step(action) 66 | 67 | assert obs["last_action_error"] 
== "" 68 | assert reward == 0 69 | assert term == False 70 | 71 | action = f"""\ 72 | page.locator(".{item_classname}").click() 73 | """ 74 | 75 | obs, reward, term, trunc, info = env.step(action) 76 | 77 | assert obs["last_action_error"] == "" 78 | assert reward == 1 79 | assert term == True 80 | 81 | env.close() 82 | -------------------------------------------------------------------------------- /browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_dicts.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import numpy as np 4 | 5 | from .utils import _align_bags 6 | 7 | 8 | def calculate_f1_score(precision, recall): 9 | if precision + recall == 0: 10 | return 0 # Handle the case to avoid division by zero 11 | return 2 * (precision * recall) / (precision + recall) 12 | 13 | 14 | def calc_recall(pred: Dict, gold: Dict, use_gold_for_eval: bool): 15 | from .evaluate_factory import get_evaluator_from_gold_answer 16 | 17 | recall = [] 18 | for gold_key, gold_value in gold.items(): 19 | pred_value = pred.get(gold_key) 20 | gold_value = fix_number(gold_value) 21 | pred_value = fix_number(pred_value) 22 | if gold_key not in pred: 23 | recall.append(0) 24 | else: 25 | evaluator = ( 26 | get_evaluator_from_gold_answer(type(gold_value)) 27 | if use_gold_for_eval 28 | else get_evaluator_from_gold_answer(type(pred_value)) 29 | ) 30 | if type(pred_value) != type(gold_value): 31 | recall.append(0) 32 | continue 33 | recall.append(evaluator(pred_value, gold_value)) 34 | avg_recall = np.average(recall) 35 | return avg_recall 36 | 37 | 38 | def fix_number(number): 39 | 40 | if type(number) == str: 41 | copy_ans = number 42 | copy_ans = " ".join( 43 | " ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft") 44 | ).strip() 45 | copy_ans = copy_ans.strip() 46 | copy_ans = copy_ans.replace(",", ".") 47 | try: 48 | return float(copy_ans) 49 | except: 50 | return 
number 51 | elif type(number) == int: 52 | return float(number) 53 | else: 54 | return number 55 | 56 | 57 | def evaluate_pair_of_dicts(pred: Dict, gold: Dict): 58 | recall = calc_recall(pred, gold, True) 59 | precision = calc_recall(gold, pred, False) 60 | f1 = calculate_f1_score(precision, recall) 61 | return f1 62 | 63 | 64 | def evaluate_dicts(pred: List[Dict], gold: List[Dict]): 65 | if not (type(pred) == dict or len(pred) == 0 or (type(pred) == list and type(pred[0]) == dict)): 66 | return 0 67 | max_alignment_scores = _align_bags(pred, gold, evaluate_pair_of_dicts) 68 | return np.average(max_alignment_scores) 69 | -------------------------------------------------------------------------------- /browsergym/webarenalite/src/browsergym/webarenalite/config.py: -------------------------------------------------------------------------------- 1 | TASK_IDS = [ 2 | 4, 3 | 7, 4 | 15, 5 | 20, 6 | 23, 7 | 27, 8 | 33, 9 | 37, 10 | 43, 11 | 44, 12 | 48, 13 | 56, 14 | 58, 15 | 65, 16 | 69, 17 | 71, 18 | 75, 19 | 77, 20 | 82, 21 | 88, 22 | 93, 23 | 95, 24 | 96, 25 | 97, 26 | 98, 27 | 103, 28 | 109, 29 | 115, 30 | 117, 31 | 118, 32 | 123, 33 | 125, 34 | 127, 35 | 131, 36 | 135, 37 | 139, 38 | 144, 39 | 149, 40 | 155, 41 | 156, 42 | 157, 43 | 162, 44 | 167, 45 | 169, 46 | 173, 47 | 182, 48 | 190, 49 | 196, 50 | 202, 51 | 205, 52 | 211, 53 | 215, 54 | 220, 55 | 221, 56 | 225, 57 | 227, 58 | 235, 59 | 236, 60 | 240, 61 | 247, 62 | 250, 63 | 254, 64 | 258, 65 | 259, 66 | 268, 67 | 270, 68 | 276, 69 | 283, 70 | 285, 71 | 287, 72 | 288, 73 | 296, 74 | 300, 75 | 311, 76 | 313, 77 | 318, 78 | 321, 79 | 324, 80 | 333, 81 | 335, 82 | 348, 83 | 349, 84 | 354, 85 | 357, 86 | 361, 87 | 367, 88 | 368, 89 | 369, 90 | 374, 91 | 376, 92 | 381, 93 | 382, 94 | 383, 95 | 384, 96 | 386, 97 | 387, 98 | 392, 99 | 401, 100 | 404, 101 | 415, 102 | 419, 103 | 423, 104 | 426, 105 | 431, 106 | 440, 107 | 448, 108 | 454, 109 | 458, 110 | 464, 111 | 466, 112 | 470, 113 | 476, 114 | 485, 115 | 488, 116 | 
491, 117 | 497, 118 | 505, 119 | 506, 120 | 509, 121 | 514, 122 | 516, 123 | 521, 124 | 524, 125 | 528, 126 | 534, 127 | 538, 128 | 548, 129 | 566, 130 | 567, 131 | 574, 132 | 577, 133 | 582, 134 | 599, 135 | 601, 136 | 605, 137 | 612, 138 | 619, 139 | 626, 140 | 631, 141 | 641, 142 | 645, 143 | 652, 144 | 657, 145 | 668, 146 | 673, 147 | 678, 148 | 682, 149 | 686, 150 | 693, 151 | 704, 152 | 710, 153 | 714, 154 | 720, 155 | 729, 156 | 733, 157 | 741, 158 | 745, 159 | 748, 160 | 760, 161 | 762, 162 | 768, 163 | 791, 164 | 798, 165 | 809, 166 | 811, 167 | ] 168 | -------------------------------------------------------------------------------- /tests/visualwebarena/test_vwa_tasks_without_reset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | 5 | import gymnasium as gym 6 | import playwright.sync_api 7 | import pytest 8 | from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed 9 | 10 | # register gym environments 11 | import browsergym.visualwebarena 12 | 13 | __SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None 14 | __HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True 15 | 16 | 17 | from browsergym.visualwebarena import VISUALWEBARENA_TASK_IDS_WITHOUT_RESET 18 | 19 | rng = random.Random(1) 20 | task_ids = rng.sample(VISUALWEBARENA_TASK_IDS_WITHOUT_RESET, 25) 21 | 22 | 23 | @retry( 24 | stop=stop_after_attempt(5), 25 | retry=retry_if_exception_type(playwright.sync_api.TimeoutError), 26 | wait=wait_fixed(2), 27 | reraise=True, 28 | before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), 29 | ) 30 | @pytest.mark.parametrize("task_id", task_ids) 31 | @pytest.mark.slow 32 | def test_env_generic(task_id): 33 | env = gym.make( 34 | f"browsergym/{task_id}", 35 | headless=__HEADLESS, 36 | slow_mo=__SLOW_MO, 37 | ) 38 | obs, info = env.reset() 39 | env.close() 40 | 41 | 42 | @retry( 43 | stop=stop_after_attempt(5), 44 | 
retry=retry_if_exception_type(playwright.sync_api.TimeoutError), 45 | wait=wait_fixed(2), 46 | reraise=True, 47 | before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), 48 | ) 49 | def test_domain_safeguard(): 50 | env = gym.make( 51 | f"browsergym/visualwebarena.398", 52 | headless=__HEADLESS, 53 | slow_mo=__SLOW_MO, 54 | ) 55 | obs, info = env.reset() 56 | assert not obs["last_action_error"] 57 | 58 | obs, reward, terminated, truncated, info = env.step("new_tab()") 59 | assert not obs["last_action_error"] 60 | assert not (terminated or truncated) 61 | 62 | obs, reward, terminated, truncated, info = env.step("tab_close()") 63 | assert not obs["last_action_error"] 64 | assert not (terminated or truncated) 65 | 66 | obs, reward, terminated, truncated, info = env.step("tab_focus(0)") 67 | assert not obs["last_action_error"] 68 | assert not (terminated or truncated) 69 | 70 | obs, reward, terminated, truncated, info = env.step('goto("http://www.google.com")') 71 | assert not obs["last_action_error"] 72 | assert terminated 73 | 74 | env.close() 75 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/action/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any 3 | 4 | import playwright.sync_api 5 | 6 | from . import get_global_demo_mode 7 | 8 | 9 | class AbstractActionSet(ABC): 10 | def __init__(self, strict: bool = False): 11 | self.strict = strict 12 | 13 | @abstractmethod 14 | def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str: 15 | """ 16 | Returns a textual description of this action space. 17 | """ 18 | 19 | @abstractmethod 20 | def example_action(self, abstract: bool) -> str: 21 | """ 22 | Returns an example action as a string. 
23 | """ 24 | 25 | @abstractmethod 26 | def to_python_code(self, action) -> str: 27 | """ 28 | Converts the given action to browsergym-compatible python code. 29 | 30 | Args: 31 | action: the action to convert. 32 | 33 | Returns: 34 | Executable python code that performs the action in a browsergym environment. 35 | """ 36 | 37 | def to_tool_descriptor(self) -> list[Any]: 38 | """ 39 | Converts the action set to a tool descriptor. 40 | 41 | Returns: 42 | A list of dictionaries describing the actions in the action set. 43 | """ 44 | pass 45 | 46 | 47 | def execute_python_code( 48 | code: str, 49 | page: playwright.sync_api.Page, 50 | send_message_to_user: callable, 51 | report_infeasible_instructions: callable, 52 | ): 53 | """ 54 | Executes Python code in a new context, except for a playwright `page` object and a `send_message_to_user` function. 55 | 56 | WARNING: this is not safe! 57 | https://stackoverflow.com/questions/77655440/can-you-protect-a-python-variable-with-exec 58 | 59 | Args: 60 | code: the Python code to execute, as a string. 61 | page: the playwright page that will be made accessible to the code. 62 | send_message_to_user: utility function that will be made accessible to the code. It should take one text argument. 63 | report_infeasible_instructions: utility function that will be made accessible to the code. It should take one text argument. 
64 | """ 65 | 66 | globals = { 67 | "page": page, 68 | "send_message_to_user": send_message_to_user, 69 | "report_infeasible_instructions": report_infeasible_instructions, 70 | "DEMO_MODE": get_global_demo_mode(), 71 | } 72 | 73 | exec(code, globals) 74 | -------------------------------------------------------------------------------- /tests/experiments/test_exp_loop.py: -------------------------------------------------------------------------------- 1 | import re 2 | import tempfile 3 | import logging 4 | import dataclasses 5 | 6 | from browsergym.core.action.highlevel import HighLevelActionSet 7 | from browsergym.experiments.agent import Agent 8 | from browsergym.experiments.loop import AbstractAgentArgs, EnvArgs, ExpArgs, get_exp_result 9 | from browsergym.utils.obs import flatten_axtree_to_str 10 | 11 | 12 | class MiniwobTestAgent(Agent): 13 | 14 | action_set = HighLevelActionSet(subsets="bid") 15 | 16 | def obs_preprocessor(self, obs: dict): 17 | return {"axtree_txt": flatten_axtree_to_str(obs["axtree_object"])} 18 | 19 | def get_action(self, obs: dict) -> tuple[str, dict]: 20 | match = re.search(r"^\s*\[(\d+)\].*button", obs["axtree_txt"], re.MULTILINE | re.IGNORECASE) 21 | 22 | if match: 23 | bid = match.group(1) 24 | action = f'click("{bid}")' 25 | else: 26 | raise Exception("Can't find the button's bid") 27 | 28 | return action, dict(think="I'm clicking the button as requested.") 29 | 30 | 31 | @dataclasses.dataclass 32 | class MiniwobTestAgentArgs(AbstractAgentArgs): 33 | def make_agent(self): 34 | return MiniwobTestAgent() 35 | 36 | 37 | def test_run_exp(): 38 | exp_args = ExpArgs( 39 | agent_args=MiniwobTestAgentArgs(), 40 | env_args=EnvArgs(task_name="miniwob.click-test", task_seed=42), 41 | ) 42 | 43 | with tempfile.TemporaryDirectory() as tmp_dir: 44 | exp_args.prepare(tmp_dir) 45 | exp_args.run() 46 | exp_result = get_exp_result(exp_args.exp_dir) 47 | exp_record = exp_result.get_exp_record() 48 | 49 | target = { 50 | "env_args.task_name": 
"miniwob.click-test", 51 | "env_args.task_seed": 42, 52 | "env_args.headless": True, 53 | "env_args.record_video": False, 54 | "n_steps": 1, 55 | "cum_reward": 1.0, 56 | "terminated": True, 57 | "truncated": False, 58 | } 59 | 60 | assert len(exp_result.steps_info) == 2 61 | 62 | for key, target_val in target.items(): 63 | assert key in exp_record 64 | assert exp_record[key] == target_val 65 | 66 | # TODO investigate why it's taking almost 5 seconds to solve 67 | assert exp_record["stats.cum_step_elapsed"] < 5 68 | if exp_record["stats.cum_step_elapsed"] > 3: 69 | t = exp_record["stats.cum_step_elapsed"] 70 | logging.warning( 71 | f"miniwob.click-test is taking {t:.2f}s (> 3s) to solve with an oracle." 72 | ) 73 | -------------------------------------------------------------------------------- /docs/src/environments/workarena.rst: -------------------------------------------------------------------------------- 1 | WorkArena 2 | ^^^^^^^^^ 3 | 4 | `BrowserGym` integrates `WebArena` enviroment. For more information about this enviroment, please refer to the `WorkArena `_ official documentation. 5 | 6 | 7 | BrowserGym API 8 | """""""""""""" 9 | 10 | .. currentmodule:: browsergym 11 | 12 | .. autosummary:: 13 | :recursive: 14 | :toctree: generated 15 | :caption: WorkArena 16 | 17 | workarena 18 | 19 | 20 | Usage 21 | """"" 22 | 23 | **Create a ServiceNow Developer Instance** 24 | 25 | * Go to https://developer.servicenow.com/ and create an account. 26 | 27 | * Click on Request an instance and select the Washington release (initializing the instance will take a few minutes) 28 | 29 | * Once the instance is ready, you should see your instance URL and credentials. If not, click Return to the Developer Portal, then navigate to Manage instance password and click Reset instance password. 30 | 31 | * You should now see your URL and credentials. 
Based on this information, set the following environment variables: 32 | 33 | * SNOW_INSTANCE_URL: The URL of your ServiceNow developer instance 34 | 35 | * SNOW_INSTANCE_UNAME: The username, should be "admin" 36 | 37 | * SNOW_INSTANCE_PWD: The password, make sure you place the value in quotes "" and be mindful of escaping special shell characters. Running echo $SNOW_INSTANCE_PWD should print the correct password. 38 | 39 | * Log into your instance via a browser using the admin credentials. Close any popup that appears on the main screen (e.g., agreeing to analytics). 40 | 41 | 42 | **Install WorkArena and Initialize your Instance** 43 | 44 | Run the following command to install WorkArena in the BrowserGym environment: 45 | 46 | .. code:: bash 47 | 48 | pip install browsergym-workarena 49 | 50 | 51 | Then, run this command in a terminal to upload the benchmark data to your ServiceNow instance: 52 | 53 | .. code:: bash 54 | 55 | workarena-install 56 | 57 | 58 | Finally, install Playwright: 59 | 60 | .. code:: bash 61 | 62 | playwright install chromium 63 | 64 | 65 | Your installation is now complete! 🎉 66 | 67 | **Run a task from the benchmark suite** 68 | 69 | ..
code-block:: python 70 | 71 | import gymnasium as gym 72 | import browsergym.workarena 73 | 74 | env = gym.make("browsergym/workarena.servicenow.filter-asset-list") 75 | obs, info = env.reset() 76 | done = False 77 | 78 | while not done: 79 | action = "noop()" 80 | obs, reward, terminated, truncated, info = env.step(action) 81 | done = terminated or truncated 82 | print(f"Reward: {reward}, Done: {done}, Info: {info}") 83 | 84 | env.close() 85 | -------------------------------------------------------------------------------- /tests/assistantbench/test_evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | 4 | import gymnasium as gym 5 | import pytest 6 | 7 | from browsergym.assistantbench.evaluation.evaluator import question_scorer 8 | from browsergym.experiments.benchmark.metadata.utils import ( 9 | task_list_from_metadata, 10 | task_metadata, 11 | ) 12 | 13 | __DATA_DIR = pathlib.Path(__file__).resolve().parent / "data" 14 | 15 | metadata = task_metadata("assistantbench") 16 | file_path = pathlib.Path(__DATA_DIR) / "fallback_gpt4_seeplanact_predictions.jsonl" 17 | 18 | data_points = {} 19 | 20 | # Open the JSONL file and read each line as a JSON object 21 | with open(file_path, "r") as f: 22 | for line in f: 23 | data_point = json.loads(line) 24 | 25 | original_id = data_point["id"] 26 | answer = data_point["answer"] 27 | gold_answer = data_point["gold_answer"] 28 | score = data_point["score"] 29 | has_ans = data_point["has_ans"] 30 | 31 | data_points[original_id] = { 32 | "task_id": task_list_from_metadata(metadata, {"original_id": original_id})[0], 33 | "answer": answer, 34 | "gold_answer": gold_answer, 35 | "score": score, 36 | "has_ans": has_ans, 37 | } 38 | 39 | 40 | @pytest.mark.parametrize("original_id", list(data_points.keys())) 41 | def test_evaluate(original_id: str): 42 | 43 | answer = data_points[original_id]["answer"] 44 | gold_answer = data_points[original_id]["gold_answer"] 45 | expected_score =
data_points[original_id]["score"] 46 | expected_has_ans = data_points[original_id]["has_ans"] 47 | 48 | score, has_ans = question_scorer(answer, gold_answer) 49 | 50 | # Assert if the expected results doesn't match 51 | assert score == expected_score 52 | assert has_ans == expected_has_ans 53 | 54 | 55 | @pytest.mark.parametrize( 56 | "original_id", 57 | [id for id in data_points.keys() if isinstance(data_points[id]["answer"], (str, float, int))], 58 | ) 59 | @pytest.mark.slow 60 | def test_evaluate_within_env(original_id: str): 61 | 62 | task_id = data_points[original_id]["task_id"] 63 | answer = data_points[original_id]["answer"] 64 | expected_score = data_points[original_id]["score"] 65 | 66 | env = gym.make( 67 | f"browsergym/{task_id}", 68 | ) 69 | obs, info = env.reset() 70 | assert not obs["last_action_error"] 71 | 72 | obs, reward, terminated, truncated, info = env.step(f"send_msg_to_user({repr(str(answer))})") 73 | assert not obs["last_action_error"] 74 | assert terminated 75 | assert reward == expected_score 76 | 77 | env.close() 78 | -------------------------------------------------------------------------------- /browsergym/assistantbench/src/browsergym/assistantbench/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pathlib 5 | import time 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def add_prediction_to_jsonl( 11 | file_path: str, task_id: str, prediction: object, override_if_exists: bool 12 | ) -> None: 13 | """ 14 | Multiprocessing-safe file write. 
15 | """ 16 | lock_file_path = pathlib.Path(file_path).with_suffix(".lock") 17 | lock_max_wait = 10 # 10 seconds 18 | 19 | # Acquire lock (atomic file creation) 20 | start_time = time.time() 21 | while True: 22 | try: 23 | fd = os.open(lock_file_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY) 24 | with os.fdopen(fd, "w") as f: 25 | f.write("lock") 26 | break 27 | except FileExistsError: 28 | # give up if max wait time reached 29 | seconds_waited = time.time() - start_time 30 | if seconds_waited >= lock_max_wait: 31 | raise RuntimeError( 32 | f"Lock file could not be acquired after {seconds_waited} seconds ({lock_file_path})" 33 | ) 34 | # wait for lock release 35 | logger.info(f"Waiting for lock file to be released: {lock_file_path}") 36 | time.sleep(1) # 1 sec 37 | 38 | logger.info(f"Lock file acquired: {lock_file_path}") 39 | 40 | # Check if the file exists, if not, create it 41 | if not os.path.exists(file_path): 42 | with open(file_path, "w") as f: 43 | pass # Create an empty file 44 | 45 | # Load existing data, if any 46 | data = [] 47 | if os.path.exists(file_path): 48 | with open(file_path, "r") as f: 49 | data.extend([json.loads(line) for line in f if line.strip()]) # Skip empty lines 50 | 51 | # Check if task_id already exists 52 | existing_record = next((entry for entry in data if entry["id"] == task_id), None) 53 | 54 | # Add or update the record 55 | if not existing_record: 56 | # Add new record 57 | data.append({"id": task_id, "answer": prediction}) 58 | elif override_if_exists: 59 | # Update existing record 60 | existing_record["answer"] = prediction 61 | else: 62 | raise ValueError( 63 | f"Prediction for task ID {repr(task_id)} already exists in file {file_path}." 
64 | ) 65 | 66 | # Write data back to the file 67 | with open(file_path, "w") as f: 68 | for entry in data: 69 | f.write(json.dumps(entry) + "\n") 70 | 71 | # Release lock (remove file) 72 | os.remove(lock_file_path) 73 | logger.info(f"Lock file released: {lock_file_path}") 74 | -------------------------------------------------------------------------------- /tests/core/test_registration.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import gymnasium as gym 4 | import pytest 5 | 6 | from browsergym.core.registration import register_task 7 | from browsergym.core.task import AbstractBrowserTask 8 | 9 | 10 | class RegistrationTestTask(AbstractBrowserTask): 11 | @classmethod 12 | def get_task_id(cls): 13 | raise NotImplementedError 14 | 15 | def __init__(self, a: str = "", b: int = 0, c: bool = False, *args, **kwargs): 16 | super().__init__(*args, **kwargs) 17 | self.a = a 18 | self.b = b 19 | self.c = c 20 | 21 | def setup(self, page): 22 | return "", {} 23 | 24 | def teardown(self): 25 | pass 26 | 27 | def validate(self, page, chat_messages): 28 | return 0, True, "", {} 29 | 30 | 31 | register_task("test_task", RegistrationTestTask) 32 | register_task( 33 | "test_task_with_defaults", 34 | RegistrationTestTask, 35 | task_kwargs={"a": "new value"}, 36 | default_task_kwargs={"b": 1}, 37 | ) 38 | 39 | 40 | def test_registration(): 41 | 42 | with pytest.raises(ValueError): 43 | register_task( 44 | "test_task_forbidden", 45 | RegistrationTestTask, 46 | task_kwargs={"a": "new value"}, 47 | default_task_kwargs={"a": "other value"}, 48 | ) 49 | 50 | env = gym.make("browsergym/test_task") 51 | 52 | assert env.unwrapped.task_kwargs == {} 53 | 54 | env.reset() 55 | env.unwrapped.task.a == "" 56 | env.unwrapped.task.b == 0 57 | env.unwrapped.task.c == False 58 | env.close() 59 | 60 | env = gym.make("browsergym/test_task", task_kwargs={"a": "other", "b": 1}) 61 | 62 | assert env.unwrapped.task_kwargs == {"a": "other", 
"b": 1} 63 | 64 | env.reset() 65 | env.unwrapped.task.a == "other" 66 | env.unwrapped.task.b == 1 67 | env.unwrapped.task.c == False 68 | env.close() 69 | 70 | env = gym.make("browsergym/test_task_with_defaults") 71 | 72 | assert env.unwrapped.task_kwargs == {} 73 | 74 | env.reset() 75 | env.unwrapped.task.a == "new value" 76 | env.unwrapped.task.b == 1 77 | env.unwrapped.task.c == False 78 | env.close() 79 | 80 | env = gym.make("browsergym/test_task_with_defaults", task_kwargs={"b": 2}) 81 | 82 | assert env.unwrapped.task_kwargs == {"b": 2} 83 | 84 | env.reset() 85 | env.unwrapped.task.a == "new value" 86 | env.unwrapped.task.b == 2 87 | env.unwrapped.task.c == False 88 | env.close() 89 | 90 | env = gym.make("browsergym/test_task_with_defaults", task_kwargs={"a": "other"}) 91 | 92 | assert env.unwrapped.task_kwargs == {"a": "other"} 93 | 94 | with pytest.raises( 95 | expected_exception=ValueError, 96 | match=re.compile("Illegal attempt to override frozen parameters"), 97 | ): 98 | env.reset() 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_store 2 | .idea/ 3 | docs/src/generated/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # error logs 136 | error_logs.txt 137 | 138 | # tests 139 | tests/results 140 | tmp.py 141 | .vscode/** 142 | 143 | # demo and results 144 | results/ 145 | 146 | .vscode/launch.json 147 | 148 | # assistantbench 149 | tests/assistantbench/assistantbench-predictions-test.jsonl 150 | 151 | # weblinx 152 | bg_wl_data/ 153 | 154 | # miniwob setup 155 | miniwob-plusplus/ 156 | 157 | uv.lock 158 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/registration.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from typing import Type 3 | 4 | import gymnasium as gym 5 | 6 | from .env import BrowserEnv 7 | from .task import AbstractBrowserTask 8 | 9 | 10 | class frozen_partial: 11 | """ 12 | Freeze some keyword arguments of a function. 
13 | 14 | """ 15 | 16 | def __init__(self, func, **frozen_kwargs): 17 | self.func = func 18 | self.frozen_kwargs = frozen_kwargs 19 | 20 | def __call__(self, *args, **kwargs): 21 | # check overlap between kwargs and frozen_kwargs 22 | clashing_kwargs = set(self.frozen_kwargs) & set(kwargs) # key set intersection 23 | if clashing_kwargs: 24 | raise ValueError(f"Illegal attempt to override frozen parameters {clashing_kwargs}.") 25 | # merge the two dicts 26 | kwargs = kwargs | self.frozen_kwargs 27 | 28 | return self.func(*args, **kwargs) 29 | 30 | 31 | def register_task( 32 | id: str, 33 | task_class: Type[AbstractBrowserTask], 34 | task_kwargs: dict = {}, 35 | default_task_kwargs: dict = {}, 36 | nondeterministic: bool = True, 37 | *args, 38 | **kwargs, 39 | ): 40 | """ 41 | Registers a browser task as a gym environment with its unique id. 42 | 43 | Args: 44 | id: the id of the task to register (will be prepended by "browsergym/"). 45 | task_class: the task class to register. 46 | task_kwargs: frozen task arguments (can not be overloaded at environment creation time). 47 | task_kwargs_default: default task arguments (can be overloaded at environment creation time). 48 | nondeterministic: whether the task cannot be guaranteed deterministic transitions. 49 | *args: additional sequential arguments for either the gym or the browsergym environment. 50 | *kwargs: additional keyword arguments for either the gym or the browsergym environment. 51 | """ 52 | if task_kwargs and default_task_kwargs: 53 | # check overlap between frozen and default task_kwargs 54 | clashing_kwargs = set(task_kwargs) & set(default_task_kwargs) # key set intersection 55 | if clashing_kwargs: 56 | raise ValueError( 57 | f"Illegal attempt to register Browsergym environment {id} with both frozen and default values for task parameters {clashing_kwargs}." 
58 | ) 59 | 60 | task_entrypoint = task_class 61 | 62 | # freeze task_kwargs (cannot be overriden at environment creation) 63 | task_entrypoint = frozen_partial(task_class, **task_kwargs) 64 | 65 | # pre-set default_task_kwargs (can be overriden at environment creation) 66 | task_entrypoint = partial(task_entrypoint, **default_task_kwargs) 67 | 68 | gym.register( 69 | id=f"browsergym/{id}", 70 | entry_point=lambda *env_args, **env_kwargs: BrowserEnv( 71 | task_entrypoint, *env_args, **env_kwargs 72 | ), 73 | nondeterministic=nondeterministic, 74 | *args, 75 | **kwargs, 76 | ) 77 | -------------------------------------------------------------------------------- /tests/core/data/obstructed_checkbox_page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Checkbox with Label Interception 7 | 51 | 52 | 53 | 54 |
55 |
56 | 57 |
58 |
59 | 60 |
61 | 62 |
63 |
64 | 65 |
66 |
67 |
68 | 69 | 70 |
71 |
72 | 73 | 74 |
75 |
76 | 77 | 78 |
79 |
80 |
81 |
82 | 83 |
84 |
85 | 86 |
87 |
88 | 89 |
90 |
91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /tests/core/test_task.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import playwright 4 | import pytest 5 | 6 | from browsergym.core.env import BrowserEnv 7 | from browsergym.core.task import AbstractBrowserTask 8 | 9 | 10 | class MockImageGoalTask(AbstractBrowserTask): 11 | @classmethod 12 | def get_task_id(cls): 13 | return "mockimagegoal" 14 | 15 | def __init__(self, seed: int = 0, start_url: str = "https://www.google.com") -> None: 16 | """ 17 | Args: 18 | seed: random seed. 19 | start_url: str, the url for the starting page. 20 | goal: str, the initial goal. 21 | 22 | """ 23 | super().__init__(seed) 24 | self.start_url = start_url 25 | self.goal = [ 26 | {"type": "text", "text": "This is a mock task with an image goal."}, 27 | { 28 | "type": "image_url", 29 | "image_url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAAYCAYAAADgdz34AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAApgAAAKYB3X3/OAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAANCSURBVEiJtZZPbBtFFMZ/M7ubXdtdb1xSFyeilBapySVU8h8OoFaooFSqiihIVIpQBKci6KEg9Q6H9kovIHoCIVQJJCKE1ENFjnAgcaSGC6rEnxBwA04Tx43t2FnvDAfjkNibxgHxnWb2e/u992bee7tCa00YFsffekFY+nUzFtjW0LrvjRXrCDIAaPLlW0nHL0SsZtVoaF98mLrx3pdhOqLtYPHChahZcYYO7KvPFxvRl5XPp1sN3adWiD1ZAqD6XYK1b/dvE5IWryTt2udLFedwc1+9kLp+vbbpoDh+6TklxBeAi9TL0taeWpdmZzQDry0AcO+jQ12RyohqqoYoo8RDwJrU+qXkjWtfi8Xxt58BdQuwQs9qC/afLwCw8tnQbqYAPsgxE1S6F3EAIXux2oQFKm0ihMsOF71dHYx+f3NND68ghCu1YIoePPQN1pGRABkJ6Bus96CutRZMydTl+TvuiRW1m3n0eDl0vRPcEysqdXn+jsQPsrHMquGeXEaY4Yk4wxWcY5V/9scqOMOVUFthatyTy8QyqwZ+kDURKoMWxNKr2EeqVKcTNOajqKoBgOE28U4tdQl5p5bwCw7BWquaZSzAPlwjlithJtp3pTImSqQRrb2Z8PHGigD4RZuNX6JYj6wj7O4TFLbCO/Mn/m8R+h6rYSUb3ekokRY6f/YukArN979jcW+V/S8g0eT/N3VN3kTqWbQ428m9/8k0P/1aIhF36PccEl6EhOcAUCrXKZXXWS3XKd2vc/TRBG9O5ELC17MmWubD2nKhUKZa26Ba2+D3P+4/MNCFwg59oWVeYhkzgN/JDR8deKBoD7Y+ljEjGZ0sosXVTvbc6RHirr2reNy1OXd6
pJsQ+gqjk8VWFYmHrwBzW/n+uMPFiRwHB2I7ih8ciHFxIkd/3Omk5tCDV1t+2nNu5sxxpDFNx+huNhVT3/zMDz8usXC3ddaHBj1GHj/As08fwTS7Kt1HBTmyN29vdwAw+/wbwLVOJ3uAD1wi/dUH7Qei66PfyuRj4Ik9is+hglfbkbfR3cnZm7chlUWLdwmprtCohX4HUtlOcQjLYCu+fzGJH2QRKvP3UNz8bWk1qMxjGTOMThZ3kvgLI5AzFfo379UAAAAASUVORK5CYII=", 30 | }, 31 | ] 32 | 33 | def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: 34 | page.goto(self.start_url, timeout=10000) 35 | return self.goal, {} 36 | 37 | def teardown(self) -> None: 38 | pass 39 | 40 | def validate( 41 | self, page: playwright.sync_api.Page, chat_messages: list[str] 42 | ) -> Tuple[float, bool, str, dict]: 43 | reward, done, msg, info = 0, False, "", {} 44 | 45 | for message in chat_messages: 46 | if message["role"] == "user" and message["message"] == "exit": 47 | done = True 48 | break 49 | 50 | return reward, done, msg, info 51 | 52 | 53 | def test_mock_image_goal_task(): 54 | env = BrowserEnv(MockImageGoalTask) 55 | obs, _ = env.reset() 56 | 57 | assert "goal_object" in obs 58 | assert len(obs["goal_object"]) == 2 59 | assert obs["goal_object"][0]["type"] == "text" 60 | assert obs["goal_object"][0]["text"] == "This is a mock task with an image goal." 61 | assert obs["goal_object"][1]["type"] == "image_url" 62 | 63 | env.chat.add_message("user", "exit") 64 | obs, reward, terminated, _, _ = env.step("send_msg_to_user('bye')") 65 | 66 | assert reward == 0 67 | assert terminated is True 68 | 69 | env.close() 70 | 71 | 72 | if __name__ == "__main__": 73 | test_mock_image_goal_task() 74 | -------------------------------------------------------------------------------- /tests/core/data/long_page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |

This is the top

7 | 8 | 9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |

This is the bottom

206 | 207 | 208 | 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/task.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | import playwright.sync_api 6 | 7 | 8 | class AbstractBrowserTask(ABC): 9 | """ 10 | Abstract class for browsergym tasks. 11 | 12 | """ 13 | 14 | @classmethod 15 | def get_task_id(cls): 16 | raise NotImplementedError 17 | 18 | def __init__(self, seed: int) -> None: 19 | # initiate a random number generator 20 | self.random = np.random.RandomState(seed) 21 | 22 | # task properties, will be used to set up the browsergym environment 23 | # default values, can be overriden in children classes 24 | self.viewport = {"width": 1280, "height": 720} 25 | self.slow_mo = 1000 # ms 26 | self.timeout = 5000 # ms 27 | self.locale = None # see https://playwright.dev/python/docs/api/class-browser#browser-new-context-option-locale 28 | self.timezone_id = None # see https://playwright.dev/python/docs/api/class-browser#browser-new-context-option-timezone-id 29 | 30 | @abstractmethod 31 | def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: 32 | """ 33 | Set up everything needed to execute the task. 34 | 35 | Args: 36 | page: the active playwright page. 37 | 38 | Returns: 39 | goal: str, goal of the task. 40 | info: dict, custom information from the task. 41 | """ 42 | 43 | @abstractmethod 44 | def validate( 45 | self, page: playwright.sync_api.Page, chat_messages: list[str] 46 | ) -> Tuple[float, bool, str, dict]: 47 | """ 48 | Validate the task was completed successfully 49 | 50 | Args: 51 | page: the active playwright page. 52 | chat_messages: the chat messages. 53 | 54 | Returns: 55 | reward: float, the reward obtained since last call to validate(). 
56 | done: boolean flag, indicates if the task has finished or not (be it success or fail). 57 | message: string, a new user message for the chat. 58 | info: dictionnary, custom information from the task. 59 | 60 | """ 61 | 62 | def cheat(self, page: playwright.sync_api.Page, chat_messages: list[str]) -> None: 63 | """ 64 | Solve the task using a pre-defined solution (optional). 65 | 66 | """ 67 | raise NotImplementedError 68 | 69 | def teardown(self) -> None: 70 | """ 71 | Tear down the task and clean up any resource / data created by the task (optional). 72 | 73 | """ 74 | pass 75 | 76 | 77 | class OpenEndedTask(AbstractBrowserTask): 78 | @classmethod 79 | def get_task_id(cls): 80 | return "openended" 81 | 82 | def __init__(self, seed: int, start_url: str, goal: str = None) -> None: 83 | """ 84 | Args: 85 | seed: random seed. 86 | start_url: str, the url for the starting page. 87 | goal: str, the initial goal. 88 | 89 | """ 90 | super().__init__(seed) 91 | self.start_url = start_url 92 | self.goal = goal 93 | 94 | def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: 95 | page.goto(self.start_url, timeout=10000) 96 | return self.goal, {} 97 | 98 | def teardown(self) -> None: 99 | pass 100 | 101 | def validate( 102 | self, page: playwright.sync_api.Page, chat_messages: list[str] 103 | ) -> Tuple[float, bool, str, dict]: 104 | reward, done, msg, info = 0, False, "", {} 105 | 106 | for message in chat_messages: 107 | if message["role"] == "user" and message["message"] == "exit": 108 | done = True 109 | break 110 | 111 | return reward, done, msg, info 112 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/chat.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from pathlib import Path 3 | from typing import Literal 4 | import logging 5 | import playwright.sync_api 6 | import re 7 | import time 8 | 9 | from importlib import 
resources 10 | 11 | from . import _get_global_playwright, chat_files 12 | 13 | 14 | CHATBOX_DIR = resources.files(chat_files) 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class Chat: 20 | def __init__( 21 | self, headless: bool, chat_size=(500, 800), record_video_dir=None, modern=True 22 | ) -> None: 23 | self.messages = [] 24 | 25 | # create a new browser, browser context and page for the chat 26 | pw: playwright.sync_api.Playwright = _get_global_playwright() 27 | self.browser = pw.chromium.launch( 28 | headless=headless, args=[f"--window-size={chat_size[0]},{chat_size[1]}"] 29 | ) 30 | self.context = self.browser.new_context( 31 | no_viewport=True, 32 | record_video_dir=Path(record_video_dir) / "chat_video" if record_video_dir else None, 33 | record_video_size=dict(width=chat_size[0], height=chat_size[1]), 34 | ) 35 | self.page = self.context.new_page() 36 | self.recording_start_time = time.time() if record_video_dir else None 37 | 38 | # setup the chat page 39 | self.page.expose_function( 40 | "send_user_message", lambda msg: self._js_user_message_received_callback(msg=msg) 41 | ) 42 | 43 | if modern: 44 | self.page.set_content(get_chatbox_modern(CHATBOX_DIR)) 45 | else: 46 | self.page.set_content(get_chatbox_classic(CHATBOX_DIR)) 47 | 48 | def _js_user_message_received_callback(self, msg: str): 49 | """Callback function for when a user message is received in the chatbox""" 50 | utc_time = time.time() 51 | self.messages.append({"role": "user", "timestamp": utc_time, "message": msg}) 52 | # returning a list as JS doesnt like tuples 53 | return ["user", time.strftime("%H:%M", time.localtime(utc_time)), msg] 54 | 55 | def add_message( 56 | self, role: Literal["user", "user_image", "assistant", "info", "infeasible"], msg: str 57 | ): 58 | """Add a message to the chatbox and update the page accordingly.""" 59 | utc_time = time.time() 60 | if role not in ("user", "user_image", "assistant", "info", "infeasible"): 61 | raise ValueError(f"Invalid role: 
{role}") 62 | if role in ("user", "user_image", "assistant", "infeasible"): 63 | self.messages.append({"role": role, "timestamp": utc_time, "message": msg}) 64 | timestamp = time.strftime("%H:%M:%S", time.localtime(utc_time)) 65 | self.page.evaluate(f"addChatMessage({repr(role)}, {repr(timestamp)}, {repr(msg)});") 66 | 67 | def wait_for_user_message(self): 68 | logger.info("Waiting for message from user...") 69 | # reset flag 70 | self.page.evaluate("USER_MESSAGE_RECEIVED = false;") 71 | # wait for flag to be raised 72 | self.page.wait_for_function("USER_MESSAGE_RECEIVED", polling=100, timeout=0) 73 | logger.info("Message received.") 74 | 75 | def close(self): 76 | self.context.close() 77 | self.browser.close() 78 | 79 | 80 | def get_chatbox_modern(chatbox_dir) -> str: 81 | with open(chatbox_dir / "chatbox_modern.html", "r") as file: 82 | chatbox_html = file.read() 83 | 84 | return chatbox_html 85 | 86 | 87 | def get_chatbox_classic(chatbox_dir) -> str: 88 | with open(chatbox_dir / "chatbox.html", "r") as file: 89 | chatbox_html = file.read() 90 | with open(chatbox_dir / "assistant.png", "rb") as f: 91 | image_base64 = base64.b64encode(f.read()).decode("utf-8") 92 | 93 | assistant_image_url = f"data:image/png;base64,{image_base64}" 94 | chatbox_html = re.sub("", assistant_image_url, chatbox_html) 95 | return chatbox_html 96 | -------------------------------------------------------------------------------- /demo_agent/run_demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # locally defined agent 4 | from agent import DemoAgentArgs 5 | 6 | # browsergym experiments utils 7 | from browsergym.experiments import EnvArgs, ExpArgs, get_exp_result 8 | 9 | 10 | def str2bool(v): 11 | if isinstance(v, bool): 12 | return v 13 | if v.lower() in ("yes", "true", "t", "y", "1"): 14 | return True 15 | elif v.lower() in ("no", "false", "f", "n", "0"): 16 | return False 17 | else: 18 | raise 
argparse.ArgumentTypeError("Boolean value expected.") 19 | 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser(description="Run experiment with hyperparameters.") 23 | parser.add_argument( 24 | "--model_name", 25 | type=str, 26 | default="gpt-4o-mini", 27 | help="OpenAI model name.", 28 | ) 29 | parser.add_argument( 30 | "--task_name", 31 | type=str, 32 | default="openended", 33 | help="Name of the Browsergym task to run. If 'openended', you need to specify a 'start_url'", 34 | ) 35 | parser.add_argument( 36 | "--start_url", 37 | type=str, 38 | default="https://www.google.com", 39 | help="Starting URL (only for the openended task).", 40 | ) 41 | parser.add_argument( 42 | "--visual_effects", 43 | type=str2bool, 44 | default=True, 45 | help="Add visual effects when the agents performs actions.", 46 | ) 47 | parser.add_argument( 48 | "--use_html", 49 | type=str2bool, 50 | default=False, 51 | help="Use HTML in the agent's observation space.", 52 | ) 53 | parser.add_argument( 54 | "--use_axtree", 55 | type=str2bool, 56 | default=True, 57 | help="Use AXTree in the agent's observation space.", 58 | ) 59 | parser.add_argument( 60 | "--use_screenshot", 61 | type=str2bool, 62 | default=False, 63 | help="Use screenshot in the agent's observation space.", 64 | ) 65 | 66 | return parser.parse_args() 67 | 68 | 69 | def main(): 70 | print( 71 | """\ 72 | --- WARNING --- 73 | This is a basic agent for demo purposes. 74 | Visit AgentLab for more capable agents with advanced features. 
75 | https://github.com/ServiceNow/AgentLab""" 76 | ) 77 | 78 | args = parse_args() 79 | 80 | # setting up agent config 81 | agent_args = DemoAgentArgs( 82 | model_name=args.model_name, 83 | chat_mode=False, 84 | demo_mode="default" if args.visual_effects else "off", 85 | use_html=args.use_html, 86 | use_axtree=args.use_axtree, 87 | use_screenshot=args.use_screenshot, 88 | ) 89 | 90 | # setting up environment config 91 | env_args = EnvArgs( 92 | task_name=args.task_name, 93 | task_seed=None, 94 | max_steps=100, 95 | headless=False, # keep the browser open 96 | # viewport={"width": 1500, "height": 1280}, # can be played with if needed 97 | ) 98 | 99 | # for openended task, set environment and agent to interactive chat mode on a start url 100 | if args.task_name == "openended": 101 | agent_args.chat_mode = True 102 | env_args.wait_for_user_message = True 103 | env_args.task_kwargs = {"start_url": args.start_url} 104 | 105 | # setting up the experiment 106 | exp_args = ExpArgs( 107 | env_args=env_args, 108 | agent_args=agent_args, 109 | ) 110 | 111 | # running and logging results 112 | exp_args.prepare("./results") 113 | exp_args.run() 114 | 115 | # loading and printing results 116 | exp_result = get_exp_result(exp_args.exp_dir) 117 | exp_record = exp_result.get_exp_record() 118 | 119 | for key, val in exp_record.items(): 120 | print(f"{key}: {val}") 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/action/parsers.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import pyparsing as pp 3 | 4 | from dataclasses import dataclass 5 | from typing import Any 6 | 7 | 8 | @dataclass 9 | class NamedArgument: 10 | name: str 11 | value: Any 12 | 13 | def __repr__(self): 14 | return f"{self.name}={repr(self.value)}" 15 | 16 | 17 | def _build_highlevel_action_parser() -> pp.ParserElement: 18 | """ 
19 | Returns: 20 | An action parser that accepts Python-like function calls with string, number, list or dict literals as arguments. 21 | Example: 22 | func("a", 42, None, True, [2, 4, "s"], {"a_key": "a_value"}, ) 23 | The parser is loose and accepts multi-line or single-line combinations af calls. 24 | Example: 25 | func() func() 26 | \tfunc() 27 | Python comments are ignored. 28 | Example: 29 | # this is a comment 30 | func() # this function call will be parsed 31 | # func() # this one will not 32 | The parser will return a list of (function_name, function_args) tuples, one for each function call in the input. 33 | The parser will raise exceptions 34 | 35 | """ 36 | 37 | def make_keyword(kwd_str, kwd_value): 38 | return pp.Keyword(kwd_str).set_parse_action(pp.replace_with(kwd_value)) 39 | 40 | TRUE = make_keyword("True", True) 41 | FALSE = make_keyword("False", False) 42 | NONE = make_keyword("None", None) 43 | 44 | LBRACK, RBRACK, LBRACE, RBRACE, LPAREN, RPAREN, COLON = map(pp.Suppress, "[]{}():") 45 | 46 | def literal_eval(toks): 47 | return ast.literal_eval(toks[0]) 48 | 49 | string = pp.python_quoted_string().set_parse_action(literal_eval) 50 | number = pp.pyparsing_common.number() 51 | dict = pp.Forward().set_name("dict") # will be defined later 52 | list = pp.Forward().set_name("list") # will be defined later 53 | _tuple = pp.Forward().set_name("tuple") # will be defined later 54 | element = (string | number | dict | list | _tuple | TRUE | FALSE | NONE).set_name("element") 55 | 56 | list_items = pp.DelimitedList(element, allow_trailing_delim=True).set_name(None) 57 | list << pp.Group(LBRACK + pp.Optional(list_items) + RBRACK, aslist=True) 58 | _tuple << pp.Group(LPAREN + pp.Optional(list_items) + RPAREN, aslist=True).set_parse_action( 59 | lambda tokens: tuple(tokens[0]) 60 | ) 61 | 62 | dict_item = pp.Group(string + COLON + element, aslist=True).set_name("dict item") 63 | dict_items = pp.DelimitedList(dict_item, allow_trailing_delim=True).set_name(None) 
64 | dict << pp.Dict(LBRACE + pp.Optional(dict_items) + RBRACE, asdict=True) 65 | 66 | arg = element 67 | list_args = pp.DelimitedList(arg, allow_trailing_delim=True).set_name(None) 68 | named_arg = (pp.pyparsing_common.identifier() + pp.Literal("=") + element).set_parse_action( 69 | lambda tokens: NamedArgument(name=tokens[0], value=tokens[2]) 70 | ) 71 | list_named_args = pp.DelimitedList(named_arg, allow_trailing_delim=True).set_name(None) 72 | function_call = pp.pyparsing_common.identifier() + pp.Group( 73 | LPAREN + pp.Optional(list_args) + pp.Optional(list_named_args) + RPAREN, aslist=True 74 | ) 75 | 76 | multiple_function_calls = pp.DelimitedList(pp.Group(function_call), delim="") 77 | multiple_function_calls.ignore(pp.python_style_comment()) 78 | 79 | parser = multiple_function_calls 80 | 81 | return parser 82 | 83 | 84 | # this one will be used to extract python-like function calls 85 | highlevel_action_parser: pp.ParserElement = _build_highlevel_action_parser() 86 | 87 | # this one will be used to process the docstring in high-level actions, in order to describe the action space 88 | action_docstring_parser: pp.ParserElement = ( 89 | pp.Group(pp.OneOrMore(pp.Word(pp.printables), stop_on=pp.Literal("Examples:"))) 90 | + pp.Literal("Examples:").suppress() 91 | + pp.Group(highlevel_action_parser) 92 | ) 93 | -------------------------------------------------------------------------------- /browsergym/visualwebarena/src/browsergym/visualwebarena/instance.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import playwright.sync_api 5 | 6 | # we inherit some code base from webarena to avoid too much duplication 7 | from browsergym.webarena.instance import WebArenaInstance 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | ENV_VARS = ("SHOPPING", "REDDIT", "WIKIPEDIA", "HOMEPAGE", "CLASSIFIEDS", "CLASSIFIEDS_RESET_TOKEN") 13 | 14 | 15 | class 
VisualWebArenaInstance(WebArenaInstance): 16 | """ 17 | Utility class to access a VisualWebArena instance. 18 | 19 | """ 20 | 21 | RESET_URL_VAR = "VWA_FULL_RESET" # used by full_reset() 22 | 23 | def __init__( 24 | self, 25 | ) -> None: 26 | 27 | # setup visualwebarena environment variables (visualwebarena will read those on import) 28 | os.environ["DATASET"] = "visualwebarena" 29 | append_vwa = lambda x: f"VWA_{x}" 30 | for key in ENV_VARS: 31 | assert append_vwa(key) in os.environ, ( 32 | f"Environment variable {append_vwa(key)} missing.\n" 33 | + "Please set the following environment variables to use VisualWebArena through BrowserGym:\n" 34 | + "\n".join([append_vwa(x) for x in ENV_VARS]) 35 | ) 36 | os.environ[key] = os.environ[append_vwa(key)] 37 | 38 | # import visualwebarena on instantiation (it reads the env vars set above at import time) 39 | from visualwebarena.browser_env.env_config import ( 40 | ACCOUNTS, 41 | CLASSIFIEDS, 42 | CLASSIFIEDS_RESET_TOKEN, 43 | HOMEPAGE, 44 | REDDIT, 45 | SHOPPING, 46 | WIKIPEDIA, 47 | ) 48 | 49 | self.urls = { 50 | "reddit": REDDIT, 51 | "shopping": SHOPPING, 52 | "wikipedia": WIKIPEDIA, 53 | "classifieds": CLASSIFIEDS, 54 | } 55 | self.home_url = HOMEPAGE 56 | self.classifieds_reset_token = CLASSIFIEDS_RESET_TOKEN 57 | 58 | self.credentials = ACCOUNTS 59 | 60 | def ui_login(self, site: str, page: playwright.sync_api.Page): 61 | """ 62 | Should only be called once per site (expects user to be logged out).
63 | """ 64 | 65 | url = self.urls[site] 66 | 67 | # open a new page (tab) to perform the login 68 | page = page.context.new_page() 69 | 70 | match site: 71 | case "reddit": 72 | username = self.credentials[site]["username"] 73 | password = self.credentials[site]["password"] 74 | page.goto(f"{url}") 75 | page.get_by_role("link", name="Log in").click() 76 | page.get_by_label("Username").fill(username) 77 | page.get_by_label("Password").fill(password) 78 | page.get_by_role("button", name="Log in").click() 79 | case "shopping": 80 | username = self.credentials[site]["username"] 81 | password = self.credentials[site]["password"] 82 | 83 | page.goto(f"{url}/customer/account/login/") 84 | page.get_by_label("Email", exact=True).fill(username) 85 | page.get_by_label("Password", exact=True).fill(password) 86 | page.get_by_role("button", name="Sign In").click() 87 | 88 | case "wikipedia": 89 | page.goto(url) 90 | 91 | case "classifieds": 92 | username = self.credentials[site]["username"] 93 | password = self.credentials[site]["password"] 94 | page.goto(f"{url}/index.php?page=login") 95 | page.locator("#email").fill(username) 96 | page.locator("#password").fill(password) 97 | page.get_by_role("button", name="Log in").click() 98 | 99 | case _: 100 | raise ValueError 101 | 102 | # release login page 103 | page.close() 104 | -------------------------------------------------------------------------------- /browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluator.py: -------------------------------------------------------------------------------- 1 | # todo export evaluation to a python package 2 | 3 | import json 4 | 5 | import numpy as np 6 | 7 | from .evaluate_utils.evaluate_factory import get_evaluator 8 | 9 | 10 | def find_isnan(samp): 11 | try: 12 | if np.isnan(samp): 13 | return True 14 | else: 15 | return False 16 | except: 17 | return False 18 | 19 | 20 | def fix_ans(answer): 21 | try: 22 | answer = ( 23 | answer.replace("{'", '{"') 24 | .replace("', 
'", '", "') 25 | .replace("': '", '": "') 26 | .replace("'}", '"}') 27 | ) 28 | answer = answer.replace("': ", '": ') 29 | return answer 30 | except: 31 | return answer 32 | 33 | 34 | def parse_answer(answer): 35 | if len(answer) == 1: 36 | ans, is_num = fix_number(answer[0]) 37 | if is_num: 38 | return ans, "number" 39 | try: 40 | ans = json.loads(fix_ans(answer[0])) 41 | return [ans], "json" 42 | except: 43 | ans, is_num = fix_number(answer[0]) 44 | if is_num: 45 | return ans, "number" 46 | else: 47 | return answer[0], "string" 48 | else: 49 | try: 50 | ans = [json.loads(fix_ans(ex)) for ex in answer] 51 | return ans, "json" 52 | except: 53 | return answer, "string list" 54 | 55 | 56 | def fix_number(number): 57 | if type(number) == str: 58 | copy_ans = number 59 | copy_ans = " ".join( 60 | " ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft") 61 | ).strip() 62 | copy_ans = copy_ans.strip() 63 | copy_ans = copy_ans.replace(",", ".").replace(" square kilometers", "") 64 | try: 65 | return float(copy_ans), True 66 | except: 67 | return number, False 68 | elif type(number) == int: 69 | return float(number), True 70 | else: 71 | return number, True 72 | 73 | 74 | def fix_prediction(prediction, gold_answer, evaluator): 75 | if ( 76 | type(prediction) == list 77 | and len(prediction) == 1 78 | and ( 79 | type(prediction[0]) == int 80 | or ((type(prediction[0]) == str) and prediction[0].isnumeric()) 81 | ) 82 | ): 83 | prediction = fix_number(prediction[0]) 84 | 85 | if type(prediction) != list: 86 | prediction, is_num = fix_number(prediction) 87 | if evaluator == "json": 88 | try: 89 | prediction = [json.loads(pred) for pred in prediction.split("\n")] 90 | except: 91 | prediction = [prediction] 92 | 93 | if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0): 94 | return prediction, False 95 | 96 | if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float: 97 | return prediction, False 98 | 99 | return prediction, 
True 100 | 101 | 102 | def question_scorer(prediction, gold_answer): 103 | try: 104 | prediction = json.loads(prediction) 105 | except: 106 | prediction = prediction 107 | 108 | answer_list = ( 109 | [x for x in gold_answer.split("\n") if len(x.strip()) > 0] 110 | if type(gold_answer) != list 111 | else gold_answer 112 | ) 113 | gold_answer, evaluator = parse_answer(answer_list) 114 | prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator) 115 | 116 | has_ans = 1.0 117 | if (type(prediction) != float and len(prediction) == 0) or find_isnan(prediction): 118 | has_ans = 0.0 119 | 120 | if type(prediction) == list: 121 | if all( 122 | (type(pred) not in {float, int} and len(pred) == 0) or find_isnan(pred) 123 | for pred in prediction 124 | ): 125 | has_ans = 0 126 | 127 | if not run_eval: 128 | return 0.0, has_ans 129 | 130 | metric_eval = get_evaluator(evaluator) 131 | accuracy = metric_eval(prediction, gold_answer) 132 | return accuracy, has_ans 133 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/action/python.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .base import AbstractActionSet 4 | 5 | 6 | class PythonActionSet(AbstractActionSet): 7 | def describe(self, with_long_description: bool = True, with_examples: bool = True): 8 | """ 9 | Returns a textual description of this action space. 10 | """ 11 | description = f""" 12 | Each action consists of executable Python code (python>=3.10) that uses the Playwright library (playwright==1.32) 13 | to interact with the current webpage and the browser context. The currently active webpage is accessible via the 14 | global variable `page`. 
A function `send_message_to_user(text)` is also accessible and can be used to send a 15 | message to the user, as well as a function `report_infeasible_instructions(reason)` to notify the user when their 16 | instructions are infeasible.""" 17 | if with_long_description: 18 | description += f""" 19 | The browser context is in `page.context`, and all open webpages (tabs and popups) 20 | are in `page.context.pages`. Here is is an example of a valid action: 21 | ``` 22 | frame = page.frame_locator(".result-frame") 23 | button = frame.get_by_text("Submit") 24 | button.click() 25 | ``` 26 | Here is another example: 27 | ``` 28 | frame = page.get_by_test_id("a").frame_locator(":scope") 29 | frame.get_by_test_id("a776").click() 30 | ``` 31 | Note that Playwright's `get_by_test_id()` method is configured to use the `bid` attribute to locate HTML elements, 32 | instead of the default `data-testid`. Also, Playwright's locators can not traverse iframes, so you have to locate 33 | parent iframes first in order to locate an element in an iframe. The `bid` attribute contains all the information 34 | required to recursively locate an element. 
For example, an element with `bid="ac2"` can be retrieved as follows: 35 | ``` 36 | frame = page.get_by_test_id("a").frame_locator(":scope") 37 | frame = frame.get_by_test_id("ac").frame_locator(":scope") 38 | elem = frame.get_by_test_id("ac2") 39 | ``` 40 | """ 41 | else: 42 | description += f"""\ 43 | 44 | """ 45 | if with_examples: 46 | description += f"""\ 47 | Here are other examples of valid actions: 48 | ``` 49 | page = page.context.new_page() 50 | page.goto("https://www.wikipedia.org/") 51 | ``` 52 | ``` 53 | page.get_by_label("Birth date").fill("2020-02-02") 54 | page.get_by_role("link", name="Get started").click() 55 | ``` 56 | ``` 57 | page.get_by_label('I agree to the terms above').check() 58 | ``` 59 | ``` 60 | page.locator('#area').fill('Hello World!') 61 | ``` 62 | ``` 63 | page.get_by_role("textbox").press("Control+ArrowRight") 64 | ``` 65 | ``` 66 | send_message_to_user("There are 7 items to choose from.") 67 | ``` 68 | ``` 69 | report_infeasible_instructions("I cannot follow these instructions because there is no email field in this form.") 70 | ``` 71 | """ 72 | 73 | return description 74 | 75 | def example_action(self, abstract: bool) -> str: 76 | """ 77 | Returns an example action as a string. 78 | """ 79 | if abstract: 80 | return """\ 81 | One single bloc of Python code. Do not include any explanation, only valid Python code.""" 82 | else: 83 | return """\ 84 | frame = page.get_by_test_id("b").frame_locator(":scope") 85 | frame = page.get_by_test_id("ba").frame_locator(":scope") 86 | frame.get_by_test_id("ba2").fill("Hello world!") 87 | frame.get_by_test_id("ba3").click() 88 | """ 89 | 90 | def to_python_code(self, action): 91 | """ 92 | Converts the given code action string to browsergym-compatible playwright code. 93 | 94 | Args: 95 | action: the code action to parse. 96 | 97 | Returns: 98 | Executable playwright code that performs the action in a browsergym environment. 
99 | """ 100 | 101 | python_code = "" 102 | 103 | # extract markdown-style code snippets if detected 104 | pattern = re.compile(r"```(?:python)?\n(?P[\s\S]*?)```") 105 | if pattern.match(action): 106 | python_code += "\n".join([match.group("code") for match in pattern.finditer(action)]) 107 | # otherwise just use the code action as is 108 | else: 109 | python_code += action 110 | 111 | # return the produced playwright code 112 | return python_code 113 | -------------------------------------------------------------------------------- /browsergym/miniwob/src/browsergym/miniwob/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from browsergym.core.registration import register_task 4 | 5 | from . import all 6 | 7 | 8 | def environment_variables_precheck(): 9 | assert os.environ.get( 10 | "MINIWOB_URL", None 11 | ), "Environment variable MINIWOB_URL has not been setup." 12 | 13 | 14 | ALL_MINIWOB_TASKS = [ 15 | all.AscendingNumbersTask, 16 | all.BisectAngleTask, 17 | all.BookFlightTask, 18 | all.BookFlightNodelayTask, 19 | all.BuyTicketTask, 20 | all.ChooseDateTask, 21 | all.ChooseDateEasyTask, 22 | all.ChooseDateMediumTask, 23 | all.ChooseDateNodelayTask, 24 | all.ChooseListTask, 25 | all.CircleCenterTask, 26 | all.ClickButtonTask, 27 | all.ClickButtonSequenceTask, 28 | all.ClickCheckboxesTask, 29 | all.ClickCheckboxesLargeTask, 30 | all.ClickCheckboxesSoftTask, 31 | all.ClickCheckboxesTransferTask, 32 | all.ClickCollapsibleTask, 33 | all.ClickCollapsible2Task, 34 | all.ClickCollapsible2NodelayTask, 35 | all.ClickCollapsibleNodelayTask, 36 | all.ClickColorTask, 37 | all.ClickDialogTask, 38 | all.ClickDialog2Task, 39 | all.ClickLinkTask, 40 | all.ClickMenuTask, 41 | all.ClickMenu2Task, 42 | all.ClickOptionTask, 43 | all.ClickPieTask, 44 | all.ClickPieNodelayTask, 45 | all.ClickScrollListTask, 46 | all.ClickShadesTask, 47 | all.ClickShapeTask, 48 | all.ClickTabTask, 49 | all.ClickTab2Task, 50 | 
all.ClickTab2EasyTask, 51 | all.ClickTab2HardTask, 52 | all.ClickTab2MediumTask, 53 | all.ClickTestTask, 54 | all.ClickTest2Task, 55 | all.ClickTestTransferTask, 56 | all.ClickWidgetTask, 57 | all.CopyPasteTask, 58 | all.CopyPaste2Task, 59 | all.CountShapeTask, 60 | all.CountSidesTask, 61 | all.DailyCalendarTask, 62 | all.DragBoxTask, 63 | all.DragCircleTask, 64 | all.DragCubeTask, 65 | all.DragItemsTask, 66 | all.DragItemsGridTask, 67 | all.DragShapesTask, 68 | all.DragShapes2Task, 69 | all.DragSingleShapeTask, 70 | all.DragSortNumbersTask, 71 | all.DrawCircleTask, 72 | all.DrawLineTask, 73 | all.EmailInboxTask, 74 | all.EmailInboxDeleteTask, 75 | all.EmailInboxForwardTask, 76 | all.EmailInboxForwardNlTask, 77 | all.EmailInboxForwardNlTurkTask, 78 | all.EmailInboxImportantTask, 79 | all.EmailInboxNlTurkTask, 80 | all.EmailInboxNoscrollTask, 81 | all.EmailInboxReplyTask, 82 | all.EmailInboxStarReplyTask, 83 | all.EnterDateTask, 84 | all.EnterPasswordTask, 85 | all.EnterTextTask, 86 | all.EnterText2Task, 87 | all.EnterTextDynamicTask, 88 | all.EnterTimeTask, 89 | all.FindGreatestTask, 90 | all.FindMidpointTask, 91 | all.FindWordTask, 92 | all.FocusTextTask, 93 | all.FocusText2Task, 94 | all.FormSequenceTask, 95 | all.FormSequence2Task, 96 | all.FormSequence3Task, 97 | all.GenerateNumberTask, 98 | all.GridCoordinateTask, 99 | all.GuessNumberTask, 100 | all.HighlightTextTask, 101 | all.HighlightText2Task, 102 | all.HotColdTask, 103 | all.IdentifyShapeTask, 104 | all.LoginUserTask, 105 | all.LoginUserPopupTask, 106 | all.MultiLayoutsTask, 107 | all.MultiOrderingsTask, 108 | all.NavigateTreeTask, 109 | all.NumberCheckboxesTask, 110 | all.OddOrEvenTask, 111 | all.OrderFoodTask, 112 | all.PhoneBookTask, 113 | all.ReadTableTask, 114 | all.ReadTable2Task, 115 | all.ResizeTextareaTask, 116 | all.RightAngleTask, 117 | all.ScrollTextTask, 118 | all.ScrollText2Task, 119 | all.SearchEngineTask, 120 | all.SignAgreementTask, 121 | all.SimpleAlgebraTask, 122 | 
all.SimpleArithmeticTask, 123 | all.SocialMediaTask, 124 | all.SocialMediaAllTask, 125 | all.SocialMediaSomeTask, 126 | all.StockMarketTask, 127 | all.TerminalTask, 128 | all.TextEditorTask, 129 | all.TextTransformTask, 130 | all.TicTacToeTask, 131 | all.UnicodeTestTask, 132 | all.UseAutocompleteTask, 133 | all.UseAutocompleteNodelayTask, 134 | all.UseColorwheelTask, 135 | all.UseColorwheel2Task, 136 | all.UseSliderTask, 137 | all.UseSlider2Task, 138 | all.UseSpinnerTask, 139 | all.VisualAdditionTask, 140 | ] 141 | 142 | # register the Miniwob benchmark 143 | for task in ALL_MINIWOB_TASKS: 144 | register_task( 145 | task.get_task_id(), 146 | task, 147 | nondeterministic=task.nondeterministic, 148 | ) 149 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish 2 | # based on official doc 3 | # https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ 4 | 5 | on: [push, workflow_dispatch] 6 | 7 | jobs: 8 | build: 9 | name: Build 10 | runs-on: ubuntu-22.04 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.x" 19 | 20 | - name: Install pypa/build 21 | run: python3 -m pip install build --user 22 | 23 | - name: Build a binary wheel and a source tarball (browsergym-core) 24 | run: python3 -m build browsergym/core/ --outdir dist/ 25 | 26 | - name: Build a binary wheel and a source tarball (browsergym-miniwob) 27 | run: python3 -m build browsergym/miniwob/ --outdir dist/ 28 | 29 | - name: Build a binary wheel and a source tarball (browsergym-webarena) 30 | run: python3 -m build browsergym/webarena/ --outdir dist/ 31 | - name: Build a binary wheel and a source tarball (browsergym-webarenalite) 32 | run: python3 -m build browsergym/webarenalite/ 
--outdir dist/ 33 | - name: Build a binary wheel and a source tarball (browsergym-visualwebarena) 34 | run: python3 -m build browsergym/visualwebarena/ --outdir dist/ 35 | 36 | - name: Build a binary wheel and a source tarball (browsergym-assistantbench) 37 | run: python3 -m build browsergym/assistantbench/ --outdir dist/ 38 | 39 | - name: Build a binary wheel and a source tarball (browsergym-experiments) 40 | run: python3 -m build browsergym/experiments/ --outdir dist/ 41 | 42 | - name: Build a binary wheel and a source tarball (browsergym) 43 | run: python3 -m build browsergym/ --outdir dist/ 44 | 45 | - name: Store the distribution packages 46 | uses: actions/upload-artifact@v4 47 | with: 48 | name: python-package-distributions 49 | path: dist/ 50 | 51 | publish-to-pypi: 52 | name: Publish to PyPI 53 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 54 | needs: 55 | - build 56 | runs-on: ubuntu-22.04 57 | environment: pypi 58 | permissions: 59 | id-token: write # IMPORTANT: mandatory for trusted publishing 60 | 61 | steps: 62 | - name: Download all the distribution packages 63 | uses: actions/download-artifact@v4 64 | with: 65 | name: python-package-distributions 66 | path: dist/ 67 | 68 | - name: Publish all distribution packages to PyPI 69 | uses: pypa/gh-action-pypi-publish@release/v1 70 | 71 | github-release: 72 | name: Create GitHub Release 73 | if: startsWith(github.ref, 'refs/tags/') 74 | needs: 75 | - publish-to-pypi 76 | runs-on: ubuntu-22.04 77 | permissions: 78 | contents: write 79 | 80 | steps: 81 | - name: Checkout code 82 | uses: actions/checkout@v4 83 | 84 | - name: Download all the dists 85 | uses: actions/download-artifact@v4 86 | with: 87 | name: python-package-distributions 88 | path: dist/ 89 | 90 | - name: Create GitHub Release 91 | env: 92 | GITHUB_TOKEN: ${{ github.token }} 93 | run: | 94 | gh release create '${{ github.ref_name }}' \ 95 | dist/* \ 96 | --repo '${{ github.repository }}' \ 97 | --title "Release 
${{ github.ref_name }}" \ 98 | --generate-notes 99 | 100 | - name: Set as pre-release if dev version 101 | if: contains(github.ref, '.dev') 102 | env: 103 | GITHUB_TOKEN: ${{ github.token }} 104 | run: | 105 | gh release edit '${{ github.ref_name }}' \ 106 | --repo '${{ github.repository }}' \ 107 | --prerelease 108 | 109 | # publish-to-testpypi: 110 | # name: Publish to TestPyPI 111 | # needs: 112 | # - build 113 | # runs-on: ubuntu-latest 114 | # environment: testpypi 115 | # permissions: 116 | # id-token: write # IMPORTANT: mandatory for trusted publishing 117 | 118 | # steps: 119 | # - name: Download all the distribution packages 120 | # uses: actions/download-artifact@v4 121 | # with: 122 | # name: python-package-distributions 123 | # path: dist/ 124 | 125 | # - name: Publish distribution packages to TestPyPI 126 | # uses: pypa/gh-action-pypi-publish@release/v1 127 | # with: 128 | # repository-url: https://test.pypi.org/legacy/ 129 | -------------------------------------------------------------------------------- /docs/src/examples/create_custom_task.rst: -------------------------------------------------------------------------------- 1 | Creating a custom task 2 | ______________________ 3 | 4 | Creating a custom task in `BrowserGym` can be done easily by inheriting from the `AbstractBrowserTask` class. 5 | 6 | 7 | Let's start with an example, we will build a task that starts from the Google Search page and asks for the Eiffel Tower Wikipedia page. 8 | 9 | * **Goal**: Search for 'Eiffel Tower' Wikipedia page. 10 | 11 | * **Reward**: Gets reward = 1 if reaches the expected Wikipedia page on Eiffel Tower, else gets 0. 12 | 13 | .. 
code-block:: python 14 | 15 | from typing import Tuple 16 | 17 | import playwright.sync_api 18 | from browsergym.core.task import AbstractBrowserTask 19 | 20 | 21 | class SampleTask(AbstractBrowserTask): 22 | def __init__(self, seed: int) -> None: 23 | super().__init__(seed) 24 | 25 | @classmethod 26 | def get_task_id(cls): 27 | return "sample_task" 28 | 29 | 30 | First, let's setup the task. To do this, we need to implement the `setup()` function. The starting point is *https://www.google.com* page and the goal is *"Search for 'Eiffel Tower' Wikipedia page"*. 31 | 32 | .. code-block:: python 33 | 34 | class SampleTask(AbstractBrowserTask): 35 | # ... 36 | # Code above 37 | # ... 38 | 39 | def setup(self, page: playwright.sync_api.Page) -> Tuple[str, dict]: 40 | """Set up everything needed to execute the task.""" 41 | page.goto("https://www.google.com", timeout=10000) 42 | goal = "Search for 'Eiffel Tower' Wikipedia page." 43 | info = {} 44 | return goal, info 45 | 46 | 47 | Next, we need to compute a reward. For this, we'll implement our validation criteria in the `validate()` function. In our case, we consider the task being completed when the Eiffel Tower Wikipedia page is reached. Else it's a fail. 48 | 49 | .. code-block:: python 50 | 51 | class SampleTask(AbstractBrowserTask): 52 | # ... 53 | # Code above 54 | # ... 55 | 56 | def validate( 57 | self, page: playwright.sync_api.Page, chat_messages: list[str] 58 | ) -> Tuple[float, bool, str, dict]: 59 | """Compute reward based on reaching final URL.""" 60 | if page.url == "https://en.wikipedia.org/wiki/Eiffel_Tower": 61 | return 1.0, True, "Task completed", {} 62 | else: 63 | return 0.0, False, "", {} 64 | 65 | 66 | We can also implement the code for completing the task, it's an oracle (a.k.a. cheat) version. For this, we'll fill out the `cheat()` function. 67 | 68 | .. code-block:: python 69 | 70 | class SampleTask(AbstractBrowserTask): 71 | # ... 72 | # Code above 73 | # ... 
74 | 75 | def cheat(self, page: playwright.sync_api.Page, chat_messages: list[str]) -> None: 76 | """Solve the task in a single step using a hard-coded Playwright solution.""" 77 | page.get_by_text("Search").fill("Eiffel Tower") 78 | page.get_by_text("Google Search").click() 79 | page.get_by_text("Eiffel Tower - Wikipedia").click() 80 | 81 | 82 | Finally, the `teardown()` function. This function allows cleaning up resources before closing the environment. In our case, nothing needs to be done, so we will leave it empty. 83 | 84 | .. code-block:: python 85 | 86 | class SampleTask(AbstractBrowserTask): 87 | # ... 88 | # Code above 89 | # ... 90 | 91 | def teardown(self) -> None: 92 | # Nothing to do for this task. 93 | pass 94 | 95 | 96 | Our folder structure should look like the following: 97 | 98 | .. code-block:: bash 99 | 100 | . 101 | |── tasks 102 | | ├── __init__.py 103 | | └── sample_task.py 104 | ├── run_task.py 105 | 106 | 107 | Now we should register the task in the gym environment using the following code in the `__init__.py` of your package: 108 | 109 | .. code-block:: python 110 | 111 | from browsergym.core.registration import register_task 112 | 113 | from .sample_task import SampleTask 114 | 115 | register_task(id=SampleTask.get_task_id(), task_class=SampleTask) 116 | 117 | 118 | Now that the task is registered it can be called via this code that you can put in `run_task.py` file: 119 | 120 | ..
code-block:: python 121 | 122 | import gymnasium as gym 123 | import tasks # will register the gym environment 124 | 125 | env = gym.make("browsergym/sample_task") 126 | obs, info = env.reset() 127 | done = False 128 | 129 | while not done: 130 | action = "noop()" 131 | obs, reward, terminated, truncated, info = env.step(action) 132 | done = terminated or truncated 133 | print(f"Reward: {reward}, Done: {done}, Info: {info}") 134 | 135 | -------------------------------------------------------------------------------- /browsergym/core/src/browsergym/core/spaces.py: -------------------------------------------------------------------------------- 1 | """Borrowed from https://github.com/Farama-Foundation/miniwob-plusplus/blob/553daee55ea0b2cc32b181a474083ab4cad782a1/miniwob/spaces.py""" 2 | 3 | from typing import Any 4 | 5 | import numpy as np 6 | from gymnasium.spaces import Space 7 | from numpy.typing import NDArray 8 | 9 | 10 | class Unicode(Space): 11 | """ 12 | A space representing a unicode string. 13 | """ 14 | 15 | def __init__(self): 16 | super().__init__() 17 | 18 | def contains(self, x: Any) -> bool: 19 | """Return boolean specifying if x is a valid member of this space.""" 20 | # Do not check the character set. 21 | return isinstance(x, str) 22 | 23 | def __repr__(self) -> str: 24 | """Gives a string representation of this space.""" 25 | return f"Unicode()" 26 | 27 | def __eq__(self, other: Any) -> bool: 28 | """Check whether ``other`` is equivalent to this instance.""" 29 | return isinstance(other, Unicode) 30 | 31 | 32 | class Float(Space): 33 | """ 34 | A space representing a float.
35 | """ 36 | 37 | def __init__(self): 38 | super().__init__() 39 | 40 | def contains(self, x: Any) -> bool: 41 | """Return boolean specifying if x is a valid member of this space.""" 42 | return isinstance(x, float) 43 | 44 | def __repr__(self) -> str: 45 | """Gives a string representation of this space.""" 46 | return f"Float()" 47 | 48 | def __eq__(self, other: Any) -> bool: 49 | """Check whether ``other`` is equivalent to this instance.""" 50 | return isinstance(other, Float) 51 | 52 | 53 | class Integer(Space): 54 | """ 55 | A space representing an integer. 56 | """ 57 | 58 | def __init__(self): 59 | super().__init__() 60 | 61 | def contains(self, x: Any) -> bool: 62 | """Return boolean specifying if x is a valid member of this space.""" 63 | return isinstance(x, int) 64 | 65 | def __repr__(self) -> str: 66 | """Gives a string representation of this space.""" 67 | return f"Integer()" 68 | 69 | def __eq__(self, other: Any) -> bool: 70 | """Check whether ``other`` is equivalent to this instance.""" 71 | return isinstance(other, Integer) 72 | 73 | 74 | class AnyDict(Space): 75 | """A space representing an arbitrary dictionary object.""" 76 | 77 | def contains(self, x: Any) -> bool: 78 | """Return boolean specifying if x is a valid member of this space.""" 79 | # Do not check anything specific. 
80 | return isinstance(x, dict) 81 | 82 | def __repr__(self) -> str: 83 | """Gives a string representation of this space.""" 84 | return f"AnyDict()" 85 | 86 | def __eq__(self, other: Any) -> bool: 87 | """Check whether ``other`` is equivalent to this instance.""" 88 | return isinstance(other, AnyDict) 89 | 90 | 91 | class Anything(Space): 92 | """A space representing an arbitrary dictionary object.""" 93 | 94 | def contains(self, x: Any) -> bool: 95 | return True 96 | 97 | def __repr__(self) -> str: 98 | return f"Anything()" 99 | 100 | def __eq__(self, other: Any) -> bool: 101 | return isinstance(other, Anything) 102 | 103 | 104 | class AnyBox(Space[NDArray[Any]]): 105 | """A space representing an arbitrary dictionary object.""" 106 | 107 | def __init__(self, low, high, shape, dtype): 108 | super().__init__(shape, dtype) 109 | self.low = low 110 | self.high = high 111 | 112 | def contains(self, x: Any) -> bool: 113 | """Return boolean specifying if x is a valid member of this space.""" 114 | if not isinstance(x, np.ndarray): 115 | try: 116 | x = np.asarray(x, dtype=self.dtype) 117 | except (ValueError, TypeError): 118 | return False 119 | 120 | return bool( 121 | np.can_cast(x.dtype, self.dtype) 122 | and len(x.shape) == len(self.shape) 123 | and all([dim in (xdim, -1) for xdim, dim in zip(x.shape, self.shape)]) 124 | and np.all(x >= self.low) 125 | and np.all(x <= self.high) 126 | ) 127 | 128 | def __repr__(self) -> str: 129 | """Gives a string representation of this space.""" 130 | return f"AnyBox(low={repr(self.low)}, high={repr(self.high)}, shape={repr(self.shape)}, dtype={repr(self.dtype)})" 131 | 132 | def __eq__(self, other: Any) -> bool: 133 | """Check whether ``other`` is equivalent to this instance.""" 134 | return ( 135 | isinstance(other, AnyBox) 136 | and self.low == other.low 137 | and self.high == other.high 138 | and self.shape == other.shape 139 | and self.dtype == other.dtype 140 | ) 141 | 
-------------------------------------------------------------------------------- /browsergym/webarenalite/src/browsergym/webarenalite/task.py: -------------------------------------------------------------------------------- 1 | import importlib.resources 2 | import json 3 | import logging 4 | import tempfile 5 | from typing import Optional 6 | 7 | import playwright.sync_api 8 | 9 | from browsergym.webarena.task import GenericWebArenaTask 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class WebArenaLiteTask(GenericWebArenaTask): 16 | """ 17 | Base class for all WebArena tasks. 18 | 19 | """ 20 | 21 | def __init__( 22 | self, 23 | seed: int, 24 | task_id: Optional[int] = None, 25 | intent_template_id: Optional[int] = None, 26 | with_na_hint: bool = False, 27 | with_homepage_hint: bool = False, 28 | ): 29 | super().__init__( 30 | seed=seed, 31 | task_id=task_id, 32 | intent_template_id=intent_template_id, 33 | with_na_hint=with_na_hint, 34 | with_homepage_hint=with_homepage_hint, 35 | ) 36 | 37 | all_configs_str = ( 38 | importlib.resources.files("browsergym.webarenalite") 39 | .joinpath("test_webarena_lite.raw.json") 40 | .read_text() 41 | ) 42 | # substitute URLs 43 | for pattern, url_key in { 44 | "__GITLAB__": "gitlab", 45 | "__REDDIT__": "reddit", 46 | "__SHOPPING__": "shopping", 47 | "__SHOPPING_ADMIN__": "shopping_admin", 48 | "__WIKIPEDIA__": "wikipedia", 49 | "__MAP__": "map", 50 | }.items(): 51 | all_configs_str = all_configs_str.replace(pattern, self.webarena_instance.urls[url_key]) 52 | 53 | # load all task configs to JSON 54 | all_configs = json.loads(all_configs_str) 55 | 56 | # keep only the desired task configs 57 | if intent_template_id is not None: 58 | task_configs = [ 59 | conf for conf in all_configs if conf["intent_template_id"] == intent_template_id 60 | ] 61 | if not task_configs: 62 | raise ValueError( 63 | f"Could not find any task config with intent_template_id={intent_template_id}." 
64 | ) 65 | 66 | elif task_id is not None: 67 | # use old_task_id to filter configs 68 | task_configs = [conf for conf in all_configs if conf["old_task_id"] == task_id] 69 | if not task_configs: 70 | raise ValueError(f"Could not find any task config with old_task_id={task_id}.") 71 | 72 | self.task_configs = task_configs 73 | 74 | def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: 75 | # Using the custom evaluator for WebArena Lite 76 | from .evaluators import evaluator_router 77 | 78 | # pick a task at random 79 | self.config = self.random.choice(self.task_configs) 80 | 81 | # hack: dynamically build a config file to read from 82 | with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f: 83 | json.dump(self.config, f) 84 | f.flush() 85 | self.config_file = f.name 86 | 87 | # build the evaluator 88 | self.evaluator = evaluator_router(self.config_file) 89 | 90 | # authenticate 91 | for site in self.config["sites"]: 92 | self.webarena_instance.ui_login(site=site, page=page) 93 | 94 | # set geolocation 95 | page.context.set_geolocation(self.config["geolocation"]) 96 | 97 | # navigate to the starting url(s) (might need several pages) 98 | # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/browser_env/envs.py#L150 99 | if self.config["start_url"]: 100 | start_urls = self.config["start_url"].split(" |AND| ") 101 | for i, url in enumerate(start_urls): 102 | page.goto(url) 103 | if i < len(start_urls) - 1: 104 | page = page.context.new_page() 105 | 106 | # recover goal 107 | goal = self.config["intent"] 108 | 109 | # This note is present in all webarena's agent prompts 110 | # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/agent/prompts/raw/p_cot_id_actree_2s.py#L34 111 | if self.with_homepage_hint: 112 | goal += f""" 113 | 114 | (Note: if you want to visit other websites, check out the homepage at {self.webarena_instance.home_url}. It has a list of websites you can visit. 
{self.webarena_instance.home_url}/password.html lists all the account name and password for the websites. You can use them to log in to the websites.) 115 | """ 116 | 117 | # This note is present in some of webarena's agent prompts 118 | if self.with_na_hint: 119 | goal += """\ 120 | 121 | If you believe the task is impossible to complete, provide the answer "N/A". 122 | """ 123 | 124 | return goal, {} 125 | -------------------------------------------------------------------------------- /browsergym/experiments/src/browsergym/experiments/agent.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass, field 3 | from typing import Any 4 | 5 | from browsergym.core.action.base import AbstractActionSet 6 | from browsergym.core.action.highlevel import HighLevelActionSet 7 | from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html 8 | 9 | 10 | def default_obs_preprocessor(obs: dict) -> dict: 11 | obs = obs.copy() # shallow copy to avoid modifying the original dict 12 | # augment the observation with text versions of the DOM and AXTree 13 | obs["dom_txt"] = flatten_dom_to_str(obs["dom_object"]) 14 | obs["axtree_txt"] = flatten_axtree_to_str(obs["axtree_object"]) 15 | obs["pruned_html"] = prune_html(obs["dom_txt"]) 16 | # remove raw entries that the agent won't use, and we don't want to record 17 | del obs["dom_object"] 18 | del obs["axtree_object"] 19 | return obs 20 | 21 | 22 | DEFAULT_ACTION_SET: AbstractActionSet = HighLevelActionSet() 23 | DEFAULT_OBS_PREPROCESSOR: callable = default_obs_preprocessor 24 | 25 | 26 | @dataclass 27 | class AgentInfo: 28 | think: str = None 29 | chat_messages: list = None 30 | stats: dict = field(default_factory=dict) 31 | markdown_page: str = "" 32 | html_page: str = "" 33 | extra_info: dict = None 34 | 35 | def __getitem__(self, key): 36 | return getattr(self, key) 37 | 38 | def __contains__(self, key): 
39 | return hasattr(self, key) 40 | 41 | def pop(self, key, default=None): 42 | return getattr(self, key, default) 43 | 44 | def get(self, key, default=None): 45 | return getattr(self, key, default) 46 | 47 | 48 | class Agent(ABC): 49 | """ 50 | A template class that defines the required signature of an agent interacting 51 | with a browsergym environment 52 | 53 | Attributes: 54 | action_set: AbstractActionSet 55 | Defines the set of actions that the agent can take in the environment. 56 | This property is meant to be overloaded by your agent (optional). 57 | By default, uses BrowserGym's high-level action set. 58 | """ 59 | 60 | action_set: AbstractActionSet = DEFAULT_ACTION_SET 61 | 62 | def obs_preprocessor(self, obs: dict) -> Any: 63 | """ 64 | Function that pre-processes observations before feeding them to `get_action()`. 65 | This property is meant to be overloaded by your agent (optional). 66 | By default, the base observation is augmented with text versions of the DOM and AXTREE. 67 | 68 | Why this mapping? This mapping will happen within the experiment loop, so that the 69 | resulting observation gets recorded in the execution traces, and statistics can be computed from it. 70 | """ 71 | return DEFAULT_OBS_PREPROCESSOR(obs) 72 | 73 | @abstractmethod 74 | def get_action(self, obs: Any) -> tuple[str, AgentInfo]: 75 | """ 76 | Updates the agent with the current observation, and returns its next action (plus an info dict, optional). 77 | 78 | Parameters: 79 | ----------- 80 | obs: 81 | The current observation of the environment, after it has been processed by `obs_preprocessor()`. 82 | By default, a BrowserGym observation is a dict with the following entries: 83 | - "chat_messages": list[str], messages between the agent and the user. 84 | - "goal": str, the current goal. 85 | - "open_pages_urls": list[str], open pages. 86 | - "active_page_index": int, the index of the active page. 87 | - "url": str, the current URL. 
88 | - "screenshot": 3D np.array, the current screenshot. 89 | - "dom_object": dict, the current DOM object. See DOMSnapshot from chrome devtools. 90 | - "axtree_object": dict, the current AXTREE object. See Accessibility Tree from chrome devtools. 91 | - "extra_element_properties": dict[bid, dict[name, value]] extra 92 | properties of elements in the DOM. 93 | - "focused_element_bid": str, the bid of the focused element. 94 | - "last_action": str, the last action executed. 95 | - "last_action_error": str, the error of the last action. 96 | - "elapsed_time": float, the time elapsed since the start of the episode. 97 | 98 | Returns: 99 | -------- 100 | action: str 101 | The action to be processed by `action_mapping()` (if any), and executed in the environment. 102 | info: AgentInfo 103 | Additional information about the action. with the following entries 104 | being handled by BrowserGym: 105 | - "think": optional chain of thought 106 | - "messages": list of messages with the LLM 107 | - "stats": dict of extra statistics that will be saved and 108 | aggregated. 109 | - "markdown_page": str, string that will be displayed by agentlab's xray tool. 110 | - "extra_info": dict, additional information that will be saved 111 | and aggregated. 
112 | """ 113 | -------------------------------------------------------------------------------- /browsergym/experiments/src/browsergym/experiments/benchmark/metadata/scripts.py: -------------------------------------------------------------------------------- 1 | import importlib.resources 2 | import json 3 | 4 | import numpy as np 5 | 6 | from browsergym.experiments.benchmark import task_metadata 7 | 8 | # for posterity 9 | 10 | 11 | def print_metadata_workarena(): 12 | from browsergym.workarena import ( 13 | AGENT_CURRICULUM_L2, 14 | AGENT_CURRICULUM_L3, 15 | TASK_CATEGORY_MAP, 16 | ) 17 | 18 | metadata = [("task_name", "level", "category")] 19 | 20 | for task_name, category in TASK_CATEGORY_MAP.items(): 21 | metadata.append((task_name, "l1", category)) 22 | 23 | for category, items in AGENT_CURRICULUM_L2.items(): 24 | for task_set in items["buckets"]: 25 | for task in task_set: 26 | metadata.append((task.get_task_id(), "l2", category)) 27 | 28 | for category, items in AGENT_CURRICULUM_L3.items(): 29 | for task_set in items["buckets"]: 30 | for task in task_set: 31 | metadata.append((task.get_task_id(), "l3", category)) 32 | 33 | print("\n".join([",".join(x) for x in metadata])) 34 | 35 | 36 | def print_metadata_webarena(): 37 | 38 | import webarena 39 | 40 | metadata = [ 41 | ( 42 | "task_name", 43 | "requires_reset", 44 | "sites", 45 | "eval_types", 46 | ) 47 | ] 48 | all_configs_str = importlib.resources.files(webarena).joinpath("test.raw.json").read_text() 49 | all_configs = json.loads(all_configs_str) 50 | for task in all_configs: 51 | metadata.append( 52 | ( 53 | f"webarena.{task['task_id']}", 54 | str(task["require_reset"] == True), 55 | " ".join(task["sites"]), 56 | " ".join(task["eval"]["eval_types"]), 57 | ) 58 | ) 59 | 60 | print("\n".join([",".join(x) for x in metadata])) 61 | 62 | 63 | def print_metadata_visualwebarena(): 64 | import visualwebarena 65 | 66 | metadata = [ 67 | ( 68 | "task_name", 69 | "requires_reset", 70 | "sites", 71 | 
"reasoning_difficulty", 72 | "visual_difficulty", 73 | "overall_difficulty", 74 | "eval_types", 75 | ) 76 | ] 77 | 78 | all_configs_str = ( 79 | importlib.resources.files(visualwebarena).joinpath("test_raw.json").read_text() 80 | ) 81 | all_configs = json.loads(all_configs_str) 82 | for task in all_configs: 83 | metadata.append( 84 | ( 85 | f"visualwebarena.{task['task_id']}", 86 | str(task["require_reset"] == True), 87 | " ".join(task["sites"]), 88 | task["reasoning_difficulty"], 89 | task["visual_difficulty"], 90 | task["overall_difficulty"], 91 | " ".join(task["eval"]["eval_types"]), 92 | ) 93 | ) 94 | 95 | print("\n".join([",".join(x) for x in metadata])) 96 | 97 | 98 | def print_miniwob_train_test_splits(): 99 | metadata = task_metadata("miniwob") 100 | 101 | groups = metadata["similarity_group"] 102 | group_counts = groups.value_counts(sort=False) 103 | 104 | group_counts = dict({group: count for group, count in zip(group_counts.index, group_counts)}) 105 | 106 | free_groups = set(group_counts.keys()) 107 | train_groups = set() 108 | test_groups = set() 109 | rng = np.random.RandomState(1337) 110 | 111 | # slack for train / test size equality 112 | slack = sum(group_counts.values()) % 2 113 | 114 | def move_random_group(from_groups: set, to_groups: set): 115 | # pick uniformly among tasks (weighted sampling among groups) 116 | probs = np.asarray([float(group_counts[group]) for group in from_groups]) 117 | probs = probs / probs.sum() 118 | # sample a group 119 | group = rng.choice(list(from_groups), size=1, p=probs)[0] 120 | # move between sets 121 | to_groups.add(group) 122 | from_groups.remove(group) 123 | # return group for information 124 | return group 125 | 126 | done = False 127 | while not done: 128 | n_train = sum([group_counts[group] for group in train_groups]) 129 | n_test = sum([group_counts[group] for group in test_groups]) 130 | 131 | print(f"train/test split: {n_train} <> {n_test}") 132 | 133 | # growing phase 134 | if free_groups: 135 | if 
n_train < n_test: 136 | group = move_random_group(from_groups=free_groups, to_groups=train_groups) 137 | print(f"adding {group} to train") 138 | else: 139 | group = move_random_group(from_groups=free_groups, to_groups=test_groups) 140 | print(f"adding {group} to test") 141 | 142 | # group switching phase 143 | elif n_train < n_test - slack: 144 | group = move_random_group(from_groups=test_groups, to_groups=train_groups) 145 | print(f"switching {group} from test to train") 146 | elif n_test < n_train - slack: 147 | group = move_random_group(from_groups=train_groups, to_groups=test_groups) 148 | print(f"switching {group} from train to test") 149 | 150 | # done (equilibrium) 151 | else: 152 | print("equilibrium") 153 | done = True 154 | 155 | print() 156 | 157 | metadata["browsergym_split"] = metadata["similarity_group"].apply( 158 | lambda group: "train" if group in train_groups else "test" if group in test_groups else "" 159 | ) 160 | 161 | print(metadata.to_csv(index=False)) 162 | -------------------------------------------------------------------------------- /browsergym/assistantbench/src/browsergym/assistantbench/task.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict, Tuple 4 | 5 | from datasets import load_dataset 6 | from playwright.sync_api import Page 7 | 8 | from browsergym.core.task import AbstractBrowserTask 9 | 10 | from .evaluation.evaluator import question_scorer 11 | from .utils import add_prediction_to_jsonl 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | _DEFAULT_OUTPUT_FILE = None 16 | 17 | 18 | def set_default_output_file(output_file: str): 19 | global _DEFAULT_OUTPUT_FILE 20 | _DEFAULT_OUTPUT_FILE = output_file 21 | 22 | 23 | def get_default_output_file(): 24 | return _DEFAULT_OUTPUT_FILE 25 | 26 | 27 | # Load dataset 28 | 29 | DATA_DATASET = "AssistantBench/AssistantBench" 30 | all_tasks = load_dataset(DATA_DATASET, trust_remote_code=True) 31 
# Extract answers and tasks for validation and test splits
def extract_data(split_name: str) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
    """Return (gold answers, task goals, dataset ids) for one split, each
    keyed by "<split_name>.<row index>"."""
    return (
        {
            f"{split_name}.{i}": row["answer"] if row["answer"] is not None else ""
            for i, row in enumerate(all_tasks[split_name])
        },
        {f"{split_name}.{i}": row["task"] for i, row in enumerate(all_tasks[split_name])},
        {f"{split_name}.{i}": row["id"] for i, row in enumerate(all_tasks[split_name])},
    )


# Implementation data for testing
def get_implementation_testing_data() -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
    """Return a tiny hard-coded (answers, tasks, ids) triple used for testing."""
    return (
        {"imp.0": "20"},
        {
            "imp.0": "What is the weather in Paris yesterday in Celsius? Answer with the number only."
        },
        {"imp.0": "test_imp_id_0"},
    )


# Combine dev, test, and implementation-specific testing splits
gold_answers_dev, tasks_dev, ids_dev = extract_data("validation")
gold_answers_test, tasks_test, ids_test = extract_data("test")
gold_answers_impl_testing, tasks_test_impl_testing, ids_imp_testing = (
    get_implementation_testing_data()
)
gold_answers = {**gold_answers_dev, **gold_answers_test, **gold_answers_impl_testing}
tasks = {**tasks_dev, **tasks_test, **tasks_test_impl_testing}
ids = {**ids_dev, **ids_test, **ids_imp_testing}


class AssistantBenchTask(AbstractBrowserTask):

    @classmethod
    def get_task_id(cls) -> str:
        """
        Generic class for several task ids, this way of obtaining the task id is not compatible for now.
        """
        raise NotImplementedError

    def __init__(
        self, seed: int, task_id: str, output_file: str = None, save_predictions: bool = False
    ) -> None:
        """
        Args:
            seed (int): Random seed for task initialization.
            task_id (str): Unique identifier for the task (for the BrowserGym environment).
            output_file (str, optional): Path to the output file for saving results, needed for test set.
            save_predictions (bool, optional): Save predictions to the output file (yes/no).
        """
        super().__init__(seed)
        self.locale = "en-US"
        self.timezone_id = "America/New_York"

        self.task_id = task_id
        self.start_url = "https://google.com"
        # fix: key conversion was inconsistent (str() for two dicts, raw for
        # the third); task_id is already a str, so use it directly everywhere
        self.goal = tasks[self.task_id]
        self.gold = gold_answers[self.task_id]
        self.ab_task_id = ids[self.task_id]
        self.save_predictions = save_predictions

        self.output_file = output_file

        # set output_file using the global default value, if not provided in constructor
        if not self.output_file:
            self.output_file = get_default_output_file()
            # use env variable in last resort
            if not self.output_file:
                self.output_file = os.getenv("ASSISTANTBENCH_OUTPUT_FILE", None)

        if self.save_predictions and self.output_file:
            logger.info(f"Task prediction will be written to output file {self.output_file}")

    def setup(self, page: Page) -> Tuple[str, dict]:
        """Navigate to the start URL and, if enabled, register an empty
        prediction entry in the output file. Returns (goal, info)."""
        logger.info(f"Navigating to start url: {self.start_url}")
        page.goto(self.start_url, timeout=10000)
        if self.save_predictions and self.output_file:
            # create an empty task entry in the output file (will raise an Exception if the entry is already there)
            add_prediction_to_jsonl(
                file_path=self.output_file,
                task_id=self.ab_task_id,
                prediction="",
                override_if_exists=False,
            )
        return self.goal, {}

    def teardown(self) -> None:
        pass

    def validate(self, page: Page, chat_messages: list[dict]) -> Tuple[float, bool, str, dict]:
        """Score the agent's final chat answer against the gold answer.

        Returns (accuracy, done, message, info); done becomes True once the
        agent has produced an assistant message.
        """
        accuracy, done, msg, info = 0.0, False, "", {}

        # eval when the agent returns a response
        if chat_messages and chat_messages[-1]["role"] == "assistant":
            done = True
            prediction = chat_messages[-1]["message"]
            if self.save_predictions and self.output_file:
                # update the task entry in the output file
                add_prediction_to_jsonl(
                    file_path=self.output_file,
                    task_id=self.ab_task_id,
                    prediction=prediction,
                    override_if_exists=True,
                )
            # fix: the second element (has_ans) was bound but never used
            accuracy, _has_ans = question_scorer(prediction, self.gold)

        return accuracy, done, msg, info


# ------------------------------------------------------------------------------
# tests/miniwob/test_base.py (head; continues on the next chunk)
# ------------------------------------------------------------------------------
import os
import pytest
import time
import gymnasium as gym

# register gym environments
import browsergym.miniwob

from browsergym.miniwob.all import (
    ClickButtonTask,
    ClickOptionTask,
    DrawLineTask,
    LoginUserTask,
)

__SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None
__HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True

TASKS = [ClickButtonTask, ClickOptionTask, DrawLineTask, LoginUserTask]


@pytest.mark.parametrize("task_cls", TASKS)
def test_validate_teardown(task_cls):
    pw = browsergym.core._get_global_playwright()

    browser = pw.chromium.launch(headless=__HEADLESS, slow_mo=__SLOW_MO)
    context = browser.new_context()
    page = context.new_page()

    task = task_cls(seed=42)
    task.setup(page=page)

    reward, done, msg, info = task.validate(page, [])

    assert done is False

    task.teardown()

    context.close()
    browser.close()


@pytest.mark.parametrize("task_cls", TASKS)
def test_episode_max_time(task_cls):
    pw = browsergym.core._get_global_playwright()

    browser = pw.chromium.launch(headless=__HEADLESS, slow_mo=__SLOW_MO)
    context = browser.new_context()
    page = context.new_page()

    task = task_cls(seed=42, episode_max_time=0.2)
    task.setup(page=page)

    time.sleep(0.5)
msg, info = task.validate(page, []) 57 | 58 | assert done is True 59 | assert reward == 0 60 | 61 | task.teardown() 62 | 63 | context.close() 64 | browser.close() 65 | 66 | 67 | @pytest.mark.parametrize("task_cls", TASKS) 68 | def test_remove_human_display(task_cls): 69 | pw = browsergym.core._get_global_playwright() 70 | 71 | browser = pw.chromium.launch(headless=__HEADLESS, slow_mo=__SLOW_MO) 72 | 73 | # remove display 74 | 75 | context = browser.new_context() 76 | page = context.new_page() 77 | 78 | task = task_cls(seed=42, remove_human_display=True) 79 | task.setup(page=page) 80 | 81 | for element_id in ["reward-display", "click-canvas", "sync-task-cover"]: 82 | element_in_dom = page.evaluate(f"!!document.getElementById('{element_id}')") 83 | assert not element_in_dom 84 | 85 | assert page.evaluate(f"document.getElementById('query').innerHTML") == "" 86 | 87 | for element_id in ["wrap", "area"]: 88 | element_in_dom = page.evaluate(f"!!document.getElementById('{element_id}')") 89 | assert element_in_dom 90 | 91 | task.teardown() 92 | 93 | context.close() 94 | 95 | # keep display 96 | 97 | context = browser.new_context() 98 | page = context.new_page() 99 | 100 | task = task_cls(seed=42, remove_human_display=False) 101 | task.setup(page=page) 102 | 103 | for element_id in ["reward-display", "click-canvas", "sync-task-cover"]: 104 | element_in_dom = page.evaluate(f"!!document.getElementById('{element_id}')") 105 | assert element_in_dom 106 | 107 | assert page.evaluate(f"document.getElementById('query').innerHTML") != "" 108 | 109 | for element_id in ["wrap", "area"]: 110 | element_in_dom = page.evaluate(f"!!document.getElementById('{element_id}')") 111 | assert element_in_dom 112 | 113 | task.teardown() 114 | 115 | context.close() 116 | browser.close() 117 | 118 | 119 | @pytest.mark.skip(reason="TODO: how to get the final viewport size right?") 120 | @pytest.mark.parametrize("task_cls", TASKS) 121 | def test_viewport(task_cls): 122 | env = gym.make( 123 | 
f"browsergym/{task_cls.get_task_id()}", 124 | headless=__HEADLESS, 125 | slow_mo=__SLOW_MO, 126 | ) 127 | obs, info = env.reset(seed=42) 128 | 129 | screenshot = obs["screenshot"] 130 | 131 | # 3D array (height, width, rgb) of unsigned bytes (between 0 and 255) 132 | # Miniwob viewport should be (320x500) 133 | assert screenshot.shape[0] == 320 134 | assert screenshot.shape[1] == 500 135 | assert screenshot.shape[2] == 3 # RGB 136 | 137 | env.close() 138 | 139 | 140 | @pytest.mark.parametrize("task_cls", TASKS) 141 | def test_forbidden_navigation(task_cls): 142 | pw = browsergym.core._get_global_playwright() 143 | 144 | browser = pw.chromium.launch(headless=__HEADLESS, slow_mo=__SLOW_MO) 145 | context = browser.new_context() 146 | page = context.new_page() 147 | 148 | task = task_cls(seed=42) 149 | task.setup(page=page) 150 | 151 | reward, done, msg, info = task.validate(page, []) 152 | 153 | assert reward == 0.0 and done == False 154 | 155 | page.goto("http://www.google.com") 156 | 157 | reward, done, msg, info = task.validate(page, []) 158 | 159 | assert reward == 0.0 and done == True 160 | 161 | task.teardown() 162 | 163 | context.close() 164 | browser.close() 165 | 166 | 167 | @pytest.mark.parametrize("task_cls", TASKS) 168 | def test_forbidden_navigation_2(task_cls): 169 | pw = browsergym.core._get_global_playwright() 170 | 171 | browser = pw.chromium.launch(headless=__HEADLESS, slow_mo=__SLOW_MO) 172 | context = browser.new_context() 173 | page = context.new_page() 174 | 175 | task = task_cls(seed=42) 176 | task.setup(page=page) 177 | 178 | reward, done, msg, info = task.validate(page, []) 179 | 180 | assert reward == 0.0 and done == False 181 | 182 | page2 = context.new_page() 183 | page2.goto("http://www.google.com") 184 | 185 | reward, done, msg, info = task.validate(page, []) 186 | 187 | assert reward == 0.0 and done == False 188 | 189 | reward, done, msg, info = task.validate(page2, []) 190 | 191 | assert reward == 0.0 and done == True 192 | 193 | 
task.teardown() 194 | 195 | context.close() 196 | browser.close() 197 | --------------------------------------------------------------------------------