├── utils ├── browser-use │ ├── examples │ │ ├── __init__.py │ │ ├── models │ │ │ ├── langchain │ │ │ │ ├── __init__.py │ │ │ │ ├── README.md │ │ │ │ └── example.py │ │ │ ├── README.md │ │ │ ├── gpt-4.1.py │ │ │ ├── claude-4-sonnet.py │ │ │ ├── llama4-groq.py │ │ │ ├── novita.py │ │ │ ├── gemini.py │ │ │ └── azure_openai.py │ │ ├── simple.py │ │ ├── features │ │ │ ├── planner.py │ │ │ ├── small_model_for_extraction.py │ │ │ ├── multi-tab_handling.py │ │ │ ├── initial_actions.py │ │ │ ├── save_trace.py │ │ │ ├── restrict_urls.py │ │ │ ├── download_file.py │ │ │ ├── custom_system_prompt.py │ │ │ ├── follow_up_tasks.py │ │ │ ├── drag_drop.py │ │ │ ├── validate_output.py │ │ │ ├── parallel_agents.py │ │ │ ├── cross_origin_iframes.py │ │ │ ├── result_processing.py │ │ │ ├── custom_output.py │ │ │ ├── outsource_state.py │ │ │ ├── custom_user_agent.py │ │ │ ├── multiple_tasks.py │ │ │ ├── sensitive_data.py │ │ │ └── pause_agent.py │ │ ├── ui │ │ │ ├── README.md │ │ │ ├── streamlit_demo.py │ │ │ └── command_line.py │ │ ├── use-cases │ │ │ ├── README.md │ │ │ ├── captcha.py │ │ │ ├── wikipedia_banana_to_quantum.py │ │ │ ├── twitter_post_using_cookies.py │ │ │ ├── online_coding_agent.py │ │ │ ├── check_appointment.py │ │ │ ├── scrolling_page.py │ │ │ ├── web_voyager_agent.py │ │ │ └── find_influencer_profiles.py │ │ ├── file_system │ │ │ ├── excel_sheet.py │ │ │ └── file_system.py │ │ ├── browser │ │ │ ├── real_browser.py │ │ │ ├── multiple_agents_same_browser.py │ │ │ └── using_cdp.py │ │ ├── custom-functions │ │ │ ├── notification.py │ │ │ ├── save_to_file_hugging_face.py │ │ │ ├── clipboard.py │ │ │ ├── save_pdf.py │ │ │ ├── onepassword_2fa.py │ │ │ ├── 2fa.py │ │ │ ├── solve_amazon_captcha.py │ │ │ ├── extract_pdf_content.py │ │ │ ├── perplexity_search.py │ │ │ └── file_upload.py │ │ ├── mcp │ │ │ └── simple_client.py │ │ └── integrations │ │ │ ├── slack │ │ │ └── slack_example.py │ │ │ └── browserbase_stagehand.py │ ├── .python-version │ ├── browser_use │ │ ├── dom │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ ├── playground │ │ │ │ └── process_dom.py │ │ │ ├── history_tree_processor │ │ │ │ └── view.py │ │ │ └── clickable_element_processor │ │ │ │ └── service.py │ │ ├── tokens │ │ │ ├── __init__.py │ │ │ └── views.py │ │ ├── filesystem │ │ │ └── __init__.py │ │ ├── llm │ │ │ ├── google │ │ │ │ └── __init__.py │ │ │ ├── aws │ │ │ │ └── __init__.py │ │ │ ├── openai │ │ │ │ └── like.py │ │ │ ├── README.md │ │ │ ├── exceptions.py │ │ │ ├── openrouter │ │ │ │ └── serializer.py │ │ │ ├── views.py │ │ │ ├── base.py │ │ │ ├── __init__.py │ │ │ ├── tests │ │ │ │ └── test_groq_loop.py │ │ │ └── ollama │ │ │ │ └── chat.py │ │ ├── exceptions.py │ │ ├── sync │ │ │ └── __init__.py │ │ ├── telemetry │ │ │ ├── __init__.py │ │ │ └── views.py │ │ ├── mcp │ │ │ ├── __main__.py │ │ │ └── __init__.py │ │ ├── browser │ │ │ ├── browser.py │ │ │ ├── __init__.py │ │ │ ├── context.py │ │ │ ├── utils.py │ │ │ └── views.py │ │ ├── README.md │ │ ├── integrations │ │ │ └── gmail │ │ │ │ └── __init__.py │ │ ├── agent │ │ │ └── message_manager │ │ │ │ ├── utils.py │ │ │ │ └── views.py │ │ └── __init__.py │ ├── docs │ │ ├── favicon.ico │ │ ├── images │ │ │ ├── laminar.png │ │ │ ├── browser-use.png │ │ │ └── checks-passed.png │ │ ├── development │ │ │ ├── roadmap.mdx │ │ │ ├── telemetry.mdx │ │ │ ├── evaluations.mdx │ │ │ ├── observability.mdx │ │ │ └── contribution-guide.mdx │ │ ├── README.md │ │ ├── favicon.svg │ │ ├── customize │ │ │ ├── output-format.mdx │ │ │ └── system-prompt.mdx │ │ └── quickstart.mdx │ ├── static │ │ 
├── browser-use.png │ │ └── browser-use-dark.png │ ├── .github │ │ ├── .git-blame-ignore-revs │ │ ├── CONTRIBUTING.md │ │ ├── ISSUE_TEMPLATE │ │ │ ├── config.yml │ │ │ └── 4_docs_issue.yml │ │ ├── workflows │ │ │ ├── cloud_evals.yml │ │ │ ├── lint.yml │ │ │ ├── claude.yml │ │ │ ├── build-base-image.yml.disabled │ │ │ ├── package.yaml │ │ │ └── docker.yml │ │ └── SECURITY.md │ ├── .gitattributes │ ├── tests │ │ ├── agent_tasks │ │ │ ├── browser_use_pip.yaml │ │ │ ├── amazon_laptop.yaml │ │ │ ├── captcha_cloudflare.yaml │ │ │ └── README.md │ │ ├── old │ │ │ ├── test_full_screen.py │ │ │ ├── test_dropdown_error.py │ │ │ ├── screenshot_test.py │ │ │ ├── test_gif_path.py │ │ │ ├── test_dropdown.py │ │ │ ├── test_react_dropdown.py │ │ │ ├── test_dropdown_complex.py │ │ │ ├── httpx_client_test.py │ │ │ ├── test_vision.py │ │ │ └── test_wait_for_element.py │ │ └── ci │ │ │ ├── test_browser_session_via_cdp.py │ │ │ └── test_schema_optimizer.py │ ├── bin │ │ ├── test.sh │ │ ├── lint.sh │ │ └── setup.sh │ ├── docker │ │ ├── base-images │ │ │ ├── system │ │ │ │ └── Dockerfile │ │ │ ├── python-deps │ │ │ │ └── Dockerfile │ │ │ └── chromium │ │ │ │ └── Dockerfile │ │ ├── README.md │ │ └── build-base-images.sh │ ├── .env.example │ ├── .dockerignore │ ├── .gitignore │ ├── LICENSE │ ├── Dockerfile.fast │ ├── .pre-commit-config.yaml │ └── .cursor │ │ └── rules │ │ └── browser-use-rules.mdc ├── __pycache__ │ ├── core.cpython-312.pyc │ ├── __init__.cpython-312.pyc │ ├── browser.cpython-312.pyc │ ├── metrics.cpython-312.pyc │ ├── operations.cpython-312.pyc │ ├── agent_runner.cpython-312.pyc │ ├── browser_runner.cpython-312.pyc │ ├── visual_scorer.cpython-312.pyc │ └── assertion_scorer.cpython-312.pyc └── __init__.py ├── assets ├── result.png ├── overview.png └── iwrbench_logo.png ├── .env.example ├── requirements.txt └── config.py /utils/browser-use/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/browser-use/.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/tokens/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/filesystem/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/langchain/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/assets/result.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/assets/overview.png -------------------------------------------------------------------------------- /assets/iwrbench_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/assets/iwrbench_logo.png -------------------------------------------------------------------------------- /utils/browser-use/docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/docs/favicon.ico -------------------------------------------------------------------------------- /utils/__pycache__/core.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/core.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/static/browser-use.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/static/browser-use.png -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/browser.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/browser.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/metrics.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/metrics.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/.github/.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | 66b3c26df51adec32d42c3b2c0304e0662457298 2 | 2be4ba4f7078d47bbeed04baf6f8fb04017df028 3 | -------------------------------------------------------------------------------- /utils/browser-use/docs/images/laminar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/docs/images/laminar.png -------------------------------------------------------------------------------- /utils/__pycache__/operations.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/operations.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/.gitattributes: -------------------------------------------------------------------------------- 1 | static/*.gif filter=lfs diff=lfs merge=lfs -text 2 | # static/*.mp4 filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/google/__init__.py: 
-------------------------------------------------------------------------------- 1 | from browser_use.llm.google.chat import ChatGoogle 2 | 3 | __all__ = ['ChatGoogle'] 4 | -------------------------------------------------------------------------------- /utils/browser-use/docs/images/browser-use.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/docs/images/browser-use.png -------------------------------------------------------------------------------- /utils/browser-use/static/browser-use-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/static/browser-use-dark.png -------------------------------------------------------------------------------- /utils/__pycache__/agent_runner.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/agent_runner.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/browser_runner.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/browser_runner.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/visual_scorer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/visual_scorer.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/docs/images/checks-passed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/docs/images/checks-passed.png -------------------------------------------------------------------------------- /utils/__pycache__/assertion_scorer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/assertion_scorer.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/examples/models/README.md: -------------------------------------------------------------------------------- 1 | # Gemini 2 | Detailed video on how to integrate browser-use with Gemini: https://www.youtube.com/watch?v=JluZiWBV_Tc 3 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | OPENAI_BASE_URL= 3 | OPENAI_MODEL_NAME= 4 | MODEL_FOR_EVAL= # using for eval 5 | HEADLESS=true # set to false if you want to see the browser -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/utils.py: -------------------------------------------------------------------------------- 1 | def cap_text_length(text: str, max_length: int) -> str: 2 | if len(text) > max_length: 3 | return text[:max_length] + '...' 
4 | return text 5 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/roadmap.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Roadmap" 3 | description: "Future plans and upcoming features for Browser Use" 4 | icon: "road" 5 | --- 6 | 7 | Big things coming soon! 8 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/exceptions.py: -------------------------------------------------------------------------------- 1 | class LLMException(Exception): 2 | def __init__(self, status_code, message): 3 | self.status_code = status_code 4 | self.message = message 5 | super().__init__(f'Error {status_code}: {message}') 6 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/aws/__init__.py: -------------------------------------------------------------------------------- 1 | from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock 2 | from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock 3 | 4 | __all__ = [ 5 | 'ChatAWSBedrock', 6 | 'ChatAnthropicBedrock', 7 | ] 8 | -------------------------------------------------------------------------------- /utils/browser-use/tests/agent_tasks/browser_use_pip.yaml: -------------------------------------------------------------------------------- 1 | name: Find pip install command for browser-use 2 | task: Find the pip installation command for the browser-use repo 3 | judge_context: 4 | - The output must include the command ('pip install browser-use') 5 | max_steps: 10 6 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/sync/__init__.py: -------------------------------------------------------------------------------- 1 | """Cloud sync module for Browser Use.""" 2 | 3 | from browser_use.sync.auth import CloudAuthConfig, DeviceAuthClient 4 | from browser_use.sync.service import CloudSync 5 | 6 | __all__ = ['CloudAuthConfig', 'DeviceAuthClient', 'CloudSync'] 7 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/telemetry/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Telemetry for Browser Use. 3 | """ 4 | 5 | from browser_use.telemetry.service import ProductTelemetry 6 | from browser_use.telemetry.views import BaseTelemetryEvent 7 | 8 | __all__ = ['BaseTelemetryEvent', 'ProductTelemetry'] 9 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/mcp/__main__.py: -------------------------------------------------------------------------------- 1 | """Entry point for running MCP server as a module. 
2 | 3 | Usage: 4 | python -m browser_use.mcp.server 5 | """ 6 | 7 | import asyncio 8 | 9 | from browser_use.mcp.server import main 10 | 11 | if __name__ == '__main__': 12 | asyncio.run(main()) 13 | -------------------------------------------------------------------------------- /utils/browser-use/tests/agent_tasks/amazon_laptop.yaml: -------------------------------------------------------------------------------- 1 | name: Amazon Laptop Search 2 | task: Go to amazon.com, search for 'laptop', and return the first result 3 | judge_context: 4 | - The agent must navigate to amazon.com 5 | - The agent must search for 'laptop' 6 | - The agent must return name of the first laptop 7 | max_steps: 10 8 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/browser.py: -------------------------------------------------------------------------------- 1 | from browser_use.browser.profile import BrowserProfile 2 | from browser_use.browser.session import BrowserSession 3 | 4 | BrowserConfig = BrowserProfile 5 | BrowserContextConfig = BrowserProfile 6 | Browser = BrowserSession 7 | 8 | __all__ = ['BrowserConfig', 'BrowserContextConfig', 'Browser'] 9 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __all__ = ["browser", "operations", "visual_scorer", "assertion_scorer"] 3 | 4 | 5 | def __getattr__(name: str): 6 | if name in __all__: 7 | module = __import__(f"utils.{name}", fromlist=[name]) 8 | globals()[name] = module 9 | return module 10 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 11 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/__init__.py: -------------------------------------------------------------------------------- 1 | from .browser import Browser, BrowserConfig 2 | from .context import BrowserContext, BrowserContextConfig 3 | from .profile import BrowserProfile 4 | from .session import BrowserSession 5 | 6 | __all__ = ['Browser', 'BrowserConfig', 'BrowserContext', 'BrowserContextConfig', 'BrowserSession', 'BrowserProfile'] 7 | -------------------------------------------------------------------------------- /utils/browser-use/bin/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to run all the main project tests that run on CI via .github/workflows/test.yaml. 3 | # Usage: 4 | # $ ./bin/test.sh 5 | 6 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 7 | cd "$SCRIPT_DIR/.." 
|| exit 1 8 | 9 | exec uv run pytest --numprocesses auto tests/ci $1 $2 $3 10 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/context.py: -------------------------------------------------------------------------------- 1 | from browser_use.browser.profile import BrowserProfile 2 | from browser_use.browser.session import BrowserSession 3 | 4 | Browser = BrowserSession 5 | BrowserConfig = BrowserProfile 6 | BrowserContext = BrowserSession 7 | BrowserContextConfig = BrowserProfile 8 | 9 | __all__ = ['Browser', 'BrowserConfig', 'BrowserContext', 'BrowserContextConfig'] 10 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/openai/like.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from browser_use.llm.openai.chat import ChatOpenAI 4 | 5 | 6 | @dataclass 7 | class ChatOpenAILike(ChatOpenAI): 8 | """ 9 | A class for to interact with any provider using the OpenAI API schema. 10 | 11 | Args: 12 | model (str): The name of the OpenAI model to use. 13 | """ 14 | 15 | model: str 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.57.0 2 | huggingface-hub==0.35.3 3 | tokenizers==0.22.1 4 | safetensors==0.6.2 5 | numpy>=1.24,<2.4 6 | opencv-python==4.12.0.88 7 | Pillow==11.3.0 8 | torch==2.8.0 9 | Levenshtein==0.27.1 10 | rapidfuzz==3.14.1 11 | httpx>=0.24.0 12 | python-dotenv>=1.0.0 13 | openai>=1.0.0 14 | jinja2>=3.0.0 15 | easyocr>=1.6.0 16 | pyyaml>=6.0.0 17 | regex>=2025.0.0 18 | filelock 19 | fsspec -------------------------------------------------------------------------------- /utils/browser-use/docker/base-images/system/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | 3 | # Install minimal system dependencies 4 | RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ 5 | apt-get update && \ 6 | apt-get install -y --no-install-recommends ca-certificates curl wget && \ 7 | rm -rf /var/lib/apt/lists/* 8 | 9 | # Install uv package manager 10 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ 11 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/mcp/__init__.py: -------------------------------------------------------------------------------- 1 | """MCP (Model Context Protocol) support for browser-use. 2 | 3 | This module provides integration with MCP servers and clients for browser automation. 
4 | """ 5 | 6 | from browser_use.mcp.client import MCPClient 7 | from browser_use.mcp.controller import MCPToolWrapper 8 | from browser_use.mcp.server import BrowserUseServer 9 | 10 | __all__ = ['MCPClient', 'MCPToolWrapper', 'BrowserUseServer'] 11 | -------------------------------------------------------------------------------- /utils/browser-use/docker/base-images/python-deps/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_TAG=latest 2 | FROM browseruse/base-chromium:${BASE_TAG} 3 | 4 | ENV PYTHONUNBUFFERED=1 PATH="/app/.venv/bin:$PATH" PLAYWRIGHT_BROWSERS_PATH=/opt/playwright 5 | 6 | WORKDIR /app 7 | COPY pyproject.toml uv.lock* ./ 8 | 9 | RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \ 10 | uv venv && \ 11 | uv sync --all-extras --no-dev --no-install-project --compile-bytecode 12 | -------------------------------------------------------------------------------- /utils/browser-use/bin/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to run the formatter, linter, and type checker pre-commit hooks. 3 | # Usage: 4 | # $ ./bin/lint.sh 5 | 6 | IFS=$'\n' 7 | 8 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 9 | 10 | cd "$SCRIPT_DIR/.." || exit 1 11 | 12 | echo "[*] Running ruff linter, formatter, pyright type checker, and other pre-commit checks..." 13 | exec uv run pre-commit run --all-files 14 | -------------------------------------------------------------------------------- /utils/browser-use/tests/agent_tasks/captcha_cloudflare.yaml: -------------------------------------------------------------------------------- 1 | name: Cloudflare captcha 2 | task: Go to https://2captcha.com/demo/cloudflare-turnstile and solve the captcha, wait a few seconds, then click on check, wait a few more seconds for it to complete, then extract the "hostname" value from the displayed dictionary under "Captcha is passed successfully!" 3 | judge_context: 4 | - The agent must solve the captcha 5 | - The hostname returned should be "example.com" 6 | max_steps: 6 7 | -------------------------------------------------------------------------------- /utils/browser-use/.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to browser-use 2 | 3 | We love contributions! Please read through these links to get started: 4 | 5 | - 🔢 [Contribution Guidelines](https://docs.browser-use.com/development/contribution-guide) 6 | - 👾 [Local Development Setup Guide](https://docs.browser-use.com/development/local-setup) 7 | - 🏷️ [Issues Tagged: `#help-wanted`](https://github.com/browser-use/browser-use/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22help%20wanted%22) 8 | -------------------------------------------------------------------------------- /utils/browser-use/docs/README.md: -------------------------------------------------------------------------------- 1 | # Docs 2 | 3 | The official documentation for Browser Use. The docs are published to [Browser Use Docs](https://docs.browser-use.com). 4 | 5 | ### Development 6 | 7 | Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. 
To install, use the following command 8 | 9 | ``` 10 | npm i -g mintlify 11 | ``` 12 | 13 | Run the following command at the root of your documentation (where mint.json is) 14 | 15 | ``` 16 | mintlify dev 17 | ``` 18 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/README.md: -------------------------------------------------------------------------------- 1 | # Browser Use LLMs 2 | 3 | We officially support the following LLMs: 4 | 5 | - OpenAI 6 | - Anthropic 7 | - Google 8 | - Groq 9 | - Ollama 10 | - DeepSeek 11 | 12 | ## Migrating from LangChain 13 | 14 | Because of how we implemented the LLMs, we can technically support anything. If you want to use a LangChain model, you can use the `ChatLangchain` (NOT OFFICIALLY SUPPORTED) class. 15 | 16 | You can find all the details in the [LangChain example](examples/models/langchain/example.py). We suggest you grab that code and use it as a reference. 17 | -------------------------------------------------------------------------------- /utils/browser-use/.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false # Set to true if you want to allow blank issues 2 | contact_links: 3 | - name: 🔢 Quickstart Guide 4 | url: https://docs.browser-use.com/quickstart 5 | about: Most common issues can be resolved by following our quickstart guide 6 | - name: 💬 Questions and Help 7 | url: https://link.browser-use.com/discord 8 | about: Please ask questions in our Discord community 9 | - name: 📖 Documentation 10 | url: https://docs.browser-use.com 11 | about: Check our documentation for answers first 12 | -------------------------------------------------------------------------------- /utils/browser-use/examples/simple.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | from browser_use.llm.openai.chat import ChatOpenAI 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | 14 | from browser_use import Agent 15 | 16 | # Initialize the model 17 | llm = ChatOpenAI( 18 | model='gpt-4.1-mini', 19 | ) 20 | 21 | 22 | task = 'Find the founders of browser-use' 23 | agent = Agent(task=task, llm=llm) 24 | 25 | 26 | async def main(): 27 | await agent.run() 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_full_screen.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from browser_use.browser.types import async_playwright 4 | 5 | 6 | async def test_full_screen(start_fullscreen: bool, maximize: bool): 7 | async with async_playwright() as p: 8 | browser = await p.chromium.launch( 9 | headless=False, 10 | args=['--start-maximized'], 11 | ) 12 | context = await browser.new_context(no_viewport=True, viewport=None) 13 | page = await context.new_page() 14 | await page.goto('https://google.com') 15 | 16 | await asyncio.sleep(10) 17 | await browser.close() 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(test_full_screen(False, False)) 22 | -------------------------------------------------------------------------------- /utils/browser-use/.env.example: -------------------------------------------------------------------------------- 1 
| OPENAI_API_KEY= 2 | ANTHROPIC_API_KEY= 3 | AZURE_OPENAI_ENDPOINT= 4 | AZURE_OPENAI_KEY= 5 | GOOGLE_API_KEY= 6 | DEEPSEEK_API_KEY= 7 | GROK_API_KEY= 8 | NOVITA_API_KEY= 9 | 10 | # Set to false to disable anonymized telemetry 11 | ANONYMIZED_TELEMETRY=true 12 | 13 | # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info 14 | BROWSER_USE_LOGGING_LEVEL=info 15 | 16 | # Calculate costs: (beta) Add cost calculations to tokens. Available: true | false 17 | BROWSER_USE_CALCULATE_COST=false 18 | 19 | # set this to true to optimize browser-use's chrome for running inside docker 20 | IN_DOCKER=false 21 | -------------------------------------------------------------------------------- /utils/browser-use/.dockerignore: -------------------------------------------------------------------------------- 1 | docs/ 2 | static/ 3 | .claude/ 4 | .github/ 5 | 6 | # Cache files 7 | .DS_Store 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | .mypy_cache/ 12 | .ruff_cache/ 13 | .pytest_cache/ 14 | .ipynb_checkpoints 15 | 16 | # Virtual Environments 17 | .venv 18 | venv/ 19 | 20 | # Editor cruft 21 | .vscode/ 22 | .idea/ 23 | 24 | # Build Files 25 | dist/ 26 | 27 | # Data files 28 | *.gif 29 | *.txt 30 | *.pdf 31 | *.csv 32 | *.json 33 | *.jsonl 34 | *.bak 35 | 36 | # Secrets and sensitive files 37 | secrets.env 38 | .env 39 | browser_cookies.json 40 | cookies.json 41 | gcp-login.json 42 | saved_trajectories/ 43 | AgentHistory.json 44 | AgentHistoryList.json 45 | private_example.py 46 | private_example 47 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/exceptions.py: -------------------------------------------------------------------------------- 1 | class ModelError(Exception): 2 | pass 3 | 4 | 5 | class ModelProviderError(ModelError): 6 | """Exception raised when a model provider returns an error.""" 7 | 8 | def __init__( 9 | self, 10 | message: str, 11 | status_code: int = 502, 12 | model: str | None = None, 13 | ): 14 | super().__init__(message, status_code) 15 | self.model = model 16 | 17 | 18 | class ModelRateLimitError(ModelProviderError): 19 | """Exception raised when a model provider returns a rate limit error.""" 20 | 21 | def __init__( 22 | self, 23 | message: str, 24 | status_code: int = 429, 25 | model: str | None = None, 26 | ): 27 | super().__init__(message, status_code, model) 28 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/planner.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.llm import ChatOpenAI 13 | 14 | llm = ChatOpenAI(model='gpt-4.1', temperature=0.0) 15 | planner_llm = ChatOpenAI( 16 | model='o3-mini', 17 | ) 18 | task = 'your task' 19 | 20 | 21 | agent = Agent(task=task, llm=llm, planner_llm=planner_llm, use_vision_for_planner=False, planner_interval=1) 22 | 23 | 24 | async def main(): 25 | await agent.run() 26 | 27 | 28 | if __name__ == '__main__': 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /utils/browser-use/examples/ui/README.md: 
-------------------------------------------------------------------------------- 1 | # **User Interfaces of Browser-Use** 2 | 3 | | **File Name** | **User Interface** | **Description** | **Example Usage** | 4 | |------------------------|-------------------|-------------------------------------------|-------------------------------------------| 5 | | `command_line.py` | **Terminal** | Parses arguments for command-line execution. | `python command_line.py` | 6 | | `gradio_demo.py` | **Gradio** | Provides a Gradio-based interactive UI. | `python gradio_demo.py` | 7 | | `streamlit_demo.py` | **Streamlit** | Runs a Streamlit-based web interface. | `python -m streamlit run streamlit_demo.py` | 8 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/small_model_for_extraction.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.llm import ChatOpenAI 13 | 14 | llm = ChatOpenAI(model='gpt-4.1', temperature=0.0) 15 | small_llm = ChatOpenAI(model='gpt-4.1-mini', temperature=0.0) 16 | task = 'Find the founders of browser-use in ycombinator, extract all links and open the links one by one' 17 | agent = Agent(task=task, llm=llm, page_extraction_llm=small_llm) 18 | 19 | 20 | async def main(): 21 | await agent.run() 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/multi-tab_handling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from browser_use import Agent 18 | from browser_use.llm import ChatOpenAI 19 | 20 | # video: https://preview.screen.studio/share/clenCmS6 21 | llm = ChatOpenAI(model='gpt-4.1') 22 | agent = Agent( 23 | task='open 3 tabs with elon musk, trump, and steve jobs, then go back to the first and stop', 24 | llm=llm, 25 | ) 26 | 27 | 28 | async def main(): 29 | await agent.run() 30 | 31 | 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/gpt-4.1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
5 | """ 6 | 7 | import asyncio 8 | 9 | from dotenv import load_dotenv 10 | from lmnr import Laminar 11 | 12 | from browser_use import Agent 13 | from browser_use.llm import ChatOpenAI 14 | 15 | load_dotenv() 16 | 17 | 18 | Laminar.initialize() 19 | 20 | # All the models are type safe from OpenAI in case you need a list of supported models 21 | llm = ChatOpenAI(model='gpt-4.1-mini') 22 | agent = Agent( 23 | task='Go to example.com, click on the first link, and give me the title of the page', 24 | llm=llm, 25 | ) 26 | 27 | 28 | async def main(): 29 | await agent.run(max_steps=10) 30 | input('Press Enter to continue...') 31 | 32 | 33 | asyncio.run(main()) 34 | -------------------------------------------------------------------------------- /utils/browser-use/.gitignore: -------------------------------------------------------------------------------- 1 | # Cache files 2 | .DS_Store 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | .mypy_cache/ 7 | .ruff_cache/ 8 | .pytest_cache/ 9 | .ipynb_checkpoints 10 | ~/ 11 | 12 | # Virtual Environments 13 | .venv* 14 | venv/ 15 | 16 | # IDEs 17 | .vscode/ 18 | .idea/ 19 | 20 | # Build files 21 | dist/ 22 | 23 | # Data files 24 | *.gif 25 | *.txt 26 | *.pdf 27 | *.csv 28 | *.json 29 | *.jsonl 30 | *.log 31 | *.bak 32 | 33 | # Secrets and sensitive files 34 | secrets.env 35 | .env 36 | browser_cookies.json 37 | cookies.json 38 | gcp-login.json 39 | saved_trajectories/ 40 | old_tests/ 41 | AgentHistory.json 42 | AgentHistoryList.json 43 | private_example.py 44 | private_example 45 | CLAUDE.local.md 46 | 47 | uv.lock 48 | temp 49 | tmp 50 | 51 | # Google API credentials 52 | credentials.json 53 | token.json 54 | -------------------------------------------------------------------------------- /utils/browser-use/tests/agent_tasks/README.md: -------------------------------------------------------------------------------- 1 | # Contributing Agent Tasks 2 | 3 | Contribute your own agent tasks and we test if the agent solves them for CI testing! 4 | 5 | ## How to Add a Task 6 | 7 | 1. Create a new `.yaml` file in this directory (`tests/agent_tasks/`). 8 | 2. Use the following format: 9 | 10 | ```yaml 11 | name: My Task Name 12 | task: Describe the task for the agent to perform 13 | judge_context: 14 | - List criteria for success, one per line 15 | max_steps: 10 16 | ``` 17 | 18 | ## Guidelines 19 | - Be specific in your task and criteria. 20 | - The `judge_context` should list what counts as a successful result. 21 | - The agent's output will be judged by an LLM using these criteria. 22 | 23 | ## Running the Tests 24 | 25 | To run all agent tasks: 26 | 27 | ```bash 28 | pytest tests/ci/test_agent_real_tasks.py 29 | ``` 30 | 31 | --- 32 | 33 | Happy contributing! 
34 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/initial_actions.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.llm import ChatOpenAI 13 | 14 | llm = ChatOpenAI(model='gpt-4.1') 15 | 16 | initial_actions = [ 17 | {'go_to_url': {'url': 'https://www.google.com', 'new_tab': True}}, 18 | {'go_to_url': {'url': 'https://en.wikipedia.org/wiki/Randomness', 'new_tab': True}}, 19 | {'scroll_down': {'amount': 1000}}, 20 | ] 21 | agent = Agent( 22 | task='What theories are displayed on the page?', 23 | initial_actions=initial_actions, 24 | llm=llm, 25 | ) 26 | 27 | 28 | async def main(): 29 | await agent.run(max_steps=10) 30 | 31 | 32 | if __name__ == '__main__': 33 | asyncio.run(main()) 34 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/claude-4-sonnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple script that runs the task of opening amazon and searching. 3 | @dev Ensure we have a `ANTHROPIC_API_KEY` variable in our `.env` file. 4 | """ 5 | 6 | import asyncio 7 | import os 8 | import sys 9 | 10 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 11 | 12 | from dotenv import load_dotenv 13 | from lmnr import Laminar 14 | 15 | load_dotenv() 16 | Laminar.initialize() 17 | 18 | from browser_use import Agent 19 | from browser_use.llm import ChatAnthropic 20 | 21 | llm = ChatAnthropic(model='claude-4-sonnet-20250514', temperature=0.0) 22 | 23 | agent = Agent( 24 | task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result', 25 | llm=llm, 26 | ) 27 | 28 | 29 | async def main(): 30 | await agent.run(max_steps=10) 31 | 32 | 33 | asyncio.run(main()) 34 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/cloud_evals.yml: -------------------------------------------------------------------------------- 1 | name: cloud_evals 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - 'releases/*' 8 | workflow_dispatch: 9 | inputs: 10 | commit_hash: 11 | description: Commit hash of the library to build the Cloud eval image for 12 | required: false 13 | 14 | jobs: 15 | trigger_cloud_eval_image_build: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/github-script@v7 19 | with: 20 | github-token: ${{ secrets.TRIGGER_CLOUD_BUILD_GH_KEY }} 21 | script: | 22 | const result = await github.rest.repos.createDispatchEvent({ 23 | owner: 'browser-use', 24 | repo: 'cloud', 25 | event_type: 'trigger-workflow', 26 | client_payload: {"commit_hash": "${{ github.event.inputs.commit_hash || github.sha }}"} 27 | }) 28 | console.log(result) 29 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/langchain/README.md: -------------------------------------------------------------------------------- 1 | # Langchain Models (legacy) 2 | 3 | This directory contains example of how to still use Langchain models with the new Browser Use chat models. 
4 | 5 | ## How to use 6 | 7 | ```python 8 | from langchain_openai import ChatOpenAI 9 | 10 | from browser_use import Agent 11 | from .chat import ChatLangchain 12 | 13 | async def main(): 14 | """Basic example using ChatLangchain with OpenAI through LangChain.""" 15 | 16 | # Create a LangChain model (OpenAI) 17 | langchain_model = ChatOpenAI( 18 | model='gpt-4.1-mini', 19 | temperature=0.1, 20 | ) 21 | 22 | # Wrap it with ChatLangchain to make it compatible with browser-use 23 | llm = ChatLangchain(chat=langchain_model) 24 | 25 | agent = Agent( 26 | task="Go to google.com and search for 'browser automation with Python'", 27 | llm=llm, 28 | ) 29 | 30 | history = await agent.run() 31 | 32 | print(history.history) 33 | ``` 34 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/save_trace.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use.agent.service import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | llm = ChatOpenAI(model='gpt-4.1', temperature=0.0) 16 | 17 | 18 | async def main(): 19 | browser_session = BrowserSession( 20 | browser_profile=BrowserProfile( 21 | traces_dir='./tmp/traces/', 22 | user_data_dir='~/.config/browseruse/profiles/default', 23 | ) 24 | ) 25 | 26 | async with browser_session: 27 | agent = Agent( 28 | task='Go to hackernews, then go to apple.com and return all titles of open tabs', 29 | llm=llm, 30 | browser_session=browser_session, 31 | ) 32 | await agent.run() 33 | 34 | 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/openrouter/serializer.py: -------------------------------------------------------------------------------- 1 | from openai.types.chat import ChatCompletionMessageParam 2 | 3 | from browser_use.llm.messages import BaseMessage 4 | from browser_use.llm.openai.serializer import OpenAIMessageSerializer 5 | 6 | 7 | class OpenRouterMessageSerializer: 8 | """ 9 | Serializer for converting between custom message types and OpenRouter message formats. 10 | 11 | OpenRouter uses the OpenAI-compatible API, so we can reuse the OpenAI serializer. 12 | """ 13 | 14 | @staticmethod 15 | def serialize_messages(messages: list[BaseMessage]) -> list[ChatCompletionMessageParam]: 16 | """ 17 | Serialize a list of browser_use messages to OpenRouter-compatible messages. 
18 | 19 | Args: 20 | messages: List of browser_use messages 21 | 22 | Returns: 23 | List of OpenRouter-compatible messages (identical to OpenAI format) 24 | """ 25 | # OpenRouter uses the same message format as OpenAI 26 | return OpenAIMessageSerializer.serialize_messages(messages) 27 | -------------------------------------------------------------------------------- /utils/browser-use/docker/base-images/chromium/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_TAG=latest 2 | FROM browseruse/base-system:${BASE_TAG} 3 | 4 | WORKDIR /tmp 5 | COPY pyproject.toml ./ 6 | 7 | # Install both playwright and patchright with versions from pyproject.toml 8 | RUN --mount=type=cache,target=/root/.cache,sharing=locked \ 9 | PLAYWRIGHT_VERSION=$(grep -E "playwright>=" pyproject.toml | grep -o "[0-9]\+\.[0-9]\+\.[0-9]\+" | head -1) && \ 10 | PATCHRIGHT_VERSION=$(grep -E "patchright>=" pyproject.toml | grep -o "[0-9]\+\.[0-9]\+\.[0-9]\+" | head -1) && \ 11 | echo "Installing playwright==$PLAYWRIGHT_VERSION patchright==$PATCHRIGHT_VERSION" && \ 12 | pip install --no-cache-dir playwright==$PLAYWRIGHT_VERSION patchright==$PATCHRIGHT_VERSION && \ 13 | PLAYWRIGHT_BROWSERS_PATH=/opt/playwright playwright install --with-deps --no-shell chromium && \ 14 | ln -s /opt/playwright/chromium-*/chrome-linux/chrome /usr/bin/chromium-browser && \ 15 | chmod -R 755 /opt/playwright && \ 16 | rm -f pyproject.toml 17 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/llama4-groq.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | from dotenv import load_dotenv 8 | from lmnr import Laminar 9 | 10 | load_dotenv() 11 | 12 | 13 | Laminar.initialize() 14 | 15 | 16 | from browser_use import Agent 17 | from browser_use.llm import ChatGroq 18 | 19 | groq_api_key = os.environ.get('GROQ_API_KEY') 20 | llm = ChatGroq( 21 | model='meta-llama/llama-4-maverick-17b-128e-instruct', 22 | # temperature=0.1, 23 | ) 24 | 25 | # llm = ChatGroq( 26 | # model='meta-llama/llama-4-maverick-17b-128e-instruct', 27 | # api_key=os.environ.get('GROQ_API_KEY'), 28 | # temperature=0.0, 29 | # ) 30 | 31 | task = 'Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result' 32 | 33 | 34 | async def main(): 35 | agent = Agent( 36 | task=task, 37 | llm=llm, 38 | ) 39 | await agent.run() 40 | 41 | 42 | if __name__ == '__main__': 43 | asyncio.run(main()) 44 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/README.md: -------------------------------------------------------------------------------- 1 | # Use Cases of Browser-Use 2 | 3 | | File Name | Description | 4 | |-----------|------------| 5 | | `captcha.py` | Automates CAPTCHA solving on a demo website. | 6 | | `check_appointment.py` | Checks for available visa appointment slots on the Greece MFA website. | 7 | | `find_and_apply_to_jobs.py` | Searches for job listings, evaluates relevance based on a CV, and applies automatically. | 8 | | `online_coding_agent.py` | Implements a multi-agent system for online code editors, with separate agents for coding and execution. | 9 | | `post-twitter.py` | Provides a template for automated posting on X (Twitter), including new tweets, tagging, and replies. 
| 10 | | `scrolling_page.py` | Automates webpage scrolling with various scrolling actions and text search functionality. | 11 | | `twitter_post_using_cookies.py` | Automates posting on X (Twitter) using stored authentication cookies. | 12 | | `web_voyager_agent.py` | A general-purpose web navigation agent for tasks like flight booking and course searching. | 13 | -------------------------------------------------------------------------------- /utils/browser-use/tests/ci/test_browser_session_via_cdp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from browser_use.browser import BrowserSession 4 | from browser_use.browser.profile import BrowserProfile 5 | from browser_use.browser.types import async_playwright 6 | 7 | 8 | async def test_connection_via_cdp(): 9 | browser_session = BrowserSession( 10 | cdp_url='http://localhost:9898', 11 | browser_profile=BrowserProfile( 12 | headless=True, 13 | keep_alive=True, 14 | ), 15 | ) 16 | with pytest.raises(Exception) as e: 17 | await browser_session.start() 18 | 19 | # Assert on the exception value outside the context manager 20 | assert 'ECONNREFUSED' in str(e.value) 21 | 22 | playwright = await async_playwright().start() 23 | browser = await playwright.chromium.launch(args=['--remote-debugging-port=9898']) 24 | 25 | async with await browser_session.start(): 26 | await browser_session.create_new_tab() 27 | 28 | assert (await browser_session.get_current_page()).url == 'about:blank' 29 | 30 | await browser.close() 31 | 32 | await browser_session.kill() 33 | await playwright.stop() 34 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_dropdown_error.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | from browser_use.browser import BrowserProfile, BrowserSession 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | 14 | from browser_use import Agent, AgentHistoryList 15 | from browser_use.llm import ChatOpenAI 16 | 17 | llm = ChatOpenAI(model='gpt-4.1') 18 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True)) 19 | 20 | agent = Agent( 21 | task=('go to https://codepen.io/shyam-king/pen/emOyjKm and select number "4" and return the output of "selected value"'), 22 | llm=llm, 23 | browser_session=browser_session, 24 | ) 25 | 26 | 27 | async def test_dropdown(): 28 | await browser_session.start() 29 | try: 30 | history: AgentHistoryList = await agent.run(20) 31 | 32 | result = history.final_result() 33 | assert result is not None 34 | assert '4' in result 35 | print(result) 36 | finally: 37 | await browser_session.stop() 38 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/README.md: -------------------------------------------------------------------------------- 1 | # Codebase Structure 2 | 3 | > The code structure inspired by https://github.com/Netflix/dispatch. 4 | 5 | Very good structure on how to make a scalable codebase is also in [this repo](https://github.com/zhanymkanov/fastapi-best-practices). 6 | 7 | Just a brief document about how we should structure our backend codebase. 
8 | 9 | ## Code Structure 10 | 11 | ```markdown 12 | src/ 13 | // 14 | models.py 15 | services.py 16 | prompts.py 17 | views.py 18 | utils.py 19 | routers.py 20 | 21 | /_/ 22 | ``` 23 | 24 | ### Service.py 25 | 26 | Always a single file, except if it becomes too long - more than ~500 lines, split it into \_subservices 27 | 28 | ### Views.py 29 | 30 | Always split the views into two parts 31 | 32 | ```python 33 | # All 34 | ... 35 | 36 | # Requests 37 | ... 38 | 39 | # Responses 40 | ... 41 | ``` 42 | 43 | If too long → split into multiple files 44 | 45 | ### Prompts.py 46 | 47 | Single file; if too long → split into multiple files (one prompt per file or so) 48 | 49 | ### Routers.py 50 | 51 | Never split into more than one file 52 | -------------------------------------------------------------------------------- /utils/browser-use/examples/file_system/excel_sheet.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | from browser_use.llm.openai.chat import ChatOpenAI 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | from lmnr import Laminar 13 | 14 | try: 15 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 16 | except Exception: 17 | pass 18 | 19 | from browser_use import Agent 20 | 21 | # Initialize the model 22 | llm = ChatOpenAI( 23 | model='o4-mini', 24 | temperature=1.0, 25 | ) 26 | 27 | 28 | task = ( 29 | 'Find current stock price of companies Meta and Amazon. Then, make me a CSV file with 2 columns: company name, stock price.' 30 | ) 31 | 32 | agent = Agent(task=task, llm=llm) 33 | 34 | 35 | async def main(): 36 | import time 37 | 38 | start_time = time.time() 39 | history = await agent.run() 40 | # token usage 41 | print(history.usage) 42 | end_time = time.time() 43 | print(f'Time taken: {end_time - start_time} seconds') 44 | 45 | 46 | if __name__ == '__main__': 47 | asyncio.run(main()) 48 | -------------------------------------------------------------------------------- /utils/browser-use/.github/SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Reporting Security Issues 2 | 3 | If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure. 4 | 5 | **Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.** 6 | 7 | Instead, please open a new [Github security advisory](https://github.com/browser-use/browser-use/security/advisories/new). 8 | 9 | Please include as much of the information listed below as you can to help me better understand and resolve the issue: 10 | 11 | * The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting) 12 | * Full paths of source file(s) related to the manifestation of the issue 13 | * The location of the affected source code (tag/branch/commit or direct URL) 14 | * Any special configuration required to reproduce the issue 15 | * Step-by-step instructions to reproduce the issue 16 | * Proof-of-concept or exploit code (if possible) 17 | * Impact of the issue, including how an attacker might exploit the issue 18 | 19 | This information will help me triage your report more quickly. 
20 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/captcha.py: -------------------------------------------------------------------------------- 1 | """ 2 | Goal: Automates CAPTCHA solving on a demo website. 3 | 4 | 5 | Simple try of the agent. 6 | @dev You need to add OPENAI_API_KEY to your environment variables. 7 | NOTE: captchas are hard. For this example it works. But e.g. for iframes it does not. 8 | for this example it helps to zoom in. 9 | """ 10 | 11 | import asyncio 12 | import os 13 | import sys 14 | 15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 16 | 17 | from dotenv import load_dotenv 18 | 19 | load_dotenv() 20 | 21 | from browser_use import Agent 22 | from browser_use.llm import ChatOpenAI 23 | 24 | if not os.getenv('OPENAI_API_KEY'): 25 | raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') 26 | 27 | 28 | async def main(): 29 | llm = ChatOpenAI(model='gpt-4.1') 30 | agent = Agent( 31 | task='go to https://captcha.com/demos/features/captcha-demo.aspx and solve the captcha', 32 | llm=llm, 33 | ) 34 | await agent.run() 35 | input('Press Enter to exit') 36 | 37 | 38 | if __name__ == '__main__': 39 | asyncio.run(main()) 40 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/novita.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add NOVITA_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | 18 | from browser_use import Agent 19 | from browser_use.llm import ChatOpenAI 20 | 21 | api_key = os.getenv('NOVITA_API_KEY', '') 22 | if not api_key: 23 | raise ValueError('NOVITA_API_KEY is not set') 24 | 25 | 26 | async def run_search(): 27 | agent = Agent( 28 | task=( 29 | '1. Go to https://www.reddit.com/r/LocalLLaMA ' 30 | "2. Search for 'browser use' in the search bar" 31 | '3. Click on first result' 32 | '4. 
Return the first comment' 33 | ), 34 | llm=ChatOpenAI( 35 | base_url='https://api.novita.ai/v3/openai', 36 | model='deepseek/deepseek-v3-0324', 37 | api_key=api_key, 38 | ), 39 | use_vision=False, 40 | ) 41 | 42 | await agent.run() 43 | 44 | 45 | if __name__ == '__main__': 46 | asyncio.run(run_search()) 47 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/wikipedia_banana_to_quantum.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | # video https://preview.screen.studio/share/vuq91Ej8 16 | llm = ChatOpenAI( 17 | model='gpt-4.1', 18 | temperature=0.0, 19 | ) 20 | task = 'go to https://en.wikipedia.org/wiki/Banana and click on buttons on the wikipedia page to go as fast as possible from banna to Quantum mechanics' 21 | 22 | browser_session = BrowserSession( 23 | browser_profile=BrowserProfile( 24 | viewport_expansion=-1, 25 | highlight_elements=False, 26 | user_data_dir='~/.config/browseruse/profiles/default', 27 | ), 28 | ) 29 | agent = Agent(task=task, llm=llm, browser_session=browser_session, use_vision=False) 30 | 31 | 32 | async def main(): 33 | await agent.run() 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /utils/browser-use/examples/browser/real_browser.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | browser_profile = BrowserProfile( 16 | # NOTE: you need to close your chrome browser - so that this can open your browser in debug mode 17 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 18 | user_data_dir='~/.config/browseruse/profiles/default', 19 | headless=False, 20 | ) 21 | browser_session = BrowserSession(browser_profile=browser_profile) 22 | 23 | 24 | async def main(): 25 | agent = Agent( 26 | task='Find todays DOW stock price', 27 | llm=ChatOpenAI(model='gpt-4.1'), 28 | browser_session=browser_session, 29 | ) 30 | 31 | await agent.run() 32 | await browser_session.close() 33 | 34 | input('Press Enter to close...') 35 | 36 | 37 | if __name__ == '__main__': 38 | asyncio.run(main()) 39 | -------------------------------------------------------------------------------- /utils/browser-use/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Gregor Zunic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the 
Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/telemetry/views.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections.abc import Sequence 3 | from dataclasses import asdict, dataclass 4 | from typing import Any 5 | 6 | 7 | @dataclass 8 | class BaseTelemetryEvent(ABC): 9 | @property 10 | @abstractmethod 11 | def name(self) -> str: 12 | pass 13 | 14 | @property 15 | def properties(self) -> dict[str, Any]: 16 | return {k: v for k, v in asdict(self).items() if k != 'name'} 17 | 18 | 19 | @dataclass 20 | class AgentTelemetryEvent(BaseTelemetryEvent): 21 | # start details 22 | task: str 23 | model: str 24 | model_provider: str 25 | planner_llm: str | None 26 | max_steps: int 27 | max_actions_per_step: int 28 | use_vision: bool 29 | use_validation: bool 30 | version: str 31 | source: str 32 | # step details 33 | action_errors: Sequence[str | None] 34 | action_history: Sequence[list[dict] | None] 35 | urls_visited: Sequence[str | None] 36 | # end details 37 | steps: int 38 | total_input_tokens: int 39 | total_duration_seconds: float 40 | success: bool | None 41 | final_result_response: str | None 42 | error_message: str | None 43 | 44 | name: str = 'agent_event' 45 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/integrations/gmail/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Gmail Integration for Browser Use 3 | Provides Gmail API integration for email reading and verification code extraction. 4 | This integration enables agents to read email content and extract verification codes themselves. 
5 | Usage: 6 | from browser_use.integrations.gmail import GmailService, register_gmail_actions 7 | # Option 1: Register Gmail actions with file-based authentication 8 | controller = Controller() 9 | register_gmail_actions(controller) 10 | # Option 2: Register Gmail actions with direct access token (recommended for production) 11 | controller = Controller() 12 | register_gmail_actions(controller, access_token="your_access_token_here") 13 | # Option 3: Use the service directly 14 | gmail = GmailService(access_token="your_access_token_here") 15 | await gmail.authenticate() 16 | emails = await gmail.get_recent_emails() 17 | """ 18 | 19 | # @file purpose: Gmail integration for 2FA email authentication and email reading 20 | 21 | from .actions import register_gmail_actions 22 | from .service import GmailService 23 | 24 | __all__ = ['GmailService', 'register_gmail_actions'] 25 | -------------------------------------------------------------------------------- /utils/browser-use/docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker Setup for Browser-Use 2 | 3 | This directory contains the optimized Docker build system for browser-use, achieving < 30 second builds. 4 | 5 | ## Quick Start 6 | 7 | ```bash 8 | # Build base images (only needed once or when dependencies change) 9 | ./docker/build-base-images.sh 10 | 11 | # Build browser-use 12 | docker build -f Dockerfile.fast -t browseruse . 13 | 14 | # Or use the standard Dockerfile (slower but self-contained) 15 | docker build -t browseruse . 16 | ``` 17 | 18 | ## Files 19 | 20 | - `Dockerfile` - Standard self-contained build (~2 min) 21 | - `Dockerfile.fast` - Fast build using pre-built base images (~30 sec) 22 | - `docker/` - Base image definitions and build script 23 | - `base-images/system/` - Python + minimal system deps 24 | - `base-images/chromium/` - Adds Chromium browser 25 | - `base-images/python-deps/` - Adds Python dependencies 26 | - `build-base-images.sh` - Script to build all base images 27 | 28 | ## Performance 29 | 30 | | Build Type | Time | 31 | |------------|------| 32 | | Standard Dockerfile | ~2 minutes | 33 | | Fast build (with base images) | ~30 seconds | 34 | | Rebuild after code change | ~16 seconds | 35 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/screenshot_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | 4 | import pytest 5 | 6 | from browser_use.browser import BrowserProfile, BrowserSession 7 | 8 | 9 | async def test_take_full_page_screenshot(): 10 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, disable_security=True)) 11 | await browser_session.start() 12 | try: 13 | page = await browser_session.get_current_page() 14 | # Go to a test page 15 | await page.goto('https://example.com') 16 | 17 | await asyncio.sleep(3) 18 | # Take full page screenshot 19 | screenshot_b64 = await browser_session.take_screenshot(full_page=True) 20 | await asyncio.sleep(3) 21 | # Verify screenshot is not empty and is valid base64 22 | assert screenshot_b64 is not None 23 | assert isinstance(screenshot_b64, str) 24 | assert len(screenshot_b64) > 0 25 | 26 | # Test we can decode the base64 string 27 | try: 28 | base64.b64decode(screenshot_b64) 29 | except Exception as e: 30 | pytest.fail(f'Failed to decode base64 screenshot: {str(e)}') 31 | finally: 32 | await browser_session.stop() 33 | 34 | 35 | if __name__ == 
'__main__': 36 | asyncio.run(test_take_full_page_screenshot()) 37 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_gif_path.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | from browser_use.browser import BrowserProfile, BrowserSession 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | 14 | from browser_use import Agent, AgentHistoryList 15 | from browser_use.llm import ChatOpenAI 16 | 17 | llm = ChatOpenAI(model='gpt-4.1') 18 | 19 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, disable_security=True)) 20 | 21 | agent = Agent( 22 | task=('go to google.com and search for text "hi there"'), 23 | llm=llm, 24 | browser_session=browser_session, 25 | generate_gif='./google.gif', 26 | ) 27 | 28 | 29 | async def test_gif_path(): 30 | if os.path.exists('./google.gif'): 31 | os.unlink('./google.gif') 32 | 33 | await browser_session.start() 34 | try: 35 | history: AgentHistoryList = await agent.run(20) 36 | 37 | result = history.final_result() 38 | assert result is not None 39 | 40 | assert os.path.exists('./google.gif'), 'google.gif was not created' 41 | finally: 42 | await browser_session.stop() 43 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/gemini.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | from lmnr import Laminar 9 | 10 | load_dotenv() 11 | 12 | Laminar.initialize() 13 | 14 | 15 | from browser_use import Agent 16 | from browser_use.browser import BrowserProfile, BrowserSession 17 | from browser_use.llm import ChatGoogle 18 | 19 | api_key = os.getenv('GOOGLE_API_KEY') 20 | if not api_key: 21 | raise ValueError('GOOGLE_API_KEY is not set') 22 | 23 | llm = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 24 | 25 | browser_session = BrowserSession( 26 | browser_profile=BrowserProfile( 27 | viewport_expansion=0, 28 | user_data_dir='~/.config/browseruse/profiles/default', 29 | ) 30 | ) 31 | 32 | 33 | async def run_search(): 34 | agent = Agent( 35 | task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result', 36 | llm=llm, 37 | max_actions_per_step=4, 38 | browser_session=browser_session, 39 | ) 40 | 41 | await agent.run(max_steps=25) 42 | 43 | 44 | if __name__ == '__main__': 45 | asyncio.run(run_search()) 46 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/restrict_urls.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | llm = ChatOpenAI(model='gpt-4.1', temperature=0.0) 16 | task = ( 17 | "go to google.com and search for openai.com and click 
on the first link then extract content and scroll down - what's there?" 18 | ) 19 | 20 | allowed_domains = ['google.com'] 21 | 22 | browser_session = BrowserSession( 23 | browser_profile=BrowserProfile( 24 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 25 | allowed_domains=allowed_domains, 26 | user_data_dir='~/.config/browseruse/profiles/default', 27 | ), 28 | ) 29 | 30 | agent = Agent( 31 | task=task, 32 | llm=llm, 33 | browser_session=browser_session, 34 | ) 35 | 36 | 37 | async def main(): 38 | await agent.run(max_steps=25) 39 | 40 | input('Press Enter to close the browser...') 41 | await browser_session.close() 42 | 43 | 44 | asyncio.run(main()) 45 | -------------------------------------------------------------------------------- /utils/browser-use/examples/browser/multiple_agents_same_browser.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | from browser_use import Agent 13 | from browser_use.browser.profile import BrowserProfile 14 | from browser_use.browser.session import BrowserSession 15 | from browser_use.llm import ChatOpenAI 16 | 17 | 18 | async def main(): 19 | browser_session = BrowserSession( 20 | browser_profile=BrowserProfile( 21 | keep_alive=True, 22 | user_data_dir=None, 23 | headless=False, 24 | ) 25 | ) 26 | await browser_session.start() 27 | 28 | current_agent = None 29 | llm = ChatOpenAI(model='gpt-4.1') 30 | 31 | task1 = 'find todays weather on San Francisco and extract it as json' 32 | task2 = 'find todays weather in Zurich and extract it as json' 33 | 34 | agent1 = Agent( 35 | task=task1, 36 | browser_session=browser_session, 37 | llm=llm, 38 | ) 39 | agent2 = Agent( 40 | task=task2, 41 | browser_session=browser_session, 42 | llm=llm, 43 | ) 44 | 45 | await asyncio.gather(agent1.run(), agent2.run()) 46 | await browser_session.kill() 47 | 48 | 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/download_file.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | from browser_use import Agent 13 | from browser_use.browser import BrowserSession 14 | from browser_use.llm import ChatGoogle 15 | 16 | api_key = os.getenv('GOOGLE_API_KEY') 17 | if not api_key: 18 | raise ValueError('GOOGLE_API_KEY is not set') 19 | 20 | assert api_key is not None, 'GOOGLE_API_KEY must be set' 21 | llm = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 22 | 23 | from browser_use.browser import BrowserProfile 24 | 25 | browser_session = BrowserSession( 26 | browser_profile=BrowserProfile( 27 | downloads_path='~/Downloads', 28 | user_data_dir='~/.config/browseruse/profiles/default', 29 | ) 30 | ) 31 | 32 | 33 | async def run_download(): 34 | agent = Agent( 35 | task='Go to "https://file-examples.com/" and download the smallest doc file.', 36 | llm=llm, 37 | max_actions_per_step=8, 38 | use_vision=True, 39 | browser_session=browser_session, 40 | ) 41 | await agent.run(max_steps=25) 42 | 43 | 44 | if __name__ == '__main__': 45 | 
asyncio.run(run_download()) 46 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_dropdown.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test dropdown interaction functionality. 3 | """ 4 | 5 | import pytest 6 | 7 | from browser_use.agent.service import Agent 8 | from browser_use.agent.views import AgentHistoryList 9 | 10 | 11 | async def test_dropdown(llm, browser_session): 12 | """Test selecting an option from a dropdown menu.""" 13 | agent = Agent( 14 | task=( 15 | 'go to https://codepen.io/geheimschriftstift/pen/mPLvQz and first get all options for the dropdown and then select the 5th option' 16 | ), 17 | llm=llm, 18 | browser_session=browser_session, 19 | ) 20 | 21 | try: 22 | history: AgentHistoryList = await agent.run(20) 23 | result = history.final_result() 24 | 25 | # Verify dropdown interaction 26 | assert result is not None 27 | assert 'Duck' in result, "Expected 5th option 'Duck' to be selected" 28 | 29 | # Verify dropdown state 30 | page = await browser_session.get_current_page() 31 | element = await page.query_selector('select') 32 | assert element is not None, 'Dropdown element should exist' 33 | 34 | value = await element.evaluate('el => el.value') 35 | assert value == '5', 'Dropdown should have 5th option selected' 36 | 37 | except Exception as e: 38 | pytest.fail(f'Dropdown test failed: {str(e)}') 39 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_react_dropdown.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | from browser_use.browser import BrowserProfile, BrowserSession 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | import asyncio 14 | 15 | from browser_use import Agent, AgentHistoryList 16 | from browser_use.llm import ChatOpenAI 17 | 18 | llm = ChatOpenAI(model='gpt-4.1') 19 | 20 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, disable_security=True)) 21 | 22 | agent = Agent( 23 | task=( 24 | 'go to https://codepen.io/shyam-king/pen/ByBJoOv and select "Tiger" dropdown and read the text given in "Selected Animal" box (it can be empty as well)' 25 | ), 26 | llm=llm, 27 | browser_session=browser_session, 28 | ) 29 | 30 | 31 | async def test_dropdown(): 32 | await browser_session.start() 33 | try: 34 | history: AgentHistoryList = await agent.run(10) 35 | 36 | result = history.final_result() 37 | assert result is not None 38 | print('result: ', result) 39 | finally: 40 | await browser_session.stop() 41 | 42 | 43 | if __name__ == '__main__': 44 | asyncio.run(test_dropdown()) 45 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/telemetry.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Telemetry" 3 | description: "Understanding Browser Use's telemetry and privacy settings" 4 | icon: "chart-mixed" 5 | --- 6 | 7 | ## Overview 8 | 9 | Browser Use collects anonymous usage data to help us understand how the library is being used and to improve the user experience. It also helps us fix bugs faster and prioritize feature development. 
10 | 11 | ## Data Collection 12 | 13 | We use [PostHog](https://posthog.com) for telemetry collection. The data is completely anonymized and contains no personally identifiable information. 14 | 15 | 16 | We never collect personal information, credentials, or specific content from 17 | your browser automation tasks. 18 | 19 | 20 | ## Opting Out 21 | 22 | You can disable telemetry by setting an environment variable: 23 | 24 | ```bash .env 25 | ANONYMIZED_TELEMETRY=false 26 | ``` 27 | 28 | Or in your Python code: 29 | 30 | ```python 31 | import os 32 | os.environ["ANONYMIZED_TELEMETRY"] = "false" 33 | ``` 34 | 35 | 36 | Even when enabled, telemetry has zero impact on the library's performance or 37 | functionality. Code is available in [Telemetry 38 | Service](https://github.com/browser-use/browser-use/tree/main/browser_use/telemetry). 39 | 40 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/custom_system_prompt.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | try: 13 | from lmnr import Laminar 14 | 15 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 16 | except Exception as e: 17 | print(f'Error initializing Laminar: {e}') 18 | 19 | 20 | from browser_use import Agent 21 | from browser_use.llm import ChatOpenAI 22 | 23 | extend_system_message = ( 24 | 'REMEMBER the most important RULE: ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!!' 25 | ) 26 | 27 | # or use override_system_message to completely override the system prompt 28 | 29 | 30 | async def main(): 31 | task = 'do google search to find images of Elon Musk' 32 | model = ChatOpenAI(model='gpt-4.1') 33 | agent = Agent(task=task, llm=model, extend_system_message=extend_system_message) 34 | 35 | print( 36 | json.dumps( 37 | agent.message_manager.system_prompt.model_dump(exclude_unset=True), 38 | indent=4, 39 | ) 40 | ) 41 | 42 | await agent.run() 43 | 44 | 45 | if __name__ == '__main__': 46 | asyncio.run(main()) 47 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - stable 7 | - 'releases/**' 8 | tags: 9 | - '*' 10 | pull_request: 11 | workflow_dispatch: 12 | 13 | jobs: 14 | lint-syntax: 15 | name: syntax-errors 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: astral-sh/setup-uv@v5 20 | with: 21 | enable-cache: true 22 | - run: uv run ruff check --no-fix --select PLE 23 | 24 | lint-style: 25 | name: code-style 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v4 29 | - uses: astral-sh/setup-uv@v5 30 | with: 31 | enable-cache: true 32 | - run: uv sync --dev --all-extras # install extras for examples to avoid pyright missing imports errors 33 | - run: uv run pre-commit run --all-files --show-diff-on-failure 34 | 35 | lint-typecheck: 36 | name: type-checker 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/checkout@v4 40 | - uses: astral-sh/setup-uv@v6 41 | with: 42 | enable-cache: true 43 | - run: uv sync --dev --all-extras # install extras for examples to 
avoid pyright missing imports errors- 44 | - run: uv run pyright 45 | -------------------------------------------------------------------------------- /utils/browser-use/Dockerfile.fast: -------------------------------------------------------------------------------- 1 | # Fast Dockerfile using pre-built base images 2 | ARG REGISTRY=browseruse 3 | ARG BASE_TAG=latest 4 | FROM ${REGISTRY}/base-python-deps:${BASE_TAG} 5 | 6 | LABEL name="browseruse" description="Browser automation for AI agents" 7 | 8 | ENV BROWSERUSE_USER="browseruse" DEFAULT_PUID=911 DEFAULT_PGID=911 DATA_DIR=/data 9 | 10 | # Create user and directories 11 | RUN groupadd --system $BROWSERUSE_USER && \ 12 | useradd --system --create-home --gid $BROWSERUSE_USER --groups audio,video $BROWSERUSE_USER && \ 13 | usermod -u "$DEFAULT_PUID" "$BROWSERUSE_USER" && \ 14 | groupmod -g "$DEFAULT_PGID" "$BROWSERUSE_USER" && \ 15 | mkdir -p /data /home/$BROWSERUSE_USER/.config && \ 16 | ln -s $DATA_DIR /home/$BROWSERUSE_USER/.config/browseruse && \ 17 | mkdir -p "/home/$BROWSERUSE_USER/.config/chromium/Crash Reports/pending/" && \ 18 | mkdir -p "$DATA_DIR/profiles/default" && \ 19 | chown -R "$BROWSERUSE_USER:$BROWSERUSE_USER" "/home/$BROWSERUSE_USER" "$DATA_DIR" 20 | 21 | WORKDIR /app 22 | COPY . /app 23 | 24 | # Install browser-use 25 | RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \ 26 | uv sync --all-extras --locked --no-dev --compile-bytecode 27 | 28 | USER "$BROWSERUSE_USER" 29 | VOLUME "$DATA_DIR" 30 | EXPOSE 9242 9222 31 | ENTRYPOINT ["browser-use"] 32 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/follow_up_tasks.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent, Controller 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | # Initialize the model 16 | llm = ChatOpenAI( 17 | model='gpt-4.1', 18 | temperature=0.0, 19 | ) 20 | # Get your chrome path 21 | browser_session = BrowserSession( 22 | browser_profile=BrowserProfile( 23 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 24 | keep_alive=True, 25 | user_data_dir='~/.config/browseruse/profiles/default', 26 | ), 27 | ) 28 | 29 | controller = Controller() 30 | 31 | 32 | task = 'Find the founders of browser-use and draft them a short personalized message' 33 | 34 | agent = Agent(task=task, llm=llm, controller=controller, browser_session=browser_session) 35 | 36 | 37 | async def main(): 38 | await agent.run() 39 | 40 | # new_task = input('Type in a new task: ') 41 | new_task = 'Find an image of the founders' 42 | 43 | agent.add_new_task(new_task) 44 | 45 | await agent.run() 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/notification.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import 
ActionResult, Agent, Controller 12 | from browser_use.llm import ChatOpenAI 13 | 14 | controller = Controller() 15 | 16 | 17 | @controller.registry.action('Done with task ') 18 | async def done(text: str): 19 | import yagmail # type: ignore 20 | 21 | # To send emails use 22 | # STEP 1: go to https://support.google.com/accounts/answer/185833 23 | # STEP 2: Create an app password (you can't use here your normal gmail password) 24 | # STEP 3: Use the app password in the code below for the password 25 | yag = yagmail.SMTP('your_email@gmail.com', 'your_app_password') 26 | yag.send( 27 | to='recipient@example.com', 28 | subject='Test Email', 29 | contents=f'result\n: {text}', 30 | ) 31 | 32 | return ActionResult(is_done=True, extracted_content='Email sent!') 33 | 34 | 35 | async def main(): 36 | task = 'go to brower-use.com and then done' 37 | model = ChatOpenAI(model='gpt-4.1') 38 | agent = Agent(task=task, llm=model, controller=controller) 39 | 40 | await agent.run() 41 | 42 | 43 | if __name__ == '__main__': 44 | asyncio.run(main()) 45 | -------------------------------------------------------------------------------- /utils/browser-use/examples/mcp/simple_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple example of using MCP client with browser-use. 3 | 4 | This example shows how to connect to an MCP server and use its tools with an agent. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | 10 | from browser_use import Agent, Controller 11 | from browser_use.llm.openai.chat import ChatOpenAI 12 | from browser_use.mcp.client import MCPClient 13 | 14 | 15 | async def main(): 16 | # Initialize controller 17 | controller = Controller() 18 | 19 | # Connect to a filesystem MCP server 20 | # This server provides tools to read/write files in a directory 21 | mcp_client = MCPClient( 22 | server_name='filesystem', command='npx', args=['@modelcontextprotocol/server-filesystem', os.path.expanduser('~/Desktop')] 23 | ) 24 | 25 | # Connect and register MCP tools 26 | await mcp_client.connect() 27 | await mcp_client.register_to_controller(controller) 28 | 29 | # Create agent with MCP-enabled controller 30 | agent = Agent( 31 | task='List all files on the Desktop and read the content of any .txt files you find', 32 | llm=ChatOpenAI(model='gpt-4o'), 33 | controller=controller, 34 | ) 35 | 36 | # Run the agent - it now has access to filesystem tools 37 | await agent.run() 38 | 39 | # Disconnect when done 40 | await mcp_client.disconnect() 41 | 42 | 43 | if __name__ == '__main__': 44 | asyncio.run(main()) 45 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/azure_openai.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT to your environment variables. 
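(AZURE_OPENAI_ENDPOINT should be the base URL of your Azure OpenAI resource, e.g. https://<your-resource-name>.openai.azure.com/.)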
5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | 18 | from browser_use import Agent 19 | from browser_use.llm import ChatAzureOpenAI 20 | 21 | # Retrieve Azure-specific environment variables 22 | azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY') 23 | azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') 24 | 25 | if not azure_openai_api_key or not azure_openai_endpoint: 26 | raise ValueError('AZURE_OPENAI_KEY or AZURE_OPENAI_ENDPOINT is not set') 27 | 28 | # Initialize the Azure OpenAI client 29 | llm = ChatAzureOpenAI( 30 | model='gpt-4.1', 31 | api_key=azure_openai_api_key, 32 | azure_endpoint=azure_openai_endpoint, # Corrected to use azure_endpoint instead of openai_api_base 33 | ) 34 | 35 | agent = Agent( 36 | task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result', 37 | llm=llm, 38 | ) 39 | 40 | 41 | async def main(): 42 | await agent.run(max_steps=10) 43 | input('Press Enter to continue...') 44 | 45 | 46 | asyncio.run(main()) 47 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/utils.py: -------------------------------------------------------------------------------- 1 | def normalize_url(url: str) -> str: 2 | """ 3 | Normalize a URL by adding https:// protocol if needed, while preserving special URLs. 4 | 5 | This function safely adds https:// to URLs that lack a protocol, but preserves 6 | special URLs like "about:blank", "mailto:...", "tel:...", etc. that should not 7 | be prefixed with https://. 8 | 9 | Args: 10 | url: The URL string to normalize 11 | 12 | Returns: 13 | str: The normalized URL with protocol if needed 14 | 15 | Examples: 16 | >>> normalize_url('example.com') 17 | 'https://example.com' 18 | >>> normalize_url('about:blank') 19 | 'about:blank' 20 | >>> normalize_url('mailto:test@example.com') 21 | 'mailto:test@example.com' 22 | >>> normalize_url('https://example.com') 23 | 'https://example.com' 24 | """ 25 | normalized_url = url.strip() 26 | 27 | # If URL already has a protocol, return as-is 28 | if '://' in normalized_url: 29 | return normalized_url 30 | 31 | # Check for special protocols that should not be prefixed with https:// 32 | special_protocols = ['about:', 'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'] 33 | for protocol in special_protocols: 34 | if normalized_url.startswith(protocol): 35 | return normalized_url 36 | 37 | # For everything else, add https:// 38 | return f'https://{normalized_url}' 39 | -------------------------------------------------------------------------------- /utils/browser-use/docs/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/save_to_file_hugging_face.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from pydantic import BaseModel 12 | 13 | from browser_use.agent.service import Agent 14 | from browser_use.controller.service 
import Controller 15 | from browser_use.llm import ChatOpenAI 16 | 17 | # Initialize controller first 18 | controller = Controller() 19 | 20 | 21 | class Model(BaseModel): 22 | title: str 23 | url: str 24 | likes: int 25 | license: str 26 | 27 | 28 | class Models(BaseModel): 29 | models: list[Model] 30 | 31 | 32 | @controller.action('Save models', param_model=Models) 33 | def save_models(params: Models): 34 | with open('models.txt', 'a') as f: 35 | for model in params.models: 36 | f.write(f'{model.title} ({model.url}): {model.likes} likes, {model.license}\n') 37 | 38 | 39 | # video: https://preview.screen.studio/share/EtOhIk0P 40 | async def main(): 41 | task = 'Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file.' 42 | 43 | model = ChatOpenAI(model='gpt-4.1') 44 | agent = Agent(task=task, llm=model, controller=controller) 45 | 46 | await agent.run() 47 | 48 | 49 | if __name__ == '__main__': 50 | asyncio.run(main()) 51 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/views.py: -------------------------------------------------------------------------------- 1 | from typing import Generic, TypeVar, Union 2 | 3 | from pydantic import BaseModel 4 | 5 | T = TypeVar('T', bound=Union[BaseModel, str]) 6 | 7 | 8 | class ChatInvokeUsage(BaseModel): 9 | """ 10 | Usage information for a chat model invocation. 11 | """ 12 | 13 | prompt_tokens: int 14 | """The number of tokens in the prompt (this includes the cached tokens as well. When calculating the cost, subtract the cached tokens from the prompt tokens)""" 15 | 16 | prompt_cached_tokens: int | None 17 | """The number of cached tokens.""" 18 | 19 | prompt_cache_creation_tokens: int | None 20 | """Anthropic only: The number of tokens used to create the cache.""" 21 | 22 | prompt_image_tokens: int | None 23 | """Google only: The number of tokens in the image (prompt tokens is the text tokens + image tokens in that case)""" 24 | 25 | completion_tokens: int 26 | """The number of tokens in the completion.""" 27 | 28 | total_tokens: int 29 | """The total number of tokens in the response.""" 30 | 31 | 32 | class ChatInvokeCompletion(BaseModel, Generic[T]): 33 | """ 34 | Response from a chat model invocation. 35 | """ 36 | 37 | completion: T 38 | """The completion of the response.""" 39 | 40 | # Thinking stuff 41 | thinking: str | None = None 42 | redacted_thinking: str | None = None 43 | 44 | usage: ChatInvokeUsage | None 45 | """The usage of the response.""" 46 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/drag_drop.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | from browser_use import Agent 13 | from browser_use.llm import ChatGoogle 14 | 15 | api_key = os.getenv('GOOGLE_API_KEY') 16 | if not api_key: 17 | raise ValueError('GOOGLE_API_KEY is not set') 18 | 19 | # API key is automatically set from the environment variable GOOGLE_API_KEY 20 | llm = ChatGoogle(model='gemini-2.0-flash-exp') 21 | 22 | 23 | task_1 = """ 24 | Navigate to: https://sortablejs.github.io/Sortable/. 25 | Then scroll down to the first examplw with title "Simple list example". 
26 | Drag the element with name "item 1" to below the element with name "item 3". 27 | """ 28 | 29 | 30 | task_2 = """ 31 | Navigate to: https://excalidraw.com/. 32 | Click on the pencil icon (with index 40). 33 | Then draw a triangle in the canvas. 34 | Draw the triangle starting from coordinate (400,400). 35 | You can use the drag and drop action to draw the triangle. 36 | """ 37 | 38 | 39 | async def run_search(): 40 | agent = Agent( 41 | task=task_1, 42 | llm=llm, 43 | max_actions_per_step=1, 44 | use_vision=True, 45 | ) 46 | 47 | await agent.run(max_steps=25) 48 | 49 | 50 | if __name__ == '__main__': 51 | asyncio.run(run_search()) 52 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/twitter_post_using_cookies.py: -------------------------------------------------------------------------------- 1 | # Goal: Automates posting on X (Twitter) using stored authentication cookies. 2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | 14 | from browser_use import Agent 15 | from browser_use.browser import BrowserProfile, BrowserSession 16 | from browser_use.llm import ChatGoogle 17 | 18 | api_key = os.getenv('GOOGLE_API_KEY') 19 | if not api_key: 20 | raise ValueError('GOOGLE_API_KEY is not set') 21 | 22 | llm = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 23 | 24 | 25 | browser_session = BrowserSession( 26 | browser_profile=BrowserProfile( 27 | user_data_dir='~/.config/browseruse/profiles/default', 28 | # headless=False, # Uncomment to see the browser 29 | # executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 30 | ) 31 | ) 32 | 33 | 34 | async def main(): 35 | agent = Agent( 36 | browser_session=browser_session, 37 | task=('go to https://x.com. write a new post with the text "browser-use ftw", and submit it'), 38 | llm=llm, 39 | max_actions_per_step=4, 40 | ) 41 | await agent.run(max_steps=25) 42 | input('Press Enter to close the browser...') 43 | 44 | 45 | if __name__ == '__main__': 46 | asyncio.run(main()) 47 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/validate_output.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demonstrate output validator. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
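The custom done() action below deliberately returns an invalid result so that a run with validate_output=True demonstrates the validator rejecting the output.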
5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from pydantic import BaseModel 18 | 19 | from browser_use import ActionResult, Agent, Controller 20 | from browser_use.llm import ChatOpenAI 21 | 22 | controller = Controller() 23 | 24 | 25 | class DoneResult(BaseModel): 26 | title: str 27 | comments: str 28 | hours_since_start: int 29 | 30 | 31 | # we overwrite done() in this example to demonstrate the validator 32 | @controller.registry.action('Done with task', param_model=DoneResult) 33 | async def done(params: DoneResult): 34 | result = ActionResult(is_done=True, extracted_content=params.model_dump_json()) 35 | print(result) 36 | # NOTE: this is clearly wrong - to demonstrate the validator 37 | return 'blablabla' 38 | 39 | 40 | async def main(): 41 | task = 'Go to hackernews hn and give me the top 1 post' 42 | model = ChatOpenAI(model='gpt-4.1') 43 | agent = Agent(task=task, llm=model, controller=controller, validate_output=True) 44 | # NOTE: this should fail to demonstrate the validator 45 | await agent.run(max_steps=5) 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/parallel_agents.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use.agent.service import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | browser_session = BrowserSession( 16 | browser_profile=BrowserProfile( 17 | keep_alive=True, 18 | headless=False, 19 | record_video_dir='./tmp/recordings', 20 | user_data_dir='~/.config/browseruse/profiles/default', 21 | ) 22 | ) 23 | llm = ChatOpenAI(model='gpt-4.1') 24 | 25 | 26 | async def main(): 27 | await browser_session.start() 28 | agents = [ 29 | Agent(task=task, llm=llm, browser_session=browser_session) 30 | for task in [ 31 | 'Search Google for weather in Tokyo', 32 | 'Check Reddit front page title', 33 | 'Look up Bitcoin price on Coinbase', 34 | 'Find NASA image of the day', 35 | 'Check top story on CNN', 36 | # 'Search latest SpaceX launch date', 37 | # 'Look up population of Paris', 38 | # 'Find current time in Sydney', 39 | # 'Check who won last Super Bowl', 40 | # 'Search trending topics on Twitter', 41 | ] 42 | ] 43 | 44 | print(await asyncio.gather(*[agent.run() for agent in agents])) 45 | await browser_session.kill() 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/cross_origin_iframes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of how it supports cross-origin iframes. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from browser_use import Agent, Controller 18 | from browser_use.browser import BrowserProfile, BrowserSession 19 | from browser_use.llm import ChatOpenAI 20 | 21 | if not os.getenv('OPENAI_API_KEY'): 22 | raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') 23 | 24 | 25 | browser_profile = BrowserProfile( 26 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 27 | ) 28 | browser_session = BrowserSession(browser_profile=browser_profile) 29 | controller = Controller() 30 | 31 | 32 | async def main(): 33 | agent = Agent( 34 | task='Click "Go cross-site (simple page)" button on https://csreis.github.io/tests/cross-site-iframe.html then tell me the text within', 35 | llm=ChatOpenAI(model='gpt-4.1', temperature=0.0), 36 | controller=controller, 37 | browser_session=browser_session, 38 | ) 39 | 40 | await agent.run() 41 | await browser_session.close() 42 | 43 | input('Press Enter to close...') 44 | 45 | 46 | if __name__ == '__main__': 47 | try: 48 | asyncio.run(main()) 49 | except Exception as e: 50 | print(e) 51 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_dropdown_complex.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test complex dropdown interaction functionality. 3 | """ 4 | 5 | import pytest 6 | 7 | from browser_use.agent.service import Agent 8 | from browser_use.agent.views import AgentHistoryList 9 | 10 | 11 | async def test_dropdown_complex(llm, browser_session): 12 | """Test selecting an option from a complex dropdown menu.""" 13 | agent = Agent( 14 | task=( 15 | 'go to https://codepen.io/shyam-king/pen/pvzpByJ and first get all options for the dropdown and then select the json option' 16 | ), 17 | llm=llm, 18 | browser_session=browser_session, 19 | ) 20 | 21 | try: 22 | history: AgentHistoryList = await agent.run(20) 23 | result = history.final_result() 24 | 25 | # Verify dropdown interaction 26 | assert result is not None 27 | assert 'json' in result.lower(), "Expected 'json' option to be selected" 28 | 29 | # Verify dropdown state 30 | page = await browser_session.get_current_page() 31 | element = await page.query_selector('.select-selected') 32 | assert element is not None, 'Custom dropdown element should exist' 33 | 34 | text = await element.text_content() 35 | assert 'json' in text.lower(), 'Dropdown should display json option' 36 | 37 | # Verify the selected option's effect 38 | code_element = await page.query_selector('pre code') 39 | assert code_element is not None, 'Code element should be visible when JSON is selected' 40 | 41 | except Exception as e: 42 | pytest.fail(f'Complex dropdown test failed: {str(e)}') 43 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/result_processing.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | from pprint import pprint 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from browser_use import Agent 13 | from browser_use.agent.views import 
AgentHistoryList 14 | from browser_use.browser import BrowserProfile, BrowserSession 15 | from browser_use.llm import ChatOpenAI 16 | 17 | llm = ChatOpenAI(model='gpt-4.1') 18 | 19 | 20 | async def main(): 21 | async with BrowserSession( 22 | browser_profile=BrowserProfile( 23 | headless=False, 24 | traces_dir='./tmp/result_processing', 25 | window_size={'width': 1280, 'height': 1000}, 26 | user_data_dir='~/.config/browseruse/profiles/default', 27 | ) 28 | ) as browser_session: 29 | agent = Agent( 30 | task="go to google.com and type 'OpenAI' click search and give me the first url", 31 | llm=llm, 32 | browser_session=browser_session, 33 | ) 34 | history: AgentHistoryList = await agent.run(max_steps=3) 35 | 36 | print('Final Result:') 37 | pprint(history.final_result(), indent=4) 38 | 39 | print('\nErrors:') 40 | pprint(history.errors(), indent=4) 41 | 42 | # e.g. xPaths the model clicked on 43 | print('\nModel Outputs:') 44 | pprint(history.model_actions(), indent=4) 45 | 46 | print('\nThoughts:') 47 | pprint(history.model_thoughts(), indent=4) 48 | 49 | 50 | if __name__ == '__main__': 51 | asyncio.run(main()) 52 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/playground/process_dom.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import time 5 | 6 | import anyio 7 | 8 | from browser_use.browser import BrowserProfile, BrowserSession 9 | 10 | 11 | async def test_process_dom(): 12 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True)) 13 | await browser_session.start() 14 | try: 15 | page = await browser_session.get_current_page() 16 | await page.goto('https://kayak.com/flights') 17 | # await page.goto('https://google.com/flights') 18 | # await page.goto('https://immobilienscout24.de') 19 | # await page.goto('https://seleniumbase.io/w3schools/iframes') 20 | 21 | await asyncio.sleep(3) 22 | 23 | async with await anyio.open_file('browser_use/dom/buildDomTree.js', 'r') as f: 24 | js_code = await f.read() 25 | 26 | start = time.time() 27 | dom_tree = await page.evaluate(js_code) 28 | end = time.time() 29 | 30 | # print(dom_tree) 31 | print(f'Time: {end - start:.2f}s') 32 | 33 | os.makedirs('./tmp', exist_ok=True) 34 | async with await anyio.open_file('./tmp/dom.json', 'w') as f: 35 | await f.write(json.dumps(dom_tree, indent=1)) 36 | 37 | # both of these work for immobilienscout24.de 38 | # await page.click('.sc-dcJsrY.ezjNCe') 39 | # await page.click( 40 | # 'div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div > div > button:nth-of-type(2)' 41 | # ) 42 | 43 | input('Press Enter to continue...') 44 | finally: 45 | await browser_session.stop() 46 | -------------------------------------------------------------------------------- /utils/browser-use/docs/customize/output-format.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Output Format" 3 | description: "The default is text. But you can define a structured output format to make post-processing easier." 4 | icon: "code" 5 | --- 6 | 7 | ## Custom output format 8 | With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py) you can define what output format the agent should return to you. 
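The snippet omits its imports for brevity; based on the full example file linked above, they would look roughly like this (the `List` import is only needed because the snippet writes `List[Post]`, whereas the example file uses the built-in `list[Post]`):

```python
# Imports the docs snippet below assumes but does not show
# (matching examples/features/custom_output.py in this repository).
import asyncio
from typing import List  # the snippet writes List[Post]; the example file uses list[Post]

from browser_use import Agent, Controller
from browser_use.llm import ChatOpenAI
```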
9 | 10 | ```python 11 | from pydantic import BaseModel 12 | # Define the output format as a Pydantic model 13 | class Post(BaseModel): 14 | post_title: str 15 | post_url: str 16 | num_comments: int 17 | hours_since_post: int 18 | 19 | 20 | class Posts(BaseModel): 21 | posts: List[Post] 22 | 23 | 24 | controller = Controller(output_model=Posts) 25 | 26 | 27 | async def main(): 28 | task = 'Go to hackernews show hn and give me the first 5 posts' 29 | model = ChatOpenAI(model='gpt-4o') 30 | agent = Agent(task=task, llm=model, controller=controller) 31 | 32 | history = await agent.run() 33 | 34 | result = history.final_result() 35 | if result: 36 | parsed: Posts = Posts.model_validate_json(result) 37 | 38 | for post in parsed.posts: 39 | print('\n--------------------------------') 40 | print(f'Title: {post.post_title}') 41 | print(f'URL: {post.post_url}') 42 | print(f'Comments: {post.num_comments}') 43 | print(f'Hours since post: {post.hours_since_post}') 44 | else: 45 | print('No result') 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | ``` 51 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/custom_output.py: -------------------------------------------------------------------------------- 1 | """ 2 | Show how to use custom outputs. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from pydantic import BaseModel 18 | 19 | from browser_use import Agent, Controller 20 | from browser_use.llm import ChatOpenAI 21 | 22 | 23 | class Post(BaseModel): 24 | post_title: str 25 | post_url: str 26 | num_comments: int 27 | hours_since_post: int 28 | 29 | 30 | class Posts(BaseModel): 31 | posts: list[Post] 32 | 33 | 34 | controller = Controller(output_model=Posts) 35 | 36 | 37 | async def main(): 38 | task = 'Go to hackernews show hn and give me the first 5 posts' 39 | model = ChatOpenAI(model='gpt-4.1') 40 | agent = Agent(task=task, llm=model, controller=controller) 41 | 42 | history = await agent.run() 43 | 44 | result = history.final_result() 45 | if result: 46 | parsed: Posts = Posts.model_validate_json(result) 47 | 48 | for post in parsed.posts: 49 | print('\n--------------------------------') 50 | print(f'Title: {post.post_title}') 51 | print(f'URL: {post.post_url}') 52 | print(f'Comments: {post.num_comments}') 53 | print(f'Hours since post: {post.hours_since_post}') 54 | else: 55 | print('No result') 56 | 57 | 58 | if __name__ == '__main__': 59 | asyncio.run(main()) 60 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/online_coding_agent.py: -------------------------------------------------------------------------------- 1 | # Goal: Implements a multi-agent system for online code editors, with separate agents for coding and execution. 2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from browser_use import Agent 14 | from browser_use.browser import BrowserSession 15 | from browser_use.llm import ChatOpenAI 16 | 17 | if not os.getenv('OPENAI_API_KEY'): 18 | raise ValueError('OPENAI_API_KEY is not set. 
Please add it to your environment variables.') 19 | 20 | 21 | async def main(): 22 | browser_session = BrowserSession() 23 | model = ChatOpenAI(model='gpt-4.1') 24 | 25 | # Initialize browser agent 26 | agent1 = Agent( 27 | task='Open an online code editor programiz.', 28 | llm=model, 29 | browser_session=browser_session, 30 | ) 31 | executor = Agent( 32 | task='Executor. Execute the code written by the coder and suggest some updates if there are errors.', 33 | llm=model, 34 | browser_session=browser_session, 35 | ) 36 | 37 | coder = Agent( 38 | task='Coder. Your job is to write and complete code. You are an expert coder. Code a simple calculator. Write the code on the coding interface after agent1 has opened the link.', 39 | llm=model, 40 | browser_session=browser_session, 41 | ) 42 | await agent1.run() 43 | await executor.run() 44 | await coder.run() 45 | 46 | 47 | if __name__ == '__main__': 48 | asyncio.run(main()) 49 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/claude.yml: -------------------------------------------------------------------------------- 1 | name: Claude Code 2 | 3 | on: 4 | issue_comment: 5 | types: [created] 6 | pull_request_review_comment: 7 | types: [created] 8 | issues: 9 | types: [opened, assigned] 10 | pull_request_review: 11 | types: [submitted] 12 | 13 | jobs: 14 | claude: 15 | if: | 16 | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || 17 | (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || 18 | (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || 19 | (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) 20 | runs-on: ubuntu-latest 21 | permissions: 22 | contents: read 23 | pull-requests: read 24 | issues: read 25 | id-token: write 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v4 29 | with: 30 | fetch-depth: 1 31 | 32 | - name: Run Claude Code 33 | id: claude 34 | uses: anthropics/claude-code-action@beta 35 | with: 36 | anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} 37 | allowed_tools: | 38 | Bash(pytest) 39 | Bash(sed) 40 | Bash(grep) 41 | Bash(python) 42 | Bash(uv) 43 | Bash(./bin/lint.sh) 44 | Bash(./bin/test.sh) 45 | Edit 46 | Replace 47 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/agent/message_manager/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import logging 5 | from pathlib import Path 6 | from typing import Any 7 | 8 | import anyio 9 | 10 | from browser_use.llm.messages import BaseMessage 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | async def save_conversation( 16 | input_messages: list[BaseMessage], 17 | response: Any, 18 | target: str | Path, 19 | encoding: str | None = None, 20 | ) -> None: 21 | """Save conversation history to file asynchronously.""" 22 | target_path = Path(target) 23 | # create folders if not exists 24 | if target_path.parent: 25 | await anyio.Path(target_path.parent).mkdir(parents=True, exist_ok=True) 26 | 27 | await anyio.Path(target_path).write_text( 28 | await _format_conversation(input_messages, response), 29 | encoding=encoding or 'utf-8', 30 | ) 31 | 32 | 33 | async def _format_conversation(messages: 
list[BaseMessage], response: Any) -> str: 34 | """Format the conversation including messages and response.""" 35 | lines = [] 36 | 37 | # Format messages 38 | for message in messages: 39 | lines.append(f' {message.role} ') 40 | 41 | lines.append(message.text) 42 | lines.append('') # Empty line after each message 43 | 44 | # Format response 45 | lines.append(' RESPONSE') 46 | lines.append(json.dumps(json.loads(response.model_dump_json(exclude_unset=True)), indent=2)) 47 | 48 | return '\n'.join(lines) 49 | 50 | 51 | # Note: _write_messages_to_file and _write_response_to_file have been merged into _format_conversation 52 | # This is more efficient for async operations and reduces file I/O 53 | -------------------------------------------------------------------------------- /utils/browser-use/examples/integrations/slack/slack_example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 5 | 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | 10 | 11 | from browser_use.browser import BrowserProfile 12 | from browser_use.llm import ChatGoogle 13 | from examples.integrations.slack.slack_api import SlackBot, app 14 | 15 | # load credentials from environment variables 16 | bot_token = os.getenv('SLACK_BOT_TOKEN') 17 | if not bot_token: 18 | raise ValueError('Slack bot token not found in .env file.') 19 | 20 | signing_secret = os.getenv('SLACK_SIGNING_SECRET') 21 | if not signing_secret: 22 | raise ValueError('Slack signing secret not found in .env file.') 23 | 24 | api_key = os.getenv('GOOGLE_API_KEY') 25 | if not api_key: 26 | raise ValueError('GOOGLE_API_KEY is not set') 27 | 28 | llm = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 29 | 30 | slack_bot = SlackBot( 31 | llm=llm, # required; instance of BaseChatModel 32 | bot_token=bot_token, # required; Slack bot token 33 | signing_secret=signing_secret, # required; Slack signing secret 34 | ack=True, # optional; whether to acknowledge task receipt with a message, defaults to False 35 | browser_profile=BrowserProfile( 36 | headless=True 37 | ), # optional; useful for changing headless mode or other browser configs, defaults to headless mode 38 | ) 39 | 40 | app.dependency_overrides[SlackBot] = lambda: slack_bot 41 | 42 | if __name__ == '__main__': 43 | import uvicorn 44 | 45 | uvicorn.run('integrations.slack.slack_api:app', host='0.0.0.0', port=3000) 46 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/httpx_client_test.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | 3 | from browser_use.browser import BrowserProfile, BrowserSession 4 | 5 | 6 | async def test_browser_close_doesnt_affect_external_httpx_clients(): 7 | """ 8 | Test that Browser.close() doesn't close HTTPX clients created outside the Browser instance. 9 | This test demonstrates the issue where Browser.close() is closing all HTTPX clients. 
10 | """ 11 | # Create an external HTTPX client that should remain open 12 | external_client = httpx.AsyncClient() 13 | 14 | # Create a BrowserSession instance 15 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True)) 16 | await browser_session.start() 17 | 18 | # Close the browser (which should trigger cleanup_httpx_clients) 19 | await browser_session.stop() 20 | 21 | # Check if the external client is still usable 22 | try: 23 | # If the client is closed, this will raise RuntimeError 24 | # Using a simple HEAD request to a reliable URL 25 | await external_client.head('https://www.example.com', timeout=2.0) 26 | client_is_closed = False 27 | except RuntimeError as e: 28 | # If we get "Cannot send a request, as the client has been closed" 29 | client_is_closed = 'client has been closed' in str(e) 30 | except Exception: 31 | # Any other exception means the client is not closed but request failed 32 | client_is_closed = False 33 | finally: 34 | # Always clean up our test client properly 35 | await external_client.aclose() 36 | 37 | # Our external client should not be closed by browser.close() 38 | assert not client_is_closed, 'External HTTPX client was incorrectly closed by Browser.close()' 39 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/build-base-image.yml.disabled: -------------------------------------------------------------------------------- 1 | name: Build Base Image 2 | 3 | on: 4 | schedule: 5 | - cron: '0 2 * * 1' # Weekly on Monday 6 | workflow_dispatch: 7 | push: 8 | paths: 9 | - 'Dockerfile.base' 10 | 11 | jobs: 12 | build-base: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | platform: [linux/amd64, linux/arm64] 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up QEMU 21 | uses: docker/setup-qemu-action@v3 22 | 23 | - name: Set up Docker Buildx 24 | uses: docker/setup-buildx-action@v3 25 | 26 | - name: Login to Docker Hub 27 | uses: docker/login-action@v3 28 | with: 29 | username: ${{ secrets.DOCKER_USERNAME }} 30 | password: ${{ secrets.DOCKER_PASSWORD }} 31 | 32 | - name: Build and push base image 33 | uses: docker/build-push-action@v5 34 | with: 35 | context: . 36 | file: ./Dockerfile.base 37 | platforms: ${{ matrix.platform }} 38 | push: true 39 | tags: | 40 | browseruse/browseruse-base:chromium-138-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }} 41 | browseruse/browseruse-base:latest-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }} 42 | cache-from: type=registry,ref=browseruse/browseruse-base:buildcache-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }} 43 | cache-to: type=registry,ref=browseruse/browseruse-base:buildcache-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }},mode=max 44 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/check_appointment.py: -------------------------------------------------------------------------------- 1 | # Goal: Checks for available visa appointment slots on the Greece MFA website. 
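# Flow of this example: the custom 'Go to the webpage' action defined below returns the appointment URL to the agent, and the task then asks it to check this month and, if needed, the next month for available dates. Requires OPENAI_API_KEY (validated below).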
2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from pydantic import BaseModel 14 | 15 | from browser_use.agent.service import Agent 16 | from browser_use.controller.service import Controller 17 | from browser_use.llm import ChatOpenAI 18 | 19 | if not os.getenv('OPENAI_API_KEY'): 20 | raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') 21 | 22 | controller = Controller() 23 | 24 | 25 | class WebpageInfo(BaseModel): 26 | """Model for webpage link.""" 27 | 28 | link: str = 'https://appointment.mfa.gr/en/reservations/aero/ireland-grcon-dub/' 29 | 30 | 31 | @controller.action('Go to the webpage', param_model=WebpageInfo) 32 | def go_to_webpage(webpage_info: WebpageInfo): 33 | """Returns the webpage link.""" 34 | return webpage_info.link 35 | 36 | 37 | async def main(): 38 | """Main function to execute the agent task.""" 39 | task = ( 40 | 'Go to the Greece MFA webpage via the link I provided you.' 41 | 'Check the visa appointment dates. If there is no available date in this month, check the next month.' 42 | 'If there is no available date in both months, tell me there is no available date.' 43 | ) 44 | 45 | model = ChatOpenAI(model='gpt-4.1-mini') 46 | agent = Agent(task, model, controller=controller, use_vision=True) 47 | 48 | await agent.run() 49 | 50 | 51 | if __name__ == '__main__': 52 | asyncio.run(main()) 53 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/clipboard.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | import pyperclip 12 | 13 | from browser_use import Agent, Controller 14 | from browser_use.agent.views import ActionResult 15 | from browser_use.browser import BrowserProfile, BrowserSession 16 | from browser_use.browser.types import Page 17 | from browser_use.llm import ChatOpenAI 18 | 19 | browser_profile = BrowserProfile( 20 | headless=False, 21 | ) 22 | controller = Controller() 23 | 24 | 25 | @controller.registry.action('Copy text to clipboard') 26 | def copy_to_clipboard(text: str): 27 | pyperclip.copy(text) 28 | return ActionResult(extracted_content=text) 29 | 30 | 31 | @controller.registry.action('Paste text from clipboard') 32 | async def paste_from_clipboard(page: Page): 33 | text = pyperclip.paste() 34 | # send text to browser 35 | await page.keyboard.type(text) 36 | 37 | return ActionResult(extracted_content=text) 38 | 39 | 40 | async def main(): 41 | task = 'Copy the text "Hello, world!" 
to the clipboard, then go to google.com and paste the text' 42 | model = ChatOpenAI(model='gpt-4.1') 43 | browser_session = BrowserSession(browser_profile=browser_profile) 44 | await browser_session.start() 45 | agent = Agent( 46 | task=task, 47 | llm=model, 48 | controller=controller, 49 | browser_session=browser_session, 50 | ) 51 | 52 | await agent.run() 53 | await browser_session.stop() 54 | 55 | input('Press Enter to close...') 56 | 57 | 58 | if __name__ == '__main__': 59 | asyncio.run(main()) 60 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/package.yaml: -------------------------------------------------------------------------------- 1 | name: package 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - stable 7 | - 'releases/**' 8 | tags: 9 | - '*' 10 | pull_request: 11 | workflow_dispatch: 12 | 13 | jobs: 14 | build: 15 | name: pip-build 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: astral-sh/setup-uv@v5 20 | - run: uv build --python 3.12 21 | - uses: actions/upload-artifact@v4 22 | with: 23 | name: dist-artifact 24 | path: | 25 | dist/*.whl 26 | dist/*.tar.gz 27 | 28 | build_test: 29 | name: pip-install-on-${{ matrix.os }}-py-${{ matrix.python-version }} 30 | needs: build 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | matrix: 34 | os: [ubuntu-latest, macos-latest, windows-latest] 35 | python-version: ["3.11", "3.13"] 36 | env: 37 | ANONYMIZED_TELEMETRY: 'false' 38 | 39 | steps: 40 | - uses: actions/checkout@v4 41 | - uses: astral-sh/setup-uv@v5 42 | - uses: actions/download-artifact@v4 43 | with: 44 | name: dist-artifact 45 | 46 | - name: Set up venv and test for OS/Python versions 47 | shell: bash 48 | run: | 49 | uv venv /tmp/testenv --python ${{ matrix.python-version }} 50 | if [[ "$RUNNER_OS" == "Windows" ]]; then 51 | . /tmp/testenv/Scripts/activate 52 | else 53 | source /tmp/testenv/bin/activate 54 | fi 55 | uv pip install *.whl 56 | python -c 'from browser_use import Agent, Browser, Controller, ActionModel, ActionResult' 57 | -------------------------------------------------------------------------------- /utils/browser-use/bin/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to setup a local development environment for the browser-use project. 3 | # Usage: 4 | # $ ./bin/setup.sh 5 | 6 | ### Bash Environment Setup 7 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 8 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 9 | # set -o xtrace 10 | # set -x 11 | # shopt -s nullglob 12 | set -o errexit 13 | set -o errtrace 14 | set -o nounset 15 | set -o pipefail 16 | IFS=$'\n' 17 | 18 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 19 | cd "$SCRIPT_DIR" 20 | 21 | 22 | if [ -f "$SCRIPT_DIR/lint.sh" ]; then 23 | echo "[√] already inside a cloned browser-use repo" 24 | else 25 | echo "[+] Cloning browser-use repo into current directory: $SCRIPT_DIR" 26 | git clone https://github.com/browser-use/browser-use 27 | cd browser-use 28 | fi 29 | 30 | echo "[+] Installing uv..." 
31 | curl -LsSf https://astral.sh/uv/install.sh | sh 32 | 33 | #git checkout main git pull 34 | echo 35 | echo "[+] Setting up venv" 36 | uv venv 37 | echo 38 | echo "[+] Installing packages in venv" 39 | uv sync --dev --all-extras 40 | echo 41 | echo "[i] Tip: make sure to set BROWSER_USE_LOGGING_LEVEL=debug and your LLM API keys in your .env file" 42 | echo 43 | uv pip show browser-use 44 | 45 | echo "Usage:" 46 | echo " $ browser-use use the CLI" 47 | echo " or" 48 | echo " $ source .venv/bin/activate" 49 | echo " $ ipython use the library" 50 | echo " >>> from browser_use import BrowserSession, Agent" 51 | echo " >>> await Agent(task='book me a flight to fiji', browser=BrowserSession(headless=False)).run()" 52 | echo "" 53 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | We have switched all of our code from langchain to openai.types.chat.chat_completion_message_param. 3 | 4 | For easier transition we have 5 | """ 6 | 7 | from typing import Any, Protocol, TypeVar, overload 8 | 9 | from pydantic import BaseModel 10 | 11 | from browser_use.llm.messages import BaseMessage 12 | from browser_use.llm.views import ChatInvokeCompletion 13 | 14 | T = TypeVar('T', bound=BaseModel) 15 | 16 | 17 | class BaseChatModel(Protocol): 18 | _verified_api_keys: bool = False 19 | 20 | model: str 21 | 22 | @property 23 | def provider(self) -> str: ... 24 | 25 | @property 26 | def name(self) -> str: ... 27 | 28 | @property 29 | def model_name(self) -> str: 30 | # for legacy support 31 | return self.model 32 | 33 | @overload 34 | async def ainvoke(self, messages: list[BaseMessage], output_format: None = None) -> ChatInvokeCompletion[str]: ... 35 | 36 | @overload 37 | async def ainvoke(self, messages: list[BaseMessage], output_format: type[T]) -> ChatInvokeCompletion[T]: ... 38 | 39 | async def ainvoke( 40 | self, messages: list[BaseMessage], output_format: type[T] | None = None 41 | ) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]: ... 42 | 43 | @classmethod 44 | def __get_pydantic_core_schema__( 45 | cls, 46 | source_type: type, 47 | handler: Any, 48 | ) -> Any: 49 | """ 50 | Allow this Protocol to be used in Pydantic models -> very useful to typesafe the agent settings for example. 51 | Returns a schema that allows any object (since this is a Protocol). 
52 | """ 53 | from pydantic_core import core_schema 54 | 55 | # Return a schema that accepts any object for Protocol types 56 | return core_schema.any_schema() 57 | -------------------------------------------------------------------------------- /utils/browser-use/examples/file_system/file_system.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import pathlib 4 | import shutil 5 | 6 | from dotenv import load_dotenv 7 | 8 | from browser_use import Agent 9 | from browser_use.llm import ChatOpenAI 10 | 11 | load_dotenv() 12 | 13 | '' 14 | SCRIPT_DIR = pathlib.Path(os.path.dirname(os.path.abspath(__file__))) 15 | agent_dir = SCRIPT_DIR / 'test_no_thinking' 16 | agent_dir.mkdir(exist_ok=True) 17 | conversation_dir = agent_dir / 'conversations' / 'conversation' 18 | print(f'Agent logs directory: {agent_dir}') 19 | 20 | try: 21 | from lmnr import Laminar 22 | 23 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 24 | except Exception as e: 25 | print(f'Error initializing Laminar: {e}') 26 | 27 | task = """ 28 | Go to https://mertunsall.github.io/posts/post1.html 29 | Save the title of the article in "data.md" 30 | Then, use append_file to add the first sentence of the article to "data.md" 31 | Then, read the file to see its content and make sure it's correct. 32 | Finally, share the file with me. 33 | 34 | NOTE: DO NOT USE extract_structured_data action - everything is visible in browser state. 35 | """.strip('\n') 36 | 37 | llm = ChatOpenAI( 38 | model='gpt-4.1-mini', 39 | ) 40 | 41 | 42 | agent = Agent( 43 | task=task, 44 | llm=llm, 45 | save_conversation_path=str(conversation_dir), 46 | file_system_path=str(agent_dir / 'fs'), 47 | ) 48 | 49 | 50 | async def main(): 51 | agent_history = await agent.run() 52 | print(f'Final result: {agent_history.final_result()}', flush=True) 53 | 54 | input('Press Enter to clean the file system...') 55 | # clean the file system 56 | shutil.rmtree(str(agent_dir / 'fs')) 57 | 58 | 59 | if __name__ == '__main__': 60 | asyncio.run(main()) 61 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | We have switched all of our code from langchain to openai.types.chat.chat_completion_message_param. 
3 | 4 | For easier transition we have 5 | """ 6 | 7 | from browser_use.llm.anthropic.chat import ChatAnthropic 8 | from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock 9 | from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock 10 | from browser_use.llm.azure.chat import ChatAzureOpenAI 11 | from browser_use.llm.base import BaseChatModel 12 | from browser_use.llm.google.chat import ChatGoogle 13 | from browser_use.llm.groq.chat import ChatGroq 14 | from browser_use.llm.messages import ( 15 | AssistantMessage, 16 | BaseMessage, 17 | SystemMessage, 18 | UserMessage, 19 | ) 20 | from browser_use.llm.messages import ( 21 | ContentPartImageParam as ContentImage, 22 | ) 23 | from browser_use.llm.messages import ( 24 | ContentPartRefusalParam as ContentRefusal, 25 | ) 26 | from browser_use.llm.messages import ( 27 | ContentPartTextParam as ContentText, 28 | ) 29 | from browser_use.llm.ollama.chat import ChatOllama 30 | from browser_use.llm.openai.chat import ChatOpenAI 31 | from browser_use.llm.openrouter.chat import ChatOpenRouter 32 | 33 | # Make better names for the message 34 | 35 | __all__ = [ 36 | # Message types -> for easier transition from langchain 37 | 'BaseMessage', 38 | 'UserMessage', 39 | 'SystemMessage', 40 | 'AssistantMessage', 41 | # Content parts with better names 42 | 'ContentText', 43 | 'ContentRefusal', 44 | 'ContentImage', 45 | # Chat models 46 | 'BaseChatModel', 47 | 'ChatOpenAI', 48 | 'ChatGoogle', 49 | 'ChatAnthropic', 50 | 'ChatAnthropicBedrock', 51 | 'ChatAWSBedrock', 52 | 'ChatGroq', 53 | 'ChatAzureOpenAI', 54 | 'ChatOllama', 55 | 'ChatOpenRouter', 56 | ] 57 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/langchain/example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of using LangChain models with browser-use. 3 | 4 | This example demonstrates how to: 5 | 1. Wrap a LangChain model with ChatLangchain 6 | 2. Use it with a browser-use Agent 7 | 3. Run a simple web automation task 8 | 9 | @file purpose: Example usage of LangChain integration with browser-use 10 | """ 11 | 12 | import asyncio 13 | 14 | from langchain_openai import ChatOpenAI # pyright: ignore 15 | from lmnr import Laminar 16 | 17 | from browser_use import Agent 18 | from examples.models.langchain.chat import ChatLangchain 19 | 20 | Laminar.initialize() 21 | 22 | 23 | async def main(): 24 | """Basic example using ChatLangchain with OpenAI through LangChain.""" 25 | 26 | # Create a LangChain model (OpenAI) 27 | langchain_model = ChatOpenAI( 28 | model='gpt-4.1-mini', 29 | temperature=0.1, 30 | ) 31 | 32 | # Wrap it with ChatLangchain to make it compatible with browser-use 33 | llm = ChatLangchain(chat=langchain_model) 34 | 35 | # Create a simple task 36 | task = "Go to google.com and search for 'browser automation with Python'" 37 | 38 | # Create and run the agent 39 | agent = Agent( 40 | task=task, 41 | llm=llm, 42 | ) 43 | 44 | print(f'🚀 Starting task: {task}') 45 | print(f'🤖 Using model: {llm.name} (provider: {llm.provider})') 46 | 47 | # Run the agent 48 | history = await agent.run() 49 | 50 | print(f'✅ Task completed! 
Steps taken: {len(history.history)}') 51 | 52 | # Print the final result if available 53 | if history.final_result(): 54 | print(f'📋 Final result: {history.final_result()}') 55 | 56 | return history 57 | 58 | 59 | if __name__ == '__main__': 60 | print('🌐 Browser-use LangChain Integration Example') 61 | print('=' * 45) 62 | 63 | asyncio.run(main()) 64 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/tests/test_groq_loop.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from browser_use.llm import ContentText 4 | from browser_use.llm.groq.chat import ChatGroq 5 | from browser_use.llm.messages import SystemMessage, UserMessage 6 | 7 | llm = ChatGroq( 8 | model='meta-llama/llama-4-maverick-17b-128e-instruct', 9 | temperature=0.5, 10 | ) 11 | # llm = ChatOpenAI(model='gpt-4.1-mini') 12 | 13 | 14 | async def main(): 15 | from pydantic import BaseModel 16 | 17 | from browser_use.tokens.service import TokenCost 18 | 19 | tk = TokenCost().register_llm(llm) 20 | 21 | class Output(BaseModel): 22 | reasoning: str 23 | answer: str 24 | 25 | message = [ 26 | SystemMessage(content='You are a helpful assistant that can answer questions and help with tasks.'), 27 | UserMessage( 28 | content=[ 29 | ContentText( 30 | text=r"Why is the sky blue? write exactly this into reasoning make sure to output ' with exactly like in the input : " 31 | ), 32 | ContentText( 33 | text=""" 34 | The user's request is to find the lowest priced women's plus size one piece swimsuit in color black with a customer rating of at least 5 on Kohls.com. I am currently on the homepage of Kohls. The page has a search bar and various category links. To begin, I need to navigate to the women's section and search for swimsuits. I will start by clicking on the 'Women' category link.""" 35 | ), 36 | ] 37 | ), 38 | ] 39 | 40 | for i in range(10): 41 | print('-' * 50) 42 | print(f'start loop {i}') 43 | response = await llm.ainvoke(message, output_format=Output) 44 | completion = response.completion 45 | print(f'start reasoning: {completion.reasoning}') 46 | print(f'answer: {completion.answer}') 47 | print('-' * 50) 48 | 49 | 50 | if __name__ == '__main__': 51 | asyncio.run(main()) 52 | -------------------------------------------------------------------------------- /utils/browser-use/docs/quickstart.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quickstart" 3 | description: "Start using Browser Use with this quickstart guide" 4 | icon: "rocket" 5 | --- 6 | 7 | {/* You can install Browser Use from PyPI or clone it from Github. */} 8 | 9 | ## Prepare the environment 10 | 11 | Browser Use requires Python 3.11 or higher. 12 | 13 | First, we recommend using [uv](https://docs.astral.sh/uv/) to setup the Python environment. 
14 | 15 | ```bash 16 | uv venv --python 3.11 17 | ``` 18 | 19 | and activate it with: 20 | 21 | ```bash 22 | # For Mac/Linux: 23 | source .venv/bin/activate 24 | 25 | # For Windows: 26 | .venv\Scripts\activate 27 | ``` 28 | 29 | Install the dependencies: 30 | 31 | ```bash 32 | uv pip install browser-use 33 | ``` 34 | 35 | Then install playwright: 36 | 37 | ```bash 38 | uv run playwright install 39 | ``` 40 | 41 | ## Create an agent 42 | 43 | Then you can use the agent as follows: 44 | 45 | ```python agent.py 46 | from browser_use.llm import ChatOpenAI 47 | from browser_use import Agent 48 | from dotenv import load_dotenv 49 | load_dotenv() 50 | 51 | import asyncio 52 | 53 | llm = ChatOpenAI(model="gpt-4.1") 54 | 55 | async def main(): 56 | agent = Agent( 57 | task="Compare the price of gpt-4o and DeepSeek-V3", 58 | llm=llm, 59 | ) 60 | result = await agent.run() 61 | print(result) 62 | 63 | asyncio.run(main()) 64 | ``` 65 | 66 | ## Set up your LLM API keys 67 | 68 | `ChatOpenAI` and other chat models require API keys. You should store these in your `.env` file. For example, for OpenAI and Anthropic: 69 | 70 | ```bash .env 71 | OPENAI_API_KEY= 72 | ANTHROPIC_API_KEY= 73 | ``` 74 | 75 | For other LLM models you can refer to the [Supported Models](/customize/supported-models) page to find how to set them up with their specific API keys. 76 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/evaluations.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Evaluations" 3 | description: "Test the Browser Use agent on standardized benchmarks" 4 | icon: "chart-bar" 5 | --- 6 | 7 | ## Prerequisites 8 | 9 | Browser Use uses proprietary/private test sets that must never be committed to GitHub and must be fetched through an authorized API request. 10 | Accessing these test sets requires an approved Browser Use account. 11 | There are currently no publicly available test sets, but some may be released in the future. 12 | 13 | ## Get an API Access Key 14 | 15 | First, navigate to https://browser-use.tools and log in with an authorized Browser Use account. 16 | 17 | Then, click the "Account" button at the top right of the page, and click the "Cycle New Key" button on that page. 18 | 19 | Copy the resulting URL and secret key into your `.env` file. It should look like this: 20 | 21 | ```bash .env 22 | EVALUATION_TOOL_URL= ... 23 | EVALUATION_TOOL_SECRET_KEY= ... 24 | ``` 25 | 26 | ## Running Evaluations 27 | 28 | First, ensure your file `eval/service.py` is up to date. 29 | 30 | Then run the file: 31 | 32 | ```bash 33 | python eval/service.py 34 | ``` 35 | 36 | ## Configuring Evaluations 37 | 38 | You can modify the evaluation by providing flags to the evaluation script. For instance: 39 | 40 | ```bash 41 | python eval/service.py --parallel_runs 5 --parallel_evaluations 5 --max-steps 25 --start 0 --end 100 --model gpt-4o 42 | ``` 43 | 44 | The evaluations webpage has a convenient GUI for generating these commands. To use it, navigate to https://browser-use.tools/dashboard. 45 | 46 | Then click the button "New Eval Run" on the left panel. This will open an interface with selectors, inputs, sliders, and switches. 47 | 48 | Input your desired configuration into the interface and copy the resulting Python command at the bottom. Then run this command as before. 
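The same flags can also be scripted for repeated runs. A minimal sketch, assuming the flags shown above and that your `.env` already contains the evaluation keys (model names are placeholders):

```bash
# Run the same evaluation slice against several models in sequence
for model in gpt-4o gpt-4.1-mini; do
  python eval/service.py --parallel_runs 5 --max-steps 25 --start 0 --end 100 --model "$model"
done
```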
49 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/outsource_state.py: -------------------------------------------------------------------------------- 1 | """ 2 | Show how to inject, persist, and reload agent state across runs. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | import anyio 18 | 19 | from browser_use import Agent 20 | from browser_use.agent.views import AgentState 21 | from browser_use.browser import BrowserProfile, BrowserSession 22 | from browser_use.llm import ChatOpenAI 23 | 24 | 25 | async def main(): 26 | task = 'Go to hackernews show hn and give me the first 5 posts' 27 | 28 | browser_profile = BrowserProfile( 29 | headless=True, 30 | ) 31 | browser_session = BrowserSession(browser_profile=browser_profile) 32 | 33 | agent_state = AgentState() 34 | 35 | for i in range(10): 36 | agent = Agent( 37 | task=task, 38 | llm=ChatOpenAI(model='gpt-4.1'), 39 | browser_session=browser_session, 40 | injected_agent_state=agent_state, 41 | page_extraction_llm=ChatOpenAI(model='gpt-4.1-mini'), 42 | ) 43 | 44 | done, valid = await agent.take_step() 45 | print(f'Step {i}: Done: {done}, Valid: {valid}') 46 | 47 | if done and valid: 48 | break 49 | 50 | agent_state.history.history = [] 51 | 52 | # Save state to file 53 | async with await anyio.open_file('agent_state.json', 'w') as f: 54 | serialized = agent_state.model_dump_json(exclude={'history'}) 55 | await f.write(serialized) 56 | 57 | # Load state back from file 58 | async with await anyio.open_file('agent_state.json', 'r') as f: 59 | loaded_json = await f.read() 60 | agent_state = AgentState.model_validate_json(loaded_json) 61 | 62 | break 63 | 64 | 65 | if __name__ == '__main__': 66 | asyncio.run(main()) 67 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/scrolling_page.py: -------------------------------------------------------------------------------- 1 | # Goal: Automates webpage scrolling with various scrolling actions and text search functionality. 2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from browser_use import Agent 14 | from browser_use.browser import BrowserProfile, BrowserSession 15 | from browser_use.llm import ChatOpenAI 16 | 17 | if not os.getenv('OPENAI_API_KEY'): 18 | raise ValueError('OPENAI_API_KEY is not set') 19 | 20 | """ 21 | Example: Using the 'Scroll' action with custom page amounts. 22 | 23 | This script demonstrates how the agent can navigate to a webpage and scroll by specific page amounts. 
24 | The scroll action now supports: 25 | - Scrolling by a specific number of pages using the 'num_pages' parameter (e.g., 0.5 for half page, 1.0 for one page, 2.0 for two pages) 26 | - Scrolling by one page height if no num_pages is specified (default behavior) 27 | - Scrolling up or down using the 'down' parameter 28 | """ 29 | 30 | llm = ChatOpenAI(model='gpt-4.1') 31 | 32 | browser_profile = BrowserProfile(headless=False) 33 | browser_session = BrowserSession(browser_profile=browser_profile) 34 | 35 | agent = Agent( 36 | task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll down by one page - then scroll up by 0.5 pages - then scroll down by 0.25 pages - then scroll down by 2 pages.", 37 | # Alternative task to demonstrate text-based scrolling: 38 | # task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll to the string 'The vast majority of computer'", 39 | llm=llm, 40 | browser_session=browser_session, 41 | ) 42 | 43 | 44 | async def main(): 45 | await agent.run() 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/save_pdf.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import re 4 | import sys 5 | from pathlib import Path 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from browser_use import ActionResult, Agent, Controller 14 | from browser_use.browser.types import Page 15 | from browser_use.llm import ChatOpenAI 16 | 17 | # Initialize controller 18 | controller = Controller() 19 | 20 | download_path = Path.cwd() / 'downloads' 21 | download_path.mkdir(parents=True, exist_ok=True) 22 | 23 | 24 | # Save PDF - exact copy from original controller function 25 | @controller.registry.action('Save the current page as a PDF file') 26 | async def save_pdf(page: Page): 27 | short_url = re.sub(r'^https?://(?:www\.)?|/$', '', page.url) 28 | slug = re.sub(r'[^a-zA-Z0-9]+', '-', short_url).strip('-').lower() 29 | sanitized_filename = f'{slug}.pdf' 30 | 31 | await page.emulate_media(media='screen') 32 | await page.pdf(path=download_path / sanitized_filename, format='A4', print_background=False) 33 | msg = f'Saving page with URL {page.url} as PDF to {download_path / sanitized_filename}' 34 | return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=f'Saved PDF to {sanitized_filename}') 35 | 36 | 37 | async def main(): 38 | """ 39 | Example task: Navigate to browser-use.com and save the page as a PDF 40 | """ 41 | task = """ 42 | Go to https://browser-use.com/ and save the page as a PDF file. 43 | """ 44 | 45 | # Initialize the language model 46 | model = ChatOpenAI(model='gpt-4.1-mini') 47 | 48 | # Create and run the agent 49 | agent = Agent(task=task, llm=model, controller=controller) 50 | 51 | result = await agent.run() 52 | print(f'🎯 Task completed: {result}') 53 | 54 | 55 | if __name__ == '__main__': 56 | asyncio.run(main()) 57 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_vision.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
5 | """ 6 | 7 | import os 8 | import sys 9 | from pprint import pprint 10 | 11 | import pytest 12 | 13 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 14 | 15 | 16 | from browser_use import Agent, AgentHistoryList, BrowserSession, Controller 17 | from browser_use.llm import ChatOpenAI 18 | 19 | llm = ChatOpenAI(model='gpt-4.1') 20 | controller = Controller() 21 | 22 | # use this test to ask the model questions about the page like 23 | # which color do you see for bbox labels, list all with their label 24 | # what's the smallest bboxes with labels and 25 | 26 | 27 | @controller.registry.action(description='explain what you see on the screen and ask user for input') 28 | async def explain_screen(text: str) -> str: 29 | pprint(text) 30 | answer = input('\nuser input next question: \n') 31 | return answer 32 | 33 | 34 | @controller.registry.action(description='done') 35 | async def done(text: str) -> str: 36 | # pprint(text) 37 | return 'call explain_screen' 38 | 39 | 40 | @pytest.mark.skip(reason='this is for local testing only') 41 | async def test_vision(): 42 | from browser_use.browser.profile import BrowserProfile 43 | 44 | profile = BrowserProfile(headless=True, user_data_dir=None) 45 | browser_session = BrowserSession(browser_profile=profile) 46 | await browser_session.start() 47 | try: 48 | agent = Agent( 49 | task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to explain it and get the next question', 50 | llm=llm, 51 | controller=controller, 52 | browser_session=browser_session, 53 | ) 54 | history: AgentHistoryList = await agent.run(20) 55 | finally: 56 | # Make sure to close the browser 57 | await browser_session.stop() 58 | -------------------------------------------------------------------------------- /utils/browser-use/examples/browser/using_cdp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple demonstration of the CDP feature. 3 | 4 | To test this locally, follow these steps: 5 | 1. Create a shortcut for the executable Chrome file. 6 | 2. Add the following argument to the shortcut: 7 | - On Windows: `--remote-debugging-port=9222` 8 | 3. Open a web browser and navigate to `http://localhost:9222/json/version` to verify that the Remote Debugging Protocol (CDP) is running. 9 | 4. Launch this example. 10 | 11 | @dev You need to set the `GOOGLE_API_KEY` environment variable before proceeding. 
12 | """ 13 | 14 | import asyncio 15 | import os 16 | import sys 17 | 18 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 19 | 20 | from dotenv import load_dotenv 21 | 22 | load_dotenv() 23 | 24 | 25 | from browser_use import Agent, Controller 26 | from browser_use.browser import BrowserProfile, BrowserSession 27 | from browser_use.llm import ChatGoogle 28 | 29 | api_key = os.getenv('GOOGLE_API_KEY') 30 | if not api_key: 31 | raise ValueError('GOOGLE_API_KEY is not set') 32 | 33 | browser_session = BrowserSession( 34 | browser_profile=BrowserProfile( 35 | headless=False, 36 | ), 37 | cdp_url='http://localhost:9222', 38 | ) 39 | controller = Controller() 40 | 41 | 42 | async def main(): 43 | task = 'In docs.google.com write my Papa a quick thank you for everything letter \n - Magnus' 44 | task += ' and save the document as pdf' 45 | # Assert api_key is not None to satisfy type checker 46 | assert api_key is not None, 'GOOGLE_API_KEY must be set' 47 | model = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 48 | agent = Agent( 49 | task=task, 50 | llm=model, 51 | controller=controller, 52 | browser_session=browser_session, 53 | ) 54 | 55 | await agent.run() 56 | await browser_session.close() 57 | 58 | input('Press Enter to close...') 59 | 60 | 61 | if __name__ == '__main__': 62 | asyncio.run(main()) 63 | -------------------------------------------------------------------------------- /utils/browser-use/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/asottile/yesqa 3 | rev: v1.5.0 4 | hooks: 5 | - id: yesqa 6 | 7 | - repo: https://github.com/codespell-project/codespell 8 | rev: v2.4.1 9 | hooks: 10 | - id: codespell # See pyproject.toml for args 11 | additional_dependencies: 12 | - tomli 13 | 14 | - repo: https://github.com/asottile/pyupgrade 15 | rev: v3.19.1 16 | hooks: 17 | - id: pyupgrade 18 | args: [--py311-plus] 19 | 20 | # - repo: https://github.com/asottile/add-trailing-comma 21 | # rev: v3.1.0 22 | # hooks: 23 | # - id: add-trailing-comma 24 | 25 | - repo: https://github.com/astral-sh/ruff-pre-commit 26 | rev: v0.11.2 27 | hooks: 28 | - id: ruff 29 | - id: ruff-format 30 | # see pyproject.toml for more details on ruff config 31 | 32 | - repo: https://github.com/RobertCraigie/pyright-python 33 | rev: v1.1.402 34 | hooks: 35 | - id: pyright 36 | 37 | - repo: https://github.com/pre-commit/pre-commit-hooks 38 | rev: v5.0.0 39 | hooks: 40 | # check for basic syntax errors in python and data files 41 | - id: check-ast 42 | - id: check-toml 43 | - id: check-yaml 44 | - id: check-json 45 | - id: check-merge-conflict 46 | # check for bad files and folders 47 | - id: check-symlinks 48 | - id: destroyed-symlinks 49 | - id: check-case-conflict 50 | - id: check-illegal-windows-names 51 | - id: check-shebang-scripts-are-executable 52 | - id: mixed-line-ending 53 | - id: fix-byte-order-marker 54 | - id: end-of-file-fixer 55 | # best practices enforcement 56 | - id: detect-private-key 57 | # - id: check-docstring-first 58 | - id: debug-statements 59 | - id: forbid-submodules 60 | - id: check-added-large-files 61 | args: ["--maxkb=600"] 62 | # - id: name-tests-test 63 | # args: ["--pytest-test-first"] 64 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/history_tree_processor/view.py: -------------------------------------------------------------------------------- 1 | from 
dataclasses import dataclass 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | @dataclass 7 | class HashedDomElement: 8 | """ 9 | Hash of the dom element to be used as a unique identifier 10 | """ 11 | 12 | branch_path_hash: str 13 | attributes_hash: str 14 | xpath_hash: str 15 | # text_hash: str 16 | 17 | 18 | class Coordinates(BaseModel): 19 | x: int 20 | y: int 21 | 22 | 23 | class CoordinateSet(BaseModel): 24 | top_left: Coordinates 25 | top_right: Coordinates 26 | bottom_left: Coordinates 27 | bottom_right: Coordinates 28 | center: Coordinates 29 | width: int 30 | height: int 31 | 32 | 33 | class ViewportInfo(BaseModel): 34 | scroll_x: int | None = None 35 | scroll_y: int | None = None 36 | width: int 37 | height: int 38 | 39 | 40 | @dataclass 41 | class DOMHistoryElement: 42 | tag_name: str 43 | xpath: str 44 | highlight_index: int | None 45 | entire_parent_branch_path: list[str] 46 | attributes: dict[str, str] 47 | shadow_root: bool = False 48 | css_selector: str | None = None 49 | page_coordinates: CoordinateSet | None = None 50 | viewport_coordinates: CoordinateSet | None = None 51 | viewport_info: ViewportInfo | None = None 52 | 53 | def to_dict(self) -> dict: 54 | page_coordinates = self.page_coordinates.model_dump() if self.page_coordinates else None 55 | viewport_coordinates = self.viewport_coordinates.model_dump() if self.viewport_coordinates else None 56 | viewport_info = self.viewport_info.model_dump() if self.viewport_info else None 57 | 58 | return { 59 | 'tag_name': self.tag_name, 60 | 'xpath': self.xpath, 61 | 'highlight_index': self.highlight_index, 62 | 'entire_parent_branch_path': self.entire_parent_branch_path, 63 | 'attributes': self.attributes, 64 | 'shadow_root': self.shadow_root, 65 | 'css_selector': self.css_selector, 66 | 'page_coordinates': page_coordinates, 67 | 'viewport_coordinates': viewport_coordinates, 68 | 'viewport_info': viewport_info, 69 | } 70 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/onepassword_2fa.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from onepassword.client import Client # type: ignore # pip install onepassword-sdk 13 | 14 | from browser_use import ActionResult, Agent, Controller 15 | from browser_use.llm import ChatOpenAI 16 | 17 | # Set up logging 18 | logging.basicConfig(level=logging.INFO) 19 | logger = logging.getLogger(__name__) 20 | 21 | OP_SERVICE_ACCOUNT_TOKEN = os.getenv('OP_SERVICE_ACCOUNT_TOKEN') 22 | OP_ITEM_ID = os.getenv('OP_ITEM_ID') # Go to 1Password, right click on the item, click "Copy Secret Reference" 23 | 24 | 25 | controller = Controller() 26 | 27 | 28 | @controller.registry.action('Get 2FA code from 1Password for Google Account', domains=['*.google.com', 'google.com']) 29 | async def get_1password_2fa() -> ActionResult: 30 | """ 31 | Custom action to retrieve 2FA/MFA code from 1Password using onepassword.client SDK. 
32 | """ 33 | client = await Client.authenticate( 34 | # setup instructions: https://github.com/1Password/onepassword-sdk-python/#-get-started 35 | auth=OP_SERVICE_ACCOUNT_TOKEN, 36 | integration_name='Browser-Use', 37 | integration_version='v1.0.0', 38 | ) 39 | 40 | mfa_code = await client.secrets.resolve(f'op://Private/{OP_ITEM_ID}/One-time passcode') 41 | 42 | return ActionResult(extracted_content=mfa_code) 43 | 44 | 45 | async def main(): 46 | # Example task using the 1Password 2FA action 47 | task = 'Go to account.google.com, enter username and password, then if prompted for 2FA code, get 2FA code from 1Password for and enter it' 48 | 49 | model = ChatOpenAI(model='gpt-4.1') 50 | agent = Agent(task=task, llm=model, controller=controller) 51 | 52 | result = await agent.run() 53 | print(f'Task completed with result: {result}') 54 | 55 | 56 | if __name__ == '__main__': 57 | asyncio.run(main()) 58 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/custom_user_agent.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from browser_use import Agent 13 | from browser_use.browser import BrowserProfile, BrowserSession 14 | from browser_use.controller.service import Controller 15 | from browser_use.llm import ChatAnthropic, ChatOpenAI 16 | 17 | 18 | def get_llm(provider: str): 19 | if provider == 'anthropic': 20 | return ChatAnthropic(model='claude-3-5-sonnet-20240620', temperature=0.0) 21 | elif provider == 'openai': 22 | return ChatOpenAI(model='gpt-4.1', temperature=0.0) 23 | 24 | else: 25 | raise ValueError(f'Unsupported provider: {provider}') 26 | 27 | 28 | # NOTE: This example is to find your current user agent string to use it in the browser_context 29 | task = 'go to https://whatismyuseragent.com and find the current user agent string ' 30 | 31 | 32 | controller = Controller() 33 | 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--query', type=str, help='The query to process', default=task) 37 | parser.add_argument( 38 | '--provider', 39 | type=str, 40 | choices=['openai', 'anthropic'], 41 | default='openai', 42 | help='The model provider to use (default: openai)', 43 | ) 44 | 45 | args = parser.parse_args() 46 | 47 | llm = get_llm(args.provider) 48 | 49 | browser_session = BrowserSession( 50 | browser_profile=BrowserProfile( 51 | user_agent='foobarfoo', 52 | user_data_dir='~/.config/browseruse/profiles/default', 53 | ) 54 | ) 55 | 56 | agent = Agent( 57 | task=args.query, 58 | llm=llm, 59 | controller=controller, 60 | browser_session=browser_session, 61 | use_vision=True, 62 | max_actions_per_step=1, 63 | ) 64 | 65 | 66 | async def main(): 67 | await agent.run(max_steps=25) 68 | 69 | input('Press Enter to close the browser...') 70 | await browser_session.close() 71 | 72 | 73 | asyncio.run(main()) 74 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: docker 2 | 3 | on: 4 | push: 5 | release: 6 | types: [published] 7 | 8 | jobs: 9 | build_publish_image: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | packages: write 13 | contents: read 14 | attestations: 
write 15 | id-token: write 16 | steps: 17 | - name: Check out the repo 18 | uses: actions/checkout@v4 19 | 20 | - name: Set up QEMU 21 | uses: docker/setup-qemu-action@v3 22 | 23 | - name: Set up Docker Buildx 24 | uses: docker/setup-buildx-action@v3 25 | 26 | - name: Log in to Docker Hub 27 | uses: docker/login-action@v3 28 | with: 29 | username: ${{ secrets.DOCKER_USERNAME }} 30 | password: ${{ secrets.DOCKER_PASSWORD }} 31 | 32 | - name: Login to GitHub Container Registry 33 | uses: docker/login-action@v3 34 | with: 35 | registry: ghcr.io 36 | username: ${{ github.repository_owner }} 37 | password: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | - name: Compute Docker tags based on tag/branch 40 | id: meta 41 | uses: docker/metadata-action@v5 42 | with: 43 | images: | 44 | browseruse/browseruse 45 | ghcr.io/browser-use/browser-use 46 | tags: | 47 | type=ref,event=branch 48 | type=ref,event=pr 49 | type=pep440,pattern={{version}} 50 | type=pep440,pattern={{major}}.{{minor}} 51 | type=sha 52 | 53 | - name: Build and push Docker image 54 | id: push 55 | uses: docker/build-push-action@v6 56 | with: 57 | platforms: linux/amd64,linux/arm64 58 | context: . 59 | file: ./Dockerfile 60 | push: true 61 | tags: ${{ steps.meta.outputs.tags }} 62 | labels: ${{ steps.meta.outputs.labels }} 63 | cache-from: type=registry,ref=browseruse/browseruse:buildcache 64 | cache-to: type=registry,ref=browseruse/browseruse:buildcache,mode=max 65 | -------------------------------------------------------------------------------- /utils/browser-use/docker/build-base-images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Build script for browser-use base images 3 | set -euo pipefail 4 | 5 | # Configuration 6 | REGISTRY="${DOCKER_REGISTRY:-browseruse}" 7 | PLATFORMS="${PLATFORMS:-linux/amd64}" 8 | PUSH="${PUSH:-false}" 9 | 10 | # Build function 11 | build_image() { 12 | local name=$1 13 | local dockerfile=$2 14 | local build_args="${3:-}" 15 | 16 | echo "[INFO] Building ${name}..." 17 | 18 | local build_cmd="docker build" 19 | local tag_args="-t ${REGISTRY}/${name}:latest -t ${REGISTRY}/${name}:$(date +%Y%m%d)" 20 | 21 | # Use buildx for multi-platform or push 22 | if [[ "$PLATFORMS" == *","* ]] || [ "$PUSH" = "true" ]; then 23 | build_cmd="docker buildx build --platform=$PLATFORMS" 24 | [ "$PUSH" = "true" ] && build_cmd="$build_cmd --push" || build_cmd="$build_cmd" 25 | fi 26 | 27 | $build_cmd $tag_args $build_args -f $dockerfile ../../.. 28 | } 29 | 30 | # Main 31 | cd "$(dirname "$0")" 32 | 33 | # Parse arguments 34 | while [[ $# -gt 0 ]]; do 35 | case $1 in 36 | --push) PUSH=true; shift ;; 37 | --registry) REGISTRY="$2"; shift 2 ;; 38 | --platforms) PLATFORMS="$2"; shift 2 ;; 39 | --help) 40 | echo "Usage: $0 [--push] [--registry REG] [--platforms P]" 41 | exit 0 ;; 42 | *) echo "Unknown option: $1"; exit 1 ;; 43 | esac 44 | done 45 | 46 | # Create buildx builder if needed 47 | if [[ "$PLATFORMS" == *","* ]] || [ "$PUSH" = "true" ]; then 48 | docker buildx inspect browseruse-builder >/dev/null 2>&1 || \ 49 | docker buildx create --name browseruse-builder --use 50 | docker buildx use browseruse-builder 51 | fi 52 | 53 | # Build images in order 54 | build_image "base-system" "base-images/system/Dockerfile" 55 | build_image "base-chromium" "base-images/chromium/Dockerfile" "--build-arg BASE_TAG=latest" 56 | build_image "base-python-deps" "base-images/python-deps/Dockerfile" "--build-arg BASE_TAG=latest" 57 | 58 | echo "[INFO] Build complete. 
Use: FROM ${REGISTRY}/base-python-deps:latest" 59 | -------------------------------------------------------------------------------- /utils/browser-use/.github/ISSUE_TEMPLATE/4_docs_issue.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation Issue 2 | description: Report an issue in the browser-use documentation 3 | labels: ["documentation"] 4 | title: "Documentation: ..." 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thanks for taking the time to improve our documentation! Please fill out the form below to help us fix the issue quickly. 10 | 11 | - type: dropdown 12 | id: type 13 | attributes: 14 | label: Type of Documentation Issue 15 | description: What type of documentation issue is this? 16 | options: 17 | - Missing documentation 18 | - Incorrect documentation 19 | - Unclear documentation 20 | - Broken link 21 | - Other (specify in description) 22 | validations: 23 | required: true 24 | 25 | - type: input 26 | id: page 27 | attributes: 28 | label: Documentation Page 29 | description: Which page or section of the documentation is this about? 30 | placeholder: "e.g. https://docs.browser-use.com/customize/browser-settings > Context Configuration > headless" 31 | validations: 32 | required: true 33 | 34 | - type: textarea 35 | id: description 36 | attributes: 37 | label: Issue Description 38 | description: "Describe what's wrong or missing in the documentation" 39 | placeholder: e.g. Docs should clarify whether BrowserSession(no_viewport=False) is supported when running in BrowserSession(headless=False) mode... 40 | validations: 41 | required: true 42 | 43 | - type: textarea 44 | id: suggestion 45 | attributes: 46 | label: Suggested Changes 47 | description: If you have specific suggestions for how to improve the documentation, please share them 48 | placeholder: | 49 | e.g. The documentation could be improved by adding one more line here: 50 | ```diff 51 | Use `BrowserSession(headless=False)` to open the browser window (aka headful mode). 52 | + Viewports are not supported when headful, if `headless=False` it will force `no_viewport=True`. 53 | ``` 54 | validations: 55 | required: false 56 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/2fa.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | import pyotp # type: ignore 13 | 14 | from browser_use import ActionResult, Agent, Controller 15 | from browser_use.llm import ChatOpenAI 16 | 17 | # Set up logging 18 | logging.basicConfig(level=logging.INFO) 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | controller = Controller() 23 | 24 | 25 | @controller.registry.action('Get 2FA code from when OTP is required') 26 | async def get_otp_2fa() -> ActionResult: 27 | """ 28 | Custom action to retrieve 2FA/MFA code from OTP secret key using pyotp. 29 | The OTP secret key should be set in the environment variable OTP_SECRET_KEY. 
30 | """ 31 | secret_key = os.environ.get('OTP_SECRET_KEY') 32 | if not secret_key: 33 | raise ValueError('OTP_SECRET_KEY environment variable is not set') 34 | 35 | totp = pyotp.TOTP(secret_key, digits=6) 36 | code = totp.now() 37 | return ActionResult(extracted_content=code) 38 | 39 | 40 | async def main(): 41 | # Example task using the 1Password 2FA action 42 | task = """ 43 | Steps: 44 | 1. Go to https://authenticationtest.com/totpChallenge/ and try to log in. 45 | 2. If prompted for 2FA code: 46 | 2.1. Use the get_2fa_code action to retrieve the 2FA code. 47 | 2.2. Submit the code provided by the get_2fa_code action. 48 | 49 | Considerations: 50 | - ALWAYS use the get_2fa_code action to retrieve the 2FA code if needed. 51 | - NEVER skip the 2FA step if the page requires it. 52 | - NEVER extract the code from the page. 53 | - NEVER use a code that is not generated by the get_2fa_code action. 54 | - NEVER hallucinate the 2FA code, always use the get_2fa_code action to get it. 55 | 56 | You are completely FORBIDDEN to use any other method to get the 2FA code. 57 | """ 58 | 59 | model = ChatOpenAI(model='gpt-4.1') 60 | agent = Agent(task=task, llm=model, controller=controller) 61 | 62 | result = await agent.run() 63 | print(f'Task completed with result: {result}') 64 | 65 | 66 | if __name__ == '__main__': 67 | asyncio.run(main()) 68 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from browser_use.logging_config import setup_logging 4 | 5 | # Only set up logging if not in MCP mode or if explicitly requested 6 | if os.environ.get('BROWSER_USE_SETUP_LOGGING', 'true').lower() != 'false': 7 | logger = setup_logging() 8 | else: 9 | import logging 10 | 11 | logger = logging.getLogger('browser_use') 12 | 13 | # Monkeypatch BaseSubprocessTransport.__del__ to handle closed event loops gracefully 14 | from asyncio import base_subprocess 15 | 16 | from browser_use.agent.prompts import SystemPrompt 17 | from browser_use.agent.service import Agent 18 | from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList 19 | from browser_use.browser import Browser, BrowserConfig, BrowserContext, BrowserContextConfig, BrowserProfile, BrowserSession 20 | from browser_use.controller.service import Controller 21 | from browser_use.dom.service import DomService 22 | from browser_use.llm import ( 23 | ChatAnthropic, 24 | ChatAzureOpenAI, 25 | ChatGoogle, 26 | ChatGroq, 27 | ChatOllama, 28 | ChatOpenAI, 29 | ) 30 | 31 | _original_del = base_subprocess.BaseSubprocessTransport.__del__ 32 | 33 | 34 | def _patched_del(self): 35 | """Patched __del__ that handles closed event loops without throwing noisy red-herring errors like RuntimeError: Event loop is closed""" 36 | try: 37 | # Check if the event loop is closed before calling the original 38 | if hasattr(self, '_loop') and self._loop and self._loop.is_closed(): 39 | # Event loop is closed, skip cleanup that requires the loop 40 | return 41 | _original_del(self) 42 | except RuntimeError as e: 43 | if 'Event loop is closed' in str(e): 44 | # Silently ignore this specific error 45 | pass 46 | else: 47 | raise 48 | 49 | 50 | base_subprocess.BaseSubprocessTransport.__del__ = _patched_del 51 | 52 | 53 | __all__ = [ 54 | 'Agent', 55 | 'Browser', 56 | 'BrowserConfig', 57 | 'BrowserSession', 58 | 'BrowserProfile', 59 | 'Controller', 60 | 'DomService', 61 | 'SystemPrompt', 62 | 
'ActionResult', 63 | 'ActionModel', 64 | 'AgentHistoryList', 65 | 'BrowserContext', 66 | 'BrowserContextConfig', 67 | # Chat models 68 | 'ChatOpenAI', 69 | 'ChatGoogle', 70 | 'ChatAnthropic', 71 | 'ChatGroq', 72 | 'ChatAzureOpenAI', 73 | 'ChatOllama', 74 | ] 75 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/multiple_tasks.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | from browser_use import Agent 13 | from browser_use.browser import BrowserSession 14 | from browser_use.browser.types import async_playwright 15 | from browser_use.llm import ChatGoogle 16 | 17 | api_key = os.getenv('GOOGLE_API_KEY') 18 | 19 | if not api_key: 20 | raise ValueError('GOOGLE_API_KEY is not set') 21 | 22 | llm = ChatGoogle(model='gemini-2.0-flash', api_key=api_key) 23 | 24 | 25 | async def main(): 26 | async with async_playwright() as p: 27 | browser = await p.chromium.launch( 28 | headless=False, 29 | ) 30 | 31 | context = await browser.new_context( 32 | viewport={'width': 1502, 'height': 853}, 33 | ignore_https_errors=True, 34 | ) 35 | 36 | agent = Agent( 37 | browser_session=BrowserSession( 38 | browser_context=context, 39 | ), 40 | task='Go to https://browser-use.com/', 41 | llm=llm, 42 | ) 43 | 44 | try: 45 | result = await agent.run() 46 | print(f'First task was {"successful" if result.is_successful else "not successful"}') 47 | 48 | if not result.is_successful: 49 | raise RuntimeError('Failed to navigate to the initial page.') 50 | 51 | agent.add_new_task('Navigate to the documentation page') 52 | 53 | result = await agent.run() 54 | print(f'Second task was {"successful" if result.is_successful else "not successful"}') 55 | 56 | if not result.is_successful: 57 | raise RuntimeError('Failed to navigate to the documentation page.') 58 | 59 | while True: 60 | next_task = input('Write your next task or leave empty to exit\n> ') 61 | 62 | if not next_task.strip(): 63 | print('Exiting...') 64 | break 65 | 66 | agent.add_new_task(next_task) 67 | result = await agent.run() 68 | 69 | print(f"Task '{next_task}' was {'successful' if result.is_successful else 'not successful'}") 70 | 71 | if not result.is_successful: 72 | print('Failed to complete the task. Please try again.') 73 | continue 74 | 75 | finally: 76 | await context.close() 77 | await browser.close() 78 | 79 | 80 | if __name__ == '__main__': 81 | asyncio.run(main()) 82 | -------------------------------------------------------------------------------- /utils/browser-use/docs/customize/system-prompt.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "System Prompt" 3 | description: "Customize the system prompt to control agent behavior and capabilities" 4 | icon: "message" 5 | --- 6 | 7 | ## Overview 8 | 9 | You can customize the system prompt in two ways: 10 | 11 | 1. Extend the default system prompt with additional instructions 12 | 2. Override the default system prompt entirely 13 | 14 | 15 | Custom system prompts allow you to modify the agent's behavior at a 16 | fundamental level. Use this feature carefully as it can significantly impact 17 | the agent's performance and reliability. 
18 | 19 | 20 | ### Extend System Prompt (recommended) 21 | 22 | To add additional instructions to the default system prompt: 23 | 24 | ```python 25 | extend_system_message = """ 26 | REMEMBER the most important RULE: 27 | ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!! 28 | """ 29 | ``` 30 | 31 | ### Override System Prompt 32 | 33 | 34 | Not recommended! If you must override the [default system 35 | prompt](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/system_prompt.md), 36 | make sure to test the agent yourself. 37 | 38 | 39 | Anyway, to override the default system prompt: 40 | 41 | ```python 42 | # Define your complete custom prompt 43 | override_system_message = """ 44 | You are an AI agent that helps users with web browsing tasks. 45 | 46 | [Your complete custom instructions here...] 47 | """ 48 | 49 | # Create agent with custom system prompt 50 | agent = Agent( 51 | task="Your task here", 52 | llm=ChatOpenAI(model='gpt-4'), 53 | override_system_message=override_system_message 54 | ) 55 | ``` 56 | 57 | ### Extend Planner System Prompt 58 | 59 | You can customize the behavior of the planning agent by extending its system prompt: 60 | 61 | ```python 62 | extend_planner_system_message = """ 63 | PRIORITIZE gathering information before taking any action. 64 | Always suggest exploring multiple options before making a decision. 65 | """ 66 | 67 | # Create agent with extended planner system prompt 68 | llm = ChatOpenAI(model='gpt-4o') 69 | planner_llm = ChatOpenAI(model='gpt-4o-mini') 70 | 71 | agent = Agent( 72 | task="Your task here", 73 | llm=llm, 74 | planner_llm=planner_llm, 75 | extend_planner_system_message=extend_planner_system_message 76 | ) 77 | ``` 78 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/sensitive_data.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.browser import BrowserProfile 13 | from browser_use.llm import ChatOpenAI 14 | 15 | try: 16 | from lmnr import Laminar 17 | 18 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 19 | except Exception as e: 20 | print(f'Error initializing Laminar: {e}') 21 | 22 | # Initialize the model 23 | llm = ChatOpenAI( 24 | model='gpt-4.1', 25 | temperature=0.0, 26 | ) 27 | # Simple case: the model will see x_name and x_password, but never the actual values. 
28 | # sensitive_data = {'x_name': 'my_x_name', 'x_password': 'my_x_password'} 29 | 30 | # Advanced case: domain-specific credentials with reusable data 31 | # Define a single credential set that can be reused 32 | company_credentials = {'company_username': 'user@example.com', 'company_password': 'securePassword123'} 33 | 34 | # Map the same credentials to multiple domains for secure access control 35 | # Type annotation to satisfy pyright 36 | sensitive_data: dict[str, str | dict[str, str]] = { 37 | 'https://example.com': company_credentials, 38 | 'https://admin.example.com': company_credentials, 39 | 'https://*.example-staging.com': company_credentials, 40 | 'http*://test.example.com': company_credentials, 41 | # You can also add domain-specific credentials 42 | 'https://*.google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'}, 43 | } 44 | # Update task to use one of the credentials above 45 | task = 'Go to google.com and put the login information in the search bar.' 46 | 47 | # Always set allowed_domains when using sensitive_data for security 48 | from browser_use.browser.session import BrowserSession 49 | 50 | browser_session = BrowserSession( 51 | browser_profile=BrowserProfile( 52 | allowed_domains=list(sensitive_data.keys()) 53 | + ['https://*.trusted-partner.com'] # Domain patterns from sensitive_data + additional allowed domains 54 | ) 55 | ) 56 | 57 | agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data, browser_session=browser_session) 58 | 59 | 60 | async def main(): 61 | await agent.run() 62 | 63 | 64 | if __name__ == '__main__': 65 | asyncio.run(main()) 66 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/web_voyager_agent.py: -------------------------------------------------------------------------------- 1 | # Goal: A general-purpose web navigation agent for tasks like flight booking and course searching. 2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | # Adjust Python path 8 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 9 | 10 | from dotenv import load_dotenv 11 | 12 | load_dotenv() 13 | 14 | 15 | from browser_use.agent.service import Agent 16 | from browser_use.browser import BrowserProfile, BrowserSession 17 | from browser_use.llm import ChatAzureOpenAI, ChatOpenAI 18 | 19 | # Set LLM based on defined environment variables 20 | if os.getenv('OPENAI_API_KEY'): 21 | llm = ChatOpenAI( 22 | model='gpt-4.1', 23 | ) 24 | elif os.getenv('AZURE_OPENAI_KEY') and os.getenv('AZURE_OPENAI_ENDPOINT'): 25 | llm = ChatAzureOpenAI( 26 | model='gpt-4.1', 27 | ) 28 | else: 29 | raise ValueError('No LLM found. Please set OPENAI_API_KEY or AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT.') 30 | 31 | 32 | browser_session = BrowserSession( 33 | browser_profile=BrowserProfile( 34 | headless=False, # This is True in production 35 | minimum_wait_page_load_time=1, # 3 on prod 36 | maximum_wait_page_load_time=10, # 20 on prod 37 | viewport={'width': 1280, 'height': 1100}, 38 | user_data_dir='~/.config/browseruse/profiles/default', 39 | # trace_path='./tmp/web_voyager_agent', 40 | ) 41 | ) 42 | 43 | # TASK = """ 44 | # Find the lowest-priced one-way flight from Cairo to Montreal on February 21, 2025, including the total travel time and number of stops. on https://www.google.com/travel/flights/ 45 | # """ 46 | # TASK = """ 47 | # Browse Coursera, which universities offer Master of Advanced Study in Engineering degrees? 
Tell me what is the latest application deadline for this degree? on https://www.coursera.org/""" 48 | TASK = """ 49 | Find and book a hotel in Paris with suitable accommodations for a family of four (two adults and two children) offering free cancellation for the dates of February 14-21, 2025. on https://www.booking.com/ 50 | """ 51 | 52 | 53 | async def main(): 54 | agent = Agent( 55 | task=TASK, 56 | llm=llm, 57 | browser_session=browser_session, 58 | validate_output=True, 59 | enable_memory=False, 60 | ) 61 | history = await agent.run(max_steps=50) 62 | history.save_to_file('./tmp/history.json') 63 | 64 | 65 | if __name__ == '__main__': 66 | asyncio.run(main()) 67 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_wait_for_element.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | from browser_use.llm.openai.chat import ChatOpenAI 6 | 7 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 8 | if project_root not in sys.path: 9 | sys.path.insert(0, project_root) 10 | 11 | import pytest 12 | from dotenv import load_dotenv 13 | 14 | # Third-party imports 15 | from browser_use import Agent, Controller 16 | 17 | # Local imports 18 | from browser_use.browser import BrowserProfile, BrowserSession 19 | 20 | # Load environment variables. 21 | load_dotenv() 22 | 23 | # Initialize language model and controller. 24 | llm = ChatOpenAI(model='gpt-4.1') 25 | controller = Controller() 26 | 27 | 28 | @pytest.mark.skip(reason='this is for local testing only') 29 | async def test_wait_for_element(): 30 | """Test 'Wait for element' action.""" 31 | 32 | initial_actions = [ 33 | {'go_to_url': {'url': 'https://pypi.org/', 'new_tab': True}}, 34 | # Uncomment the line below to include the wait action in initial actions. 35 | # {'wait_for_element': {'selector': '#search', 'timeout': 30}}, 36 | ] 37 | 38 | # Set up the browser session. 39 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, disable_security=True)) 40 | await browser_session.start() 41 | 42 | try: 43 | # Create the agent with the task. 44 | agent = Agent( 45 | task="Wait for element '#search' to be visible with a timeout of 30 seconds.", 46 | llm=llm, 47 | browser_session=browser_session, 48 | initial_actions=initial_actions, 49 | controller=controller, 50 | ) 51 | 52 | # Run the agent for a few steps to trigger navigation and then the wait action. 53 | history = await agent.run(max_steps=3) 54 | action_names = history.action_names() 55 | 56 | # Ensure that the wait_for_element action was executed. 57 | assert 'wait_for_element' in action_names, 'Expected wait_for_element action to be executed.' 58 | 59 | # Verify that the #search element is visible by querying the page. 60 | page = await browser_session.get_current_page() 61 | header_handle = await page.query_selector('#search') 62 | assert header_handle is not None, 'Expected to find a #search element on the page.' 63 | is_visible = await header_handle.is_visible() 64 | assert is_visible, 'Expected the #search element to be visible.' 
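		# Note: if either assertion above fails, inspect `history` first — with
		# max_steps=3 the agent may have stopped before ever issuing the
		# wait_for_element action, which is a more likely cause than the #search
		# element genuinely being hidden.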
65 | finally: 66 | await browser_session.stop() 67 | 68 | 69 | if __name__ == '__main__': 70 | asyncio.run(test_wait_for_element()) 71 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any 3 | 4 | from pydantic import BaseModel 5 | 6 | from browser_use.dom.history_tree_processor.service import DOMHistoryElement 7 | from browser_use.dom.views import DOMState 8 | 9 | 10 | # Pydantic 11 | class TabInfo(BaseModel): 12 | """Represents information about a browser tab""" 13 | 14 | page_id: int 15 | url: str 16 | title: str 17 | parent_page_id: int | None = None # parent page that contains this popup or cross-origin iframe 18 | 19 | 20 | class PageInfo(BaseModel): 21 | """Comprehensive page size and scroll information""" 22 | 23 | # Current viewport dimensions 24 | viewport_width: int 25 | viewport_height: int 26 | 27 | # Total page dimensions 28 | page_width: int 29 | page_height: int 30 | 31 | # Current scroll position 32 | scroll_x: int 33 | scroll_y: int 34 | 35 | # Calculated scroll information 36 | pixels_above: int 37 | pixels_below: int 38 | pixels_left: int 39 | pixels_right: int 40 | 41 | # Page statistics are now computed dynamically instead of stored 42 | 43 | 44 | @dataclass 45 | class BrowserStateSummary(DOMState): 46 | """The summary of the browser's current state designed for an LLM to process""" 47 | 48 | # provided by DOMState: 49 | # element_tree: DOMElementNode 50 | # selector_map: SelectorMap 51 | 52 | url: str 53 | title: str 54 | tabs: list[TabInfo] 55 | screenshot: str | None = field(default=None, repr=False) 56 | page_info: PageInfo | None = None # Enhanced page information 57 | 58 | # Keep legacy fields for backward compatibility 59 | pixels_above: int = 0 60 | pixels_below: int = 0 61 | browser_errors: list[str] = field(default_factory=list) 62 | 63 | 64 | @dataclass 65 | class BrowserStateHistory: 66 | """The summary of the browser's state at a past point in time to usse in LLM message history""" 67 | 68 | url: str 69 | title: str 70 | tabs: list[TabInfo] 71 | interacted_element: list[DOMHistoryElement | None] | list[None] 72 | screenshot: str | None = None 73 | 74 | def to_dict(self) -> dict[str, Any]: 75 | data = {} 76 | data['tabs'] = [tab.model_dump() for tab in self.tabs] 77 | data['screenshot'] = self.screenshot 78 | data['interacted_element'] = [el.to_dict() if el else None for el in self.interacted_element] 79 | data['url'] = self.url 80 | data['title'] = self.title 81 | return data 82 | 83 | 84 | class BrowserError(Exception): 85 | """Base class for all browser errors""" 86 | 87 | 88 | class URLNotAllowedError(BrowserError): 89 | """Error raised when a URL is not allowed""" 90 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/solve_amazon_captcha.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from amazoncaptcha import AmazonCaptcha # type: ignore 12 | 13 | from browser_use import ActionResult 14 | from browser_use.agent.service import Agent 15 | from browser_use.browser import BrowserConfig, 
BrowserSession 16 | from browser_use.controller.service import Controller 17 | from browser_use.llm import ChatOpenAI 18 | 19 | browser_profile = BrowserConfig(headless=False) 20 | 21 | # Initialize controller first 22 | controller = Controller() 23 | 24 | 25 | @controller.action( 26 | 'Solve Amazon text based captcha', 27 | domains=[ 28 | '*.amazon.com', 29 | '*.amazon.co.uk', 30 | '*.amazon.ca', 31 | '*.amazon.de', 32 | '*.amazon.es', 33 | '*.amazon.fr', 34 | '*.amazon.it', 35 | '*.amazon.co.jp', 36 | '*.amazon.in', 37 | '*.amazon.cn', 38 | '*.amazon.com.sg', 39 | '*.amazon.com.mx', 40 | '*.amazon.ae', 41 | '*.amazon.com.br', 42 | '*.amazon.nl', 43 | '*.amazon.com.au', 44 | '*.amazon.com.tr', 45 | '*.amazon.sa', 46 | '*.amazon.se', 47 | '*.amazon.pl', 48 | ], 49 | ) 50 | async def solve_amazon_captcha(browser_session: BrowserSession): 51 | page = await browser_session.get_current_page() 52 | 53 | # Find the captcha image and extract its src 54 | captcha_img = page.locator('img[src*="amazon.com/captcha"]') 55 | link = await captcha_img.get_attribute('src') 56 | 57 | if not link: 58 | raise ValueError('Could not find captcha image on the page') 59 | 60 | captcha = AmazonCaptcha.fromlink(link) 61 | solution = captcha.solve() 62 | if not solution or solution == 'Not solved': 63 | raise ValueError('Captcha could not be solved') 64 | 65 | await page.locator('#captchacharacters').fill(solution) 66 | await page.locator('button[type="submit"]').click() 67 | 68 | return ActionResult(extracted_content=solution) 69 | 70 | 71 | async def main(): 72 | task = 'Go to https://www.amazon.com/errors/validateCaptcha and solve the captcha using the solve_amazon_captcha tool' 73 | 74 | model = ChatOpenAI(model='gpt-4.1') 75 | browser_session = BrowserSession(browser_profile=browser_profile) 76 | await browser_session.start() 77 | agent = Agent(task=task, llm=model, controller=controller, browser_session=browser_session) 78 | 79 | await agent.run() 80 | await browser_session.stop() 81 | 82 | input('Press Enter to close...') 83 | 84 | 85 | if __name__ == '__main__': 86 | asyncio.run(main()) 87 | -------------------------------------------------------------------------------- /utils/browser-use/examples/integrations/browserbase_stagehand.py: -------------------------------------------------------------------------------- 1 | """ 2 | EXPERIMENTAL: Integration example with Stagehand (browserbase) 3 | 4 | This example shows how to combine browser-use with Stagehand for advanced browser automation. 5 | Note: This requires the stagehand-py library to be installed separately: 6 | pip install stagehand-py 7 | 8 | The exact API may vary depending on the stagehand-py version. 
9 | Please refer to the official Stagehand documentation for the latest usage: 10 | https://pypi.org/project/stagehand-py/ 11 | https://github.com/browserbase/stagehand-python-examples/ 12 | """ 13 | 14 | import asyncio 15 | import os 16 | 17 | from dotenv import load_dotenv 18 | 19 | load_dotenv() 20 | 21 | from stagehand import Stagehand, StagehandConfig # type: ignore 22 | 23 | from browser_use.agent.service import Agent 24 | 25 | 26 | async def main(): 27 | # Configure Stagehand 28 | # https://pypi.org/project/stagehand-py/ 29 | # https://github.com/browserbase/stagehand-python-examples/blob/main/agent_example.py 30 | # Note: This example requires the stagehand-py library to be installed 31 | # pip install stagehand-py 32 | 33 | # Create StagehandConfig with correct parameters 34 | # The exact parameters depend on the stagehand-py version 35 | config = StagehandConfig( # type: ignore 36 | apiKey=os.getenv('BROWSERBASE_API_KEY'), 37 | projectId=os.getenv('BROWSERBASE_PROJECT_ID'), 38 | ) 39 | 40 | # Create a Stagehand client using the configuration object. 41 | stagehand = Stagehand( 42 | config=config, 43 | model_api_key=os.getenv('OPENAI_API_KEY'), 44 | # server_url=os.getenv('STAGEHAND_SERVER_URL'), 45 | ) 46 | 47 | # Initialize - this creates a new session automatically. 48 | await stagehand.init() 49 | print(f'\nCreated new session: {stagehand.session_id}') 50 | print(f'🌐 View your live browser: https://www.browserbase.com/sessions/{stagehand.session_id}') 51 | 52 | # Check if stagehand has a page attribute 53 | if hasattr(stagehand, 'page') and stagehand.page: 54 | await stagehand.page.goto('https://google.com/') 55 | await stagehand.page.act('search for openai') 56 | else: 57 | print('Warning: Stagehand page not available') 58 | 59 | # Combine with Browser Use 60 | agent = Agent(task='click the first result', page=stagehand.page) # type: ignore 61 | await agent.run() 62 | 63 | # go back and forth 64 | await stagehand.page.act('open the 3 first links on the page in new tabs') # type: ignore 65 | 66 | await Agent(task='click the first result', page=stagehand.page).run() # type: ignore 67 | 68 | 69 | if __name__ == '__main__': 70 | asyncio.run(main()) 71 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/pause_agent.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | import threading 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from browser_use import Agent 13 | from browser_use.llm import ChatOpenAI 14 | 15 | 16 | class AgentController: 17 | def __init__(self): 18 | llm = ChatOpenAI(model='gpt-4.1') 19 | self.agent = Agent( 20 | task='open in one action https://www.google.com, https://www.wikipedia.org, https://www.youtube.com, https://www.github.com, https://amazon.com', 21 | llm=llm, 22 | ) 23 | self.running = False 24 | 25 | async def run_agent(self): 26 | """Run the agent""" 27 | self.running = True 28 | await self.agent.run() 29 | 30 | def start(self): 31 | """Start the agent in a separate thread""" 32 | loop = asyncio.new_event_loop() 33 | asyncio.set_event_loop(loop) 34 | loop.run_until_complete(self.run_agent()) 35 | 36 | def pause(self): 37 | """Pause the agent""" 38 | self.agent.pause() 39 | 40 | def resume(self): 41 | """Resume the agent""" 42 | self.agent.resume() 43 | 44 | def stop(self): 45 
| """Stop the agent""" 46 | self.agent.stop() 47 | self.running = False 48 | 49 | 50 | def print_menu(): 51 | print('\nAgent Control Menu:') 52 | print('1. Start') 53 | print('2. Pause') 54 | print('3. Resume') 55 | print('4. Stop') 56 | print('5. Exit') 57 | 58 | 59 | async def main(): 60 | controller = AgentController() 61 | agent_thread = None 62 | 63 | while True: 64 | print_menu() 65 | try: 66 | choice = input('Enter your choice (1-5): ') 67 | except KeyboardInterrupt: 68 | choice = '5' 69 | 70 | if choice == '1' and not agent_thread: 71 | print('Starting agent...') 72 | agent_thread = threading.Thread(target=controller.start) 73 | agent_thread.start() 74 | 75 | elif choice == '2': 76 | print('Pausing agent...') 77 | controller.pause() 78 | 79 | elif choice == '3': 80 | print('Resuming agent...') 81 | controller.resume() 82 | 83 | elif choice == '4': 84 | print('Stopping agent...') 85 | controller.stop() 86 | if agent_thread: 87 | agent_thread.join() 88 | agent_thread = None 89 | 90 | elif choice == '5': 91 | print('Exiting...') 92 | if controller.running: 93 | controller.stop() 94 | if agent_thread: 95 | agent_thread.join() 96 | break 97 | 98 | await asyncio.sleep(0.1) # Small delay to prevent CPU spinning 99 | 100 | 101 | if __name__ == '__main__': 102 | asyncio.run(main()) 103 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/observability.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Observability" 3 | description: "Trace Browser Use's agent execution steps and browser sessions" 4 | icon: "eye" 5 | --- 6 | 7 | ## Overview 8 | 9 | Browser Use has a native integration with [Laminar](https://lmnr.ai) - open-source platform for tracing, evals and labeling of AI agents. 10 | Read more about Laminar in the [Laminar docs](https://docs.lmnr.ai). 11 | 12 | 13 | Laminar excels at tracing browser agents by providing unified visibility into 14 | both browser session recordings and agent execution steps. 15 | 16 | 17 | ## Setup 18 | 19 | To setup Laminar, you need to install the `lmnr` package and set the `LMNR_PROJECT_API_KEY` environment variable. 20 | 21 | To get your project API key, you can either: 22 | 23 | - Register on [Laminar Cloud](https://lmnr.ai) and get the key from your project settings 24 | - Or spin up a local Laminar instance and get the key from the settings page 25 | 26 | ```bash 27 | pip install 'lmnr[all]' 28 | export LMNR_PROJECT_API_KEY= 29 | ``` 30 | 31 | ## Usage 32 | 33 | Then, you simply initialize the Laminar at the top of your project and both Browser Use and session recordings will be automatically traced. 34 | 35 | ```python {5-8} 36 | from browser_use.llm import ChatOpenAI 37 | from browser_use import Agent 38 | import asyncio 39 | 40 | from lmnr import Laminar, Instruments 41 | # this line auto-instruments Browser Use and any browser you use (local or remote) 42 | Laminar.initialize(project_api_key="...", disable_batch=True, disabled_instruments={Instruments.BROWSER_USE}) # you can also pass project api key here 43 | 44 | async def main(): 45 | agent = Agent( 46 | task="open google, search Laminar AI", 47 | llm=ChatOpenAI(model="gpt-4.1-mini"), 48 | ) 49 | result = await agent.run() 50 | print(result) 51 | 52 | asyncio.run(main()) 53 | ``` 54 | 55 | ## Viewing Traces 56 | 57 | You can view traces in the Laminar UI by going to the traces tab in your project. 
58 | When you select a trace, you can see both the browser session recording and the agent execution steps. 59 | 60 | Timeline of the browser session is synced with the agent execution steps, timeline highlights indicate the agent's current step synced with the browser session. 61 | In the trace view, you can also see the agent's current step, the tool it's using, and the tool's input and output. Tools are highlighted in the timeline with a yellow color. 62 | 63 | Laminar 64 | 65 | ## Laminar 66 | 67 | To learn more about tracing and evaluating your browser agents, check out the [Laminar docs](https://docs.lmnr.ai). 68 | -------------------------------------------------------------------------------- /utils/browser-use/examples/ui/streamlit_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | To use it, you'll need to install streamlit, and run with: 3 | 4 | python -m streamlit run streamlit_demo.py 5 | 6 | """ 7 | 8 | import asyncio 9 | import os 10 | import sys 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 13 | 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | import streamlit as st # type: ignore 19 | 20 | from browser_use import Agent 21 | from browser_use.browser import BrowserSession 22 | from browser_use.controller.service import Controller 23 | 24 | if os.name == 'nt': 25 | asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) 26 | 27 | 28 | # Function to get the LLM based on provider 29 | def get_llm(provider: str): 30 | if provider == 'anthropic': 31 | from browser_use.llm import ChatAnthropic 32 | 33 | api_key = os.getenv('ANTHROPIC_API_KEY') 34 | if not api_key: 35 | st.error('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.') 36 | st.stop() 37 | 38 | return ChatAnthropic(model='claude-3-5-sonnet-20240620', temperature=0.0) 39 | elif provider == 'openai': 40 | from browser_use.llm import ChatOpenAI 41 | 42 | api_key = os.getenv('OPENAI_API_KEY') 43 | if not api_key: 44 | st.error('Error: OPENAI_API_KEY is not set. Please provide a valid API key.') 45 | st.stop() 46 | 47 | return ChatOpenAI(model='gpt-4.1', temperature=0.0) 48 | else: 49 | st.error(f'Unsupported provider: {provider}') 50 | st.stop() 51 | return None # Never reached, but helps with type checking 52 | 53 | 54 | # Function to initialize the agent 55 | def initialize_agent(query: str, provider: str): 56 | llm = get_llm(provider) 57 | controller = Controller() 58 | browser_session = BrowserSession() 59 | 60 | return Agent( 61 | task=query, 62 | llm=llm, # type: ignore 63 | controller=controller, 64 | browser_session=browser_session, 65 | use_vision=True, 66 | max_actions_per_step=1, 67 | ), browser_session 68 | 69 | 70 | # Streamlit UI 71 | st.title('Automated Browser Agent with LLMs 🤖') 72 | 73 | query = st.text_input('Enter your query:', 'go to reddit and search for posts about browser-use') 74 | provider = st.radio('Select LLM Provider:', ['openai', 'anthropic'], index=0) 75 | 76 | if st.button('Run Agent'): 77 | st.write('Initializing agent...') 78 | agent, browser_session = initialize_agent(query, provider) 79 | 80 | async def run_agent(): 81 | with st.spinner('Running automation...'): 82 | await agent.run(max_steps=25) 83 | st.success('Task completed! 
🎉') 84 | 85 | asyncio.run(run_agent()) 86 | 87 | st.button('Close Browser', on_click=lambda: asyncio.run(browser_session.close())) 88 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/extract_pdf_content.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S uv run --script 2 | # /// script 3 | # requires-python = ">=3.11" 4 | # dependencies = ["browser-use", "mistralai"] 5 | # /// 6 | 7 | import os 8 | import sys 9 | 10 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 11 | 12 | from dotenv import load_dotenv 13 | 14 | load_dotenv() 15 | 16 | import asyncio 17 | import logging 18 | 19 | from mistralai import Mistral # type: ignore 20 | from pydantic import BaseModel, Field 21 | 22 | from browser_use import Agent, Controller 23 | from browser_use.agent.views import ActionResult 24 | from browser_use.browser.context import BrowserContext 25 | from browser_use.llm import ChatOpenAI 26 | 27 | if not os.getenv('OPENAI_API_KEY'): 28 | raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') 29 | 30 | if not os.getenv('MISTRAL_API_KEY'): 31 | raise ValueError('MISTRAL_API_KEY is not set. Please add it to your environment variables.') 32 | 33 | logger = logging.getLogger(__name__) 34 | 35 | controller = Controller() 36 | 37 | 38 | class PdfExtractParams(BaseModel): 39 | url: str = Field(description='URL to a PDF document') 40 | 41 | 42 | @controller.registry.action( 43 | 'Extract PDF Text', 44 | param_model=PdfExtractParams, 45 | ) 46 | def extract_mistral_ocr(params: PdfExtractParams, browser: BrowserContext) -> ActionResult: 47 | """ 48 | Process a PDF URL using Mistral OCR API and return the OCR response. 49 | 50 | Args: 51 | url: URL to a PDF document 52 | 53 | Returns: 54 | OCR response object from Mistral API 55 | """ 56 | api_key = os.getenv('MISTRAL_API_KEY') 57 | client = Mistral(api_key=api_key) 58 | 59 | response = client.ocr.process( 60 | model='mistral-ocr-latest', 61 | document={ 62 | 'type': 'document_url', 63 | 'document_url': params.url, 64 | }, 65 | include_image_base64=False, 66 | ) 67 | 68 | markdown = '\n\n'.join(f'### Page {i + 1}\n{response.pages[i].markdown}' for i in range(len(response.pages))) 69 | return ActionResult( 70 | extracted_content=markdown, 71 | include_in_memory=False, ## PDF content can be very large, so we don't include it in memory 72 | ) 73 | 74 | 75 | async def main(): 76 | agent = Agent( 77 | task=""" 78 | Objective: Navigate to the following URL, extract its contents using the Extract PDF Text action, and explain its historical significance. 
79 | 80 | URL: https://docs.house.gov/meetings/GO/GO00/20220929/115171/HHRG-117-GO00-20220929-SD010.pdf 81 | """, 82 | llm=ChatOpenAI(model='gpt-4.1'), 83 | controller=controller, 84 | ) 85 | result = await agent.run() 86 | logger.info(result) 87 | 88 | 89 | if __name__ == '__main__': 90 | asyncio.run(main()) 91 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/perplexity_search.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | import logging 12 | 13 | from pydantic import BaseModel 14 | 15 | from browser_use import ActionResult, Agent, Controller 16 | from browser_use.browser.profile import BrowserProfile 17 | from browser_use.llm import ChatOpenAI 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class Person(BaseModel): 23 | name: str 24 | email: str | None = None 25 | 26 | 27 | class PersonList(BaseModel): 28 | people: list[Person] 29 | 30 | 31 | PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY') 32 | if not PERPLEXITY_API_KEY: 33 | raise ValueError('PERPLEXITY_API_KEY is not set') 34 | 35 | controller = Controller(exclude_actions=['search_google'], output_model=PersonList) 36 | 37 | 38 | @controller.registry.action('Search the web for a specific query with perplexity') 39 | async def search_web(query: str): 40 | import httpx 41 | 42 | url = 'https://api.perplexity.ai/chat/completions' 43 | 44 | payload = { 45 | 'model': 'sonar', 46 | 'messages': [ 47 | {'role': 'system', 'content': 'Be precise and concise.'}, 48 | {'role': 'user', 'content': query}, 49 | ], 50 | } 51 | headers = {'Authorization': f'Bearer {PERPLEXITY_API_KEY}', 'Content-Type': 'application/json'} 52 | 53 | async with httpx.AsyncClient() as client: 54 | response = await client.post(url, json=payload, headers=headers) 55 | response.raise_for_status() 56 | response_json = response.json() 57 | content = response_json['choices'][0]['message']['content'] 58 | citations = response_json['citations'] 59 | output = f'{content}\n\nCitations:\n' + '\n'.join(citations) 60 | logger.info(output) 61 | return ActionResult(extracted_content=output, include_in_memory=True) 62 | 63 | 64 | names = [ 65 | 'Ruedi Aebersold', 66 | 'Bernd Bodenmiller', 67 | 'Eugene Demler', 68 | ] 69 | 70 | 71 | async def main(): 72 | task = 'use search_web with "find email address of the following ETH professor:" for each of the persons. 
Finally return the list with name and email if provided ' 73 | task += '\n' + '\n'.join(names) 74 | model = ChatOpenAI(model='gpt-4.1') 75 | browser_profile = BrowserProfile() 76 | agent = Agent(task=task, llm=model, controller=controller, browser_profile=browser_profile) 77 | 78 | history = await agent.run() 79 | 80 | result = history.final_result() 81 | if result: 82 | parsed: PersonList = PersonList.model_validate_json(result) 83 | 84 | for person in parsed.people: 85 | print(f'{person.name} - {person.email}') 86 | else: 87 | print('No result') 88 | 89 | 90 | if __name__ == '__main__': 91 | asyncio.run(main()) 92 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/tokens/views.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Any, TypeVar 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | from browser_use.llm.views import ChatInvokeUsage 7 | 8 | T = TypeVar('T', bound=BaseModel) 9 | 10 | 11 | class TokenUsageEntry(BaseModel): 12 | """Single token usage entry""" 13 | 14 | model: str 15 | timestamp: datetime 16 | usage: ChatInvokeUsage 17 | 18 | 19 | class TokenCostCalculated(BaseModel): 20 | """Token cost""" 21 | 22 | new_prompt_tokens: int 23 | new_prompt_cost: float 24 | 25 | prompt_read_cached_tokens: int | None 26 | prompt_read_cached_cost: float | None 27 | 28 | prompt_cached_creation_tokens: int | None 29 | prompt_cache_creation_cost: float | None 30 | """Anthropic only: The cost of creating the cache.""" 31 | 32 | completion_tokens: int 33 | completion_cost: float 34 | 35 | @property 36 | def prompt_cost(self) -> float: 37 | return self.new_prompt_cost + (self.prompt_read_cached_cost or 0) + (self.prompt_cache_creation_cost or 0) 38 | 39 | @property 40 | def total_cost(self) -> float: 41 | return ( 42 | self.new_prompt_cost 43 | + (self.prompt_read_cached_cost or 0) 44 | + (self.prompt_cache_creation_cost or 0) 45 | + self.completion_cost 46 | ) 47 | 48 | 49 | class ModelPricing(BaseModel): 50 | """Pricing information for a model""" 51 | 52 | model: str 53 | input_cost_per_token: float | None 54 | output_cost_per_token: float | None 55 | 56 | cache_read_input_token_cost: float | None 57 | cache_creation_input_token_cost: float | None 58 | 59 | max_tokens: int | None 60 | max_input_tokens: int | None 61 | max_output_tokens: int | None 62 | 63 | 64 | class CachedPricingData(BaseModel): 65 | """Cached pricing data with timestamp""" 66 | 67 | timestamp: datetime 68 | data: dict[str, Any] 69 | 70 | 71 | class ModelUsageStats(BaseModel): 72 | """Usage statistics for a single model""" 73 | 74 | model: str 75 | prompt_tokens: int = 0 76 | completion_tokens: int = 0 77 | total_tokens: int = 0 78 | cost: float = 0.0 79 | invocations: int = 0 80 | average_tokens_per_invocation: float = 0.0 81 | 82 | 83 | class ModelUsageTokens(BaseModel): 84 | """Usage tokens for a single model""" 85 | 86 | model: str 87 | prompt_tokens: int 88 | prompt_cached_tokens: int 89 | completion_tokens: int 90 | total_tokens: int 91 | 92 | 93 | class UsageSummary(BaseModel): 94 | """Summary of token usage and costs""" 95 | 96 | total_prompt_tokens: int 97 | total_prompt_cost: float 98 | 99 | total_prompt_cached_tokens: int 100 | total_prompt_cached_cost: float 101 | 102 | total_completion_tokens: int 103 | total_completion_cost: float 104 | total_tokens: int 105 | total_cost: float 106 | entry_count: int 107 | 108 | by_model: dict[str, ModelUsageStats] = 
Field(default_factory=dict) 109 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/find_influencer_profiles.py: -------------------------------------------------------------------------------- 1 | """ 2 | Show how to use custom outputs. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import json 9 | import os 10 | import sys 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 13 | 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | import httpx 19 | from pydantic import BaseModel 20 | 21 | from browser_use import Agent, Controller 22 | from browser_use.agent.views import ActionResult 23 | from browser_use.llm import ChatOpenAI 24 | 25 | 26 | class Profile(BaseModel): 27 | platform: str 28 | profile_url: str 29 | 30 | 31 | class Profiles(BaseModel): 32 | profiles: list[Profile] 33 | 34 | 35 | controller = Controller(exclude_actions=['search_google'], output_model=Profiles) 36 | BEARER_TOKEN = os.getenv('BEARER_TOKEN') 37 | 38 | if not BEARER_TOKEN: 39 | # use the api key for ask tessa 40 | # you can also use other apis like exa, xAI, perplexity, etc. 41 | raise ValueError('BEARER_TOKEN is not set - go to https://www.heytessa.ai/ and create an api key') 42 | 43 | 44 | @controller.registry.action('Search the web for a specific query') 45 | async def search_web(query: str): 46 | keys_to_use = ['url', 'title', 'content', 'author', 'score'] 47 | headers = {'Authorization': f'Bearer {BEARER_TOKEN}'} 48 | async with httpx.AsyncClient() as client: 49 | response = await client.post( 50 | 'https://asktessa.ai/api/search', 51 | headers=headers, 52 | json={'query': query}, 53 | ) 54 | 55 | final_results = [ 56 | {key: source[key] for key in keys_to_use if key in source} 57 | for source in response.json()['sources'] 58 | if source['score'] >= 0.2 59 | ] 60 | # print(json.dumps(final_results, indent=4)) 61 | result_text = json.dumps(final_results, indent=4) 62 | print(result_text) 63 | return ActionResult(extracted_content=result_text, include_in_memory=True) 64 | 65 | 66 | async def main(): 67 | task = ( 68 | 'Go to this tiktok video url, open it and extract the @username from the resulting url. Then do a websearch for this username to find all his social media profiles. Return me the links to the social media profiles with the platform name.'
69 | ' https://www.tiktokv.com/share/video/7470981717659110678/ ' 70 | ) 71 | model = ChatOpenAI(model='gpt-4.1') 72 | agent = Agent(task=task, llm=model, controller=controller) 73 | 74 | history = await agent.run() 75 | 76 | result = history.final_result() 77 | if result: 78 | parsed: Profiles = Profiles.model_validate_json(result) 79 | 80 | for profile in parsed.profiles: 81 | print('\n--------------------------------') 82 | print(f'Platform: {profile.platform}') 83 | print(f'Profile URL: {profile.profile_url}') 84 | 85 | else: 86 | print('No result') 87 | 88 | 89 | if __name__ == '__main__': 90 | asyncio.run(main()) 91 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/ollama/chat.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, TypeVar, overload 3 | 4 | import httpx 5 | from ollama import AsyncClient as OllamaAsyncClient 6 | from pydantic import BaseModel 7 | 8 | from browser_use.llm.base import BaseChatModel 9 | from browser_use.llm.exceptions import ModelProviderError 10 | from browser_use.llm.messages import BaseMessage 11 | from browser_use.llm.ollama.serializer import OllamaMessageSerializer 12 | from browser_use.llm.views import ChatInvokeCompletion 13 | 14 | T = TypeVar('T', bound=BaseModel) 15 | 16 | 17 | @dataclass 18 | class ChatOllama(BaseChatModel): 19 | """ 20 | A wrapper around Ollama's chat model. 21 | """ 22 | 23 | model: str 24 | 25 | # # Model params 26 | # temperature: float | None = None 27 | 28 | # Client initialization parameters 29 | host: str | None = None 30 | timeout: float | httpx.Timeout | None = None 31 | client_params: dict[str, Any] | None = None 32 | 33 | # Static 34 | @property 35 | def provider(self) -> str: 36 | return 'ollama' 37 | 38 | def _get_client_params(self) -> dict[str, Any]: 39 | """Prepare client parameters dictionary.""" 40 | return { 41 | 'host': self.host, 42 | 'timeout': self.timeout, 43 | 'client_params': self.client_params, 44 | } 45 | 46 | def get_client(self) -> OllamaAsyncClient: 47 | """ 48 | Returns an OllamaAsyncClient client. 49 | """ 50 | return OllamaAsyncClient(host=self.host, timeout=self.timeout, **self.client_params or {}) 51 | 52 | @property 53 | def name(self) -> str: 54 | return self.model 55 | 56 | @overload 57 | async def ainvoke(self, messages: list[BaseMessage], output_format: None = None) -> ChatInvokeCompletion[str]: ... 58 | 59 | @overload 60 | async def ainvoke(self, messages: list[BaseMessage], output_format: type[T]) -> ChatInvokeCompletion[T]: ... 
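	# The two overloads above only describe typing: calling `ainvoke` without
	# `output_format` returns the completion as a plain string, while passing a
	# Pydantic model class requests JSON constrained to that model's schema (the
	# `format=schema` call below) and returns a validated instance of the model.
	#
	# Illustrative usage sketch (the model name and fields here are assumptions):
	#
	#   class Answer(BaseModel):
	#       city: str
	#
	#   llm = ChatOllama(model='llama3.1')
	#   text = await llm.ainvoke(messages)                          # ChatInvokeCompletion[str]
	#   parsed = await llm.ainvoke(messages, output_format=Answer)  # ChatInvokeCompletion[Answer]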
61 | 62 | async def ainvoke( 63 | self, messages: list[BaseMessage], output_format: type[T] | None = None 64 | ) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]: 65 | ollama_messages = OllamaMessageSerializer.serialize_messages(messages) 66 | 67 | try: 68 | if output_format is None: 69 | response = await self.get_client().chat( 70 | model=self.model, 71 | messages=ollama_messages, 72 | ) 73 | 74 | return ChatInvokeCompletion(completion=response.message.content or '', usage=None) 75 | else: 76 | schema = output_format.model_json_schema() 77 | 78 | response = await self.get_client().chat( 79 | model=self.model, 80 | messages=ollama_messages, 81 | format=schema, 82 | ) 83 | 84 | completion = response.message.content or '' 85 | if output_format is not None: 86 | completion = output_format.model_validate_json(completion) 87 | 88 | return ChatInvokeCompletion(completion=completion, usage=None) 89 | 90 | except Exception as e: 91 | raise ModelProviderError(message=str(e), model=self.name) from e 92 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/clickable_element_processor/service.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from browser_use.dom.views import DOMElementNode 4 | 5 | 6 | class ClickableElementProcessor: 7 | @staticmethod 8 | def get_clickable_elements_hashes(dom_element: DOMElementNode) -> set[str]: 9 | """Get all clickable elements in the DOM tree""" 10 | clickable_elements = ClickableElementProcessor.get_clickable_elements(dom_element) 11 | return {ClickableElementProcessor.hash_dom_element(element) for element in clickable_elements} 12 | 13 | @staticmethod 14 | def get_clickable_elements(dom_element: DOMElementNode) -> list[DOMElementNode]: 15 | """Get all clickable elements in the DOM tree""" 16 | clickable_elements = list() 17 | for child in dom_element.children: 18 | if isinstance(child, DOMElementNode): 19 | if child.highlight_index: 20 | clickable_elements.append(child) 21 | 22 | clickable_elements.extend(ClickableElementProcessor.get_clickable_elements(child)) 23 | 24 | return list(clickable_elements) 25 | 26 | @staticmethod 27 | def hash_dom_element(dom_element: DOMElementNode) -> str: 28 | parent_branch_path = ClickableElementProcessor._get_parent_branch_path(dom_element) 29 | branch_path_hash = ClickableElementProcessor._parent_branch_path_hash(parent_branch_path) 30 | attributes_hash = ClickableElementProcessor._attributes_hash(dom_element.attributes) 31 | xpath_hash = ClickableElementProcessor._xpath_hash(dom_element.xpath) 32 | # text_hash = DomTreeProcessor._text_hash(dom_element) 33 | 34 | return ClickableElementProcessor._hash_string(f'{branch_path_hash}-{attributes_hash}-{xpath_hash}') 35 | 36 | @staticmethod 37 | def _get_parent_branch_path(dom_element: DOMElementNode) -> list[str]: 38 | parents: list[DOMElementNode] = [] 39 | current_element: DOMElementNode = dom_element 40 | while current_element.parent is not None: 41 | parents.append(current_element) 42 | current_element = current_element.parent 43 | 44 | parents.reverse() 45 | 46 | return [parent.tag_name for parent in parents] 47 | 48 | @staticmethod 49 | def _parent_branch_path_hash(parent_branch_path: list[str]) -> str: 50 | parent_branch_path_string = '/'.join(parent_branch_path) 51 | return hashlib.sha256(parent_branch_path_string.encode()).hexdigest() 52 | 53 | @staticmethod 54 | def _attributes_hash(attributes: dict[str, str]) -> str: 55 | attributes_string = 
''.join(f'{key}={value}' for key, value in attributes.items()) 56 | return ClickableElementProcessor._hash_string(attributes_string) 57 | 58 | @staticmethod 59 | def _xpath_hash(xpath: str) -> str: 60 | return ClickableElementProcessor._hash_string(xpath) 61 | 62 | @staticmethod 63 | def _text_hash(dom_element: DOMElementNode) -> str: 64 | """ """ 65 | text_string = dom_element.get_all_text_till_next_clickable_element() 66 | return ClickableElementProcessor._hash_string(text_string) 67 | 68 | @staticmethod 69 | def _hash_string(string: str) -> str: 70 | return hashlib.sha256(string.encode()).hexdigest() 71 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/contribution-guide.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Contribution Guide" 3 | description: "Learn how to contribute to Browser Use" 4 | icon: "github" 5 | --- 6 | 7 | # Join the Browser Use Community! 8 | 9 | We're thrilled you're interested in contributing to Browser Use! This guide will help you get started with contributing to our project. Your contributions are what make the open-source community such an amazing place to learn, inspire, and create. 10 | 11 | ## Quick Setup 12 | 13 | Get started with Browser Use development in minutes: 14 | 15 | ```bash 16 | git clone https://github.com/browser-use/browser-use 17 | cd browser-use 18 | uv sync --all-extras --dev 19 | # or pip install -U git+https://github.com/browser-use/browser-use.git@main 20 | 21 | echo "BROWSER_USE_LOGGING_LEVEL=debug" >> .env 22 | ``` 23 | 24 | For more detailed setup instructions, see our [Local Setup Guide](/development/local-setup). 25 | 26 | ## How to Contribute 27 | 28 | ### Find Something to Work On 29 | 30 | - Browse our [GitHub Issues](https://github.com/browser-use/browser-use/issues) for beginner-friendly issues labeled `good-first-issue` 31 | - Check out our most active issues or ask in [Discord](https://discord.gg/zXJJHtJf3k) for ideas of what to work on 32 | - Get inspiration and share what you build in the [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel 33 | - Explore or contribute to [`awesome-browser-use-prompts`](https://github.com/browser-use/awesome-prompts)! 34 | 35 | ### Making a Great Pull Request 36 | 37 | When submitting a pull request, please: 38 | 39 | - Include a clear description of what the PR does and why it's needed 40 | - Add tests that cover your changes 41 | - Include a demo screenshot/gif or an example script demonstrating your changes 42 | - Make sure the PR passes all CI checks and tests 43 | - Keep your PR focused on a single issue or feature to make it easier to review 44 | 45 | Note: We appreciate quality over quantity. Instead of submitting small typo/style-only PRs, consider including those fixes as part of larger bugfix or feature PRs. 46 | 47 | ### Contribution Process 48 | 49 | 1. Fork the repository 50 | 2. Create a new branch for your feature or bugfix 51 | 3. Make your changes 52 | 4. Run tests to ensure everything works 53 | 5. Submit a pull request 54 | 6. Respond to any feedback from maintainers 55 | 7. Celebrate your contribution! 56 | 57 | Feel free to bump your issues/PRs with comments periodically if you need faster feedback. 58 | 59 | ## Code of Conduct 60 | 61 | We're committed to providing a welcoming and inclusive environment for all contributors. Please be respectful and constructive in all interactions. 
62 | 63 | ## Getting Help 64 | 65 | If you need help at any point: 66 | 67 | - Join our [Discord community](https://link.browser-use.com/discord) 68 | - Ask questions in the appropriate GitHub issue 69 | - Check our [documentation](/introduction) 70 | 71 | We're here to help you succeed in contributing to Browser Use! 72 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/file_upload.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import sys 5 | from pathlib import Path 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from lmnr import Laminar 14 | 15 | try: 16 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 17 | except Exception: 18 | pass 19 | 20 | from browser_use import Agent, Controller 21 | from browser_use.agent.views import ActionResult 22 | from browser_use.browser import BrowserSession 23 | from browser_use.llm import ChatOpenAI 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | controller = Controller() 28 | 29 | 30 | @controller.action('Upload file to interactive element with file path') 31 | async def upload_file(index: int, path: str, browser_session: BrowserSession, available_file_paths: list[str]): 32 | if path not in available_file_paths: 33 | return ActionResult(error=f'File path {path} is not available') 34 | 35 | if not os.path.exists(path): 36 | return ActionResult(error=f'File {path} does not exist') 37 | 38 | file_upload_dom_el = await browser_session.find_file_upload_element_by_index(index, max_height=3, max_descendant_depth=3) 39 | 40 | if file_upload_dom_el is None: 41 | msg = f'No file upload element found at index {index}' 42 | logger.info(msg) 43 | return ActionResult(error=msg) 44 | 45 | file_upload_el = await browser_session.get_locate_element(file_upload_dom_el) 46 | 47 | if file_upload_el is None: 48 | msg = f'No file upload element found at index {index}' 49 | logger.info(msg) 50 | return ActionResult(error=msg) 51 | 52 | try: 53 | await file_upload_el.set_input_files(path) 54 | msg = f'Successfully uploaded file to index {index}' 55 | logger.info(msg) 56 | return ActionResult(extracted_content=msg, include_in_memory=True) 57 | except Exception as e: 58 | msg = f'Failed to upload file to index {index}: {str(e)}' 59 | logger.info(msg) 60 | return ActionResult(error=msg) 61 | 62 | 63 | def create_file(file_type: str = 'txt'): 64 | with open(f'tmp.{file_type}', 'w') as f: 65 | f.write('test') 66 | file_path = Path.cwd() / f'tmp.{file_type}' 67 | logger.info(f'Created file: {file_path}') 68 | return str(file_path) 69 | 70 | 71 | async def main(): 72 | task = 'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields' 73 | task = 'Go to https://www.freepdfconvert.com/, upload the file tmp.pdf into the field choose a file - dont click the fileupload button' 74 | available_file_paths = [create_file('txt'), create_file('pdf'), create_file('csv')] 75 | 76 | model = ChatOpenAI(model='gpt-4.1-mini') 77 | agent = Agent( 78 | task=task, 79 | llm=model, 80 | controller=controller, 81 | available_file_paths=available_file_paths, 82 | ) 83 | 84 | await agent.run() 85 | 86 | input('Press Enter to close...') 87 | 88 | 89 | if __name__ == '__main__': 90 | asyncio.run(main()) 91 | 
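# Note on the custom `upload_file` action above: `index` and `path` are supplied by
# the LLM's tool call, while `browser_session` and `available_file_paths` are
# injected by browser-use's action registry because the Agent is constructed with
# `available_file_paths=[...]`. Checking `path` against that allow-list is what
# prevents the agent from uploading arbitrary local files to the page.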
-------------------------------------------------------------------------------- /utils/browser-use/.cursor/rules/browser-use-rules.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: 4 | alwaysApply: true 5 | --- 6 | ## 🧠 General Guidelines for Contributing to `browser-use` 7 | 8 | **Browser-Use** is an AI agent that autonomously interacts with the web. It takes a user-defined task, navigates web pages using Chromium via Playwright, processes HTML, and repeatedly queries a language model (like `gpt-4o`) to decide the next action—until the task is completed. 9 | 10 | ### 🗂️ File Documentation 11 | 12 | When you create a **new file**: 13 | 14 | * **For humans**: At the top of the file, include a docstring in natural language explaining: 15 | 16 | * What this file does. 17 | * How it fits into the browser-use system. 18 | * If it introduces a new abstraction or replaces an old one. 19 | * **For LLMs/AI**: Include structured metadata using standardized comments such as: 20 | 21 | ```python 22 | # @file purpose: Defines 23 | ``` 24 | 25 | --- 26 | 27 | ### 🧰 Development Rules 28 | 29 | * ✅ **Always use [`uv`](mdc:https:/github.com/astral-sh/uv) instead of `pip`** 30 | For deterministic and fast dependency installs. 31 | 32 | ```bash 33 | uv venv --python 3.11 34 | source .venv/bin/activate 35 | uv sync 36 | ``` 37 | 38 | * ✅ **Use real model names** 39 | Do **not** replace `gpt-4o` with `gpt-4`. The model `gpt-4o` is a distinct release and supported. 40 | 41 | * ✅ **Type-safe coding** 42 | Use **Pydantic v2 models** for all internal action schemas, task inputs/outputs, and controller I/O. This ensures robust validation and LLM-call integrity. 43 | 44 | --- 45 | 46 | ## ⚙️ Adding New Actions 47 | 48 | To add a new action that your browser agent can execute: 49 | 50 | ```python 51 | from playwright.async_api import Page 52 | from browser_use.core.controller import Controller, ActionResult 53 | 54 | controller = Controller() 55 | 56 | @controller.registry.action("Search the web for a specific query") 57 | async def search_web(query: str, page: Page): 58 | # Implement your logic here, e.g., query a search engine and return results 59 | result = ... 60 | return ActionResult(extracted_content=result, include_in_memory=True) 61 | ``` 62 | 63 | ### Notes: 64 | 65 | * Use descriptive names and docstrings for each action. 66 | * Prefer returning `ActionResult` with structured content to help the agent reason better. 67 | 68 | --- 69 | 70 | ## 🧠 Creating and Running an Agent 71 | 72 | To define a task and run a browser-use agent: 73 | 74 | ```python 75 | from browser_use import Agent 76 | from browser_use.llm import ChatOpenAI 77 | 78 | task = "Find the CEO of OpenAI and return their name" 79 | model = ChatOpenAI(model="gpt-4.1-mini") 80 | 81 | agent = Agent(task=task, llm=model, controller=controller) 82 | 83 | history = await agent.run() 84 | ``` 85 | 86 | # Never create random examples 87 | 88 | When I ask you to implement a feature never create new files that show off that feature -> the code just gets messy. If you do anything to test it out, just do the inline code inside the terminal (if you want). 
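To connect the Pydantic-v2 guidance back to the "Adding New Actions" section above, here is a minimal sketch of a typed action. The `param_model=` keyword mirrors `examples/custom-functions/extract_pdf_content.py`; the action name, model, and fields are purely illustrative:

```python
from playwright.async_api import Page
from pydantic import BaseModel, Field

from browser_use import ActionResult, Controller

controller = Controller()


class ScrollParams(BaseModel):
    pixels: int = Field(default=500, description='How many pixels to scroll down')


@controller.registry.action('Scroll the page down by a number of pixels', param_model=ScrollParams)
async def scroll_down(params: ScrollParams, page: Page):
    # Inputs arrive pre-validated as a Pydantic model; `page` is injected by the registry.
    await page.evaluate(f'window.scrollBy(0, {params.pixels})')
    return ActionResult(extracted_content=f'Scrolled down {params.pixels}px', include_in_memory=True)
```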
89 | -------------------------------------------------------------------------------- /utils/browser-use/tests/ci/test_schema_optimizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the SchemaOptimizer to ensure it correctly processes and 3 | optimizes the schemas for agent actions without losing information. 4 | """ 5 | 6 | from pydantic import BaseModel 7 | 8 | from browser_use.agent.views import AgentOutput 9 | from browser_use.controller.service import Controller 10 | from browser_use.llm.schema import SchemaOptimizer 11 | 12 | 13 | class ProductInfo(BaseModel): 14 | """A sample structured output model with multiple fields.""" 15 | 16 | price: str 17 | title: str 18 | rating: float | None = None 19 | 20 | 21 | def test_optimizer_preserves_all_fields_in_structured_done_action(): 22 | """ 23 | Ensures the SchemaOptimizer does not drop fields from a custom structured 24 | output model when creating the schema for the 'done' action. 25 | 26 | This test specifically checks for a bug where fields were being lost 27 | during the optimization process. 28 | """ 29 | # 1. Setup a controller with a custom output model, simulating an Agent 30 | # being created with an `output_model_schema`. 31 | controller = Controller(output_model=ProductInfo) 32 | 33 | # 2. Get the dynamically created AgentOutput model, which includes all registered actions. 34 | ActionModel = controller.registry.create_action_model() 35 | agent_output_model = AgentOutput.type_with_custom_actions(ActionModel) 36 | 37 | # 3. Run the schema optimizer on the agent's output model. 38 | optimized_schema = SchemaOptimizer.create_optimized_json_schema(agent_output_model) 39 | 40 | # 4. Find the 'done' action schema within the optimized output. 41 | # The path is properties -> action -> items -> anyOf -> [schema with 'done']. 42 | done_action_schema = None 43 | actions_schemas = optimized_schema.get('properties', {}).get('action', {}).get('items', {}).get('anyOf', []) 44 | for action_schema in actions_schemas: 45 | if 'done' in action_schema.get('properties', {}): 46 | done_action_schema = action_schema 47 | break 48 | 49 | # 5. Assert that the 'done' action schema was successfully found. 50 | assert done_action_schema is not None, "Could not find 'done' action in the optimized schema." 51 | 52 | # 6. Navigate to the schema for our custom data model within the 'done' action. 53 | # The path is properties -> done -> properties -> data -> properties. 54 | done_params_schema = done_action_schema.get('properties', {}).get('done', {}) 55 | structured_data_schema = done_params_schema.get('properties', {}).get('data', {}) 56 | final_properties = structured_data_schema.get('properties', {}) 57 | 58 | # 7. Assert that the set of fields in the optimized schema matches the original model's fields. 
59 | original_fields = set(ProductInfo.model_fields.keys()) 60 | optimized_fields = set(final_properties.keys()) 61 | 62 | assert original_fields == optimized_fields, ( 63 | f"Field mismatch between original and optimized structured 'done' action schema.\n" 64 | f'Missing from optimized: {original_fields - optimized_fields}\n' 65 | f'Unexpected in optimized: {optimized_fields - original_fields}' 66 | ) 67 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/agent/message_manager/views.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from pydantic import BaseModel, ConfigDict, Field 6 | 7 | from browser_use.llm.messages import ( 8 | BaseMessage, 9 | UserMessage, 10 | ) 11 | 12 | if TYPE_CHECKING: 13 | pass 14 | 15 | 16 | class HistoryItem(BaseModel): 17 | """Represents a single agent history item with its data and string representation""" 18 | 19 | step_number: int | None = None 20 | evaluation_previous_goal: str | None = None 21 | memory: str | None = None 22 | next_goal: str | None = None 23 | action_results: str | None = None 24 | error: str | None = None 25 | system_message: str | None = None 26 | 27 | model_config = ConfigDict(arbitrary_types_allowed=True) 28 | 29 | def model_post_init(self, __context) -> None: 30 | """Validate that error and system_message are not both provided""" 31 | if self.error is not None and self.system_message is not None: 32 | raise ValueError('Cannot have both error and system_message at the same time') 33 | 34 | def to_string(self) -> str: 35 | """Get string representation of the history item""" 36 | step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown' 37 | 38 | if self.error: 39 | return f"""<{step_str}> 40 | {self.error} 41 | """ 42 | elif self.system_message: 43 | return f""" 44 | {self.system_message} 45 | """ 46 | else: 47 | content_parts = [ 48 | f'Evaluation of Previous Step: {self.evaluation_previous_goal}', 49 | f'Memory: {self.memory}', 50 | f'Next Goal: {self.next_goal}', 51 | ] 52 | 53 | if self.action_results: 54 | content_parts.append(self.action_results) 55 | 56 | content = '\n'.join(content_parts) 57 | 58 | return f"""<{step_str}> 59 | {content} 60 | """ 61 | 62 | 63 | class MessageHistory(BaseModel): 64 | """History of messages""" 65 | 66 | messages: list[BaseMessage] = Field(default_factory=list) 67 | 68 | model_config = ConfigDict(arbitrary_types_allowed=True) 69 | 70 | def add_message(self, message: BaseMessage, position: int | None = None) -> None: 71 | """Add message to history""" 72 | if position is None: 73 | self.messages.append(message) 74 | else: 75 | self.messages.insert(position, message) 76 | 77 | def get_messages(self) -> list[BaseMessage]: 78 | """Get all messages""" 79 | return self.messages 80 | 81 | def remove_last_state_message(self) -> None: 82 | """Remove last state message from history""" 83 | if len(self.messages) > 2 and isinstance(self.messages[-1], UserMessage): 84 | self.messages.pop() 85 | 86 | 87 | class MessageManagerState(BaseModel): 88 | """Holds the state for MessageManager""" 89 | 90 | history: MessageHistory = Field(default_factory=MessageHistory) 91 | tool_id: int = 1 92 | agent_history_items: list[HistoryItem] = Field( 93 | default_factory=lambda: [HistoryItem(step_number=0, system_message='Agent initialized')] 94 | ) 95 | read_state_description: str = '' 96 | 97 | model_config = 
--------------------------------------------------------------------------------
/utils/browser-use/examples/ui/command_line.py:
--------------------------------------------------------------------------------
"""
Usage:

Example 1: Using OpenAI (the default provider) with the default task: 'go to reddit and search for posts about browser-use'
    python command_line.py

Example 2: Using OpenAI with a custom query
    python command_line.py --query "go to google and search for browser-use"

Example 3: Using Anthropic's Claude model with a custom query
    python command_line.py --query "find latest Python tutorials on Medium" --provider anthropic
"""

import argparse
import asyncio
import os
import sys

# Ensure the local repository (browser_use) is importable when running this example directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from dotenv import load_dotenv

load_dotenv()

from browser_use import Agent
from browser_use.browser import BrowserSession
from browser_use.controller.service import Controller


def get_llm(provider: str):
    if provider == 'anthropic':
        from browser_use.llm import ChatAnthropic

        api_key = os.getenv('ANTHROPIC_API_KEY')
        if not api_key:
            raise ValueError('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.')

        return ChatAnthropic(model='claude-3-5-sonnet-20240620', temperature=0.0)
    elif provider == 'openai':
        from browser_use.llm import ChatOpenAI

        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError('Error: OPENAI_API_KEY is not set. Please provide a valid API key.')

        return ChatOpenAI(model='gpt-4.1', temperature=0.0)
    else:
        raise ValueError(f'Unsupported provider: {provider}')


def parse_arguments():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description='Automate browser tasks using an LLM agent.')
    parser.add_argument(
        '--query', type=str, help='The query to process', default='go to reddit and search for posts about browser-use'
    )
    parser.add_argument(
        '--provider',
        type=str,
        choices=['openai', 'anthropic'],
        default='openai',
        help='The model provider to use (default: openai)',
    )
    return parser.parse_args()


def initialize_agent(query: str, provider: str):
    """Initialize the browser agent with the given query and provider."""
    llm = get_llm(provider)
    controller = Controller()
    browser_session = BrowserSession()

    return Agent(
        task=query,
        llm=llm,
        controller=controller,
        browser_session=browser_session,
        use_vision=True,
        max_actions_per_step=1,
    ), browser_session


async def main():
    """Main async function to run the agent."""
    args = parse_arguments()
    agent, browser_session = initialize_agent(args.query, args.provider)

    await agent.run(max_steps=25)

    input('Press Enter to close the browser...')
    await browser_session.close()


if __name__ == '__main__':
    asyncio.run(main())
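
get_llm() maps the --provider flag to a chat model class, so supporting another provider means adding a branch like the two above and extending the choices list in parse_arguments(). A hedged sketch for a Google/Gemini variant, assuming the vendored browser_use exposes ChatGoogle (as examples/models/gemini.py in this tree suggests); the model name and the GOOGLE_API_KEY variable are illustrative placeholders:

import os

def get_google_llm():
    # Hypothetical third provider, mirroring the branches in get_llm() above.
    # 'google' would also need to be added to the --provider choices.
    from browser_use.llm import ChatGoogle

    if not os.getenv('GOOGLE_API_KEY'):
        raise ValueError('Error: GOOGLE_API_KEY is not set. Please provide a valid API key.')

    return ChatGoogle(model='gemini-2.0-flash-exp')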
| "max_images": 70, 7 | "video_fps": 1, 8 | "max_video_frames": 64, 9 | "timeout": 300, 10 | "max_single_image_size": 1000, 11 | }, 12 | "glm": { 13 | "max_tokens": 16384, 14 | "max_images": 20, 15 | "video_fps": 1, 16 | "max_video_frames": 20, 17 | "timeout": 600 18 | }, 19 | "qwen": { 20 | "max_tokens": 4096, 21 | "max_images": 20, 22 | "video_fps": 1, 23 | "max_video_frames": 20, 24 | "timeout": 300, 25 | }, 26 | "gpt-4o-2024-05-13": { 27 | "max_tokens": 4096, 28 | "max_images": 70, 29 | "video_fps": 1, 30 | "max_video_frames": 64, 31 | "timeout": 600 32 | }, 33 | "gpt-5": { 34 | "max_tokens": 16384, 35 | "max_images": 70, 36 | "video_fps": 1, 37 | "max_video_frames": 64, 38 | "timeout": 600 39 | }, 40 | "doubao-seed-1-6-thinking-250615": { 41 | "max_tokens": 16384, 42 | "max_images": 70, 43 | "video_fps": 1, 44 | "max_video_frames": 64, 45 | "timeout": 600 46 | }, 47 | "gpt-4.1-2025-04-14":{ 48 | "max_tokens": 16384, 49 | "max_images": 70, 50 | "video_fps": 1, 51 | "max_video_frames": 64, 52 | "timeout": 600 53 | }, 54 | "default": { 55 | "max_tokens": 16384, 56 | "max_images": 70, 57 | "video_fps": 1, 58 | "max_video_frames": 64, 59 | "timeout": 300 60 | } 61 | } 62 | 63 | 64 | PROMPT_TEMPLATE = """ 65 | You are an expert front-end developer. Your task is to create a pixel-perfect replica of a website from a video. 66 | Generate a single `index.html` file that contains all HTML, CSS, and JavaScript necessary to replicate the UI, content, and interaction features shown. The webpage resolution in the video is {resolution}. 67 | 68 | Instructions: 69 | 1. Single File Output: All HTML, CSS, and JS must be in one `index.html` file. 70 | 2. If backend logic is implied, mock it in JS with static data (e.g., a JS array for a fake API call). 71 | 3. For all clickable elements, please add the class name "btn" in the HTML source code so that the evaluation agent can perform click evaluation. 72 | 4. Assets(Images and Videos in the webpage): 73 | - All images must use the provided stitched image assets. 74 | - The `src` attribute must start with the literal, unchanging string `__PLACEHOLDER_ASSETS_BASE_DIR__/`, followed by the actual filename identified from the stitched image. 75 | - For example: `src="__PLACEHOLDER_ASSETS_BASE_DIR__/logo.svg"`. 76 | - `` tags must include `width` and `height` attributes. 77 | - The provided stitched image assets are before the video. 78 | 5. No External Dependencies: The generated code must be entirely self-contained. No External Libraries and no External Fonts. 79 | 6. Final Response: Return **only the complete HTML code** in a single ```html code block, with no additional text or explanations. 80 | """ 81 | --------------------------------------------------------------------------------