├── utils ├── browser-use │ ├── examples │ │ ├── __init__.py │ │ ├── models │ │ │ ├── langchain │ │ │ │ ├── __init__.py │ │ │ │ ├── README.md │ │ │ │ └── example.py │ │ │ ├── README.md │ │ │ ├── gpt-4.1.py │ │ │ ├── claude-4-sonnet.py │ │ │ ├── llama4-groq.py │ │ │ ├── novita.py │ │ │ ├── gemini.py │ │ │ └── azure_openai.py │ │ ├── simple.py │ │ ├── features │ │ │ ├── planner.py │ │ │ ├── small_model_for_extraction.py │ │ │ ├── multi-tab_handling.py │ │ │ ├── initial_actions.py │ │ │ ├── save_trace.py │ │ │ ├── restrict_urls.py │ │ │ ├── download_file.py │ │ │ ├── custom_system_prompt.py │ │ │ ├── follow_up_tasks.py │ │ │ ├── drag_drop.py │ │ │ ├── validate_output.py │ │ │ ├── parallel_agents.py │ │ │ ├── cross_origin_iframes.py │ │ │ ├── result_processing.py │ │ │ ├── custom_output.py │ │ │ ├── outsource_state.py │ │ │ ├── custom_user_agent.py │ │ │ ├── multiple_tasks.py │ │ │ ├── sensitive_data.py │ │ │ └── pause_agent.py │ │ ├── ui │ │ │ ├── README.md │ │ │ ├── streamlit_demo.py │ │ │ └── command_line.py │ │ ├── use-cases │ │ │ ├── README.md │ │ │ ├── captcha.py │ │ │ ├── wikipedia_banana_to_quantum.py │ │ │ ├── twitter_post_using_cookies.py │ │ │ ├── online_coding_agent.py │ │ │ ├── check_appointment.py │ │ │ ├── scrolling_page.py │ │ │ ├── web_voyager_agent.py │ │ │ └── find_influencer_profiles.py │ │ ├── file_system │ │ │ ├── excel_sheet.py │ │ │ └── file_system.py │ │ ├── browser │ │ │ ├── real_browser.py │ │ │ ├── multiple_agents_same_browser.py │ │ │ └── using_cdp.py │ │ ├── custom-functions │ │ │ ├── notification.py │ │ │ ├── save_to_file_hugging_face.py │ │ │ ├── clipboard.py │ │ │ ├── save_pdf.py │ │ │ ├── onepassword_2fa.py │ │ │ ├── 2fa.py │ │ │ ├── solve_amazon_captcha.py │ │ │ ├── extract_pdf_content.py │ │ │ ├── perplexity_search.py │ │ │ └── file_upload.py │ │ ├── mcp │ │ │ └── simple_client.py │ │ └── integrations │ │ │ ├── slack │ │ │ └── slack_example.py │ │ │ └── browserbase_stagehand.py │ ├── .python-version │ ├── browser_use │ │ ├── dom │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ ├── playground │ │ │ │ └── process_dom.py │ │ │ ├── history_tree_processor │ │ │ │ └── view.py │ │ │ └── clickable_element_processor │ │ │ │ └── service.py │ │ ├── tokens │ │ │ ├── __init__.py │ │ │ └── views.py │ │ ├── filesystem │ │ │ └── __init__.py │ │ ├── llm │ │ │ ├── google │ │ │ │ └── __init__.py │ │ │ ├── aws │ │ │ │ └── __init__.py │ │ │ ├── openai │ │ │ │ └── like.py │ │ │ ├── README.md │ │ │ ├── exceptions.py │ │ │ ├── openrouter │ │ │ │ └── serializer.py │ │ │ ├── views.py │ │ │ ├── base.py │ │ │ ├── __init__.py │ │ │ ├── tests │ │ │ │ └── test_groq_loop.py │ │ │ └── ollama │ │ │ │ └── chat.py │ │ ├── exceptions.py │ │ ├── sync │ │ │ └── __init__.py │ │ ├── telemetry │ │ │ ├── __init__.py │ │ │ └── views.py │ │ ├── mcp │ │ │ ├── __main__.py │ │ │ └── __init__.py │ │ ├── browser │ │ │ ├── browser.py │ │ │ ├── __init__.py │ │ │ ├── context.py │ │ │ ├── utils.py │ │ │ └── views.py │ │ ├── README.md │ │ ├── integrations │ │ │ └── gmail │ │ │ │ └── __init__.py │ │ ├── agent │ │ │ └── message_manager │ │ │ │ ├── utils.py │ │ │ │ └── views.py │ │ └── __init__.py │ ├── docs │ │ ├── favicon.ico │ │ ├── images │ │ │ ├── laminar.png │ │ │ ├── browser-use.png │ │ │ └── checks-passed.png │ │ ├── development │ │ │ ├── roadmap.mdx │ │ │ ├── telemetry.mdx │ │ │ ├── evaluations.mdx │ │ │ ├── observability.mdx │ │ │ └── contribution-guide.mdx │ │ ├── README.md │ │ ├── favicon.svg │ │ ├── customize │ │ │ ├── output-format.mdx │ │ │ └── system-prompt.mdx │ │ └── quickstart.mdx │ ├── static │ │ 
├── browser-use.png │ │ └── browser-use-dark.png │ ├── .github │ │ ├── .git-blame-ignore-revs │ │ ├── CONTRIBUTING.md │ │ ├── ISSUE_TEMPLATE │ │ │ ├── config.yml │ │ │ └── 4_docs_issue.yml │ │ ├── workflows │ │ │ ├── cloud_evals.yml │ │ │ ├── lint.yml │ │ │ ├── claude.yml │ │ │ ├── build-base-image.yml.disabled │ │ │ ├── package.yaml │ │ │ └── docker.yml │ │ └── SECURITY.md │ ├── .gitattributes │ ├── tests │ │ ├── agent_tasks │ │ │ ├── browser_use_pip.yaml │ │ │ ├── amazon_laptop.yaml │ │ │ ├── captcha_cloudflare.yaml │ │ │ └── README.md │ │ ├── old │ │ │ ├── test_full_screen.py │ │ │ ├── test_dropdown_error.py │ │ │ ├── screenshot_test.py │ │ │ ├── test_gif_path.py │ │ │ ├── test_dropdown.py │ │ │ ├── test_react_dropdown.py │ │ │ ├── test_dropdown_complex.py │ │ │ ├── httpx_client_test.py │ │ │ ├── test_vision.py │ │ │ └── test_wait_for_element.py │ │ └── ci │ │ │ ├── test_browser_session_via_cdp.py │ │ │ └── test_schema_optimizer.py │ ├── bin │ │ ├── test.sh │ │ ├── lint.sh │ │ └── setup.sh │ ├── docker │ │ ├── base-images │ │ │ ├── system │ │ │ │ └── Dockerfile │ │ │ ├── python-deps │ │ │ │ └── Dockerfile │ │ │ └── chromium │ │ │ │ └── Dockerfile │ │ ├── README.md │ │ └── build-base-images.sh │ ├── .env.example │ ├── .dockerignore │ ├── .gitignore │ ├── LICENSE │ ├── Dockerfile.fast │ ├── .pre-commit-config.yaml │ └── .cursor │ │ └── rules │ │ └── browser-use-rules.mdc ├── __pycache__ │ ├── core.cpython-312.pyc │ ├── __init__.cpython-312.pyc │ ├── browser.cpython-312.pyc │ ├── metrics.cpython-312.pyc │ ├── operations.cpython-312.pyc │ ├── agent_runner.cpython-312.pyc │ ├── browser_runner.cpython-312.pyc │ ├── visual_scorer.cpython-312.pyc │ └── assertion_scorer.cpython-312.pyc └── __init__.py ├── assets ├── result.png ├── overview.png └── iwrbench_logo.png ├── .env.example ├── requirements.txt └── config.py /utils/browser-use/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/browser-use/.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/tokens/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/filesystem/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/langchain/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/assets/result.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/assets/overview.png -------------------------------------------------------------------------------- /assets/iwrbench_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/assets/iwrbench_logo.png -------------------------------------------------------------------------------- /utils/browser-use/docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/docs/favicon.ico -------------------------------------------------------------------------------- /utils/__pycache__/core.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/core.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/static/browser-use.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/static/browser-use.png -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/browser.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/browser.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/metrics.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/metrics.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/.github/.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | 66b3c26df51adec32d42c3b2c0304e0662457298 2 | 2be4ba4f7078d47bbeed04baf6f8fb04017df028 3 | -------------------------------------------------------------------------------- /utils/browser-use/docs/images/laminar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/docs/images/laminar.png -------------------------------------------------------------------------------- /utils/__pycache__/operations.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/operations.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/.gitattributes: -------------------------------------------------------------------------------- 1 | static/*.gif filter=lfs diff=lfs merge=lfs -text 2 | # static/*.mp4 filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/google/__init__.py: 
-------------------------------------------------------------------------------- 1 | from browser_use.llm.google.chat import ChatGoogle 2 | 3 | __all__ = ['ChatGoogle'] 4 | -------------------------------------------------------------------------------- /utils/browser-use/docs/images/browser-use.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/docs/images/browser-use.png -------------------------------------------------------------------------------- /utils/browser-use/static/browser-use-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/static/browser-use-dark.png -------------------------------------------------------------------------------- /utils/__pycache__/agent_runner.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/agent_runner.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/browser_runner.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/browser_runner.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/visual_scorer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/visual_scorer.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/docs/images/checks-passed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/browser-use/docs/images/checks-passed.png -------------------------------------------------------------------------------- /utils/__pycache__/assertion_scorer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SIGMME/IWR-Bench/HEAD/utils/__pycache__/assertion_scorer.cpython-312.pyc -------------------------------------------------------------------------------- /utils/browser-use/examples/models/README.md: -------------------------------------------------------------------------------- 1 | # Gemini 2 | Detailed video on how to integrate browser-use with Gemini: https://www.youtube.com/watch?v=JluZiWBV_Tc 3 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | OPENAI_BASE_URL= 3 | OPENAI_MODEL_NAME= 4 | MODEL_FOR_EVAL= # using for eval 5 | HEADLESS=true # set to false if you want to see the browser -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/utils.py: -------------------------------------------------------------------------------- 1 | def cap_text_length(text: str, max_length: int) -> str: 2 | if len(text) > max_length: 3 | return text[:max_length] + '...' 
4 | return text 5 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/roadmap.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Roadmap" 3 | description: "Future plans and upcoming features for Browser Use" 4 | icon: "road" 5 | --- 6 | 7 | Big things coming soon! 8 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/exceptions.py: -------------------------------------------------------------------------------- 1 | class LLMException(Exception): 2 | def __init__(self, status_code, message): 3 | self.status_code = status_code 4 | self.message = message 5 | super().__init__(f'Error {status_code}: {message}') 6 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/aws/__init__.py: -------------------------------------------------------------------------------- 1 | from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock 2 | from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock 3 | 4 | __all__ = [ 5 | 'ChatAWSBedrock', 6 | 'ChatAnthropicBedrock', 7 | ] 8 | -------------------------------------------------------------------------------- /utils/browser-use/tests/agent_tasks/browser_use_pip.yaml: -------------------------------------------------------------------------------- 1 | name: Find pip install command for browser-use 2 | task: Find the pip installation command for the browser-use repo 3 | judge_context: 4 | - The output must include the command ('pip install browser-use') 5 | max_steps: 10 6 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/sync/__init__.py: -------------------------------------------------------------------------------- 1 | """Cloud sync module for Browser Use.""" 2 | 3 | from browser_use.sync.auth import CloudAuthConfig, DeviceAuthClient 4 | from browser_use.sync.service import CloudSync 5 | 6 | __all__ = ['CloudAuthConfig', 'DeviceAuthClient', 'CloudSync'] 7 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/telemetry/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Telemetry for Browser Use. 3 | """ 4 | 5 | from browser_use.telemetry.service import ProductTelemetry 6 | from browser_use.telemetry.views import BaseTelemetryEvent 7 | 8 | __all__ = ['BaseTelemetryEvent', 'ProductTelemetry'] 9 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/mcp/__main__.py: -------------------------------------------------------------------------------- 1 | """Entry point for running MCP server as a module. 
2 | 3 | Usage: 4 | python -m browser_use.mcp.server 5 | """ 6 | 7 | import asyncio 8 | 9 | from browser_use.mcp.server import main 10 | 11 | if __name__ == '__main__': 12 | asyncio.run(main()) 13 | -------------------------------------------------------------------------------- /utils/browser-use/tests/agent_tasks/amazon_laptop.yaml: -------------------------------------------------------------------------------- 1 | name: Amazon Laptop Search 2 | task: Go to amazon.com, search for 'laptop', and return the first result 3 | judge_context: 4 | - The agent must navigate to amazon.com 5 | - The agent must search for 'laptop' 6 | - The agent must return name of the first laptop 7 | max_steps: 10 8 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/browser.py: -------------------------------------------------------------------------------- 1 | from browser_use.browser.profile import BrowserProfile 2 | from browser_use.browser.session import BrowserSession 3 | 4 | BrowserConfig = BrowserProfile 5 | BrowserContextConfig = BrowserProfile 6 | Browser = BrowserSession 7 | 8 | __all__ = ['BrowserConfig', 'BrowserContextConfig', 'Browser'] 9 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __all__ = ["browser", "operations", "visual_scorer", "assertion_scorer"] 3 | 4 | 5 | def __getattr__(name: str): 6 | if name in __all__: 7 | module = __import__(f"utils.{name}", fromlist=[name]) 8 | globals()[name] = module 9 | return module 10 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 11 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/__init__.py: -------------------------------------------------------------------------------- 1 | from .browser import Browser, BrowserConfig 2 | from .context import BrowserContext, BrowserContextConfig 3 | from .profile import BrowserProfile 4 | from .session import BrowserSession 5 | 6 | __all__ = ['Browser', 'BrowserConfig', 'BrowserContext', 'BrowserContextConfig', 'BrowserSession', 'BrowserProfile'] 7 | -------------------------------------------------------------------------------- /utils/browser-use/bin/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to run all the main project tests that run on CI via .github/workflows/test.yaml. 3 | # Usage: 4 | # $ ./bin/test.sh 5 | 6 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 7 | cd "$SCRIPT_DIR/.." 
|| exit 1 8 | 9 | exec uv run pytest --numprocesses auto tests/ci $1 $2 $3 10 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/context.py: -------------------------------------------------------------------------------- 1 | from browser_use.browser.profile import BrowserProfile 2 | from browser_use.browser.session import BrowserSession 3 | 4 | Browser = BrowserSession 5 | BrowserConfig = BrowserProfile 6 | BrowserContext = BrowserSession 7 | BrowserContextConfig = BrowserProfile 8 | 9 | __all__ = ['Browser', 'BrowserConfig', 'BrowserContext', 'BrowserContextConfig'] 10 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/openai/like.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from browser_use.llm.openai.chat import ChatOpenAI 4 | 5 | 6 | @dataclass 7 | class ChatOpenAILike(ChatOpenAI): 8 | """ 9 | A class for to interact with any provider using the OpenAI API schema. 10 | 11 | Args: 12 | model (str): The name of the OpenAI model to use. 13 | """ 14 | 15 | model: str 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.57.0 2 | huggingface-hub==0.35.3 3 | tokenizers==0.22.1 4 | safetensors==0.6.2 5 | numpy>=1.24,<2.4 6 | opencv-python==4.12.0.88 7 | Pillow==11.3.0 8 | torch==2.8.0 9 | Levenshtein==0.27.1 10 | rapidfuzz==3.14.1 11 | httpx>=0.24.0 12 | python-dotenv>=1.0.0 13 | openai>=1.0.0 14 | jinja2>=3.0.0 15 | easyocr>=1.6.0 16 | pyyaml>=6.0.0 17 | regex>=2025.0.0 18 | filelock 19 | fsspec -------------------------------------------------------------------------------- /utils/browser-use/docker/base-images/system/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | 3 | # Install minimal system dependencies 4 | RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ 5 | apt-get update && \ 6 | apt-get install -y --no-install-recommends ca-certificates curl wget && \ 7 | rm -rf /var/lib/apt/lists/* 8 | 9 | # Install uv package manager 10 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ 11 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/mcp/__init__.py: -------------------------------------------------------------------------------- 1 | """MCP (Model Context Protocol) support for browser-use. 2 | 3 | This module provides integration with MCP servers and clients for browser automation. 
4 | """ 5 | 6 | from browser_use.mcp.client import MCPClient 7 | from browser_use.mcp.controller import MCPToolWrapper 8 | from browser_use.mcp.server import BrowserUseServer 9 | 10 | __all__ = ['MCPClient', 'MCPToolWrapper', 'BrowserUseServer'] 11 | -------------------------------------------------------------------------------- /utils/browser-use/docker/base-images/python-deps/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_TAG=latest 2 | FROM browseruse/base-chromium:${BASE_TAG} 3 | 4 | ENV PYTHONUNBUFFERED=1 PATH="/app/.venv/bin:$PATH" PLAYWRIGHT_BROWSERS_PATH=/opt/playwright 5 | 6 | WORKDIR /app 7 | COPY pyproject.toml uv.lock* ./ 8 | 9 | RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \ 10 | uv venv && \ 11 | uv sync --all-extras --no-dev --no-install-project --compile-bytecode 12 | -------------------------------------------------------------------------------- /utils/browser-use/bin/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to run the formatter, linter, and type checker pre-commit hooks. 3 | # Usage: 4 | # $ ./bin/lint.sh 5 | 6 | IFS=$'\n' 7 | 8 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 9 | 10 | cd "$SCRIPT_DIR/.." || exit 1 11 | 12 | echo "[*] Running ruff linter, formatter, pyright type checker, and other pre-commit checks..." 13 | exec uv run pre-commit run --all-files 14 | -------------------------------------------------------------------------------- /utils/browser-use/tests/agent_tasks/captcha_cloudflare.yaml: -------------------------------------------------------------------------------- 1 | name: Cloudflare captcha 2 | task: Go to https://2captcha.com/demo/cloudflare-turnstile and solve the captcha, wait a few seconds, then click on check, wait a few more seconds for it to complete, then extract the "hostname" value from the displayed dictionary under "Captcha is passed successfully!" 3 | judge_context: 4 | - The agent must solve the captcha 5 | - The hostname returned should be "example.com" 6 | max_steps: 6 7 | -------------------------------------------------------------------------------- /utils/browser-use/.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to browser-use 2 | 3 | We love contributions! Please read through these links to get started: 4 | 5 | - 🔢 [Contribution Guidelines](https://docs.browser-use.com/development/contribution-guide) 6 | - 👾 [Local Development Setup Guide](https://docs.browser-use.com/development/local-setup) 7 | - 🏷️ [Issues Tagged: `#help-wanted`](https://github.com/browser-use/browser-use/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22help%20wanted%22) 8 | -------------------------------------------------------------------------------- /utils/browser-use/docs/README.md: -------------------------------------------------------------------------------- 1 | # Docs 2 | 3 | The official documentation for Browser Use. The docs are published to [Browser Use Docs](https://docs.browser-use.com). 4 | 5 | ### Development 6 | 7 | Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. 
To install, use the following command 8 | 9 | ``` 10 | npm i -g mintlify 11 | ``` 12 | 13 | Run the following command at the root of your documentation (where mint.json is) 14 | 15 | ``` 16 | mintlify dev 17 | ``` 18 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/README.md: -------------------------------------------------------------------------------- 1 | # Browser Use LLMs 2 | 3 | We officially support the following LLMs: 4 | 5 | - OpenAI 6 | - Anthropic 7 | - Google 8 | - Groq 9 | - Ollama 10 | - DeepSeek 11 | 12 | ## Migrating from LangChain 13 | 14 | Because of how we implemented the LLMs, we can technically support anything. If you want to use a LangChain model, you can use the `ChatLangchain` (NOT OFFICIALLY SUPPORTED) class. 15 | 16 | You can find all the details in the [LangChain example](examples/models/langchain/example.py). We suggest you grab that code and use it as a reference. 17 | -------------------------------------------------------------------------------- /utils/browser-use/.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false # Set to true if you want to allow blank issues 2 | contact_links: 3 | - name: 🔢 Quickstart Guide 4 | url: https://docs.browser-use.com/quickstart 5 | about: Most common issues can be resolved by following our quickstart guide 6 | - name: 💬 Questions and Help 7 | url: https://link.browser-use.com/discord 8 | about: Please ask questions in our Discord community 9 | - name: 📖 Documentation 10 | url: https://docs.browser-use.com 11 | about: Check our documentation for answers first 12 | -------------------------------------------------------------------------------- /utils/browser-use/examples/simple.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | from browser_use.llm.openai.chat import ChatOpenAI 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | 14 | from browser_use import Agent 15 | 16 | # Initialize the model 17 | llm = ChatOpenAI( 18 | model='gpt-4.1-mini', 19 | ) 20 | 21 | 22 | task = 'Find the founders of browser-use' 23 | agent = Agent(task=task, llm=llm) 24 | 25 | 26 | async def main(): 27 | await agent.run() 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_full_screen.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from browser_use.browser.types import async_playwright 4 | 5 | 6 | async def test_full_screen(start_fullscreen: bool, maximize: bool): 7 | async with async_playwright() as p: 8 | browser = await p.chromium.launch( 9 | headless=False, 10 | args=['--start-maximized'], 11 | ) 12 | context = await browser.new_context(no_viewport=True, viewport=None) 13 | page = await context.new_page() 14 | await page.goto('https://google.com') 15 | 16 | await asyncio.sleep(10) 17 | await browser.close() 18 | 19 | 20 | if __name__ == '__main__': 21 | asyncio.run(test_full_screen(False, False)) 22 | -------------------------------------------------------------------------------- /utils/browser-use/.env.example: -------------------------------------------------------------------------------- 1 
| OPENAI_API_KEY= 2 | ANTHROPIC_API_KEY= 3 | AZURE_OPENAI_ENDPOINT= 4 | AZURE_OPENAI_KEY= 5 | GOOGLE_API_KEY= 6 | DEEPSEEK_API_KEY= 7 | GROK_API_KEY= 8 | NOVITA_API_KEY= 9 | 10 | # Set to false to disable anonymized telemetry 11 | ANONYMIZED_TELEMETRY=true 12 | 13 | # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info 14 | BROWSER_USE_LOGGING_LEVEL=info 15 | 16 | # Calculate costs: (beta) Add cost calculations to tokens. Available: true | false 17 | BROWSER_USE_CALCULATE_COST=false 18 | 19 | # set this to true to optimize browser-use's chrome for running inside docker 20 | IN_DOCKER=false 21 | -------------------------------------------------------------------------------- /utils/browser-use/.dockerignore: -------------------------------------------------------------------------------- 1 | docs/ 2 | static/ 3 | .claude/ 4 | .github/ 5 | 6 | # Cache files 7 | .DS_Store 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | .mypy_cache/ 12 | .ruff_cache/ 13 | .pytest_cache/ 14 | .ipynb_checkpoints 15 | 16 | # Virtual Environments 17 | .venv 18 | venv/ 19 | 20 | # Editor cruft 21 | .vscode/ 22 | .idea/ 23 | 24 | # Build Files 25 | dist/ 26 | 27 | # Data files 28 | *.gif 29 | *.txt 30 | *.pdf 31 | *.csv 32 | *.json 33 | *.jsonl 34 | *.bak 35 | 36 | # Secrets and sensitive files 37 | secrets.env 38 | .env 39 | browser_cookies.json 40 | cookies.json 41 | gcp-login.json 42 | saved_trajectories/ 43 | AgentHistory.json 44 | AgentHistoryList.json 45 | private_example.py 46 | private_example 47 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/exceptions.py: -------------------------------------------------------------------------------- 1 | class ModelError(Exception): 2 | pass 3 | 4 | 5 | class ModelProviderError(ModelError): 6 | """Exception raised when a model provider returns an error.""" 7 | 8 | def __init__( 9 | self, 10 | message: str, 11 | status_code: int = 502, 12 | model: str | None = None, 13 | ): 14 | super().__init__(message, status_code) 15 | self.model = model 16 | 17 | 18 | class ModelRateLimitError(ModelProviderError): 19 | """Exception raised when a model provider returns a rate limit error.""" 20 | 21 | def __init__( 22 | self, 23 | message: str, 24 | status_code: int = 429, 25 | model: str | None = None, 26 | ): 27 | super().__init__(message, status_code, model) 28 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/planner.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.llm import ChatOpenAI 13 | 14 | llm = ChatOpenAI(model='gpt-4.1', temperature=0.0) 15 | planner_llm = ChatOpenAI( 16 | model='o3-mini', 17 | ) 18 | task = 'your task' 19 | 20 | 21 | agent = Agent(task=task, llm=llm, planner_llm=planner_llm, use_vision_for_planner=False, planner_interval=1) 22 | 23 | 24 | async def main(): 25 | await agent.run() 26 | 27 | 28 | if __name__ == '__main__': 29 | asyncio.run(main()) 30 | -------------------------------------------------------------------------------- /utils/browser-use/examples/ui/README.md: 
-------------------------------------------------------------------------------- 1 | # **User Interfaces of Browser-Use** 2 | 3 | | **File Name** | **User Interface** | **Description** | **Example Usage** | 4 | |------------------------|-------------------|-------------------------------------------|-------------------------------------------| 5 | | `command_line.py` | **Terminal** | Parses arguments for command-line execution. | `python command_line.py` | 6 | | `gradio_demo.py` | **Gradio** | Provides a Gradio-based interactive UI. | `python gradio_demo.py` | 7 | | `streamlit_demo.py` | **Streamlit** | Runs a Streamlit-based web interface. | `python -m streamlit run streamlit_demo.py` | 8 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/small_model_for_extraction.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.llm import ChatOpenAI 13 | 14 | llm = ChatOpenAI(model='gpt-4.1', temperature=0.0) 15 | small_llm = ChatOpenAI(model='gpt-4.1-mini', temperature=0.0) 16 | task = 'Find the founders of browser-use in ycombinator, extract all links and open the links one by one' 17 | agent = Agent(task=task, llm=llm, page_extraction_llm=small_llm) 18 | 19 | 20 | async def main(): 21 | await agent.run() 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/multi-tab_handling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from browser_use import Agent 18 | from browser_use.llm import ChatOpenAI 19 | 20 | # video: https://preview.screen.studio/share/clenCmS6 21 | llm = ChatOpenAI(model='gpt-4.1') 22 | agent = Agent( 23 | task='open 3 tabs with elon musk, trump, and steve jobs, then go back to the first and stop', 24 | llm=llm, 25 | ) 26 | 27 | 28 | async def main(): 29 | await agent.run() 30 | 31 | 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/gpt-4.1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
5 | """ 6 | 7 | import asyncio 8 | 9 | from dotenv import load_dotenv 10 | from lmnr import Laminar 11 | 12 | from browser_use import Agent 13 | from browser_use.llm import ChatOpenAI 14 | 15 | load_dotenv() 16 | 17 | 18 | Laminar.initialize() 19 | 20 | # All the models are type safe from OpenAI in case you need a list of supported models 21 | llm = ChatOpenAI(model='gpt-4.1-mini') 22 | agent = Agent( 23 | task='Go to example.com, click on the first link, and give me the title of the page', 24 | llm=llm, 25 | ) 26 | 27 | 28 | async def main(): 29 | await agent.run(max_steps=10) 30 | input('Press Enter to continue...') 31 | 32 | 33 | asyncio.run(main()) 34 | -------------------------------------------------------------------------------- /utils/browser-use/.gitignore: -------------------------------------------------------------------------------- 1 | # Cache files 2 | .DS_Store 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | .mypy_cache/ 7 | .ruff_cache/ 8 | .pytest_cache/ 9 | .ipynb_checkpoints 10 | ~/ 11 | 12 | # Virtual Environments 13 | .venv* 14 | venv/ 15 | 16 | # IDEs 17 | .vscode/ 18 | .idea/ 19 | 20 | # Build files 21 | dist/ 22 | 23 | # Data files 24 | *.gif 25 | *.txt 26 | *.pdf 27 | *.csv 28 | *.json 29 | *.jsonl 30 | *.log 31 | *.bak 32 | 33 | # Secrets and sensitive files 34 | secrets.env 35 | .env 36 | browser_cookies.json 37 | cookies.json 38 | gcp-login.json 39 | saved_trajectories/ 40 | old_tests/ 41 | AgentHistory.json 42 | AgentHistoryList.json 43 | private_example.py 44 | private_example 45 | CLAUDE.local.md 46 | 47 | uv.lock 48 | temp 49 | tmp 50 | 51 | # Google API credentials 52 | credentials.json 53 | token.json 54 | -------------------------------------------------------------------------------- /utils/browser-use/tests/agent_tasks/README.md: -------------------------------------------------------------------------------- 1 | # Contributing Agent Tasks 2 | 3 | Contribute your own agent tasks and we test if the agent solves them for CI testing! 4 | 5 | ## How to Add a Task 6 | 7 | 1. Create a new `.yaml` file in this directory (`tests/agent_tasks/`). 8 | 2. Use the following format: 9 | 10 | ```yaml 11 | name: My Task Name 12 | task: Describe the task for the agent to perform 13 | judge_context: 14 | - List criteria for success, one per line 15 | max_steps: 10 16 | ``` 17 | 18 | ## Guidelines 19 | - Be specific in your task and criteria. 20 | - The `judge_context` should list what counts as a successful result. 21 | - The agent's output will be judged by an LLM using these criteria. 22 | 23 | ## Running the Tests 24 | 25 | To run all agent tasks: 26 | 27 | ```bash 28 | pytest tests/ci/test_agent_real_tasks.py 29 | ``` 30 | 31 | --- 32 | 33 | Happy contributing! 
34 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/initial_actions.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.llm import ChatOpenAI 13 | 14 | llm = ChatOpenAI(model='gpt-4.1') 15 | 16 | initial_actions = [ 17 | {'go_to_url': {'url': 'https://www.google.com', 'new_tab': True}}, 18 | {'go_to_url': {'url': 'https://en.wikipedia.org/wiki/Randomness', 'new_tab': True}}, 19 | {'scroll_down': {'amount': 1000}}, 20 | ] 21 | agent = Agent( 22 | task='What theories are displayed on the page?', 23 | initial_actions=initial_actions, 24 | llm=llm, 25 | ) 26 | 27 | 28 | async def main(): 29 | await agent.run(max_steps=10) 30 | 31 | 32 | if __name__ == '__main__': 33 | asyncio.run(main()) 34 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/claude-4-sonnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple script that runs the task of opening amazon and searching. 3 | @dev Ensure we have a `ANTHROPIC_API_KEY` variable in our `.env` file. 4 | """ 5 | 6 | import asyncio 7 | import os 8 | import sys 9 | 10 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 11 | 12 | from dotenv import load_dotenv 13 | from lmnr import Laminar 14 | 15 | load_dotenv() 16 | Laminar.initialize() 17 | 18 | from browser_use import Agent 19 | from browser_use.llm import ChatAnthropic 20 | 21 | llm = ChatAnthropic(model='claude-4-sonnet-20250514', temperature=0.0) 22 | 23 | agent = Agent( 24 | task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result', 25 | llm=llm, 26 | ) 27 | 28 | 29 | async def main(): 30 | await agent.run(max_steps=10) 31 | 32 | 33 | asyncio.run(main()) 34 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/cloud_evals.yml: -------------------------------------------------------------------------------- 1 | name: cloud_evals 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - 'releases/*' 8 | workflow_dispatch: 9 | inputs: 10 | commit_hash: 11 | description: Commit hash of the library to build the Cloud eval image for 12 | required: false 13 | 14 | jobs: 15 | trigger_cloud_eval_image_build: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/github-script@v7 19 | with: 20 | github-token: ${{ secrets.TRIGGER_CLOUD_BUILD_GH_KEY }} 21 | script: | 22 | const result = await github.rest.repos.createDispatchEvent({ 23 | owner: 'browser-use', 24 | repo: 'cloud', 25 | event_type: 'trigger-workflow', 26 | client_payload: {"commit_hash": "${{ github.event.inputs.commit_hash || github.sha }}"} 27 | }) 28 | console.log(result) 29 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/langchain/README.md: -------------------------------------------------------------------------------- 1 | # Langchain Models (legacy) 2 | 3 | This directory contains example of how to still use Langchain models with the new Browser Use chat models. 
4 | 5 | ## How to use 6 | 7 | ```python 8 | from langchain_openai import ChatOpenAI 9 | 10 | from browser_use import Agent 11 | from .chat import ChatLangchain 12 | 13 | async def main(): 14 | """Basic example using ChatLangchain with OpenAI through LangChain.""" 15 | 16 | # Create a LangChain model (OpenAI) 17 | langchain_model = ChatOpenAI( 18 | model='gpt-4.1-mini', 19 | temperature=0.1, 20 | ) 21 | 22 | # Wrap it with ChatLangchain to make it compatible with browser-use 23 | llm = ChatLangchain(chat=langchain_model) 24 | 25 | agent = Agent( 26 | task="Go to google.com and search for 'browser automation with Python'", 27 | llm=llm, 28 | ) 29 | 30 | history = await agent.run() 31 | 32 | print(history.history) 33 | ``` 34 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/save_trace.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use.agent.service import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | llm = ChatOpenAI(model='gpt-4.1', temperature=0.0) 16 | 17 | 18 | async def main(): 19 | browser_session = BrowserSession( 20 | browser_profile=BrowserProfile( 21 | traces_dir='./tmp/traces/', 22 | user_data_dir='~/.config/browseruse/profiles/default', 23 | ) 24 | ) 25 | 26 | async with browser_session: 27 | agent = Agent( 28 | task='Go to hackernews, then go to apple.com and return all titles of open tabs', 29 | llm=llm, 30 | browser_session=browser_session, 31 | ) 32 | await agent.run() 33 | 34 | 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/openrouter/serializer.py: -------------------------------------------------------------------------------- 1 | from openai.types.chat import ChatCompletionMessageParam 2 | 3 | from browser_use.llm.messages import BaseMessage 4 | from browser_use.llm.openai.serializer import OpenAIMessageSerializer 5 | 6 | 7 | class OpenRouterMessageSerializer: 8 | """ 9 | Serializer for converting between custom message types and OpenRouter message formats. 10 | 11 | OpenRouter uses the OpenAI-compatible API, so we can reuse the OpenAI serializer. 12 | """ 13 | 14 | @staticmethod 15 | def serialize_messages(messages: list[BaseMessage]) -> list[ChatCompletionMessageParam]: 16 | """ 17 | Serialize a list of browser_use messages to OpenRouter-compatible messages. 
18 | 19 | Args: 20 | messages: List of browser_use messages 21 | 22 | Returns: 23 | List of OpenRouter-compatible messages (identical to OpenAI format) 24 | """ 25 | # OpenRouter uses the same message format as OpenAI 26 | return OpenAIMessageSerializer.serialize_messages(messages) 27 | -------------------------------------------------------------------------------- /utils/browser-use/docker/base-images/chromium/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_TAG=latest 2 | FROM browseruse/base-system:${BASE_TAG} 3 | 4 | WORKDIR /tmp 5 | COPY pyproject.toml ./ 6 | 7 | # Install both playwright and patchright with versions from pyproject.toml 8 | RUN --mount=type=cache,target=/root/.cache,sharing=locked \ 9 | PLAYWRIGHT_VERSION=$(grep -E "playwright>=" pyproject.toml | grep -o "[0-9]\+\.[0-9]\+\.[0-9]\+" | head -1) && \ 10 | PATCHRIGHT_VERSION=$(grep -E "patchright>=" pyproject.toml | grep -o "[0-9]\+\.[0-9]\+\.[0-9]\+" | head -1) && \ 11 | echo "Installing playwright==$PLAYWRIGHT_VERSION patchright==$PATCHRIGHT_VERSION" && \ 12 | pip install --no-cache-dir playwright==$PLAYWRIGHT_VERSION patchright==$PATCHRIGHT_VERSION && \ 13 | PLAYWRIGHT_BROWSERS_PATH=/opt/playwright playwright install --with-deps --no-shell chromium && \ 14 | ln -s /opt/playwright/chromium-*/chrome-linux/chrome /usr/bin/chromium-browser && \ 15 | chmod -R 755 /opt/playwright && \ 16 | rm -f pyproject.toml 17 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/llama4-groq.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | from dotenv import load_dotenv 8 | from lmnr import Laminar 9 | 10 | load_dotenv() 11 | 12 | 13 | Laminar.initialize() 14 | 15 | 16 | from browser_use import Agent 17 | from browser_use.llm import ChatGroq 18 | 19 | groq_api_key = os.environ.get('GROQ_API_KEY') 20 | llm = ChatGroq( 21 | model='meta-llama/llama-4-maverick-17b-128e-instruct', 22 | # temperature=0.1, 23 | ) 24 | 25 | # llm = ChatGroq( 26 | # model='meta-llama/llama-4-maverick-17b-128e-instruct', 27 | # api_key=os.environ.get('GROQ_API_KEY'), 28 | # temperature=0.0, 29 | # ) 30 | 31 | task = 'Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result' 32 | 33 | 34 | async def main(): 35 | agent = Agent( 36 | task=task, 37 | llm=llm, 38 | ) 39 | await agent.run() 40 | 41 | 42 | if __name__ == '__main__': 43 | asyncio.run(main()) 44 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/README.md: -------------------------------------------------------------------------------- 1 | # Use Cases of Browser-Use 2 | 3 | | File Name | Description | 4 | |-----------|------------| 5 | | `captcha.py` | Automates CAPTCHA solving on a demo website. | 6 | | `check_appointment.py` | Checks for available visa appointment slots on the Greece MFA website. | 7 | | `find_and_apply_to_jobs.py` | Searches for job listings, evaluates relevance based on a CV, and applies automatically. | 8 | | `online_coding_agent.py` | Implements a multi-agent system for online code editors, with separate agents for coding and execution. | 9 | | `post-twitter.py` | Provides a template for automated posting on X (Twitter), including new tweets, tagging, and replies. 
| 10 | | `scrolling_page.py` | Automates webpage scrolling with various scrolling actions and text search functionality. | 11 | | `twitter_post_using_cookies.py` | Automates posting on X (Twitter) using stored authentication cookies. | 12 | | `web_voyager_agent.py` | A general-purpose web navigation agent for tasks like flight booking and course searching. | 13 | -------------------------------------------------------------------------------- /utils/browser-use/tests/ci/test_browser_session_via_cdp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from browser_use.browser import BrowserSession 4 | from browser_use.browser.profile import BrowserProfile 5 | from browser_use.browser.types import async_playwright 6 | 7 | 8 | async def test_connection_via_cdp(): 9 | browser_session = BrowserSession( 10 | cdp_url='http://localhost:9898', 11 | browser_profile=BrowserProfile( 12 | headless=True, 13 | keep_alive=True, 14 | ), 15 | ) 16 | with pytest.raises(Exception) as e: 17 | await browser_session.start() 18 | 19 | # Assert on the exception value outside the context manager 20 | assert 'ECONNREFUSED' in str(e.value) 21 | 22 | playwright = await async_playwright().start() 23 | browser = await playwright.chromium.launch(args=['--remote-debugging-port=9898']) 24 | 25 | async with await browser_session.start(): 26 | await browser_session.create_new_tab() 27 | 28 | assert (await browser_session.get_current_page()).url == 'about:blank' 29 | 30 | await browser.close() 31 | 32 | await browser_session.kill() 33 | await playwright.stop() 34 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_dropdown_error.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | from browser_use.browser import BrowserProfile, BrowserSession 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | 14 | from browser_use import Agent, AgentHistoryList 15 | from browser_use.llm import ChatOpenAI 16 | 17 | llm = ChatOpenAI(model='gpt-4.1') 18 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True)) 19 | 20 | agent = Agent( 21 | task=('go to https://codepen.io/shyam-king/pen/emOyjKm and select number "4" and return the output of "selected value"'), 22 | llm=llm, 23 | browser_session=browser_session, 24 | ) 25 | 26 | 27 | async def test_dropdown(): 28 | await browser_session.start() 29 | try: 30 | history: AgentHistoryList = await agent.run(20) 31 | 32 | result = history.final_result() 33 | assert result is not None 34 | assert '4' in result 35 | print(result) 36 | finally: 37 | await browser_session.stop() 38 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/README.md: -------------------------------------------------------------------------------- 1 | # Codebase Structure 2 | 3 | > The code structure inspired by https://github.com/Netflix/dispatch. 4 | 5 | Very good structure on how to make a scalable codebase is also in [this repo](https://github.com/zhanymkanov/fastapi-best-practices). 6 | 7 | Just a brief document about how we should structure our backend codebase. 
8 | 9 | ## Code Structure 10 | 11 | ```markdown 12 | src/ 13 | // 14 | models.py 15 | services.py 16 | prompts.py 17 | views.py 18 | utils.py 19 | routers.py 20 | 21 | /_/ 22 | ``` 23 | 24 | ### Service.py 25 | 26 | Always a single file, except if it becomes too long - more than ~500 lines, split it into \_subservices 27 | 28 | ### Views.py 29 | 30 | Always split the views into two parts 31 | 32 | ```python 33 | # All 34 | ... 35 | 36 | # Requests 37 | ... 38 | 39 | # Responses 40 | ... 41 | ``` 42 | 43 | If too long → split into multiple files 44 | 45 | ### Prompts.py 46 | 47 | Single file; if too long → split into multiple files (one prompt per file or so) 48 | 49 | ### Routers.py 50 | 51 | Never split into more than one file 52 | -------------------------------------------------------------------------------- /utils/browser-use/examples/file_system/excel_sheet.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | from browser_use.llm.openai.chat import ChatOpenAI 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | from lmnr import Laminar 13 | 14 | try: 15 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 16 | except Exception: 17 | pass 18 | 19 | from browser_use import Agent 20 | 21 | # Initialize the model 22 | llm = ChatOpenAI( 23 | model='o4-mini', 24 | temperature=1.0, 25 | ) 26 | 27 | 28 | task = ( 29 | 'Find current stock price of companies Meta and Amazon. Then, make me a CSV file with 2 columns: company name, stock price.' 30 | ) 31 | 32 | agent = Agent(task=task, llm=llm) 33 | 34 | 35 | async def main(): 36 | import time 37 | 38 | start_time = time.time() 39 | history = await agent.run() 40 | # token usage 41 | print(history.usage) 42 | end_time = time.time() 43 | print(f'Time taken: {end_time - start_time} seconds') 44 | 45 | 46 | if __name__ == '__main__': 47 | asyncio.run(main()) 48 | -------------------------------------------------------------------------------- /utils/browser-use/.github/SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Reporting Security Issues 2 | 3 | If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure. 4 | 5 | **Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.** 6 | 7 | Instead, please open a new [Github security advisory](https://github.com/browser-use/browser-use/security/advisories/new). 8 | 9 | Please include as much of the information listed below as you can to help me better understand and resolve the issue: 10 | 11 | * The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting) 12 | * Full paths of source file(s) related to the manifestation of the issue 13 | * The location of the affected source code (tag/branch/commit or direct URL) 14 | * Any special configuration required to reproduce the issue 15 | * Step-by-step instructions to reproduce the issue 16 | * Proof-of-concept or exploit code (if possible) 17 | * Impact of the issue, including how an attacker might exploit the issue 18 | 19 | This information will help me triage your report more quickly. 
20 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/captcha.py: -------------------------------------------------------------------------------- 1 | """ 2 | Goal: Automates CAPTCHA solving on a demo website. 3 | 4 | 5 | Simple try of the agent. 6 | @dev You need to add OPENAI_API_KEY to your environment variables. 7 | NOTE: captchas are hard. For this example it works. But e.g. for iframes it does not. 8 | for this example it helps to zoom in. 9 | """ 10 | 11 | import asyncio 12 | import os 13 | import sys 14 | 15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 16 | 17 | from dotenv import load_dotenv 18 | 19 | load_dotenv() 20 | 21 | from browser_use import Agent 22 | from browser_use.llm import ChatOpenAI 23 | 24 | if not os.getenv('OPENAI_API_KEY'): 25 | raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') 26 | 27 | 28 | async def main(): 29 | llm = ChatOpenAI(model='gpt-4.1') 30 | agent = Agent( 31 | task='go to https://captcha.com/demos/features/captcha-demo.aspx and solve the captcha', 32 | llm=llm, 33 | ) 34 | await agent.run() 35 | input('Press Enter to exit') 36 | 37 | 38 | if __name__ == '__main__': 39 | asyncio.run(main()) 40 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/novita.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add NOVITA_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | 18 | from browser_use import Agent 19 | from browser_use.llm import ChatOpenAI 20 | 21 | api_key = os.getenv('NOVITA_API_KEY', '') 22 | if not api_key: 23 | raise ValueError('NOVITA_API_KEY is not set') 24 | 25 | 26 | async def run_search(): 27 | agent = Agent( 28 | task=( 29 | '1. Go to https://www.reddit.com/r/LocalLLaMA ' 30 | "2. Search for 'browser use' in the search bar" 31 | '3. Click on first result' 32 | '4. 
Return the first comment' 33 | ), 34 | llm=ChatOpenAI( 35 | base_url='https://api.novita.ai/v3/openai', 36 | model='deepseek/deepseek-v3-0324', 37 | api_key=api_key, 38 | ), 39 | use_vision=False, 40 | ) 41 | 42 | await agent.run() 43 | 44 | 45 | if __name__ == '__main__': 46 | asyncio.run(run_search()) 47 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/wikipedia_banana_to_quantum.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | # video https://preview.screen.studio/share/vuq91Ej8 16 | llm = ChatOpenAI( 17 | model='gpt-4.1', 18 | temperature=0.0, 19 | ) 20 | task = 'go to https://en.wikipedia.org/wiki/Banana and click on buttons on the wikipedia page to go as fast as possible from banna to Quantum mechanics' 21 | 22 | browser_session = BrowserSession( 23 | browser_profile=BrowserProfile( 24 | viewport_expansion=-1, 25 | highlight_elements=False, 26 | user_data_dir='~/.config/browseruse/profiles/default', 27 | ), 28 | ) 29 | agent = Agent(task=task, llm=llm, browser_session=browser_session, use_vision=False) 30 | 31 | 32 | async def main(): 33 | await agent.run() 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /utils/browser-use/examples/browser/real_browser.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | browser_profile = BrowserProfile( 16 | # NOTE: you need to close your chrome browser - so that this can open your browser in debug mode 17 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 18 | user_data_dir='~/.config/browseruse/profiles/default', 19 | headless=False, 20 | ) 21 | browser_session = BrowserSession(browser_profile=browser_profile) 22 | 23 | 24 | async def main(): 25 | agent = Agent( 26 | task='Find todays DOW stock price', 27 | llm=ChatOpenAI(model='gpt-4.1'), 28 | browser_session=browser_session, 29 | ) 30 | 31 | await agent.run() 32 | await browser_session.close() 33 | 34 | input('Press Enter to close...') 35 | 36 | 37 | if __name__ == '__main__': 38 | asyncio.run(main()) 39 | -------------------------------------------------------------------------------- /utils/browser-use/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Gregor Zunic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the 
Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/telemetry/views.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections.abc import Sequence 3 | from dataclasses import asdict, dataclass 4 | from typing import Any 5 | 6 | 7 | @dataclass 8 | class BaseTelemetryEvent(ABC): 9 | @property 10 | @abstractmethod 11 | def name(self) -> str: 12 | pass 13 | 14 | @property 15 | def properties(self) -> dict[str, Any]: 16 | return {k: v for k, v in asdict(self).items() if k != 'name'} 17 | 18 | 19 | @dataclass 20 | class AgentTelemetryEvent(BaseTelemetryEvent): 21 | # start details 22 | task: str 23 | model: str 24 | model_provider: str 25 | planner_llm: str | None 26 | max_steps: int 27 | max_actions_per_step: int 28 | use_vision: bool 29 | use_validation: bool 30 | version: str 31 | source: str 32 | # step details 33 | action_errors: Sequence[str | None] 34 | action_history: Sequence[list[dict] | None] 35 | urls_visited: Sequence[str | None] 36 | # end details 37 | steps: int 38 | total_input_tokens: int 39 | total_duration_seconds: float 40 | success: bool | None 41 | final_result_response: str | None 42 | error_message: str | None 43 | 44 | name: str = 'agent_event' 45 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/integrations/gmail/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Gmail Integration for Browser Use 3 | Provides Gmail API integration for email reading and verification code extraction. 4 | This integration enables agents to read email content and extract verification codes themselves. 
5 | Usage: 6 | from browser_use.integrations.gmail import GmailService, register_gmail_actions 7 | # Option 1: Register Gmail actions with file-based authentication 8 | controller = Controller() 9 | register_gmail_actions(controller) 10 | # Option 2: Register Gmail actions with direct access token (recommended for production) 11 | controller = Controller() 12 | register_gmail_actions(controller, access_token="your_access_token_here") 13 | # Option 3: Use the service directly 14 | gmail = GmailService(access_token="your_access_token_here") 15 | await gmail.authenticate() 16 | emails = await gmail.get_recent_emails() 17 | """ 18 | 19 | # @file purpose: Gmail integration for 2FA email authentication and email reading 20 | 21 | from .actions import register_gmail_actions 22 | from .service import GmailService 23 | 24 | __all__ = ['GmailService', 'register_gmail_actions'] 25 | -------------------------------------------------------------------------------- /utils/browser-use/docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker Setup for Browser-Use 2 | 3 | This directory contains the optimized Docker build system for browser-use, achieving < 30 second builds. 4 | 5 | ## Quick Start 6 | 7 | ```bash 8 | # Build base images (only needed once or when dependencies change) 9 | ./docker/build-base-images.sh 10 | 11 | # Build browser-use 12 | docker build -f Dockerfile.fast -t browseruse . 13 | 14 | # Or use the standard Dockerfile (slower but self-contained) 15 | docker build -t browseruse . 16 | ``` 17 | 18 | ## Files 19 | 20 | - `Dockerfile` - Standard self-contained build (~2 min) 21 | - `Dockerfile.fast` - Fast build using pre-built base images (~30 sec) 22 | - `docker/` - Base image definitions and build script 23 | - `base-images/system/` - Python + minimal system deps 24 | - `base-images/chromium/` - Adds Chromium browser 25 | - `base-images/python-deps/` - Adds Python dependencies 26 | - `build-base-images.sh` - Script to build all base images 27 | 28 | ## Performance 29 | 30 | | Build Type | Time | 31 | |------------|------| 32 | | Standard Dockerfile | ~2 minutes | 33 | | Fast build (with base images) | ~30 seconds | 34 | | Rebuild after code change | ~16 seconds | 35 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/screenshot_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | 4 | import pytest 5 | 6 | from browser_use.browser import BrowserProfile, BrowserSession 7 | 8 | 9 | async def test_take_full_page_screenshot(): 10 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, disable_security=True)) 11 | await browser_session.start() 12 | try: 13 | page = await browser_session.get_current_page() 14 | # Go to a test page 15 | await page.goto('https://example.com') 16 | 17 | await asyncio.sleep(3) 18 | # Take full page screenshot 19 | screenshot_b64 = await browser_session.take_screenshot(full_page=True) 20 | await asyncio.sleep(3) 21 | # Verify screenshot is not empty and is valid base64 22 | assert screenshot_b64 is not None 23 | assert isinstance(screenshot_b64, str) 24 | assert len(screenshot_b64) > 0 25 | 26 | # Test we can decode the base64 string 27 | try: 28 | base64.b64decode(screenshot_b64) 29 | except Exception as e: 30 | pytest.fail(f'Failed to decode base64 screenshot: {str(e)}') 31 | finally: 32 | await browser_session.stop() 33 | 34 | 35 | if __name__ == 
'__main__': 36 | asyncio.run(test_take_full_page_screenshot()) 37 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_gif_path.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | from browser_use.browser import BrowserProfile, BrowserSession 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | 14 | from browser_use import Agent, AgentHistoryList 15 | from browser_use.llm import ChatOpenAI 16 | 17 | llm = ChatOpenAI(model='gpt-4.1') 18 | 19 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, disable_security=True)) 20 | 21 | agent = Agent( 22 | task=('go to google.com and search for text "hi there"'), 23 | llm=llm, 24 | browser_session=browser_session, 25 | generate_gif='./google.gif', 26 | ) 27 | 28 | 29 | async def test_gif_path(): 30 | if os.path.exists('./google.gif'): 31 | os.unlink('./google.gif') 32 | 33 | await browser_session.start() 34 | try: 35 | history: AgentHistoryList = await agent.run(20) 36 | 37 | result = history.final_result() 38 | assert result is not None 39 | 40 | assert os.path.exists('./google.gif'), 'google.gif was not created' 41 | finally: 42 | await browser_session.stop() 43 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/gemini.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | from lmnr import Laminar 9 | 10 | load_dotenv() 11 | 12 | Laminar.initialize() 13 | 14 | 15 | from browser_use import Agent 16 | from browser_use.browser import BrowserProfile, BrowserSession 17 | from browser_use.llm import ChatGoogle 18 | 19 | api_key = os.getenv('GOOGLE_API_KEY') 20 | if not api_key: 21 | raise ValueError('GOOGLE_API_KEY is not set') 22 | 23 | llm = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 24 | 25 | browser_session = BrowserSession( 26 | browser_profile=BrowserProfile( 27 | viewport_expansion=0, 28 | user_data_dir='~/.config/browseruse/profiles/default', 29 | ) 30 | ) 31 | 32 | 33 | async def run_search(): 34 | agent = Agent( 35 | task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result', 36 | llm=llm, 37 | max_actions_per_step=4, 38 | browser_session=browser_session, 39 | ) 40 | 41 | await agent.run(max_steps=25) 42 | 43 | 44 | if __name__ == '__main__': 45 | asyncio.run(run_search()) 46 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/restrict_urls.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | llm = ChatOpenAI(model='gpt-4.1', temperature=0.0) 16 | task = ( 17 | "go to google.com and search for openai.com and click 
on the first link then extract content and scroll down - what's there?" 18 | ) 19 | 20 | allowed_domains = ['google.com'] 21 | 22 | browser_session = BrowserSession( 23 | browser_profile=BrowserProfile( 24 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 25 | allowed_domains=allowed_domains, 26 | user_data_dir='~/.config/browseruse/profiles/default', 27 | ), 28 | ) 29 | 30 | agent = Agent( 31 | task=task, 32 | llm=llm, 33 | browser_session=browser_session, 34 | ) 35 | 36 | 37 | async def main(): 38 | await agent.run(max_steps=25) 39 | 40 | input('Press Enter to close the browser...') 41 | await browser_session.close() 42 | 43 | 44 | asyncio.run(main()) 45 | -------------------------------------------------------------------------------- /utils/browser-use/examples/browser/multiple_agents_same_browser.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | from browser_use import Agent 13 | from browser_use.browser.profile import BrowserProfile 14 | from browser_use.browser.session import BrowserSession 15 | from browser_use.llm import ChatOpenAI 16 | 17 | 18 | async def main(): 19 | browser_session = BrowserSession( 20 | browser_profile=BrowserProfile( 21 | keep_alive=True, 22 | user_data_dir=None, 23 | headless=False, 24 | ) 25 | ) 26 | await browser_session.start() 27 | 28 | current_agent = None 29 | llm = ChatOpenAI(model='gpt-4.1') 30 | 31 | task1 = 'find todays weather on San Francisco and extract it as json' 32 | task2 = 'find todays weather in Zurich and extract it as json' 33 | 34 | agent1 = Agent( 35 | task=task1, 36 | browser_session=browser_session, 37 | llm=llm, 38 | ) 39 | agent2 = Agent( 40 | task=task2, 41 | browser_session=browser_session, 42 | llm=llm, 43 | ) 44 | 45 | await asyncio.gather(agent1.run(), agent2.run()) 46 | await browser_session.kill() 47 | 48 | 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/download_file.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | from browser_use import Agent 13 | from browser_use.browser import BrowserSession 14 | from browser_use.llm import ChatGoogle 15 | 16 | api_key = os.getenv('GOOGLE_API_KEY') 17 | if not api_key: 18 | raise ValueError('GOOGLE_API_KEY is not set') 19 | 20 | assert api_key is not None, 'GOOGLE_API_KEY must be set' 21 | llm = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 22 | 23 | from browser_use.browser import BrowserProfile 24 | 25 | browser_session = BrowserSession( 26 | browser_profile=BrowserProfile( 27 | downloads_path='~/Downloads', 28 | user_data_dir='~/.config/browseruse/profiles/default', 29 | ) 30 | ) 31 | 32 | 33 | async def run_download(): 34 | agent = Agent( 35 | task='Go to "https://file-examples.com/" and download the smallest doc file.', 36 | llm=llm, 37 | max_actions_per_step=8, 38 | use_vision=True, 39 | browser_session=browser_session, 40 | ) 41 | await agent.run(max_steps=25) 42 | 43 | 44 | if __name__ == '__main__': 45 | 
asyncio.run(run_download()) 46 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_dropdown.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test dropdown interaction functionality. 3 | """ 4 | 5 | import pytest 6 | 7 | from browser_use.agent.service import Agent 8 | from browser_use.agent.views import AgentHistoryList 9 | 10 | 11 | async def test_dropdown(llm, browser_session): 12 | """Test selecting an option from a dropdown menu.""" 13 | agent = Agent( 14 | task=( 15 | 'go to https://codepen.io/geheimschriftstift/pen/mPLvQz and first get all options for the dropdown and then select the 5th option' 16 | ), 17 | llm=llm, 18 | browser_session=browser_session, 19 | ) 20 | 21 | try: 22 | history: AgentHistoryList = await agent.run(20) 23 | result = history.final_result() 24 | 25 | # Verify dropdown interaction 26 | assert result is not None 27 | assert 'Duck' in result, "Expected 5th option 'Duck' to be selected" 28 | 29 | # Verify dropdown state 30 | page = await browser_session.get_current_page() 31 | element = await page.query_selector('select') 32 | assert element is not None, 'Dropdown element should exist' 33 | 34 | value = await element.evaluate('el => el.value') 35 | assert value == '5', 'Dropdown should have 5th option selected' 36 | 37 | except Exception as e: 38 | pytest.fail(f'Dropdown test failed: {str(e)}') 39 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_react_dropdown.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | from browser_use.browser import BrowserProfile, BrowserSession 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | import asyncio 14 | 15 | from browser_use import Agent, AgentHistoryList 16 | from browser_use.llm import ChatOpenAI 17 | 18 | llm = ChatOpenAI(model='gpt-4.1') 19 | 20 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, disable_security=True)) 21 | 22 | agent = Agent( 23 | task=( 24 | 'go to https://codepen.io/shyam-king/pen/ByBJoOv and select "Tiger" dropdown and read the text given in "Selected Animal" box (it can be empty as well)' 25 | ), 26 | llm=llm, 27 | browser_session=browser_session, 28 | ) 29 | 30 | 31 | async def test_dropdown(): 32 | await browser_session.start() 33 | try: 34 | history: AgentHistoryList = await agent.run(10) 35 | 36 | result = history.final_result() 37 | assert result is not None 38 | print('result: ', result) 39 | finally: 40 | await browser_session.stop() 41 | 42 | 43 | if __name__ == '__main__': 44 | asyncio.run(test_dropdown()) 45 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/telemetry.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Telemetry" 3 | description: "Understanding Browser Use's telemetry and privacy settings" 4 | icon: "chart-mixed" 5 | --- 6 | 7 | ## Overview 8 | 9 | Browser Use collects anonymous usage data to help us understand how the library is being used and to improve the user experience. It also helps us fix bugs faster and prioritize feature development. 
10 | 11 | ## Data Collection 12 | 13 | We use [PostHog](https://posthog.com) for telemetry collection. The data is completely anonymized and contains no personally identifiable information. 14 | 15 | 16 | We never collect personal information, credentials, or specific content from 17 | your browser automation tasks. 18 | 19 | 20 | ## Opting Out 21 | 22 | You can disable telemetry by setting an environment variable: 23 | 24 | ```bash .env 25 | ANONYMIZED_TELEMETRY=false 26 | ``` 27 | 28 | Or in your Python code: 29 | 30 | ```python 31 | import os 32 | os.environ["ANONYMIZED_TELEMETRY"] = "false" 33 | ``` 34 | 35 | 36 | Even when enabled, telemetry has zero impact on the library's performance or 37 | functionality. Code is available in [Telemetry 38 | Service](https://github.com/browser-use/browser-use/tree/main/browser_use/telemetry). 39 | 40 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/custom_system_prompt.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | try: 13 | from lmnr import Laminar 14 | 15 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 16 | except Exception as e: 17 | print(f'Error initializing Laminar: {e}') 18 | 19 | 20 | from browser_use import Agent 21 | from browser_use.llm import ChatOpenAI 22 | 23 | extend_system_message = ( 24 | 'REMEMBER the most important RULE: ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!!' 25 | ) 26 | 27 | # or use override_system_message to completely override the system prompt 28 | 29 | 30 | async def main(): 31 | task = 'do google search to find images of Elon Musk' 32 | model = ChatOpenAI(model='gpt-4.1') 33 | agent = Agent(task=task, llm=model, extend_system_message=extend_system_message) 34 | 35 | print( 36 | json.dumps( 37 | agent.message_manager.system_prompt.model_dump(exclude_unset=True), 38 | indent=4, 39 | ) 40 | ) 41 | 42 | await agent.run() 43 | 44 | 45 | if __name__ == '__main__': 46 | asyncio.run(main()) 47 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - stable 7 | - 'releases/**' 8 | tags: 9 | - '*' 10 | pull_request: 11 | workflow_dispatch: 12 | 13 | jobs: 14 | lint-syntax: 15 | name: syntax-errors 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: astral-sh/setup-uv@v5 20 | with: 21 | enable-cache: true 22 | - run: uv run ruff check --no-fix --select PLE 23 | 24 | lint-style: 25 | name: code-style 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v4 29 | - uses: astral-sh/setup-uv@v5 30 | with: 31 | enable-cache: true 32 | - run: uv sync --dev --all-extras # install extras for examples to avoid pyright missing imports errors 33 | - run: uv run pre-commit run --all-files --show-diff-on-failure 34 | 35 | lint-typecheck: 36 | name: type-checker 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/checkout@v4 40 | - uses: astral-sh/setup-uv@v6 41 | with: 42 | enable-cache: true 43 | - run: uv sync --dev --all-extras # install extras for examples to 
avoid pyright missing imports errors- 44 | - run: uv run pyright 45 | -------------------------------------------------------------------------------- /utils/browser-use/Dockerfile.fast: -------------------------------------------------------------------------------- 1 | # Fast Dockerfile using pre-built base images 2 | ARG REGISTRY=browseruse 3 | ARG BASE_TAG=latest 4 | FROM ${REGISTRY}/base-python-deps:${BASE_TAG} 5 | 6 | LABEL name="browseruse" description="Browser automation for AI agents" 7 | 8 | ENV BROWSERUSE_USER="browseruse" DEFAULT_PUID=911 DEFAULT_PGID=911 DATA_DIR=/data 9 | 10 | # Create user and directories 11 | RUN groupadd --system $BROWSERUSE_USER && \ 12 | useradd --system --create-home --gid $BROWSERUSE_USER --groups audio,video $BROWSERUSE_USER && \ 13 | usermod -u "$DEFAULT_PUID" "$BROWSERUSE_USER" && \ 14 | groupmod -g "$DEFAULT_PGID" "$BROWSERUSE_USER" && \ 15 | mkdir -p /data /home/$BROWSERUSE_USER/.config && \ 16 | ln -s $DATA_DIR /home/$BROWSERUSE_USER/.config/browseruse && \ 17 | mkdir -p "/home/$BROWSERUSE_USER/.config/chromium/Crash Reports/pending/" && \ 18 | mkdir -p "$DATA_DIR/profiles/default" && \ 19 | chown -R "$BROWSERUSE_USER:$BROWSERUSE_USER" "/home/$BROWSERUSE_USER" "$DATA_DIR" 20 | 21 | WORKDIR /app 22 | COPY . /app 23 | 24 | # Install browser-use 25 | RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \ 26 | uv sync --all-extras --locked --no-dev --compile-bytecode 27 | 28 | USER "$BROWSERUSE_USER" 29 | VOLUME "$DATA_DIR" 30 | EXPOSE 9242 9222 31 | ENTRYPOINT ["browser-use"] 32 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/follow_up_tasks.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent, Controller 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | # Initialize the model 16 | llm = ChatOpenAI( 17 | model='gpt-4.1', 18 | temperature=0.0, 19 | ) 20 | # Get your chrome path 21 | browser_session = BrowserSession( 22 | browser_profile=BrowserProfile( 23 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 24 | keep_alive=True, 25 | user_data_dir='~/.config/browseruse/profiles/default', 26 | ), 27 | ) 28 | 29 | controller = Controller() 30 | 31 | 32 | task = 'Find the founders of browser-use and draft them a short personalized message' 33 | 34 | agent = Agent(task=task, llm=llm, controller=controller, browser_session=browser_session) 35 | 36 | 37 | async def main(): 38 | await agent.run() 39 | 40 | # new_task = input('Type in a new task: ') 41 | new_task = 'Find an image of the founders' 42 | 43 | agent.add_new_task(new_task) 44 | 45 | await agent.run() 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/notification.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import 
ActionResult, Agent, Controller 12 | from browser_use.llm import ChatOpenAI 13 | 14 | controller = Controller() 15 | 16 | 17 | @controller.registry.action('Done with task ') 18 | async def done(text: str): 19 | import yagmail # type: ignore 20 | 21 | # To send emails use 22 | # STEP 1: go to https://support.google.com/accounts/answer/185833 23 | # STEP 2: Create an app password (you can't use here your normal gmail password) 24 | # STEP 3: Use the app password in the code below for the password 25 | yag = yagmail.SMTP('your_email@gmail.com', 'your_app_password') 26 | yag.send( 27 | to='recipient@example.com', 28 | subject='Test Email', 29 | contents=f'result\n: {text}', 30 | ) 31 | 32 | return ActionResult(is_done=True, extracted_content='Email sent!') 33 | 34 | 35 | async def main(): 36 | task = 'go to brower-use.com and then done' 37 | model = ChatOpenAI(model='gpt-4.1') 38 | agent = Agent(task=task, llm=model, controller=controller) 39 | 40 | await agent.run() 41 | 42 | 43 | if __name__ == '__main__': 44 | asyncio.run(main()) 45 | -------------------------------------------------------------------------------- /utils/browser-use/examples/mcp/simple_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple example of using MCP client with browser-use. 3 | 4 | This example shows how to connect to an MCP server and use its tools with an agent. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | 10 | from browser_use import Agent, Controller 11 | from browser_use.llm.openai.chat import ChatOpenAI 12 | from browser_use.mcp.client import MCPClient 13 | 14 | 15 | async def main(): 16 | # Initialize controller 17 | controller = Controller() 18 | 19 | # Connect to a filesystem MCP server 20 | # This server provides tools to read/write files in a directory 21 | mcp_client = MCPClient( 22 | server_name='filesystem', command='npx', args=['@modelcontextprotocol/server-filesystem', os.path.expanduser('~/Desktop')] 23 | ) 24 | 25 | # Connect and register MCP tools 26 | await mcp_client.connect() 27 | await mcp_client.register_to_controller(controller) 28 | 29 | # Create agent with MCP-enabled controller 30 | agent = Agent( 31 | task='List all files on the Desktop and read the content of any .txt files you find', 32 | llm=ChatOpenAI(model='gpt-4o'), 33 | controller=controller, 34 | ) 35 | 36 | # Run the agent - it now has access to filesystem tools 37 | await agent.run() 38 | 39 | # Disconnect when done 40 | await mcp_client.disconnect() 41 | 42 | 43 | if __name__ == '__main__': 44 | asyncio.run(main()) 45 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/azure_openai.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT to your environment variables. 
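(AZURE_OPENAI_ENDPOINT should be the base URL of your Azure OpenAI resource, e.g. https://<your-resource-name>.openai.azure.com/.)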
5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | 18 | from browser_use import Agent 19 | from browser_use.llm import ChatAzureOpenAI 20 | 21 | # Retrieve Azure-specific environment variables 22 | azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY') 23 | azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') 24 | 25 | if not azure_openai_api_key or not azure_openai_endpoint: 26 | raise ValueError('AZURE_OPENAI_KEY or AZURE_OPENAI_ENDPOINT is not set') 27 | 28 | # Initialize the Azure OpenAI client 29 | llm = ChatAzureOpenAI( 30 | model='gpt-4.1', 31 | api_key=azure_openai_api_key, 32 | azure_endpoint=azure_openai_endpoint, # Corrected to use azure_endpoint instead of openai_api_base 33 | ) 34 | 35 | agent = Agent( 36 | task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result', 37 | llm=llm, 38 | ) 39 | 40 | 41 | async def main(): 42 | await agent.run(max_steps=10) 43 | input('Press Enter to continue...') 44 | 45 | 46 | asyncio.run(main()) 47 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/utils.py: -------------------------------------------------------------------------------- 1 | def normalize_url(url: str) -> str: 2 | """ 3 | Normalize a URL by adding https:// protocol if needed, while preserving special URLs. 4 | 5 | This function safely adds https:// to URLs that lack a protocol, but preserves 6 | special URLs like "about:blank", "mailto:...", "tel:...", etc. that should not 7 | be prefixed with https://. 8 | 9 | Args: 10 | url: The URL string to normalize 11 | 12 | Returns: 13 | str: The normalized URL with protocol if needed 14 | 15 | Examples: 16 | >>> normalize_url('example.com') 17 | 'https://example.com' 18 | >>> normalize_url('about:blank') 19 | 'about:blank' 20 | >>> normalize_url('mailto:test@example.com') 21 | 'mailto:test@example.com' 22 | >>> normalize_url('https://example.com') 23 | 'https://example.com' 24 | """ 25 | normalized_url = url.strip() 26 | 27 | # If URL already has a protocol, return as-is 28 | if '://' in normalized_url: 29 | return normalized_url 30 | 31 | # Check for special protocols that should not be prefixed with https:// 32 | special_protocols = ['about:', 'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'] 33 | for protocol in special_protocols: 34 | if normalized_url.startswith(protocol): 35 | return normalized_url 36 | 37 | # For everything else, add https:// 38 | return f'https://{normalized_url}' 39 | -------------------------------------------------------------------------------- /utils/browser-use/docs/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/save_to_file_hugging_face.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from pydantic import BaseModel 12 | 13 | from browser_use.agent.service import Agent 14 | from browser_use.controller.service 
import Controller 15 | from browser_use.llm import ChatOpenAI 16 | 17 | # Initialize controller first 18 | controller = Controller() 19 | 20 | 21 | class Model(BaseModel): 22 | title: str 23 | url: str 24 | likes: int 25 | license: str 26 | 27 | 28 | class Models(BaseModel): 29 | models: list[Model] 30 | 31 | 32 | @controller.action('Save models', param_model=Models) 33 | def save_models(params: Models): 34 | with open('models.txt', 'a') as f: 35 | for model in params.models: 36 | f.write(f'{model.title} ({model.url}): {model.likes} likes, {model.license}\n') 37 | 38 | 39 | # video: https://preview.screen.studio/share/EtOhIk0P 40 | async def main(): 41 | task = 'Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file.' 42 | 43 | model = ChatOpenAI(model='gpt-4.1') 44 | agent = Agent(task=task, llm=model, controller=controller) 45 | 46 | await agent.run() 47 | 48 | 49 | if __name__ == '__main__': 50 | asyncio.run(main()) 51 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/views.py: -------------------------------------------------------------------------------- 1 | from typing import Generic, TypeVar, Union 2 | 3 | from pydantic import BaseModel 4 | 5 | T = TypeVar('T', bound=Union[BaseModel, str]) 6 | 7 | 8 | class ChatInvokeUsage(BaseModel): 9 | """ 10 | Usage information for a chat model invocation. 11 | """ 12 | 13 | prompt_tokens: int 14 | """The number of tokens in the prompt (this includes the cached tokens as well. When calculating the cost, subtract the cached tokens from the prompt tokens)""" 15 | 16 | prompt_cached_tokens: int | None 17 | """The number of cached tokens.""" 18 | 19 | prompt_cache_creation_tokens: int | None 20 | """Anthropic only: The number of tokens used to create the cache.""" 21 | 22 | prompt_image_tokens: int | None 23 | """Google only: The number of tokens in the image (prompt tokens is the text tokens + image tokens in that case)""" 24 | 25 | completion_tokens: int 26 | """The number of tokens in the completion.""" 27 | 28 | total_tokens: int 29 | """The total number of tokens in the response.""" 30 | 31 | 32 | class ChatInvokeCompletion(BaseModel, Generic[T]): 33 | """ 34 | Response from a chat model invocation. 35 | """ 36 | 37 | completion: T 38 | """The completion of the response.""" 39 | 40 | # Thinking stuff 41 | thinking: str | None = None 42 | redacted_thinking: str | None = None 43 | 44 | usage: ChatInvokeUsage | None 45 | """The usage of the response.""" 46 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/drag_drop.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | from browser_use import Agent 13 | from browser_use.llm import ChatGoogle 14 | 15 | api_key = os.getenv('GOOGLE_API_KEY') 16 | if not api_key: 17 | raise ValueError('GOOGLE_API_KEY is not set') 18 | 19 | # API key is automatically set from the environment variable GOOGLE_API_KEY 20 | llm = ChatGoogle(model='gemini-2.0-flash-exp') 21 | 22 | 23 | task_1 = """ 24 | Navigate to: https://sortablejs.github.io/Sortable/. 25 | Then scroll down to the first examplw with title "Simple list example". 
26 | Drag the element with name "item 1" to below the element with name "item 3". 27 | """ 28 | 29 | 30 | task_2 = """ 31 | Navigate to: https://excalidraw.com/. 32 | Click on the pencil icon (with index 40). 33 | Then draw a triangle in the canvas. 34 | Draw the triangle starting from coordinate (400,400). 35 | You can use the drag and drop action to draw the triangle. 36 | """ 37 | 38 | 39 | async def run_search(): 40 | agent = Agent( 41 | task=task_1, 42 | llm=llm, 43 | max_actions_per_step=1, 44 | use_vision=True, 45 | ) 46 | 47 | await agent.run(max_steps=25) 48 | 49 | 50 | if __name__ == '__main__': 51 | asyncio.run(run_search()) 52 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/twitter_post_using_cookies.py: -------------------------------------------------------------------------------- 1 | # Goal: Automates posting on X (Twitter) using stored authentication cookies. 2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | 14 | from browser_use import Agent 15 | from browser_use.browser import BrowserProfile, BrowserSession 16 | from browser_use.llm import ChatGoogle 17 | 18 | api_key = os.getenv('GOOGLE_API_KEY') 19 | if not api_key: 20 | raise ValueError('GOOGLE_API_KEY is not set') 21 | 22 | llm = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 23 | 24 | 25 | browser_session = BrowserSession( 26 | browser_profile=BrowserProfile( 27 | user_data_dir='~/.config/browseruse/profiles/default', 28 | # headless=False, # Uncomment to see the browser 29 | # executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 30 | ) 31 | ) 32 | 33 | 34 | async def main(): 35 | agent = Agent( 36 | browser_session=browser_session, 37 | task=('go to https://x.com. write a new post with the text "browser-use ftw", and submit it'), 38 | llm=llm, 39 | max_actions_per_step=4, 40 | ) 41 | await agent.run(max_steps=25) 42 | input('Press Enter to close the browser...') 43 | 44 | 45 | if __name__ == '__main__': 46 | asyncio.run(main()) 47 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/validate_output.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demonstrate output validator. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
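The custom done() action below deliberately returns an invalid result so that a run with validate_output=True demonstrates the validator rejecting the output.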
5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from pydantic import BaseModel 18 | 19 | from browser_use import ActionResult, Agent, Controller 20 | from browser_use.llm import ChatOpenAI 21 | 22 | controller = Controller() 23 | 24 | 25 | class DoneResult(BaseModel): 26 | title: str 27 | comments: str 28 | hours_since_start: int 29 | 30 | 31 | # we overwrite done() in this example to demonstrate the validator 32 | @controller.registry.action('Done with task', param_model=DoneResult) 33 | async def done(params: DoneResult): 34 | result = ActionResult(is_done=True, extracted_content=params.model_dump_json()) 35 | print(result) 36 | # NOTE: this is clearly wrong - to demonstrate the validator 37 | return 'blablabla' 38 | 39 | 40 | async def main(): 41 | task = 'Go to hackernews hn and give me the top 1 post' 42 | model = ChatOpenAI(model='gpt-4.1') 43 | agent = Agent(task=task, llm=model, controller=controller, validate_output=True) 44 | # NOTE: this should fail to demonstrate the validator 45 | await agent.run(max_steps=5) 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/parallel_agents.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use.agent.service import Agent 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | from browser_use.llm import ChatOpenAI 14 | 15 | browser_session = BrowserSession( 16 | browser_profile=BrowserProfile( 17 | keep_alive=True, 18 | headless=False, 19 | record_video_dir='./tmp/recordings', 20 | user_data_dir='~/.config/browseruse/profiles/default', 21 | ) 22 | ) 23 | llm = ChatOpenAI(model='gpt-4.1') 24 | 25 | 26 | async def main(): 27 | await browser_session.start() 28 | agents = [ 29 | Agent(task=task, llm=llm, browser_session=browser_session) 30 | for task in [ 31 | 'Search Google for weather in Tokyo', 32 | 'Check Reddit front page title', 33 | 'Look up Bitcoin price on Coinbase', 34 | 'Find NASA image of the day', 35 | 'Check top story on CNN', 36 | # 'Search latest SpaceX launch date', 37 | # 'Look up population of Paris', 38 | # 'Find current time in Sydney', 39 | # 'Check who won last Super Bowl', 40 | # 'Search trending topics on Twitter', 41 | ] 42 | ] 43 | 44 | print(await asyncio.gather(*[agent.run() for agent in agents])) 45 | await browser_session.kill() 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/cross_origin_iframes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of how it supports cross-origin iframes. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from browser_use import Agent, Controller 18 | from browser_use.browser import BrowserProfile, BrowserSession 19 | from browser_use.llm import ChatOpenAI 20 | 21 | if not os.getenv('OPENAI_API_KEY'): 22 | raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') 23 | 24 | 25 | browser_profile = BrowserProfile( 26 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 27 | ) 28 | browser_session = BrowserSession(browser_profile=browser_profile) 29 | controller = Controller() 30 | 31 | 32 | async def main(): 33 | agent = Agent( 34 | task='Click "Go cross-site (simple page)" button on https://csreis.github.io/tests/cross-site-iframe.html then tell me the text within', 35 | llm=ChatOpenAI(model='gpt-4.1', temperature=0.0), 36 | controller=controller, 37 | browser_session=browser_session, 38 | ) 39 | 40 | await agent.run() 41 | await browser_session.close() 42 | 43 | input('Press Enter to close...') 44 | 45 | 46 | if __name__ == '__main__': 47 | try: 48 | asyncio.run(main()) 49 | except Exception as e: 50 | print(e) 51 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_dropdown_complex.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test complex dropdown interaction functionality. 3 | """ 4 | 5 | import pytest 6 | 7 | from browser_use.agent.service import Agent 8 | from browser_use.agent.views import AgentHistoryList 9 | 10 | 11 | async def test_dropdown_complex(llm, browser_session): 12 | """Test selecting an option from a complex dropdown menu.""" 13 | agent = Agent( 14 | task=( 15 | 'go to https://codepen.io/shyam-king/pen/pvzpByJ and first get all options for the dropdown and then select the json option' 16 | ), 17 | llm=llm, 18 | browser_session=browser_session, 19 | ) 20 | 21 | try: 22 | history: AgentHistoryList = await agent.run(20) 23 | result = history.final_result() 24 | 25 | # Verify dropdown interaction 26 | assert result is not None 27 | assert 'json' in result.lower(), "Expected 'json' option to be selected" 28 | 29 | # Verify dropdown state 30 | page = await browser_session.get_current_page() 31 | element = await page.query_selector('.select-selected') 32 | assert element is not None, 'Custom dropdown element should exist' 33 | 34 | text = await element.text_content() 35 | assert 'json' in text.lower(), 'Dropdown should display json option' 36 | 37 | # Verify the selected option's effect 38 | code_element = await page.query_selector('pre code') 39 | assert code_element is not None, 'Code element should be visible when JSON is selected' 40 | 41 | except Exception as e: 42 | pytest.fail(f'Complex dropdown test failed: {str(e)}') 43 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/result_processing.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | from pprint import pprint 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from browser_use import Agent 13 | from browser_use.agent.views import 
AgentHistoryList 14 | from browser_use.browser import BrowserProfile, BrowserSession 15 | from browser_use.llm import ChatOpenAI 16 | 17 | llm = ChatOpenAI(model='gpt-4.1') 18 | 19 | 20 | async def main(): 21 | async with BrowserSession( 22 | browser_profile=BrowserProfile( 23 | headless=False, 24 | traces_dir='./tmp/result_processing', 25 | window_size={'width': 1280, 'height': 1000}, 26 | user_data_dir='~/.config/browseruse/profiles/default', 27 | ) 28 | ) as browser_session: 29 | agent = Agent( 30 | task="go to google.com and type 'OpenAI' click search and give me the first url", 31 | llm=llm, 32 | browser_session=browser_session, 33 | ) 34 | history: AgentHistoryList = await agent.run(max_steps=3) 35 | 36 | print('Final Result:') 37 | pprint(history.final_result(), indent=4) 38 | 39 | print('\nErrors:') 40 | pprint(history.errors(), indent=4) 41 | 42 | # e.g. xPaths the model clicked on 43 | print('\nModel Outputs:') 44 | pprint(history.model_actions(), indent=4) 45 | 46 | print('\nThoughts:') 47 | pprint(history.model_thoughts(), indent=4) 48 | 49 | 50 | if __name__ == '__main__': 51 | asyncio.run(main()) 52 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/playground/process_dom.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import time 5 | 6 | import anyio 7 | 8 | from browser_use.browser import BrowserProfile, BrowserSession 9 | 10 | 11 | async def test_process_dom(): 12 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True)) 13 | await browser_session.start() 14 | try: 15 | page = await browser_session.get_current_page() 16 | await page.goto('https://kayak.com/flights') 17 | # await page.goto('https://google.com/flights') 18 | # await page.goto('https://immobilienscout24.de') 19 | # await page.goto('https://seleniumbase.io/w3schools/iframes') 20 | 21 | await asyncio.sleep(3) 22 | 23 | async with await anyio.open_file('browser_use/dom/buildDomTree.js', 'r') as f: 24 | js_code = await f.read() 25 | 26 | start = time.time() 27 | dom_tree = await page.evaluate(js_code) 28 | end = time.time() 29 | 30 | # print(dom_tree) 31 | print(f'Time: {end - start:.2f}s') 32 | 33 | os.makedirs('./tmp', exist_ok=True) 34 | async with await anyio.open_file('./tmp/dom.json', 'w') as f: 35 | await f.write(json.dumps(dom_tree, indent=1)) 36 | 37 | # both of these work for immobilienscout24.de 38 | # await page.click('.sc-dcJsrY.ezjNCe') 39 | # await page.click( 40 | # 'div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div > div > button:nth-of-type(2)' 41 | # ) 42 | 43 | input('Press Enter to continue...') 44 | finally: 45 | await browser_session.stop() 46 | -------------------------------------------------------------------------------- /utils/browser-use/docs/customize/output-format.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Output Format" 3 | description: "The default is text. But you can define a structured output format to make post-processing easier." 4 | icon: "code" 5 | --- 6 | 7 | ## Custom output format 8 | With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py) you can define what output format the agent should return to you. 
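The snippet omits its imports for brevity; based on the full example file linked above, they would look roughly like this (the `List` import is only needed because the snippet writes `List[Post]`, whereas the example file uses the built-in `list[Post]`):

```python
# Imports the docs snippet below assumes but does not show
# (matching examples/features/custom_output.py in this repository).
import asyncio
from typing import List  # the snippet writes List[Post]; the example file uses list[Post]

from browser_use import Agent, Controller
from browser_use.llm import ChatOpenAI
```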
9 | 10 | ```python 11 | from pydantic import BaseModel 12 | # Define the output format as a Pydantic model 13 | class Post(BaseModel): 14 | post_title: str 15 | post_url: str 16 | num_comments: int 17 | hours_since_post: int 18 | 19 | 20 | class Posts(BaseModel): 21 | posts: List[Post] 22 | 23 | 24 | controller = Controller(output_model=Posts) 25 | 26 | 27 | async def main(): 28 | task = 'Go to hackernews show hn and give me the first 5 posts' 29 | model = ChatOpenAI(model='gpt-4o') 30 | agent = Agent(task=task, llm=model, controller=controller) 31 | 32 | history = await agent.run() 33 | 34 | result = history.final_result() 35 | if result: 36 | parsed: Posts = Posts.model_validate_json(result) 37 | 38 | for post in parsed.posts: 39 | print('\n--------------------------------') 40 | print(f'Title: {post.post_title}') 41 | print(f'URL: {post.post_url}') 42 | print(f'Comments: {post.num_comments}') 43 | print(f'Hours since post: {post.hours_since_post}') 44 | else: 45 | print('No result') 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | ``` 51 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/custom_output.py: -------------------------------------------------------------------------------- 1 | """ 2 | Show how to use custom outputs. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from pydantic import BaseModel 18 | 19 | from browser_use import Agent, Controller 20 | from browser_use.llm import ChatOpenAI 21 | 22 | 23 | class Post(BaseModel): 24 | post_title: str 25 | post_url: str 26 | num_comments: int 27 | hours_since_post: int 28 | 29 | 30 | class Posts(BaseModel): 31 | posts: list[Post] 32 | 33 | 34 | controller = Controller(output_model=Posts) 35 | 36 | 37 | async def main(): 38 | task = 'Go to hackernews show hn and give me the first 5 posts' 39 | model = ChatOpenAI(model='gpt-4.1') 40 | agent = Agent(task=task, llm=model, controller=controller) 41 | 42 | history = await agent.run() 43 | 44 | result = history.final_result() 45 | if result: 46 | parsed: Posts = Posts.model_validate_json(result) 47 | 48 | for post in parsed.posts: 49 | print('\n--------------------------------') 50 | print(f'Title: {post.post_title}') 51 | print(f'URL: {post.post_url}') 52 | print(f'Comments: {post.num_comments}') 53 | print(f'Hours since post: {post.hours_since_post}') 54 | else: 55 | print('No result') 56 | 57 | 58 | if __name__ == '__main__': 59 | asyncio.run(main()) 60 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/online_coding_agent.py: -------------------------------------------------------------------------------- 1 | # Goal: Implements a multi-agent system for online code editors, with separate agents for coding and execution. 2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from browser_use import Agent 14 | from browser_use.browser import BrowserSession 15 | from browser_use.llm import ChatOpenAI 16 | 17 | if not os.getenv('OPENAI_API_KEY'): 18 | raise ValueError('OPENAI_API_KEY is not set. 
Please add it to your environment variables.') 19 | 20 | 21 | async def main(): 22 | browser_session = BrowserSession() 23 | model = ChatOpenAI(model='gpt-4.1') 24 | 25 | # Initialize browser agent 26 | agent1 = Agent( 27 | task='Open an online code editor programiz.', 28 | llm=model, 29 | browser_session=browser_session, 30 | ) 31 | executor = Agent( 32 | task='Executor. Execute the code written by the coder and suggest some updates if there are errors.', 33 | llm=model, 34 | browser_session=browser_session, 35 | ) 36 | 37 | coder = Agent( 38 | task='Coder. Your job is to write and complete code. You are an expert coder. Code a simple calculator. Write the code on the coding interface after agent1 has opened the link.', 39 | llm=model, 40 | browser_session=browser_session, 41 | ) 42 | await agent1.run() 43 | await executor.run() 44 | await coder.run() 45 | 46 | 47 | if __name__ == '__main__': 48 | asyncio.run(main()) 49 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/claude.yml: -------------------------------------------------------------------------------- 1 | name: Claude Code 2 | 3 | on: 4 | issue_comment: 5 | types: [created] 6 | pull_request_review_comment: 7 | types: [created] 8 | issues: 9 | types: [opened, assigned] 10 | pull_request_review: 11 | types: [submitted] 12 | 13 | jobs: 14 | claude: 15 | if: | 16 | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || 17 | (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || 18 | (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || 19 | (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) 20 | runs-on: ubuntu-latest 21 | permissions: 22 | contents: read 23 | pull-requests: read 24 | issues: read 25 | id-token: write 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v4 29 | with: 30 | fetch-depth: 1 31 | 32 | - name: Run Claude Code 33 | id: claude 34 | uses: anthropics/claude-code-action@beta 35 | with: 36 | anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} 37 | allowed_tools: | 38 | Bash(pytest) 39 | Bash(sed) 40 | Bash(grep) 41 | Bash(python) 42 | Bash(uv) 43 | Bash(./bin/lint.sh) 44 | Bash(./bin/test.sh) 45 | Edit 46 | Replace 47 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/agent/message_manager/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import logging 5 | from pathlib import Path 6 | from typing import Any 7 | 8 | import anyio 9 | 10 | from browser_use.llm.messages import BaseMessage 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | async def save_conversation( 16 | input_messages: list[BaseMessage], 17 | response: Any, 18 | target: str | Path, 19 | encoding: str | None = None, 20 | ) -> None: 21 | """Save conversation history to file asynchronously.""" 22 | target_path = Path(target) 23 | # create folders if not exists 24 | if target_path.parent: 25 | await anyio.Path(target_path.parent).mkdir(parents=True, exist_ok=True) 26 | 27 | await anyio.Path(target_path).write_text( 28 | await _format_conversation(input_messages, response), 29 | encoding=encoding or 'utf-8', 30 | ) 31 | 32 | 33 | async def _format_conversation(messages: 
list[BaseMessage], response: Any) -> str: 34 | """Format the conversation including messages and response.""" 35 | lines = [] 36 | 37 | # Format messages 38 | for message in messages: 39 | lines.append(f' {message.role} ') 40 | 41 | lines.append(message.text) 42 | lines.append('') # Empty line after each message 43 | 44 | # Format response 45 | lines.append(' RESPONSE') 46 | lines.append(json.dumps(json.loads(response.model_dump_json(exclude_unset=True)), indent=2)) 47 | 48 | return '\n'.join(lines) 49 | 50 | 51 | # Note: _write_messages_to_file and _write_response_to_file have been merged into _format_conversation 52 | # This is more efficient for async operations and reduces file I/O 53 | -------------------------------------------------------------------------------- /utils/browser-use/examples/integrations/slack/slack_example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 5 | 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | 10 | 11 | from browser_use.browser import BrowserProfile 12 | from browser_use.llm import ChatGoogle 13 | from examples.integrations.slack.slack_api import SlackBot, app 14 | 15 | # load credentials from environment variables 16 | bot_token = os.getenv('SLACK_BOT_TOKEN') 17 | if not bot_token: 18 | raise ValueError('Slack bot token not found in .env file.') 19 | 20 | signing_secret = os.getenv('SLACK_SIGNING_SECRET') 21 | if not signing_secret: 22 | raise ValueError('Slack signing secret not found in .env file.') 23 | 24 | api_key = os.getenv('GOOGLE_API_KEY') 25 | if not api_key: 26 | raise ValueError('GOOGLE_API_KEY is not set') 27 | 28 | llm = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 29 | 30 | slack_bot = SlackBot( 31 | llm=llm, # required; instance of BaseChatModel 32 | bot_token=bot_token, # required; Slack bot token 33 | signing_secret=signing_secret, # required; Slack signing secret 34 | ack=True, # optional; whether to acknowledge task receipt with a message, defaults to False 35 | browser_profile=BrowserProfile( 36 | headless=True 37 | ), # optional; useful for changing headless mode or other browser configs, defaults to headless mode 38 | ) 39 | 40 | app.dependency_overrides[SlackBot] = lambda: slack_bot 41 | 42 | if __name__ == '__main__': 43 | import uvicorn 44 | 45 | uvicorn.run('integrations.slack.slack_api:app', host='0.0.0.0', port=3000) 46 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/httpx_client_test.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | 3 | from browser_use.browser import BrowserProfile, BrowserSession 4 | 5 | 6 | async def test_browser_close_doesnt_affect_external_httpx_clients(): 7 | """ 8 | Test that Browser.close() doesn't close HTTPX clients created outside the Browser instance. 9 | This test demonstrates the issue where Browser.close() is closing all HTTPX clients. 
10 | """ 11 | # Create an external HTTPX client that should remain open 12 | external_client = httpx.AsyncClient() 13 | 14 | # Create a BrowserSession instance 15 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True)) 16 | await browser_session.start() 17 | 18 | # Close the browser (which should trigger cleanup_httpx_clients) 19 | await browser_session.stop() 20 | 21 | # Check if the external client is still usable 22 | try: 23 | # If the client is closed, this will raise RuntimeError 24 | # Using a simple HEAD request to a reliable URL 25 | await external_client.head('https://www.example.com', timeout=2.0) 26 | client_is_closed = False 27 | except RuntimeError as e: 28 | # If we get "Cannot send a request, as the client has been closed" 29 | client_is_closed = 'client has been closed' in str(e) 30 | except Exception: 31 | # Any other exception means the client is not closed but request failed 32 | client_is_closed = False 33 | finally: 34 | # Always clean up our test client properly 35 | await external_client.aclose() 36 | 37 | # Our external client should not be closed by browser.close() 38 | assert not client_is_closed, 'External HTTPX client was incorrectly closed by Browser.close()' 39 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/build-base-image.yml.disabled: -------------------------------------------------------------------------------- 1 | name: Build Base Image 2 | 3 | on: 4 | schedule: 5 | - cron: '0 2 * * 1' # Weekly on Monday 6 | workflow_dispatch: 7 | push: 8 | paths: 9 | - 'Dockerfile.base' 10 | 11 | jobs: 12 | build-base: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | platform: [linux/amd64, linux/arm64] 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up QEMU 21 | uses: docker/setup-qemu-action@v3 22 | 23 | - name: Set up Docker Buildx 24 | uses: docker/setup-buildx-action@v3 25 | 26 | - name: Login to Docker Hub 27 | uses: docker/login-action@v3 28 | with: 29 | username: ${{ secrets.DOCKER_USERNAME }} 30 | password: ${{ secrets.DOCKER_PASSWORD }} 31 | 32 | - name: Build and push base image 33 | uses: docker/build-push-action@v5 34 | with: 35 | context: . 36 | file: ./Dockerfile.base 37 | platforms: ${{ matrix.platform }} 38 | push: true 39 | tags: | 40 | browseruse/browseruse-base:chromium-138-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }} 41 | browseruse/browseruse-base:latest-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }} 42 | cache-from: type=registry,ref=browseruse/browseruse-base:buildcache-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }} 43 | cache-to: type=registry,ref=browseruse/browseruse-base:buildcache-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }},mode=max 44 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/check_appointment.py: -------------------------------------------------------------------------------- 1 | # Goal: Checks for available visa appointment slots on the Greece MFA website. 
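# Flow of this example: the custom 'Go to the webpage' action defined below returns the appointment URL to the agent, and the task then asks it to check this month and, if needed, the next month for available dates. Requires OPENAI_API_KEY (validated below).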
2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from pydantic import BaseModel 14 | 15 | from browser_use.agent.service import Agent 16 | from browser_use.controller.service import Controller 17 | from browser_use.llm import ChatOpenAI 18 | 19 | if not os.getenv('OPENAI_API_KEY'): 20 | raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') 21 | 22 | controller = Controller() 23 | 24 | 25 | class WebpageInfo(BaseModel): 26 | """Model for webpage link.""" 27 | 28 | link: str = 'https://appointment.mfa.gr/en/reservations/aero/ireland-grcon-dub/' 29 | 30 | 31 | @controller.action('Go to the webpage', param_model=WebpageInfo) 32 | def go_to_webpage(webpage_info: WebpageInfo): 33 | """Returns the webpage link.""" 34 | return webpage_info.link 35 | 36 | 37 | async def main(): 38 | """Main function to execute the agent task.""" 39 | task = ( 40 | 'Go to the Greece MFA webpage via the link I provided you.' 41 | 'Check the visa appointment dates. If there is no available date in this month, check the next month.' 42 | 'If there is no available date in both months, tell me there is no available date.' 43 | ) 44 | 45 | model = ChatOpenAI(model='gpt-4.1-mini') 46 | agent = Agent(task, model, controller=controller, use_vision=True) 47 | 48 | await agent.run() 49 | 50 | 51 | if __name__ == '__main__': 52 | asyncio.run(main()) 53 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/clipboard.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | import pyperclip 12 | 13 | from browser_use import Agent, Controller 14 | from browser_use.agent.views import ActionResult 15 | from browser_use.browser import BrowserProfile, BrowserSession 16 | from browser_use.browser.types import Page 17 | from browser_use.llm import ChatOpenAI 18 | 19 | browser_profile = BrowserProfile( 20 | headless=False, 21 | ) 22 | controller = Controller() 23 | 24 | 25 | @controller.registry.action('Copy text to clipboard') 26 | def copy_to_clipboard(text: str): 27 | pyperclip.copy(text) 28 | return ActionResult(extracted_content=text) 29 | 30 | 31 | @controller.registry.action('Paste text from clipboard') 32 | async def paste_from_clipboard(page: Page): 33 | text = pyperclip.paste() 34 | # send text to browser 35 | await page.keyboard.type(text) 36 | 37 | return ActionResult(extracted_content=text) 38 | 39 | 40 | async def main(): 41 | task = 'Copy the text "Hello, world!" 
to the clipboard, then go to google.com and paste the text' 42 | model = ChatOpenAI(model='gpt-4.1') 43 | browser_session = BrowserSession(browser_profile=browser_profile) 44 | await browser_session.start() 45 | agent = Agent( 46 | task=task, 47 | llm=model, 48 | controller=controller, 49 | browser_session=browser_session, 50 | ) 51 | 52 | await agent.run() 53 | await browser_session.stop() 54 | 55 | input('Press Enter to close...') 56 | 57 | 58 | if __name__ == '__main__': 59 | asyncio.run(main()) 60 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/package.yaml: -------------------------------------------------------------------------------- 1 | name: package 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - stable 7 | - 'releases/**' 8 | tags: 9 | - '*' 10 | pull_request: 11 | workflow_dispatch: 12 | 13 | jobs: 14 | build: 15 | name: pip-build 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: astral-sh/setup-uv@v5 20 | - run: uv build --python 3.12 21 | - uses: actions/upload-artifact@v4 22 | with: 23 | name: dist-artifact 24 | path: | 25 | dist/*.whl 26 | dist/*.tar.gz 27 | 28 | build_test: 29 | name: pip-install-on-${{ matrix.os }}-py-${{ matrix.python-version }} 30 | needs: build 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | matrix: 34 | os: [ubuntu-latest, macos-latest, windows-latest] 35 | python-version: ["3.11", "3.13"] 36 | env: 37 | ANONYMIZED_TELEMETRY: 'false' 38 | 39 | steps: 40 | - uses: actions/checkout@v4 41 | - uses: astral-sh/setup-uv@v5 42 | - uses: actions/download-artifact@v4 43 | with: 44 | name: dist-artifact 45 | 46 | - name: Set up venv and test for OS/Python versions 47 | shell: bash 48 | run: | 49 | uv venv /tmp/testenv --python ${{ matrix.python-version }} 50 | if [[ "$RUNNER_OS" == "Windows" ]]; then 51 | . /tmp/testenv/Scripts/activate 52 | else 53 | source /tmp/testenv/bin/activate 54 | fi 55 | uv pip install *.whl 56 | python -c 'from browser_use import Agent, Browser, Controller, ActionModel, ActionResult' 57 | -------------------------------------------------------------------------------- /utils/browser-use/bin/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to setup a local development environment for the browser-use project. 3 | # Usage: 4 | # $ ./bin/setup.sh 5 | 6 | ### Bash Environment Setup 7 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 8 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 9 | # set -o xtrace 10 | # set -x 11 | # shopt -s nullglob 12 | set -o errexit 13 | set -o errtrace 14 | set -o nounset 15 | set -o pipefail 16 | IFS=$'\n' 17 | 18 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 19 | cd "$SCRIPT_DIR" 20 | 21 | 22 | if [ -f "$SCRIPT_DIR/lint.sh" ]; then 23 | echo "[√] already inside a cloned browser-use repo" 24 | else 25 | echo "[+] Cloning browser-use repo into current directory: $SCRIPT_DIR" 26 | git clone https://github.com/browser-use/browser-use 27 | cd browser-use 28 | fi 29 | 30 | echo "[+] Installing uv..." 
31 | curl -LsSf https://astral.sh/uv/install.sh | sh 32 | 33 | #git checkout main git pull 34 | echo 35 | echo "[+] Setting up venv" 36 | uv venv 37 | echo 38 | echo "[+] Installing packages in venv" 39 | uv sync --dev --all-extras 40 | echo 41 | echo "[i] Tip: make sure to set BROWSER_USE_LOGGING_LEVEL=debug and your LLM API keys in your .env file" 42 | echo 43 | uv pip show browser-use 44 | 45 | echo "Usage:" 46 | echo " $ browser-use use the CLI" 47 | echo " or" 48 | echo " $ source .venv/bin/activate" 49 | echo " $ ipython use the library" 50 | echo " >>> from browser_use import BrowserSession, Agent" 51 | echo " >>> await Agent(task='book me a flight to fiji', browser=BrowserSession(headless=False)).run()" 52 | echo "" 53 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | We have switched all of our code from langchain to openai.types.chat.chat_completion_message_param. 3 | 4 | For easier transition we have 5 | """ 6 | 7 | from typing import Any, Protocol, TypeVar, overload 8 | 9 | from pydantic import BaseModel 10 | 11 | from browser_use.llm.messages import BaseMessage 12 | from browser_use.llm.views import ChatInvokeCompletion 13 | 14 | T = TypeVar('T', bound=BaseModel) 15 | 16 | 17 | class BaseChatModel(Protocol): 18 | _verified_api_keys: bool = False 19 | 20 | model: str 21 | 22 | @property 23 | def provider(self) -> str: ... 24 | 25 | @property 26 | def name(self) -> str: ... 27 | 28 | @property 29 | def model_name(self) -> str: 30 | # for legacy support 31 | return self.model 32 | 33 | @overload 34 | async def ainvoke(self, messages: list[BaseMessage], output_format: None = None) -> ChatInvokeCompletion[str]: ... 35 | 36 | @overload 37 | async def ainvoke(self, messages: list[BaseMessage], output_format: type[T]) -> ChatInvokeCompletion[T]: ... 38 | 39 | async def ainvoke( 40 | self, messages: list[BaseMessage], output_format: type[T] | None = None 41 | ) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]: ... 42 | 43 | @classmethod 44 | def __get_pydantic_core_schema__( 45 | cls, 46 | source_type: type, 47 | handler: Any, 48 | ) -> Any: 49 | """ 50 | Allow this Protocol to be used in Pydantic models -> very useful to typesafe the agent settings for example. 51 | Returns a schema that allows any object (since this is a Protocol). 
52 | """ 53 | from pydantic_core import core_schema 54 | 55 | # Return a schema that accepts any object for Protocol types 56 | return core_schema.any_schema() 57 | -------------------------------------------------------------------------------- /utils/browser-use/examples/file_system/file_system.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import pathlib 4 | import shutil 5 | 6 | from dotenv import load_dotenv 7 | 8 | from browser_use import Agent 9 | from browser_use.llm import ChatOpenAI 10 | 11 | load_dotenv() 12 | 13 | '' 14 | SCRIPT_DIR = pathlib.Path(os.path.dirname(os.path.abspath(__file__))) 15 | agent_dir = SCRIPT_DIR / 'test_no_thinking' 16 | agent_dir.mkdir(exist_ok=True) 17 | conversation_dir = agent_dir / 'conversations' / 'conversation' 18 | print(f'Agent logs directory: {agent_dir}') 19 | 20 | try: 21 | from lmnr import Laminar 22 | 23 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 24 | except Exception as e: 25 | print(f'Error initializing Laminar: {e}') 26 | 27 | task = """ 28 | Go to https://mertunsall.github.io/posts/post1.html 29 | Save the title of the article in "data.md" 30 | Then, use append_file to add the first sentence of the article to "data.md" 31 | Then, read the file to see its content and make sure it's correct. 32 | Finally, share the file with me. 33 | 34 | NOTE: DO NOT USE extract_structured_data action - everything is visible in browser state. 35 | """.strip('\n') 36 | 37 | llm = ChatOpenAI( 38 | model='gpt-4.1-mini', 39 | ) 40 | 41 | 42 | agent = Agent( 43 | task=task, 44 | llm=llm, 45 | save_conversation_path=str(conversation_dir), 46 | file_system_path=str(agent_dir / 'fs'), 47 | ) 48 | 49 | 50 | async def main(): 51 | agent_history = await agent.run() 52 | print(f'Final result: {agent_history.final_result()}', flush=True) 53 | 54 | input('Press Enter to clean the file system...') 55 | # clean the file system 56 | shutil.rmtree(str(agent_dir / 'fs')) 57 | 58 | 59 | if __name__ == '__main__': 60 | asyncio.run(main()) 61 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | We have switched all of our code from langchain to openai.types.chat.chat_completion_message_param. 
3 | 4 | For easier transition we have 5 | """ 6 | 7 | from browser_use.llm.anthropic.chat import ChatAnthropic 8 | from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock 9 | from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock 10 | from browser_use.llm.azure.chat import ChatAzureOpenAI 11 | from browser_use.llm.base import BaseChatModel 12 | from browser_use.llm.google.chat import ChatGoogle 13 | from browser_use.llm.groq.chat import ChatGroq 14 | from browser_use.llm.messages import ( 15 | AssistantMessage, 16 | BaseMessage, 17 | SystemMessage, 18 | UserMessage, 19 | ) 20 | from browser_use.llm.messages import ( 21 | ContentPartImageParam as ContentImage, 22 | ) 23 | from browser_use.llm.messages import ( 24 | ContentPartRefusalParam as ContentRefusal, 25 | ) 26 | from browser_use.llm.messages import ( 27 | ContentPartTextParam as ContentText, 28 | ) 29 | from browser_use.llm.ollama.chat import ChatOllama 30 | from browser_use.llm.openai.chat import ChatOpenAI 31 | from browser_use.llm.openrouter.chat import ChatOpenRouter 32 | 33 | # Make better names for the message 34 | 35 | __all__ = [ 36 | # Message types -> for easier transition from langchain 37 | 'BaseMessage', 38 | 'UserMessage', 39 | 'SystemMessage', 40 | 'AssistantMessage', 41 | # Content parts with better names 42 | 'ContentText', 43 | 'ContentRefusal', 44 | 'ContentImage', 45 | # Chat models 46 | 'BaseChatModel', 47 | 'ChatOpenAI', 48 | 'ChatGoogle', 49 | 'ChatAnthropic', 50 | 'ChatAnthropicBedrock', 51 | 'ChatAWSBedrock', 52 | 'ChatGroq', 53 | 'ChatAzureOpenAI', 54 | 'ChatOllama', 55 | 'ChatOpenRouter', 56 | ] 57 | -------------------------------------------------------------------------------- /utils/browser-use/examples/models/langchain/example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of using LangChain models with browser-use. 3 | 4 | This example demonstrates how to: 5 | 1. Wrap a LangChain model with ChatLangchain 6 | 2. Use it with a browser-use Agent 7 | 3. Run a simple web automation task 8 | 9 | @file purpose: Example usage of LangChain integration with browser-use 10 | """ 11 | 12 | import asyncio 13 | 14 | from langchain_openai import ChatOpenAI # pyright: ignore 15 | from lmnr import Laminar 16 | 17 | from browser_use import Agent 18 | from examples.models.langchain.chat import ChatLangchain 19 | 20 | Laminar.initialize() 21 | 22 | 23 | async def main(): 24 | """Basic example using ChatLangchain with OpenAI through LangChain.""" 25 | 26 | # Create a LangChain model (OpenAI) 27 | langchain_model = ChatOpenAI( 28 | model='gpt-4.1-mini', 29 | temperature=0.1, 30 | ) 31 | 32 | # Wrap it with ChatLangchain to make it compatible with browser-use 33 | llm = ChatLangchain(chat=langchain_model) 34 | 35 | # Create a simple task 36 | task = "Go to google.com and search for 'browser automation with Python'" 37 | 38 | # Create and run the agent 39 | agent = Agent( 40 | task=task, 41 | llm=llm, 42 | ) 43 | 44 | print(f'🚀 Starting task: {task}') 45 | print(f'🤖 Using model: {llm.name} (provider: {llm.provider})') 46 | 47 | # Run the agent 48 | history = await agent.run() 49 | 50 | print(f'✅ Task completed! 
Steps taken: {len(history.history)}') 51 | 52 | # Print the final result if available 53 | if history.final_result(): 54 | print(f'📋 Final result: {history.final_result()}') 55 | 56 | return history 57 | 58 | 59 | if __name__ == '__main__': 60 | print('🌐 Browser-use LangChain Integration Example') 61 | print('=' * 45) 62 | 63 | asyncio.run(main()) 64 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/tests/test_groq_loop.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from browser_use.llm import ContentText 4 | from browser_use.llm.groq.chat import ChatGroq 5 | from browser_use.llm.messages import SystemMessage, UserMessage 6 | 7 | llm = ChatGroq( 8 | model='meta-llama/llama-4-maverick-17b-128e-instruct', 9 | temperature=0.5, 10 | ) 11 | # llm = ChatOpenAI(model='gpt-4.1-mini') 12 | 13 | 14 | async def main(): 15 | from pydantic import BaseModel 16 | 17 | from browser_use.tokens.service import TokenCost 18 | 19 | tk = TokenCost().register_llm(llm) 20 | 21 | class Output(BaseModel): 22 | reasoning: str 23 | answer: str 24 | 25 | message = [ 26 | SystemMessage(content='You are a helpful assistant that can answer questions and help with tasks.'), 27 | UserMessage( 28 | content=[ 29 | ContentText( 30 | text=r"Why is the sky blue? write exactly this into reasoning make sure to output ' with exactly like in the input : " 31 | ), 32 | ContentText( 33 | text=""" 34 | The user's request is to find the lowest priced women's plus size one piece swimsuit in color black with a customer rating of at least 5 on Kohls.com. I am currently on the homepage of Kohls. The page has a search bar and various category links. To begin, I need to navigate to the women's section and search for swimsuits. I will start by clicking on the 'Women' category link.""" 35 | ), 36 | ] 37 | ), 38 | ] 39 | 40 | for i in range(10): 41 | print('-' * 50) 42 | print(f'start loop {i}') 43 | response = await llm.ainvoke(message, output_format=Output) 44 | completion = response.completion 45 | print(f'start reasoning: {completion.reasoning}') 46 | print(f'answer: {completion.answer}') 47 | print('-' * 50) 48 | 49 | 50 | if __name__ == '__main__': 51 | asyncio.run(main()) 52 | -------------------------------------------------------------------------------- /utils/browser-use/docs/quickstart.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quickstart" 3 | description: "Start using Browser Use with this quickstart guide" 4 | icon: "rocket" 5 | --- 6 | 7 | {/* You can install Browser Use from PyPI or clone it from Github. */} 8 | 9 | ## Prepare the environment 10 | 11 | Browser Use requires Python 3.11 or higher. 12 | 13 | First, we recommend using [uv](https://docs.astral.sh/uv/) to setup the Python environment. 
14 | 15 | ```bash 16 | uv venv --python 3.11 17 | ``` 18 | 19 | and activate it with: 20 | 21 | ```bash 22 | # For Mac/Linux: 23 | source .venv/bin/activate 24 | 25 | # For Windows: 26 | .venv\Scripts\activate 27 | ``` 28 | 29 | Install the dependencies: 30 | 31 | ```bash 32 | uv pip install browser-use 33 | ``` 34 | 35 | Then install playwright: 36 | 37 | ```bash 38 | uv run playwright install 39 | ``` 40 | 41 | ## Create an agent 42 | 43 | Then you can use the agent as follows: 44 | 45 | ```python agent.py 46 | from browser_use.llm import ChatOpenAI 47 | from browser_use import Agent 48 | from dotenv import load_dotenv 49 | load_dotenv() 50 | 51 | import asyncio 52 | 53 | llm = ChatOpenAI(model="gpt-4.1") 54 | 55 | async def main(): 56 | agent = Agent( 57 | task="Compare the price of gpt-4o and DeepSeek-V3", 58 | llm=llm, 59 | ) 60 | result = await agent.run() 61 | print(result) 62 | 63 | asyncio.run(main()) 64 | ``` 65 | 66 | ## Set up your LLM API keys 67 | 68 | `ChatOpenAI` and other chat models require API keys. You should store these in your `.env` file. For example, for OpenAI and Anthropic: 69 | 70 | ```bash .env 71 | OPENAI_API_KEY= 72 | ANTHROPIC_API_KEY= 73 | ``` 74 | 75 | For other LLM models you can refer to the [Supported Models](/customize/supported-models) page to find how to set them up with their specific API keys. 76 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/evaluations.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Evaluations" 3 | description: "Test the Browser Use agent on standardized benchmarks" 4 | icon: "chart-bar" 5 | --- 6 | 7 | ## Prerequisites 8 | 9 | Browser Use uses proprietary/private test sets that must never be committed to GitHub and must be fetched through an authorized API request. 10 | Accessing these test sets requires an approved Browser Use account. 11 | There are currently no publicly available test sets, but some may be released in the future. 12 | 13 | ## Get an API Access Key 14 | 15 | First, navigate to https://browser-use.tools and log in with an authorized Browser Use account. 16 | 17 | Then, click the "Account" button at the top right of the page, and click the "Cycle New Key" button on that page. 18 | 19 | Copy the resulting URL and secret key into your `.env` file. It should look like this: 20 | 21 | ```bash .env 22 | EVALUATION_TOOL_URL= ... 23 | EVALUATION_TOOL_SECRET_KEY= ... 24 | ``` 25 | 26 | ## Running Evaluations 27 | 28 | First, ensure your file `eval/service.py` is up to date. 29 | 30 | Then run the file: 31 | 32 | ```bash 33 | python eval/service.py 34 | ``` 35 | 36 | ## Configuring Evaluations 37 | 38 | You can modify the evaluation by providing flags to the evaluation script. For instance: 39 | 40 | ```bash 41 | python eval/service.py --parallel_runs 5 --parallel_evaluations 5 --max-steps 25 --start 0 --end 100 --model gpt-4o 42 | ``` 43 | 44 | The evaluations webpage has a convenient GUI for generating these commands. To use it, navigate to https://browser-use.tools/dashboard. 45 | 46 | Then click the button "New Eval Run" on the left panel. This will open an interface with selectors, inputs, sliders, and switches. 47 | 48 | Input your desired configuration into the interface and copy the resulting Python command at the bottom. Then run this command as before. 
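The same flags can also be scripted for repeated runs. A minimal sketch, assuming the flags shown above and that your `.env` already contains the evaluation keys (model names are placeholders):

```bash
# Run the same evaluation slice against several models in sequence
for model in gpt-4o gpt-4.1-mini; do
  python eval/service.py --parallel_runs 5 --max-steps 25 --start 0 --end 100 --model "$model"
done
```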
49 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/outsource_state.py: -------------------------------------------------------------------------------- 1 | """ 2 | Show how to inject, persist, and reload agent state across runs. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | import anyio 18 | 19 | from browser_use import Agent 20 | from browser_use.agent.views import AgentState 21 | from browser_use.browser import BrowserProfile, BrowserSession 22 | from browser_use.llm import ChatOpenAI 23 | 24 | 25 | async def main(): 26 | task = 'Go to hackernews show hn and give me the first 5 posts' 27 | 28 | browser_profile = BrowserProfile( 29 | headless=True, 30 | ) 31 | browser_session = BrowserSession(browser_profile=browser_profile) 32 | 33 | agent_state = AgentState() 34 | 35 | for i in range(10): 36 | agent = Agent( 37 | task=task, 38 | llm=ChatOpenAI(model='gpt-4.1'), 39 | browser_session=browser_session, 40 | injected_agent_state=agent_state, 41 | page_extraction_llm=ChatOpenAI(model='gpt-4.1-mini'), 42 | ) 43 | 44 | done, valid = await agent.take_step() 45 | print(f'Step {i}: Done: {done}, Valid: {valid}') 46 | 47 | if done and valid: 48 | break 49 | 50 | agent_state.history.history = [] 51 | 52 | # Save state to file 53 | async with await anyio.open_file('agent_state.json', 'w') as f: 54 | serialized = agent_state.model_dump_json(exclude={'history'}) 55 | await f.write(serialized) 56 | 57 | # Load state back from file 58 | async with await anyio.open_file('agent_state.json', 'r') as f: 59 | loaded_json = await f.read() 60 | agent_state = AgentState.model_validate_json(loaded_json) 61 | 62 | break 63 | 64 | 65 | if __name__ == '__main__': 66 | asyncio.run(main()) 67 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/scrolling_page.py: -------------------------------------------------------------------------------- 1 | # Goal: Automates webpage scrolling with various scrolling actions and text search functionality. 2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from browser_use import Agent 14 | from browser_use.browser import BrowserProfile, BrowserSession 15 | from browser_use.llm import ChatOpenAI 16 | 17 | if not os.getenv('OPENAI_API_KEY'): 18 | raise ValueError('OPENAI_API_KEY is not set') 19 | 20 | """ 21 | Example: Using the 'Scroll' action with custom page amounts. 22 | 23 | This script demonstrates how the agent can navigate to a webpage and scroll by specific page amounts. 
24 | The scroll action now supports: 25 | - Scrolling by a specific number of pages using the 'num_pages' parameter (e.g., 0.5 for half page, 1.0 for one page, 2.0 for two pages) 26 | - Scrolling by one page height if no num_pages is specified (default behavior) 27 | - Scrolling up or down using the 'down' parameter 28 | """ 29 | 30 | llm = ChatOpenAI(model='gpt-4.1') 31 | 32 | browser_profile = BrowserProfile(headless=False) 33 | browser_session = BrowserSession(browser_profile=browser_profile) 34 | 35 | agent = Agent( 36 | task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll down by one page - then scroll up by 0.5 pages - then scroll down by 0.25 pages - then scroll down by 2 pages.", 37 | # Alternative task to demonstrate text-based scrolling: 38 | # task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll to the string 'The vast majority of computer'", 39 | llm=llm, 40 | browser_session=browser_session, 41 | ) 42 | 43 | 44 | async def main(): 45 | await agent.run() 46 | 47 | 48 | if __name__ == '__main__': 49 | asyncio.run(main()) 50 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/save_pdf.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import re 4 | import sys 5 | from pathlib import Path 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from browser_use import ActionResult, Agent, Controller 14 | from browser_use.browser.types import Page 15 | from browser_use.llm import ChatOpenAI 16 | 17 | # Initialize controller 18 | controller = Controller() 19 | 20 | download_path = Path.cwd() / 'downloads' 21 | download_path.mkdir(parents=True, exist_ok=True) 22 | 23 | 24 | # Save PDF - exact copy from original controller function 25 | @controller.registry.action('Save the current page as a PDF file') 26 | async def save_pdf(page: Page): 27 | short_url = re.sub(r'^https?://(?:www\.)?|/$', '', page.url) 28 | slug = re.sub(r'[^a-zA-Z0-9]+', '-', short_url).strip('-').lower() 29 | sanitized_filename = f'{slug}.pdf' 30 | 31 | await page.emulate_media(media='screen') 32 | await page.pdf(path=download_path / sanitized_filename, format='A4', print_background=False) 33 | msg = f'Saving page with URL {page.url} as PDF to {download_path / sanitized_filename}' 34 | return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=f'Saved PDF to {sanitized_filename}') 35 | 36 | 37 | async def main(): 38 | """ 39 | Example task: Navigate to browser-use.com and save the page as a PDF 40 | """ 41 | task = """ 42 | Go to https://browser-use.com/ and save the page as a PDF file. 43 | """ 44 | 45 | # Initialize the language model 46 | model = ChatOpenAI(model='gpt-4.1-mini') 47 | 48 | # Create and run the agent 49 | agent = Agent(task=task, llm=model, controller=controller) 50 | 51 | result = await agent.run() 52 | print(f'🎯 Task completed: {result}') 53 | 54 | 55 | if __name__ == '__main__': 56 | asyncio.run(main()) 57 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_vision.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
5 | """ 6 | 7 | import os 8 | import sys 9 | from pprint import pprint 10 | 11 | import pytest 12 | 13 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 14 | 15 | 16 | from browser_use import Agent, AgentHistoryList, BrowserSession, Controller 17 | from browser_use.llm import ChatOpenAI 18 | 19 | llm = ChatOpenAI(model='gpt-4.1') 20 | controller = Controller() 21 | 22 | # use this test to ask the model questions about the page like 23 | # which color do you see for bbox labels, list all with their label 24 | # what's the smallest bboxes with labels and 25 | 26 | 27 | @controller.registry.action(description='explain what you see on the screen and ask user for input') 28 | async def explain_screen(text: str) -> str: 29 | pprint(text) 30 | answer = input('\nuser input next question: \n') 31 | return answer 32 | 33 | 34 | @controller.registry.action(description='done') 35 | async def done(text: str) -> str: 36 | # pprint(text) 37 | return 'call explain_screen' 38 | 39 | 40 | @pytest.mark.skip(reason='this is for local testing only') 41 | async def test_vision(): 42 | from browser_use.browser.profile import BrowserProfile 43 | 44 | profile = BrowserProfile(headless=True, user_data_dir=None) 45 | browser_session = BrowserSession(browser_profile=profile) 46 | await browser_session.start() 47 | try: 48 | agent = Agent( 49 | task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to explain it and get the next question', 50 | llm=llm, 51 | controller=controller, 52 | browser_session=browser_session, 53 | ) 54 | history: AgentHistoryList = await agent.run(20) 55 | finally: 56 | # Make sure to close the browser 57 | await browser_session.stop() 58 | -------------------------------------------------------------------------------- /utils/browser-use/examples/browser/using_cdp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple demonstration of the CDP feature. 3 | 4 | To test this locally, follow these steps: 5 | 1. Create a shortcut for the executable Chrome file. 6 | 2. Add the following argument to the shortcut: 7 | - On Windows: `--remote-debugging-port=9222` 8 | 3. Open a web browser and navigate to `http://localhost:9222/json/version` to verify that the Remote Debugging Protocol (CDP) is running. 9 | 4. Launch this example. 10 | 11 | @dev You need to set the `GOOGLE_API_KEY` environment variable before proceeding. 
12 | """ 13 | 14 | import asyncio 15 | import os 16 | import sys 17 | 18 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 19 | 20 | from dotenv import load_dotenv 21 | 22 | load_dotenv() 23 | 24 | 25 | from browser_use import Agent, Controller 26 | from browser_use.browser import BrowserProfile, BrowserSession 27 | from browser_use.llm import ChatGoogle 28 | 29 | api_key = os.getenv('GOOGLE_API_KEY') 30 | if not api_key: 31 | raise ValueError('GOOGLE_API_KEY is not set') 32 | 33 | browser_session = BrowserSession( 34 | browser_profile=BrowserProfile( 35 | headless=False, 36 | ), 37 | cdp_url='http://localhost:9222', 38 | ) 39 | controller = Controller() 40 | 41 | 42 | async def main(): 43 | task = 'In docs.google.com write my Papa a quick thank you for everything letter \n - Magnus' 44 | task += ' and save the document as pdf' 45 | # Assert api_key is not None to satisfy type checker 46 | assert api_key is not None, 'GOOGLE_API_KEY must be set' 47 | model = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 48 | agent = Agent( 49 | task=task, 50 | llm=model, 51 | controller=controller, 52 | browser_session=browser_session, 53 | ) 54 | 55 | await agent.run() 56 | await browser_session.close() 57 | 58 | input('Press Enter to close...') 59 | 60 | 61 | if __name__ == '__main__': 62 | asyncio.run(main()) 63 | -------------------------------------------------------------------------------- /utils/browser-use/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/asottile/yesqa 3 | rev: v1.5.0 4 | hooks: 5 | - id: yesqa 6 | 7 | - repo: https://github.com/codespell-project/codespell 8 | rev: v2.4.1 9 | hooks: 10 | - id: codespell # See pyproject.toml for args 11 | additional_dependencies: 12 | - tomli 13 | 14 | - repo: https://github.com/asottile/pyupgrade 15 | rev: v3.19.1 16 | hooks: 17 | - id: pyupgrade 18 | args: [--py311-plus] 19 | 20 | # - repo: https://github.com/asottile/add-trailing-comma 21 | # rev: v3.1.0 22 | # hooks: 23 | # - id: add-trailing-comma 24 | 25 | - repo: https://github.com/astral-sh/ruff-pre-commit 26 | rev: v0.11.2 27 | hooks: 28 | - id: ruff 29 | - id: ruff-format 30 | # see pyproject.toml for more details on ruff config 31 | 32 | - repo: https://github.com/RobertCraigie/pyright-python 33 | rev: v1.1.402 34 | hooks: 35 | - id: pyright 36 | 37 | - repo: https://github.com/pre-commit/pre-commit-hooks 38 | rev: v5.0.0 39 | hooks: 40 | # check for basic syntax errors in python and data files 41 | - id: check-ast 42 | - id: check-toml 43 | - id: check-yaml 44 | - id: check-json 45 | - id: check-merge-conflict 46 | # check for bad files and folders 47 | - id: check-symlinks 48 | - id: destroyed-symlinks 49 | - id: check-case-conflict 50 | - id: check-illegal-windows-names 51 | - id: check-shebang-scripts-are-executable 52 | - id: mixed-line-ending 53 | - id: fix-byte-order-marker 54 | - id: end-of-file-fixer 55 | # best practices enforcement 56 | - id: detect-private-key 57 | # - id: check-docstring-first 58 | - id: debug-statements 59 | - id: forbid-submodules 60 | - id: check-added-large-files 61 | args: ["--maxkb=600"] 62 | # - id: name-tests-test 63 | # args: ["--pytest-test-first"] 64 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/history_tree_processor/view.py: -------------------------------------------------------------------------------- 1 | from 
dataclasses import dataclass 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | @dataclass 7 | class HashedDomElement: 8 | """ 9 | Hash of the dom element to be used as a unique identifier 10 | """ 11 | 12 | branch_path_hash: str 13 | attributes_hash: str 14 | xpath_hash: str 15 | # text_hash: str 16 | 17 | 18 | class Coordinates(BaseModel): 19 | x: int 20 | y: int 21 | 22 | 23 | class CoordinateSet(BaseModel): 24 | top_left: Coordinates 25 | top_right: Coordinates 26 | bottom_left: Coordinates 27 | bottom_right: Coordinates 28 | center: Coordinates 29 | width: int 30 | height: int 31 | 32 | 33 | class ViewportInfo(BaseModel): 34 | scroll_x: int | None = None 35 | scroll_y: int | None = None 36 | width: int 37 | height: int 38 | 39 | 40 | @dataclass 41 | class DOMHistoryElement: 42 | tag_name: str 43 | xpath: str 44 | highlight_index: int | None 45 | entire_parent_branch_path: list[str] 46 | attributes: dict[str, str] 47 | shadow_root: bool = False 48 | css_selector: str | None = None 49 | page_coordinates: CoordinateSet | None = None 50 | viewport_coordinates: CoordinateSet | None = None 51 | viewport_info: ViewportInfo | None = None 52 | 53 | def to_dict(self) -> dict: 54 | page_coordinates = self.page_coordinates.model_dump() if self.page_coordinates else None 55 | viewport_coordinates = self.viewport_coordinates.model_dump() if self.viewport_coordinates else None 56 | viewport_info = self.viewport_info.model_dump() if self.viewport_info else None 57 | 58 | return { 59 | 'tag_name': self.tag_name, 60 | 'xpath': self.xpath, 61 | 'highlight_index': self.highlight_index, 62 | 'entire_parent_branch_path': self.entire_parent_branch_path, 63 | 'attributes': self.attributes, 64 | 'shadow_root': self.shadow_root, 65 | 'css_selector': self.css_selector, 66 | 'page_coordinates': page_coordinates, 67 | 'viewport_coordinates': viewport_coordinates, 68 | 'viewport_info': viewport_info, 69 | } 70 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/onepassword_2fa.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from onepassword.client import Client # type: ignore # pip install onepassword-sdk 13 | 14 | from browser_use import ActionResult, Agent, Controller 15 | from browser_use.llm import ChatOpenAI 16 | 17 | # Set up logging 18 | logging.basicConfig(level=logging.INFO) 19 | logger = logging.getLogger(__name__) 20 | 21 | OP_SERVICE_ACCOUNT_TOKEN = os.getenv('OP_SERVICE_ACCOUNT_TOKEN') 22 | OP_ITEM_ID = os.getenv('OP_ITEM_ID') # Go to 1Password, right click on the item, click "Copy Secret Reference" 23 | 24 | 25 | controller = Controller() 26 | 27 | 28 | @controller.registry.action('Get 2FA code from 1Password for Google Account', domains=['*.google.com', 'google.com']) 29 | async def get_1password_2fa() -> ActionResult: 30 | """ 31 | Custom action to retrieve 2FA/MFA code from 1Password using onepassword.client SDK. 
32 | """ 33 | client = await Client.authenticate( 34 | # setup instructions: https://github.com/1Password/onepassword-sdk-python/#-get-started 35 | auth=OP_SERVICE_ACCOUNT_TOKEN, 36 | integration_name='Browser-Use', 37 | integration_version='v1.0.0', 38 | ) 39 | 40 | mfa_code = await client.secrets.resolve(f'op://Private/{OP_ITEM_ID}/One-time passcode') 41 | 42 | return ActionResult(extracted_content=mfa_code) 43 | 44 | 45 | async def main(): 46 | # Example task using the 1Password 2FA action 47 | task = 'Go to account.google.com, enter username and password, then if prompted for 2FA code, get 2FA code from 1Password for and enter it' 48 | 49 | model = ChatOpenAI(model='gpt-4.1') 50 | agent = Agent(task=task, llm=model, controller=controller) 51 | 52 | result = await agent.run() 53 | print(f'Task completed with result: {result}') 54 | 55 | 56 | if __name__ == '__main__': 57 | asyncio.run(main()) 58 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/custom_user_agent.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from browser_use import Agent 13 | from browser_use.browser import BrowserProfile, BrowserSession 14 | from browser_use.controller.service import Controller 15 | from browser_use.llm import ChatAnthropic, ChatOpenAI 16 | 17 | 18 | def get_llm(provider: str): 19 | if provider == 'anthropic': 20 | return ChatAnthropic(model='claude-3-5-sonnet-20240620', temperature=0.0) 21 | elif provider == 'openai': 22 | return ChatOpenAI(model='gpt-4.1', temperature=0.0) 23 | 24 | else: 25 | raise ValueError(f'Unsupported provider: {provider}') 26 | 27 | 28 | # NOTE: This example is to find your current user agent string to use it in the browser_context 29 | task = 'go to https://whatismyuseragent.com and find the current user agent string ' 30 | 31 | 32 | controller = Controller() 33 | 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--query', type=str, help='The query to process', default=task) 37 | parser.add_argument( 38 | '--provider', 39 | type=str, 40 | choices=['openai', 'anthropic'], 41 | default='openai', 42 | help='The model provider to use (default: openai)', 43 | ) 44 | 45 | args = parser.parse_args() 46 | 47 | llm = get_llm(args.provider) 48 | 49 | browser_session = BrowserSession( 50 | browser_profile=BrowserProfile( 51 | user_agent='foobarfoo', 52 | user_data_dir='~/.config/browseruse/profiles/default', 53 | ) 54 | ) 55 | 56 | agent = Agent( 57 | task=args.query, 58 | llm=llm, 59 | controller=controller, 60 | browser_session=browser_session, 61 | use_vision=True, 62 | max_actions_per_step=1, 63 | ) 64 | 65 | 66 | async def main(): 67 | await agent.run(max_steps=25) 68 | 69 | input('Press Enter to close the browser...') 70 | await browser_session.close() 71 | 72 | 73 | asyncio.run(main()) 74 | -------------------------------------------------------------------------------- /utils/browser-use/.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: docker 2 | 3 | on: 4 | push: 5 | release: 6 | types: [published] 7 | 8 | jobs: 9 | build_publish_image: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | packages: write 13 | contents: read 14 | attestations: 
write 15 | id-token: write 16 | steps: 17 | - name: Check out the repo 18 | uses: actions/checkout@v4 19 | 20 | - name: Set up QEMU 21 | uses: docker/setup-qemu-action@v3 22 | 23 | - name: Set up Docker Buildx 24 | uses: docker/setup-buildx-action@v3 25 | 26 | - name: Log in to Docker Hub 27 | uses: docker/login-action@v3 28 | with: 29 | username: ${{ secrets.DOCKER_USERNAME }} 30 | password: ${{ secrets.DOCKER_PASSWORD }} 31 | 32 | - name: Login to GitHub Container Registry 33 | uses: docker/login-action@v3 34 | with: 35 | registry: ghcr.io 36 | username: ${{ github.repository_owner }} 37 | password: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | - name: Compute Docker tags based on tag/branch 40 | id: meta 41 | uses: docker/metadata-action@v5 42 | with: 43 | images: | 44 | browseruse/browseruse 45 | ghcr.io/browser-use/browser-use 46 | tags: | 47 | type=ref,event=branch 48 | type=ref,event=pr 49 | type=pep440,pattern={{version}} 50 | type=pep440,pattern={{major}}.{{minor}} 51 | type=sha 52 | 53 | - name: Build and push Docker image 54 | id: push 55 | uses: docker/build-push-action@v6 56 | with: 57 | platforms: linux/amd64,linux/arm64 58 | context: . 59 | file: ./Dockerfile 60 | push: true 61 | tags: ${{ steps.meta.outputs.tags }} 62 | labels: ${{ steps.meta.outputs.labels }} 63 | cache-from: type=registry,ref=browseruse/browseruse:buildcache 64 | cache-to: type=registry,ref=browseruse/browseruse:buildcache,mode=max 65 | -------------------------------------------------------------------------------- /utils/browser-use/docker/build-base-images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Build script for browser-use base images 3 | set -euo pipefail 4 | 5 | # Configuration 6 | REGISTRY="${DOCKER_REGISTRY:-browseruse}" 7 | PLATFORMS="${PLATFORMS:-linux/amd64}" 8 | PUSH="${PUSH:-false}" 9 | 10 | # Build function 11 | build_image() { 12 | local name=$1 13 | local dockerfile=$2 14 | local build_args="${3:-}" 15 | 16 | echo "[INFO] Building ${name}..." 17 | 18 | local build_cmd="docker build" 19 | local tag_args="-t ${REGISTRY}/${name}:latest -t ${REGISTRY}/${name}:$(date +%Y%m%d)" 20 | 21 | # Use buildx for multi-platform or push 22 | if [[ "$PLATFORMS" == *","* ]] || [ "$PUSH" = "true" ]; then 23 | build_cmd="docker buildx build --platform=$PLATFORMS" 24 | [ "$PUSH" = "true" ] && build_cmd="$build_cmd --push" || build_cmd="$build_cmd" 25 | fi 26 | 27 | $build_cmd $tag_args $build_args -f $dockerfile ../../.. 28 | } 29 | 30 | # Main 31 | cd "$(dirname "$0")" 32 | 33 | # Parse arguments 34 | while [[ $# -gt 0 ]]; do 35 | case $1 in 36 | --push) PUSH=true; shift ;; 37 | --registry) REGISTRY="$2"; shift 2 ;; 38 | --platforms) PLATFORMS="$2"; shift 2 ;; 39 | --help) 40 | echo "Usage: $0 [--push] [--registry REG] [--platforms P]" 41 | exit 0 ;; 42 | *) echo "Unknown option: $1"; exit 1 ;; 43 | esac 44 | done 45 | 46 | # Create buildx builder if needed 47 | if [[ "$PLATFORMS" == *","* ]] || [ "$PUSH" = "true" ]; then 48 | docker buildx inspect browseruse-builder >/dev/null 2>&1 || \ 49 | docker buildx create --name browseruse-builder --use 50 | docker buildx use browseruse-builder 51 | fi 52 | 53 | # Build images in order 54 | build_image "base-system" "base-images/system/Dockerfile" 55 | build_image "base-chromium" "base-images/chromium/Dockerfile" "--build-arg BASE_TAG=latest" 56 | build_image "base-python-deps" "base-images/python-deps/Dockerfile" "--build-arg BASE_TAG=latest" 57 | 58 | echo "[INFO] Build complete. 
Use: FROM ${REGISTRY}/base-python-deps:latest" 59 | -------------------------------------------------------------------------------- /utils/browser-use/.github/ISSUE_TEMPLATE/4_docs_issue.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation Issue 2 | description: Report an issue in the browser-use documentation 3 | labels: ["documentation"] 4 | title: "Documentation: ..." 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thanks for taking the time to improve our documentation! Please fill out the form below to help us fix the issue quickly. 10 | 11 | - type: dropdown 12 | id: type 13 | attributes: 14 | label: Type of Documentation Issue 15 | description: What type of documentation issue is this? 16 | options: 17 | - Missing documentation 18 | - Incorrect documentation 19 | - Unclear documentation 20 | - Broken link 21 | - Other (specify in description) 22 | validations: 23 | required: true 24 | 25 | - type: input 26 | id: page 27 | attributes: 28 | label: Documentation Page 29 | description: Which page or section of the documentation is this about? 30 | placeholder: "e.g. https://docs.browser-use.com/customize/browser-settings > Context Configuration > headless" 31 | validations: 32 | required: true 33 | 34 | - type: textarea 35 | id: description 36 | attributes: 37 | label: Issue Description 38 | description: "Describe what's wrong or missing in the documentation" 39 | placeholder: e.g. Docs should clarify whether BrowserSession(no_viewport=False) is supported when running in BrowserSession(headless=False) mode... 40 | validations: 41 | required: true 42 | 43 | - type: textarea 44 | id: suggestion 45 | attributes: 46 | label: Suggested Changes 47 | description: If you have specific suggestions for how to improve the documentation, please share them 48 | placeholder: | 49 | e.g. The documentation could be improved by adding one more line here: 50 | ```diff 51 | Use `BrowserSession(headless=False)` to open the browser window (aka headful mode). 52 | + Viewports are not supported when headful, if `headless=False` it will force `no_viewport=True`. 53 | ``` 54 | validations: 55 | required: false 56 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/2fa.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | import pyotp # type: ignore 13 | 14 | from browser_use import ActionResult, Agent, Controller 15 | from browser_use.llm import ChatOpenAI 16 | 17 | # Set up logging 18 | logging.basicConfig(level=logging.INFO) 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | controller = Controller() 23 | 24 | 25 | @controller.registry.action('Get 2FA code from when OTP is required') 26 | async def get_otp_2fa() -> ActionResult: 27 | """ 28 | Custom action to retrieve 2FA/MFA code from OTP secret key using pyotp. 29 | The OTP secret key should be set in the environment variable OTP_SECRET_KEY. 
30 | """ 31 | secret_key = os.environ.get('OTP_SECRET_KEY') 32 | if not secret_key: 33 | raise ValueError('OTP_SECRET_KEY environment variable is not set') 34 | 35 | totp = pyotp.TOTP(secret_key, digits=6) 36 | code = totp.now() 37 | return ActionResult(extracted_content=code) 38 | 39 | 40 | async def main(): 41 | # Example task using the 1Password 2FA action 42 | task = """ 43 | Steps: 44 | 1. Go to https://authenticationtest.com/totpChallenge/ and try to log in. 45 | 2. If prompted for 2FA code: 46 | 2.1. Use the get_2fa_code action to retrieve the 2FA code. 47 | 2.2. Submit the code provided by the get_2fa_code action. 48 | 49 | Considerations: 50 | - ALWAYS use the get_2fa_code action to retrieve the 2FA code if needed. 51 | - NEVER skip the 2FA step if the page requires it. 52 | - NEVER extract the code from the page. 53 | - NEVER use a code that is not generated by the get_2fa_code action. 54 | - NEVER hallucinate the 2FA code, always use the get_2fa_code action to get it. 55 | 56 | You are completely FORBIDDEN to use any other method to get the 2FA code. 57 | """ 58 | 59 | model = ChatOpenAI(model='gpt-4.1') 60 | agent = Agent(task=task, llm=model, controller=controller) 61 | 62 | result = await agent.run() 63 | print(f'Task completed with result: {result}') 64 | 65 | 66 | if __name__ == '__main__': 67 | asyncio.run(main()) 68 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from browser_use.logging_config import setup_logging 4 | 5 | # Only set up logging if not in MCP mode or if explicitly requested 6 | if os.environ.get('BROWSER_USE_SETUP_LOGGING', 'true').lower() != 'false': 7 | logger = setup_logging() 8 | else: 9 | import logging 10 | 11 | logger = logging.getLogger('browser_use') 12 | 13 | # Monkeypatch BaseSubprocessTransport.__del__ to handle closed event loops gracefully 14 | from asyncio import base_subprocess 15 | 16 | from browser_use.agent.prompts import SystemPrompt 17 | from browser_use.agent.service import Agent 18 | from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList 19 | from browser_use.browser import Browser, BrowserConfig, BrowserContext, BrowserContextConfig, BrowserProfile, BrowserSession 20 | from browser_use.controller.service import Controller 21 | from browser_use.dom.service import DomService 22 | from browser_use.llm import ( 23 | ChatAnthropic, 24 | ChatAzureOpenAI, 25 | ChatGoogle, 26 | ChatGroq, 27 | ChatOllama, 28 | ChatOpenAI, 29 | ) 30 | 31 | _original_del = base_subprocess.BaseSubprocessTransport.__del__ 32 | 33 | 34 | def _patched_del(self): 35 | """Patched __del__ that handles closed event loops without throwing noisy red-herring errors like RuntimeError: Event loop is closed""" 36 | try: 37 | # Check if the event loop is closed before calling the original 38 | if hasattr(self, '_loop') and self._loop and self._loop.is_closed(): 39 | # Event loop is closed, skip cleanup that requires the loop 40 | return 41 | _original_del(self) 42 | except RuntimeError as e: 43 | if 'Event loop is closed' in str(e): 44 | # Silently ignore this specific error 45 | pass 46 | else: 47 | raise 48 | 49 | 50 | base_subprocess.BaseSubprocessTransport.__del__ = _patched_del 51 | 52 | 53 | __all__ = [ 54 | 'Agent', 55 | 'Browser', 56 | 'BrowserConfig', 57 | 'BrowserSession', 58 | 'BrowserProfile', 59 | 'Controller', 60 | 'DomService', 61 | 'SystemPrompt', 62 | 
'ActionResult', 63 | 'ActionModel', 64 | 'AgentHistoryList', 65 | 'BrowserContext', 66 | 'BrowserContextConfig', 67 | # Chat models 68 | 'ChatOpenAI', 69 | 'ChatGoogle', 70 | 'ChatAnthropic', 71 | 'ChatGroq', 72 | 'ChatAzureOpenAI', 73 | 'ChatOllama', 74 | ] 75 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/multiple_tasks.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | from browser_use import Agent 13 | from browser_use.browser import BrowserSession 14 | from browser_use.browser.types import async_playwright 15 | from browser_use.llm import ChatGoogle 16 | 17 | api_key = os.getenv('GOOGLE_API_KEY') 18 | 19 | if not api_key: 20 | raise ValueError('GOOGLE_API_KEY is not set') 21 | 22 | llm = ChatGoogle(model='gemini-2.0-flash', api_key=api_key) 23 | 24 | 25 | async def main(): 26 | async with async_playwright() as p: 27 | browser = await p.chromium.launch( 28 | headless=False, 29 | ) 30 | 31 | context = await browser.new_context( 32 | viewport={'width': 1502, 'height': 853}, 33 | ignore_https_errors=True, 34 | ) 35 | 36 | agent = Agent( 37 | browser_session=BrowserSession( 38 | browser_context=context, 39 | ), 40 | task='Go to https://browser-use.com/', 41 | llm=llm, 42 | ) 43 | 44 | try: 45 | result = await agent.run() 46 | print(f'First task was {"successful" if result.is_successful else "not successful"}') 47 | 48 | if not result.is_successful: 49 | raise RuntimeError('Failed to navigate to the initial page.') 50 | 51 | agent.add_new_task('Navigate to the documentation page') 52 | 53 | result = await agent.run() 54 | print(f'Second task was {"successful" if result.is_successful else "not successful"}') 55 | 56 | if not result.is_successful: 57 | raise RuntimeError('Failed to navigate to the documentation page.') 58 | 59 | while True: 60 | next_task = input('Write your next task or leave empty to exit\n> ') 61 | 62 | if not next_task.strip(): 63 | print('Exiting...') 64 | break 65 | 66 | agent.add_new_task(next_task) 67 | result = await agent.run() 68 | 69 | print(f"Task '{next_task}' was {'successful' if result.is_successful else 'not successful'}") 70 | 71 | if not result.is_successful: 72 | print('Failed to complete the task. Please try again.') 73 | continue 74 | 75 | finally: 76 | await context.close() 77 | await browser.close() 78 | 79 | 80 | if __name__ == '__main__': 81 | asyncio.run(main()) 82 | -------------------------------------------------------------------------------- /utils/browser-use/docs/customize/system-prompt.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "System Prompt" 3 | description: "Customize the system prompt to control agent behavior and capabilities" 4 | icon: "message" 5 | --- 6 | 7 | ## Overview 8 | 9 | You can customize the system prompt in two ways: 10 | 11 | 1. Extend the default system prompt with additional instructions 12 | 2. Override the default system prompt entirely 13 | 14 | 15 | Custom system prompts allow you to modify the agent's behavior at a 16 | fundamental level. Use this feature carefully as it can significantly impact 17 | the agent's performance and reliability. 
18 | 19 | 20 | ### Extend System Prompt (recommended) 21 | 22 | To add additional instructions to the default system prompt: 23 | 24 | ```python 25 | extend_system_message = """ 26 | REMEMBER the most important RULE: 27 | ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!! 28 | """ 29 | ``` 30 | 31 | ### Override System Prompt 32 | 33 | 34 | Not recommended! If you must override the [default system 35 | prompt](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/system_prompt.md), 36 | make sure to test the agent yourself. 37 | 38 | 39 | Anyway, to override the default system prompt: 40 | 41 | ```python 42 | # Define your complete custom prompt 43 | override_system_message = """ 44 | You are an AI agent that helps users with web browsing tasks. 45 | 46 | [Your complete custom instructions here...] 47 | """ 48 | 49 | # Create agent with custom system prompt 50 | agent = Agent( 51 | task="Your task here", 52 | llm=ChatOpenAI(model='gpt-4'), 53 | override_system_message=override_system_message 54 | ) 55 | ``` 56 | 57 | ### Extend Planner System Prompt 58 | 59 | You can customize the behavior of the planning agent by extending its system prompt: 60 | 61 | ```python 62 | extend_planner_system_message = """ 63 | PRIORITIZE gathering information before taking any action. 64 | Always suggest exploring multiple options before making a decision. 65 | """ 66 | 67 | # Create agent with extended planner system prompt 68 | llm = ChatOpenAI(model='gpt-4o') 69 | planner_llm = ChatOpenAI(model='gpt-4o-mini') 70 | 71 | agent = Agent( 72 | task="Your task here", 73 | llm=llm, 74 | planner_llm=planner_llm, 75 | extend_planner_system_message=extend_planner_system_message 76 | ) 77 | ``` 78 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/sensitive_data.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent 12 | from browser_use.browser import BrowserProfile 13 | from browser_use.llm import ChatOpenAI 14 | 15 | try: 16 | from lmnr import Laminar 17 | 18 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 19 | except Exception as e: 20 | print(f'Error initializing Laminar: {e}') 21 | 22 | # Initialize the model 23 | llm = ChatOpenAI( 24 | model='gpt-4.1', 25 | temperature=0.0, 26 | ) 27 | # Simple case: the model will see x_name and x_password, but never the actual values. 
28 | # sensitive_data = {'x_name': 'my_x_name', 'x_password': 'my_x_password'} 29 | 30 | # Advanced case: domain-specific credentials with reusable data 31 | # Define a single credential set that can be reused 32 | company_credentials = {'company_username': 'user@example.com', 'company_password': 'securePassword123'} 33 | 34 | # Map the same credentials to multiple domains for secure access control 35 | # Type annotation to satisfy pyright 36 | sensitive_data: dict[str, str | dict[str, str]] = { 37 | 'https://example.com': company_credentials, 38 | 'https://admin.example.com': company_credentials, 39 | 'https://*.example-staging.com': company_credentials, 40 | 'http*://test.example.com': company_credentials, 41 | # You can also add domain-specific credentials 42 | 'https://*.google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'}, 43 | } 44 | # Update task to use one of the credentials above 45 | task = 'Go to google.com and put the login information in the search bar.' 46 | 47 | # Always set allowed_domains when using sensitive_data for security 48 | from browser_use.browser.session import BrowserSession 49 | 50 | browser_session = BrowserSession( 51 | browser_profile=BrowserProfile( 52 | allowed_domains=list(sensitive_data.keys()) 53 | + ['https://*.trusted-partner.com'] # Domain patterns from sensitive_data + additional allowed domains 54 | ) 55 | ) 56 | 57 | agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data, browser_session=browser_session) 58 | 59 | 60 | async def main(): 61 | await agent.run() 62 | 63 | 64 | if __name__ == '__main__': 65 | asyncio.run(main()) 66 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/web_voyager_agent.py: -------------------------------------------------------------------------------- 1 | # Goal: A general-purpose web navigation agent for tasks like flight booking and course searching. 2 | 3 | import asyncio 4 | import os 5 | import sys 6 | 7 | # Adjust Python path 8 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 9 | 10 | from dotenv import load_dotenv 11 | 12 | load_dotenv() 13 | 14 | 15 | from browser_use.agent.service import Agent 16 | from browser_use.browser import BrowserProfile, BrowserSession 17 | from browser_use.llm import ChatAzureOpenAI, ChatOpenAI 18 | 19 | # Set LLM based on defined environment variables 20 | if os.getenv('OPENAI_API_KEY'): 21 | llm = ChatOpenAI( 22 | model='gpt-4.1', 23 | ) 24 | elif os.getenv('AZURE_OPENAI_KEY') and os.getenv('AZURE_OPENAI_ENDPOINT'): 25 | llm = ChatAzureOpenAI( 26 | model='gpt-4.1', 27 | ) 28 | else: 29 | raise ValueError('No LLM found. Please set OPENAI_API_KEY or AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT.') 30 | 31 | 32 | browser_session = BrowserSession( 33 | browser_profile=BrowserProfile( 34 | headless=False, # This is True in production 35 | minimum_wait_page_load_time=1, # 3 on prod 36 | maximum_wait_page_load_time=10, # 20 on prod 37 | viewport={'width': 1280, 'height': 1100}, 38 | user_data_dir='~/.config/browseruse/profiles/default', 39 | # trace_path='./tmp/web_voyager_agent', 40 | ) 41 | ) 42 | 43 | # TASK = """ 44 | # Find the lowest-priced one-way flight from Cairo to Montreal on February 21, 2025, including the total travel time and number of stops. on https://www.google.com/travel/flights/ 45 | # """ 46 | # TASK = """ 47 | # Browse Coursera, which universities offer Master of Advanced Study in Engineering degrees? 
Tell me what is the latest application deadline for this degree? on https://www.coursera.org/""" 48 | TASK = """ 49 | Find and book a hotel in Paris with suitable accommodations for a family of four (two adults and two children) offering free cancellation for the dates of February 14-21, 2025. on https://www.booking.com/ 50 | """ 51 | 52 | 53 | async def main(): 54 | agent = Agent( 55 | task=TASK, 56 | llm=llm, 57 | browser_session=browser_session, 58 | validate_output=True, 59 | enable_memory=False, 60 | ) 61 | history = await agent.run(max_steps=50) 62 | history.save_to_file('./tmp/history.json') 63 | 64 | 65 | if __name__ == '__main__': 66 | asyncio.run(main()) 67 | -------------------------------------------------------------------------------- /utils/browser-use/tests/old/test_wait_for_element.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | from browser_use.llm.openai.chat import ChatOpenAI 6 | 7 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 8 | if project_root not in sys.path: 9 | sys.path.insert(0, project_root) 10 | 11 | import pytest 12 | from dotenv import load_dotenv 13 | 14 | # Third-party imports 15 | from browser_use import Agent, Controller 16 | 17 | # Local imports 18 | from browser_use.browser import BrowserProfile, BrowserSession 19 | 20 | # Load environment variables. 21 | load_dotenv() 22 | 23 | # Initialize language model and controller. 24 | llm = ChatOpenAI(model='gpt-4.1') 25 | controller = Controller() 26 | 27 | 28 | @pytest.mark.skip(reason='this is for local testing only') 29 | async def test_wait_for_element(): 30 | """Test 'Wait for element' action.""" 31 | 32 | initial_actions = [ 33 | {'go_to_url': {'url': 'https://pypi.org/', 'new_tab': True}}, 34 | # Uncomment the line below to include the wait action in initial actions. 35 | # {'wait_for_element': {'selector': '#search', 'timeout': 30}}, 36 | ] 37 | 38 | # Set up the browser session. 39 | browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, disable_security=True)) 40 | await browser_session.start() 41 | 42 | try: 43 | # Create the agent with the task. 44 | agent = Agent( 45 | task="Wait for element '#search' to be visible with a timeout of 30 seconds.", 46 | llm=llm, 47 | browser_session=browser_session, 48 | initial_actions=initial_actions, 49 | controller=controller, 50 | ) 51 | 52 | # Run the agent for a few steps to trigger navigation and then the wait action. 53 | history = await agent.run(max_steps=3) 54 | action_names = history.action_names() 55 | 56 | # Ensure that the wait_for_element action was executed. 57 | assert 'wait_for_element' in action_names, 'Expected wait_for_element action to be executed.' 58 | 59 | # Verify that the #search element is visible by querying the page. 60 | page = await browser_session.get_current_page() 61 | header_handle = await page.query_selector('#search') 62 | assert header_handle is not None, 'Expected to find a #search element on the page.' 63 | is_visible = await header_handle.is_visible() 64 | assert is_visible, 'Expected the #search element to be visible.' 
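		# Note: if either assertion above fails, inspect `history` first — with
		# max_steps=3 the agent may have stopped before ever issuing the
		# wait_for_element action, which is a more likely cause than the #search
		# element genuinely being hidden.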
65 | finally: 66 | await browser_session.stop() 67 | 68 | 69 | if __name__ == '__main__': 70 | asyncio.run(test_wait_for_element()) 71 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/browser/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any 3 | 4 | from pydantic import BaseModel 5 | 6 | from browser_use.dom.history_tree_processor.service import DOMHistoryElement 7 | from browser_use.dom.views import DOMState 8 | 9 | 10 | # Pydantic 11 | class TabInfo(BaseModel): 12 | """Represents information about a browser tab""" 13 | 14 | page_id: int 15 | url: str 16 | title: str 17 | parent_page_id: int | None = None # parent page that contains this popup or cross-origin iframe 18 | 19 | 20 | class PageInfo(BaseModel): 21 | """Comprehensive page size and scroll information""" 22 | 23 | # Current viewport dimensions 24 | viewport_width: int 25 | viewport_height: int 26 | 27 | # Total page dimensions 28 | page_width: int 29 | page_height: int 30 | 31 | # Current scroll position 32 | scroll_x: int 33 | scroll_y: int 34 | 35 | # Calculated scroll information 36 | pixels_above: int 37 | pixels_below: int 38 | pixels_left: int 39 | pixels_right: int 40 | 41 | # Page statistics are now computed dynamically instead of stored 42 | 43 | 44 | @dataclass 45 | class BrowserStateSummary(DOMState): 46 | """The summary of the browser's current state designed for an LLM to process""" 47 | 48 | # provided by DOMState: 49 | # element_tree: DOMElementNode 50 | # selector_map: SelectorMap 51 | 52 | url: str 53 | title: str 54 | tabs: list[TabInfo] 55 | screenshot: str | None = field(default=None, repr=False) 56 | page_info: PageInfo | None = None # Enhanced page information 57 | 58 | # Keep legacy fields for backward compatibility 59 | pixels_above: int = 0 60 | pixels_below: int = 0 61 | browser_errors: list[str] = field(default_factory=list) 62 | 63 | 64 | @dataclass 65 | class BrowserStateHistory: 66 | """The summary of the browser's state at a past point in time to usse in LLM message history""" 67 | 68 | url: str 69 | title: str 70 | tabs: list[TabInfo] 71 | interacted_element: list[DOMHistoryElement | None] | list[None] 72 | screenshot: str | None = None 73 | 74 | def to_dict(self) -> dict[str, Any]: 75 | data = {} 76 | data['tabs'] = [tab.model_dump() for tab in self.tabs] 77 | data['screenshot'] = self.screenshot 78 | data['interacted_element'] = [el.to_dict() if el else None for el in self.interacted_element] 79 | data['url'] = self.url 80 | data['title'] = self.title 81 | return data 82 | 83 | 84 | class BrowserError(Exception): 85 | """Base class for all browser errors""" 86 | 87 | 88 | class URLNotAllowedError(BrowserError): 89 | """Error raised when a URL is not allowed""" 90 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/solve_amazon_captcha.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from amazoncaptcha import AmazonCaptcha # type: ignore 12 | 13 | from browser_use import ActionResult 14 | from browser_use.agent.service import Agent 15 | from browser_use.browser import BrowserConfig, 
BrowserSession 16 | from browser_use.controller.service import Controller 17 | from browser_use.llm import ChatOpenAI 18 | 19 | browser_profile = BrowserConfig(headless=False) 20 | 21 | # Initialize controller first 22 | controller = Controller() 23 | 24 | 25 | @controller.action( 26 | 'Solve Amazon text based captcha', 27 | domains=[ 28 | '*.amazon.com', 29 | '*.amazon.co.uk', 30 | '*.amazon.ca', 31 | '*.amazon.de', 32 | '*.amazon.es', 33 | '*.amazon.fr', 34 | '*.amazon.it', 35 | '*.amazon.co.jp', 36 | '*.amazon.in', 37 | '*.amazon.cn', 38 | '*.amazon.com.sg', 39 | '*.amazon.com.mx', 40 | '*.amazon.ae', 41 | '*.amazon.com.br', 42 | '*.amazon.nl', 43 | '*.amazon.com.au', 44 | '*.amazon.com.tr', 45 | '*.amazon.sa', 46 | '*.amazon.se', 47 | '*.amazon.pl', 48 | ], 49 | ) 50 | async def solve_amazon_captcha(browser_session: BrowserSession): 51 | page = await browser_session.get_current_page() 52 | 53 | # Find the captcha image and extract its src 54 | captcha_img = page.locator('img[src*="amazon.com/captcha"]') 55 | link = await captcha_img.get_attribute('src') 56 | 57 | if not link: 58 | raise ValueError('Could not find captcha image on the page') 59 | 60 | captcha = AmazonCaptcha.fromlink(link) 61 | solution = captcha.solve() 62 | if not solution or solution == 'Not solved': 63 | raise ValueError('Captcha could not be solved') 64 | 65 | await page.locator('#captchacharacters').fill(solution) 66 | await page.locator('button[type="submit"]').click() 67 | 68 | return ActionResult(extracted_content=solution) 69 | 70 | 71 | async def main(): 72 | task = 'Go to https://www.amazon.com/errors/validateCaptcha and solve the captcha using the solve_amazon_captcha tool' 73 | 74 | model = ChatOpenAI(model='gpt-4.1') 75 | browser_session = BrowserSession(browser_profile=browser_profile) 76 | await browser_session.start() 77 | agent = Agent(task=task, llm=model, controller=controller, browser_session=browser_session) 78 | 79 | await agent.run() 80 | await browser_session.stop() 81 | 82 | input('Press Enter to close...') 83 | 84 | 85 | if __name__ == '__main__': 86 | asyncio.run(main()) 87 | -------------------------------------------------------------------------------- /utils/browser-use/examples/integrations/browserbase_stagehand.py: -------------------------------------------------------------------------------- 1 | """ 2 | EXPERIMENTAL: Integration example with Stagehand (browserbase) 3 | 4 | This example shows how to combine browser-use with Stagehand for advanced browser automation. 5 | Note: This requires the stagehand-py library to be installed separately: 6 | pip install stagehand-py 7 | 8 | The exact API may vary depending on the stagehand-py version. 
9 | Please refer to the official Stagehand documentation for the latest usage: 10 | https://pypi.org/project/stagehand-py/ 11 | https://github.com/browserbase/stagehand-python-examples/ 12 | """ 13 | 14 | import asyncio 15 | import os 16 | 17 | from dotenv import load_dotenv 18 | 19 | load_dotenv() 20 | 21 | from stagehand import Stagehand, StagehandConfig # type: ignore 22 | 23 | from browser_use.agent.service import Agent 24 | 25 | 26 | async def main(): 27 | # Configure Stagehand 28 | # https://pypi.org/project/stagehand-py/ 29 | # https://github.com/browserbase/stagehand-python-examples/blob/main/agent_example.py 30 | # Note: This example requires the stagehand-py library to be installed 31 | # pip install stagehand-py 32 | 33 | # Create StagehandConfig with correct parameters 34 | # The exact parameters depend on the stagehand-py version 35 | config = StagehandConfig( # type: ignore 36 | apiKey=os.getenv('BROWSERBASE_API_KEY'), 37 | projectId=os.getenv('BROWSERBASE_PROJECT_ID'), 38 | ) 39 | 40 | # Create a Stagehand client using the configuration object. 41 | stagehand = Stagehand( 42 | config=config, 43 | model_api_key=os.getenv('OPENAI_API_KEY'), 44 | # server_url=os.getenv('STAGEHAND_SERVER_URL'), 45 | ) 46 | 47 | # Initialize - this creates a new session automatically. 48 | await stagehand.init() 49 | print(f'\nCreated new session: {stagehand.session_id}') 50 | print(f'🌐 View your live browser: https://www.browserbase.com/sessions/{stagehand.session_id}') 51 | 52 | # Check if stagehand has a page attribute 53 | if hasattr(stagehand, 'page') and stagehand.page: 54 | await stagehand.page.goto('https://google.com/') 55 | await stagehand.page.act('search for openai') 56 | else: 57 | print('Warning: Stagehand page not available') 58 | 59 | # Combine with Browser Use 60 | agent = Agent(task='click the first result', page=stagehand.page) # type: ignore 61 | await agent.run() 62 | 63 | # go back and forth 64 | await stagehand.page.act('open the 3 first links on the page in new tabs') # type: ignore 65 | 66 | await Agent(task='click the first result', page=stagehand.page).run() # type: ignore 67 | 68 | 69 | if __name__ == '__main__': 70 | asyncio.run(main()) 71 | -------------------------------------------------------------------------------- /utils/browser-use/examples/features/pause_agent.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | import threading 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from browser_use import Agent 13 | from browser_use.llm import ChatOpenAI 14 | 15 | 16 | class AgentController: 17 | def __init__(self): 18 | llm = ChatOpenAI(model='gpt-4.1') 19 | self.agent = Agent( 20 | task='open in one action https://www.google.com, https://www.wikipedia.org, https://www.youtube.com, https://www.github.com, https://amazon.com', 21 | llm=llm, 22 | ) 23 | self.running = False 24 | 25 | async def run_agent(self): 26 | """Run the agent""" 27 | self.running = True 28 | await self.agent.run() 29 | 30 | def start(self): 31 | """Start the agent in a separate thread""" 32 | loop = asyncio.new_event_loop() 33 | asyncio.set_event_loop(loop) 34 | loop.run_until_complete(self.run_agent()) 35 | 36 | def pause(self): 37 | """Pause the agent""" 38 | self.agent.pause() 39 | 40 | def resume(self): 41 | """Resume the agent""" 42 | self.agent.resume() 43 | 44 | def stop(self): 45 
| """Stop the agent""" 46 | self.agent.stop() 47 | self.running = False 48 | 49 | 50 | def print_menu(): 51 | print('\nAgent Control Menu:') 52 | print('1. Start') 53 | print('2. Pause') 54 | print('3. Resume') 55 | print('4. Stop') 56 | print('5. Exit') 57 | 58 | 59 | async def main(): 60 | controller = AgentController() 61 | agent_thread = None 62 | 63 | while True: 64 | print_menu() 65 | try: 66 | choice = input('Enter your choice (1-5): ') 67 | except KeyboardInterrupt: 68 | choice = '5' 69 | 70 | if choice == '1' and not agent_thread: 71 | print('Starting agent...') 72 | agent_thread = threading.Thread(target=controller.start) 73 | agent_thread.start() 74 | 75 | elif choice == '2': 76 | print('Pausing agent...') 77 | controller.pause() 78 | 79 | elif choice == '3': 80 | print('Resuming agent...') 81 | controller.resume() 82 | 83 | elif choice == '4': 84 | print('Stopping agent...') 85 | controller.stop() 86 | if agent_thread: 87 | agent_thread.join() 88 | agent_thread = None 89 | 90 | elif choice == '5': 91 | print('Exiting...') 92 | if controller.running: 93 | controller.stop() 94 | if agent_thread: 95 | agent_thread.join() 96 | break 97 | 98 | await asyncio.sleep(0.1) # Small delay to prevent CPU spinning 99 | 100 | 101 | if __name__ == '__main__': 102 | asyncio.run(main()) 103 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/observability.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Observability" 3 | description: "Trace Browser Use's agent execution steps and browser sessions" 4 | icon: "eye" 5 | --- 6 | 7 | ## Overview 8 | 9 | Browser Use has a native integration with [Laminar](https://lmnr.ai) - open-source platform for tracing, evals and labeling of AI agents. 10 | Read more about Laminar in the [Laminar docs](https://docs.lmnr.ai). 11 | 12 | 13 | Laminar excels at tracing browser agents by providing unified visibility into 14 | both browser session recordings and agent execution steps. 15 | 16 | 17 | ## Setup 18 | 19 | To setup Laminar, you need to install the `lmnr` package and set the `LMNR_PROJECT_API_KEY` environment variable. 20 | 21 | To get your project API key, you can either: 22 | 23 | - Register on [Laminar Cloud](https://lmnr.ai) and get the key from your project settings 24 | - Or spin up a local Laminar instance and get the key from the settings page 25 | 26 | ```bash 27 | pip install 'lmnr[all]' 28 | export LMNR_PROJECT_API_KEY= 29 | ``` 30 | 31 | ## Usage 32 | 33 | Then, you simply initialize the Laminar at the top of your project and both Browser Use and session recordings will be automatically traced. 34 | 35 | ```python {5-8} 36 | from browser_use.llm import ChatOpenAI 37 | from browser_use import Agent 38 | import asyncio 39 | 40 | from lmnr import Laminar, Instruments 41 | # this line auto-instruments Browser Use and any browser you use (local or remote) 42 | Laminar.initialize(project_api_key="...", disable_batch=True, disabled_instruments={Instruments.BROWSER_USE}) # you can also pass project api key here 43 | 44 | async def main(): 45 | agent = Agent( 46 | task="open google, search Laminar AI", 47 | llm=ChatOpenAI(model="gpt-4.1-mini"), 48 | ) 49 | result = await agent.run() 50 | print(result) 51 | 52 | asyncio.run(main()) 53 | ``` 54 | 55 | ## Viewing Traces 56 | 57 | You can view traces in the Laminar UI by going to the traces tab in your project. 
58 | When you select a trace, you can see both the browser session recording and the agent execution steps. 59 | 60 | Timeline of the browser session is synced with the agent execution steps, timeline highlights indicate the agent's current step synced with the browser session. 61 | In the trace view, you can also see the agent's current step, the tool it's using, and the tool's input and output. Tools are highlighted in the timeline with a yellow color. 62 | 63 | Laminar 64 | 65 | ## Laminar 66 | 67 | To learn more about tracing and evaluating your browser agents, check out the [Laminar docs](https://docs.lmnr.ai). 68 | -------------------------------------------------------------------------------- /utils/browser-use/examples/ui/streamlit_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | To use it, you'll need to install streamlit, and run with: 3 | 4 | python -m streamlit run streamlit_demo.py 5 | 6 | """ 7 | 8 | import asyncio 9 | import os 10 | import sys 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 13 | 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | import streamlit as st # type: ignore 19 | 20 | from browser_use import Agent 21 | from browser_use.browser import BrowserSession 22 | from browser_use.controller.service import Controller 23 | 24 | if os.name == 'nt': 25 | asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) 26 | 27 | 28 | # Function to get the LLM based on provider 29 | def get_llm(provider: str): 30 | if provider == 'anthropic': 31 | from browser_use.llm import ChatAnthropic 32 | 33 | api_key = os.getenv('ANTHROPIC_API_KEY') 34 | if not api_key: 35 | st.error('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.') 36 | st.stop() 37 | 38 | return ChatAnthropic(model='claude-3-5-sonnet-20240620', temperature=0.0) 39 | elif provider == 'openai': 40 | from browser_use.llm import ChatOpenAI 41 | 42 | api_key = os.getenv('OPENAI_API_KEY') 43 | if not api_key: 44 | st.error('Error: OPENAI_API_KEY is not set. Please provide a valid API key.') 45 | st.stop() 46 | 47 | return ChatOpenAI(model='gpt-4.1', temperature=0.0) 48 | else: 49 | st.error(f'Unsupported provider: {provider}') 50 | st.stop() 51 | return None # Never reached, but helps with type checking 52 | 53 | 54 | # Function to initialize the agent 55 | def initialize_agent(query: str, provider: str): 56 | llm = get_llm(provider) 57 | controller = Controller() 58 | browser_session = BrowserSession() 59 | 60 | return Agent( 61 | task=query, 62 | llm=llm, # type: ignore 63 | controller=controller, 64 | browser_session=browser_session, 65 | use_vision=True, 66 | max_actions_per_step=1, 67 | ), browser_session 68 | 69 | 70 | # Streamlit UI 71 | st.title('Automated Browser Agent with LLMs 🤖') 72 | 73 | query = st.text_input('Enter your query:', 'go to reddit and search for posts about browser-use') 74 | provider = st.radio('Select LLM Provider:', ['openai', 'anthropic'], index=0) 75 | 76 | if st.button('Run Agent'): 77 | st.write('Initializing agent...') 78 | agent, browser_session = initialize_agent(query, provider) 79 | 80 | async def run_agent(): 81 | with st.spinner('Running automation...'): 82 | await agent.run(max_steps=25) 83 | st.success('Task completed! 
🎉') 84 | 85 | asyncio.run(run_agent()) 86 | 87 | st.button('Close Browser', on_click=lambda: asyncio.run(browser_session.close())) 88 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/extract_pdf_content.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S uv run --script 2 | # /// script 3 | # requires-python = ">=3.11" 4 | # dependencies = ["browser-use", "mistralai"] 5 | # /// 6 | 7 | import os 8 | import sys 9 | 10 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 11 | 12 | from dotenv import load_dotenv 13 | 14 | load_dotenv() 15 | 16 | import asyncio 17 | import logging 18 | 19 | from mistralai import Mistral # type: ignore 20 | from pydantic import BaseModel, Field 21 | 22 | from browser_use import Agent, Controller 23 | from browser_use.agent.views import ActionResult 24 | from browser_use.browser.context import BrowserContext 25 | from browser_use.llm import ChatOpenAI 26 | 27 | if not os.getenv('OPENAI_API_KEY'): 28 | raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') 29 | 30 | if not os.getenv('MISTRAL_API_KEY'): 31 | raise ValueError('MISTRAL_API_KEY is not set. Please add it to your environment variables.') 32 | 33 | logger = logging.getLogger(__name__) 34 | 35 | controller = Controller() 36 | 37 | 38 | class PdfExtractParams(BaseModel): 39 | url: str = Field(description='URL to a PDF document') 40 | 41 | 42 | @controller.registry.action( 43 | 'Extract PDF Text', 44 | param_model=PdfExtractParams, 45 | ) 46 | def extract_mistral_ocr(params: PdfExtractParams, browser: BrowserContext) -> ActionResult: 47 | """ 48 | Process a PDF URL using Mistral OCR API and return the OCR response. 49 | 50 | Args: 51 | url: URL to a PDF document 52 | 53 | Returns: 54 | OCR response object from Mistral API 55 | """ 56 | api_key = os.getenv('MISTRAL_API_KEY') 57 | client = Mistral(api_key=api_key) 58 | 59 | response = client.ocr.process( 60 | model='mistral-ocr-latest', 61 | document={ 62 | 'type': 'document_url', 63 | 'document_url': params.url, 64 | }, 65 | include_image_base64=False, 66 | ) 67 | 68 | markdown = '\n\n'.join(f'### Page {i + 1}\n{response.pages[i].markdown}' for i in range(len(response.pages))) 69 | return ActionResult( 70 | extracted_content=markdown, 71 | include_in_memory=False, ## PDF content can be very large, so we don't include it in memory 72 | ) 73 | 74 | 75 | async def main(): 76 | agent = Agent( 77 | task=""" 78 | Objective: Navigate to the following URL, extract its contents using the Extract PDF Text action, and explain its historical significance. 
79 | 80 | URL: https://docs.house.gov/meetings/GO/GO00/20220929/115171/HHRG-117-GO00-20220929-SD010.pdf 81 | """, 82 | llm=ChatOpenAI(model='gpt-4.1'), 83 | controller=controller, 84 | ) 85 | result = await agent.run() 86 | logger.info(result) 87 | 88 | 89 | if __name__ == '__main__': 90 | asyncio.run(main()) 91 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/perplexity_search.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | import logging 12 | 13 | from pydantic import BaseModel 14 | 15 | from browser_use import ActionResult, Agent, Controller 16 | from browser_use.browser.profile import BrowserProfile 17 | from browser_use.llm import ChatOpenAI 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class Person(BaseModel): 23 | name: str 24 | email: str | None = None 25 | 26 | 27 | class PersonList(BaseModel): 28 | people: list[Person] 29 | 30 | 31 | PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY') 32 | if not PERPLEXITY_API_KEY: 33 | raise ValueError('PERPLEXITY_API_KEY is not set') 34 | 35 | controller = Controller(exclude_actions=['search_google'], output_model=PersonList) 36 | 37 | 38 | @controller.registry.action('Search the web for a specific query with perplexity') 39 | async def search_web(query: str): 40 | import httpx 41 | 42 | url = 'https://api.perplexity.ai/chat/completions' 43 | 44 | payload = { 45 | 'model': 'sonar', 46 | 'messages': [ 47 | {'role': 'system', 'content': 'Be precise and concise.'}, 48 | {'role': 'user', 'content': query}, 49 | ], 50 | } 51 | headers = {'Authorization': f'Bearer {PERPLEXITY_API_KEY}', 'Content-Type': 'application/json'} 52 | 53 | async with httpx.AsyncClient() as client: 54 | response = await client.post(url, json=payload, headers=headers) 55 | response.raise_for_status() 56 | response_json = response.json() 57 | content = response_json['choices'][0]['message']['content'] 58 | citations = response_json['citations'] 59 | output = f'{content}\n\nCitations:\n' + '\n'.join(citations) 60 | logger.info(output) 61 | return ActionResult(extracted_content=output, include_in_memory=True) 62 | 63 | 64 | names = [ 65 | 'Ruedi Aebersold', 66 | 'Bernd Bodenmiller', 67 | 'Eugene Demler', 68 | ] 69 | 70 | 71 | async def main(): 72 | task = 'use search_web with "find email address of the following ETH professor:" for each of the persons. 
Finally return the list with name and email if provided ' 73 | task += '\n' + '\n'.join(names) 74 | model = ChatOpenAI(model='gpt-4.1') 75 | browser_profile = BrowserProfile() 76 | agent = Agent(task=task, llm=model, controller=controller, browser_profile=browser_profile) 77 | 78 | history = await agent.run() 79 | 80 | result = history.final_result() 81 | if result: 82 | parsed: PersonList = PersonList.model_validate_json(result) 83 | 84 | for person in parsed.people: 85 | print(f'{person.name} - {person.email}') 86 | else: 87 | print('No result') 88 | 89 | 90 | if __name__ == '__main__': 91 | asyncio.run(main()) 92 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/tokens/views.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Any, TypeVar 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | from browser_use.llm.views import ChatInvokeUsage 7 | 8 | T = TypeVar('T', bound=BaseModel) 9 | 10 | 11 | class TokenUsageEntry(BaseModel): 12 | """Single token usage entry""" 13 | 14 | model: str 15 | timestamp: datetime 16 | usage: ChatInvokeUsage 17 | 18 | 19 | class TokenCostCalculated(BaseModel): 20 | """Token cost""" 21 | 22 | new_prompt_tokens: int 23 | new_prompt_cost: float 24 | 25 | prompt_read_cached_tokens: int | None 26 | prompt_read_cached_cost: float | None 27 | 28 | prompt_cached_creation_tokens: int | None 29 | prompt_cache_creation_cost: float | None 30 | """Anthropic only: The cost of creating the cache.""" 31 | 32 | completion_tokens: int 33 | completion_cost: float 34 | 35 | @property 36 | def prompt_cost(self) -> float: 37 | return self.new_prompt_cost + (self.prompt_read_cached_cost or 0) + (self.prompt_cache_creation_cost or 0) 38 | 39 | @property 40 | def total_cost(self) -> float: 41 | return ( 42 | self.new_prompt_cost 43 | + (self.prompt_read_cached_cost or 0) 44 | + (self.prompt_cache_creation_cost or 0) 45 | + self.completion_cost 46 | ) 47 | 48 | 49 | class ModelPricing(BaseModel): 50 | """Pricing information for a model""" 51 | 52 | model: str 53 | input_cost_per_token: float | None 54 | output_cost_per_token: float | None 55 | 56 | cache_read_input_token_cost: float | None 57 | cache_creation_input_token_cost: float | None 58 | 59 | max_tokens: int | None 60 | max_input_tokens: int | None 61 | max_output_tokens: int | None 62 | 63 | 64 | class CachedPricingData(BaseModel): 65 | """Cached pricing data with timestamp""" 66 | 67 | timestamp: datetime 68 | data: dict[str, Any] 69 | 70 | 71 | class ModelUsageStats(BaseModel): 72 | """Usage statistics for a single model""" 73 | 74 | model: str 75 | prompt_tokens: int = 0 76 | completion_tokens: int = 0 77 | total_tokens: int = 0 78 | cost: float = 0.0 79 | invocations: int = 0 80 | average_tokens_per_invocation: float = 0.0 81 | 82 | 83 | class ModelUsageTokens(BaseModel): 84 | """Usage tokens for a single model""" 85 | 86 | model: str 87 | prompt_tokens: int 88 | prompt_cached_tokens: int 89 | completion_tokens: int 90 | total_tokens: int 91 | 92 | 93 | class UsageSummary(BaseModel): 94 | """Summary of token usage and costs""" 95 | 96 | total_prompt_tokens: int 97 | total_prompt_cost: float 98 | 99 | total_prompt_cached_tokens: int 100 | total_prompt_cached_cost: float 101 | 102 | total_completion_tokens: int 103 | total_completion_cost: float 104 | total_tokens: int 105 | total_cost: float 106 | entry_count: int 107 | 108 | by_model: dict[str, ModelUsageStats] = 
Field(default_factory=dict) 109 | -------------------------------------------------------------------------------- /utils/browser-use/examples/use-cases/find_influencer_profiles.py: -------------------------------------------------------------------------------- 1 | """ 2 | Show how to use custom outputs. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | import json 9 | import os 10 | import sys 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 13 | 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | import httpx 19 | from pydantic import BaseModel 20 | 21 | from browser_use import Agent, Controller 22 | from browser_use.agent.views import ActionResult 23 | from browser_use.llm import ChatOpenAI 24 | 25 | 26 | class Profile(BaseModel): 27 | platform: str 28 | profile_url: str 29 | 30 | 31 | class Profiles(BaseModel): 32 | profiles: list[Profile] 33 | 34 | 35 | controller = Controller(exclude_actions=['search_google'], output_model=Profiles) 36 | BEARER_TOKEN = os.getenv('BEARER_TOKEN') 37 | 38 | if not BEARER_TOKEN: 39 | # use the api key for ask tessa 40 | # you can also use other apis like exa, xAI, perplexity, etc. 41 | raise ValueError('BEARER_TOKEN is not set - go to https://www.heytessa.ai/ and create an api key') 42 | 43 | 44 | @controller.registry.action('Search the web for a specific query') 45 | async def search_web(query: str): 46 | keys_to_use = ['url', 'title', 'content', 'author', 'score'] 47 | headers = {'Authorization': f'Bearer {BEARER_TOKEN}'} 48 | async with httpx.AsyncClient() as client: 49 | response = await client.post( 50 | 'https://asktessa.ai/api/search', 51 | headers=headers, 52 | json={'query': query}, 53 | ) 54 | 55 | final_results = [ 56 | {key: source[key] for key in keys_to_use if key in source} 57 | for source in response.json()['sources'] 58 | if source['score'] >= 0.2 59 | ] 60 | # print(json.dumps(final_results, indent=4)) 61 | result_text = json.dumps(final_results, indent=4) 62 | print(result_text) 63 | return ActionResult(extracted_content=result_text, include_in_memory=True) 64 | 65 | 66 | async def main(): 67 | task = ( 68 | 'Go to this tiktok video url, open it and extract the @username from the resulting url. Then do a websearch for this username to find all his social media profiles. Return me the links to the social media profiles with the platform name.'
69 | ' https://www.tiktokv.com/share/video/7470981717659110678/ ' 70 | ) 71 | model = ChatOpenAI(model='gpt-4.1') 72 | agent = Agent(task=task, llm=model, controller=controller) 73 | 74 | history = await agent.run() 75 | 76 | result = history.final_result() 77 | if result: 78 | parsed: Profiles = Profiles.model_validate_json(result) 79 | 80 | for profile in parsed.profiles: 81 | print('\n--------------------------------') 82 | print(f'Platform: {profile.platform}') 83 | print(f'Profile URL: {profile.profile_url}') 84 | 85 | else: 86 | print('No result') 87 | 88 | 89 | if __name__ == '__main__': 90 | asyncio.run(main()) 91 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/llm/ollama/chat.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, TypeVar, overload 3 | 4 | import httpx 5 | from ollama import AsyncClient as OllamaAsyncClient 6 | from pydantic import BaseModel 7 | 8 | from browser_use.llm.base import BaseChatModel 9 | from browser_use.llm.exceptions import ModelProviderError 10 | from browser_use.llm.messages import BaseMessage 11 | from browser_use.llm.ollama.serializer import OllamaMessageSerializer 12 | from browser_use.llm.views import ChatInvokeCompletion 13 | 14 | T = TypeVar('T', bound=BaseModel) 15 | 16 | 17 | @dataclass 18 | class ChatOllama(BaseChatModel): 19 | """ 20 | A wrapper around Ollama's chat model. 21 | """ 22 | 23 | model: str 24 | 25 | # # Model params 26 | # temperature: float | None = None 27 | 28 | # Client initialization parameters 29 | host: str | None = None 30 | timeout: float | httpx.Timeout | None = None 31 | client_params: dict[str, Any] | None = None 32 | 33 | # Static 34 | @property 35 | def provider(self) -> str: 36 | return 'ollama' 37 | 38 | def _get_client_params(self) -> dict[str, Any]: 39 | """Prepare client parameters dictionary.""" 40 | return { 41 | 'host': self.host, 42 | 'timeout': self.timeout, 43 | 'client_params': self.client_params, 44 | } 45 | 46 | def get_client(self) -> OllamaAsyncClient: 47 | """ 48 | Returns an OllamaAsyncClient client. 49 | """ 50 | return OllamaAsyncClient(host=self.host, timeout=self.timeout, **self.client_params or {}) 51 | 52 | @property 53 | def name(self) -> str: 54 | return self.model 55 | 56 | @overload 57 | async def ainvoke(self, messages: list[BaseMessage], output_format: None = None) -> ChatInvokeCompletion[str]: ... 58 | 59 | @overload 60 | async def ainvoke(self, messages: list[BaseMessage], output_format: type[T]) -> ChatInvokeCompletion[T]: ... 
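	# The two overloads above only describe typing: calling `ainvoke` without
	# `output_format` returns the completion as a plain string, while passing a
	# Pydantic model class requests JSON constrained to that model's schema (the
	# `format=schema` call below) and returns a validated instance of the model.
	#
	# Illustrative usage sketch (the model name and fields here are assumptions):
	#
	#   class Answer(BaseModel):
	#       city: str
	#
	#   llm = ChatOllama(model='llama3.1')
	#   text = await llm.ainvoke(messages)                          # ChatInvokeCompletion[str]
	#   parsed = await llm.ainvoke(messages, output_format=Answer)  # ChatInvokeCompletion[Answer]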
61 | 62 | async def ainvoke( 63 | self, messages: list[BaseMessage], output_format: type[T] | None = None 64 | ) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]: 65 | ollama_messages = OllamaMessageSerializer.serialize_messages(messages) 66 | 67 | try: 68 | if output_format is None: 69 | response = await self.get_client().chat( 70 | model=self.model, 71 | messages=ollama_messages, 72 | ) 73 | 74 | return ChatInvokeCompletion(completion=response.message.content or '', usage=None) 75 | else: 76 | schema = output_format.model_json_schema() 77 | 78 | response = await self.get_client().chat( 79 | model=self.model, 80 | messages=ollama_messages, 81 | format=schema, 82 | ) 83 | 84 | completion = response.message.content or '' 85 | if output_format is not None: 86 | completion = output_format.model_validate_json(completion) 87 | 88 | return ChatInvokeCompletion(completion=completion, usage=None) 89 | 90 | except Exception as e: 91 | raise ModelProviderError(message=str(e), model=self.name) from e 92 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/dom/clickable_element_processor/service.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from browser_use.dom.views import DOMElementNode 4 | 5 | 6 | class ClickableElementProcessor: 7 | @staticmethod 8 | def get_clickable_elements_hashes(dom_element: DOMElementNode) -> set[str]: 9 | """Get all clickable elements in the DOM tree""" 10 | clickable_elements = ClickableElementProcessor.get_clickable_elements(dom_element) 11 | return {ClickableElementProcessor.hash_dom_element(element) for element in clickable_elements} 12 | 13 | @staticmethod 14 | def get_clickable_elements(dom_element: DOMElementNode) -> list[DOMElementNode]: 15 | """Get all clickable elements in the DOM tree""" 16 | clickable_elements = list() 17 | for child in dom_element.children: 18 | if isinstance(child, DOMElementNode): 19 | if child.highlight_index: 20 | clickable_elements.append(child) 21 | 22 | clickable_elements.extend(ClickableElementProcessor.get_clickable_elements(child)) 23 | 24 | return list(clickable_elements) 25 | 26 | @staticmethod 27 | def hash_dom_element(dom_element: DOMElementNode) -> str: 28 | parent_branch_path = ClickableElementProcessor._get_parent_branch_path(dom_element) 29 | branch_path_hash = ClickableElementProcessor._parent_branch_path_hash(parent_branch_path) 30 | attributes_hash = ClickableElementProcessor._attributes_hash(dom_element.attributes) 31 | xpath_hash = ClickableElementProcessor._xpath_hash(dom_element.xpath) 32 | # text_hash = DomTreeProcessor._text_hash(dom_element) 33 | 34 | return ClickableElementProcessor._hash_string(f'{branch_path_hash}-{attributes_hash}-{xpath_hash}') 35 | 36 | @staticmethod 37 | def _get_parent_branch_path(dom_element: DOMElementNode) -> list[str]: 38 | parents: list[DOMElementNode] = [] 39 | current_element: DOMElementNode = dom_element 40 | while current_element.parent is not None: 41 | parents.append(current_element) 42 | current_element = current_element.parent 43 | 44 | parents.reverse() 45 | 46 | return [parent.tag_name for parent in parents] 47 | 48 | @staticmethod 49 | def _parent_branch_path_hash(parent_branch_path: list[str]) -> str: 50 | parent_branch_path_string = '/'.join(parent_branch_path) 51 | return hashlib.sha256(parent_branch_path_string.encode()).hexdigest() 52 | 53 | @staticmethod 54 | def _attributes_hash(attributes: dict[str, str]) -> str: 55 | attributes_string = 
''.join(f'{key}={value}' for key, value in attributes.items()) 56 | return ClickableElementProcessor._hash_string(attributes_string) 57 | 58 | @staticmethod 59 | def _xpath_hash(xpath: str) -> str: 60 | return ClickableElementProcessor._hash_string(xpath) 61 | 62 | @staticmethod 63 | def _text_hash(dom_element: DOMElementNode) -> str: 64 | """ """ 65 | text_string = dom_element.get_all_text_till_next_clickable_element() 66 | return ClickableElementProcessor._hash_string(text_string) 67 | 68 | @staticmethod 69 | def _hash_string(string: str) -> str: 70 | return hashlib.sha256(string.encode()).hexdigest() 71 | -------------------------------------------------------------------------------- /utils/browser-use/docs/development/contribution-guide.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Contribution Guide" 3 | description: "Learn how to contribute to Browser Use" 4 | icon: "github" 5 | --- 6 | 7 | # Join the Browser Use Community! 8 | 9 | We're thrilled you're interested in contributing to Browser Use! This guide will help you get started with contributing to our project. Your contributions are what make the open-source community such an amazing place to learn, inspire, and create. 10 | 11 | ## Quick Setup 12 | 13 | Get started with Browser Use development in minutes: 14 | 15 | ```bash 16 | git clone https://github.com/browser-use/browser-use 17 | cd browser-use 18 | uv sync --all-extras --dev 19 | # or pip install -U git+https://github.com/browser-use/browser-use.git@main 20 | 21 | echo "BROWSER_USE_LOGGING_LEVEL=debug" >> .env 22 | ``` 23 | 24 | For more detailed setup instructions, see our [Local Setup Guide](/development/local-setup). 25 | 26 | ## How to Contribute 27 | 28 | ### Find Something to Work On 29 | 30 | - Browse our [GitHub Issues](https://github.com/browser-use/browser-use/issues) for beginner-friendly issues labeled `good-first-issue` 31 | - Check out our most active issues or ask in [Discord](https://discord.gg/zXJJHtJf3k) for ideas of what to work on 32 | - Get inspiration and share what you build in the [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel 33 | - Explore or contribute to [`awesome-browser-use-prompts`](https://github.com/browser-use/awesome-prompts)! 34 | 35 | ### Making a Great Pull Request 36 | 37 | When submitting a pull request, please: 38 | 39 | - Include a clear description of what the PR does and why it's needed 40 | - Add tests that cover your changes 41 | - Include a demo screenshot/gif or an example script demonstrating your changes 42 | - Make sure the PR passes all CI checks and tests 43 | - Keep your PR focused on a single issue or feature to make it easier to review 44 | 45 | Note: We appreciate quality over quantity. Instead of submitting small typo/style-only PRs, consider including those fixes as part of larger bugfix or feature PRs. 46 | 47 | ### Contribution Process 48 | 49 | 1. Fork the repository 50 | 2. Create a new branch for your feature or bugfix 51 | 3. Make your changes 52 | 4. Run tests to ensure everything works 53 | 5. Submit a pull request 54 | 6. Respond to any feedback from maintainers 55 | 7. Celebrate your contribution! 56 | 57 | Feel free to bump your issues/PRs with comments periodically if you need faster feedback. 58 | 59 | ## Code of Conduct 60 | 61 | We're committed to providing a welcoming and inclusive environment for all contributors. Please be respectful and constructive in all interactions. 
62 | 63 | ## Getting Help 64 | 65 | If you need help at any point: 66 | 67 | - Join our [Discord community](https://link.browser-use.com/discord) 68 | - Ask questions in the appropriate GitHub issue 69 | - Check our [documentation](/introduction) 70 | 71 | We're here to help you succeed in contributing to Browser Use! 72 | -------------------------------------------------------------------------------- /utils/browser-use/examples/custom-functions/file_upload.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import sys 5 | from pathlib import Path 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from lmnr import Laminar 14 | 15 | try: 16 | Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY')) 17 | except Exception: 18 | pass 19 | 20 | from browser_use import Agent, Controller 21 | from browser_use.agent.views import ActionResult 22 | from browser_use.browser import BrowserSession 23 | from browser_use.llm import ChatOpenAI 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | controller = Controller() 28 | 29 | 30 | @controller.action('Upload file to interactive element with file path') 31 | async def upload_file(index: int, path: str, browser_session: BrowserSession, available_file_paths: list[str]): 32 | if path not in available_file_paths: 33 | return ActionResult(error=f'File path {path} is not available') 34 | 35 | if not os.path.exists(path): 36 | return ActionResult(error=f'File {path} does not exist') 37 | 38 | file_upload_dom_el = await browser_session.find_file_upload_element_by_index(index, max_height=3, max_descendant_depth=3) 39 | 40 | if file_upload_dom_el is None: 41 | msg = f'No file upload element found at index {index}' 42 | logger.info(msg) 43 | return ActionResult(error=msg) 44 | 45 | file_upload_el = await browser_session.get_locate_element(file_upload_dom_el) 46 | 47 | if file_upload_el is None: 48 | msg = f'No file upload element found at index {index}' 49 | logger.info(msg) 50 | return ActionResult(error=msg) 51 | 52 | try: 53 | await file_upload_el.set_input_files(path) 54 | msg = f'Successfully uploaded file to index {index}' 55 | logger.info(msg) 56 | return ActionResult(extracted_content=msg, include_in_memory=True) 57 | except Exception as e: 58 | msg = f'Failed to upload file to index {index}: {str(e)}' 59 | logger.info(msg) 60 | return ActionResult(error=msg) 61 | 62 | 63 | def create_file(file_type: str = 'txt'): 64 | with open(f'tmp.{file_type}', 'w') as f: 65 | f.write('test') 66 | file_path = Path.cwd() / f'tmp.{file_type}' 67 | logger.info(f'Created file: {file_path}') 68 | return str(file_path) 69 | 70 | 71 | async def main(): 72 | task = 'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields' 73 | task = 'Go to https://www.freepdfconvert.com/, upload the file tmp.pdf into the field choose a file - dont click the fileupload button' 74 | available_file_paths = [create_file('txt'), create_file('pdf'), create_file('csv')] 75 | 76 | model = ChatOpenAI(model='gpt-4.1-mini') 77 | agent = Agent( 78 | task=task, 79 | llm=model, 80 | controller=controller, 81 | available_file_paths=available_file_paths, 82 | ) 83 | 84 | await agent.run() 85 | 86 | input('Press Enter to close...') 87 | 88 | 89 | if __name__ == '__main__': 90 | asyncio.run(main()) 91 | 
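# Note on the custom `upload_file` action above: `index` and `path` are supplied by
# the LLM's tool call, while `browser_session` and `available_file_paths` are
# injected by browser-use's action registry because the Agent is constructed with
# `available_file_paths=[...]`. Checking `path` against that allow-list is what
# prevents the agent from uploading arbitrary local files to the page.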
-------------------------------------------------------------------------------- /utils/browser-use/.cursor/rules/browser-use-rules.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: 4 | alwaysApply: true 5 | --- 6 | ## 🧠 General Guidelines for Contributing to `browser-use` 7 | 8 | **Browser-Use** is an AI agent that autonomously interacts with the web. It takes a user-defined task, navigates web pages using Chromium via Playwright, processes HTML, and repeatedly queries a language model (like `gpt-4o`) to decide the next action—until the task is completed. 9 | 10 | ### 🗂️ File Documentation 11 | 12 | When you create a **new file**: 13 | 14 | * **For humans**: At the top of the file, include a docstring in natural language explaining: 15 | 16 | * What this file does. 17 | * How it fits into the browser-use system. 18 | * If it introduces a new abstraction or replaces an old one. 19 | * **For LLMs/AI**: Include structured metadata using standardized comments such as: 20 | 21 | ```python 22 | # @file purpose: Defines 23 | ``` 24 | 25 | --- 26 | 27 | ### 🧰 Development Rules 28 | 29 | * ✅ **Always use [`uv`](mdc:https:/github.com/astral-sh/uv) instead of `pip`** 30 | For deterministic and fast dependency installs. 31 | 32 | ```bash 33 | uv venv --python 3.11 34 | source .venv/bin/activate 35 | uv sync 36 | ``` 37 | 38 | * ✅ **Use real model names** 39 | Do **not** replace `gpt-4o` with `gpt-4`. The model `gpt-4o` is a distinct release and supported. 40 | 41 | * ✅ **Type-safe coding** 42 | Use **Pydantic v2 models** for all internal action schemas, task inputs/outputs, and controller I/O. This ensures robust validation and LLM-call integrity. 43 | 44 | --- 45 | 46 | ## ⚙️ Adding New Actions 47 | 48 | To add a new action that your browser agent can execute: 49 | 50 | ```python 51 | from playwright.async_api import Page 52 | from browser_use.core.controller import Controller, ActionResult 53 | 54 | controller = Controller() 55 | 56 | @controller.registry.action("Search the web for a specific query") 57 | async def search_web(query: str, page: Page): 58 | # Implement your logic here, e.g., query a search engine and return results 59 | result = ... 60 | return ActionResult(extracted_content=result, include_in_memory=True) 61 | ``` 62 | 63 | ### Notes: 64 | 65 | * Use descriptive names and docstrings for each action. 66 | * Prefer returning `ActionResult` with structured content to help the agent reason better. 67 | 68 | --- 69 | 70 | ## 🧠 Creating and Running an Agent 71 | 72 | To define a task and run a browser-use agent: 73 | 74 | ```python 75 | from browser_use import Agent 76 | from browser_use.llm import ChatOpenAI 77 | 78 | task = "Find the CEO of OpenAI and return their name" 79 | model = ChatOpenAI(model="gpt-4.1-mini") 80 | 81 | agent = Agent(task=task, llm=model, controller=controller) 82 | 83 | history = await agent.run() 84 | ``` 85 | 86 | # Never create random examples 87 | 88 | When I ask you to implement a feature never create new files that show off that feature -> the code just gets messy. If you do anything to test it out, just do the inline code inside the terminal (if you want). 
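To connect the Pydantic-v2 guidance back to the "Adding New Actions" section above, here is a minimal sketch of a typed action. The `param_model=` keyword mirrors `examples/custom-functions/extract_pdf_content.py`; the action name, model, and fields are purely illustrative:

```python
from playwright.async_api import Page
from pydantic import BaseModel, Field

from browser_use import ActionResult, Controller

controller = Controller()


class ScrollParams(BaseModel):
    pixels: int = Field(default=500, description='How many pixels to scroll down')


@controller.registry.action('Scroll the page down by a number of pixels', param_model=ScrollParams)
async def scroll_down(params: ScrollParams, page: Page):
    # Inputs arrive pre-validated as a Pydantic model; `page` is injected by the registry.
    await page.evaluate(f'window.scrollBy(0, {params.pixels})')
    return ActionResult(extracted_content=f'Scrolled down {params.pixels}px', include_in_memory=True)
```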
89 | -------------------------------------------------------------------------------- /utils/browser-use/tests/ci/test_schema_optimizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the SchemaOptimizer to ensure it correctly processes and 3 | optimizes the schemas for agent actions without losing information. 4 | """ 5 | 6 | from pydantic import BaseModel 7 | 8 | from browser_use.agent.views import AgentOutput 9 | from browser_use.controller.service import Controller 10 | from browser_use.llm.schema import SchemaOptimizer 11 | 12 | 13 | class ProductInfo(BaseModel): 14 | """A sample structured output model with multiple fields.""" 15 | 16 | price: str 17 | title: str 18 | rating: float | None = None 19 | 20 | 21 | def test_optimizer_preserves_all_fields_in_structured_done_action(): 22 | """ 23 | Ensures the SchemaOptimizer does not drop fields from a custom structured 24 | output model when creating the schema for the 'done' action. 25 | 26 | This test specifically checks for a bug where fields were being lost 27 | during the optimization process. 28 | """ 29 | # 1. Setup a controller with a custom output model, simulating an Agent 30 | # being created with an `output_model_schema`. 31 | controller = Controller(output_model=ProductInfo) 32 | 33 | # 2. Get the dynamically created AgentOutput model, which includes all registered actions. 34 | ActionModel = controller.registry.create_action_model() 35 | agent_output_model = AgentOutput.type_with_custom_actions(ActionModel) 36 | 37 | # 3. Run the schema optimizer on the agent's output model. 38 | optimized_schema = SchemaOptimizer.create_optimized_json_schema(agent_output_model) 39 | 40 | # 4. Find the 'done' action schema within the optimized output. 41 | # The path is properties -> action -> items -> anyOf -> [schema with 'done']. 42 | done_action_schema = None 43 | actions_schemas = optimized_schema.get('properties', {}).get('action', {}).get('items', {}).get('anyOf', []) 44 | for action_schema in actions_schemas: 45 | if 'done' in action_schema.get('properties', {}): 46 | done_action_schema = action_schema 47 | break 48 | 49 | # 5. Assert that the 'done' action schema was successfully found. 50 | assert done_action_schema is not None, "Could not find 'done' action in the optimized schema." 51 | 52 | # 6. Navigate to the schema for our custom data model within the 'done' action. 53 | # The path is properties -> done -> properties -> data -> properties. 54 | done_params_schema = done_action_schema.get('properties', {}).get('done', {}) 55 | structured_data_schema = done_params_schema.get('properties', {}).get('data', {}) 56 | final_properties = structured_data_schema.get('properties', {}) 57 | 58 | # 7. Assert that the set of fields in the optimized schema matches the original model's fields. 
59 | original_fields = set(ProductInfo.model_fields.keys()) 60 | optimized_fields = set(final_properties.keys()) 61 | 62 | assert original_fields == optimized_fields, ( 63 | f"Field mismatch between original and optimized structured 'done' action schema.\n" 64 | f'Missing from optimized: {original_fields - optimized_fields}\n' 65 | f'Unexpected in optimized: {optimized_fields - original_fields}' 66 | ) 67 | -------------------------------------------------------------------------------- /utils/browser-use/browser_use/agent/message_manager/views.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from pydantic import BaseModel, ConfigDict, Field 6 | 7 | from browser_use.llm.messages import ( 8 | BaseMessage, 9 | UserMessage, 10 | ) 11 | 12 | if TYPE_CHECKING: 13 | pass 14 | 15 | 16 | class HistoryItem(BaseModel): 17 | """Represents a single agent history item with its data and string representation""" 18 | 19 | step_number: int | None = None 20 | evaluation_previous_goal: str | None = None 21 | memory: str | None = None 22 | next_goal: str | None = None 23 | action_results: str | None = None 24 | error: str | None = None 25 | system_message: str | None = None 26 | 27 | model_config = ConfigDict(arbitrary_types_allowed=True) 28 | 29 | def model_post_init(self, __context) -> None: 30 | """Validate that error and system_message are not both provided""" 31 | if self.error is not None and self.system_message is not None: 32 | raise ValueError('Cannot have both error and system_message at the same time') 33 | 34 | def to_string(self) -> str: 35 | """Get string representation of the history item""" 36 | step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown' 37 | 38 | if self.error: 39 | return f"""<{step_str}> 40 | {self.error} 41 | """ 42 | elif self.system_message: 43 | return f""" 44 | {self.system_message} 45 | """ 46 | else: 47 | content_parts = [ 48 | f'Evaluation of Previous Step: {self.evaluation_previous_goal}', 49 | f'Memory: {self.memory}', 50 | f'Next Goal: {self.next_goal}', 51 | ] 52 | 53 | if self.action_results: 54 | content_parts.append(self.action_results) 55 | 56 | content = '\n'.join(content_parts) 57 | 58 | return f"""<{step_str}> 59 | {content} 60 | """ 61 | 62 | 63 | class MessageHistory(BaseModel): 64 | """History of messages""" 65 | 66 | messages: list[BaseMessage] = Field(default_factory=list) 67 | 68 | model_config = ConfigDict(arbitrary_types_allowed=True) 69 | 70 | def add_message(self, message: BaseMessage, position: int | None = None) -> None: 71 | """Add message to history""" 72 | if position is None: 73 | self.messages.append(message) 74 | else: 75 | self.messages.insert(position, message) 76 | 77 | def get_messages(self) -> list[BaseMessage]: 78 | """Get all messages""" 79 | return self.messages 80 | 81 | def remove_last_state_message(self) -> None: 82 | """Remove last state message from history""" 83 | if len(self.messages) > 2 and isinstance(self.messages[-1], UserMessage): 84 | self.messages.pop() 85 | 86 | 87 | class MessageManagerState(BaseModel): 88 | """Holds the state for MessageManager""" 89 | 90 | history: MessageHistory = Field(default_factory=MessageHistory) 91 | tool_id: int = 1 92 | agent_history_items: list[HistoryItem] = Field( 93 | default_factory=lambda: [HistoryItem(step_number=0, system_message='Agent initialized')] 94 | ) 95 | read_state_description: str = '' 96 | 97 | model_config = 
--------------------------------------------------------------------------------
/utils/browser-use/examples/ui/command_line.py:
--------------------------------------------------------------------------------
"""
Usage:

Example 1: Using OpenAI (the default provider) with the default task: 'go to reddit and search for posts about browser-use'
    python command_line.py

Example 2: Using OpenAI with a custom query
    python command_line.py --query "go to google and search for browser-use"

Example 3: Using Anthropic's Claude model with a custom query
    python command_line.py --query "find latest Python tutorials on Medium" --provider anthropic
"""

import argparse
import asyncio
import os
import sys

# Ensure the local repository (browser_use) is importable when running this example directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from dotenv import load_dotenv

load_dotenv()

from browser_use import Agent
from browser_use.browser import BrowserSession
from browser_use.controller.service import Controller


def get_llm(provider: str):
    if provider == 'anthropic':
        from browser_use.llm import ChatAnthropic

        api_key = os.getenv('ANTHROPIC_API_KEY')
        if not api_key:
            raise ValueError('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.')

        return ChatAnthropic(model='claude-3-5-sonnet-20240620', temperature=0.0)
    elif provider == 'openai':
        from browser_use.llm import ChatOpenAI

        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError('Error: OPENAI_API_KEY is not set. Please provide a valid API key.')

        return ChatOpenAI(model='gpt-4.1', temperature=0.0)
    else:
        raise ValueError(f'Unsupported provider: {provider}')


def parse_arguments():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description='Automate browser tasks using an LLM agent.')
    parser.add_argument(
        '--query', type=str, help='The query to process', default='go to reddit and search for posts about browser-use'
    )
    parser.add_argument(
        '--provider',
        type=str,
        choices=['openai', 'anthropic'],
        default='openai',
        help='The model provider to use (default: openai)',
    )
    return parser.parse_args()


def initialize_agent(query: str, provider: str):
    """Initialize the browser agent with the given query and provider."""
    llm = get_llm(provider)
    controller = Controller()
    browser_session = BrowserSession()

    return Agent(
        task=query,
        llm=llm,
        controller=controller,
        browser_session=browser_session,
        use_vision=True,
        max_actions_per_step=1,
    ), browser_session


async def main():
    """Main async function to run the agent."""
    args = parse_arguments()
    agent, browser_session = initialize_agent(args.query, args.provider)

    await agent.run(max_steps=25)

    input('Press Enter to close the browser...')
    await browser_session.close()


if __name__ == '__main__':
    asyncio.run(main())
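
get_llm() maps the --provider flag to a chat model class, so supporting another provider means adding a branch like the two above and extending the choices list in parse_arguments(). A hedged sketch for a Google/Gemini variant, assuming the vendored browser_use exposes ChatGoogle (as examples/models/gemini.py in this tree suggests); the model name and the GOOGLE_API_KEY variable are illustrative placeholders:

import os

def get_google_llm():
    # Hypothetical third provider, mirroring the branches in get_llm() above.
    # 'google' would also need to be added to the --provider choices.
    from browser_use.llm import ChatGoogle

    if not os.getenv('GOOGLE_API_KEY'):
        raise ValueError('Error: GOOGLE_API_KEY is not set. Please provide a valid API key.')

    return ChatGoogle(model='gemini-2.0-flash-exp')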
| "max_images": 70, 7 | "video_fps": 1, 8 | "max_video_frames": 64, 9 | "timeout": 300, 10 | "max_single_image_size": 1000, 11 | }, 12 | "glm": { 13 | "max_tokens": 16384, 14 | "max_images": 20, 15 | "video_fps": 1, 16 | "max_video_frames": 20, 17 | "timeout": 600 18 | }, 19 | "qwen": { 20 | "max_tokens": 4096, 21 | "max_images": 20, 22 | "video_fps": 1, 23 | "max_video_frames": 20, 24 | "timeout": 300, 25 | }, 26 | "gpt-4o-2024-05-13": { 27 | "max_tokens": 4096, 28 | "max_images": 70, 29 | "video_fps": 1, 30 | "max_video_frames": 64, 31 | "timeout": 600 32 | }, 33 | "gpt-5": { 34 | "max_tokens": 16384, 35 | "max_images": 70, 36 | "video_fps": 1, 37 | "max_video_frames": 64, 38 | "timeout": 600 39 | }, 40 | "doubao-seed-1-6-thinking-250615": { 41 | "max_tokens": 16384, 42 | "max_images": 70, 43 | "video_fps": 1, 44 | "max_video_frames": 64, 45 | "timeout": 600 46 | }, 47 | "gpt-4.1-2025-04-14":{ 48 | "max_tokens": 16384, 49 | "max_images": 70, 50 | "video_fps": 1, 51 | "max_video_frames": 64, 52 | "timeout": 600 53 | }, 54 | "default": { 55 | "max_tokens": 16384, 56 | "max_images": 70, 57 | "video_fps": 1, 58 | "max_video_frames": 64, 59 | "timeout": 300 60 | } 61 | } 62 | 63 | 64 | PROMPT_TEMPLATE = """ 65 | You are an expert front-end developer. Your task is to create a pixel-perfect replica of a website from a video. 66 | Generate a single `index.html` file that contains all HTML, CSS, and JavaScript necessary to replicate the UI, content, and interaction features shown. The webpage resolution in the video is {resolution}. 67 | 68 | Instructions: 69 | 1. Single File Output: All HTML, CSS, and JS must be in one `index.html` file. 70 | 2. If backend logic is implied, mock it in JS with static data (e.g., a JS array for a fake API call). 71 | 3. For all clickable elements, please add the class name "btn" in the HTML source code so that the evaluation agent can perform click evaluation. 72 | 4. Assets(Images and Videos in the webpage): 73 | - All images must use the provided stitched image assets. 74 | - The `src` attribute must start with the literal, unchanging string `__PLACEHOLDER_ASSETS_BASE_DIR__/`, followed by the actual filename identified from the stitched image. 75 | - For example: `src="__PLACEHOLDER_ASSETS_BASE_DIR__/logo.svg"`. 76 | - `` tags must include `width` and `height` attributes. 77 | - The provided stitched image assets are before the video. 78 | 5. No External Dependencies: The generated code must be entirely self-contained. No External Libraries and no External Fonts. 79 | 6. Final Response: Return **only the complete HTML code** in a single ```html code block, with no additional text or explanations. 80 | """ 81 | --------------------------------------------------------------------------------