├── .cursor └── rules │ └── browser-use-rules.mdc ├── .dockerignore ├── .env.example ├── .gitattributes ├── .github ├── .git-blame-ignore-revs ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── 1_element_detection_bug.yml │ ├── 2_bug_report.yml │ ├── 3_feature_request.yml │ ├── 4_docs_issue.yml │ └── config.yml ├── SECURITY.md └── workflows │ ├── build-base-image.yml.disabled │ ├── claude.yml │ ├── cloud_evals.yml │ ├── docker.yml │ ├── lint.yml │ ├── package.yaml │ ├── publish.yml │ ├── stale-bot.yml │ └── test.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CLAUDE.md ├── Dockerfile ├── Dockerfile.fast ├── LICENSE ├── README.md ├── bin ├── lint.sh ├── setup.sh └── test.sh ├── browser_use ├── README.md ├── __init__.py ├── agent │ ├── cloud_events.py │ ├── gif.py │ ├── message_manager │ │ ├── service.py │ │ ├── utils.py │ │ └── views.py │ ├── prompts.py │ ├── service.py │ ├── system_prompt.md │ ├── system_prompt_flash.md │ ├── system_prompt_no_thinking.md │ └── views.py ├── browser │ ├── __init__.py │ ├── events.py │ ├── profile.py │ ├── python_highlights.py │ ├── session.py │ ├── views.py │ ├── watchdog_base.py │ └── watchdogs │ │ ├── __init__.py │ │ ├── aboutblank_watchdog.py │ │ ├── crash_watchdog.py │ │ ├── default_action_watchdog.py │ │ ├── dom_watchdog.py │ │ ├── downloads_watchdog.py │ │ ├── local_browser_watchdog.py │ │ ├── permissions_watchdog.py │ │ ├── popups_watchdog.py │ │ ├── screenshot_watchdog.py │ │ ├── security_watchdog.py │ │ └── storage_state_watchdog.py ├── cli.py ├── config.py ├── controller │ └── __init__.py ├── dom │ ├── enhanced_snapshot.py │ ├── playground │ │ ├── extraction.py │ │ ├── multi_act.py │ │ └── tree.py │ ├── serializer │ │ ├── clickable_elements.py │ │ └── serializer.py │ ├── service.py │ ├── utils.py │ └── views.py ├── exceptions.py ├── filesystem │ ├── __init__.py │ └── file_system.py ├── integrations │ └── gmail │ │ ├── __init__.py │ │ ├── actions.py │ │ └── service.py ├── llm │ ├── README.md │ ├── 
__init__.py │ ├── anthropic │ │ ├── chat.py │ │ └── serializer.py │ ├── aws │ │ ├── __init__.py │ │ ├── chat_anthropic.py │ │ ├── chat_bedrock.py │ │ └── serializer.py │ ├── azure │ │ └── chat.py │ ├── base.py │ ├── deepseek │ │ ├── chat.py │ │ └── serializer.py │ ├── exceptions.py │ ├── google │ │ ├── __init__.py │ │ ├── chat.py │ │ └── serializer.py │ ├── groq │ │ ├── chat.py │ │ ├── parser.py │ │ └── serializer.py │ ├── messages.py │ ├── models.py │ ├── ollama │ │ ├── chat.py │ │ └── serializer.py │ ├── openai │ │ ├── chat.py │ │ ├── like.py │ │ └── serializer.py │ ├── openrouter │ │ ├── chat.py │ │ └── serializer.py │ ├── schema.py │ ├── tests │ │ ├── test_anthropic_cache.py │ │ ├── test_chat_models.py │ │ ├── test_gemini_image.py │ │ ├── test_groq_loop.py │ │ └── test_single_step.py │ └── views.py ├── logging_config.py ├── mcp │ ├── .dxtignore │ ├── __init__.py │ ├── __main__.py │ ├── client.py │ ├── controller.py │ ├── manifest.json │ └── server.py ├── observability.py ├── py.typed ├── screenshots │ ├── __init__.py │ └── service.py ├── sync │ ├── __init__.py │ ├── auth.py │ └── service.py ├── telemetry │ ├── __init__.py │ ├── service.py │ └── views.py ├── tokens │ ├── __init__.py │ ├── service.py │ ├── tests │ │ └── test_cost.py │ └── views.py ├── tools │ ├── registry │ │ ├── service.py │ │ └── views.py │ ├── service.py │ └── views.py └── utils.py ├── docker ├── README.md ├── base-images │ ├── chromium │ │ └── Dockerfile │ ├── python-deps │ │ └── Dockerfile │ └── system │ │ └── Dockerfile └── build-base-images.sh ├── docs ├── README.md ├── cloud │ └── v1 │ │ ├── authentication.mdx │ │ ├── custom-sdk.mdx │ │ ├── implementation.mdx │ │ ├── n8n-browser-use-integration.mdx │ │ ├── pricing.mdx │ │ ├── quickstart.mdx │ │ ├── search.mdx │ │ └── webhooks.mdx ├── customize │ ├── agent │ │ ├── all-parameters.mdx │ │ ├── basics.mdx │ │ ├── output-format.mdx │ │ └── supported-models.mdx │ ├── browser │ │ ├── all-parameters.mdx │ │ ├── basics.mdx │ │ ├── real-browser.mdx 
│ │ └── remote.mdx │ ├── examples │ │ ├── chain-agents.mdx │ │ ├── fast-agent.mdx │ │ ├── more-examples.mdx │ │ ├── parallel-browser.mdx │ │ ├── prompting-guide.mdx │ │ ├── secure.mdx │ │ └── sensitive-data.mdx │ └── tools │ │ ├── add.mdx │ │ ├── available.mdx │ │ ├── basics.mdx │ │ ├── remove.mdx │ │ └── response.mdx ├── development.mdx ├── development │ ├── get-help.mdx │ ├── monitoring │ │ ├── observability.mdx │ │ └── telemetry.mdx │ ├── n8n-integration.mdx │ ├── roadmap.mdx │ └── setup │ │ ├── contribution-guide.mdx │ │ └── local-setup.mdx ├── docs.json ├── favicon.ico ├── favicon.svg ├── images │ ├── browser-use-banner-dark.png │ ├── browser-use-banner.png │ ├── checks-passed.png │ ├── cloud-banner-dark.png │ ├── cloud-banner-js.png │ ├── cloud-banner-python.png │ ├── cloud-banner.png │ └── laminar.png ├── introduction.mdx ├── logo │ ├── dark.svg │ └── light.svg ├── quickstart.mdx └── quickstart_llm.mdx ├── examples ├── __init__.py ├── api │ └── search │ │ ├── search_url.py │ │ └── simple_search.py ├── browser │ ├── parallel_browser.py │ ├── real_browser.py │ └── using_cdp.py ├── cloud │ ├── 01_basic_task.py │ ├── 02_fast_mode_gemini.py │ ├── 03_structured_output.py │ ├── 04_proxy_usage.py │ ├── 05_search_api.py │ ├── README.md │ └── env.example ├── custom-functions │ ├── 2fa.py │ ├── action_filters.py │ ├── advanced_search.py │ ├── cua.py │ ├── file_upload.py │ ├── notification.py │ ├── onepassword_2fa.py │ └── save_to_file_hugging_face.py ├── features │ ├── custom_output.py │ ├── custom_system_prompt.py │ ├── download_file.py │ ├── follow_up_tasks.py │ ├── initial_actions.py │ ├── multi_tab.py │ ├── parallel_agents.py │ ├── process_agent_output.py │ ├── restrict_urls.py │ ├── scrolling_page.py │ ├── secure.py │ ├── sensitive_data.py │ └── small_model_for_extraction.py ├── file_system │ ├── alphabet_earnings.py │ ├── excel_sheet.py │ └── file_system.py ├── getting_started │ ├── 01_basic_search.py │ ├── 02_form_filling.py │ ├── 03_data_extraction.py │ ├── 
04_multi_step_task.py │ └── 05_fast_agent.py ├── integrations │ ├── agentmail │ │ ├── 2fa.py │ │ └── email_tools.py │ ├── discord │ │ ├── discord_api.py │ │ └── discord_example.py │ ├── gmail_2fa_integration.py │ └── slack │ │ ├── README.md │ │ ├── slack_api.py │ │ └── slack_example.py ├── mcp │ ├── advanced_client.py │ ├── advanced_server.py │ ├── simple_client.py │ └── simple_server.py ├── models │ ├── aws.py │ ├── azure_openai.py │ ├── claude-4-sonnet.py │ ├── deepseek-chat.py │ ├── gemini.py │ ├── gpt-4.1.py │ ├── gpt-5-mini.py │ ├── langchain │ │ ├── README.md │ │ ├── __init__.py │ │ ├── chat.py │ │ ├── example.py │ │ └── serializer.py │ ├── lazy_import.py │ ├── llama4-groq.py │ ├── novita.py │ └── openrouter.py ├── observability │ └── openLLMetry.py ├── simple.py ├── ui │ ├── README.md │ ├── command_line.py │ ├── gradio_demo.py │ └── streamlit_demo.py └── use-cases │ ├── captcha.py │ ├── check_appointment.py │ ├── extract_pdf_content.py │ ├── find_and_apply_to_jobs.py │ ├── find_influencer_profiles.py │ └── shopping.py ├── pyproject.toml ├── static ├── NiceHack69.png ├── browser-use-dark.png └── browser-use.png └── tests ├── agent_tasks ├── README.md ├── amazon_laptop.yaml ├── browser_use_pip.yaml └── captcha_cloudflare.yaml ├── ci ├── conftest.py ├── evaluate_tasks.py ├── test_agent_sensitive_data.py ├── test_browser_event_ClickElementEvent.py ├── test_browser_event_GetDropdownOptionsEvent.py ├── test_browser_event_GetDropdownOptionsEvent_aria_menus.py ├── test_browser_event_NavigateToUrlEvent.py ├── test_browser_event_NavigateToUrlEvent2.py ├── test_browser_event_ScrollEvent.py ├── test_browser_session_element_cache.py ├── test_browser_session_output_paths.py ├── test_browser_session_proxy.py ├── test_browser_session_recent_events.py ├── test_browser_session_start.py ├── test_browser_session_tab_management.py ├── test_browser_session_via_cdp_tab_management.py ├── test_browser_session_viewport_and_proxy.py ├── test_browser_watchdog_crash.py ├── 
test_browser_watchdog_downloads.py ├── test_browser_watchdog_downloads_simple.py ├── test_browser_watchdog_downloads_upload_full_circle.py ├── test_browser_watchdog_screenshots.py ├── test_browser_watchdog_security2.py ├── test_config.py ├── test_filesystem.py ├── test_llm_anthropic_502_error.py ├── test_llm_custom_structured_ouput.py ├── test_llm_gemini_type_field_fix.py ├── test_llm_schema_optimizer.py ├── test_radio_buttons.html ├── test_radio_buttons.py ├── test_registry.py ├── test_registry_action_parameter_injection.py ├── test_sync_client.py ├── test_sync_client_auth.py ├── test_telemetry.py └── test_tools.py ├── mind2web_data └── processed.json └── scripts ├── debug_iframe_scrolling.py └── test_frame_hierarchy.py /.cursor/rules/browser-use-rules.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: 4 | alwaysApply: true 5 | --- 6 | 7 | ## 🧠 General Guidelines for Contributing to `browser-use` 8 | 9 | **Browser-Use** is an AI agent that autonomously interacts with the web. It takes a user-defined task, navigates web pages using Chromium via CDP, processes HTML, and repeatedly queries a language model (like `gpt-4.1-mini`) to decide the next action—until the task is completed. 10 | 11 | --- 12 | 13 | ### 🧰 Development Rules 14 | 15 | - ✅ **Always use [`uv`](mdc:https:/github.com/astral-sh/uv) instead of `pip`** 16 | For deterministic and fast dependency installs. 17 | 18 | ```bash 19 | uv venv --python 3.11 20 | source .venv/bin/activate 21 | uv sync 22 | ``` 23 | 24 | - ✅ **Use real model names** 25 | Do **not** replace `gpt-4o` with `gpt-4`. The model `gpt-4o` is a distinct release and supported. 26 | 27 | - ✅ **Type-safe coding** 28 | Use **Pydantic v2 models** for all internal action schemas, task inputs/outputs, and tools I/O. This ensures robust validation and LLM-call integrity. 29 | 30 | - ✅ **Pre-commit formatting** 31 | ALWAYS make sure to run pre-commit before making PRs. 
32 | 33 | --- 34 | 35 | 36 | 37 | ### Notes: 38 | 39 | - Use descriptive names and docstrings for each action. 40 | - Prefer returning `ActionResult` with structured content to help the agent reason better. 41 | 42 | --- 43 | 44 | ## 🧠 Creating and Running an Agent 45 | 46 | To define a task and run a browser-use agent: 47 | 48 | ```python 49 | from browser_use import Agent, ChatOpenAI 50 | 51 | task = "Find the CEO of OpenAI and return their name" 52 | model = ChatOpenAI(model="gpt-4.1-mini") 53 | 54 | agent = Agent(task=task, llm=model, tools=tools) 55 | 56 | history = await agent.run() 57 | ``` 58 | 59 | # Never create random examples 60 | 61 | When I ask you to implement a feature never create new files that show off that feature -> the code just gets messy. If you do anything to test it out, just do the inline code inside the terminal (if you want). 62 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | docs/ 2 | static/ 3 | .claude/ 4 | .github/ 5 | 6 | # Cache files 7 | .DS_Store 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | .mypy_cache/ 12 | .ruff_cache/ 13 | .pytest_cache/ 14 | .ipynb_checkpoints 15 | 16 | # Virtual Environments 17 | .venv 18 | venv/ 19 | 20 | # Editor cruft 21 | .vscode/ 22 | .idea/ 23 | 24 | # Build Files 25 | dist/ 26 | 27 | # Data files 28 | *.gif 29 | *.txt 30 | *.pdf 31 | *.csv 32 | *.json 33 | *.jsonl 34 | *.bak 35 | 36 | # Secrets and sensitive files 37 | secrets.env 38 | .env 39 | browser_cookies.json 40 | cookies.json 41 | gcp-login.json 42 | saved_trajectories/ 43 | AgentHistory.json 44 | AgentHistoryList.json 45 | private_example.py 46 | private_example 47 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Browser Use Configuration 2 | # Copy this file to .env 
and fill in your values 3 | 4 | # Logging Configuration 5 | # Set the logging level (debug, info, warning, error) 6 | BROWSER_USE_LOGGING_LEVEL=info 7 | 8 | # Log file paths (optional) 9 | # Save debug level logs to this file 10 | BROWSER_USE_DEBUG_LOG_FILE=debug.log 11 | 12 | # Save info level logs to this file 13 | BROWSER_USE_INFO_LOG_FILE=info.log 14 | 15 | # CDP (Chrome DevTools Protocol) logging level 16 | CDP_LOGGING_LEVEL=WARNING 17 | 18 | # Telemetry and Analytics 19 | # Enable/disable anonymous telemetry 20 | ANONYMIZED_TELEMETRY=true 21 | 22 | # Browser Use Cloud Configuration (optional) 23 | # Your Browser Use Cloud API key - get it from: https://cloud.browser-use.com/billing 24 | # BROWSER_USE_API_KEY=your_api_key_here 25 | 26 | # Custom API base URL (for enterprise installations) 27 | # BROWSER_USE_CLOUD_API_URL=https://api.browser-use.com 28 | 29 | # Cloud sync settings 30 | # BROWSER_USE_CLOUD_SYNC=false 31 | 32 | # Model Configuration 33 | # Default LLM model to use 34 | # OPENAI_API_KEY=your_openai_api_key_here 35 | # ANTHROPIC_API_KEY=your_anthropic_api_key_here 36 | # AZURE_OPENAI_API_KEY= 37 | # AZURE_OPENAI_ENDPOINT= 38 | # GOOGLE_API_KEY= 39 | # DEEPSEEK_API_KEY= 40 | # GROK_API_KEY= 41 | # NOVITA_API_KEY= 42 | 43 | # Browser Configuration 44 | # Path to Chrome/Chromium executable (optional) 45 | # BROWSER_USE_EXECUTABLE_PATH=/path/to/chrome 46 | 47 | # Run browser in headless mode 48 | # BROWSER_USE_HEADLESS=false 49 | 50 | # User data directory for browser profile 51 | # BROWSER_USE_USER_DATA_DIR=./browser_data 52 | 53 | # Proxy Configuration (optional) 54 | # BROWSER_USE_PROXY_SERVER=http://proxy.example.com:8080 55 | # BROWSER_USE_NO_PROXY=localhost,127.0.0.1,*.internal 56 | # BROWSER_USE_PROXY_USERNAME=username 57 | # BROWSER_USE_PROXY_PASSWORD=password 58 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | 
static/*.gif filter=lfs diff=lfs merge=lfs -text 2 | # static/*.mp4 filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.github/.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | 66b3c26df51adec32d42c3b2c0304e0662457298 2 | 2be4ba4f7078d47bbeed04baf6f8fb04017df028 3 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to browser-use 2 | 3 | We love contributions! Please read through these links to get started: 4 | 5 | - 🔢 [Contribution Guidelines](https://docs.browser-use.com/development/contribution-guide) 6 | - 👾 [Local Development Setup Guide](https://docs.browser-use.com/development/local-setup) 7 | - 🏷️ [Issues Tagged: `#help-wanted`](https://github.com/browser-use/browser-use/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22help%20wanted%22) 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/4_docs_issue.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation Issue 2 | description: Report an issue in the browser-use documentation 3 | labels: ["documentation"] 4 | title: "Documentation: ..." 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thanks for taking the time to improve our documentation! Please fill out the form below to help us fix the issue quickly. 10 | 11 | - type: dropdown 12 | id: type 13 | attributes: 14 | label: Type of Documentation Issue 15 | description: What type of documentation issue is this? 
16 | options: 17 | - Missing documentation 18 | - Incorrect documentation 19 | - Unclear documentation 20 | - Broken link 21 | - Other (specify in description) 22 | validations: 23 | required: true 24 | 25 | - type: input 26 | id: page 27 | attributes: 28 | label: Documentation Page 29 | description: Which page or section of the documentation is this about? 30 | placeholder: "e.g. https://docs.browser-use.com/customize/browser-settings > Context Configuration > headless" 31 | validations: 32 | required: true 33 | 34 | - type: textarea 35 | id: description 36 | attributes: 37 | label: Issue Description 38 | description: "Describe what's wrong or missing in the documentation" 39 | placeholder: e.g. Docs should clarify whether BrowserSession(no_viewport=False) is supported when running in BrowserSession(headless=False) mode... 40 | validations: 41 | required: true 42 | 43 | - type: textarea 44 | id: suggestion 45 | attributes: 46 | label: Suggested Changes 47 | description: If you have specific suggestions for how to improve the documentation, please share them 48 | placeholder: | 49 | e.g. The documentation could be improved by adding one more line here: 50 | ```diff 51 | Use `BrowserSession(headless=False)` to open the browser window (aka headful mode). 52 | + Viewports are not supported when headful, if `headless=False` it will force `no_viewport=True`. 
53 | ``` 54 | validations: 55 | required: false 56 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false # Set to true if you want to allow blank issues 2 | contact_links: 3 | - name: 🔢 Quickstart Guide 4 | url: https://docs.browser-use.com/quickstart 5 | about: Most common issues can be resolved by following our quickstart guide 6 | - name: 💬 Questions and Help 7 | url: https://link.browser-use.com/discord 8 | about: Please ask questions in our Discord community 9 | - name: 📖 Documentation 10 | url: https://docs.browser-use.com 11 | about: Check our documentation for answers first 12 | -------------------------------------------------------------------------------- /.github/SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Reporting Security Issues 2 | 3 | If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure. 4 | 5 | **Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.** 6 | 7 | Instead, please open a new [Github security advisory](https://github.com/browser-use/browser-use/security/advisories/new). 
8 | 9 | Please include as much of the information listed below as you can to help me better understand and resolve the issue: 10 | 11 | * The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting) 12 | * Full paths of source file(s) related to the manifestation of the issue 13 | * The location of the affected source code (tag/branch/commit or direct URL) 14 | * Any special configuration required to reproduce the issue 15 | * Step-by-step instructions to reproduce the issue 16 | * Proof-of-concept or exploit code (if possible) 17 | * Impact of the issue, including how an attacker might exploit the issue 18 | 19 | This information will help me triage your report more quickly. 20 | -------------------------------------------------------------------------------- /.github/workflows/build-base-image.yml.disabled: -------------------------------------------------------------------------------- 1 | name: Build Base Image 2 | 3 | on: 4 | schedule: 5 | - cron: '0 2 * * 1' # Weekly on Monday 6 | workflow_dispatch: 7 | push: 8 | paths: 9 | - 'Dockerfile.base' 10 | 11 | jobs: 12 | build-base: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | platform: [linux/amd64, linux/arm64] 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up QEMU 21 | uses: docker/setup-qemu-action@v3 22 | 23 | - name: Set up Docker Buildx 24 | uses: docker/setup-buildx-action@v3 25 | 26 | - name: Login to Docker Hub 27 | uses: docker/login-action@v3 28 | with: 29 | username: ${{ secrets.DOCKER_USERNAME }} 30 | password: ${{ secrets.DOCKER_PASSWORD }} 31 | 32 | - name: Build and push base image 33 | uses: docker/build-push-action@v5 34 | with: 35 | context: . 
36 | file: ./Dockerfile.base 37 | platforms: ${{ matrix.platform }} 38 | push: true 39 | tags: | 40 | browseruse/browseruse-base:chromium-138-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }} 41 | browseruse/browseruse-base:latest-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }} 42 | cache-from: type=registry,ref=browseruse/browseruse-base:buildcache-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }} 43 | cache-to: type=registry,ref=browseruse/browseruse-base:buildcache-${{ matrix.platform == 'linux/amd64' && 'amd64' || 'arm64' }},mode=max 44 | -------------------------------------------------------------------------------- /.github/workflows/cloud_evals.yml: -------------------------------------------------------------------------------- 1 | name: cloud_evals 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - 'releases/*' 8 | workflow_dispatch: 9 | inputs: 10 | commit_hash: 11 | description: Commit hash of the library to build the Cloud eval image for 12 | required: false 13 | 14 | jobs: 15 | trigger_cloud_eval_image_build: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/github-script@v7 19 | with: 20 | github-token: ${{ secrets.TRIGGER_CLOUD_BUILD_GH_KEY }} 21 | script: | 22 | const result = await github.rest.repos.createDispatchEvent({ 23 | owner: 'browser-use', 24 | repo: 'cloud', 25 | event_type: 'trigger-workflow', 26 | client_payload: {"commit_hash": "${{ github.event.inputs.commit_hash || github.sha }}"} 27 | }) 28 | console.log(result) 29 | -------------------------------------------------------------------------------- /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: docker 2 | 3 | on: 4 | push: 5 | release: 6 | types: [published] 7 | 8 | jobs: 9 | build_publish_image: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | packages: write 13 | contents: read 14 | attestations: write 15 | id-token: write 16 | steps: 17 | - name: Check out 
the repo 18 | uses: actions/checkout@v4 19 | 20 | - name: Set up QEMU 21 | uses: docker/setup-qemu-action@v3 22 | 23 | - name: Set up Docker Buildx 24 | uses: docker/setup-buildx-action@v3 25 | 26 | - name: Log in to Docker Hub 27 | uses: docker/login-action@v3 28 | with: 29 | username: ${{ secrets.DOCKER_USERNAME }} 30 | password: ${{ secrets.DOCKER_PASSWORD }} 31 | 32 | - name: Login to GitHub Container Registry 33 | uses: docker/login-action@v3 34 | with: 35 | registry: ghcr.io 36 | username: ${{ github.repository_owner }} 37 | password: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | - name: Compute Docker tags based on tag/branch 40 | id: meta 41 | uses: docker/metadata-action@v5 42 | with: 43 | images: | 44 | browseruse/browseruse 45 | ghcr.io/browser-use/browser-use 46 | tags: | 47 | type=ref,event=branch 48 | type=ref,event=pr 49 | type=pep440,pattern={{version}} 50 | type=pep440,pattern={{major}}.{{minor}} 51 | type=sha 52 | 53 | - name: Build and push Docker image 54 | id: push 55 | uses: docker/build-push-action@v6 56 | with: 57 | platforms: linux/amd64,linux/arm64 58 | context: . 
59 | file: ./Dockerfile 60 | push: true 61 | tags: ${{ steps.meta.outputs.tags }} 62 | labels: ${{ steps.meta.outputs.labels }} 63 | cache-from: type=registry,ref=browseruse/browseruse:buildcache 64 | cache-to: type=registry,ref=browseruse/browseruse:buildcache,mode=max 65 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - stable 7 | - 'releases/**' 8 | tags: 9 | - '*' 10 | pull_request: 11 | workflow_dispatch: 12 | 13 | jobs: 14 | lint-syntax: 15 | name: syntax-errors 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: astral-sh/setup-uv@v5 20 | with: 21 | enable-cache: true 22 | - run: uv run ruff check --no-fix --select PLE 23 | 24 | lint-style: 25 | name: code-style 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v4 29 | - uses: astral-sh/setup-uv@v5 30 | with: 31 | enable-cache: true 32 | - run: uv sync --dev --all-extras # install extras for examples to avoid pyright missing imports errors 33 | - run: uv run pre-commit run --all-files --show-diff-on-failure 34 | 35 | lint-typecheck: 36 | name: type-checker 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/checkout@v4 40 | - uses: astral-sh/setup-uv@v6 41 | with: 42 | enable-cache: true 43 | - run: uv sync --dev --all-extras # install extras for examples to avoid pyright missing imports errors- 44 | - run: uv run pyright 45 | -------------------------------------------------------------------------------- /.github/workflows/package.yaml: -------------------------------------------------------------------------------- 1 | name: package 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - stable 7 | - 'releases/**' 8 | tags: 9 | - '*' 10 | pull_request: 11 | workflow_dispatch: 12 | 13 | jobs: 14 | build: 15 | name: pip-build 16 | runs-on: 
ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: astral-sh/setup-uv@v5 20 | - run: uv build --python 3.12 21 | - uses: actions/upload-artifact@v4 22 | with: 23 | name: dist-artifact 24 | path: | 25 | dist/*.whl 26 | dist/*.tar.gz 27 | 28 | build_test: 29 | name: pip-install-on-${{ matrix.os }}-py-${{ matrix.python-version }} 30 | needs: build 31 | runs-on: ${{ matrix.os }} 32 | strategy: 33 | matrix: 34 | os: [ubuntu-latest, macos-latest, windows-latest] 35 | python-version: ["3.11", "3.13"] 36 | env: 37 | ANONYMIZED_TELEMETRY: 'false' 38 | 39 | steps: 40 | - uses: actions/checkout@v4 41 | - uses: astral-sh/setup-uv@v5 42 | - uses: actions/download-artifact@v4 43 | with: 44 | name: dist-artifact 45 | 46 | - name: Set up venv and test for OS/Python versions 47 | shell: bash 48 | run: | 49 | uv venv /tmp/testenv --python ${{ matrix.python-version }} --clear 50 | if [[ "$RUNNER_OS" == "Windows" ]]; then 51 | . /tmp/testenv/Scripts/activate 52 | else 53 | source /tmp/testenv/bin/activate 54 | fi 55 | uv pip install *.whl 56 | python -c 'from browser_use import Agent, BrowserProfile, BrowserSession, Tools, ActionModel, ActionResult' 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Cache files 2 | .DS_Store 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | .mypy_cache/ 7 | .ruff_cache/ 8 | .pytest_cache/ 9 | .ipynb_checkpoints 10 | ~/ 11 | 12 | # Virtual Environments 13 | .venv* 14 | venv/ 15 | 16 | # IDEs 17 | .vscode/ 18 | .idea/ 19 | 20 | # Build files 21 | dist/ 22 | 23 | # Data files 24 | *.gif 25 | *.txt 26 | *.pdf 27 | *.csv 28 | *.json 29 | *.jsonl 30 | *.log 31 | *.bak 32 | 33 | # Secrets and sensitive files 34 | secrets.env 35 | .env 36 | browser_cookies.json 37 | cookies.json 38 | gcp-login.json 39 | saved_trajectories/ 40 | old_tests/ 41 | AgentHistory.json 42 | AgentHistoryList.json 43 | 
private_example.py 44 | private_example 45 | CLAUDE.local.md 46 | 47 | uv.lock 48 | temp 49 | tmp 50 | 51 | # Google API credentials 52 | credentials.json 53 | token.json 54 | 55 | !docs/docs.json 56 | 57 | 58 | temp-profile-* 59 | 60 | screenshot.png 61 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/asottile/yesqa 3 | rev: v1.5.0 4 | hooks: 5 | - id: yesqa 6 | 7 | - repo: https://github.com/codespell-project/codespell 8 | rev: v2.4.1 9 | hooks: 10 | - id: codespell # See pyproject.toml for args 11 | additional_dependencies: 12 | - tomli 13 | 14 | - repo: https://github.com/asottile/pyupgrade 15 | rev: v3.20.0 16 | hooks: 17 | - id: pyupgrade 18 | args: [--py311-plus] 19 | 20 | # - repo: https://github.com/asottile/add-trailing-comma 21 | # rev: v3.1.0 22 | # hooks: 23 | # - id: add-trailing-comma 24 | 25 | - repo: https://github.com/astral-sh/ruff-pre-commit 26 | rev: v0.12.10 27 | hooks: 28 | - id: ruff-check 29 | args: [ --fix ] 30 | - id: ruff-format 31 | # see pyproject.toml for more details on ruff config 32 | 33 | - repo: https://github.com/RobertCraigie/pyright-python 34 | rev: v1.1.404 35 | hooks: 36 | - id: pyright 37 | 38 | - repo: https://github.com/pre-commit/pre-commit-hooks 39 | rev: v6.0.0 40 | hooks: 41 | # check for basic syntax errors in python and data files 42 | - id: check-ast 43 | - id: check-toml 44 | - id: check-yaml 45 | - id: check-json 46 | - id: check-merge-conflict 47 | # check for bad files and folders 48 | - id: check-symlinks 49 | - id: destroyed-symlinks 50 | - id: check-case-conflict 51 | - id: check-illegal-windows-names 52 | - id: check-shebang-scripts-are-executable 53 | - id: mixed-line-ending 54 | - id: fix-byte-order-marker 55 | - id: end-of-file-fixer 56 | # best practices enforcement 57 | - id: detect-private-key 58 | # - id: 
check-docstring-first 59 | - id: debug-statements 60 | - id: forbid-submodules 61 | - id: check-added-large-files 62 | args: ["--maxkb=600"] 63 | # - id: name-tests-test 64 | # args: ["--pytest-test-first"] 65 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /Dockerfile.fast: -------------------------------------------------------------------------------- 1 | # Fast Dockerfile using pre-built base images 2 | ARG REGISTRY=browseruse 3 | ARG BASE_TAG=latest 4 | FROM ${REGISTRY}/base-python-deps:${BASE_TAG} 5 | 6 | LABEL name="browseruse" description="Browser automation for AI agents" 7 | 8 | ENV BROWSERUSE_USER="browseruse" DEFAULT_PUID=911 DEFAULT_PGID=911 DATA_DIR=/data 9 | 10 | # Create user and directories 11 | RUN groupadd --system $BROWSERUSE_USER && \ 12 | useradd --system --create-home --gid $BROWSERUSE_USER --groups audio,video $BROWSERUSE_USER && \ 13 | usermod -u "$DEFAULT_PUID" "$BROWSERUSE_USER" && \ 14 | groupmod -g "$DEFAULT_PGID" "$BROWSERUSE_USER" && \ 15 | mkdir -p /data /home/$BROWSERUSE_USER/.config && \ 16 | ln -s $DATA_DIR /home/$BROWSERUSE_USER/.config/browseruse && \ 17 | mkdir -p "/home/$BROWSERUSE_USER/.config/chromium/Crash Reports/pending/" && \ 18 | mkdir -p "$DATA_DIR/profiles/default" && \ 19 | chown -R "$BROWSERUSE_USER:$BROWSERUSE_USER" "/home/$BROWSERUSE_USER" "$DATA_DIR" 20 | 21 | WORKDIR /app 22 | COPY . 
/app 23 | 24 | # Install browser-use 25 | RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \ 26 | uv sync --all-extras --locked --no-dev --compile-bytecode 27 | 28 | USER "$BROWSERUSE_USER" 29 | VOLUME "$DATA_DIR" 30 | EXPOSE 9242 9222 31 | ENTRYPOINT ["browser-use"] 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Gregor Zunic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /bin/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to run the formatter, linter, and type checker pre-commit hooks. 
3 | # Usage: 4 | # $ ./bin/lint.sh 5 | 6 | IFS=$'\n' 7 | 8 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 9 | 10 | cd "$SCRIPT_DIR/.." || exit 1 11 | 12 | echo "[*] Running ruff linter, formatter, pyright type checker, and other pre-commit checks..." 13 | exec uv run pre-commit run --all-files 14 | -------------------------------------------------------------------------------- /bin/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to setup a local development environment for the browser-use project. 3 | # Usage: 4 | # $ ./bin/setup.sh 5 | 6 | ### Bash Environment Setup 7 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 8 | # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html 9 | # set -o xtrace 10 | # set -x 11 | # shopt -s nullglob 12 | set -o errexit 13 | set -o errtrace 14 | set -o nounset 15 | set -o pipefail 16 | IFS=$'\n' 17 | 18 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 19 | cd "$SCRIPT_DIR" 20 | 21 | 22 | if [ -f "$SCRIPT_DIR/lint.sh" ]; then 23 | echo "[√] already inside a cloned browser-use repo" 24 | else 25 | echo "[+] Cloning browser-use repo into current directory: $SCRIPT_DIR" 26 | git clone https://github.com/browser-use/browser-use 27 | cd browser-use 28 | fi 29 | 30 | echo "[+] Installing uv..." 
31 | curl -LsSf https://astral.sh/uv/install.sh | sh 32 | 33 | #git checkout main git pull 34 | echo 35 | echo "[+] Setting up venv" 36 | uv venv 37 | echo 38 | echo "[+] Installing packages in venv" 39 | uv sync --dev --all-extras 40 | echo 41 | echo "[i] Tip: make sure to set BROWSER_USE_LOGGING_LEVEL=debug and your LLM API keys in your .env file" 42 | echo 43 | uv pip show browser-use 44 | 45 | echo "Usage:" 46 | echo " $ browser-use use the CLI" 47 | echo " or" 48 | echo " $ source .venv/bin/activate" 49 | echo " $ ipython use the library" 50 | echo " >>> from browser_use import BrowserSession, Agent" 51 | echo " >>> await Agent(task='book me a flight to fiji', browser=BrowserSession(headless=False)).run()" 52 | echo "" 53 | -------------------------------------------------------------------------------- /bin/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is used to run all the main project tests that run on CI via .github/workflows/test.yaml. 3 | # Usage: 4 | # $ ./bin/test.sh 5 | 6 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 7 | cd "$SCRIPT_DIR/.." || exit 1 8 | 9 | exec uv run pytest --numprocesses auto tests/ci $1 $2 $3 10 | -------------------------------------------------------------------------------- /browser_use/README.md: -------------------------------------------------------------------------------- 1 | # Codebase Structure 2 | 3 | > The code structure inspired by https://github.com/Netflix/dispatch. 4 | 5 | Very good structure on how to make a scalable codebase is also in [this repo](https://github.com/zhanymkanov/fastapi-best-practices). 6 | 7 | Just a brief document about how we should structure our backend codebase. 
8 | 9 | ## Code Structure 10 | 11 | ```markdown 12 | src/ 13 | // 14 | models.py 15 | services.py 16 | prompts.py 17 | views.py 18 | utils.py 19 | routers.py 20 | 21 | /_/ 22 | ``` 23 | 24 | ### Service.py 25 | 26 | Always a single file, except if it becomes too long - more than ~500 lines, split it into \_subservices 27 | 28 | ### Views.py 29 | 30 | Always split the views into two parts 31 | 32 | ```python 33 | # All 34 | ... 35 | 36 | # Requests 37 | ... 38 | 39 | # Responses 40 | ... 41 | ``` 42 | 43 | If too long → split into multiple files 44 | 45 | ### Prompts.py 46 | 47 | Single file; if too long → split into multiple files (one prompt per file or so) 48 | 49 | ### Routers.py 50 | 51 | Never split into more than one file 52 | -------------------------------------------------------------------------------- /browser_use/agent/message_manager/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import logging 5 | from pathlib import Path 6 | from typing import Any 7 | 8 | import anyio 9 | 10 | from browser_use.llm.messages import BaseMessage 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | async def save_conversation( 16 | input_messages: list[BaseMessage], 17 | response: Any, 18 | target: str | Path, 19 | encoding: str | None = None, 20 | ) -> None: 21 | """Save conversation history to file asynchronously.""" 22 | target_path = Path(target) 23 | # create folders if not exists 24 | if target_path.parent: 25 | await anyio.Path(target_path.parent).mkdir(parents=True, exist_ok=True) 26 | 27 | await anyio.Path(target_path).write_text( 28 | await _format_conversation(input_messages, response), 29 | encoding=encoding or 'utf-8', 30 | ) 31 | 32 | 33 | async def _format_conversation(messages: list[BaseMessage], response: Any) -> str: 34 | """Format the conversation including messages and response.""" 35 | lines = [] 36 | 37 | # Format messages 38 | for 
message in messages: 39 | lines.append(f' {message.role} ') 40 | 41 | lines.append(message.text) 42 | lines.append('') # Empty line after each message 43 | 44 | # Format response 45 | lines.append(' RESPONSE') 46 | lines.append(json.dumps(json.loads(response.model_dump_json(exclude_unset=True)), indent=2)) 47 | 48 | return '\n'.join(lines) 49 | 50 | 51 | # Note: _write_messages_to_file and _write_response_to_file have been merged into _format_conversation 52 | # This is more efficient for async operations and reduces file I/O 53 | -------------------------------------------------------------------------------- /browser_use/agent/message_manager/views.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from pydantic import BaseModel, ConfigDict, Field 6 | 7 | from browser_use.llm.messages import ( 8 | BaseMessage, 9 | ) 10 | 11 | if TYPE_CHECKING: 12 | pass 13 | 14 | 15 | class HistoryItem(BaseModel): 16 | """Represents a single agent history item with its data and string representation""" 17 | 18 | step_number: int | None = None 19 | evaluation_previous_goal: str | None = None 20 | memory: str | None = None 21 | next_goal: str | None = None 22 | action_results: str | None = None 23 | error: str | None = None 24 | system_message: str | None = None 25 | 26 | model_config = ConfigDict(arbitrary_types_allowed=True) 27 | 28 | def model_post_init(self, __context) -> None: 29 | """Validate that error and system_message are not both provided""" 30 | if self.error is not None and self.system_message is not None: 31 | raise ValueError('Cannot have both error and system_message at the same time') 32 | 33 | def to_string(self) -> str: 34 | """Get string representation of the history item""" 35 | step_str = 'step' if self.step_number is not None else 'step_unknown' 36 | 37 | if self.error: 38 | return f"""<{step_str}> 39 | {self.error} 40 | """ 41 | elif 
self.system_message: 42 | return '' # empty string 43 | else: 44 | content_parts = [] 45 | 46 | # Only include evaluation_previous_goal if it's not None/empty 47 | if self.evaluation_previous_goal: 48 | content_parts.append(f'{self.evaluation_previous_goal}') 49 | 50 | # Always include memory 51 | if self.memory: 52 | content_parts.append(f'{self.memory}') 53 | 54 | # Only include next_goal if it's not None/empty 55 | if self.next_goal: 56 | content_parts.append(f'{self.next_goal}') 57 | 58 | if self.action_results: 59 | content_parts.append(self.action_results) 60 | 61 | content = '\n'.join(content_parts) 62 | 63 | return f"""<{step_str}> 64 | {content} 65 | """ 66 | 67 | 68 | class MessageHistory(BaseModel): 69 | """History of messages""" 70 | 71 | system_message: BaseMessage | None = None 72 | state_message: BaseMessage | None = None 73 | context_messages: list[BaseMessage] = Field(default_factory=list) 74 | model_config = ConfigDict(arbitrary_types_allowed=True) 75 | 76 | def get_messages(self) -> list[BaseMessage]: 77 | """Get all messages in the correct order: system -> state -> contextual""" 78 | messages = [] 79 | if self.system_message: 80 | messages.append(self.system_message) 81 | if self.state_message: 82 | messages.append(self.state_message) 83 | messages.extend(self.context_messages) 84 | 85 | return messages 86 | 87 | 88 | class MessageManagerState(BaseModel): 89 | """Holds the state for MessageManager""" 90 | 91 | history: MessageHistory = Field(default_factory=MessageHistory) 92 | tool_id: int = 1 93 | agent_history_items: list[HistoryItem] = Field( 94 | default_factory=lambda: [HistoryItem(step_number=0, system_message='Agent initialized')] 95 | ) 96 | read_state_description: str = '' 97 | 98 | model_config = ConfigDict(arbitrary_types_allowed=True) 99 | -------------------------------------------------------------------------------- /browser_use/browser/__init__.py: -------------------------------------------------------------------------------- 
1 | from typing import TYPE_CHECKING 2 | 3 | # Type stubs for lazy imports 4 | if TYPE_CHECKING: 5 | from .profile import BrowserProfile, ProxySettings 6 | from .session import BrowserSession 7 | 8 | 9 | # Lazy imports mapping for heavy browser components 10 | _LAZY_IMPORTS = { 11 | 'ProxySettings': ('.profile', 'ProxySettings'), 12 | 'BrowserProfile': ('.profile', 'BrowserProfile'), 13 | 'BrowserSession': ('.session', 'BrowserSession'), 14 | } 15 | 16 | 17 | def __getattr__(name: str): 18 | """Lazy import mechanism for heavy browser components.""" 19 | if name in _LAZY_IMPORTS: 20 | module_path, attr_name = _LAZY_IMPORTS[name] 21 | try: 22 | from importlib import import_module 23 | 24 | # Use relative import for current package 25 | full_module_path = f'browser_use.browser{module_path}' 26 | module = import_module(full_module_path) 27 | attr = getattr(module, attr_name) 28 | # Cache the imported attribute in the module's globals 29 | globals()[name] = attr 30 | return attr 31 | except ImportError as e: 32 | raise ImportError(f'Failed to import {name} from {full_module_path}: {e}') from e 33 | 34 | raise AttributeError(f"module '{__name__}' has no attribute '{name}'") 35 | 36 | 37 | __all__ = [ 38 | 'BrowserSession', 39 | 'BrowserProfile', 40 | 'ProxySettings', 41 | ] 42 | -------------------------------------------------------------------------------- /browser_use/browser/watchdogs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/browser_use/browser/watchdogs/__init__.py -------------------------------------------------------------------------------- /browser_use/browser/watchdogs/permissions_watchdog.py: -------------------------------------------------------------------------------- 1 | """Permissions watchdog for granting browser permissions on connection.""" 2 | 3 | from typing import TYPE_CHECKING, ClassVar 4 | 5 | 
from bubus import BaseEvent 6 | 7 | from browser_use.browser.events import BrowserConnectedEvent 8 | from browser_use.browser.watchdog_base import BaseWatchdog 9 | 10 | if TYPE_CHECKING: 11 | pass 12 | 13 | 14 | class PermissionsWatchdog(BaseWatchdog): 15 | """Grants browser permissions when browser connects.""" 16 | 17 | # Event contracts 18 | LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ 19 | BrowserConnectedEvent, 20 | ] 21 | EMITS: ClassVar[list[type[BaseEvent]]] = [] 22 | 23 | async def on_BrowserConnectedEvent(self, event: BrowserConnectedEvent) -> None: 24 | """Grant permissions when browser connects.""" 25 | permissions = self.browser_session.browser_profile.permissions 26 | 27 | if not permissions: 28 | self.logger.debug('No permissions to grant') 29 | return 30 | 31 | self.logger.debug(f'🔓 Granting browser permissions: {permissions}') 32 | 33 | try: 34 | # Grant permissions using CDP Browser.grantPermissions 35 | # origin=None means grant to all origins 36 | # Browser domain commands don't use session_id 37 | await self.browser_session.cdp_client.send.Browser.grantPermissions( 38 | params={'permissions': permissions} # type: ignore 39 | ) 40 | self.logger.debug(f'✅ Successfully granted permissions: {permissions}') 41 | except Exception as e: 42 | self.logger.error(f'❌ Failed to grant permissions: {str(e)}') 43 | # Don't raise - permissions are not critical to browser operation 44 | -------------------------------------------------------------------------------- /browser_use/browser/watchdogs/screenshot_watchdog.py: -------------------------------------------------------------------------------- 1 | """Screenshot watchdog for handling screenshot requests using CDP.""" 2 | 3 | from typing import TYPE_CHECKING, Any, ClassVar 4 | 5 | from bubus import BaseEvent 6 | from cdp_use.cdp.page import CaptureScreenshotParameters 7 | 8 | from browser_use.browser.events import ScreenshotEvent 9 | from browser_use.browser.views import BrowserError 10 | from 
browser_use.browser.watchdog_base import BaseWatchdog 11 | 12 | if TYPE_CHECKING: 13 | pass 14 | 15 | 16 | class ScreenshotWatchdog(BaseWatchdog): 17 | """Handles screenshot requests using CDP.""" 18 | 19 | # Events this watchdog listens to 20 | LISTENS_TO: ClassVar[list[type[BaseEvent[Any]]]] = [ScreenshotEvent] 21 | 22 | # Events this watchdog emits 23 | EMITS: ClassVar[list[type[BaseEvent[Any]]]] = [] 24 | 25 | async def on_ScreenshotEvent(self, event: ScreenshotEvent) -> str: 26 | """Handle screenshot request using CDP. 27 | 28 | Args: 29 | event: ScreenshotEvent with optional full_page and clip parameters 30 | 31 | Returns: 32 | Dict with 'screenshot' key containing base64-encoded screenshot or None 33 | """ 34 | self.logger.debug('[ScreenshotWatchdog] Handler START - on_ScreenshotEvent called') 35 | try: 36 | # Get CDP client and session for current target 37 | cdp_session = await self.browser_session.get_or_create_cdp_session() 38 | 39 | # Prepare screenshot parameters 40 | params = CaptureScreenshotParameters(format='png', captureBeyondViewport=False) 41 | 42 | # Take screenshot using CDP 43 | self.logger.debug(f'[ScreenshotWatchdog] Taking screenshot with params: {params}') 44 | result = await cdp_session.cdp_client.send.Page.captureScreenshot(params=params, session_id=cdp_session.session_id) 45 | 46 | # Return base64-encoded screenshot data 47 | if result and 'data' in result: 48 | self.logger.debug('[ScreenshotWatchdog] Screenshot captured successfully') 49 | return result['data'] 50 | 51 | raise BrowserError('[ScreenshotWatchdog] Screenshot result missing data') 52 | except Exception as e: 53 | self.logger.error(f'[ScreenshotWatchdog] Screenshot failed: {e}') 54 | raise 55 | finally: 56 | # Try to remove highlights even on failure 57 | try: 58 | await self.browser_session.remove_highlights() 59 | except Exception: 60 | pass 61 | -------------------------------------------------------------------------------- /browser_use/controller/__init__.py: 
-------------------------------------------------------------------------------- 1 | from browser_use.tools.service import Controller 2 | 3 | __all__ = ['Controller'] 4 | -------------------------------------------------------------------------------- /browser_use/dom/playground/multi_act.py: -------------------------------------------------------------------------------- 1 | from browser_use import Agent 2 | from browser_use.browser import BrowserProfile, BrowserSession 3 | from browser_use.browser.types import ViewportSize 4 | from browser_use.llm import ChatAzureOpenAI 5 | 6 | # Initialize the Azure OpenAI client 7 | llm = ChatAzureOpenAI( 8 | model='gpt-4.1-mini', 9 | ) 10 | 11 | 12 | TASK = """ 13 | Go to https://browser-use.github.io/stress-tests/challenges/react-native-web-form.html and complete the React Native Web form by filling in all required fields and submitting. 14 | """ 15 | 16 | 17 | async def main(): 18 | browser = BrowserSession( 19 | browser_profile=BrowserProfile( 20 | window_size=ViewportSize(width=1100, height=1000), 21 | ) 22 | ) 23 | 24 | agent = Agent(task=TASK, llm=llm) 25 | 26 | await agent.run() 27 | 28 | 29 | if __name__ == '__main__': 30 | import asyncio 31 | 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /browser_use/dom/utils.py: -------------------------------------------------------------------------------- 1 | def cap_text_length(text: str, max_length: int) -> str: 2 | """Cap text length for display.""" 3 | if len(text) <= max_length: 4 | return text 5 | return text[:max_length] + '...' 
6 | -------------------------------------------------------------------------------- /browser_use/exceptions.py: -------------------------------------------------------------------------------- 1 | class LLMException(Exception): 2 | def __init__(self, status_code, message): 3 | self.status_code = status_code 4 | self.message = message 5 | super().__init__(f'Error {status_code}: {message}') 6 | -------------------------------------------------------------------------------- /browser_use/filesystem/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/browser_use/filesystem/__init__.py -------------------------------------------------------------------------------- /browser_use/integrations/gmail/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Gmail Integration for Browser Use 3 | Provides Gmail API integration for email reading and verification code extraction. 4 | This integration enables agents to read email content and extract verification codes themselves. 
5 | Usage: 6 | from browser_use.integrations.gmail import GmailService, register_gmail_actions 7 | # Option 1: Register Gmail actions with file-based authentication 8 | tools = Tools() 9 | register_gmail_actions(tools) 10 | # Option 2: Register Gmail actions with direct access token (recommended for production) 11 | tools = Tools() 12 | register_gmail_actions(tools, access_token="your_access_token_here") 13 | # Option 3: Use the service directly 14 | gmail = GmailService(access_token="your_access_token_here") 15 | await gmail.authenticate() 16 | emails = await gmail.get_recent_emails() 17 | """ 18 | 19 | # @file purpose: Gmail integration for 2FA email authentication and email reading 20 | 21 | from .actions import register_gmail_actions 22 | from .service import GmailService 23 | 24 | __all__ = ['GmailService', 'register_gmail_actions'] 25 | -------------------------------------------------------------------------------- /browser_use/llm/README.md: -------------------------------------------------------------------------------- 1 | # Browser Use LLMs 2 | 3 | We officially support the following LLMs: 4 | 5 | - OpenAI 6 | - Anthropic 7 | - Google 8 | - Groq 9 | - Ollama 10 | - DeepSeek 11 | 12 | ## Migrating from LangChain 13 | 14 | Because of how we implemented the LLMs, we can technically support anything. If you want to use a LangChain model, you can use the `ChatLangchain` (NOT OFFICIALLY SUPPORTED) class. 15 | 16 | You can find all the details in the [LangChain example](examples/models/langchain/example.py). We suggest you grab that code and use it as a reference. 
17 | -------------------------------------------------------------------------------- /browser_use/llm/aws/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | # Type stubs for lazy imports 4 | if TYPE_CHECKING: 5 | from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock 6 | from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock 7 | 8 | # Lazy imports mapping for AWS chat models 9 | _LAZY_IMPORTS = { 10 | 'ChatAnthropicBedrock': ('browser_use.llm.aws.chat_anthropic', 'ChatAnthropicBedrock'), 11 | 'ChatAWSBedrock': ('browser_use.llm.aws.chat_bedrock', 'ChatAWSBedrock'), 12 | } 13 | 14 | 15 | def __getattr__(name: str): 16 | """Lazy import mechanism for AWS chat models.""" 17 | if name in _LAZY_IMPORTS: 18 | module_path, attr_name = _LAZY_IMPORTS[name] 19 | try: 20 | from importlib import import_module 21 | 22 | module = import_module(module_path) 23 | attr = getattr(module, attr_name) 24 | # Cache the imported attribute in the module's globals 25 | globals()[name] = attr 26 | return attr 27 | except ImportError as e: 28 | raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e 29 | 30 | raise AttributeError(f"module '{__name__}' has no attribute '{name}'") 31 | 32 | 33 | __all__ = [ 34 | 'ChatAWSBedrock', 35 | 'ChatAnthropicBedrock', 36 | ] 37 | -------------------------------------------------------------------------------- /browser_use/llm/azure/chat.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from typing import Any 4 | 5 | import httpx 6 | from openai import AsyncAzureOpenAI as AsyncAzureOpenAIClient 7 | from openai.types.shared import ChatModel 8 | 9 | from browser_use.llm.openai.like import ChatOpenAILike 10 | 11 | 12 | @dataclass 13 | class ChatAzureOpenAI(ChatOpenAILike): 14 | """ 15 | A class for to interact with any provider using the 
OpenAI API schema. 16 | 17 | Args: 18 | model (str): The name of the OpenAI model to use. Defaults to "not-provided". 19 | api_key (Optional[str]): The API key to use. Defaults to "not-provided". 20 | """ 21 | 22 | # Model configuration 23 | model: str | ChatModel 24 | 25 | # Client initialization parameters 26 | api_key: str | None = None 27 | api_version: str | None = '2024-12-01-preview' 28 | azure_endpoint: str | None = None 29 | azure_deployment: str | None = None 30 | base_url: str | None = None 31 | azure_ad_token: str | None = None 32 | azure_ad_token_provider: Any | None = None 33 | 34 | default_headers: dict[str, str] | None = None 35 | default_query: dict[str, Any] | None = None 36 | 37 | client: AsyncAzureOpenAIClient | None = None 38 | 39 | @property 40 | def provider(self) -> str: 41 | return 'azure' 42 | 43 | def _get_client_params(self) -> dict[str, Any]: 44 | _client_params: dict[str, Any] = {} 45 | 46 | self.api_key = self.api_key or os.getenv('AZURE_OPENAI_API_KEY') 47 | self.azure_endpoint = self.azure_endpoint or os.getenv('AZURE_OPENAI_ENDPOINT') 48 | self.azure_deployment = self.azure_deployment or os.getenv('AZURE_OPENAI_DEPLOYMENT') 49 | params_mapping = { 50 | 'api_key': self.api_key, 51 | 'api_version': self.api_version, 52 | 'organization': self.organization, 53 | 'azure_endpoint': self.azure_endpoint, 54 | 'azure_deployment': self.azure_deployment, 55 | 'base_url': self.base_url, 56 | 'azure_ad_token': self.azure_ad_token, 57 | 'azure_ad_token_provider': self.azure_ad_token_provider, 58 | 'http_client': self.http_client, 59 | } 60 | if self.default_headers is not None: 61 | _client_params['default_headers'] = self.default_headers 62 | if self.default_query is not None: 63 | _client_params['default_query'] = self.default_query 64 | 65 | _client_params.update({k: v for k, v in params_mapping.items() if v is not None}) 66 | 67 | return _client_params 68 | 69 | def get_client(self) -> AsyncAzureOpenAIClient: 70 | """ 71 | Returns an 
asynchronous OpenAI client. 72 | 73 | Returns: 74 | AsyncAzureOpenAIClient: An instance of the asynchronous OpenAI client. 75 | """ 76 | if self.client: 77 | return self.client 78 | 79 | _client_params: dict[str, Any] = self._get_client_params() 80 | 81 | if self.http_client: 82 | _client_params['http_client'] = self.http_client 83 | else: 84 | # Create a new async HTTP client with custom limits 85 | _client_params['http_client'] = httpx.AsyncClient( 86 | limits=httpx.Limits(max_connections=20, max_keepalive_connections=6) 87 | ) 88 | 89 | self.client = AsyncAzureOpenAIClient(**_client_params) 90 | 91 | return self.client 92 | -------------------------------------------------------------------------------- /browser_use/llm/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | We have switched all of our code from langchain to openai.types.chat.chat_completion_message_param. 3 | 4 | For easier transition we have 5 | """ 6 | 7 | from typing import Any, Protocol, TypeVar, overload, runtime_checkable 8 | 9 | from pydantic import BaseModel 10 | 11 | from browser_use.llm.messages import BaseMessage 12 | from browser_use.llm.views import ChatInvokeCompletion 13 | 14 | T = TypeVar('T', bound=BaseModel) 15 | 16 | 17 | @runtime_checkable 18 | class BaseChatModel(Protocol): 19 | _verified_api_keys: bool = False 20 | 21 | model: str 22 | 23 | @property 24 | def provider(self) -> str: ... 25 | 26 | @property 27 | def name(self) -> str: ... 28 | 29 | @property 30 | def model_name(self) -> str: 31 | # for legacy support 32 | return self.model 33 | 34 | @overload 35 | async def ainvoke(self, messages: list[BaseMessage], output_format: None = None) -> ChatInvokeCompletion[str]: ... 36 | 37 | @overload 38 | async def ainvoke(self, messages: list[BaseMessage], output_format: type[T]) -> ChatInvokeCompletion[T]: ... 
39 | 40 | async def ainvoke( 41 | self, messages: list[BaseMessage], output_format: type[T] | None = None 42 | ) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]: ... 43 | 44 | @classmethod 45 | def __get_pydantic_core_schema__( 46 | cls, 47 | source_type: type, 48 | handler: Any, 49 | ) -> Any: 50 | """ 51 | Allow this Protocol to be used in Pydantic models -> very useful to typesafe the agent settings for example. 52 | Returns a schema that allows any object (since this is a Protocol). 53 | """ 54 | from pydantic_core import core_schema 55 | 56 | # Return a schema that accepts any object for Protocol types 57 | return core_schema.any_schema() 58 | -------------------------------------------------------------------------------- /browser_use/llm/exceptions.py: -------------------------------------------------------------------------------- 1 | class ModelError(Exception): 2 | pass 3 | 4 | 5 | class ModelProviderError(ModelError): 6 | """Exception raised when a model provider returns an error.""" 7 | 8 | def __init__( 9 | self, 10 | message: str, 11 | status_code: int = 502, 12 | model: str | None = None, 13 | ): 14 | super().__init__(message, status_code) 15 | self.model = model 16 | 17 | 18 | class ModelRateLimitError(ModelProviderError): 19 | """Exception raised when a model provider returns a rate limit error.""" 20 | 21 | def __init__( 22 | self, 23 | message: str, 24 | status_code: int = 429, 25 | model: str | None = None, 26 | ): 27 | super().__init__(message, status_code, model) 28 | -------------------------------------------------------------------------------- /browser_use/llm/google/__init__.py: -------------------------------------------------------------------------------- 1 | from browser_use.llm.google.chat import ChatGoogle 2 | 3 | __all__ = ['ChatGoogle'] 4 | -------------------------------------------------------------------------------- /browser_use/llm/ollama/chat.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, TypeVar, overload 3 | 4 | import httpx 5 | from ollama import AsyncClient as OllamaAsyncClient 6 | from pydantic import BaseModel 7 | 8 | from browser_use.llm.base import BaseChatModel 9 | from browser_use.llm.exceptions import ModelProviderError 10 | from browser_use.llm.messages import BaseMessage 11 | from browser_use.llm.ollama.serializer import OllamaMessageSerializer 12 | from browser_use.llm.views import ChatInvokeCompletion 13 | 14 | T = TypeVar('T', bound=BaseModel) 15 | 16 | 17 | @dataclass 18 | class ChatOllama(BaseChatModel): 19 | """ 20 | A wrapper around Ollama's chat model. 21 | """ 22 | 23 | model: str 24 | 25 | # # Model params 26 | # TODO (matic): Why is this commented out? 27 | # temperature: float | None = None 28 | 29 | # Client initialization parameters 30 | host: str | None = None 31 | timeout: float | httpx.Timeout | None = None 32 | client_params: dict[str, Any] | None = None 33 | 34 | # Static 35 | @property 36 | def provider(self) -> str: 37 | return 'ollama' 38 | 39 | def _get_client_params(self) -> dict[str, Any]: 40 | """Prepare client parameters dictionary.""" 41 | return { 42 | 'host': self.host, 43 | 'timeout': self.timeout, 44 | 'client_params': self.client_params, 45 | } 46 | 47 | def get_client(self) -> OllamaAsyncClient: 48 | """ 49 | Returns an OllamaAsyncClient client. 50 | """ 51 | return OllamaAsyncClient(host=self.host, timeout=self.timeout, **self.client_params or {}) 52 | 53 | @property 54 | def name(self) -> str: 55 | return self.model 56 | 57 | @overload 58 | async def ainvoke(self, messages: list[BaseMessage], output_format: None = None) -> ChatInvokeCompletion[str]: ... 59 | 60 | @overload 61 | async def ainvoke(self, messages: list[BaseMessage], output_format: type[T]) -> ChatInvokeCompletion[T]: ... 
62 | 63 | async def ainvoke( 64 | self, messages: list[BaseMessage], output_format: type[T] | None = None 65 | ) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]: 66 | ollama_messages = OllamaMessageSerializer.serialize_messages(messages) 67 | 68 | try: 69 | if output_format is None: 70 | response = await self.get_client().chat( 71 | model=self.model, 72 | messages=ollama_messages, 73 | ) 74 | 75 | return ChatInvokeCompletion(completion=response.message.content or '', usage=None) 76 | else: 77 | schema = output_format.model_json_schema() 78 | 79 | response = await self.get_client().chat( 80 | model=self.model, 81 | messages=ollama_messages, 82 | format=schema, 83 | ) 84 | 85 | completion = response.message.content or '' 86 | if output_format is not None: 87 | completion = output_format.model_validate_json(completion) 88 | 89 | return ChatInvokeCompletion(completion=completion, usage=None) 90 | 91 | except Exception as e: 92 | raise ModelProviderError(message=str(e), model=self.name) from e 93 | -------------------------------------------------------------------------------- /browser_use/llm/openai/like.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from browser_use.llm.openai.chat import ChatOpenAI 4 | 5 | 6 | @dataclass 7 | class ChatOpenAILike(ChatOpenAI): 8 | """ 9 | A class for to interact with any provider using the OpenAI API schema. 10 | 11 | Args: 12 | model (str): The name of the OpenAI model to use. 
13 | """ 14 | 15 | model: str 16 | -------------------------------------------------------------------------------- /browser_use/llm/openrouter/serializer.py: -------------------------------------------------------------------------------- 1 | from openai.types.chat import ChatCompletionMessageParam 2 | 3 | from browser_use.llm.messages import BaseMessage 4 | from browser_use.llm.openai.serializer import OpenAIMessageSerializer 5 | 6 | 7 | class OpenRouterMessageSerializer: 8 | """ 9 | Serializer for converting between custom message types and OpenRouter message formats. 10 | 11 | OpenRouter uses the OpenAI-compatible API, so we can reuse the OpenAI serializer. 12 | """ 13 | 14 | @staticmethod 15 | def serialize_messages(messages: list[BaseMessage]) -> list[ChatCompletionMessageParam]: 16 | """ 17 | Serialize a list of browser_use messages to OpenRouter-compatible messages. 18 | 19 | Args: 20 | messages: List of browser_use messages 21 | 22 | Returns: 23 | List of OpenRouter-compatible messages (identical to OpenAI format) 24 | """ 25 | # OpenRouter uses the same message format as OpenAI 26 | return OpenAIMessageSerializer.serialize_messages(messages) 27 | -------------------------------------------------------------------------------- /browser_use/llm/tests/test_gemini_image.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import io 4 | import random 5 | 6 | from lmnr import Laminar 7 | from PIL import Image, ImageDraw, ImageFont 8 | 9 | from browser_use.llm.google.chat import ChatGoogle 10 | from browser_use.llm.google.serializer import GoogleMessageSerializer 11 | from browser_use.llm.messages import ( 12 | BaseMessage, 13 | ContentPartImageParam, 14 | ContentPartTextParam, 15 | ImageURL, 16 | SystemMessage, 17 | UserMessage, 18 | ) 19 | 20 | Laminar.initialize() 21 | 22 | 23 | def create_random_text_image(text: str = 'hello world', width: int = 4000, height: int = 4000) -> str: 24 | # 
Create image with random background color 25 | bg_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) 26 | image = Image.new('RGB', (width, height), bg_color) 27 | draw = ImageDraw.Draw(image) 28 | 29 | # Try to use a default font, fallback to default if not available 30 | try: 31 | font = ImageFont.truetype('arial.ttf', 24) 32 | except Exception: 33 | font = ImageFont.load_default() 34 | 35 | # Calculate text position to center it 36 | bbox = draw.textbbox((0, 0), text, font=font) 37 | text_width = bbox[2] - bbox[0] 38 | text_height = bbox[3] - bbox[1] 39 | x = (width - text_width) // 2 40 | y = (height - text_height) // 2 41 | 42 | # Draw text with contrasting color 43 | text_color = (255 - bg_color[0], 255 - bg_color[1], 255 - bg_color[2]) 44 | draw.text((x, y), text, fill=text_color, font=font) 45 | 46 | # Convert to base64 47 | buffer = io.BytesIO() 48 | image.save(buffer, format='PNG') 49 | img_data = base64.b64encode(buffer.getvalue()).decode() 50 | 51 | return f'data:image/png;base64,{img_data}' 52 | 53 | 54 | async def test_gemini_image_vision(): 55 | """Test Gemini's ability to see and describe images.""" 56 | 57 | # Create the LLM 58 | llm = ChatGoogle(model='gemini-2.0-flash-exp') 59 | 60 | # Create a random image with text 61 | image_data_url = create_random_text_image('Hello Gemini! Can you see this text?') 62 | 63 | # Create messages with image 64 | messages: list[BaseMessage] = [ 65 | SystemMessage(content='You are a helpful assistant that can see and describe images.'), 66 | UserMessage( 67 | content=[ 68 | ContentPartTextParam(text='What do you see in this image? 
Please describe the text and any visual elements.'), 69 | ContentPartImageParam(image_url=ImageURL(url=image_data_url)), 70 | ] 71 | ), 72 | ] 73 | 74 | # Serialize messages for Google format 75 | serializer = GoogleMessageSerializer() 76 | formatted_messages, system_message = serializer.serialize_messages(messages) 77 | 78 | print('Testing Gemini image vision...') 79 | print(f'System message: {system_message}') 80 | 81 | # Make the API call 82 | try: 83 | response = await llm.ainvoke(messages) 84 | print('\n=== Gemini Response ===') 85 | print(response.completion) 86 | print(response.usage) 87 | print('=======================') 88 | except Exception as e: 89 | print(f'Error calling Gemini: {e}') 90 | print(f'Error type: {type(e)}') 91 | 92 | 93 | if __name__ == '__main__': 94 | asyncio.run(test_gemini_image_vision()) 95 | -------------------------------------------------------------------------------- /browser_use/llm/tests/test_groq_loop.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from browser_use.llm import ContentText 4 | from browser_use.llm.groq.chat import ChatGroq 5 | from browser_use.llm.messages import SystemMessage, UserMessage 6 | 7 | llm = ChatGroq( 8 | model='meta-llama/llama-4-maverick-17b-128e-instruct', 9 | temperature=0.5, 10 | ) 11 | # llm = ChatOpenAI(model='gpt-4.1-mini') 12 | 13 | 14 | async def main(): 15 | from pydantic import BaseModel 16 | 17 | from browser_use.tokens.service import TokenCost 18 | 19 | tk = TokenCost().register_llm(llm) 20 | 21 | class Output(BaseModel): 22 | reasoning: str 23 | answer: str 24 | 25 | message = [ 26 | SystemMessage(content='You are a helpful assistant that can answer questions and help with tasks.'), 27 | UserMessage( 28 | content=[ 29 | ContentText( 30 | text=r"Why is the sky blue? 
write exactly this into reasoning make sure to output ' with exactly like in the input : " 31 | ), 32 | ContentText( 33 | text=""" 34 | The user's request is to find the lowest priced women's plus size one piece swimsuit in color black with a customer rating of at least 5 on Kohls.com. I am currently on the homepage of Kohls. The page has a search bar and various category links. To begin, I need to navigate to the women's section and search for swimsuits. I will start by clicking on the 'Women' category link.""" 35 | ), 36 | ] 37 | ), 38 | ] 39 | 40 | for i in range(10): 41 | print('-' * 50) 42 | print(f'start loop {i}') 43 | response = await llm.ainvoke(message, output_format=Output) 44 | completion = response.completion 45 | print(f'start reasoning: {completion.reasoning}') 46 | print(f'answer: {completion.answer}') 47 | print('-' * 50) 48 | 49 | 50 | if __name__ == '__main__': 51 | asyncio.run(main()) 52 | -------------------------------------------------------------------------------- /browser_use/llm/views.py: -------------------------------------------------------------------------------- 1 | from typing import Generic, TypeVar, Union 2 | 3 | from pydantic import BaseModel 4 | 5 | T = TypeVar('T', bound=Union[BaseModel, str]) 6 | 7 | 8 | class ChatInvokeUsage(BaseModel): 9 | """ 10 | Usage information for a chat model invocation. 11 | """ 12 | 13 | prompt_tokens: int 14 | """The number of tokens in the prompt (this includes the cached tokens as well. 
class ChatInvokeCompletion(BaseModel, Generic[T]):
	"""
	Response from a chat model invocation.

	Generic over ``T``: a pydantic model for structured output, or ``str`` for
	plain-text completions.
	"""

	completion: T
	"""The completion of the response."""

	# Optional reasoning ("thinking") output for providers that expose it
	# (presumably Anthropic extended thinking — confirm against the chat adapters).
	thinking: str | None = None
	# Provider-redacted thinking content, returned instead of plain thinking
	# when the provider withholds the raw reasoning. NOTE(review): assumed — verify.
	redacted_thinking: str | None = None

	usage: ChatInvokeUsage | None
	"""The usage of the response."""
4 | """ 5 | 6 | from browser_use.mcp.client import MCPClient 7 | from browser_use.mcp.controller import MCPToolWrapper 8 | 9 | __all__ = ['MCPClient', 'MCPToolWrapper', 'BrowserUseServer'] # type: ignore 10 | 11 | 12 | def __getattr__(name): 13 | """Lazy import to avoid importing server module when only client is needed.""" 14 | if name == 'BrowserUseServer': 15 | from browser_use.mcp.server import BrowserUseServer 16 | 17 | return BrowserUseServer 18 | raise AttributeError(f"module '{__name__}' has no attribute '{name}'") 19 | -------------------------------------------------------------------------------- /browser_use/mcp/__main__.py: -------------------------------------------------------------------------------- 1 | """Entry point for running MCP server as a module. 2 | 3 | Usage: 4 | python -m browser_use.mcp.server 5 | """ 6 | 7 | import asyncio 8 | 9 | from browser_use.mcp.server import main 10 | 11 | if __name__ == '__main__': 12 | asyncio.run(main()) 13 | -------------------------------------------------------------------------------- /browser_use/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/browser_use/py.typed -------------------------------------------------------------------------------- /browser_use/screenshots/__init__.py: -------------------------------------------------------------------------------- 1 | # Screenshots package for browser-use 2 | -------------------------------------------------------------------------------- /browser_use/screenshots/service.py: -------------------------------------------------------------------------------- 1 | """ 2 | Screenshot storage service for browser-use agents. 
3 | """ 4 | 5 | import base64 6 | from pathlib import Path 7 | 8 | import anyio 9 | 10 | 11 | class ScreenshotService: 12 | """Simple screenshot storage service that saves screenshots to disk""" 13 | 14 | def __init__(self, agent_directory: str | Path): 15 | """Initialize with agent directory path""" 16 | self.agent_directory = Path(agent_directory) if isinstance(agent_directory, str) else agent_directory 17 | 18 | # Create screenshots subdirectory 19 | self.screenshots_dir = self.agent_directory / 'screenshots' 20 | self.screenshots_dir.mkdir(parents=True, exist_ok=True) 21 | 22 | async def store_screenshot(self, screenshot_b64: str, step_number: int) -> str: 23 | """Store screenshot to disk and return the full path as string""" 24 | screenshot_filename = f'step_{step_number}.png' 25 | screenshot_path = self.screenshots_dir / screenshot_filename 26 | 27 | # Decode base64 and save to disk 28 | screenshot_data = base64.b64decode(screenshot_b64) 29 | 30 | async with await anyio.open_file(screenshot_path, 'wb') as f: 31 | await f.write(screenshot_data) 32 | 33 | return str(screenshot_path) 34 | 35 | async def get_screenshot(self, screenshot_path: str) -> str | None: 36 | """Load screenshot from disk path and return as base64""" 37 | if not screenshot_path: 38 | return None 39 | 40 | path = Path(screenshot_path) 41 | if not path.exists(): 42 | return None 43 | 44 | # Load from disk and encode to base64 45 | async with await anyio.open_file(path, 'rb') as f: 46 | screenshot_data = await f.read() 47 | 48 | return base64.b64encode(screenshot_data).decode('utf-8') 49 | -------------------------------------------------------------------------------- /browser_use/sync/__init__.py: -------------------------------------------------------------------------------- 1 | """Cloud sync module for Browser Use.""" 2 | 3 | from browser_use.sync.auth import CloudAuthConfig, DeviceAuthClient 4 | from browser_use.sync.service import CloudSync 5 | 6 | __all__ = ['CloudAuthConfig', 
# Maps public attribute name -> (module path, attribute name) for deferred loading.
_LAZY_IMPORTS = {
	'ProductTelemetry': ('browser_use.telemetry.service', 'ProductTelemetry'),
	'BaseTelemetryEvent': ('browser_use.telemetry.views', 'BaseTelemetryEvent'),
	'CLITelemetryEvent': ('browser_use.telemetry.views', 'CLITelemetryEvent'),
	'MCPClientTelemetryEvent': ('browser_use.telemetry.views', 'MCPClientTelemetryEvent'),
	'MCPServerTelemetryEvent': ('browser_use.telemetry.views', 'MCPServerTelemetryEvent'),
}


def __getattr__(name: str):
	"""Resolve telemetry symbols on first access instead of at package import time."""
	target = _LAZY_IMPORTS.get(name)
	if target is None:
		raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

	module_path, attr_name = target
	try:
		from importlib import import_module

		resolved = getattr(import_module(module_path), attr_name)
	except ImportError as e:
		raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e

	# Cache on the module so subsequent lookups bypass __getattr__ entirely.
	globals()[name] = resolved
	return resolved
@dataclass
class AgentTelemetryEvent(BaseTelemetryEvent):
	"""Telemetry payload describing one full agent run, from start parameters to final outcome."""

	# start details
	task: str
	model: str
	model_provider: str
	max_steps: int
	max_actions_per_step: int
	use_vision: bool
	version: str
	source: str
	cdp_url: str | None
	# step details (presumably one entry per step, None where a step produced nothing — confirm at call site)
	action_errors: Sequence[str | None]
	action_history: Sequence[list[dict] | None]
	urls_visited: Sequence[str | None]
	# end details
	steps: int
	total_input_tokens: int
	total_duration_seconds: float
	success: bool | None  # NOTE(review): None appears to mean "no definite verdict" — verify
	final_result_response: str | None
	error_message: str | None

	# Event name reported to the telemetry backend (satisfies BaseTelemetryEvent.name).
	name: str = 'agent_event'
class TokenCostCalculated(BaseModel):
	"""Computed cost breakdown for a single LLM invocation's token usage."""

	new_prompt_tokens: int
	new_prompt_cost: float

	prompt_read_cached_tokens: int | None
	prompt_read_cached_cost: float | None

	prompt_cached_creation_tokens: int | None
	prompt_cache_creation_cost: float | None
	"""Anthropic only: The cost of creating the cache."""

	completion_tokens: int
	completion_cost: float

	@property
	def prompt_cost(self) -> float:
		"""Total prompt-side cost: new tokens plus any cache read/creation cost."""
		return self.new_prompt_cost + (self.prompt_read_cached_cost or 0) + (self.prompt_cache_creation_cost or 0)

	@property
	def total_cost(self) -> float:
		"""Total cost of the invocation (prompt side plus completion side)."""
		# Reuse prompt_cost instead of duplicating its formula, so the two
		# properties cannot drift apart if the cost components ever change.
		return self.prompt_cost + self.completion_cost
# Parameters for the click action. Field descriptions below are part of the
# LLM-facing tool schema, so changing their text changes the prompt.
class ClickElementAction(BaseModel):
	# 1-based index of the target element (presumably matching the indices shown
	# to the LLM in the serialized DOM snapshot — confirm in the serializer).
	index: int = Field(ge=1, description='index of the element to click')
	while_holding_ctrl: bool | None = Field(
		default=None,
		description='Set to True to open the navigation in a new background tab (Ctrl+Click behavior). Optional.',
	)
	# expect_download: bool = Field(default=False, description='set True if expecting a download, False otherwise')  # moved to downloads_watchdog.py
	# click_count: int = 1  # TODO
down: bool # True to scroll down, False to scroll up 60 | num_pages: float # Number of pages to scroll (0.5 = half page, 1.0 = one page, etc.) 61 | frame_element_index: int | None = None # Optional element index to find scroll container for 62 | 63 | 64 | class SendKeysAction(BaseModel): 65 | keys: str 66 | 67 | 68 | class UploadFileAction(BaseModel): 69 | index: int 70 | path: str 71 | 72 | 73 | class ExtractPageContentAction(BaseModel): 74 | value: str 75 | 76 | 77 | class NoParamsAction(BaseModel): 78 | """ 79 | Accepts absolutely anything in the incoming data 80 | and discards it, so the final parsed model is empty. 81 | """ 82 | 83 | model_config = ConfigDict(extra='ignore') 84 | # No fields defined - all inputs are ignored automatically 85 | 86 | 87 | class GetDropdownOptionsAction(BaseModel): 88 | index: int = Field(ge=1, description='index of the dropdown element to get the option values for') 89 | 90 | 91 | class SelectDropdownOptionAction(BaseModel): 92 | index: int = Field(ge=1, description='index of the dropdown element to select an option for') 93 | text: str = Field(description='the text or exact value of the option to select') 94 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker Setup for Browser-Use 2 | 3 | This directory contains the optimized Docker build system for browser-use, achieving < 30 second builds. 4 | 5 | ## Quick Start 6 | 7 | ```bash 8 | # Build base images (only needed once or when dependencies change) 9 | ./docker/build-base-images.sh 10 | 11 | # Build browser-use 12 | docker build -f Dockerfile.fast -t browseruse . 13 | 14 | # Or use the standard Dockerfile (slower but self-contained) 15 | docker build -t browseruse . 
16 | ``` 17 | 18 | ## Files 19 | 20 | - `Dockerfile` - Standard self-contained build (~2 min) 21 | - `Dockerfile.fast` - Fast build using pre-built base images (~30 sec) 22 | - `docker/` - Base image definitions and build script 23 | - `base-images/system/` - Python + minimal system deps 24 | - `base-images/chromium/` - Adds Chromium browser 25 | - `base-images/python-deps/` - Adds Python dependencies 26 | - `build-base-images.sh` - Script to build all base images 27 | 28 | ## Performance 29 | 30 | | Build Type | Time | 31 | |------------|------| 32 | | Standard Dockerfile | ~2 minutes | 33 | | Fast build (with base images) | ~30 seconds | 34 | | Rebuild after code change | ~16 seconds | 35 | -------------------------------------------------------------------------------- /docker/base-images/chromium/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_TAG=latest 2 | FROM browseruse/base-system:${BASE_TAG} 3 | 4 | WORKDIR /tmp 5 | COPY pyproject.toml ./ 6 | 7 | # Install chromium browser using temporary playwright installation 8 | RUN --mount=type=cache,target=/root/.cache,sharing=locked \ 9 | echo "Installing chromium browser via temporary playwright..." 
# Build and tag one base image.
#   $1 = image name (without registry prefix)
#   $2 = path to its Dockerfile
#   $3 = optional extra build args (e.g. "--build-arg BASE_TAG=latest")
build_image() {
    local name=$1
    local dockerfile=$2
    local build_args="${3:-}"

    echo "[INFO] Building ${name}..."

    local build_cmd="docker build"
    local tag_args="-t ${REGISTRY}/${name}:latest -t ${REGISTRY}/${name}:$(date +%Y%m%d)"

    # Use buildx for multi-platform builds or when pushing to a registry.
    # (An explicit `if` replaces the previous `[ ... ] && x || no-op` chain,
    # which was redundant and masked the test's exit status.)
    if [[ "$PLATFORMS" == *","* ]] || [ "$PUSH" = "true" ]; then
        build_cmd="docker buildx build --platform=$PLATFORMS"
        if [ "$PUSH" = "true" ]; then
            build_cmd="$build_cmd --push"
        fi
    fi

    # $build_cmd/$tag_args/$build_args stay unquoted on purpose: they hold
    # multiple whitespace-separated arguments that must word-split.
    $build_cmd $tag_args $build_args -f "$dockerfile" ../../..
}
4 | 5 | ### Development 6 | 7 | Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. To install, use the following command 8 | 9 | ``` 10 | npm i -g mintlify 11 | ``` 12 | 13 | Run the following command at the root of your documentation (where mint.json is) 14 | 15 | ``` 16 | mintlify dev 17 | ``` 18 | -------------------------------------------------------------------------------- /docs/cloud/v1/authentication.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Authentication" 3 | description: "Learn how to authenticate with the Browser Use Cloud API" 4 | icon: "lock" 5 | mode: "wide" 6 | --- 7 | 8 | The Browser Use Cloud API uses API keys to authenticate requests. You can obtain an API key from your [Browser Use Cloud dashboard](https://cloud.browser-use.com/settings/api-keys). 9 | 10 | ## API Keys 11 | 12 | All API requests must include your API key in the `Authorization` header: 13 | 14 | ```bash 15 | Authorization: Bearer YOUR_API_KEY 16 | ``` 17 | 18 | Keep your API keys secure and do not share them in publicly accessible areas such as GitHub, client-side code, or in your browser's developer tools. API keys should be stored securely in environment variables or a secure key management system. 19 | 20 | ## Example Request 21 | 22 | Here's an example of how to include your API key in a request using Python: 23 | 24 | ```python 25 | import requests 26 | 27 | API_KEY = 'your_api_key_here' 28 | BASE_URL = 'https://api.browser-use.com/api/v1' 29 | HEADERS = {'Authorization': f'Bearer {API_KEY}'} 30 | 31 | response = requests.get(f'{BASE_URL}/me', headers=HEADERS) 32 | print(response.json()) 33 | ``` 34 | 35 | ## Verifying Authentication 36 | 37 | You can verify that your API key is valid by making a request to the `/api/v1/me` endpoint. See the [Me endpoint documentation](/api-reference/api-v1/me) for more details. 
38 | 39 | ## API Key Security 40 | 41 | To ensure the security of your API keys: 42 | 43 | 1. **Never share your API key** in publicly accessible areas 44 | 2. **Rotate your API keys** periodically 45 | 3. **Use environment variables** to store API keys in your applications 46 | 4. **Implement proper access controls** for your API keys 47 | 5. **Monitor API key usage** for suspicious activity 48 | 49 | If you believe your API key has been compromised, you should immediately revoke it and generate a new one from your Browser Use Cloud dashboard. 50 | -------------------------------------------------------------------------------- /docs/cloud/v1/custom-sdk.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Cloud SDK" 3 | description: "Learn how to set up your own Browser Use Cloud SDK" 4 | icon: "code" 5 | mode: "wide" 6 | --- 7 | 8 | This guide walks you through setting up your own Browser Use Cloud SDK. 9 | 10 | ## Building your own client (OpenAPI) 11 | 12 | 13 | This approach is recommended **only** if you need to run simple tasks and 14 | **don’t require fine-grained control**. 15 | 16 | 17 | The best way to build your own client is to use our [OpenAPI specification](http://api.browser-use.com/openapi.json) to generate a type-safe client library. 18 | 19 | ### Python 20 | 21 | Use [openapi-python-client](https://github.com/openapi-generators/openapi-python-client) to generate a modern Python client: 22 | 23 | ```bash 24 | # Install the generator 25 | pipx install openapi-python-client --include-deps 26 | 27 | # Generate the client 28 | openapi-python-client generate --url http://api.browser-use.com/openapi.json 29 | ``` 30 | 31 | This will create a Python package with full type hints, modern dataclasses, and async support. 32 | 33 | ### TypeScript/JavaScript 34 | 35 | Use [OpenAPI TS](https://openapi-ts.dev/) library to generate a type safe TypeScript client for the Browser Use API. 
```ts
// client.ts

'use client'

import createClient from 'openapi-fetch'
import { paths } from '@/lib/api/v1'

export type Client = ReturnType<typeof createClient<paths>>

export const client = createClient<paths>({
  baseUrl: 'https://api.browser-use.com/',

  // NOTE: You can get your API key from https://cloud.browser-use.com/billing!
  headers: { Authorization: `Bearer ${apiKey}` },
})
```
**Task Step Cost**: Additional cost based on the specific model used for each step 12 | 13 | ## LLM Model Step Pricing 14 | 15 | The following table shows the total cost per step for each available LLM model: 16 | 17 | | Model | Cost per Step | 18 | | -------------------------------- | ------------- | 19 | | GPT-4o | $0.03 | 20 | | GPT-4o mini | $0.01 | 21 | | GPT-4.1 | $0.03 | 22 | | GPT-4.1 mini | $0.01 | 23 | | O4 mini | $0.02 | 24 | | O3 | $0.03 | 25 | | Gemini 2.0 Flash | $0.01 | 26 | | Gemini 2.0 Flash Lite | $0.01 | 27 | | Gemini 2.5 Flash Preview (04/17) | $0.01 | 28 | | Gemini 2.5 Flash | $0.01 | 29 | | Gemini 2.5 Pro | $0.03 | 30 | | Claude 3.7 Sonnet (2025-02-19) | $0.03 | 31 | | Claude Sonnet 4 (2025-05-14) | $0.03 | 32 | | Llama 4 Maverick 17B Instruct | $0.01 | 33 | 34 | ## Example Cost Calculation 35 | 36 | For example, using GPT-4.1 for a 10 step task: 37 | 38 | - Task initialization: $0.01 39 | - 10 steps x \$0.03 per step = \$0.30 40 | - **Total cost: $0.31** 41 | -------------------------------------------------------------------------------- /docs/cloud/v1/quickstart.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quickstart" 3 | description: "Learn how to get started with the Browser Use Cloud API" 4 | icon: "cloud" 5 | mode: "wide" 6 | --- 7 | 8 | Browser Use Cloud Banner 13 | Browser Use Cloud Banner 18 | 19 | 20 | You need an active subscription and an API key from 21 | [cloud.browser-use.com/billing](https://cloud.browser-use.com/billing). For 22 | detailed pricing information, see our [pricing page](/cloud/v1/pricing). 23 | 24 | 25 | ## Creating Your First Agent 26 | 27 | To understand how the API works visit the [Run Task](/api-reference/api-v1/run-task?playground=open) page. 
28 | 29 | ```bash 30 | curl -X POST https://api.browser-use.com/api/v1/run-task \ 31 | -H "Authorization: Bearer your_api_key_here" \ 32 | -H "Content-Type: application/json" \ 33 | -d '{ 34 | "task": "Go to google.com and search for Browser Use" 35 | }' 36 | ``` 37 | 38 | `run-task` API returns a task ID, which you can query to get the task status, live preview URL, and the result output. 39 | 40 | 41 | To play around with the API, you can use the [Browser Use Cloud 42 | Playground](https://cloud.browser-use.com/playground). 43 | 44 | 45 | For the full implementation guide see the [Implementation](/cloud/v1/implementation) page. 46 | -------------------------------------------------------------------------------- /docs/customize/agent/basics.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Basics" 3 | description: "" 4 | icon: "play" 5 | mode: "wide" 6 | --- 7 | 8 | 9 | ```python 10 | from browser_use import Agent, ChatOpenAI 11 | 12 | agent = Agent( 13 | task="Search for latest news about AI", 14 | llm=ChatOpenAI(model="gpt-4.1-mini"), 15 | ) 16 | 17 | async def main(): 18 | history = await agent.run(max_steps=100) 19 | ``` 20 | 21 | - `task`: The task you want to automate. 22 | - `llm`: Your favorite LLM. See Supported Models. 23 | 24 | 25 | The agent is executed using the async `run()` method: 26 | 27 | - `max_steps` (default: `100`): Maximum number of steps an agent can take. 
28 | -------------------------------------------------------------------------------- /docs/customize/agent/output-format.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Output Format" 3 | description: "" 4 | icon: "arrow-right-to-bracket" 5 | mode: "wide" 6 | --- 7 | 8 | ## Agent History 9 | 10 | The `run()` method returns an `AgentHistoryList` object with the complete execution history: 11 | 12 | ```python 13 | history = await agent.run() 14 | 15 | # Access useful information 16 | history.urls() # List of visited URLs 17 | history.screenshot_paths() # List of screenshot paths 18 | history.screenshots() # List of screenshots as base64 strings 19 | history.action_names() # Names of executed actions 20 | history.extracted_content() # List of extracted content from all actions 21 | history.errors() # List of errors (with None for steps without errors) 22 | history.model_actions() # All actions with their parameters 23 | history.model_outputs() # All model outputs from history 24 | history.last_action() # Last action in history 25 | 26 | # Analysis methods 27 | history.final_result() # Get the final extracted content (last step) 28 | history.is_done() # Check if agent completed successfully 29 | history.is_successful() # Check if agent completed successfully (returns None if not done) 30 | history.has_errors() # Check if any errors occurred 31 | history.model_thoughts() # Get the agent's reasoning process (AgentBrain objects) 32 | history.action_results() # Get all ActionResult objects from history 33 | history.action_history() # Get truncated action history with essential fields 34 | history.number_of_steps() # Get the number of steps in the history 35 | history.total_duration_seconds() # Get total duration of all steps in seconds 36 | 37 | # Structured output (when using output_model_schema) 38 | history.structured_output # Property that returns parsed structured output 39 | ``` 40 | 41 | See all helper 
methods in the [AgentHistoryList source code](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/views.py#L301). 42 | 43 | ## Structured Output 44 | 45 | For structured output, use the `output_model_schema` parameter with a Pydantic model. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py). 46 | -------------------------------------------------------------------------------- /docs/customize/browser/basics.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Basics" 3 | description: "" 4 | icon: "play" 5 | --- 6 | 7 | 8 | --- 9 | 10 | ```python 11 | from browser_use import Agent, Browser, ChatOpenAI 12 | 13 | browser = Browser( 14 | headless=False, # Show browser window 15 | window_size={'width': 1000, 'height': 700}, # Set window size 16 | ) 17 | 18 | agent = Agent( 19 | task='Search for Browser Use', 20 | browser=browser, 21 | llm=ChatOpenAI(model='gpt-4.1-mini'), 22 | ) 23 | 24 | 25 | async def main(): 26 | await agent.run() 27 | ``` 28 | -------------------------------------------------------------------------------- /docs/customize/browser/real-browser.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Real Browser" 3 | description: "" 4 | icon: "arrow-right-to-bracket" 5 | --- 6 | 7 | Connect your existing Chrome browser to preserve authentication. 
8 | 9 | ## Basic Example 10 | 11 | ```python 12 | from browser_use import Agent, Browser, ChatOpenAI 13 | 14 | # Connect to your existing Chrome browser 15 | browser = Browser( 16 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 17 | user_data_dir='~/Library/Application Support/Google/Chrome', 18 | profile_directory='Default', 19 | ) 20 | 21 | agent = Agent( 22 | task='Visit https://duckduckgo.com and search for "browser-use founders"', 23 | browser=browser, 24 | llm=ChatOpenAI(model='gpt-4.1-mini'), 25 | ) 26 | async def main(): 27 | await agent.run() 28 | ``` 29 | 30 | > **Note:** You need to fully close chrome before running this example. Also, Google blocks this approach currently so we use DuckDuckGo instead. 31 | 32 | 33 | 34 | 35 | ## How it Works 36 | 37 | 1. **`executable_path`** - Path to your Chrome installation 38 | 2. **`user_data_dir`** - Your Chrome profile folder (keeps cookies, extensions, bookmarks) 39 | 3. **`profile_directory`** - Specific profile name (Default, Profile 1, etc.) 
40 | 41 | 42 | ## Platform Paths 43 | 44 | ```python 45 | # macOS 46 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome' 47 | user_data_dir='~/Library/Application Support/Google/Chrome' 48 | 49 | # Windows 50 | executable_path='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' 51 | user_data_dir='%LOCALAPPDATA%\\Google\\Chrome\\User Data' 52 | 53 | # Linux 54 | executable_path='/usr/bin/google-chrome' 55 | user_data_dir='~/.config/google-chrome' 56 | ``` 57 | -------------------------------------------------------------------------------- /docs/customize/browser/remote.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Remote Browser" 3 | description: "" 4 | icon: "cloud" 5 | mode: "wide" 6 | --- 7 | 8 | ```python 9 | from browser_use import Agent, Browser, ChatOpenAI 10 | 11 | # Connect to remote browser 12 | browser = Browser( 13 | cdp_url='http://remote-server:9222' 14 | ) 15 | 16 | 17 | agent = Agent( 18 | task="Your task here", 19 | llm=ChatOpenAI(model='gpt-4.1-mini'), 20 | browser=browser, 21 | ) 22 | ``` 23 | 24 | 25 | ## Get a CDP URL 26 | ### Cloud Browser 27 | Get a CDP URL from your favorite browser provider like AnchorBrowser, HyperBrowser, BrowserBase, Steel.dev, etc. 
28 | 29 | 30 | 31 | 32 | ### Proxy Connection 33 | 34 | ```python 35 | 36 | from browser_use import Agent, Browser, ChatOpenAI 37 | from browser_use.browser import ProxySettings 38 | 39 | browser = Browser( 40 | headless=False, 41 | proxy=ProxySettings( 42 | server="http://proxy-server:8080", 43 | username="proxy-user", 44 | password="proxy-pass" 45 | ), 46 | cdp_url="http://remote-server:9222" 47 | ) 48 | 49 | 50 | agent = Agent( 51 | task="Your task here", 52 | llm=ChatOpenAI(model='gpt-4.1-mini'), 53 | browser=browser, 54 | ) 55 | ``` 56 | -------------------------------------------------------------------------------- /docs/customize/examples/chain-agents.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Chain Agents" 3 | description: "Chain multiple tasks together with the same agent and browser session." 4 | icon: "link" 5 | mode: "wide" 6 | --- 7 | 8 | ## Chain Agent Tasks 9 | 10 | Keep your browser session alive and chain multiple tasks together. Perfect for conversational workflows or multi-step processes. 11 | 12 | ```python 13 | import asyncio 14 | from dotenv import load_dotenv 15 | load_dotenv() 16 | 17 | from browser_use import Agent, BrowserProfile 18 | 19 | profile = BrowserProfile(keep_alive=True) 20 | 21 | async def main(): 22 | agent = Agent(task="Go to reddit.com", browser_profile=profile) 23 | await agent.run(max_steps=1) 24 | 25 | while True: 26 | user_response = input('\n👤 New task or "q" to quit: ') 27 | if user_response.lower() == 'q': 28 | break 29 | agent.add_new_task(f'New task: {user_response}') 30 | await agent.run() 31 | 32 | if __name__ == '__main__': 33 | asyncio.run(main()) 34 | ``` 35 | 36 | ## How It Works 37 | 38 | 1. **Persistent Browser**: `BrowserProfile(keep_alive=True)` prevents browser from closing between tasks 39 | 2. **Task Chaining**: Use `agent.add_new_task()` to add follow-up tasks 40 | 3. 
**Context Preservation**: Agent maintains memory and browser state across tasks 41 | 4. **Interactive Flow**: Perfect for conversational interfaces or complex workflows 42 | 43 | 44 | The browser session remains active throughout the entire chain, preserving all cookies, local storage, and page state. 45 | 46 | -------------------------------------------------------------------------------- /docs/customize/examples/fast-agent.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Fast Agent" 3 | description: "Optimize agent performance for maximum speed and efficiency." 4 | icon: "bolt" 5 | mode: "wide" 6 | --- 7 | 8 | ```python 9 | import asyncio 10 | from dotenv import load_dotenv 11 | load_dotenv() 12 | 13 | from browser_use import Agent, BrowserProfile 14 | 15 | # Speed optimization instructions for the model 16 | SPEED_OPTIMIZATION_PROMPT = """ 17 | Speed optimization instructions: 18 | - Be extremely concise and direct in your responses 19 | - Get to the goal as quickly as possible 20 | - Use multi-action sequences whenever possible to reduce steps 21 | """ 22 | 23 | 24 | async def main(): 25 | # 1. Use fast LLM - Llama 4 on Groq for ultra-fast inference 26 | from browser_use import ChatGroq 27 | 28 | llm = ChatGroq( 29 | model='meta-llama/llama-4-maverick-17b-128e-instruct', 30 | temperature=0.0, 31 | ) 32 | # from browser_use import ChatGoogle 33 | 34 | # llm = ChatGoogle(model='gemini-2.5-flash') 35 | 36 | # 2. Create speed-optimized browser profile 37 | browser_profile = BrowserProfile( 38 | minimum_wait_page_load_time=0.1, 39 | wait_between_actions=0.1, 40 | headless=False, 41 | ) 42 | 43 | # 3. Define a speed-focused task 44 | task = """ 45 | 1. Go to reddit https://www.reddit.com/search/?q=browser+agent&type=communities 46 | 2. Click directly on the first 5 communities to open each in new tabs 47 | 3. Find out what the latest post is about, and switch directly to the next tab 48 | 4. 
Return the latest post summary for each page 49 | """ 50 | 51 | # 4. Create agent with all speed optimizations 52 | agent = Agent( 53 | task=task, 54 | llm=llm, 55 | flash_mode=True, # Disables thinking in the LLM output for maximum speed 56 | browser_profile=browser_profile, 57 | extend_system_message=SPEED_OPTIMIZATION_PROMPT, 58 | ) 59 | 60 | await agent.run() 61 | 62 | 63 | if __name__ == '__main__': 64 | asyncio.run(main()) 65 | ``` 66 | 67 | ## Speed Optimization Techniques 68 | 69 | ### 1. Fast LLM Models 70 | ```python 71 | # Groq - Ultra-fast inference 72 | from browser_use import ChatGroq 73 | llm = ChatGroq(model='meta-llama/llama-4-maverick-17b-128e-instruct') 74 | 75 | # Google Gemini Flash - Optimized for speed 76 | from browser_use import ChatGoogle 77 | llm = ChatGoogle(model='gemini-2.5-flash') 78 | ``` 79 | 80 | ### 2. Browser Optimizations 81 | ```python 82 | browser_profile = BrowserProfile( 83 | minimum_wait_page_load_time=0.1, # Reduce wait time 84 | wait_between_actions=0.1, # Faster action execution 85 | headless=True, # No GUI overhead 86 | ) 87 | ``` 88 | 89 | ### 3. Agent Optimizations 90 | ```python 91 | agent = Agent( 92 | task=task, 93 | llm=llm, 94 | flash_mode=True, # Skip LLM thinking process 95 | extend_system_message=SPEED_PROMPT, # Optimize LLM behavior 96 | ) 97 | ``` 98 | -------------------------------------------------------------------------------- /docs/customize/examples/more-examples.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "More Examples" 3 | description: "Explore additional examples and use cases on GitHub." 4 | icon: "arrow-up-right-from-square" 5 | mode: "wide" 6 | --- 7 | 8 | ### 🔗 Browse All Examples 9 | 10 | **[View Complete Examples Directory →](https://github.com/browser-use/browser-use/tree/main/examples)** 11 | 12 | ### 🤝 Contributing Examples 13 | 14 | Have a great use case? 
**[Submit a pull request](https://github.com/browser-use/browser-use/pulls)** with your example! 15 | -------------------------------------------------------------------------------- /docs/customize/examples/parallel-browser.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Parallel Agents" 3 | description: "Run multiple agents in parallel with separate browser instances" 4 | icon: "copy" 5 | --- 6 | 7 | ```python 8 | import asyncio 9 | from browser_use import Agent, Browser, ChatOpenAI 10 | 11 | async def main(): 12 | # Create 3 separate browser instances 13 | browsers = [ 14 | Browser( 15 | user_data_dir=f'./temp-profile-{i}', 16 | headless=False, 17 | ) 18 | for i in range(3) 19 | ] 20 | 21 | # Create 3 agents with different tasks 22 | agents = [ 23 | Agent( 24 | task='Search for "browser automation" on Google', 25 | browser=browsers[0], 26 | llm=ChatOpenAI(model='gpt-4.1-mini'), 27 | ), 28 | Agent( 29 | task='Search for "AI agents" on DuckDuckGo', 30 | browser=browsers[1], 31 | llm=ChatOpenAI(model='gpt-4.1-mini'), 32 | ), 33 | Agent( 34 | task='Visit Wikipedia and search for "web scraping"', 35 | browser=browsers[2], 36 | llm=ChatOpenAI(model='gpt-4.1-mini'), 37 | ), 38 | ] 39 | 40 | # Run all agents in parallel 41 | tasks = [agent.run() for agent in agents] 42 | results = await asyncio.gather(*tasks, return_exceptions=True) 43 | 44 | print('🎉 All agents completed!') 45 | ``` 46 | 47 | > **Note:** This is experimental, and agents might conflict with each other. 48 | -------------------------------------------------------------------------------- /docs/customize/examples/prompting-guide.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Prompting Guide" 3 | description: "Tips and tricks " 4 | icon: "lightbulb" 5 | --- 6 | 7 | Prompting can drastically improve performance and solve existing limitations of the library. 8 | 9 | ### 1. 
Be Specific vs Open-Ended 10 | 11 | **✅ Specific (Recommended)** 12 | ```python 13 | task = """ 14 | 1. Go to https://quotes.toscrape.com/ 15 | 2. Use extract_structured_data action with the query "first 3 quotes with their authors" 16 | 3. Save results to quotes.csv using write_file action 17 | 4. Do a google search for the first quote and find when it was written 18 | """ 19 | ``` 20 | 21 | **❌ Open-Ended** 22 | ```python 23 | task = "Go to web and make money" 24 | ``` 25 | 26 | ### 2. Name Actions Directly 27 | 28 | When you know exactly what the agent should do, reference actions by name: 29 | 30 | ```python 31 | task = """ 32 | 1. Use search_google action to find "Python tutorials" 33 | 2. Use click_element_by_index to open first result in a new tab 34 | 3. Use scroll action to scroll down 2 pages 35 | 4. Use extract_structured_data to extract the names of the first 5 items 36 | 5. Wait for 2 seconds if the page is not loaded, refresh it and wait 10 sec 37 | 6. Use send_keys action with "Tab Tab ArrowDown Enter" 38 | """ 39 | ``` 40 | 41 | See [Available Tools](/customize/tools/available) for the complete list of actions. 42 | 43 | 44 | ### 3. Handle interaction problems via keyboard navigation 45 | 46 | Sometimes buttons can't be clicked (you found a bug in the library - open an issue). 47 | Good news - often you can work around it with keyboard navigation! 48 | 49 | ```python 50 | task = """ 51 | If the submit button cannot be clicked: 52 | 1. Use send_keys action with "Tab Tab Enter" to navigate and activate 53 | 2. Or use send_keys with "ArrowDown ArrowDown Enter" for form submission 54 | """ 55 | ``` 56 | 57 | 58 | 59 | 60 | ### 4. Custom Actions Integration 61 | 62 | ```python 63 | # When you have custom actions 64 | @controller.action("Get 2FA code from authenticator app") 65 | async def get_2fa_code(): 66 | # Your implementation 67 | pass 68 | 69 | task = """ 70 | Login with 2FA: 71 | 1. Enter username/password 72 | 2. 
When prompted for 2FA, use get_2fa_code action 73 | 3. NEVER try to extract 2FA codes from the page manually 74 | 4. ALWAYS use the get_2fa_code action for authentication codes 75 | """ 76 | ``` 77 | 78 | ### 5. Error Recovery 79 | 80 | ```python 81 | task = """ 82 | Robust data extraction: 83 | 1. Go to openai.com to find their CEO 84 | 2. If navigation fails due to anti-bot protection: 85 | - Use google search to find the CEO 86 | 3. If page times out, use go_back and try alternative approach 87 | """ 88 | ``` 89 | 90 | 91 | 92 | The key to effective prompting is being specific about actions. 93 | -------------------------------------------------------------------------------- /docs/customize/examples/secure.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Secure Setup" 3 | description: "Azure OpenAI with data privacy and security configuration." 4 | icon: "shield-check" 5 | mode: "wide" 6 | --- 7 | 8 | ## Secure Setup with Azure OpenAI 9 | 10 | Enterprise-grade security with Azure OpenAI, data privacy protection, and restricted browser access. 
11 | 12 | ```python 13 | import asyncio 14 | import os 15 | from dotenv import load_dotenv 16 | load_dotenv() 17 | os.environ['ANONYMIZED_TELEMETRY'] = 'false' 18 | from browser_use import Agent, BrowserProfile, ChatAzureOpenAI 19 | 20 | # Azure OpenAI configuration 21 | api_key = os.getenv('AZURE_OPENAI_KEY') 22 | azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') 23 | llm = ChatAzureOpenAI(model='gpt-4.1-mini', api_key=api_key, azure_endpoint=azure_endpoint) 24 | 25 | # Secure browser configuration 26 | browser_profile = BrowserProfile( 27 | allowed_domains=['*google.com', 'browser-use.com'], 28 | enable_default_extensions=False 29 | ) 30 | 31 | # Sensitive data filtering 32 | sensitive_data = {'company_name': 'browser-use'} 33 | 34 | # Create secure agent 35 | agent = Agent( 36 | task='Find the founders of the sensitive company_name', 37 | llm=llm, 38 | browser_profile=browser_profile, 39 | sensitive_data=sensitive_data 40 | ) 41 | 42 | async def main(): 43 | await agent.run(max_steps=10) 44 | 45 | asyncio.run(main()) 46 | ``` 47 | 48 | ## Security Features 49 | 50 | **Azure OpenAI:** 51 | - NOT used to train OpenAI models 52 | - NOT shared with other customers 53 | - Hosted entirely within Azure 54 | - 30-day retention (or zero with Limited Access Program) 55 | 56 | **Browser Security:** 57 | - `allowed_domains`: Restrict navigation to trusted sites 58 | - `enable_default_extensions=False`: Disable potentially dangerous extensions 59 | - `sensitive_data`: Filter sensitive information from LLM input 60 | 61 | 62 | 63 | 64 | For enterprise deployments contact support@browser-use.com. 65 | 66 | -------------------------------------------------------------------------------- /docs/customize/examples/sensitive-data.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sensitive Data" 3 | description: "Handle secret information securely and avoid sending PII & passwords to the LLM." 
4 | icon: "shield" 5 | mode: "wide" 6 | --- 7 | 8 | 9 | ```python 10 | import os 11 | from browser_use import Agent, Browser, ChatOpenAI 12 | os.environ['ANONYMIZED_TELEMETRY'] = "false" 13 | 14 | 15 | company_credentials = {'x_user': 'your-real-username@email.com', 'x_pass': 'your-real-password123'} 16 | 17 | # Option 1: Secrets available for all websites 18 | sensitive_data = company_credentials 19 | 20 | # Option 2: Secrets per domain with regex 21 | # sensitive_data: dict[str, str | dict[str, str]] = { 22 | # 'https://*.example-staging.com': company_credentials, 23 | # 'http*://test.example.com': company_credentials, 24 | # 'https://example.com': company_credentials, 25 | # 'https://*.google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'}, 26 | # } 27 | 28 | 29 | agent = Agent( 30 | task='Log into example.com with username x_user and password x_pass', 31 | sensitive_data=sensitive_data, 32 | use_vision=False, # Disable vision to prevent LLM seeing sensitive data in screenshots 33 | llm=ChatOpenAI(model='gpt-4.1-mini'), 34 | ) 35 | async def main(): 36 | await agent.run() 37 | ``` 38 | 39 | ## How it Works 40 | 1. **Text Filtering**: The LLM only sees placeholders (`x_user`, `x_pass`), we filter your sensitive data from the input text. 41 | 2. 
**DOM Actions**: Real values are injected directly into form fields after the LLM call 42 | 43 | ## Best Practices 44 | - Use `Browser(allowed_domains=[...])` to restrict navigation 45 | - Set `use_vision=False` to prevent screenshot leaks 46 | - Use `storage_state='./auth.json'` for login cookies instead of passwords when possible 47 | -------------------------------------------------------------------------------- /docs/customize/tools/add.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Add Tools" 3 | description: "" 4 | icon: "plus" 5 | mode: "wide" 6 | --- 7 | 8 | 9 | Examples: 10 | - deterministic clicks 11 | - file handling 12 | - calling APIs 13 | - human-in-the-loop 14 | - browser interactions 15 | - calling LLMs 16 | - get 2fa codes 17 | - send emails 18 | - ... 19 | 20 | Simply add `@tools.action(...)` to your function. 21 | 22 | ```python 23 | from browser_use import Tools, Agent 24 | 25 | tools = Tools() 26 | 27 | @tools.action(description='Ask human for help with a question') 28 | def ask_human(question: str) -> ActionResult: 29 | answer = input(f'{question} > ') 30 | return f'The human responded with: {answer}' 31 | ``` 32 | 33 | ```python 34 | agent = Agent(task='...', llm=llm, tools=tools) 35 | ``` 36 | 37 | - **`description`** *(required)* - What the tool does, the LLM uses this to decide when to call it. 38 | - **`allowed_domains`** - List of domains where tool can run (e.g. `['*.example.com']`), defaults to all domains 39 | 40 | The Agent fills your function parameters based on their names, type hints, & defaults. 41 | 42 | 43 | ## Available Objects 44 | 45 | Your function has access to these objects: 46 | 47 | - **`browser_session: BrowserSession`** - Current browser session for CDP access 48 | - **`cdp_client`** - Direct Chrome DevTools Protocol client 49 | - **`page_extraction_llm: BaseChatModel`** - The LLM you pass into agent. This can be used to do a custom llm call here. 
50 | - **`file_system: FileSystem`** - File system access 51 | - **`available_file_paths: list[str]`** - Available files for upload/processing 52 | - **`has_sensitive_data: bool`** - Whether action contains sensitive data 53 | 54 | ## Pydantic Input 55 | 56 | You can use Pydantic for the tool parameters: 57 | 58 | ```python 59 | from pydantic import BaseModel 60 | 61 | class Cars(BaseModel): 62 | name: str = Field(description='The name of the car, e.g. "Toyota Camry"') 63 | price: int = Field(description='The price of the car as int in USD, e.g. 25000') 64 | 65 | @tools.action(description='Save cars to file') 66 | def save_cars(cars: list[Cars]) -> str: 67 | with open('cars.json', 'w') as f: 68 | json.dump(cars, f) 69 | return f'Saved {len(cars)} cars to file' 70 | 71 | task = "find cars and save them to file" 72 | ``` 73 | ## Domain Restrictions 74 | 75 | Limit tools to specific domains: 76 | 77 | ```python 78 | @tools.action( 79 | description='Fill out banking forms', 80 | allowed_domains=['https://mybank.com'] 81 | ) 82 | def fill_bank_form(account_number: str) -> str: 83 | # Only works on mybank.com 84 | return f'Filled form for account {account_number}' 85 | ``` 86 | -------------------------------------------------------------------------------- /docs/customize/tools/available.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Available Tools" 3 | description: "Here is the [source code](https://github.com/browser-use/browser-use/blob/main/browser_use/tools/service.py) for the default tools:" 4 | icon: "list" 5 | mode: "wide" 6 | --- 7 | 8 | 9 | 10 | 11 | ### Navigation & Browser Control 12 | - **`search_google`** - Search queries in Google 13 | - **`go_to_url`** - Navigate to URLs 14 | - **`go_back`** - Go back in browser history 15 | - **`wait`** - Wait for specified seconds 16 | 17 | ### Page Interaction 18 | - **`click_element_by_index`** - Click elements by their index 19 | - **`input_text`** - Input 
text into form fields 20 | - **`upload_file_to_element`** - Upload files to file inputs 21 | - **`scroll`** - Scroll the page up/down 22 | - **`scroll_to_text`** - Scroll to specific text on page 23 | - **`send_keys`** - Send special keys (Enter, Escape, etc.) 24 | 25 | ### Tab Management 26 | - **`switch_tab`** - Switch between browser tabs 27 | - **`close_tab`** - Close browser tabs 28 | 29 | ### Content Extraction 30 | - **`extract_structured_data`** - Extract data from webpages using LLM 31 | 32 | ### Form Controls 33 | - **`get_dropdown_options`** - Get dropdown option values 34 | - **`select_dropdown_option`** - Select dropdown options 35 | 36 | ### File Operations 37 | - **`write_file`** - Write content to files 38 | - **`read_file`** - Read file contents 39 | - **`replace_file_str`** - Replace text in files 40 | 41 | ### Task Completion 42 | - **`done`** - Complete the task (always available) 43 | -------------------------------------------------------------------------------- /docs/customize/tools/basics.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Basics" 3 | description: "Tools are the functions that the agent has to interact with the world." 
4 | icon: "play" 5 | mode: "wide" 6 | --- 7 | 8 | 9 | ## Quick Example 10 | 11 | 12 | ```python 13 | from browser_use import Tools, ActionResult 14 | 15 | tools = Tools() 16 | 17 | @tools.action('Ask human for help with a question') 18 | def ask_human(question: str) -> ActionResult: 19 | answer = input(f'{question} > ') 20 | return f'The human responded with: {answer}' 21 | 22 | agent = Agent( 23 | task='Ask human for help', 24 | llm=llm, 25 | tools=tools, 26 | ) 27 | ``` 28 | -------------------------------------------------------------------------------- /docs/customize/tools/remove.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Remove Tools" 3 | description: "You can exclude default tools:" 4 | icon: "minus" 5 | mode: "wide" 6 | --- 7 | 8 | 9 | ```python 10 | from browser_use import Tools 11 | 12 | tools = Tools(exclude_actions=['search_google', 'wait']) 13 | agent = Agent(task='...', llm=llm, tools=tools) 14 | ``` 15 | -------------------------------------------------------------------------------- /docs/customize/tools/response.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tool Response" 3 | description: "" 4 | icon: "arrow-turn-down-left" 5 | mode: "wide" 6 | --- 7 | 8 | Tools return results using `ActionResult` or simple strings. 9 | 10 | ## Return Types 11 | 12 | ```python 13 | @tools.action('My tool') 14 | def my_tool() -> str: 15 | return "Task completed successfully" 16 | 17 | @tools.action('Advanced tool') 18 | def advanced_tool() -> ActionResult: 19 | return ActionResult( 20 | extracted_content="Main result", 21 | long_term_memory="Remember this info", 22 | error="Something went wrong", 23 | is_done=True, 24 | success=True, 25 | attachments=["file.pdf"], 26 | ) 27 | ``` 28 | 29 | ## ActionResult Properties 30 | 31 | - `extracted_content` (default: `None`) - Main result passed to LLM, this is equivalent to returning a string. 
32 | - `include_extracted_content_only_once` (default: `False`) - Set to `True` for large content to include it only once in the LLM input. 33 | - `long_term_memory` (default: `None`) - This is always included in the LLM input for all future steps. 34 | - `error` (default: `None`) - Error message, we catch exceptions and set this automatically. This is always included in the LLM input. 35 | - `is_done` (default: `False`) - Tool completes entire task 36 | - `success` (default: `None`) - Task success (only valid with `is_done=True`) 37 | - `attachments` (default: `None`) - Files to show user 38 | - `metadata` (default: `None`) - Debug/observability data 39 | 40 | ## Why `extracted_content` and `long_term_memory`? 41 | With this you control the context for the LLM. 42 | 43 | ### 1. Include short content always in context 44 | ```python 45 | def simple_tool() -> str: 46 | return "Hello, world!" # Keep in context for all future steps 47 | ``` 48 | 49 | ### 2. Show long content once, remember subset in context 50 | ```python 51 | return ActionResult( 52 | extracted_content="[500 lines of product data...]", # Shows to LLM once 53 | include_extracted_content_only_once=True, # Never show full output again 54 | long_term_memory="Found 50 products" # Only this in future steps 55 | ) 56 | ``` 57 | We save the full `extracted_content` to files which the LLM can read in future steps. 58 | 59 | ### 3. Dont show long content, remember subset in context 60 | ```python 61 | return ActionResult( 62 | extracted_content="[500 lines of product data...]", # The LLM never sees this because `long_term_memory` overrides it and `include_extracted_content_only_once` is not used 63 | long_term_memory="Saved user's favorite products", # This is shown to the LLM in future steps 64 | ) 65 | ``` 66 | 67 | ## Terminating the Agent 68 | 69 | Set `is_done=True` to stop the agent completely. 
Use when your tool finishes the entire task: 70 | 71 | ```python 72 | @tools.action(description='Complete the task') 73 | def finish_task() -> ActionResult: 74 | return ActionResult( 75 | extracted_content="Task completed!", 76 | is_done=True, # Stops the agent 77 | success=True # Task succeeded 78 | ) 79 | ``` 80 | -------------------------------------------------------------------------------- /docs/development/get-help.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Get Help" 3 | description: "More than 20k developers help each other" 4 | icon: "circle-question" 5 | mode: "wide" 6 | --- 7 | 8 | 9 | 1. Check our [GitHub Issues](https://github.com/browser-use/browser-use/issues) 10 | 2. Ask in our [Discord community](https://link.browser-use.com/discord) 11 | 3. Get support for your enterprise with support@browser-use.com 12 | -------------------------------------------------------------------------------- /docs/development/monitoring/observability.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Observability" 3 | description: "Trace Browser Use's agent execution steps and browser sessions" 4 | icon: "eye" 5 | mode: "wide" 6 | --- 7 | 8 | ## Overview 9 | 10 | Browser Use has a native integration with [Laminar](https://lmnr.ai) - open-source platform for tracing, evals and labeling of AI agents. 11 | Read more about Laminar in the [Laminar docs](https://docs.lmnr.ai). 12 | 13 | ## Setup 14 | 15 | 16 | 17 | Register on [Laminar Cloud](https://lmnr.ai) and get the key from your project settings. 18 | Set the `LMNR_PROJECT_API_KEY` environment variable. 19 | ```bash 20 | pip install 'lmnr[all]' 21 | export LMNR_PROJECT_API_KEY= 22 | ``` 23 | 24 | ## Usage 25 | 26 | Then, you simply initialize the Laminar at the top of your project and both Browser Use and session recordings will be automatically traced. 
27 | 28 | ```python {5-8} 29 | from browser_use import Agent, ChatOpenAI 30 | import asyncio 31 | 32 | from lmnr import Laminar, Instruments 33 | # this line auto-instruments Browser Use and any browser you use (local or remote) 34 | Laminar.initialize(project_api_key="...") 35 | 36 | async def main(): 37 | agent = Agent( 38 | task="open google, search Laminar AI", 39 | llm=ChatOpenAI(model="gpt-4.1-mini"), 40 | ) 41 | await agent.run() 42 | 43 | asyncio.run(main()) 44 | ``` 45 | 46 | ## Viewing Traces 47 | 48 | You can view traces in the Laminar UI by going to the traces tab in your project. 49 | When you select a trace, you can see both the browser session recording and the agent execution steps. 50 | 51 | Timeline of the browser session is synced with the agent execution steps, timeline highlights indicate the agent's current step synced with the browser session. 52 | In the trace view, you can also see the agent's current step, the tool it's using, and the tool's input and output. Tools are highlighted in the timeline with a yellow color. 53 | 54 | Laminar 55 | 56 | ## Laminar 57 | 58 | To learn more about tracing and evaluating your browser agents, check out the [Laminar docs](https://docs.lmnr.ai). 59 | -------------------------------------------------------------------------------- /docs/development/monitoring/telemetry.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Telemetry" 3 | description: "Understanding Browser Use's telemetry" 4 | icon: "chart-mixed" 5 | mode: "wide" 6 | --- 7 | 8 | ## Overview 9 | 10 | Browser Use is free under the MIT license. To help us continue improving the library, we collect anonymous usage data with [PostHog](https://posthog.com) . This information helps us understand how the library is used, fix bugs more quickly, and prioritize new features. 
11 | 12 | 13 | ## Opting Out 14 | 15 | You can disable telemetry by setting the environment variable: 16 | 17 | ```bash .env 18 | ANONYMIZED_TELEMETRY=false 19 | ``` 20 | 21 | Or in your Python code: 22 | 23 | ```python 24 | import os 25 | os.environ["ANONYMIZED_TELEMETRY"] = "false" 26 | ``` 27 | 28 | 29 | Even when enabled, telemetry has zero impact on the library's performance. Code is available in [Telemetry 30 | Service](https://github.com/browser-use/browser-use/tree/main/browser_use/telemetry). 31 | 32 | -------------------------------------------------------------------------------- /docs/development/roadmap.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Roadmap" 3 | description: "Future plans and upcoming features for Browser Use" 4 | icon: "road" 5 | mode: "wide" 6 | --- 7 | 8 | Big things coming soon! 9 | -------------------------------------------------------------------------------- /docs/development/setup/contribution-guide.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Contribution Guide" 3 | description: "" 4 | icon: "handshake" 5 | mode: "wide" 6 | --- 7 | 8 | ## Mission 9 | 10 | - Make developers happy 11 | - Do more clicks than human 12 | - Tell your computer what to do, and it gets it done. 13 | - Make agents faster and more reliable. 14 | 15 | 16 | ## What to work on? 17 | 18 | - This space is moving fast. We have 10 ideas daily. Let's exchange some. 19 | - Browse our [GitHub Issues](https://github.com/browser-use/browser-use/issues) 20 | - Check out our most active issues on [Discord](https://discord.gg/zXJJHtJf3k) 21 | - Get inspiration in [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel 22 | 23 | 24 | ## What makes a great PR? 25 | 26 | 1. Why do we need this PR? 27 | 2. Include a demo screenshot/gif 28 | 3. Make sure the PR passes all CI tests 29 | 4. 
Keep your PR focused on a single feature 30 | 31 | 32 | ## How? 33 | 1. Fork the repository 34 | 2. Create a new branch for your feature 35 | 3. Submit a PR 36 | 37 | We are overwhelmed with Issues. Feel free to bump your issues/PRs with comments periodically if you need faster feedback. 38 | -------------------------------------------------------------------------------- /docs/development/setup/local-setup.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Local Setup" 3 | description: "We're excited to have you join our community of contributors. " 4 | icon: "laptop-code" 5 | mode: "wide" 6 | --- 7 | 8 | ## Welcome to Browser Use Development! 9 | 10 | ```bash 11 | git clone https://github.com/browser-use/browser-use 12 | cd browser-use 13 | uv sync --all-extras --dev 14 | # or pip install -U git+https://github.com/browser-use/browser-use.git@main 15 | ``` 16 | 17 | ## Configuration 18 | 19 | Set up your environment variables: 20 | 21 | ```bash 22 | # Copy the example environment file 23 | cp .env.example .env 24 | 25 | # set logging level 26 | # BROWSER_USE_LOGGING_LEVEL=debug 27 | ``` 28 | 29 | 30 | ## Helper Scripts 31 | For common development tasks 32 | ```bash 33 | # Complete setup script - installs uv, creates a venv, and installs dependencies 34 | ./bin/setup.sh 35 | 36 | # Run all pre-commit hooks (formatting, linting, type checking) 37 | ./bin/lint.sh 38 | 39 | # Run the core test suite that's executed in CI 40 | ./bin/test.sh 41 | ``` 42 | 43 | 44 | 45 | ## Run examples 46 | 47 | ```bash 48 | uv run examples/simple.py 49 | ``` 50 | -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/docs/favicon.ico 
-------------------------------------------------------------------------------- /docs/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/images/browser-use-banner-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/docs/images/browser-use-banner-dark.png -------------------------------------------------------------------------------- /docs/images/browser-use-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/docs/images/browser-use-banner.png -------------------------------------------------------------------------------- /docs/images/checks-passed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/docs/images/checks-passed.png -------------------------------------------------------------------------------- /docs/images/cloud-banner-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/docs/images/cloud-banner-dark.png -------------------------------------------------------------------------------- /docs/images/cloud-banner-js.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/docs/images/cloud-banner-js.png -------------------------------------------------------------------------------- 
/docs/images/cloud-banner-python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/docs/images/cloud-banner-python.png -------------------------------------------------------------------------------- /docs/images/cloud-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/docs/images/cloud-banner.png -------------------------------------------------------------------------------- /docs/images/laminar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/docs/images/laminar.png -------------------------------------------------------------------------------- /docs/introduction.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction" 3 | description: "Automate browser tasks in plain text. " 4 | icon: "book-open" 5 | --- 6 | 7 | Browser Use Logo 12 | Browser Use Logo 17 | 18 | 19 | 20 | Open-source Python library. 21 | 22 | 28 | Scale up with our cloud. 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/quickstart_llm.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "LLM Quickstart" 3 | description: "" 4 | icon: "brain" 5 | --- 6 | 7 | 8 | 9 | 1. Copy all content [🔗 from here](https://docs.browser-use.com/llms-full.txt) (~32k tokens) 10 | 2. Paste it into your favorite coding agent (Cursor, Claude, ChatGPT ...). 
11 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/examples/__init__.py -------------------------------------------------------------------------------- /examples/api/search/search_url.py: -------------------------------------------------------------------------------- 1 | """ 2 | Search URL API Example 3 | 4 | This example shows how to use the Browser Use API to extract specific 5 | content from a given URL based on your query. 6 | 7 | Usage: 8 | # Copy this function and customize the parameters 9 | result = await search_url("https://example.com", "what to find", depth=2) 10 | """ 11 | 12 | import asyncio 13 | import os 14 | 15 | import aiohttp 16 | from dotenv import load_dotenv 17 | 18 | # Load environment variables 19 | load_dotenv() 20 | 21 | 22 | async def search_url(url: str, query: str, depth: int = 2): 23 | # Validate API key exists 24 | api_key = os.getenv('BROWSER_USE_API_KEY') 25 | if not api_key: 26 | print('❌ Error: BROWSER_USE_API_KEY environment variable is not set.') 27 | print('Please set your API key: export BROWSER_USE_API_KEY="your_api_key_here"') 28 | return None 29 | 30 | payload = {'url': url, 'query': query, 'depth': depth} 31 | 32 | headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'} 33 | 34 | print('Testing Search URL API...') 35 | print(f'URL: {url}') 36 | print(f'Query: {query}') 37 | print(f'Depth: {depth}') 38 | print('-' * 50) 39 | 40 | try: 41 | async with aiohttp.ClientSession() as session: 42 | async with session.post( 43 | 'https://api.browser-use.com/api/v1/search-url', 44 | json=payload, 45 | headers=headers, 46 | timeout=aiohttp.ClientTimeout(total=300), 47 | ) as response: 48 | if response.status == 200: 49 | result = await response.json() 50 | print('✅ 
Success!') 51 | print(f'URL processed: {result.get("url", "N/A")}') 52 | content = result.get('content', '') 53 | print(f'Content: {content}') 54 | return result 55 | else: 56 | error_text = await response.text() 57 | print(f'❌ Error {response.status}: {error_text}') 58 | return None 59 | except Exception as e: 60 | print(f'❌ Exception: {str(e)}') 61 | return None 62 | 63 | 64 | if __name__ == '__main__': 65 | # Example 1: Extract pricing info 66 | asyncio.run(search_url('https://browser-use.com/#pricing', 'Find pricing information for Browser Use')) 67 | 68 | # Example 2: News article analysis 69 | # asyncio.run(search_url("https://techcrunch.com", "latest startup funding news", depth=3)) 70 | 71 | # Example 3: Product research 72 | # asyncio.run(search_url("https://github.com/browser-use/browser-use", "installation instructions", depth=2)) 73 | -------------------------------------------------------------------------------- /examples/api/search/simple_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple Search API Example 3 | 4 | This example shows how to use the Browser Use API to search and extract 5 | content from multiple websites based on a query. 
6 | 7 | Usage: 8 | # Copy this function and customize the parameters 9 | result = await simple_search("your search query", max_websites=5, depth=2) 10 | """ 11 | 12 | import asyncio 13 | import os 14 | 15 | import aiohttp 16 | from dotenv import load_dotenv 17 | 18 | # Load environment variables 19 | load_dotenv() 20 | 21 | 22 | async def simple_search(query: str, max_websites: int = 5, depth: int = 2): 23 | # Validate API key exists 24 | api_key = os.getenv('BROWSER_USE_API_KEY') 25 | if not api_key: 26 | print('❌ Error: BROWSER_USE_API_KEY environment variable is not set.') 27 | print('Please set your API key: export BROWSER_USE_API_KEY="your_api_key_here"') 28 | return None 29 | 30 | payload = {'query': query, 'max_websites': max_websites, 'depth': depth} 31 | 32 | headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'} 33 | 34 | print('Testing Simple Search API...') 35 | print(f'Query: {query}') 36 | print(f'Max websites: {max_websites}') 37 | print(f'Depth: {depth}') 38 | print('-' * 50) 39 | 40 | try: 41 | async with aiohttp.ClientSession() as session: 42 | async with session.post( 43 | 'https://api.browser-use.com/api/v1/simple-search', 44 | json=payload, 45 | headers=headers, 46 | timeout=aiohttp.ClientTimeout(total=300), 47 | ) as response: 48 | if response.status == 200: 49 | result = await response.json() 50 | print('✅ Success!') 51 | print(f'Results: {len(result.get("results", []))} websites processed') 52 | for i, item in enumerate(result.get('results', [])[:2], 1): 53 | print(f'\n{i}. 
{item.get("url", "N/A")}') 54 | content = item.get('content', '') 55 | print(f' Content: {content}') 56 | return result 57 | else: 58 | error_text = await response.text() 59 | print(f'❌ Error {response.status}: {error_text}') 60 | return None 61 | except Exception as e: 62 | print(f'❌ Exception: {str(e)}') 63 | return None 64 | 65 | 66 | if __name__ == '__main__': 67 | # Example 1: Basic search 68 | asyncio.run(simple_search('latest AI news')) 69 | 70 | # Example 2: Custom parameters 71 | # asyncio.run(simple_search("python web scraping", max_websites=3, depth=3)) 72 | 73 | # Example 3: Research query 74 | # asyncio.run(simple_search("climate change solutions 2024", max_websites=7, depth=2)) 75 | -------------------------------------------------------------------------------- /examples/browser/parallel_browser.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from browser_use import Agent, Browser, ChatOpenAI 4 | 5 | # NOTE: This is still experimental, and agents might conflict each other. 
6 | 7 | 8 | async def main(): 9 | # Create 3 separate browser instances 10 | browsers = [ 11 | Browser( 12 | user_data_dir=f'./temp-profile-{i}', 13 | headless=False, 14 | ) 15 | for i in range(3) 16 | ] 17 | 18 | # Create 3 agents with different tasks 19 | agents = [ 20 | Agent( 21 | task='Search for "browser automation" on Google', 22 | browser=browsers[0], 23 | llm=ChatOpenAI(model='gpt-4.1-mini'), 24 | ), 25 | Agent( 26 | task='Search for "AI agents" on DuckDuckGo', 27 | browser=browsers[1], 28 | llm=ChatOpenAI(model='gpt-4.1-mini'), 29 | ), 30 | Agent( 31 | task='Visit Wikipedia and search for "web scraping"', 32 | browser=browsers[2], 33 | llm=ChatOpenAI(model='gpt-4.1-mini'), 34 | ), 35 | ] 36 | 37 | # Run all agents in parallel 38 | tasks = [agent.run() for agent in agents] 39 | results = await asyncio.gather(*tasks, return_exceptions=True) 40 | 41 | print('🎉 All agents completed!') 42 | 43 | 44 | if __name__ == '__main__': 45 | asyncio.run(main()) 46 | -------------------------------------------------------------------------------- /examples/browser/real_browser.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent, Browser, ChatOpenAI 12 | 13 | # Connect to your existing Chrome browser 14 | browser = Browser( 15 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 16 | user_data_dir='~/Library/Application Support/Google/Chrome', 17 | profile_directory='Default', 18 | ) 19 | 20 | 21 | async def main(): 22 | agent = Agent( 23 | llm=ChatOpenAI(model='gpt-4.1-mini'), 24 | # Google blocks this approach, so we use a different search engine 25 | task='Visit https://duckduckgo.com and search for "browser-use founders"', 26 | browser=browser, 27 | ) 28 | 
await agent.run() 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /examples/browser/using_cdp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple demonstration of the CDP feature. 3 | 4 | To test this locally, follow these steps: 5 | 1. Find the chrome executable file. 6 | 2. On mac by default, the chrome is in `/Applications/Google Chrome.app/Contents/MacOS/Google Chrome` 7 | 3. Add the following argument to the shortcut: 8 | `--remote-debugging-port=9222` 9 | 4. Open a web browser and navigate to `http://localhost:9222/json/version` to verify that the Remote Debugging Protocol (CDP) is running. 10 | 5. Launch this example. 11 | 12 | Full command Mac: 13 | "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --remote-debugging-port=9222 14 | 15 | @dev You need to set the `OPENAI_API_KEY` environment variable before proceeding. 
16 | """ 17 | 18 | import asyncio 19 | import os 20 | import sys 21 | 22 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 23 | 24 | from dotenv import load_dotenv 25 | 26 | load_dotenv() 27 | 28 | from browser_use import Agent, Tools 29 | from browser_use.browser import BrowserProfile, BrowserSession 30 | from browser_use.llm import ChatOpenAI 31 | 32 | browser_session = BrowserSession(browser_profile=BrowserProfile(cdp_url='http://localhost:9222', is_local=True)) 33 | tools = Tools() 34 | 35 | 36 | async def main(): 37 | agent = Agent( 38 | task='Visit https://duckduckgo.com and search for "browser-use founders"', 39 | lllm=ChatOpenAI(model='gpt-4.1-mini'), 40 | tools=tools, 41 | browser_session=browser_session, 42 | ) 43 | 44 | await agent.run() 45 | await browser_session.kill() 46 | 47 | input('Press Enter to close...') 48 | 49 | 50 | if __name__ == '__main__': 51 | asyncio.run(main()) 52 | -------------------------------------------------------------------------------- /examples/cloud/env.example: -------------------------------------------------------------------------------- 1 | # Browser Use Cloud API Configuration 2 | # Copy this file to .env and fill in your values 3 | 4 | # Required: Your Browser Use Cloud API key 5 | # Get it from: https://cloud.browser-use.com/billing 6 | BROWSER_USE_API_KEY=your_api_key_here 7 | 8 | # Optional: Custom API base URL (for enterprise installations) 9 | # BROWSER_USE_BASE_URL=https://api.browser-use.com/api/v1 10 | 11 | # Optional: Default model preference 12 | # BROWSER_USE_DEFAULT_MODEL=gemini-2.5-flash 13 | 14 | # Optional: Cost limits 15 | # BROWSER_USE_MAX_COST_PER_TASK=5.0 16 | 17 | # Optional: Request timeout (seconds) 18 | # BROWSER_USE_TIMEOUT=30 19 | 20 | # Optional: Logging configuration 21 | # LOG_LEVEL=INFO 22 | -------------------------------------------------------------------------------- /examples/custom-functions/2fa.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | import pyotp # type: ignore 13 | 14 | from browser_use import ActionResult, Agent, ChatOpenAI, Tools 15 | 16 | # Set up logging 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | tools = Tools() 22 | 23 | 24 | @tools.registry.action('Get 2FA code from when OTP is required') 25 | async def get_otp_2fa() -> ActionResult: 26 | """ 27 | Custom action to retrieve 2FA/MFA code from OTP secret key using pyotp. 28 | The OTP secret key should be set in the environment variable OTP_SECRET_KEY. 29 | """ 30 | secret_key = os.environ.get('OTP_SECRET_KEY') 31 | if not secret_key: 32 | raise ValueError('OTP_SECRET_KEY environment variable is not set') 33 | 34 | totp = pyotp.TOTP(secret_key, digits=6) 35 | code = totp.now() 36 | return ActionResult(extracted_content=code) 37 | 38 | 39 | async def main(): 40 | # Example task using the 1Password 2FA action 41 | task = """ 42 | Steps: 43 | 1. Go to https://authenticationtest.com/totpChallenge/ and try to log in. 44 | 2. If prompted for 2FA code: 45 | 2.1. Use the get_2fa_code action to retrieve the 2FA code. 46 | 2.2. Submit the code provided by the get_2fa_code action. 47 | 48 | Considerations: 49 | - ALWAYS use the get_2fa_code action to retrieve the 2FA code if needed. 50 | - NEVER skip the 2FA step if the page requires it. 51 | - NEVER extract the code from the page. 52 | - NEVER use a code that is not generated by the get_2fa_code action. 53 | - NEVER hallucinate the 2FA code, always use the get_2fa_code action to get it. 54 | 55 | You are completely FORBIDDEN to use any other method to get the 2FA code. 
56 | """ 57 | 58 | model = ChatOpenAI(model='gpt-4.1-mini') 59 | agent = Agent(task=task, llm=model, tools=tools) 60 | 61 | result = await agent.run() 62 | print(f'Task completed with result: {result}') 63 | 64 | 65 | if __name__ == '__main__': 66 | asyncio.run(main()) 67 | -------------------------------------------------------------------------------- /examples/custom-functions/notification.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import ActionResult, Agent, ChatOpenAI, Tools 12 | 13 | tools = Tools() 14 | 15 | 16 | @tools.registry.action('Done with task') 17 | async def done(text: str): 18 | import yagmail # type: ignore 19 | 20 | # To send emails use 21 | # STEP 1: go to https://support.google.com/accounts/answer/185833 22 | # STEP 2: Create an app password (you can't use here your normal gmail password) 23 | # STEP 3: Use the app password in the code below for the password 24 | yag = yagmail.SMTP('your_email@gmail.com', 'your_app_password') 25 | yag.send( 26 | to='recipient@example.com', 27 | subject='Test Email', 28 | contents=f'result\n: {text}', 29 | ) 30 | 31 | return ActionResult(is_done=True, extracted_content='Email sent!') 32 | 33 | 34 | async def main(): 35 | task = 'go to brower-use.com and then done' 36 | model = ChatOpenAI(model='gpt-4.1-mini') 37 | agent = Agent(task=task, llm=model, tools=tools) 38 | 39 | await agent.run() 40 | 41 | 42 | if __name__ == '__main__': 43 | asyncio.run(main()) 44 | -------------------------------------------------------------------------------- /examples/custom-functions/onepassword_2fa.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import sys 5 | 6 
| sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from onepassword.client import Client # type: ignore # pip install onepassword-sdk 13 | 14 | from browser_use import ActionResult, Agent, ChatOpenAI, Tools 15 | 16 | # Set up logging 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | OP_SERVICE_ACCOUNT_TOKEN = os.getenv('OP_SERVICE_ACCOUNT_TOKEN') 21 | OP_ITEM_ID = os.getenv('OP_ITEM_ID') # Go to 1Password, right click on the item, click "Copy Secret Reference" 22 | 23 | 24 | tools = Tools() 25 | 26 | 27 | @tools.registry.action('Get 2FA code from 1Password for Google Account', domains=['*.google.com', 'google.com']) 28 | async def get_1password_2fa() -> ActionResult: 29 | """ 30 | Custom action to retrieve 2FA/MFA code from 1Password using onepassword.client SDK. 31 | """ 32 | client = await Client.authenticate( 33 | # setup instructions: https://github.com/1Password/onepassword-sdk-python/#-get-started 34 | auth=OP_SERVICE_ACCOUNT_TOKEN, 35 | integration_name='Browser-Use', 36 | integration_version='v1.0.0', 37 | ) 38 | 39 | mfa_code = await client.secrets.resolve(f'op://Private/{OP_ITEM_ID}/One-time passcode') 40 | 41 | return ActionResult(extracted_content=mfa_code) 42 | 43 | 44 | async def main(): 45 | # Example task using the 1Password 2FA action 46 | task = 'Go to account.google.com, enter username and password, then if prompted for 2FA code, get 2FA code from 1Password for and enter it' 47 | 48 | model = ChatOpenAI(model='gpt-4.1-mini') 49 | agent = Agent(task=task, llm=model, tools=tools) 50 | 51 | result = await agent.run() 52 | print(f'Task completed with result: {result}') 53 | 54 | 55 | if __name__ == '__main__': 56 | asyncio.run(main()) 57 | -------------------------------------------------------------------------------- /examples/custom-functions/save_to_file_hugging_face.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from pydantic import BaseModel 12 | 13 | from browser_use import ChatOpenAI 14 | from browser_use.agent.service import Agent 15 | from browser_use.tools.service import Tools 16 | 17 | # Initialize tools first 18 | tools = Tools() 19 | 20 | 21 | class Model(BaseModel): 22 | title: str 23 | url: str 24 | likes: int 25 | license: str 26 | 27 | 28 | class Models(BaseModel): 29 | models: list[Model] 30 | 31 | 32 | @tools.action('Save models', param_model=Models) 33 | def save_models(params: Models): 34 | with open('models.txt', 'a') as f: 35 | for model in params.models: 36 | f.write(f'{model.title} ({model.url}): {model.likes} likes, {model.license}\n') 37 | 38 | 39 | # video: https://preview.screen.studio/share/EtOhIk0P 40 | async def main(): 41 | task = 'Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file.' 42 | 43 | model = ChatOpenAI(model='gpt-4.1-mini') 44 | agent = Agent(task=task, llm=model, tools=tools) 45 | 46 | await agent.run() 47 | 48 | 49 | if __name__ == '__main__': 50 | asyncio.run(main()) 51 | -------------------------------------------------------------------------------- /examples/features/custom_output.py: -------------------------------------------------------------------------------- 1 | """ 2 | Show how to use custom outputs. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from pydantic import BaseModel 18 | 19 | from browser_use import Agent, ChatOpenAI 20 | 21 | 22 | class Post(BaseModel): 23 | post_title: str 24 | post_url: str 25 | num_comments: int 26 | hours_since_post: int 27 | 28 | 29 | class Posts(BaseModel): 30 | posts: list[Post] 31 | 32 | 33 | async def main(): 34 | task = 'Go to hackernews show hn and give me the first 5 posts' 35 | model = ChatOpenAI(model='gpt-4.1-mini') 36 | agent = Agent(task=task, llm=model, output_model_schema=Posts) 37 | 38 | history = await agent.run() 39 | 40 | result = history.final_result() 41 | if result: 42 | parsed: Posts = Posts.model_validate_json(result) 43 | 44 | for post in parsed.posts: 45 | print('\n--------------------------------') 46 | print(f'Title: {post.post_title}') 47 | print(f'URL: {post.post_url}') 48 | print(f'Comments: {post.num_comments}') 49 | print(f'Hours since post: {post.hours_since_post}') 50 | else: 51 | print('No result') 52 | 53 | 54 | if __name__ == '__main__': 55 | asyncio.run(main()) 56 | -------------------------------------------------------------------------------- /examples/features/custom_system_prompt.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import sys 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | 13 | from browser_use import Agent, ChatOpenAI 14 | 15 | extend_system_message = ( 16 | 'REMEMBER the most important RULE: ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!!' 
17 | ) 18 | 19 | # or use override_system_message to completely override the system prompt 20 | 21 | 22 | async def main(): 23 | task = 'do google search to find images of Elon Musk' 24 | model = ChatOpenAI(model='gpt-4.1-mini') 25 | agent = Agent(task=task, llm=model, extend_system_message=extend_system_message) 26 | 27 | print( 28 | json.dumps( 29 | agent.message_manager.system_prompt.model_dump(exclude_unset=True), 30 | indent=4, 31 | ) 32 | ) 33 | 34 | await agent.run() 35 | 36 | 37 | if __name__ == '__main__': 38 | asyncio.run(main()) 39 | -------------------------------------------------------------------------------- /examples/features/download_file.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | from browser_use import Agent, BrowserProfile, BrowserSession, ChatGoogle 13 | 14 | api_key = os.getenv('GOOGLE_API_KEY') 15 | if not api_key: 16 | raise ValueError('GOOGLE_API_KEY is not set') 17 | 18 | llm = ChatGoogle(model='gemini-2.5-flash', api_key=api_key) 19 | 20 | 21 | browser_session = BrowserSession(browser_profile=BrowserProfile(downloads_path='~/Downloads')) 22 | 23 | 24 | async def run_download(): 25 | agent = Agent( 26 | task='Go to "https://file-examples.com/" and download the smallest doc file.', 27 | llm=llm, 28 | browser_session=browser_session, 29 | ) 30 | await agent.run(max_steps=25) 31 | 32 | 33 | if __name__ == '__main__': 34 | asyncio.run(run_download()) 35 | -------------------------------------------------------------------------------- /examples/features/follow_up_tasks.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | from browser_use.browser.profile import BrowserProfile 6 | 7 | 
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from browser_use import Agent 14 | 15 | profile = BrowserProfile(keep_alive=True) 16 | 17 | 18 | task = """Go to reddit.com""" 19 | 20 | 21 | async def main(): 22 | agent = Agent(task=task, browser_profile=profile) 23 | await agent.run(max_steps=1) 24 | 25 | while True: 26 | user_response = input('\n👤 New task or "q" to quit: ') 27 | agent.add_new_task(f'New task: {user_response}') 28 | await agent.run() 29 | 30 | 31 | if __name__ == '__main__': 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /examples/features/initial_actions.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent, ChatOpenAI 12 | 13 | llm = ChatOpenAI(model='gpt-4.1-mini') 14 | 15 | initial_actions = [ 16 | {'go_to_url': {'url': 'https://www.google.com', 'new_tab': True}}, 17 | {'go_to_url': {'url': 'https://en.wikipedia.org/wiki/Randomness', 'new_tab': True}}, 18 | ] 19 | agent = Agent( 20 | task='What theories are displayed on the page?', 21 | initial_actions=initial_actions, 22 | llm=llm, 23 | ) 24 | 25 | 26 | async def main(): 27 | await agent.run(max_steps=10) 28 | 29 | 30 | if __name__ == '__main__': 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /examples/features/multi_tab.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
5 | """ 6 | 7 | import asyncio 8 | import os 9 | import sys 10 | 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 12 | 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv() 16 | 17 | from browser_use import Agent, ChatOpenAI 18 | 19 | # video: https://preview.screen.studio/share/clenCmS6 20 | llm = ChatOpenAI(model='gpt-4.1-mini') 21 | agent = Agent( 22 | task='open 3 tabs with elon musk, sam altman, and steve jobs, then go back to the first and stop', 23 | llm=llm, 24 | ) 25 | 26 | 27 | async def main(): 28 | await agent.run() 29 | 30 | 31 | asyncio.run(main()) 32 | -------------------------------------------------------------------------------- /examples/features/parallel_agents.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import ChatOpenAI 12 | from browser_use.agent.service import Agent 13 | from browser_use.browser import BrowserProfile, BrowserSession 14 | 15 | browser_session = BrowserSession( 16 | browser_profile=BrowserProfile( 17 | keep_alive=True, 18 | headless=False, 19 | record_video_dir='./tmp/recordings', 20 | user_data_dir='~/.config/browseruse/profiles/default', 21 | ) 22 | ) 23 | llm = ChatOpenAI(model='gpt-4.1-mini') 24 | 25 | 26 | # NOTE: This is experimental - you will have multiple agents running in the same browser session 27 | async def main(): 28 | await browser_session.start() 29 | agents = [ 30 | Agent(task=task, llm=llm, browser_session=browser_session) 31 | for task in [ 32 | 'Search Google for weather in Tokyo', 33 | 'Check Reddit front page title', 34 | 'Look up Bitcoin price on Coinbase', 35 | # 'Find NASA image of the day', 36 | # 'Check top story on CNN', 37 | # 'Search latest SpaceX launch date', 38 | # 
'Look up population of Paris', 39 | # 'Find current time in Sydney', 40 | # 'Check who won last Super Bowl', 41 | # 'Search trending topics on Twitter', 42 | ] 43 | ] 44 | 45 | print(await asyncio.gather(*[agent.run() for agent in agents])) 46 | await browser_session.kill() 47 | 48 | 49 | if __name__ == '__main__': 50 | asyncio.run(main()) 51 | -------------------------------------------------------------------------------- /examples/features/process_agent_output.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | from pprint import pprint 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from browser_use import Agent, ChatOpenAI 13 | from browser_use.agent.views import AgentHistoryList 14 | from browser_use.browser import BrowserProfile, BrowserSession 15 | from browser_use.browser.profile import ViewportSize 16 | 17 | llm = ChatOpenAI(model='gpt-4.1-mini') 18 | 19 | 20 | async def main(): 21 | browser_session = BrowserSession( 22 | browser_profile=BrowserProfile( 23 | headless=False, 24 | traces_dir='./tmp/result_processing', 25 | window_size=ViewportSize(width=1280, height=1000), 26 | user_data_dir='~/.config/browseruse/profiles/default', 27 | ) 28 | ) 29 | await browser_session.start() 30 | try: 31 | agent = Agent( 32 | task="go to google.com and type 'OpenAI' click search and give me the first url", 33 | llm=llm, 34 | browser_session=browser_session, 35 | ) 36 | history: AgentHistoryList = await agent.run(max_steps=3) 37 | 38 | print('Final Result:') 39 | pprint(history.final_result(), indent=4) 40 | 41 | print('\nErrors:') 42 | pprint(history.errors(), indent=4) 43 | 44 | # e.g. 
xPaths the model clicked on 45 | print('\nModel Outputs:') 46 | pprint(history.model_actions(), indent=4) 47 | 48 | print('\nThoughts:') 49 | pprint(history.model_thoughts(), indent=4) 50 | finally: 51 | await browser_session.stop() 52 | 53 | 54 | if __name__ == '__main__': 55 | asyncio.run(main()) 56 | -------------------------------------------------------------------------------- /examples/features/restrict_urls.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent, ChatOpenAI 12 | from browser_use.browser import BrowserProfile, BrowserSession 13 | 14 | llm = ChatOpenAI(model='gpt-4.1-mini') 15 | task = ( 16 | "go to google.com and search for openai.com and click on the first link then extract content and scroll down - what's there?" 17 | ) 18 | 19 | allowed_domains = ['google.com'] 20 | 21 | browser_session = BrowserSession( 22 | browser_profile=BrowserProfile( 23 | executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 24 | allowed_domains=allowed_domains, 25 | user_data_dir='~/.config/browseruse/profiles/default', 26 | ), 27 | ) 28 | 29 | agent = Agent( 30 | task=task, 31 | llm=llm, 32 | browser_session=browser_session, 33 | ) 34 | 35 | 36 | async def main(): 37 | await agent.run(max_steps=25) 38 | 39 | input('Press Enter to close the browser...') 40 | await browser_session.kill() 41 | 42 | 43 | asyncio.run(main()) 44 | -------------------------------------------------------------------------------- /examples/features/secure.py: -------------------------------------------------------------------------------- 1 | """ 2 | Azure OpenAI example with data privacy and high-scale configuration. 
3 | 4 | Environment Variables Required: 5 | - AZURE_OPENAI_KEY (or AZURE_OPENAI_API_KEY) 6 | - AZURE_OPENAI_ENDPOINT 7 | - AZURE_OPENAI_DEPLOYMENT (optional) 8 | 9 | DATA PRIVACY WITH AZURE OPENAI: 10 | ✅ Good News: No Training on Your Data by Default 11 | 12 | Azure OpenAI Service already protects your data: 13 | ✅ NOT used to train OpenAI models 14 | ✅ NOT shared with other customers 15 | ✅ NOT accessible to OpenAI directly 16 | ✅ NOT used to improve Microsoft/third-party products 17 | ✅ Hosted entirely within Azure (not OpenAI's servers) 18 | 19 | ⚠️ Default Data Retention (30 Days) 20 | - Prompts and completions stored for up to 30 days 21 | - Purpose: Abuse monitoring and compliance 22 | - Access: Microsoft authorized personnel (only if abuse detected) 23 | 24 | 🔒 How to Disable Data Logging Completely 25 | Apply for Microsoft's "Limited Access Program": 26 | 1. Contact Microsoft Azure support 27 | 2. Submit Limited Access Program request 28 | 3. Demonstrate legitimate business need 29 | 4. 
After approval: Zero data logging, immediate deletion, no human review 30 | 31 | For high-scale deployments (500+ agents), consider: 32 | - Multiple deployments across regions 33 | 34 | 35 | How to Verify This Yourself, that there is no data logging: 36 | - Network monitoring: Run with network monitoring tools 37 | - Firewall rules: Block all domains except Azure OpenAI and your target sites 38 | 39 | Contact us if you need help with this: support@browser-use.com 40 | """ 41 | 42 | import asyncio 43 | import os 44 | import sys 45 | 46 | from dotenv import load_dotenv 47 | 48 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 49 | 50 | load_dotenv() 51 | 52 | 53 | os.environ['ANONYMIZED_TELEMETRY'] = 'false' 54 | 55 | 56 | from browser_use import Agent, BrowserProfile, ChatAzureOpenAI 57 | 58 | # Configuration LLM 59 | api_key = os.getenv('AZURE_OPENAI_KEY') 60 | azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') 61 | llm = ChatAzureOpenAI(model='gpt-4.1-mini', api_key=api_key, azure_endpoint=azure_endpoint) 62 | 63 | # Configuration Task 64 | task = 'Find the founders of the sensitive company_name' 65 | 66 | # Configuration Browser (optional) 67 | browser_profile = BrowserProfile(allowed_domains=['*google.com', 'browser-use.com'], enable_default_extensions=False) 68 | 69 | # Sensitive data (optional) - {key: sensitive_information} - we filter out the sensitive_information from any input to the LLM, it will only work with placeholder. 70 | # By default we pass screenshots to the LLM which can contain your information. Set use_vision=False to disable this. 71 | # If you trust your LLM endpoint, you don't need to worry about this. 
72 | sensitive_data: dict[str, str | dict[str, str]] = {'company_name': 'browser-use'} 73 | 74 | 75 | # Create Agent 76 | agent = Agent(task=task, llm=llm, browser_profile=browser_profile, sensitive_data=sensitive_data) 77 | 78 | 79 | async def main(): 80 | await agent.run(max_steps=10) 81 | 82 | 83 | asyncio.run(main()) 84 | -------------------------------------------------------------------------------- /examples/features/sensitive_data.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent, ChatOpenAI 12 | from browser_use.browser import BrowserProfile 13 | 14 | # Initialize the model 15 | llm = ChatOpenAI( 16 | model='gpt-4.1', 17 | temperature=0.0, 18 | ) 19 | # Simple case: the model will see x_name and x_password, but never the actual values. 
20 | # sensitive_data = {'x_name': 'my_x_name', 'x_password': 'my_x_password'} 21 | 22 | # Advanced case: domain-specific credentials with reusable data 23 | # Define a single credential set that can be reused 24 | company_credentials = {'company_username': 'user@example.com', 'company_password': 'securePassword123'} 25 | 26 | # Map the same credentials to multiple domains for secure access control 27 | # Type annotation to satisfy pyright 28 | sensitive_data: dict[str, str | dict[str, str]] = { 29 | 'https://example.com': company_credentials, 30 | 'https://admin.example.com': company_credentials, 31 | 'https://*.example-staging.com': company_credentials, 32 | 'http*://test.example.com': company_credentials, 33 | # You can also add domain-specific credentials 34 | 'https://*.google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'}, 35 | } 36 | # Update task to use one of the credentials above 37 | task = 'Go to google.com and put the login information in the search bar.' 38 | 39 | # Always set allowed_domains when using sensitive_data for security 40 | from browser_use.browser.session import BrowserSession 41 | 42 | browser_session = BrowserSession( 43 | browser_profile=BrowserProfile( 44 | allowed_domains=list(sensitive_data.keys()) 45 | + ['https://*.trusted-partner.com'] # Domain patterns from sensitive_data + additional allowed domains 46 | ) 47 | ) 48 | 49 | agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data, browser_session=browser_session) 50 | 51 | 52 | async def main(): 53 | await agent.run() 54 | 55 | 56 | if __name__ == '__main__': 57 | asyncio.run(main()) 58 | -------------------------------------------------------------------------------- /examples/features/small_model_for_extraction.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | 
from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | from browser_use import Agent, ChatOpenAI 12 | 13 | # This uses a bigger model for the planning 14 | # And a smaller model for the page content extraction 15 | # Think of it like a subagent whose only task is to extract content from the current page 16 | llm = ChatOpenAI(model='gpt-4.1') 17 | small_llm = ChatOpenAI(model='gpt-4.1-mini') 18 | task = 'Find the founders of browser-use in ycombinator, extract all links and open the links one by one' 19 | agent = Agent(task=task, llm=llm, page_extraction_llm=small_llm) 20 | 21 | 22 | async def main(): 23 | await agent.run() 24 | 25 | 26 | if __name__ == '__main__': 27 | asyncio.run(main()) 28 | -------------------------------------------------------------------------------- /examples/file_system/alphabet_earnings.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import pathlib 4 | import shutil 5 | 6 | from dotenv import load_dotenv 7 | 8 | from browser_use import Agent, ChatOpenAI 9 | 10 | load_dotenv() 11 | 12 | SCRIPT_DIR = pathlib.Path(os.path.dirname(os.path.abspath(__file__))) 13 | agent_dir = SCRIPT_DIR / 'alphabet_earnings' 14 | agent_dir.mkdir(exist_ok=True) 15 | 16 | task = """ 17 | Go to https://abc.xyz/assets/cc/27/3ada14014efbadd7a58472f1f3f4/2025q2-alphabet-earnings-release.pdf. 18 | Read the PDF and save 3 interesting data points in "alphabet_earnings.pdf" and share it with me!
19 | """.strip('\n') 20 | 21 | agent = Agent( 22 | task=task, 23 | llm=ChatOpenAI(model='o4-mini'), 24 | file_system_path=str(agent_dir / 'fs'), 25 | flash_mode=True, 26 | ) 27 | 28 | 29 | async def main(): 30 | await agent.run() 31 | input(f'Press Enter to clean the file system at {agent_dir}...') 32 | # clean the file system 33 | shutil.rmtree(str(agent_dir / 'fs')) 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /examples/file_system/excel_sheet.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | from browser_use.llm.openai.chat import ChatOpenAI 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | from browser_use import Agent 14 | 15 | llm = ChatOpenAI(model='o4-mini') 16 | 17 | 18 | task = ( 19 | 'Find current stock price of companies Meta and Amazon. Then, make me a CSV file with 2 columns: company name, stock price.' 
20 | ) 21 | 22 | agent = Agent(task=task, llm=llm) 23 | 24 | 25 | async def main(): 26 | import time 27 | 28 | start_time = time.time() 29 | history = await agent.run() 30 | # token usage 31 | print(history.usage) 32 | end_time = time.time() 33 | print(f'Time taken: {end_time - start_time} seconds') 34 | 35 | 36 | if __name__ == '__main__': 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /examples/file_system/file_system.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import pathlib 4 | import shutil 5 | 6 | from dotenv import load_dotenv 7 | 8 | from browser_use import Agent, ChatOpenAI 9 | 10 | load_dotenv() 11 | 12 | 13 | SCRIPT_DIR = pathlib.Path(os.path.dirname(os.path.abspath(__file__))) 14 | agent_dir = SCRIPT_DIR / 'file_system' 15 | agent_dir.mkdir(exist_ok=True) 16 | conversation_dir = agent_dir / 'conversations' / 'conversation' 17 | print(f'Agent logs directory: {agent_dir}') 18 | 19 | 20 | task = """ 21 | Go to https://mertunsall.github.io/posts/post1.html 22 | Save the title of the article in "data.md" 23 | Then, use append_file to add the first sentence of the article to "data.md" 24 | Then, read the file to see its content and make sure it's correct. 25 | Finally, share the file with me. 26 | 27 | NOTE: DO NOT USE extract_structured_data action - everything is visible in browser state. 
28 | """.strip('\n') 29 | 30 | llm = ChatOpenAI(model='gpt-4.1-mini') 31 | 32 | agent = Agent( 33 | task=task, 34 | llm=llm, 35 | save_conversation_path=str(conversation_dir), 36 | file_system_path=str(agent_dir / 'fs'), 37 | ) 38 | 39 | 40 | async def main(): 41 | agent_history = await agent.run() 42 | print(f'Final result: {agent_history.final_result()}', flush=True) 43 | 44 | input('Press Enter to clean the file system...') 45 | # clean the file system 46 | shutil.rmtree(str(agent_dir / 'fs')) 47 | 48 | 49 | if __name__ == '__main__': 50 | asyncio.run(main()) 51 | -------------------------------------------------------------------------------- /examples/getting_started/01_basic_search.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | # Add the parent directory to the path so we can import browser_use 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from browser_use import Agent, ChatOpenAI 13 | 14 | 15 | async def main(): 16 | llm = ChatOpenAI(model='gpt-4.1-mini') 17 | task = "Search Google for 'what is browser automation' and tell me the top 3 results" 18 | agent = Agent(task=task, llm=llm) 19 | await agent.run() 20 | 21 | 22 | if __name__ == '__main__': 23 | asyncio.run(main()) 24 | -------------------------------------------------------------------------------- /examples/getting_started/02_form_filling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Getting Started Example 2: Form Filling 3 | 4 | This example demonstrates how to: 5 | - Navigate to a website with forms 6 | - Fill out input fields 7 | - Submit forms 8 | - Handle basic form interactions 9 | 10 | This builds on the basic search example by showing more complex interactions. 
11 | """ 12 | 13 | import asyncio 14 | import os 15 | import sys 16 | 17 | # Add the parent directory to the path so we can import browser_use 18 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 19 | 20 | from dotenv import load_dotenv 21 | 22 | load_dotenv() 23 | 24 | from browser_use import Agent, ChatOpenAI 25 | 26 | 27 | async def main(): 28 | # Initialize the model 29 | llm = ChatOpenAI(model='gpt-4.1-mini') 30 | 31 | # Define a form filling task 32 | task = """ 33 | Go to https://httpbin.org/forms/post and fill out the contact form with: 34 | - Customer name: John Doe 35 | - Telephone: 555-123-4567 36 | - Email: john.doe@example.com 37 | - Size: Medium 38 | - Topping: cheese 39 | - Delivery time: now 40 | - Comments: This is a test form submission 41 | 42 | Then submit the form and tell me what response you get. 43 | """ 44 | 45 | # Create and run the agent 46 | agent = Agent(task=task, llm=llm) 47 | await agent.run() 48 | 49 | 50 | if __name__ == '__main__': 51 | asyncio.run(main()) 52 | -------------------------------------------------------------------------------- /examples/getting_started/03_data_extraction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Getting Started Example 3: Data Extraction 3 | 4 | This example demonstrates how to: 5 | - Navigate to a website with structured data 6 | - Extract specific information from the page 7 | - Process and organize the extracted data 8 | - Return structured results 9 | 10 | This builds on previous examples by showing how to get valuable data from websites. 
11 | """ 12 | 13 | import asyncio 14 | import os 15 | import sys 16 | 17 | # Add the parent directory to the path so we can import browser_use 18 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 19 | 20 | from dotenv import load_dotenv 21 | 22 | load_dotenv() 23 | 24 | from browser_use import Agent, ChatOpenAI 25 | 26 | 27 | async def main(): 28 | # Initialize the model 29 | llm = ChatOpenAI(model='gpt-4.1-mini') 30 | 31 | # Define a data extraction task 32 | task = """ 33 | Go to https://quotes.toscrape.com/ and extract the following information: 34 | - The first 5 quotes on the page 35 | - The author of each quote 36 | - The tags associated with each quote 37 | 38 | Present the information in a clear, structured format like: 39 | Quote 1: "[quote text]" - Author: [author name] - Tags: [tag1, tag2, ...] 40 | Quote 2: "[quote text]" - Author: [author name] - Tags: [tag1, tag2, ...] 41 | etc. 42 | """ 43 | 44 | # Create and run the agent 45 | agent = Agent(task=task, llm=llm) 46 | await agent.run() 47 | 48 | 49 | if __name__ == '__main__': 50 | asyncio.run(main()) 51 | -------------------------------------------------------------------------------- /examples/getting_started/04_multi_step_task.py: -------------------------------------------------------------------------------- 1 | """ 2 | Getting Started Example 4: Multi-Step Task 3 | 4 | This example demonstrates how to: 5 | - Perform a complex workflow with multiple steps 6 | - Navigate between different pages 7 | - Combine search, form filling, and data extraction 8 | - Handle a realistic end-to-end scenario 9 | 10 | This is the most advanced getting started example, combining all previous concepts. 
11 | """ 12 | 13 | import asyncio 14 | import os 15 | import sys 16 | 17 | # Add the parent directory to the path so we can import browser_use 18 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 19 | 20 | from dotenv import load_dotenv 21 | 22 | load_dotenv() 23 | 24 | from browser_use import Agent, ChatOpenAI 25 | 26 | 27 | async def main(): 28 | # Initialize the model 29 | llm = ChatOpenAI(model='gpt-4.1-mini') 30 | 31 | # Define a multi-step task 32 | task = """ 33 | I want you to research Python web scraping libraries. Here's what I need: 34 | 35 | 1. First, search Google for "best Python web scraping libraries 2024" 36 | 2. Find a reputable article or blog post about this topic 37 | 3. From that article, extract the top 3 recommended libraries 38 | 4. For each library, visit its official website or GitHub page 39 | 5. Extract key information about each library: 40 | - Name 41 | - Brief description 42 | - Main features or advantages 43 | - GitHub stars (if available) 44 | 45 | Present your findings in a summary format comparing the three libraries. 
46 | """ 47 | 48 | # Create and run the agent 49 | agent = Agent(task=task, llm=llm) 50 | await agent.run() 51 | 52 | 53 | if __name__ == '__main__': 54 | asyncio.run(main()) 55 | -------------------------------------------------------------------------------- /examples/getting_started/05_fast_agent.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | # Add the parent directory to the path so we can import browser_use 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 7 | 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | 13 | from browser_use import Agent, BrowserProfile 14 | 15 | # Speed optimization instructions for the model 16 | SPEED_OPTIMIZATION_PROMPT = """ 17 | Speed optimization instructions: 18 | - Be extremely concise and direct in your responses 19 | - Get to the goal as quickly as possible 20 | - Use multi-action sequences whenever possible to reduce steps 21 | """ 22 | 23 | 24 | async def main(): 25 | # 1. Use fast LLM - Llama 4 on Groq for ultra-fast inference 26 | from browser_use import ChatGroq 27 | 28 | llm = ChatGroq( 29 | model='meta-llama/llama-4-maverick-17b-128e-instruct', 30 | temperature=0.0, 31 | ) 32 | # from browser_use import ChatGoogle 33 | 34 | # llm = ChatGoogle(model='gemini-2.5-flash') 35 | 36 | # 2. Create speed-optimized browser profile 37 | browser_profile = BrowserProfile( 38 | minimum_wait_page_load_time=0.1, 39 | wait_between_actions=0.1, 40 | headless=False, 41 | ) 42 | 43 | # 3. Define a speed-focused task 44 | task = """ 45 | 1. Go to reddit https://www.reddit.com/search/?q=browser+agent&type=communities 46 | 2. Click directly on the first 5 communities to open each in new tabs 47 | 3. Find out what the latest post is about, and switch directly to the next tab 48 | 4. Return the latest post summary for each page 49 | """ 50 | 51 | # 4. 
Create agent with all speed optimizations 52 | agent = Agent( 53 | task=task, 54 | llm=llm, 55 | flash_mode=True, # Disables thinking in the LLM output for maximum speed 56 | browser_profile=browser_profile, 57 | extend_system_message=SPEED_OPTIMIZATION_PROMPT, 58 | ) 59 | 60 | await agent.run() 61 | 62 | 63 | if __name__ == '__main__': 64 | asyncio.run(main()) 65 | -------------------------------------------------------------------------------- /examples/integrations/agentmail/2fa.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | from agentmail import AsyncAgentMail # type: ignore 6 | 7 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | from browser_use import Agent, Browser, models 13 | from examples.integrations.agentmail.email_tools import EmailTools 14 | 15 | TASK = """ 16 | Go to reddit.com, create a new account (use the get_email_address), make up password and all other information, confirm the 2fa with get_latest_email, and like latest post on r/elon subreddit. 
17 | """ 18 | 19 | 20 | async def main(): 21 | # Create email inbox 22 | # Get an API key from https://agentmail.to/ 23 | email_client = AsyncAgentMail() 24 | inbox = await email_client.inboxes.create() 25 | print(f'Your email address is: {inbox.inbox_id}\n\n') 26 | 27 | # Initialize the tools for browser-use agent 28 | tools = EmailTools(email_client=email_client, inbox=inbox) 29 | 30 | # Initialize the LLM for browser-use agent 31 | llm = models.azure_gpt_4_1_mini 32 | 33 | # Set your local browser path 34 | browser = Browser(executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome') 35 | 36 | agent = Agent(task=TASK, tools=tools, llm=llm, browser=browser) 37 | 38 | await agent.run() 39 | 40 | 41 | if __name__ == '__main__': 42 | asyncio.run(main()) 43 | -------------------------------------------------------------------------------- /examples/integrations/slack/slack_example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 5 | 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | 10 | 11 | from browser_use.browser import BrowserProfile 12 | from browser_use.llm import ChatGoogle 13 | from examples.integrations.slack.slack_api import SlackBot, app 14 | 15 | # load credentials from environment variables 16 | bot_token = os.getenv('SLACK_BOT_TOKEN') 17 | if not bot_token: 18 | raise ValueError('Slack bot token not found in .env file.') 19 | 20 | signing_secret = os.getenv('SLACK_SIGNING_SECRET') 21 | if not signing_secret: 22 | raise ValueError('Slack signing secret not found in .env file.') 23 | 24 | api_key = os.getenv('GOOGLE_API_KEY') 25 | if not api_key: 26 | raise ValueError('GOOGLE_API_KEY is not set') 27 | 28 | llm = ChatGoogle(model='gemini-2.0-flash-exp', api_key=api_key) 29 | 30 | slack_bot = SlackBot( 31 | llm=llm, # required; instance of BaseChatModel 32 | 
bot_token=bot_token, # required; Slack bot token 33 | signing_secret=signing_secret, # required; Slack signing secret 34 | ack=True, # optional; whether to acknowledge task receipt with a message, defaults to False 35 | browser_profile=BrowserProfile( 36 | headless=True 37 | ), # optional; useful for changing headless mode or other browser configs, defaults to headless mode 38 | ) 39 | 40 | app.dependency_overrides[SlackBot] = lambda: slack_bot 41 | 42 | if __name__ == '__main__': 43 | import uvicorn 44 | 45 | uvicorn.run(app, host='0.0.0.0', port=3000)  # pass the app object, not an import string: a string makes uvicorn re-import a fresh app without the SlackBot dependency override set above 46 | -------------------------------------------------------------------------------- /examples/mcp/simple_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple example of using MCP client with browser-use. 3 | 4 | This example shows how to connect to an MCP server and use its tools with an agent. 5 | """ 6 | 7 | import asyncio 8 | import os 9 | 10 | from browser_use import Agent, Tools 11 | from browser_use.llm.openai.chat import ChatOpenAI 12 | from browser_use.mcp.client import MCPClient 13 | 14 | 15 | async def main(): 16 | # Initialize tools 17 | tools = Tools() 18 | 19 | # Connect to a filesystem MCP server 20 | # This server provides tools to read/write files in a directory 21 | mcp_client = MCPClient( 22 | server_name='filesystem', command='npx', args=['@modelcontextprotocol/server-filesystem', os.path.expanduser('~/Desktop')] 23 | ) 24 | 25 | # Connect and register MCP tools 26 | await mcp_client.connect() 27 | await mcp_client.register_to_tools(tools) 28 | 29 | # Create agent with MCP-enabled tools 30 | agent = Agent( 31 | task='List all files on the Desktop and read the content of any .txt files you find', 32 | llm=ChatOpenAI(model='gpt-4.1-mini'), 33 | tools=tools, 34 | ) 35 | 36 | # Run the agent - it now has access to filesystem tools 37 | await agent.run() 38 | 39 | # Disconnect when done 40 | await
mcp_client.disconnect() 41 | 42 | 43 | if __name__ == '__main__': 44 | asyncio.run(main()) 45 | -------------------------------------------------------------------------------- /examples/models/aws.py: -------------------------------------------------------------------------------- 1 | """ 2 | AWS Bedrock Examples 3 | 4 | This file demonstrates how to use AWS Bedrock models with browser-use. 5 | We provide two classes: 6 | 1. ChatAnthropicBedrock - Convenience class for Anthropic Claude models 7 | 2. ChatAWSBedrock - General AWS Bedrock client supporting all providers 8 | 9 | Requirements: 10 | - AWS credentials configured via environment variables 11 | - boto3 installed: pip install boto3 12 | - Access to AWS Bedrock models in your region 13 | """ 14 | 15 | import asyncio 16 | 17 | from lmnr import Laminar 18 | 19 | from browser_use import Agent 20 | from browser_use.llm import ChatAnthropicBedrock, ChatAWSBedrock 21 | 22 | Laminar.initialize() 23 | 24 | 25 | async def example_anthropic_bedrock(): 26 | """Example using ChatAnthropicBedrock - convenience class for Claude models.""" 27 | print('🔹 ChatAnthropicBedrock Example') 28 | 29 | # Initialize with Anthropic Claude via AWS Bedrock 30 | llm = ChatAnthropicBedrock( 31 | model='us.anthropic.claude-sonnet-4-20250514-v1:0', 32 | aws_region='us-east-1', 33 | temperature=0.7, 34 | ) 35 | 36 | print(f'Model: {llm.name}') 37 | print(f'Provider: {llm.provider}') 38 | 39 | # Create agent 40 | agent = Agent( 41 | task="Navigate to google.com and search for 'AWS Bedrock pricing'", 42 | llm=llm, 43 | ) 44 | 45 | print("Task: Navigate to google.com and search for 'AWS Bedrock pricing'") 46 | 47 | # Run the agent 48 | result = await agent.run(max_steps=2) 49 | print(f'Result: {result}') 50 | 51 | 52 | async def example_aws_bedrock(): 53 | """Example using ChatAWSBedrock - general client for any Bedrock model.""" 54 | print('\n🔹 ChatAWSBedrock Example') 55 | 56 | # Initialize with any AWS Bedrock model (using Meta Llama as 
"""
Simple try of the agent.

@dev You need to add AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT to your environment variables.
"""

import asyncio
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from dotenv import load_dotenv

load_dotenv()


from browser_use import Agent
from browser_use.llm import ChatAzureOpenAI

# Make sure your deployment exists, double check the region and model name
api_key = os.getenv('AZURE_OPENAI_KEY')
azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')

# Fail fast with a clear message instead of an opaque auth error at run time
# (consistent with the other model examples, e.g. gemini.py and novita.py).
if not api_key or not azure_endpoint:
	raise ValueError('AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT must be set')

llm = ChatAzureOpenAI(
	model='gpt-4.1-mini',
	api_key=api_key,
	azure_endpoint=azure_endpoint,
)

TASK = """
Go to google.com/travel/flights and find the cheapest flight from New York to Paris on 2025-10-15
"""

agent = Agent(
	task=TASK,
	llm=llm,
)


async def main():
	await agent.run(max_steps=10)


asyncio.run(main())
4 | """ 5 | 6 | import asyncio 7 | import os 8 | import sys 9 | 10 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 11 | 12 | from dotenv import load_dotenv 13 | from lmnr import Laminar 14 | 15 | load_dotenv() 16 | Laminar.initialize() 17 | 18 | from browser_use import Agent 19 | from browser_use.llm import ChatAnthropic 20 | 21 | llm = ChatAnthropic(model='claude-sonnet-4-0', temperature=0.0) 22 | 23 | agent = Agent( 24 | task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result', 25 | llm=llm, 26 | ) 27 | 28 | 29 | async def main(): 30 | await agent.run(max_steps=10) 31 | 32 | 33 | asyncio.run(main()) 34 | -------------------------------------------------------------------------------- /examples/models/deepseek-chat.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | 4 | from browser_use import Agent 5 | from browser_use.llm import ChatDeepSeek 6 | 7 | # Add your custom instructions 8 | extend_system_message = """ 9 | Remember the most important rules: 10 | 1. When performing a search task, open https://www.google.com/ first for search. 11 | 2. Final output. 
12 | """ 13 | deepseek_api_key = os.getenv('DEEPSEEK_API_KEY') 14 | if deepseek_api_key is None: 15 | print('Make sure you have DEEPSEEK_API_KEY:') 16 | print('export DEEPSEEK_API_KEY=your_key') 17 | exit(0) 18 | 19 | 20 | async def main(): 21 | llm = ChatDeepSeek( 22 | base_url='https://api.deepseek.com/v1', 23 | model='deepseek-chat', 24 | api_key=deepseek_api_key, 25 | ) 26 | 27 | agent = Agent( 28 | task='What should we pay attention to in the recent new rules on tariffs in China-US trade?', 29 | llm=llm, 30 | use_vision=False, 31 | extend_system_message=extend_system_message, 32 | ) 33 | await agent.run() 34 | 35 | 36 | asyncio.run(main()) 37 | -------------------------------------------------------------------------------- /examples/models/gemini.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | 7 | from dotenv import load_dotenv 8 | from lmnr import Laminar 9 | 10 | load_dotenv() 11 | 12 | 13 | from browser_use import Agent, ChatGoogle 14 | 15 | Laminar.initialize() 16 | 17 | api_key = os.getenv('GOOGLE_API_KEY') 18 | if not api_key: 19 | raise ValueError('GOOGLE_API_KEY is not set') 20 | 21 | llm = ChatGoogle(model='gemini-2.5-flash', api_key=api_key) 22 | 23 | 24 | async def run_search(): 25 | agent = Agent( 26 | task='Go to google.com/travel/flights and find the cheapest flight from New York to Paris on 2025-07-15', 27 | llm=llm, 28 | ) 29 | 30 | await agent.run(max_steps=25) 31 | 32 | 33 | if __name__ == '__main__': 34 | asyncio.run(run_search()) 35 | -------------------------------------------------------------------------------- /examples/models/gpt-4.1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 
5 | """ 6 | 7 | import asyncio 8 | 9 | from dotenv import load_dotenv 10 | from lmnr import Laminar 11 | 12 | from browser_use import Agent, ChatOpenAI 13 | 14 | load_dotenv() 15 | 16 | 17 | Laminar.initialize() 18 | 19 | # All the models are type safe from OpenAI in case you need a list of supported models 20 | llm = ChatOpenAI(model='gpt-4.1-mini') 21 | agent = Agent( 22 | task='Go to amazon.com, click on the first link, and give me the title of the page', 23 | llm=llm, 24 | ) 25 | 26 | 27 | async def main(): 28 | await agent.run(max_steps=10) 29 | input('Press Enter to continue...') 30 | 31 | 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /examples/models/gpt-5-mini.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple try of the agent. 3 | 4 | @dev You need to add OPENAI_API_KEY to your environment variables. 5 | """ 6 | 7 | import asyncio 8 | 9 | from dotenv import load_dotenv 10 | from lmnr import Laminar 11 | 12 | from browser_use import Agent, ChatOpenAI 13 | 14 | load_dotenv() 15 | 16 | 17 | Laminar.initialize() 18 | 19 | # All the models are type safe from OpenAI in case you need a list of supported models 20 | llm = ChatOpenAI(model='gpt-5-mini') 21 | agent = Agent( 22 | llm=llm, 23 | task='Find out which one is cooler: the monkey park or a dolphin tour in Tenerife?', 24 | ) 25 | 26 | 27 | async def main(): 28 | await agent.run(max_steps=20) 29 | input('Press Enter to continue...') 30 | 31 | 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /examples/models/langchain/README.md: -------------------------------------------------------------------------------- 1 | # Langchain Models (legacy) 2 | 3 | This directory contains example of how to still use Langchain models with the new Browser Use chat models. 
async def main():
	"""Basic example using ChatLangchain with OpenAI through LangChain."""
	# Build the underlying LangChain chat model first...
	langchain_model = ChatOpenAI(
		model='gpt-4.1-mini',
		temperature=0.1,
	)

	# ...then adapt it to the browser-use chat-model interface.
	llm = ChatLangchain(chat=langchain_model)

	task = "Go to google.com and search for 'browser automation with Python'"

	agent = Agent(
		task=task,
		llm=llm,
	)

	print(f'🚀 Starting task: {task}')
	print(f'🤖 Using model: {llm.name} (provider: {llm.provider})')

	history = await agent.run()

	print(f'✅ Task completed! Steps taken: {len(history.history)}')

	# Show the final result when the agent produced one.
	if history.final_result():
		print(f'📋 Final result: {history.final_result()}')

	return history
async def run_search():
	"""Run a Reddit search task against the Novita OpenAI-compatible endpoint."""
	agent = Agent(
		# NOTE: the numbered steps are adjacent string literals; each needs a
		# trailing space or the prompt runs together ("search bar3. Click...").
		task=(
			'1. Go to https://www.reddit.com/r/LocalLLaMA '
			"2. Search for 'browser use' in the search bar "
			'3. Click on first result '
			'4. Return the first comment'
		),
		llm=ChatOpenAI(
			base_url='https://api.novita.ai/v3/openai',
			model='deepseek/deepseek-v3-0324',
			api_key=api_key,
		),
		use_vision=False,
	)

	await agent.run()
5 | """ 6 | 7 | import asyncio 8 | import os 9 | 10 | from dotenv import load_dotenv 11 | from lmnr import Laminar 12 | 13 | from browser_use import Agent, ChatOpenAI 14 | 15 | load_dotenv() 16 | 17 | 18 | Laminar.initialize() 19 | 20 | # All the models are type safe from OpenAI in case you need a list of supported models 21 | llm = ChatOpenAI( 22 | model='x-ai/grok-4', 23 | base_url='https://openrouter.ai/api/v1', 24 | api_key=os.getenv('OPENROUTER_API_KEY'), 25 | ) 26 | agent = Agent( 27 | task='Go to example.com, click on the first link, and give me the title of the page', 28 | llm=llm, 29 | ) 30 | 31 | 32 | async def main(): 33 | await agent.run(max_steps=10) 34 | input('Press Enter to continue...') 35 | 36 | 37 | asyncio.run(main()) 38 | -------------------------------------------------------------------------------- /examples/observability/openLLMetry.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | 4 | from dotenv import load_dotenv 5 | 6 | # test if traceloop is installed 7 | try: 8 | from traceloop.sdk import Traceloop # type: ignore 9 | except ImportError: 10 | print('Traceloop is not installed') 11 | exit(1) 12 | 13 | from browser_use import Agent 14 | 15 | load_dotenv() 16 | api_key = os.getenv('TRACELOOP_API_KEY') 17 | Traceloop.init(api_key=api_key, disable_batch=True) 18 | 19 | 20 | async def main(): 21 | await Agent('Find the founders of browser-use').run() 22 | 23 | 24 | if __name__ == '__main__': 25 | asyncio.run(main()) 26 | -------------------------------------------------------------------------------- /examples/simple.py: -------------------------------------------------------------------------------- 1 | from browser_use import Agent, ChatOpenAI 2 | 3 | agent = Agent( 4 | task='Find founders of browser-use', 5 | llm=ChatOpenAI(model='gpt-4.1-mini'), 6 | ) 7 | 8 | agent.run_sync() 9 | -------------------------------------------------------------------------------- 
/examples/ui/README.md: -------------------------------------------------------------------------------- 1 | # **User Interfaces of Browser-Use** 2 | 3 | | **File Name** | **User Interface** | **Description** | **Example Usage** | 4 | |------------------------|-------------------|-------------------------------------------|-------------------------------------------| 5 | | `command_line.py` | **Terminal** | Parses arguments for command-line execution. | `python command_line.py` | 6 | | `gradio_demo.py` | **Gradio** | Provides a Gradio-based interactive UI. | `python gradio_demo.py` | 7 | | `streamlit_demo.py` | **Streamlit** | Runs a Streamlit-based web interface. | `python -m streamlit run streamlit_demo.py` | 8 | -------------------------------------------------------------------------------- /examples/ui/command_line.py: -------------------------------------------------------------------------------- 1 | """ 2 | To Use It: 3 | 4 | Example 1: Using OpenAI (default), with default task: 'go to reddit and search for posts about browser-use' 5 | python command_line.py 6 | 7 | Example 2: Using OpenAI with a Custom Query 8 | python command_line.py --query "go to google and search for browser-use" 9 | 10 | Example 3: Using Anthropic's Claude Model with a Custom Query 11 | python command_line.py --query "find latest Python tutorials on Medium" --provider anthropic 12 | 13 | """ 14 | 15 | import argparse 16 | import asyncio 17 | import os 18 | import sys 19 | 20 | # Ensure local repository (browser_use) is accessible 21 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 22 | 23 | from dotenv import load_dotenv 24 | 25 | load_dotenv() 26 | 27 | from browser_use import Agent 28 | from browser_use.browser import BrowserSession 29 | from browser_use.tools.service import Tools 30 | 31 | 32 | def get_llm(provider: str): 33 | if provider == 'anthropic': 34 | from browser_use.llm import ChatAnthropic 35 | 36 | api_key = 
os.getenv('ANTHROPIC_API_KEY') 37 | if not api_key: 38 | raise ValueError('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.') 39 | 40 | return ChatAnthropic(model='claude-3-5-sonnet-20240620', temperature=0.0) 41 | elif provider == 'openai': 42 | from browser_use import ChatOpenAI 43 | 44 | api_key = os.getenv('OPENAI_API_KEY') 45 | if not api_key: 46 | raise ValueError('Error: OPENAI_API_KEY is not set. Please provide a valid API key.') 47 | 48 | return ChatOpenAI(model='gpt-4.1', temperature=0.0) 49 | 50 | else: 51 | raise ValueError(f'Unsupported provider: {provider}') 52 | 53 | 54 | def parse_arguments(): 55 | """Parse command-line arguments.""" 56 | parser = argparse.ArgumentParser(description='Automate browser tasks using an LLM agent.') 57 | parser.add_argument( 58 | '--query', type=str, help='The query to process', default='go to reddit and search for posts about browser-use' 59 | ) 60 | parser.add_argument( 61 | '--provider', 62 | type=str, 63 | choices=['openai', 'anthropic'], 64 | default='openai', 65 | help='The model provider to use (default: openai)', 66 | ) 67 | return parser.parse_args() 68 | 69 | 70 | def initialize_agent(query: str, provider: str): 71 | """Initialize the browser agent with the given query and provider.""" 72 | llm = get_llm(provider) 73 | tools = Tools() 74 | browser_session = BrowserSession() 75 | 76 | return Agent( 77 | task=query, 78 | llm=llm, 79 | tools=tools, 80 | browser_session=browser_session, 81 | use_vision=True, 82 | max_actions_per_step=1, 83 | ), browser_session 84 | 85 | 86 | async def main(): 87 | """Main async function to run the agent.""" 88 | args = parse_arguments() 89 | agent, browser_session = initialize_agent(args.query, args.provider) 90 | 91 | await agent.run(max_steps=25) 92 | 93 | input('Press Enter to close the browser...') 94 | await browser_session.kill() 95 | 96 | 97 | if __name__ == '__main__': 98 | asyncio.run(main()) 99 | 
def parse_agent_history(history_str: str) -> None:
	"""Pretty-print the extracted content of each ActionResult found in *history_str*."""
	console = Console()

	# Every ActionResult entry opens a new section; text before the first entry is skipped.
	entries = history_str.split('ActionResult(')

	for step, entry in enumerate(entries[1:], 1):
		extracted = ''
		if 'extracted_content=' in entry:
			extracted = entry.split('extracted_content=')[1].split(',')[0].strip("'")

		if not extracted:
			continue

		title = Text(f'Step {step}', style='bold blue')
		console.print(Panel(extracted, title=title, border_style='blue'))
		console.print()

	return None
Agent( 71 | task=task, 72 | llm=ChatOpenAI(model='gpt-4.1-mini'), 73 | ) 74 | result = await agent.run() 75 | # TODO: The result could be parsed better 76 | return str(result) 77 | except Exception as e: 78 | return f'Error: {str(e)}' 79 | 80 | 81 | def create_ui(): 82 | with gr.Blocks(title='Browser Use GUI') as interface: 83 | gr.Markdown('# Browser Use Task Automation') 84 | 85 | with gr.Row(): 86 | with gr.Column(): 87 | api_key = gr.Textbox(label='OpenAI API Key', placeholder='sk-...', type='password') 88 | task = gr.Textbox( 89 | label='Task Description', 90 | placeholder='E.g., Find flights from New York to London for next week', 91 | lines=3, 92 | ) 93 | model = gr.Dropdown(choices=['gpt-4', 'gpt-3.5-turbo'], label='Model', value='gpt-4') 94 | headless = gr.Checkbox(label='Run Headless', value=True) 95 | submit_btn = gr.Button('Run Task') 96 | 97 | with gr.Column(): 98 | output = gr.Textbox(label='Output', lines=10, interactive=False) 99 | 100 | submit_btn.click( 101 | fn=lambda *args: asyncio.run(run_browser_task(*args)), 102 | inputs=[task, api_key, model, headless], 103 | outputs=output, 104 | ) 105 | 106 | return interface 107 | 108 | 109 | if __name__ == '__main__': 110 | demo = create_ui() 111 | demo.launch() 112 | -------------------------------------------------------------------------------- /examples/ui/streamlit_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | To use it, you'll need to install streamlit, and run with: 3 | 4 | python -m streamlit run streamlit_demo.py 5 | 6 | """ 7 | 8 | import asyncio 9 | import os 10 | import sys 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 13 | 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | import streamlit as st # type: ignore 19 | 20 | from browser_use import Agent 21 | from browser_use.browser import BrowserSession 22 | from browser_use.tools.service import Tools 23 | 24 | if 
# Function to get the LLM based on provider
def get_llm(provider: str):
	"""Return a chat model for the given provider, or stop the Streamlit app.

	Reads the provider's API key from the environment; if it is missing,
	an error is shown in the UI and st.stop() halts script execution, so
	the function never returns None to a live caller in practice.
	"""
	if provider == 'anthropic':
		from browser_use.llm import ChatAnthropic

		api_key = os.getenv('ANTHROPIC_API_KEY')
		if not api_key:
			st.error('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.')
			st.stop()

		return ChatAnthropic(model='claude-3-5-sonnet-20240620', temperature=0.0)
	elif provider == 'openai':
		from browser_use import ChatOpenAI

		api_key = os.getenv('OPENAI_API_KEY')
		if not api_key:
			st.error('Error: OPENAI_API_KEY is not set. Please provide a valid API key.')
			st.stop()

		return ChatOpenAI(model='gpt-4.1', temperature=0.0)
	else:
		st.error(f'Unsupported provider: {provider}')
		st.stop()
		return None  # Never reached, but helps with type checking
🎉') 84 | 85 | asyncio.run(run_agent()) 86 | 87 | st.button('Close Browser', on_click=lambda: asyncio.run(browser_session.kill())) 88 | -------------------------------------------------------------------------------- /examples/use-cases/captcha.py: -------------------------------------------------------------------------------- 1 | """ 2 | Goal: Automates CAPTCHA solving on a demo website. 3 | 4 | 5 | Simple try of the agent. 6 | @dev You need to add OPENAI_API_KEY to your environment variables. 7 | NOTE: captchas are hard. For this example it works. But e.g. for iframes it does not. 8 | for this example it helps to zoom in. 9 | """ 10 | 11 | import asyncio 12 | import os 13 | import sys 14 | 15 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 16 | 17 | from dotenv import load_dotenv 18 | 19 | load_dotenv() 20 | 21 | from browser_use import Agent, ChatOpenAI 22 | 23 | 24 | async def main(): 25 | llm = ChatOpenAI(model='gpt-4.1-mini') 26 | agent = Agent( 27 | task='go to https://captcha.com/demos/features/captcha-demo.aspx and solve the captcha', 28 | llm=llm, 29 | ) 30 | await agent.run() 31 | input('Press Enter to exit') 32 | 33 | 34 | if __name__ == '__main__': 35 | asyncio.run(main()) 36 | -------------------------------------------------------------------------------- /examples/use-cases/check_appointment.py: -------------------------------------------------------------------------------- 1 | # Goal: Checks for available visa appointment slots on the Greece MFA website. 
@tools.action('Go to the webpage', param_model=WebpageInfo)
def go_to_webpage(webpage_info: WebpageInfo):
	"""Return the appointment-page URL for the agent to navigate to.

	The link comes from the WebpageInfo param model, whose default points at
	the Greece MFA reservation page, so the agent receives it as a tool result
	instead of having to reproduce the URL itself.
	"""
	return webpage_info.link
async def main():
	"""Ask the agent to open a government PDF and report what is on page 3."""
	agent = Agent(
		# Typo fix: the prompt previously read "the following UR" instead of "URL".
		task="""
		Objective: Navigate to the following URL, what is on page 3?

		URL: https://docs.house.gov/meetings/GO/GO00/20220929/115171/HHRG-117-GO00-20220929-SD010.pdf
		""",
		llm=ChatOpenAI(model='gpt-4.1-mini'),
	)
	result = await agent.run()
	logger.info(result)
5 | """ 6 | 7 | import asyncio 8 | import json 9 | import os 10 | import sys 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 13 | 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | import httpx 19 | from pydantic import BaseModel 20 | 21 | from browser_use import Agent, ChatOpenAI, Tools 22 | from browser_use.agent.views import ActionResult 23 | 24 | 25 | class Profile(BaseModel): 26 | platform: str 27 | profile_url: str 28 | 29 | 30 | class Profiles(BaseModel): 31 | profiles: list[Profile] 32 | 33 | 34 | tools = Tools(exclude_actions=['search_google'], output_model=Profiles) 35 | BEARER_TOKEN = os.getenv('BEARER_TOKEN') 36 | 37 | if not BEARER_TOKEN: 38 | # use the api key for ask tessa 39 | # you can also use other apis like exa, xAI, perplexity, etc. 40 | raise ValueError('BEARER_TOKEN is not set - go to https://www.heytessa.ai/ and create an api key') 41 | 42 | 43 | @tools.registry.action('Search the web for a specific query') 44 | async def search_web(query: str): 45 | keys_to_use = ['url', 'title', 'content', 'author', 'score'] 46 | headers = {'Authorization': f'Bearer {BEARER_TOKEN}'} 47 | async with httpx.AsyncClient() as client: 48 | response = await client.post( 49 | 'https://asktessa.ai/api/search', 50 | headers=headers, 51 | json={'query': query}, 52 | ) 53 | 54 | final_results = [ 55 | {key: source[key] for key in keys_to_use if key in source} 56 | for source in await response.json()['sources'] 57 | if source['score'] >= 0.2 58 | ] 59 | # print(json.dumps(final_results, indent=4)) 60 | result_text = json.dumps(final_results, indent=4) 61 | print(result_text) 62 | return ActionResult(extracted_content=result_text, include_in_memory=True) 63 | 64 | 65 | async def main(): 66 | task = ( 67 | 'Go to this tiktok video url, open it and extract the @username from the resulting url. Then do a websearch for this username to find all his social media profiles. 
Return me the links to the social media profiles with the platform name.' 68 | ' https://www.tiktokv.com/share/video/7470981717659110678/ ' 69 | ) 70 | model = ChatOpenAI(model='gpt-4.1-mini') 71 | agent = Agent(task=task, llm=model, tools=tools) 72 | 73 | history = await agent.run() 74 | 75 | result = history.final_result() 76 | if result: 77 | parsed: Profiles = Profiles.model_validate_json(result) 78 | 79 | for profile in parsed.profiles: 80 | print('\n--------------------------------') 81 | print(f'Platform: {profile.platform}') 82 | print(f'Profile URL: {profile.profile_url}') 83 | 84 | else: 85 | print('No result') 86 | 87 | 88 | if __name__ == '__main__': 89 | asyncio.run(main()) 90 | -------------------------------------------------------------------------------- /static/NiceHack69.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/static/NiceHack69.png -------------------------------------------------------------------------------- /static/browser-use-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/static/browser-use-dark.png -------------------------------------------------------------------------------- /static/browser-use.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/browser-use/browser-use/2d84ff6605f4d89988f7868633f288f351c6a349/static/browser-use.png -------------------------------------------------------------------------------- /tests/agent_tasks/README.md: -------------------------------------------------------------------------------- 1 | # Contributing Agent Tasks 2 | 3 | Contribute your own agent tasks and we test if the agent solves them for CI testing! 4 | 5 | ## How to Add a Task 6 | 7 | 1. 
Create a new `.yaml` file in this directory (`tests/agent_tasks/`). 8 | 2. Use the following format: 9 | 10 | ```yaml 11 | name: My Task Name 12 | task: Describe the task for the agent to perform 13 | judge_context: 14 | - List criteria for success, one per line 15 | max_steps: 10 16 | ``` 17 | 18 | ## Guidelines 19 | - Be specific in your task and criteria. 20 | - The `judge_context` should list what counts as a successful result. 21 | - The agent's output will be judged by an LLM using these criteria. 22 | 23 | ## Running the Tests 24 | 25 | To run all agent tasks: 26 | 27 | ```bash 28 | pytest tests/ci/test_agent_real_tasks.py 29 | ``` 30 | 31 | --- 32 | 33 | Happy contributing! 34 | -------------------------------------------------------------------------------- /tests/agent_tasks/amazon_laptop.yaml: -------------------------------------------------------------------------------- 1 | name: Amazon Laptop Search 2 | task: Go to amazon.com, search for 'laptop', and return the first result 3 | judge_context: 4 | - The agent must navigate to amazon.com 5 | - The agent must search for 'laptop' 6 | - The agent must return name of the first laptop 7 | max_steps: 10 8 | -------------------------------------------------------------------------------- /tests/agent_tasks/browser_use_pip.yaml: -------------------------------------------------------------------------------- 1 | name: Find pip install command for browser-use 2 | task: Find the pip installation command for the browser-use repo 3 | judge_context: 4 | - The output must include the command ('pip install browser-use') 5 | max_steps: 10 6 | -------------------------------------------------------------------------------- /tests/agent_tasks/captcha_cloudflare.yaml: -------------------------------------------------------------------------------- 1 | name: Cloudflare captcha 2 | task: Go to https://2captcha.com/demo/cloudflare-turnstile and solve the captcha, wait a few seconds, then click on check, wait a few more 
seconds for it to complete, then extract the "hostname" value from the displayed dictionary under "Captcha is passed successfully!" 3 | judge_context: 4 | - The agent must solve the captcha 5 | - The hostname returned should be "example.com" 6 | max_steps: 6 7 | -------------------------------------------------------------------------------- /tests/ci/test_browser_session_viewport_and_proxy.py: -------------------------------------------------------------------------------- 1 | async def test_proxy_settings_pydantic_model(): 2 | """ 3 | Test that ProxySettings as a Pydantic model is correctly converted to a dictionary when used. 4 | """ 5 | # Create ProxySettings with Pydantic model 6 | proxy_settings = dict(server='http://example.proxy:8080', bypass='localhost', username='testuser', password='testpass') 7 | 8 | # Verify the model has correct dict-like access 9 | assert proxy_settings['server'] == 'http://example.proxy:8080' 10 | assert proxy_settings.get('bypass') == 'localhost' 11 | assert proxy_settings.get('nonexistent', 'default') == 'default' 12 | 13 | # Verify model_dump works correctly 14 | proxy_dict = dict(proxy_settings) 15 | assert isinstance(proxy_dict, dict) 16 | assert proxy_dict['server'] == 'http://example.proxy:8080' 17 | assert proxy_dict['bypass'] == 'localhost' 18 | assert proxy_dict['username'] == 'testuser' 19 | assert proxy_dict['password'] == 'testpass' 20 | 21 | # We don't launch the actual browser - we just verify the model itself works as expected 22 | -------------------------------------------------------------------------------- /tests/ci/test_browser_watchdog_downloads_simple.py: -------------------------------------------------------------------------------- 1 | """Test simple download functionality.""" 2 | 3 | import pytest 4 | 5 | # Skip Playwright imports - removed dependency 6 | from pytest_httpserver import HTTPServer 7 | 8 | 9 | async def test_simple_playwright_download(): 10 | """Test basic Playwright download functionality 
without browser-use - this just validates the browser setup""" 11 | # Skip Playwright usage - removed dependency 12 | pytest.skip('Playwright dependency removed') 13 | 14 | 15 | @pytest.fixture(scope='function') 16 | def http_server(): 17 | """Create a test HTTP server with a downloadable file.""" 18 | server = HTTPServer() 19 | server.start() 20 | 21 | # Serve a simple text file for download 22 | server.expect_request('/download/test.txt').respond_with_data( 23 | 'Hello World from HTTP Server', status=200, headers={'Content-Type': 'text/plain'} 24 | ) 25 | 26 | yield server 27 | server.stop() 28 | 29 | 30 | async def test_browser_use_download_with_http_server(http_server): 31 | """Test browser-use download with HTTP server and event coordination""" 32 | # Skip complex element selection for now - would need to implement selector-to-index conversion 33 | pytest.skip('Complex element selection needs refactoring for CDP events') 34 | -------------------------------------------------------------------------------- /tests/ci/test_llm_anthropic_502_error.py: -------------------------------------------------------------------------------- 1 | """Test for handling Anthropic 502 errors""" 2 | 3 | import pytest 4 | from anthropic import APIStatusError 5 | 6 | from browser_use.llm.anthropic.chat import ChatAnthropic 7 | from browser_use.llm.exceptions import ModelProviderError 8 | from browser_use.llm.messages import BaseMessage, UserMessage 9 | 10 | 11 | @pytest.mark.asyncio 12 | async def test_anthropic_502_error_handling(monkeypatch): 13 | """Test that ChatAnthropic properly handles 502 errors from the API""" 14 | # Create a ChatAnthropic instance 15 | chat = ChatAnthropic(model='claude-3-5-sonnet-20240620', api_key='test-key') 16 | 17 | # Create test messages 18 | messages: list[BaseMessage] = [UserMessage(content='Test message')] 19 | 20 | # Mock the client to raise a 502 error 21 | class MockClient: 22 | class Messages: 23 | async def create(self, **kwargs): 24 | # 
Simulate a 502 error from Anthropic API 25 | import httpx 26 | 27 | request = httpx.Request('POST', 'https://api.anthropic.com/v1/messages') 28 | response = httpx.Response(status_code=502, headers={}, content=b'Bad Gateway', request=request) 29 | raise APIStatusError( 30 | message='Bad Gateway', response=response, body={'error': {'message': 'Bad Gateway', 'type': 'server_error'}} 31 | ) 32 | 33 | messages = Messages() 34 | 35 | # Replace the client with our mock 36 | monkeypatch.setattr(chat, 'get_client', lambda: MockClient()) 37 | 38 | # Test that the error is properly caught and re-raised as ModelProviderError 39 | with pytest.raises(ModelProviderError) as exc_info: 40 | await chat.ainvoke(messages) 41 | 42 | # Verify the error details 43 | assert exc_info.value.args[0] == 'Bad Gateway' 44 | assert exc_info.value.args[1] == 502 45 | assert str(exc_info.value) == "('Bad Gateway', 502)" 46 | 47 | 48 | @pytest.mark.asyncio 49 | async def test_anthropic_error_does_not_access_usage(monkeypatch): 50 | """Test that error handling doesn't try to access usage attribute on error responses""" 51 | chat = ChatAnthropic(model='claude-3-5-sonnet-20240620', api_key='test-key') 52 | 53 | messages: list[BaseMessage] = [UserMessage(content='Test message')] 54 | 55 | # Mock the client to return a string instead of a proper response 56 | class MockClient: 57 | class Messages: 58 | async def create(self, **kwargs): 59 | # This simulates what might happen if the API returns an unexpected response 60 | # that gets parsed as a string 61 | return 'Error: Bad Gateway' 62 | 63 | messages = Messages() 64 | 65 | monkeypatch.setattr(chat, 'get_client', lambda: MockClient()) 66 | 67 | # This should raise a ModelProviderError with a clear message 68 | with pytest.raises(ModelProviderError) as exc_info: 69 | await chat.ainvoke(messages) 70 | 71 | # The error should be about unexpected response type, not missing 'usage' attribute 72 | assert "'str' object has no attribute 'usage'" not in 
str(exc_info.value) 73 | assert 'Unexpected response type from Anthropic API' in str(exc_info.value) 74 | assert exc_info.value.args[1] == 502 75 | -------------------------------------------------------------------------------- /tests/ci/test_llm_schema_optimizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the SchemaOptimizer to ensure it correctly processes and 3 | optimizes the schemas for agent actions without losing information. 4 | """ 5 | 6 | from pydantic import BaseModel 7 | 8 | from browser_use.agent.views import AgentOutput 9 | from browser_use.llm.schema import SchemaOptimizer 10 | from browser_use.tools.service import Tools 11 | 12 | 13 | class ProductInfo(BaseModel): 14 | """A sample structured output model with multiple fields.""" 15 | 16 | price: str 17 | title: str 18 | rating: float | None = None 19 | 20 | 21 | def test_optimizer_preserves_all_fields_in_structured_done_action(): 22 | """ 23 | Ensures the SchemaOptimizer does not drop fields from a custom structured 24 | output model when creating the schema for the 'done' action. 25 | 26 | This test specifically checks for a bug where fields were being lost 27 | during the optimization process. 28 | """ 29 | # 1. Setup a tools with a custom output model, simulating an Agent 30 | # being created with an `output_model_schema`. 31 | tools = Tools(output_model=ProductInfo) 32 | 33 | # 2. Get the dynamically created AgentOutput model, which includes all registered actions. 34 | ActionModel = tools.registry.create_action_model() 35 | agent_output_model = AgentOutput.type_with_custom_actions(ActionModel) 36 | 37 | # 3. Run the schema optimizer on the agent's output model. 38 | optimized_schema = SchemaOptimizer.create_optimized_json_schema(agent_output_model) 39 | 40 | # 4. Find the 'done' action schema within the optimized output. 41 | # The path is properties -> action -> items -> anyOf -> [schema with 'done']. 
42 | done_action_schema = None 43 | actions_schemas = optimized_schema.get('properties', {}).get('action', {}).get('items', {}).get('anyOf', []) 44 | for action_schema in actions_schemas: 45 | if 'done' in action_schema.get('properties', {}): 46 | done_action_schema = action_schema 47 | break 48 | 49 | # 5. Assert that the 'done' action schema was successfully found. 50 | assert done_action_schema is not None, "Could not find 'done' action in the optimized schema." 51 | 52 | # 6. Navigate to the schema for our custom data model within the 'done' action. 53 | # The path is properties -> done -> properties -> data -> properties. 54 | done_params_schema = done_action_schema.get('properties', {}).get('done', {}) 55 | structured_data_schema = done_params_schema.get('properties', {}).get('data', {}) 56 | final_properties = structured_data_schema.get('properties', {}) 57 | 58 | # 7. Assert that the set of fields in the optimized schema matches the original model's fields. 59 | original_fields = set(ProductInfo.model_fields.keys()) 60 | optimized_fields = set(final_properties.keys()) 61 | 62 | assert original_fields == optimized_fields, ( 63 | f"Field mismatch between original and optimized structured 'done' action schema.\n" 64 | f'Missing from optimized: {original_fields - optimized_fields}\n' 65 | f'Unexpected in optimized: {optimized_fields - original_fields}' 66 | ) 67 | --------------------------------------------------------------------------------