├── src └── mcp_browser_use │ ├── agent │ ├── __init__.py │ ├── custom_views.py │ ├── custom_massage_manager.py │ ├── custom_prompts.py │ └── custom_agent.py │ ├── browser │ ├── __init__.py │ └── browser_manager.py │ ├── utils │ ├── __init__.py │ ├── logging.py │ ├── agent_state.py │ └── utils.py │ ├── controller │ ├── __init__.py │ └── custom_controller.py │ ├── mcp_browser_use.py │ ├── __init__.py │ ├── client.py │ └── server.py ├── tests ├── stubs │ ├── browser_use │ │ ├── agent │ │ │ ├── prompts.py │ │ │ ├── message_manager │ │ │ │ ├── service.py │ │ │ │ └── views.py │ │ │ ├── service.py │ │ │ └── views.py │ │ ├── browser │ │ │ ├── browser.py │ │ │ ├── events.py │ │ │ ├── __init__.py │ │ │ ├── profile.py │ │ │ ├── context.py │ │ │ └── views.py │ │ ├── controller │ │ │ ├── registry │ │ │ │ └── views.py │ │ │ └── service.py │ │ ├── telemetry │ │ │ └── views.py │ │ ├── utils.py │ │ └── __init__.py │ ├── langchain_openai │ │ ├── chat_models │ │ │ ├── base.py │ │ │ └── __init__.py │ │ └── __init__.py │ ├── langchain_core │ │ ├── messages │ │ │ └── __init__.py │ │ ├── language_models │ │ │ ├── __init__.py │ │ │ └── chat_models.py │ │ └── prompts │ │ │ └── __init__.py │ └── PIL │ │ └── __init__.py ├── test_agent_state.py ├── test_logging_configuration.py ├── conftest.py ├── test_gif_creation.py ├── test_browser_manager.py ├── test_custom_agent_controller.py ├── test_summarize_messages.py ├── test_client_session.py └── test_utils.py ├── .gitattributes ├── renovate.json ├── .editorconfig ├── pyproject.toml ├── Dockerfile ├── .gitignore ├── sample.env.env ├── smithery.yaml ├── README.md └── documentation ├── CONFIGURATION.md └── SECURITY.md /src/mcp_browser_use/agent/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/mcp_browser_use/browser/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/mcp_browser_use/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/mcp_browser_use/controller/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/prompts.py: -------------------------------------------------------------------------------- 1 | class SystemPrompt: 2 | pass 3 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/browser.py: -------------------------------------------------------------------------------- 1 | class Browser: 2 | pass 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/controller/registry/views.py: -------------------------------------------------------------------------------- 1 | class ActionModel: 2 | pass 3 | -------------------------------------------------------------------------------- /tests/stubs/langchain_openai/chat_models/base.py: -------------------------------------------------------------------------------- 1 | _convert_message_to_dict = lambda x: {} 2 | -------------------------------------------------------------------------------- /renovate.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": ["config:recommended"] 4 | } 5 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/message_manager/service.py: -------------------------------------------------------------------------------- 1 | class MessageManager: 2 | def __init__(self, *args, **kwargs): 3 | pass 4 | -------------------------------------------------------------------------------- /tests/stubs/langchain_openai/__init__.py: -------------------------------------------------------------------------------- 1 | from .chat_models import AzureChatOpenAI, ChatOpenAI 2 | 3 | __all__ = ["ChatOpenAI", "AzureChatOpenAI"] 4 | -------------------------------------------------------------------------------- /tests/stubs/langchain_core/messages/__init__.py: -------------------------------------------------------------------------------- 1 | class BaseMessage: pass 2 | class HumanMessage: pass 3 | class AIMessage: pass 4 | class SystemMessage: pass 5 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/service.py: -------------------------------------------------------------------------------- 1 | class Agent: 2 | def __init__(self, *args, **kwargs): 3 | self.history = kwargs.get('history', None) 4 | self.generate_gif = False 5 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/telemetry/views.py: -------------------------------------------------------------------------------- 1 | class AgentEndTelemetryEvent: 2 | def __init__(self, *args, **kwargs): 3 | pass 4 | class AgentRunTelemetryEvent: 5 | def __init__(self, *args, **kwargs): 6 | pass 7 | -------------------------------------------------------------------------------- 
/tests/stubs/langchain_core/language_models/__init__.py: -------------------------------------------------------------------------------- 1 | class BaseChatModel: 2 | def with_structured_output(self, *args, **kwargs): 3 | return self 4 | async def ainvoke(self, *args, **kwargs): 5 | return {} 6 | -------------------------------------------------------------------------------- /tests/stubs/langchain_core/language_models/chat_models.py: -------------------------------------------------------------------------------- 1 | class BaseChatModel: 2 | async def ainvoke(self, *args, **kwargs): 3 | return {} 4 | def with_structured_output(self, *args, **kwargs): 5 | return self 6 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/utils.py: -------------------------------------------------------------------------------- 1 | def time_execution_async(name): 2 | def decorator(func): 3 | async def wrapper(*args, **kwargs): 4 | return await func(*args, **kwargs) 5 | 6 | return wrapper 7 | 8 | return decorator 9 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/events.py: -------------------------------------------------------------------------------- 1 | class SendKeysEvent: 2 | def __init__(self, keys: str): 3 | self.keys = keys 4 | 5 | 6 | class ScreenshotEvent: 7 | def __init__(self, full_page: bool = False): 8 | self.full_page = full_page 9 | -------------------------------------------------------------------------------- /src/mcp_browser_use/mcp_browser_use.py: -------------------------------------------------------------------------------- 1 | """Public entry-points for backwards compatible imports.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .client import AgentNotRegisteredError, create_client_session 6 | 7 | __all__ = ["AgentNotRegisteredError", "create_client_session"] 8 | 
-------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/__init__.py: -------------------------------------------------------------------------------- 1 | from .. import BrowserSession as Browser # noqa: F401 2 | from .events import SendKeysEvent # noqa: F401 3 | from .profile import BrowserProfile, ProxySettings # noqa: F401 4 | 5 | __all__ = ["Browser", "BrowserProfile", "ProxySettings", "SendKeysEvent"] 6 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/message_manager/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, List 3 | 4 | @dataclass 5 | class MessageHistory: 6 | messages: List[Any] = field(default_factory=list) 7 | total_tokens: int = 0 8 | 9 | @dataclass 10 | class ManagedMessage: 11 | message: Any 12 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/profile.py: -------------------------------------------------------------------------------- 1 | class ProxySettings: 2 | def __init__(self, **kwargs): 3 | for key, value in kwargs.items(): 4 | setattr(self, key, value) 5 | 6 | 7 | class BrowserProfile: 8 | def __init__(self, **kwargs): 9 | for key, value in kwargs.items(): 10 | setattr(self, key, value) 11 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/context.py: -------------------------------------------------------------------------------- 1 | class BrowserContextConfig: 2 | def __init__(self, **kwargs): 3 | for key, value in kwargs.items(): 4 | setattr(self, key, value) 5 | 6 | 7 | class BrowserContext: 8 | async def get_state(self, *args, **kwargs): 9 | pass 10 | 11 | async def close(self): 12 | pass 13 | -------------------------------------------------------------------------------- 
/tests/stubs/browser_use/browser/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class BrowserStateHistory: 5 | url: str = "" 6 | title: str = "" 7 | tabs: list = None 8 | interacted_element: list = None 9 | screenshot: str | None = None 10 | 11 | @dataclass 12 | class BrowserState: 13 | screenshot: str | None = None 14 | -------------------------------------------------------------------------------- /tests/stubs/langchain_core/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | class ChatPromptTemplate: 2 | @staticmethod 3 | def from_messages(msgs): 4 | return ChatPromptTemplate() 5 | def __or__(self, other): 6 | return self 7 | def invoke(self, data): 8 | return '' 9 | 10 | class MessagesPlaceholder: 11 | def __init__(self, variable_name=''): 12 | pass 13 | -------------------------------------------------------------------------------- /src/mcp_browser_use/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """MCP server for browser-use.""" 4 | 5 | from mcp_browser_use.mcp_browser_use import ( # noqa: F401 6 | AgentNotRegisteredError, 7 | create_client_session, 8 | ) 9 | from mcp_browser_use.server import app, launch_mcp_browser_use_server 10 | 11 | __all__ = [ 12 | "app", 13 | "launch_mcp_browser_use_server", 14 | "create_client_session", 15 | "AgentNotRegisteredError", 16 | ] 17 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # Check http://editorconfig.org for more information 2 | # This is the main config file for this project: 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | insert_final_newline = true 9 | indent_style = space 10 | indent_size = 2 11 | 
trim_trailing_whitespace = true 12 | 13 | [*.{py, pyi}] 14 | indent_style = space 15 | indent_size = 4 16 | 17 | [Makefile] 18 | indent_style = tab 19 | 20 | [*.md] 21 | trim_trailing_whitespace = false -------------------------------------------------------------------------------- /tests/stubs/browser_use/controller/service.py: -------------------------------------------------------------------------------- 1 | class _Registry: 2 | def get_prompt_description(self): 3 | return "" 4 | 5 | def create_action_model(self): 6 | return type("ActionModel", (), {}) 7 | 8 | def action(self, *_args, **_kwargs): 9 | def decorator(func): 10 | return func 11 | 12 | return decorator 13 | 14 | 15 | class Controller: 16 | def __init__(self): 17 | self.registry = _Registry() 18 | 19 | async def multi_act(self, actions, context): # pragma: no cover - stub 20 | return [] 21 | -------------------------------------------------------------------------------- /tests/stubs/langchain_openai/chat_models/__init__.py: -------------------------------------------------------------------------------- 1 | class Base: 2 | pass 3 | 4 | class ChatOpenAI: 5 | def __init__(self, *args, **kwargs): 6 | pass 7 | 8 | root_async_client = None 9 | model_name = 'mock' 10 | def with_structured_output(self, *args, **kwargs): 11 | return self 12 | async def ainvoke(self, *args, **kwargs): 13 | return {} 14 | 15 | 16 | class AzureChatOpenAI(ChatOpenAI): 17 | """Minimal stub mirroring the OpenAI chat client API.""" 18 | 19 | def __init__(self, *args, **kwargs): 20 | super().__init__(*args, **kwargs) 21 | 22 | -------------------------------------------------------------------------------- /tests/test_agent_state.py: -------------------------------------------------------------------------------- 1 | from mcp_browser_use.utils.agent_state import AgentState 2 | 3 | 4 | def test_agent_state_stop_flow(): 5 | state = AgentState() 6 | 7 | assert state.is_stop_requested() is False 8 | 9 | state.request_stop() 10 | 
assert state.is_stop_requested() is True 11 | 12 | state.clear_stop() 13 | assert state.is_stop_requested() is False 14 | 15 | 16 | def test_agent_state_last_valid_state_reset(): 17 | state = AgentState() 18 | 19 | marker = {"url": "https://example.com"} 20 | state.set_last_valid_state(marker) 21 | 22 | assert state.get_last_valid_state() == marker 23 | 24 | state.clear_stop() 25 | 26 | assert state.get_last_valid_state() is None 27 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, List, Optional 3 | 4 | @dataclass 5 | class ActionResult: 6 | extracted_content: Optional[str] = None 7 | error: Optional[str] = None 8 | is_done: bool = False 9 | include_in_memory: bool = False 10 | 11 | @dataclass 12 | class AgentHistory: 13 | model_output: Any 14 | state: Any 15 | result: List[ActionResult] 16 | 17 | @dataclass 18 | class AgentHistoryList: 19 | history: List[AgentHistory] = field(default_factory=list) 20 | def is_done(self) -> bool: 21 | for h in self.history: 22 | for r in h.result: 23 | if r.is_done: 24 | return True 25 | return False 26 | 27 | @dataclass 28 | class AgentStepInfo: 29 | step_number: int = 0 30 | 31 | class AgentOutput: 32 | pass 33 | -------------------------------------------------------------------------------- /src/mcp_browser_use/utils/logging.py: -------------------------------------------------------------------------------- 1 | """Centralised logging configuration utilities for the MCP browser agent.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import os 7 | from typing import Optional 8 | 9 | 10 | _DEFAULT_FORMAT = "%(asctime)s | %(levelname)s | %(name)s | %(message)s" 11 | 12 | 13 | def _resolve_level(level_name: Optional[str]) -> int: 14 | """Translate a string level name into a numeric 
logging level.""" 15 | 16 | if not level_name: 17 | return logging.INFO 18 | 19 | try: 20 | return int(level_name) 21 | except ValueError: 22 | resolved = logging.getLevelName(level_name.upper()) 23 | if isinstance(resolved, int): 24 | return resolved 25 | return logging.INFO 26 | def configure_logging() -> None: 27 | """Configure the root logger once for the application.""" 28 | 29 | level = _resolve_level(os.getenv("LOG_LEVEL")) 30 | 31 | root_logger = logging.getLogger() 32 | if not root_logger.handlers: 33 | logging.basicConfig(level=level, format=_DEFAULT_FORMAT) 34 | else: 35 | root_logger.setLevel(level) 36 | -------------------------------------------------------------------------------- /src/mcp_browser_use/utils/agent_state.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | If we plan to scale or have multiple agents, we might remove the singleton pattern or differentiate them by agent ID. 6 | """ 7 | 8 | import asyncio 9 | from typing import Any, Optional 10 | 11 | 12 | class AgentState: 13 | """ 14 | Tracks an asynchronous stop signal and stores the last valid browser state. 15 | 16 | request_stop() sets an asyncio.Event, is_stop_requested() checks if it's set, 17 | clear_stop() resets the event and last_valid_state. 
18 | """ 19 | 20 | def __init__(self) -> None: 21 | self._stop_requested = asyncio.Event() 22 | self._last_valid_state: Optional[Any] = None 23 | 24 | def request_stop(self) -> None: 25 | self._stop_requested.set() 26 | 27 | def clear_stop(self) -> None: 28 | self._stop_requested.clear() 29 | self._last_valid_state = None 30 | 31 | def is_stop_requested(self) -> bool: 32 | return self._stop_requested.is_set() 33 | 34 | def set_last_valid_state(self, state: Any) -> None: 35 | self._last_valid_state = state 36 | 37 | def get_last_valid_state(self) -> Optional[Any]: 38 | return self._last_valid_state 39 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "mcp_browser_use" 3 | version = "0.1.0" 4 | description = "This Python project is a FastAPI server implementing MCP Server protocol Browser automation via browser-use library." 5 | readme = "README.md" 6 | requires-python = ">=3.11" 7 | license = { text = "MIT" } 8 | classifiers = [ 9 | "Development Status :: 4 - Beta", 10 | "Programming Language :: Python :: 3", 11 | "Programming Language :: Python :: 3.11", 12 | "Operating System :: OS Independent", 13 | ] 14 | 15 | dependencies = [ 16 | "pydantic>=2.11.9", 17 | "uvicorn>=0.37.0", 18 | "browser-use>=0.7.9", 19 | "fastapi>=0.117.1", 20 | "fastmcp>=2.12.4", 21 | "instructor>=1.11.3", 22 | "langchain>=0.3.27", 23 | "langchain-google-genai>=2.1.1", 24 | "langchain-openai>=0.2.14", 25 | "langchain-anthropic>=0.3.20", 26 | "langchain-ollama>=0.2.2", 27 | "openai>=1.109.1", 28 | "pillow>=11.3.0", 29 | "python-dotenv>=1.1.1", 30 | "pyperclip>=1.11.0", 31 | ] 32 | 33 | [build-system] 34 | requires = ["hatchling"] 35 | build-backend = "hatchling.build" 36 | 37 | [tool.hatch.build.targets.wheel] 38 | packages = ["src/mcp_browser_use"] 39 | 40 | [project.scripts] 41 | mcp-browser-use = 
"mcp_browser_use.server:launch_mcp_browser_use_server" 42 | -------------------------------------------------------------------------------- /tests/test_logging_configuration.py: -------------------------------------------------------------------------------- 1 | """Smoke tests around module imports and logging configuration.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib 6 | import logging 7 | import sys 8 | from typing import Iterable 9 | 10 | import pytest 11 | 12 | 13 | MODULES_TO_TEST: Iterable[str] = ( 14 | "mcp_browser_use.controller.custom_controller", 15 | "mcp_browser_use.utils.utils", 16 | "mcp_browser_use.agent.custom_agent", 17 | "mcp_browser_use.agent.custom_message_manager", 18 | ) 19 | 20 | 21 | @pytest.mark.parametrize("module_name", MODULES_TO_TEST) 22 | def test_module_import_does_not_call_basic_config(module_name: str, monkeypatch) -> None: 23 | """Ensure importing project modules does not invoke ``logging.basicConfig``.""" 24 | 25 | # Import once so that shared third-party dependencies are cached. 26 | importlib.import_module(module_name) 27 | sys.modules.pop(module_name, None) 28 | 29 | calls: list[tuple[tuple[object, ...], dict[str, object]]] = [] 30 | 31 | def record_basic_config(*args: object, **kwargs: object) -> None: 32 | calls.append((args, kwargs)) 33 | 34 | monkeypatch.setattr(logging, "basicConfig", record_basic_config) 35 | 36 | importlib.import_module(module_name) 37 | 38 | assert calls == [], f"Module {module_name} should not call logging.basicConfig during import" 39 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Generated by https://smithery.ai. 
See: https://smithery.ai/docs/config#dockerfile 2 | # Use a Python image with uv pre-installed 3 | FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS uv 4 | 5 | # Install the project into /app 6 | WORKDIR /app 7 | 8 | # Enable bytecode compilation 9 | ENV UV_COMPILE_BYTECODE=1 10 | 11 | # Copy from the cache instead of linking since it's a mounted volume 12 | ENV UV_LINK_MODE=copy 13 | 14 | # Install the project's dependencies using the lockfile and settings 15 | RUN --mount=type=cache,target=/root/.cache/uv \ 16 | --mount=type=bind,source=uv.lock,target=uv.lock \ 17 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 18 | uv sync --frozen --no-install-project --no-dev --no-editable 19 | 20 | # Then, add the rest of the project source code and install it 21 | # Installing separately from its dependencies allows optimal layer caching 22 | ADD . /app 23 | RUN --mount=type=cache,target=/root/.cache/uv \ 24 | uv sync --frozen --no-dev --no-editable 25 | 26 | FROM python:3.13-slim-bookworm 27 | 28 | WORKDIR /app 29 | 30 | COPY --from=uv /root/.local /root/.local 31 | COPY --from=uv --chown=app:app /app/.venv /app/.venv 32 | 33 | # Place executables in the environment at the front of the path 34 | ENV PATH="/app/.venv/bin:$PATH" 35 | 36 | # when running the container, add --db-path and a bind mount to the host's db file 37 | ENTRYPOINT ["mcp-browser-use"] 38 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test fixtures and environment setup for the test suite.""" 2 | 3 | import importlib 4 | import os 5 | import sys 6 | import types 7 | 8 | BASE_DIR = os.path.dirname(__file__) 9 | STUBS_DIR = os.path.join(BASE_DIR, "stubs") 10 | SRC_DIR = os.path.join(os.path.dirname(BASE_DIR), "src") 11 | 12 | for path in (STUBS_DIR, SRC_DIR): 13 | if path not in sys.path: 14 | sys.path.insert(0, path) 15 | 16 | if 
"langchain_openai" not in sys.modules: 17 | importlib.import_module("langchain_openai") 18 | 19 | if "langchain_anthropic" not in sys.modules: 20 | module = types.ModuleType("langchain_anthropic") 21 | 22 | class ChatAnthropic: # type: ignore[too-many-ancestors] 23 | def __init__(self, *args, **kwargs): 24 | pass 25 | 26 | module.ChatAnthropic = ChatAnthropic 27 | sys.modules["langchain_anthropic"] = module 28 | 29 | if "langchain_google_genai" not in sys.modules: 30 | module = types.ModuleType("langchain_google_genai") 31 | 32 | class ChatGoogleGenerativeAI: # type: ignore[too-many-ancestors] 33 | def __init__(self, *args, **kwargs): 34 | pass 35 | 36 | module.ChatGoogleGenerativeAI = ChatGoogleGenerativeAI 37 | sys.modules["langchain_google_genai"] = module 38 | 39 | if "langchain_ollama" not in sys.modules: 40 | module = types.ModuleType("langchain_ollama") 41 | 42 | class ChatOllama: # type: ignore[too-many-ancestors] 43 | def __init__(self, *args, **kwargs): 44 | pass 45 | 46 | module.ChatOllama = ChatOllama 47 | sys.modules["langchain_ollama"] = module 48 | 49 | -------------------------------------------------------------------------------- /tests/test_gif_creation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import base64 4 | import io 5 | 6 | # Add stub package path before importing CustomAgent 7 | BASE_DIR = os.path.dirname(__file__) 8 | sys.path.insert(0, os.path.join(BASE_DIR, "stubs")) 9 | sys.path.insert(0, os.path.join(os.path.dirname(BASE_DIR), "src")) 10 | 11 | from PIL import Image 12 | 13 | from mcp_browser_use.agent.custom_agent import CustomAgent 14 | from browser_use.agent.views import AgentHistoryList, AgentHistory, ActionResult 15 | from browser_use.browser.views import BrowserStateHistory 16 | 17 | 18 | class DummyState: 19 | def __init__(self, thought: str): 20 | self.current_state = type("Brain", (), {"thought": thought})() 21 | 22 | 23 | def create_screenshot() -> 
str: 24 | img = Image.new("RGB", (100, 100), color="white") 25 | buf = io.BytesIO() 26 | img.save(buf, format="PNG") 27 | return base64.b64encode(buf.getvalue()).decode("utf-8") 28 | 29 | 30 | def test_create_history_gif(tmp_path): 31 | screenshot = create_screenshot() 32 | hist = AgentHistoryList( 33 | history=[ 34 | AgentHistory( 35 | model_output=DummyState("step one"), 36 | state=BrowserStateHistory(screenshot=screenshot), 37 | result=[ActionResult(is_done=False)], 38 | ), 39 | AgentHistory( 40 | model_output=DummyState("step two"), 41 | state=BrowserStateHistory(screenshot=screenshot), 42 | result=[ActionResult(is_done=True)], 43 | ), 44 | ] 45 | ) 46 | 47 | agent = CustomAgent.__new__(CustomAgent) 48 | agent.history = hist 49 | agent.task = "My Task" 50 | 51 | output_gif = tmp_path / "out.gif" 52 | agent.create_history_gif(output_path=str(output_gif)) 53 | 54 | assert output_gif.exists() 55 | -------------------------------------------------------------------------------- /tests/test_browser_manager.py: -------------------------------------------------------------------------------- 1 | """Tests for browser manager environment configuration helpers.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib 6 | 7 | import pytest 8 | 9 | 10 | browser_manager = importlib.import_module( 11 | "mcp_browser_use.browser.browser_manager" 12 | ) 13 | 14 | 15 | @pytest.fixture(autouse=True) 16 | def clear_browser_env(monkeypatch): 17 | """Ensure browser-related environment variables do not leak between tests.""" 18 | 19 | for key in ( 20 | "BROWSER_USE_CDP_URL", 21 | "CHROME_DEBUGGING_HOST", 22 | "CHROME_DEBUGGING_PORT", 23 | ): 24 | monkeypatch.delenv(key, raising=False) 25 | 26 | 27 | def test_from_env_derives_cdp_url_from_debugging(monkeypatch): 28 | """When only debugging env vars are set, derive a CDP URL automatically.""" 29 | 30 | monkeypatch.setenv("CHROME_DEBUGGING_HOST", "debug.example") 31 | monkeypatch.setenv("CHROME_DEBUGGING_PORT", "1337") 32 
| 33 | config = browser_manager.BrowserEnvironmentConfig.from_env() 34 | 35 | assert config.cdp_url == "http://debug.example:1337" 36 | 37 | 38 | def test_create_browser_session_preserves_computed_cdp_url(monkeypatch): 39 | """Computed CDP URL is passed to BrowserSession when overrides omit it.""" 40 | 41 | monkeypatch.setenv("CHROME_DEBUGGING_HOST", "localhost") 42 | monkeypatch.setenv("CHROME_DEBUGGING_PORT", "9000") 43 | 44 | captured_kwargs: dict[str, object] = {} 45 | 46 | class DummyBrowserSession: 47 | def __init__(self, **kwargs): 48 | captured_kwargs.update(kwargs) 49 | 50 | monkeypatch.setattr(browser_manager, "BrowserSession", DummyBrowserSession) 51 | 52 | session = browser_manager.create_browser_session() 53 | 54 | assert isinstance(session, DummyBrowserSession) 55 | assert captured_kwargs["cdp_url"] == "http://localhost:9000" 56 | -------------------------------------------------------------------------------- /tests/stubs/PIL/__init__.py: -------------------------------------------------------------------------------- 1 | class DummyImage: 2 | def __init__(self, width=100, height=100): 3 | self.width = width 4 | self.height = height 5 | self.mode = "RGBA" 6 | 7 | @property 8 | def size(self): 9 | return (self.width, self.height) 10 | 11 | def convert(self, mode): 12 | self.mode = mode 13 | return self 14 | 15 | def resize(self, size, resample=None): 16 | self.width, self.height = size 17 | return self 18 | 19 | def save(self, fp, *args, **kwargs): 20 | if hasattr(fp, "write"): 21 | fp.write(b"dummy") 22 | else: 23 | with open(fp, "wb") as f: 24 | f.write(b"dummy") 25 | 26 | def alpha_composite(self, other): 27 | pass 28 | 29 | def paste(self, img, pos, mask=None): 30 | pass 31 | 32 | 33 | class Image: 34 | @staticmethod 35 | def open(fp): 36 | return DummyImage() 37 | 38 | @staticmethod 39 | def new(mode, size, color=(0, 0, 0, 0)): 40 | return DummyImage(*size) 41 | 42 | Resampling = type("Resampling", (), {"LANCZOS": 0}) 43 | Image = DummyImage 44 | 
45 | 46 | class ImageDraw: 47 | class Draw: 48 | def __init__(self, img): 49 | pass 50 | 51 | def text(self, *args, **kwargs): 52 | pass 53 | 54 | def rectangle(self, *args, **kwargs): 55 | pass 56 | 57 | def textbbox(self, xy, text, font=None): 58 | # return left, top, right, bottom 59 | return (0, 0, len(text) * 10, 10) 60 | 61 | def textlength(self, text, font=None): 62 | return len(text) * 10 63 | 64 | ImageDraw = Draw 65 | 66 | 67 | class ImageFont: 68 | class FreeTypeFont: 69 | pass 70 | 71 | @staticmethod 72 | def truetype(font, size): 73 | return ImageFont.FreeTypeFont() 74 | 75 | @staticmethod 76 | def load_default(): 77 | return ImageFont.FreeTypeFont() 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ 116 | 117 | # ignore the database 118 | *.db 119 | 120 | # ignore vscode settings 121 | .vscode/ 122 | 123 | # Project Files 124 | /*.json 125 | target/ 126 | dbt_packages/ 127 | dbt_packages/* 128 | logs/ 129 | /secrets/* 130 | #mac pc specific - system configuration files 131 | .DS_Store 132 | -------------------------------------------------------------------------------- /tests/test_custom_agent_controller.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | BASE_DIR = os.path.dirname(__file__) 5 | sys.path.insert(0, os.path.join(BASE_DIR, "stubs")) 6 | sys.path.insert(0, os.path.join(os.path.dirname(BASE_DIR), "src")) 7 | 
8 | import pytest 9 | from langchain_core.language_models.chat_models import BaseChatModel 10 | from unittest.mock import Mock 11 | 12 | import mcp_browser_use.agent.custom_agent as custom_agent_module 13 | 14 | 15 | @pytest.fixture 16 | def custom_agent(monkeypatch): 17 | class DummyMessageManager: 18 | def __init__(self, *args, **kwargs): 19 | pass 20 | 21 | monkeypatch.setattr( 22 | custom_agent_module, 23 | "CustomMassageManager", 24 | DummyMessageManager, 25 | ) 26 | 27 | def fake_agent_init(self, *args, **kwargs): 28 | for key, value in kwargs.items(): 29 | setattr(self, key, value) 30 | # Set attributes not passed in kwargs that are needed 31 | self.n_steps = 0 32 | self._last_result = None 33 | self.message_manager = None 34 | self.history = None 35 | self.generate_gif = False 36 | 37 | monkeypatch.setattr(custom_agent_module.Agent, "__init__", fake_agent_init) 38 | 39 | return custom_agent_module 40 | 41 | 42 | def test_custom_agent_creates_independent_default_controllers( 43 | custom_agent, monkeypatch 44 | ): 45 | controllers = [] 46 | 47 | class TrackingController(custom_agent.Controller): 48 | def __init__(self): 49 | super().__init__() 50 | controllers.append(self) 51 | 52 | monkeypatch.setattr(custom_agent, "Controller", TrackingController) 53 | 54 | llm = Mock(spec=BaseChatModel) 55 | agent_one = custom_agent.CustomAgent(task="Task one", llm=llm) 56 | agent_two = custom_agent.CustomAgent(task="Task two", llm=llm) 57 | 58 | assert agent_one.controller is not agent_two.controller 59 | assert controllers == [agent_one.controller, agent_two.controller] 60 | 61 | 62 | def test_custom_agent_uses_supplied_controller(custom_agent): 63 | llm = Mock(spec=BaseChatModel) 64 | provided_controller = custom_agent.Controller() 65 | 66 | agent = custom_agent.CustomAgent( 67 | task="Task with supplied controller", 68 | llm=llm, 69 | controller=provided_controller, 70 | ) 71 | 72 | assert agent.controller is provided_controller 73 | 
-------------------------------------------------------------------------------- /sample.env.env: -------------------------------------------------------------------------------- 1 | # --------------------------- 2 | # API Keys (Replace as needed) 3 | # --------------------------- 4 | OPENAI_API_KEY=your_openai_api_key_here 5 | ANTHROPIC_API_KEY=your_anthropic_api_key_here 6 | GOOGLE_API_KEY=your_google_api_key_here 7 | AZURE_OPENAI_API_KEY=your_azure_api_key_here 8 | DEEPSEEK_API_KEY=your_deepseek_api_key_here 9 | 10 | # ---------------------------------- 11 | # Model Provider & Endpoint Settings 12 | # ---------------------------------- 13 | # Typical endpoints; change to match your usage. 14 | OPENAI_ENDPOINT=https://api.openai.com/v1 15 | ANTHROPIC_API_ENDPOINT=https://api.anthropic.com 16 | AZURE_OPENAI_ENDPOINT=https://your-azure-openai-endpoint 17 | DEEPSEEK_ENDPOINT=https://api.deepseek.com 18 | 19 | # --------------------------- 20 | # Model & Agent Configuration 21 | # --------------------------- 22 | # Choose one provider: "openai", "anthropic", "azure_openai", "deepseek", "gemini", "ollama". 23 | MCP_MODEL_PROVIDER=anthropic 24 | MCP_MODEL_NAME=claude-3-5-sonnet-20241022 25 | MCP_TEMPERATURE=0.3 26 | MCP_MAX_STEPS=30 27 | MCP_MAX_ACTIONS_PER_STEP=5 28 | MCP_USE_VISION=true 29 | MCP_TOOL_CALL_IN_CONTENT=true 30 | 31 | # --------------------------------- 32 | # Chrome / Playwright Configuration 33 | # --------------------------------- 34 | # If CHROME_PATH is set, the code will attempt to launch a locally installed Chrome 35 | # with remote debugging on port 9222. 36 | # If left empty, it will launch a standard Chromium instance via Playwright. 
37 | 38 | CHROME_PATH=/path/to/your/chrome/binary 39 | CHROME_USER_DATA=/path/to/your/chrome-profile 40 | CHROME_DEBUGGING_PORT=9222 41 | CHROME_DEBUGGING_HOST=localhost 42 | CHROME_PERSISTENT_SESSION=false 43 | 44 | # You can add extra flags in your code if needed: 45 | # Example: export CHROME_EXTRA_ARGS="--some-chrome-flag" 46 | 47 | # -------------- 48 | # Other Settings 49 | # -------------- 50 | # Adjust HEADLESS or DISABLE_SECURITY if your code checks them. 51 | # By default, you might keep them out or set them in the code itself. 52 | 53 | # HEADLESS=false 54 | # DISABLE_SECURITY=false 55 | 56 | # ------------- 57 | # Example Usage 58 | # ------------- 59 | # Load this file with: 60 | # source .env 61 | # or use a library like python-dotenv or uv to manage environment variables. 62 | 63 | # Note: In production or multi-user environments, never commit real API keys 64 | # or share them publicly. Instead use a secrets manager or encrypted storage. 65 | -------------------------------------------------------------------------------- /src/mcp_browser_use/client.py: -------------------------------------------------------------------------------- 1 | """Client helpers for interacting with the in-process FastMCP server.""" 2 | 3 | from __future__ import annotations 4 | 5 | from contextlib import asynccontextmanager 6 | from typing import Any, AsyncIterator, Callable, Optional 7 | 8 | from fastmcp.client import Client 9 | 10 | from .server import app 11 | 12 | 13 | class AgentNotRegisteredError(RuntimeError): 14 | """Error raised when attempting to control an agent that is not running.""" 15 | 16 | 17 | @asynccontextmanager 18 | async def create_client_session( 19 | client: Optional[Client] = None, 20 | *, 21 | client_factory: Optional[Callable[[], Client]] = None, 22 | **client_kwargs: Any, 23 | ) -> AsyncIterator[Client]: 24 | """Create an asynchronous context manager for interacting with the server. 
25 | 26 | Parameters 27 | ---------- 28 | client: 29 | An existing :class:`fastmcp.client.Client` instance. If provided, the 30 | caller is responsible for its configuration. ``client_kwargs`` must not 31 | be supplied in this case. 32 | client_factory: 33 | Optional callable used to lazily construct a client. This is useful in 34 | testing where a lightweight stub client might be injected. If provided, 35 | the callable is invoked with no arguments and ``client_kwargs`` must not 36 | be supplied. 37 | **client_kwargs: 38 | Additional keyword arguments forwarded to :class:`fastmcp.client.Client` 39 | when neither ``client`` nor ``client_factory`` is provided. 40 | 41 | Yields 42 | ------ 43 | Client 44 | A connected FastMCP client ready for use within the context manager. 45 | """ 46 | 47 | if client is not None and client_factory is not None: 48 | raise ValueError("Provide either 'client' or 'client_factory', not both.") 49 | 50 | if client is not None and client_kwargs: 51 | raise ValueError( 52 | "'client_kwargs' cannot be used when an explicit client instance is provided." 
53 | ) 54 | 55 | if client_factory is not None and client_kwargs: 56 | raise ValueError("'client_kwargs' cannot be combined with 'client_factory'.") 57 | 58 | if client is not None: 59 | session_client = client 60 | elif client_factory is not None: 61 | session_client = client_factory() 62 | else: 63 | session_client = Client(app, **client_kwargs) 64 | 65 | async with session_client as connected_client: 66 | yield connected_client 67 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/__init__.py: -------------------------------------------------------------------------------- 1 | class _DummyEvent: 2 | def __await__(self): 3 | async def _noop(): 4 | return None 5 | 6 | return _noop().__await__() 7 | 8 | async def event_result(self, *args, **kwargs): # pragma: no cover - stub method 9 | return None 10 | 11 | 12 | class _DummyEventBus: 13 | def dispatch(self, event): # noqa: D401 - simple stub 14 | return _DummyEvent() 15 | 16 | 17 | class BrowserPage: 18 | def __init__(self, **kwargs): 19 | for key, value in kwargs.items(): 20 | setattr(self, key, value) 21 | self.event_bus = _DummyEventBus() 22 | 23 | async def close(self) -> None: # pragma: no cover - stub method 24 | return None 25 | 26 | 27 | class Browser: 28 | """Lightweight stub mirroring the public Browser API used in tests.""" 29 | 30 | def __init__(self, **kwargs): 31 | for key, value in kwargs.items(): 32 | setattr(self, key, value) 33 | self._pages: list[BrowserPage] = [] 34 | self._started = False 35 | 36 | async def start(self): # pragma: no cover - stub method 37 | self._started = True 38 | return self 39 | 40 | async def stop(self): # pragma: no cover - stub method 41 | self._started = False 42 | return None 43 | 44 | async def new_page(self, **kwargs): 45 | page = BrowserPage(**kwargs) 46 | self._pages.append(page) 47 | return page 48 | 49 | async def close(self): # pragma: no cover - compatibility alias 50 | return await self.stop() 51 | 52 | 
53 | class BrowserSession(Browser): # pragma: no cover - stub class 54 | async def kill(self): # pragma: no cover - stub method 55 | return await self.stop() 56 | 57 | 58 | class BrowserProfile: # pragma: no cover - stub class 59 | def __init__(self, **kwargs): 60 | for key, value in kwargs.items(): 61 | setattr(self, key, value) 62 | self.event_bus = _DummyEventBus() 63 | 64 | async def kill(self) -> None: # pragma: no cover - stub method 65 | return None 66 | 67 | 68 | class BrowserProfile: # pragma: no cover - stub class 69 | def __init__(self, **kwargs): 70 | for key, value in kwargs.items(): 71 | setattr(self, key, value) 72 | 73 | 74 | class ProxySettings: # pragma: no cover - stub class 75 | def __init__(self, **kwargs): 76 | for key, value in kwargs.items(): 77 | setattr(self, key, value) 78 | 79 | 80 | # Alias maintained for compatibility with production package 81 | Browser = BrowserSession 82 | -------------------------------------------------------------------------------- /src/mcp_browser_use/controller/custom_controller.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import sys 5 | 6 | import pyperclip 7 | from browser_use import BrowserSession 8 | from browser_use.agent.views import ActionResult 9 | from browser_use.controller.service import Controller 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class CustomController(Controller): 15 | """ 16 | A custom controller registering two clipboard actions: copy and paste. 17 | """ 18 | 19 | def __init__(self): 20 | super().__init__() 21 | self._register_custom_actions() 22 | 23 | def _register_custom_actions(self) -> None: 24 | """Register all custom browser actions for this controller.""" 25 | 26 | @self.registry.action("Copy text to clipboard") 27 | def copy_to_clipboard(text: str) -> ActionResult: 28 | """ 29 | Copy the given text to the system's clipboard. 
30 | Returns an ActionResult with the same text as extracted_content. 31 | """ 32 | try: 33 | pyperclip.copy(text) 34 | # Be cautious about logging the actual text, if sensitive 35 | logger.debug("Copied text to clipboard.") 36 | return ActionResult(extracted_content=text) 37 | except Exception as e: 38 | logger.error(f"Error copying text to clipboard: {e}") 39 | return ActionResult(error=str(e), extracted_content=None) 40 | 41 | @self.registry.action("Paste text from clipboard", requires_browser=True) 42 | async def paste_from_clipboard(browser_session: BrowserSession) -> ActionResult: 43 | """ 44 | Paste whatever is currently in the system's clipboard 45 | into the active browser page by using the send_keys tool. 46 | """ 47 | try: 48 | text = pyperclip.paste() 49 | except Exception as e: 50 | logger.error(f"Error reading text from clipboard: {e}") 51 | return ActionResult(error=str(e), extracted_content=None) 52 | 53 | try: 54 | modifier = "meta" if sys.platform == "darwin" else "ctrl" 55 | # Use the documented tool via the registry 56 | await self.registry.execute_action( 57 | "send_keys", 58 | {"keys": f"{modifier}+v"}, 59 | browser_session=browser_session, 60 | ) 61 | logger.debug("Triggered paste shortcut inside the browser session.") 62 | return ActionResult(extracted_content=text) 63 | except Exception as e: 64 | logger.error(f"Error pasting text into the browser session: {e}") 65 | return ActionResult(error=str(e), extracted_content=None) 66 | -------------------------------------------------------------------------------- /src/mcp_browser_use/agent/custom_views.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from dataclasses import dataclass 4 | from typing import List, Type 5 | 6 | from browser_use.agent.views import AgentOutput 7 | from browser_use.controller.registry.views import ActionModel 8 | from pydantic import BaseModel, ConfigDict, Field, create_model 9 | 10 | 11 | 
@dataclass 12 | class CustomAgentStepInfo: 13 | """ 14 | Holds metadata about a single step of the agent's execution. 15 | 16 | :param step_number: Which step number we're currently on. 17 | :param max_steps: Total maximum steps before we stop. 18 | :param task: The primary task assigned to the agent. 19 | :param add_infos: Additional contextual info or instructions. 20 | :param memory: Cumulative memory or context from previous steps. 21 | :param task_progress: Text describing progress toward the task goal. 22 | """ 23 | 24 | step_number: int 25 | max_steps: int 26 | task: str 27 | add_infos: str 28 | memory: str 29 | task_progress: str 30 | 31 | 32 | class CustomAgentBrain(BaseModel): 33 | """ 34 | Represents the agent's 'thinking' or ephemeral state during processing. 35 | 36 | :param prev_action_evaluation: String evaluation of the last action performed (success/failure). 37 | :param important_contents: Key points or memory extracted from the environment. 38 | :param completed_contents: Completed portion of the task so far. 39 | :param thought: Agent's internal reasoning or thought process text. 40 | :param summary: Short summary of the agent's current state or progress. 41 | """ 42 | 43 | prev_action_evaluation: str 44 | important_contents: str 45 | completed_contents: str 46 | thought: str 47 | summary: str 48 | 49 | 50 | class CustomAgentOutput(AgentOutput): 51 | """ 52 | Output model for the agent. Extended at runtime with custom actions 53 | by 'type_with_custom_actions'. 54 | """ 55 | 56 | model_config = ConfigDict(arbitrary_types_allowed=True) 57 | 58 | current_state: CustomAgentBrain 59 | action: List[ActionModel] 60 | 61 | @staticmethod 62 | def type_with_custom_actions( 63 | custom_actions: Type[ActionModel], 64 | ) -> Type["CustomAgentOutput"]: 65 | """ 66 | Create a new Pydantic model that inherits from CustomAgentOutput 67 | but redefines the 'action' field to be a list of the given 68 | custom action model. 
69 | 70 | :param custom_actions: The action model type from the controller registry. 71 | :return: A new Pydantic model class based on CustomAgentOutput. 72 | """ 73 | return create_model( 74 | # Could rename to something more specific if needed 75 | "AgentOutput", 76 | __base__=CustomAgentOutput, 77 | action=(List[custom_actions], Field(...)), 78 | __module__=CustomAgentOutput.__module__, 79 | ) 80 | -------------------------------------------------------------------------------- /smithery.yaml: -------------------------------------------------------------------------------- 1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml 2 | 3 | startCommand: 4 | type: stdio 5 | configSchema: 6 | # JSON Schema defining the configuration options for the MCP. 7 | type: object 8 | required: 9 | - openaiApiKey 10 | - anthropicApiKey 11 | - mcpModelProvider 12 | - mcpModelName 13 | properties: 14 | openaiApiKey: 15 | type: string 16 | description: API key for OpenAI services. 17 | anthropicApiKey: 18 | type: string 19 | description: API key for Anthropic services. 20 | googleApiKey: 21 | type: string 22 | description: API key for Google services (optional). 23 | azureOpenaiEndpoint: 24 | type: string 25 | description: Azure OpenAI endpoint (optional). 26 | azureOpenaiApiKey: 27 | type: string 28 | description: Azure OpenAI API key (optional). 29 | chromePath: 30 | type: string 31 | description: Path to Chrome executable (optional). 32 | chromeUserData: 33 | type: string 34 | description: Path to Chrome user data directory (optional). 35 | chromeDebuggingPort: 36 | type: string 37 | default: "9222" 38 | description: Chrome debugging port. Default is 9222. 39 | chromeDebuggingHost: 40 | type: string 41 | default: localhost 42 | description: Chrome debugging host. Default is localhost. 43 | chromePersistentSession: 44 | type: boolean 45 | default: false 46 | description: Keep browser open between tasks. 
47 | mcpModelProvider: 48 | type: string 49 | description: Model provider (e.g., anthropic, openai). 50 | mcpModelName: 51 | type: string 52 | description: Model name. 53 | mcpTemperature: 54 | type: number 55 | default: 0.3 56 | description: Model temperature. 57 | mcpMaxSteps: 58 | type: number 59 | default: 30 60 | description: Max steps for model. 61 | mcpUseVision: 62 | type: boolean 63 | default: true 64 | description: Use vision capabilities. 65 | mcpMaxActionsPerStep: 66 | type: number 67 | default: 5 68 | description: Max actions per step. 69 | commandFunction: 70 | # A function that produces the CLI command to start the MCP on stdio. 71 | |- 72 | (config) => ({ command: 'uv', args: ['run', 'mcp-browser-use'], env: { OPENAI_API_KEY: config.openaiApiKey, ANTHROPIC_API_KEY: config.anthropicApiKey, GOOGLE_API_KEY: config.googleApiKey, AZURE_OPENAI_ENDPOINT: config.azureOpenaiEndpoint, AZURE_OPENAI_API_KEY: config.azureOpenaiApiKey, CHROME_PATH: config.chromePath, CHROME_USER_DATA: config.chromeUserData, CHROME_DEBUGGING_PORT: config.chromeDebuggingPort || '9222', CHROME_DEBUGGING_HOST: config.chromeDebuggingHost || 'localhost', CHROME_PERSISTENT_SESSION: config.chromePersistentSession, MCP_MODEL_PROVIDER: config.mcpModelProvider, MCP_MODEL_NAME: config.mcpModelName, MCP_TEMPERATURE: config.mcpTemperature || 0.3, MCP_MAX_STEPS: config.mcpMaxSteps || 30, MCP_USE_VISION: config.mcpUseVision, MCP_MAX_ACTIONS_PER_STEP: config.mcpMaxActionsPerStep || 5 } }) 73 | -------------------------------------------------------------------------------- /tests/test_summarize_messages.py: -------------------------------------------------------------------------------- 1 | from langchain_core.messages import AIMessage, HumanMessage, SystemMessage 2 | 3 | import mcp_browser_use.agent.custom_agent as custom_agent_module 4 | from mcp_browser_use.agent.custom_agent import CustomAgent 5 | from browser_use.agent.message_manager.views import MessageHistory, ManagedMessage 6 | 7 | 8 | 
class FakeLLM: 9 | def __init__(self, content: str = "Conversation summary"): 10 | self.calls = [] 11 | self._content = content 12 | 13 | def invoke(self, input, **kwargs): 14 | self.calls.append(input) 15 | message = AIMessage(content=self._content) 16 | return message 17 | 18 | def __call__(self, input, **kwargs): 19 | return self.invoke(input, **kwargs) 20 | 21 | 22 | class DummyMessageManager: 23 | def __init__(self, extra_messages: int = 6): 24 | self.system_prompt = SystemMessage(content="System instructions") 25 | self.example_tool_call = AIMessage(content="[]") 26 | self.example_tool_call.tool_calls = [] 27 | self.reset_calls = 0 28 | self.history = MessageHistory() 29 | self.reset_history() 30 | for idx in range(extra_messages): 31 | human = HumanMessage(content=f"User message {idx}") 32 | self._add_message_with_tokens(human) 33 | 34 | def get_messages(self): 35 | return [managed.message for managed in self.history.messages] 36 | 37 | def reset_history(self) -> None: 38 | self.reset_calls += 1 39 | self.history = MessageHistory() 40 | self.history.messages = [] 41 | if hasattr(self.history, "total_tokens"): 42 | self.history.total_tokens = 0 43 | self._add_message_with_tokens(self.system_prompt) 44 | self._add_message_with_tokens(self.example_tool_call) 45 | 46 | def _add_message_with_tokens(self, message): 47 | self.history.messages.append(ManagedMessage(message=message)) 48 | if hasattr(self.history, "total_tokens"): 49 | self.history.total_tokens += 1 50 | 51 | 52 | def test_summarize_messages_preserves_system_prompt(monkeypatch): 53 | class StubChain: 54 | def __init__(self, llm): 55 | self.llm = llm 56 | 57 | def invoke(self, data): 58 | return self.llm.invoke(data) 59 | 60 | class StubPrompt: 61 | def __or__(self, llm): 62 | return StubChain(llm) 63 | 64 | class StubChatPromptTemplate: 65 | @staticmethod 66 | def from_messages(messages): 67 | return StubPrompt() 68 | 69 | monkeypatch.setattr( 70 | custom_agent_module, 71 | "ChatPromptTemplate", 72 | 
StubChatPromptTemplate, 73 | ) 74 | 75 | agent = CustomAgent.__new__(CustomAgent) 76 | agent.llm = FakeLLM() 77 | agent.message_manager = DummyMessageManager() 78 | 79 | assert len(agent.message_manager.get_messages()) > 5 80 | # Ensure the initial reset was performed 81 | assert agent.message_manager.reset_calls == 1 82 | 83 | result = agent.summarize_messages() 84 | 85 | assert result is True 86 | assert agent.message_manager.reset_calls == 2 87 | 88 | history_messages = agent.message_manager.history.messages 89 | assert len(history_messages) == 3 90 | assert [entry.message for entry in history_messages[:2]] == [ 91 | agent.message_manager.system_prompt, 92 | agent.message_manager.example_tool_call, 93 | ] 94 | assert history_messages[2].message.content == "Conversation summary" 95 | if hasattr(agent.message_manager.history, "total_tokens"): 96 | assert agent.message_manager.history.total_tokens == len(history_messages) 97 | 98 | # Ensure the LLM was called with the conversation 99 | assert len(agent.llm.calls) == 1 100 | prompt_value = agent.llm.calls[0] 101 | assert isinstance(prompt_value, dict) 102 | assert "chat_history" in prompt_value 103 | -------------------------------------------------------------------------------- /tests/test_client_session.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | 5 | from mcp_browser_use import client as client_module 6 | from mcp_browser_use.client import AgentNotRegisteredError, create_client_session 7 | 8 | 9 | @pytest.fixture 10 | def anyio_backend(): 11 | return "asyncio" 12 | 13 | 14 | @pytest.mark.anyio("asyncio") 15 | async def test_create_client_session_uses_supplied_client(): 16 | events = [] 17 | 18 | class DummyClient: 19 | def __init__(self): 20 | self.connected = False 21 | 22 | async def __aenter__(self): 23 | events.append("enter") 24 | self.connected = True 25 | return self 26 | 27 | async def __aexit__(self, exc_type, exc, 
tb): 28 | events.append("exit") 29 | self.connected = False 30 | 31 | dummy = DummyClient() 32 | async with create_client_session(client=dummy) as session: 33 | assert session is dummy 34 | assert dummy.connected 35 | 36 | assert events == ["enter", "exit"] 37 | assert dummy.connected is False 38 | 39 | 40 | @pytest.mark.anyio("asyncio") 41 | async def test_create_client_session_accepts_factory(): 42 | events = [] 43 | 44 | class DummyClient: 45 | async def __aenter__(self): 46 | events.append("enter") 47 | return self 48 | 49 | async def __aexit__(self, exc_type, exc, tb): 50 | events.append("exit") 51 | 52 | async with create_client_session(client_factory=DummyClient) as session: 53 | assert isinstance(session, DummyClient) 54 | 55 | assert events == ["enter", "exit"] 56 | 57 | 58 | @pytest.mark.anyio("asyncio") 59 | async def test_create_client_session_rejects_mixed_arguments(): 60 | class DummyClient: 61 | async def __aenter__(self): 62 | return self 63 | 64 | async def __aexit__(self, exc_type, exc, tb): 65 | pass 66 | 67 | dummy = DummyClient() 68 | 69 | with pytest.raises(ValueError): 70 | async with create_client_session(client=dummy, timeout=5): 71 | pass 72 | 73 | with pytest.raises(ValueError): 74 | async with create_client_session(client_factory=DummyClient, timeout=5): 75 | pass 76 | 77 | with pytest.raises(ValueError): 78 | async with create_client_session(client=dummy, client_factory=DummyClient): 79 | pass 80 | 81 | 82 | @pytest.mark.anyio("asyncio") 83 | async def test_create_client_session_constructs_default_client(monkeypatch): 84 | created = {} 85 | 86 | class DummyClient: 87 | def __init__(self, app, **kwargs): 88 | created["app"] = app 89 | created["kwargs"] = kwargs 90 | 91 | async def __aenter__(self): 92 | created["entered"] = True 93 | return self 94 | 95 | async def __aexit__(self, exc_type, exc, tb): 96 | created["exited"] = True 97 | 98 | monkeypatch.setattr("mcp_browser_use.client.Client", DummyClient) 99 | 100 | async with 
create_client_session(timeout=5) as session: 101 | assert isinstance(session, DummyClient) 102 | 103 | assert created["app"] is client_module.app 104 | assert created["kwargs"] == {"timeout": 5} 105 | assert created["entered"] is True 106 | assert created["exited"] is True 107 | 108 | 109 | @pytest.mark.anyio("asyncio") 110 | async def test_create_client_session_kwargs_with_factory_raise(): 111 | class DummyClient: 112 | async def __aenter__(self): 113 | return self 114 | 115 | async def __aexit__(self, exc_type, exc, tb): 116 | pass 117 | 118 | kwargs = {"client_factory": DummyClient, "timeout": 10} 119 | 120 | with pytest.raises(ValueError): 121 | async with create_client_session(**kwargs): 122 | pass 123 | 124 | 125 | @pytest.mark.parametrize( 126 | "legacy_module", 127 | [ 128 | "mcp_browser", 129 | "mcp_browser.use", 130 | "mcp_browser.use.mcp_browser_use", 131 | ], 132 | ) 133 | def test_legacy_namespace_is_removed(legacy_module): 134 | with pytest.raises(ModuleNotFoundError): 135 | importlib.import_module(legacy_module) 136 | 137 | 138 | def test_exception_type(): 139 | assert issubclass(AgentNotRegisteredError, RuntimeError) 140 | -------------------------------------------------------------------------------- /src/mcp_browser_use/agent/custom_massage_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import annotations 4 | 5 | import copy 6 | import logging 7 | from typing import List, Optional, Type 8 | 9 | from browser_use.agent.message_manager.service import MessageManager 10 | from browser_use.agent.message_manager.views import MessageHistory 11 | from browser_use.agent.prompts import SystemPrompt 12 | from browser_use.agent.views import ActionResult, AgentStepInfo 13 | from browser_use.browser.views import BrowserState 14 | from langchain_core.language_models import BaseChatModel 15 | from langchain_core.messages import HumanMessage, AIMessage 16 | 17 | from 
mcp_browser_use.agent.custom_prompts import CustomAgentMessagePrompt 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class CustomMassageManager(MessageManager): 23 | def __init__( 24 | self, 25 | llm: BaseChatModel, 26 | task: str, 27 | action_descriptions: str, 28 | system_prompt_class: Type[SystemPrompt], 29 | max_input_tokens: int = 128000, 30 | estimated_tokens_per_character: int = 3, 31 | image_tokens: int = 800, 32 | include_attributes: list[str] = [], 33 | max_error_length: int = 400, 34 | max_actions_per_step: int = 10, 35 | tool_call_in_content: bool = False, 36 | ): 37 | super().__init__( 38 | llm=llm, 39 | task=task, 40 | action_descriptions=action_descriptions, 41 | system_prompt_class=system_prompt_class, 42 | max_input_tokens=max_input_tokens, 43 | estimated_tokens_per_character=estimated_tokens_per_character, 44 | image_tokens=image_tokens, 45 | include_attributes=include_attributes, 46 | max_error_length=max_error_length, 47 | max_actions_per_step=max_actions_per_step, 48 | tool_call_in_content=tool_call_in_content, 49 | ) 50 | 51 | # Store template for example tool call so we can rebuild the history when needed 52 | self.tool_call_in_content = tool_call_in_content 53 | self._example_tool_call_template = [ 54 | { 55 | "name": "CustomAgentOutput", 56 | "args": { 57 | "current_state": { 58 | "prev_action_evaluation": "Unknown - No previous actions to evaluate.", 59 | "important_contents": "", 60 | "completed_contents": "", 61 | "thought": "Now Google is open. 
Need to type OpenAI to search.", 62 | "summary": "Type OpenAI to search.", 63 | }, 64 | "action": [], 65 | }, 66 | "id": "", 67 | "type": "tool_call", 68 | } 69 | ] 70 | self.reset_history() 71 | 72 | def _create_example_tool_call_message(self) -> AIMessage: 73 | tool_calls = copy.deepcopy(self._example_tool_call_template) 74 | if self.tool_call_in_content: 75 | # openai throws error if tool_calls are not responded -> move to content 76 | return AIMessage( 77 | content=f"{tool_calls}", 78 | tool_calls=[], 79 | ) 80 | return AIMessage( 81 | content="", 82 | tool_calls=tool_calls, 83 | ) 84 | 85 | def reset_history(self) -> None: 86 | """Reset the message history to the initial seeded state.""" 87 | 88 | self.history = MessageHistory() 89 | if hasattr(self.history, "total_tokens"): 90 | self.history.total_tokens = 0 91 | 92 | self._add_message_with_tokens(self.system_prompt) 93 | self._add_message_with_tokens(self._create_example_tool_call_message()) 94 | 95 | def add_state_message( 96 | self, 97 | state: BrowserState, 98 | result: Optional[List[ActionResult]] = None, 99 | step_info: Optional[AgentStepInfo] = None, 100 | ) -> None: 101 | """Add browser state as human message""" 102 | 103 | # if keep in memory, add to directly to history and add state without result 104 | if result: 105 | for r in result: 106 | if r.include_in_memory: 107 | if r.extracted_content: 108 | msg = HumanMessage(content=str(r.extracted_content)) 109 | self._add_message_with_tokens(msg) 110 | if r.error: 111 | msg = HumanMessage( 112 | content=str(r.error)[-self.max_error_length :] 113 | ) 114 | self._add_message_with_tokens(msg) 115 | result = None # if result in history, we dont want to add it again 116 | 117 | # otherwise add state message and result to next message (which will not stay in memory) 118 | state_message = CustomAgentMessagePrompt( 119 | state, 120 | result, 121 | include_attributes=self.include_attributes, 122 | max_error_length=self.max_error_length, 123 | 
step_info=step_info, 124 | ).get_user_message() 125 | self._add_message_with_tokens(state_message) 126 | -------------------------------------------------------------------------------- /src/mcp_browser_use/server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from mcp_browser_use.utils.logging import configure_logging 4 | 5 | # It is critical to configure logging before any other modules are imported, 6 | # as they might initialize logging themselves. 7 | configure_logging() 8 | 9 | import asyncio 10 | import logging 11 | import os 12 | import sys 13 | import traceback 14 | from typing import Any, Optional 15 | 16 | from browser_use import Browser 17 | from fastmcp import FastMCP 18 | from mcp_browser_use.agent.custom_agent import CustomAgent 19 | from mcp_browser_use.controller.custom_controller import CustomController 20 | from mcp_browser_use.browser.browser_manager import create_browser_session 21 | from mcp_browser_use.utils import utils 22 | from mcp_browser_use.utils.agent_state import AgentState 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | app = FastMCP("mcp_browser_use") 27 | 28 | 29 | @app.tool() 30 | async def run_browser_agent(task: str, add_infos: str = "") -> str: 31 | """ 32 | This is the entrypoint for running a browser-based agent. 33 | 34 | :param task: The main instruction or goal for the agent. 35 | :param add_infos: Additional information or context for the agent. 36 | :return: The final result string from the agent run. 37 | """ 38 | 39 | browser_session: Optional[Browser] = None 40 | agent_state = AgentState() 41 | 42 | try: 43 | # Clear any previous agent stop signals 44 | agent_state.clear_stop() 45 | 46 | # Read environment variables with defaults and parse carefully 47 | # Fallback to defaults if parsing fails. 
48 | model_provider = os.getenv("MCP_MODEL_PROVIDER", "anthropic") 49 | model_name = os.getenv("MCP_MODEL_NAME", "claude-3-5-sonnet-20241022") 50 | 51 | def safe_float(env_var: str, default: float) -> float: 52 | """Safely parse a float from an environment variable.""" 53 | try: 54 | return float(os.getenv(env_var, str(default))) 55 | except ValueError: 56 | logger.warning(f"Invalid float for {env_var}, using default={default}") 57 | return default 58 | 59 | def safe_int(env_var: str, default: int) -> int: 60 | """Safely parse an int from an environment variable.""" 61 | try: 62 | return int(os.getenv(env_var, str(default))) 63 | except ValueError: 64 | logger.warning(f"Invalid int for {env_var}, using default={default}") 65 | return default 66 | 67 | # Get environment variables with defaults 68 | temperature = safe_float("MCP_TEMPERATURE", 0.3) 69 | max_steps = safe_int("MCP_MAX_STEPS", 30) 70 | use_vision = os.getenv("MCP_USE_VISION", "true").lower() == "true" 71 | max_actions_per_step = safe_int("MCP_MAX_ACTIONS_PER_STEP", 5) 72 | tool_call_in_content = ( 73 | os.getenv("MCP_TOOL_CALL_IN_CONTENT", "true").lower() == "true" 74 | ) 75 | 76 | # Prepare LLM 77 | llm = utils.get_llm_model( 78 | provider=model_provider, model_name=model_name, temperature=temperature 79 | ) 80 | 81 | # Create a fresh browser session for this run 82 | browser_session = create_browser_session() 83 | await browser_session.start() 84 | 85 | # Create controller and agent 86 | controller = CustomController() 87 | agent = CustomAgent( 88 | task=task, 89 | add_infos=add_infos, 90 | use_vision=use_vision, 91 | llm=llm, 92 | browser_session=browser_session, 93 | controller=controller, 94 | max_actions_per_step=max_actions_per_step, 95 | tool_call_in_content=tool_call_in_content, 96 | agent_state=agent_state, 97 | ) 98 | 99 | # Execute the agent task lifecycle 100 | history = await agent.execute_agent_task(max_steps=max_steps) 101 | 102 | # Extract final result from the agent's history 103 | 
final_result = history.final_result() 104 | if not final_result: 105 | final_result = f"No final result. Possibly incomplete. {history}" 106 | 107 | return final_result 108 | 109 | except Exception as e: 110 | logger.error("run-browser-agent error: %s", str(e)) 111 | raise ValueError(f"run-browser-agent error: {e}\n{traceback.format_exc()}") 112 | 113 | finally: 114 | # Always ensure cleanup, even if no error. 115 | try: 116 | agent_state.request_stop() 117 | except Exception as stop_error: 118 | logger.warning("Error stopping agent state: %s", stop_error) 119 | 120 | if browser_session: 121 | try: 122 | await browser_session.stop() 123 | except Exception as browser_error: 124 | logger.warning( 125 | "Failed to stop browser session gracefully, killing it: %s", 126 | browser_error, 127 | ) 128 | if hasattr(browser_session, "kill"): 129 | await browser_session.kill() 130 | 131 | 132 | def launch_mcp_browser_use_server() -> None: 133 | """ 134 | Entry point for running the FastMCP application. 135 | Handles server start and final resource cleanup. 136 | """ 137 | try: 138 | app.run() 139 | except Exception as e: 140 | logger.error("Error running MCP server: %s\n%s", e, traceback.format_exc()) 141 | 142 | 143 | if __name__ == "__main__": 144 | launch_mcp_browser_use_server() 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MCP Browser Use Server 2 | 3 | [![smithery badge](https://smithery.ai/badge/@JovaniPink/mcp-browser-use)](https://smithery.ai/server/@JovaniPink/mcp-browser-use) 4 | 5 | > Model Context Protocol (MCP) server that wires [browser-use](https://github.com/browser-use/browser-use) into Claude Desktop and other MCP compatible clients. 6 | 7 | Browser Use Server MCP server 8 | 9 | ## Overview 10 | 11 | This repository provides a production-ready wrapper around the `browser-use` automation engine. 
It exposes a single MCP tool (`run_browser_agent`) that orchestrates a browser session, executes the `browser-use` agent, and returns the final result back to the client. The refactored layout focuses on keeping configuration in one place, improving testability, and keeping `browser-use` upgrades isolated from MCP specific code. 12 | 13 | ### Key Capabilities 14 | 15 | - **Automated browsing** – Navigate, interact with forms, control tabs, capture screenshots, and read page content through natural-language instructions executed by `browser-use`. 16 | - **Agent lifecycle management** – `CustomAgent` wraps `browser-use`'s base agent to add history export, richer prompts, and consistent error handling across runs. 17 | - **Centralised browser configuration** – `create_browser_session` translates environment variables into a ready-to-use `BrowserSession`, enabling persistent profiles, proxies, and custom Chromium flags without touching the agent logic. 18 | - **FastMCP integration** – `server.py` registers the MCP tool, normalises configuration, and ensures the browser session is always cleaned up. 19 | - **Client helpers** – `client.py` includes async helpers for tests or other Python processes that wish to exercise the MCP server in-process. 20 | 21 | ### Project Structure 22 | 23 | ``` 24 | . 
25 | ├── documentation/ 26 | │ ├── CONFIGURATION.md # Detailed configuration reference 27 | │ └── SECURITY.md # Security considerations for running the server 28 | ├── .env.example # Example environment variables for local development 29 | ├── src/mcp_browser_use/ 30 | │ ├── agent/ # Custom agent, prompts, message history, and views 31 | │ ├── browser/ # Browser session factory and persistence helpers 32 | │ ├── controller/ # Custom controller extensions for clipboard actions 33 | │ ├── utils/ # LLM factory, agent state helpers, encoding utilities 34 | │ ├── client.py # Async helper for connecting to the FastMCP app 35 | │ └── server.py # FastMCP app and the `run_browser_agent` tool 36 | └── tests/ # Unit tests covering server helpers and agent features 37 | ``` 38 | 39 | ## Getting Started 40 | 41 | ### Requirements 42 | 43 | - Python 3.11+ 44 | - Google Chrome or Chromium (for local automation) 45 | - [`uv`](https://github.com/astral-sh/uv) for dependency management (recommended) 46 | - Optional: Claude Desktop or another MCP-compatible client for integration testing 47 | 48 | ### Installation 49 | 50 | ```bash 51 | git clone https://github.com/JovaniPink/mcp-browser-use.git 52 | cd mcp-browser-use 53 | uv sync 54 | ``` 55 | 56 | Copy `sample.env` to `.env` (or export the variables in another way) and update the values for the providers you plan to use. 57 | 58 | ### Launching the server 59 | 60 | ```bash 61 | uv run mcp-browser-use 62 | ``` 63 | 64 | The command invokes the console script defined in `pyproject.toml`, starts the FastMCP application, and registers the `run_browser_agent` tool. 
65 | 66 | #### Using with Claude Desktop 67 | 68 | Once the server is running you can register it inside Claude Desktop, for example: 69 | 70 | ```json 71 | "mcpServers": { 72 | "mcp_server_browser_use": { 73 | "command": "uvx", 74 | "args": ["mcp-browser-use"], 75 | "env": { 76 | "MCP_MODEL_PROVIDER": "anthropic", 77 | "MCP_MODEL_NAME": "claude-3-5-sonnet-20241022" 78 | } 79 | } 80 | } 81 | ``` 82 | 83 | ### Debugging 84 | 85 | For interactive debugging, use the [MCP Inspector](https://github.com/modelcontextprotocol/inspector): 86 | 87 | ```bash 88 | npx @modelcontextprotocol/inspector uv --directory /path/to/project run mcp-browser-use 89 | ``` 90 | 91 | The inspector prints a URL that can be opened in the browser to watch tool calls and responses in real time. 92 | 93 | ## Configuration 94 | 95 | A full list of environment variables and their defaults is available in [documentation/CONFIGURATION.md](documentation/CONFIGURATION.md). Highlights include: 96 | 97 | - `MCP_MODEL_PROVIDER`, `MCP_MODEL_NAME`, `MCP_TEMPERATURE`, `MCP_MAX_STEPS`, `MCP_MAX_ACTIONS_PER_STEP`, and `MCP_USE_VISION` control the LLM and agent run. 98 | - Provider-specific API keys and endpoints (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `DEEPSEEK_API_KEY`, `GOOGLE_API_KEY`, `AZURE_OPENAI_API_KEY`, etc.). 99 | - Browser runtime flags (`BROWSER_USE_HEADLESS`, `BROWSER_USE_EXTRA_CHROMIUM_ARGS`, `CHROME_PERSISTENT_SESSION`, `BROWSER_USE_PROXY_URL`, ...). 100 | 101 | Use `.env` + [`python-dotenv`](https://pypi.org/project/python-dotenv/) or your preferred secrets manager to keep credentials out of source control. 102 | 103 | ## Running Tests 104 | 105 | ```bash 106 | uv run pytest 107 | ``` 108 | 109 | The tests cover the custom agent behaviour, browser session factory, and other utility helpers. 110 | 111 | ## Security 112 | 113 | Controlling a full browser instance remotely can grant broad access to the host machine. 
Review [documentation/SECURITY.md](documentation/SECURITY.md) before exposing the server to untrusted environments. 114 | 115 | ## Contributing 116 | 117 | 1. Fork the repository 118 | 2. Create your feature branch: `git checkout -b my-new-feature` 119 | 3. Commit your changes: `git commit -m 'Add some feature'` 120 | 4. Push to the branch: `git push origin my-new-feature` 121 | 5. Open a pull request 122 | 123 | Bug reports and feature suggestions are welcome—please include logs and reproduction steps when applicable. 124 | -------------------------------------------------------------------------------- /documentation/CONFIGURATION.md: -------------------------------------------------------------------------------- 1 | # Configuration Guide 2 | 3 | This guide describes every configuration option recognised by the MCP Browser Use server. All settings can be supplied as environment variables (e.g. via a `.env` file loaded with [`python-dotenv`](https://pypi.org/project/python-dotenv/)) or injected by your MCP client. 4 | 5 | The sample file at [`sample.env.example`](../sample.env.example) contains a ready-to-copy template with placeholders for secrets. 6 | 7 | ## How configuration is loaded 8 | 9 | 1. **Model & Agent settings** are read in [`server.py`](../src/mcp_browser_use/server.py). They control the language model as well as the agent run loop. 10 | 2. **Browser runtime settings** are parsed in [`browser/browser_manager.py`](../src/mcp_browser_use/browser/browser_manager.py) which returns a configured `BrowserSession` instance. 11 | 3. **Provider specific credentials** are consumed by the LLM factory in [`utils/utils.py`](../src/mcp_browser_use/utils/utils.py). 12 | 13 | Unless otherwise noted, boolean flags treat any of `1`, `true`, `yes`, `on` (case insensitive) as **true**. Any other value is considered **false**. 
14 | 15 | ## Core Agent Options 16 | 17 | | Variable | Default | Description | 18 | | --- | --- | --- | 19 | | `MCP_MODEL_PROVIDER` | `anthropic` | LLM provider name passed to the LangChain factory. Supported values: `anthropic`, `openai`, `deepseek`, `gemini`, `ollama`, `azure_openai`. | 20 | | `MCP_MODEL_NAME` | `claude-3-5-sonnet-20241022` | Model identifier sent to the provider. Each provider supports its own model list. | 21 | | `MCP_TEMPERATURE` | `0.3` | Sampling temperature for the model. Parsed as float. | 22 | | `MCP_MAX_STEPS` | `30` | Maximum number of reasoning/action steps before aborting the run. Parsed as integer. | 23 | | `MCP_MAX_ACTIONS_PER_STEP` | `5` | Limits how many tool invocations the agent may issue in a single step. Parsed as integer. | 24 | | `MCP_USE_VISION` | `true` | Enables vision features within the agent (element snapshots). | 25 | | `MCP_TOOL_CALL_IN_CONTENT` | `true` | Whether tool call payloads are expected inside the model response content. | 26 | 27 | ## Provider Credentials & Endpoints 28 | 29 | The LLM factory reads the following variables when initialising clients. Only set the values for the provider(s) you actively use. 30 | 31 | | Variable | Purpose | 32 | | --- | --- | 33 | | `ANTHROPIC_API_KEY` | API key for Anthropic Claude models. | 34 | | `OPENAI_API_KEY` | API key for OpenAI models. | 35 | | `DEEPSEEK_API_KEY` | API key for DeepSeek hosted models. | 36 | | `GOOGLE_API_KEY` | API key for Google Gemini via LangChain Google Generative AI. | 37 | | `AZURE_OPENAI_API_KEY` | API key for Azure OpenAI deployments. | 38 | | `AZURE_OPENAI_ENDPOINT` | Endpoint URL for the Azure OpenAI deployment. | 39 | | `OPENAI_ENDPOINT` | Override the OpenAI base URL (useful for proxies). | 40 | | `DEEPSEEK_ENDPOINT` | Base URL for the DeepSeek-compatible endpoint. | 41 | | `ANTHROPIC_API_ENDPOINT` | Alternative base URL for Anthropic (rarely needed). 
| 42 | 43 | When pointing to self-hosted or compatible services you may also override the defaults using `base_url` specific variables in your own code. See [`utils/utils.py`](../src/mcp_browser_use/utils/utils.py) for the full mapping. 44 | 45 | ## Browser Runtime Options 46 | 47 | These options are parsed by [`BrowserEnvironmentConfig.from_env`](../src/mcp_browser_use/browser/browser_manager.py) and control Chromium launch behaviour. 48 | 49 | | Variable | Default | Description | 50 | | --- | --- | --- | 51 | | `CHROME_PATH` | _unset_ | Absolute path to a Chrome/Chromium executable. Leave unset to let `browser-use` manage Chromium via Playwright. | 52 | | `CHROME_USER_DATA` | _unset_ | Directory to store user data (profiles, cookies). Required when `CHROME_PERSISTENT_SESSION` is true. | 53 | | `CHROME_PERSISTENT_SESSION` | `false` | Keeps the browser profile between runs by mounting `CHROME_USER_DATA`. | 54 | | `CHROME_DEBUGGING_PORT` | _unset_ | Remote debugging port for attaching to an existing Chrome instance. Must be an integer. | 55 | | `CHROME_DEBUGGING_HOST` | _unset_ | Hostname/IP for remote debugging (e.g. `localhost`). | 56 | | `BROWSER_USE_HEADLESS` | `false` | Launch Chromium in headless mode. | 57 | | `BROWSER_USE_DISABLE_SECURITY` | `false` | Disables web security features (CORS, sandbox). Use with caution. | 58 | | `BROWSER_USE_EXTRA_CHROMIUM_ARGS` | _unset_ | Comma-separated list of additional Chromium command-line flags. | 59 | | `BROWSER_USE_ALLOWED_DOMAINS` | _unset_ | Comma-separated allowlist limiting which domains the agent may open. | 60 | | `BROWSER_USE_PROXY_URL` | _unset_ | HTTP/HTTPS proxy URL. | 61 | | `BROWSER_USE_NO_PROXY` | _unset_ | Hosts to bypass in proxy mode. | 62 | | `BROWSER_USE_PROXY_USERNAME` | _unset_ | Username for proxy authentication. | 63 | | `BROWSER_USE_PROXY_PASSWORD` | _unset_ | Password for proxy authentication. 
| 64 | | `BROWSER_USE_CDP_URL` | _unset_ | Connect to an existing Chrome DevTools Protocol endpoint instead of launching a new browser. | 65 | 66 | ### Persistence hints 67 | 68 | - When `CHROME_PERSISTENT_SESSION` is true and `CHROME_USER_DATA` is not provided, the server logs a warning and the session falls back to ephemeral storage. 69 | - Remote debugging settings (`CHROME_DEBUGGING_HOST` / `CHROME_DEBUGGING_PORT`) are optional and ignored if invalid values are supplied. The server logs a warning and continues with defaults. 70 | 71 | ## Additional Environment Variables 72 | 73 | Some ancillary features inspect the following variables: 74 | 75 | | Variable | Purpose | 76 | | --- | --- | 77 | | `WIN_FONT_DIR` | Custom Windows font directory used when generating GIF summaries of browsing sessions. | 78 | 79 | ## Tips for managing configuration 80 | 81 | - Store secrets outside of version control. When sharing an `.env` file, redact or rotate keys immediately. 82 | - Keep provider-specific settings grouped so you can switch model providers quickly when testing. 83 | - Start with the defaults, confirm the agent behaves as expected, then tighten security by restricting `BROWSER_USE_ALLOWED_DOMAINS` and enabling headless mode. 84 | - When experimenting locally, keep `CHROME_PERSISTENT_SESSION=false` to avoid stale cookies interfering with automation runs. 85 | 86 | For any options not covered here, consult the upstream [`browser-use` documentation](https://github.com/browser-use/browser-use) which explains additional environment variables recognised by the underlying library. 
87 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import importlib 3 | import importlib.util 4 | import os 5 | import sys 6 | import time 7 | import types 8 | 9 | import pytest 10 | 11 | # Path to utils module 12 | ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 13 | UTILS_PATH = os.path.join(ROOT, "src", "mcp_browser_use", "utils", "utils.py") 14 | 15 | # Provide dummy langchain modules if they are not installed 16 | if "langchain_openai" not in sys.modules: 17 | module = types.ModuleType("langchain_openai") 18 | 19 | class ChatOpenAI: 20 | def __init__(self, *args, **kwargs): 21 | pass 22 | 23 | class AzureChatOpenAI: 24 | def __init__(self, *args, **kwargs): 25 | pass 26 | 27 | module.ChatOpenAI = ChatOpenAI 28 | module.AzureChatOpenAI = AzureChatOpenAI 29 | sys.modules["langchain_openai"] = module 30 | 31 | if "langchain_anthropic" not in sys.modules: 32 | module = types.ModuleType("langchain_anthropic") 33 | 34 | class ChatAnthropic: 35 | def __init__(self, *args, **kwargs): 36 | pass 37 | 38 | module.ChatAnthropic = ChatAnthropic 39 | sys.modules["langchain_anthropic"] = module 40 | 41 | if "langchain_google_genai" not in sys.modules: 42 | module = types.ModuleType("langchain_google_genai") 43 | 44 | class ChatGoogleGenerativeAI: 45 | def __init__(self, *args, **kwargs): 46 | pass 47 | 48 | module.ChatGoogleGenerativeAI = ChatGoogleGenerativeAI 49 | sys.modules["langchain_google_genai"] = module 50 | 51 | if "langchain_ollama" not in sys.modules: 52 | module = types.ModuleType("langchain_ollama") 53 | 54 | class ChatOllama: 55 | def __init__(self, *args, **kwargs): 56 | pass 57 | 58 | module.ChatOllama = ChatOllama 59 | sys.modules["langchain_ollama"] = module 60 | 61 | if "browser_use" not in sys.modules: 62 | browser_use_module = types.ModuleType("browser_use") 63 | 
browser_module = types.ModuleType("browser_use.browser") 64 | events_module = types.ModuleType("browser_use.browser.events") 65 | 66 | class ScreenshotEvent: 67 | def __init__(self, full_page: bool = False): 68 | self.full_page = full_page 69 | 70 | events_module.ScreenshotEvent = ScreenshotEvent 71 | browser_module.events = events_module 72 | browser_use_module.browser = browser_module 73 | 74 | sys.modules["browser_use"] = browser_use_module 75 | sys.modules["browser_use.browser"] = browser_module 76 | sys.modules["browser_use.browser.events"] = events_module 77 | 78 | # Import utils module directly from file after stubbing dependencies 79 | spec = importlib.util.spec_from_file_location("mcp_browser_use.utils.utils", UTILS_PATH) 80 | utils = importlib.util.module_from_spec(spec) 81 | spec.loader.exec_module(utils) 82 | 83 | 84 | @pytest.fixture 85 | def anyio_backend(): 86 | return "asyncio" 87 | 88 | 89 | def test_get_llm_model_returns_chatopenai(): 90 | model = utils.get_llm_model("openai") 91 | assert isinstance(model, utils.ChatOpenAI) 92 | 93 | 94 | def test_get_llm_model_unknown_provider_raises(): 95 | with pytest.raises(ValueError): 96 | utils.get_llm_model("unknown") 97 | 98 | 99 | def test_encode_image_handles_empty_path(): 100 | assert utils.encode_image(None) is None 101 | assert utils.encode_image("") is None 102 | 103 | 104 | def test_encode_image_roundtrip(tmp_path): 105 | image_path = tmp_path / "image.bin" 106 | payload = b"test-bytes" 107 | image_path.write_bytes(payload) 108 | 109 | encoded = utils.encode_image(str(image_path)) 110 | 111 | assert encoded == base64.b64encode(payload).decode("utf-8") 112 | 113 | 114 | def test_encode_image_missing_file(tmp_path): 115 | with pytest.raises(FileNotFoundError): 116 | utils.encode_image(str(tmp_path / "missing.bin")) 117 | 118 | 119 | def test_get_latest_files_creates_directory(tmp_path): 120 | target = tmp_path / "captures" 121 | 122 | result = utils.get_latest_files(str(target), file_types=[".webm", 
".zip"]) 123 | 124 | assert target.exists() 125 | assert result == {".webm": None, ".zip": None} 126 | 127 | 128 | def test_get_latest_files_skips_recent_files(tmp_path, monkeypatch): 129 | directory = tmp_path / "captures" 130 | directory.mkdir() 131 | 132 | recent_path = directory / "recent.webm" 133 | recent_path.write_text("recent") 134 | 135 | now = time.time() 136 | os.utime(recent_path, (now, now)) 137 | 138 | monkeypatch.setattr(utils.time, "time", lambda: now) 139 | 140 | result = utils.get_latest_files(str(directory), file_types=[".webm"]) 141 | 142 | assert result == {".webm": None} 143 | 144 | 145 | @pytest.mark.anyio("asyncio") 146 | async def test_capture_screenshot_uses_event_bus(): 147 | screenshot_payload = base64.b64encode(b"payload").decode("utf-8") 148 | 149 | class DummyEvent: 150 | def __init__(self, result): 151 | self._result = result 152 | self.awaited = False 153 | 154 | def __await__(self): 155 | async def _wait(): 156 | self.awaited = True 157 | return self 158 | 159 | return _wait().__await__() 160 | 161 | async def event_result(self, raise_if_any=True, raise_if_none=True): 162 | return self._result 163 | 164 | class DummyEventBus: 165 | def __init__(self, dispatched_event): 166 | self._event = dispatched_event 167 | self.dispatched = [] 168 | 169 | def dispatch(self, event): 170 | self.dispatched.append(event) 171 | return self._event 172 | 173 | class DummyBrowserSession: 174 | def __init__(self, event_bus): 175 | self.event_bus = event_bus 176 | 177 | dummy_event = DummyEvent(screenshot_payload) 178 | event_bus = DummyEventBus(dummy_event) 179 | session = DummyBrowserSession(event_bus) 180 | 181 | encoded = await utils.capture_screenshot(session) 182 | 183 | assert encoded == screenshot_payload 184 | assert dummy_event.awaited is True 185 | assert len(event_bus.dispatched) == 1 186 | assert isinstance(event_bus.dispatched[0], utils.ScreenshotEvent) 187 | 188 | 189 | @pytest.mark.anyio("asyncio") 190 | async def 
test_capture_screenshot_returns_none_on_error(): 191 | class DummyErrorEvent: 192 | def __await__(self): 193 | async def _wait(): 194 | return self 195 | 196 | return _wait().__await__() 197 | 198 | async def event_result(self, raise_if_any=True, raise_if_none=True): 199 | raise RuntimeError("boom") 200 | 201 | class DummyEventBus: 202 | def dispatch(self, event): 203 | return DummyErrorEvent() 204 | 205 | class DummyBrowserSession: 206 | def __init__(self): 207 | self.event_bus = DummyEventBus() 208 | 209 | session = DummyBrowserSession() 210 | 211 | result = await utils.capture_screenshot(session) 212 | 213 | assert result is None 214 | -------------------------------------------------------------------------------- /src/mcp_browser_use/browser/browser_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utility helpers for configuring and creating :class:`BrowserSession` instances. 3 | 4 | This module consolidates the thin wrappers that previously lived in 5 | ``custom_browser.py``, ``custom_context.py``, and ``config.py``. The new structure 6 | centralises environment parsing so ``server.py`` can simply request a configured 7 | browser session without re-implementing the translation from environment 8 | variables to ``BrowserSession`` keyword arguments. 
9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import logging 14 | import os 15 | from dataclasses import dataclass 16 | from typing import Any, Dict, Optional 17 | 18 | from browser_use import BrowserSession 19 | from browser_use.browser.profile import ProxySettings 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | _BOOL_TRUE = {"1", "true", "yes", "on"} 24 | 25 | 26 | @dataclass(slots=True) 27 | class BrowserPersistenceConfig: 28 | """Configuration for browser persistence and remote debugging settings.""" 29 | 30 | persistent_session: bool = False 31 | user_data_dir: Optional[str] = None 32 | debugging_port: Optional[int] = None 33 | debugging_host: Optional[str] = None 34 | 35 | @classmethod 36 | def from_env(cls) -> "BrowserPersistenceConfig": 37 | persistent_session = ( 38 | os.getenv("CHROME_PERSISTENT_SESSION", "").lower() in _BOOL_TRUE 39 | ) 40 | user_data_dir = os.getenv("CHROME_USER_DATA") or None 41 | 42 | debugging_port: Optional[int] 43 | port_value = os.getenv("CHROME_DEBUGGING_PORT") 44 | if port_value: 45 | try: 46 | debugging_port = int(port_value) 47 | except ValueError: 48 | logger.warning( 49 | "Invalid CHROME_DEBUGGING_PORT=%r, ignoring debug port setting.", 50 | port_value, 51 | ) 52 | debugging_port = None 53 | else: 54 | debugging_port = None 55 | 56 | debugging_host = os.getenv("CHROME_DEBUGGING_HOST") or None 57 | 58 | return cls( 59 | persistent_session=persistent_session, 60 | user_data_dir=user_data_dir, 61 | debugging_port=debugging_port, 62 | debugging_host=debugging_host, 63 | ) 64 | 65 | 66 | @dataclass(slots=True) 67 | class BrowserEnvironmentConfig: 68 | """All runtime settings required for instantiating ``BrowserSession``.""" 69 | 70 | headless: bool = False 71 | disable_security: bool = False 72 | executable_path: Optional[str] = None 73 | args: Optional[list[str]] = None 74 | allowed_domains: Optional[list[str]] = None 75 | proxy: Optional[ProxySettings] = None 76 | cdp_url: Optional[str] = None 77 | 
user_data_dir: Optional[str] = None 78 | 79 | def to_kwargs(self) -> Dict[str, Any]: 80 | """Convert to keyword arguments understood by :class:`BrowserSession`.""" 81 | 82 | kwargs: Dict[str, Any] = { 83 | "headless": self.headless, 84 | "disable_security": self.disable_security, 85 | "executable_path": self.executable_path, 86 | "args": self.args, 87 | "allowed_domains": self.allowed_domains, 88 | "proxy": self.proxy, 89 | "cdp_url": self.cdp_url, 90 | "user_data_dir": self.user_data_dir, 91 | } 92 | # Remove ``None`` values so BrowserSession can rely on its defaults. 93 | return {key: value for key, value in kwargs.items() if value is not None} 94 | 95 | @classmethod 96 | def from_env(cls) -> "BrowserEnvironmentConfig": 97 | persistence = BrowserPersistenceConfig.from_env() 98 | 99 | headless = os.getenv("BROWSER_USE_HEADLESS", "false").lower() in _BOOL_TRUE 100 | disable_security = ( 101 | os.getenv("BROWSER_USE_DISABLE_SECURITY", "false").lower() in _BOOL_TRUE 102 | ) 103 | executable_path = os.getenv("CHROME_PATH") or None 104 | 105 | extra_args_env = os.getenv("BROWSER_USE_EXTRA_CHROMIUM_ARGS") 106 | args = None 107 | if extra_args_env: 108 | args = [arg.strip() for arg in extra_args_env.split(",") if arg.strip()] 109 | 110 | allowed_domains_env = os.getenv("BROWSER_USE_ALLOWED_DOMAINS") 111 | allowed_domains = None 112 | if allowed_domains_env: 113 | allowed_domains = [ 114 | domain.strip() 115 | for domain in allowed_domains_env.split(",") 116 | if domain.strip() 117 | ] 118 | 119 | proxy_url = os.getenv("BROWSER_USE_PROXY_URL") 120 | proxy: Optional[ProxySettings] = None 121 | if proxy_url: 122 | proxy = ProxySettings( 123 | server=proxy_url, 124 | bypass=os.getenv("BROWSER_USE_NO_PROXY"), 125 | username=os.getenv("BROWSER_USE_PROXY_USERNAME"), 126 | password=os.getenv("BROWSER_USE_PROXY_PASSWORD"), 127 | ) 128 | 129 | cdp_url = os.getenv("BROWSER_USE_CDP_URL") or None 130 | if not cdp_url and (persistence.debugging_host or persistence.debugging_port): 131 
| host = persistence.debugging_host or "127.0.0.1" 132 | port = persistence.debugging_port or 9222 133 | cdp_url = f"http://{host}:{port}" 134 | 135 | user_data_dir = None 136 | if persistence.persistent_session: 137 | if persistence.user_data_dir: 138 | user_data_dir = persistence.user_data_dir 139 | else: 140 | logger.warning( 141 | "CHROME_PERSISTENT_SESSION requested but CHROME_USER_DATA was not provided." 142 | ) 143 | 144 | return cls( 145 | headless=headless, 146 | disable_security=disable_security, 147 | executable_path=executable_path, 148 | args=args, 149 | allowed_domains=allowed_domains, 150 | proxy=proxy, 151 | cdp_url=cdp_url, 152 | user_data_dir=user_data_dir, 153 | ) 154 | 155 | 156 | def create_browser_session( 157 | overrides: Optional[Dict[str, Any]] = None, 158 | ) -> BrowserSession: 159 | """Instantiate a :class:`BrowserSession` using environment defaults. 160 | 161 | ``overrides`` can be supplied to fine-tune the resulting session. Any keys 162 | set to ``None`` are ignored so callers can override only a subset of values. 163 | """ 164 | 165 | config = BrowserEnvironmentConfig.from_env() 166 | kwargs = config.to_kwargs() 167 | 168 | if overrides: 169 | for key, value in overrides.items(): 170 | if value is not None: 171 | kwargs[key] = value 172 | elif key in kwargs: 173 | # Explicit ``None`` removes the override letting BrowserSession 174 | # fall back to its internal default. 
175 | kwargs.pop(key) 176 | 177 | logger.debug( 178 | "Creating BrowserSession with kwargs: %s", 179 | {k: v for k, v in kwargs.items() if k != "proxy"}, 180 | ) 181 | return BrowserSession(**kwargs) 182 | -------------------------------------------------------------------------------- /src/mcp_browser_use/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import base64 4 | import logging 5 | import os 6 | import time 7 | from pathlib import Path 8 | from typing import Any, Callable, Dict, List, Optional, Tuple, Type 9 | 10 | from browser_use.browser.events import ScreenshotEvent 11 | from langchain_anthropic import ChatAnthropic 12 | from langchain_google_genai import ChatGoogleGenerativeAI 13 | from langchain_ollama import ChatOllama 14 | from langchain_openai import AzureChatOpenAI, ChatOpenAI 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def _anthropic_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 20 | return { 21 | "model_name": kwargs.get("model_name", "claude-3-5-sonnet-20240620"), 22 | "temperature": kwargs.get("temperature", 0.0), 23 | "base_url": kwargs.get("base_url") or "https://api.anthropic.com", 24 | "api_key": kwargs.get("api_key") or os.getenv("ANTHROPIC_API_KEY", ""), 25 | } 26 | 27 | 28 | def _openai_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 29 | return { 30 | "model": kwargs.get("model_name", "gpt-4o"), 31 | "temperature": kwargs.get("temperature", 0.0), 32 | "base_url": kwargs.get("base_url") 33 | or os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1"), 34 | "api_key": kwargs.get("api_key") or os.getenv("OPENAI_API_KEY", ""), 35 | } 36 | 37 | 38 | def _deepseek_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 39 | return { 40 | "model": kwargs.get("model_name", "deepseek-chat"), 41 | "temperature": kwargs.get("temperature", 0.0), 42 | "base_url": kwargs.get("base_url") or os.getenv("DEEPSEEK_ENDPOINT", ""), 43 | "api_key": 
kwargs.get("api_key") or os.getenv("DEEPSEEK_API_KEY", ""), 44 | } 45 | 46 | 47 | def _gemini_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 48 | return { 49 | "model": kwargs.get("model_name", "gemini-2.0-flash-exp"), 50 | "temperature": kwargs.get("temperature", 0.0), 51 | "google_api_key": kwargs.get("api_key") or os.getenv("GOOGLE_API_KEY", ""), 52 | } 53 | 54 | 55 | def _ollama_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 56 | return { 57 | "model": kwargs.get("model_name", "phi4"), 58 | "temperature": kwargs.get("temperature", 0.0), 59 | "num_ctx": kwargs.get("num_ctx", 128000), 60 | "base_url": kwargs.get("base_url", "http://localhost:11434"), 61 | } 62 | 63 | 64 | def _azure_openai_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 65 | return { 66 | "model": kwargs.get("model_name", "gpt-4o"), 67 | "temperature": kwargs.get("temperature", 0.0), 68 | "api_version": kwargs.get("api_version", "2024-05-01-preview"), 69 | "azure_endpoint": kwargs.get("base_url") 70 | or os.getenv("AZURE_OPENAI_ENDPOINT", ""), 71 | "api_key": kwargs.get("api_key") or os.getenv("AZURE_OPENAI_API_KEY", ""), 72 | } 73 | 74 | 75 | LLM_PROVIDERS: Dict[str, Tuple[Type, Callable[[Dict[str, Any]], Dict[str, Any]]]] = { 76 | "anthropic": (ChatAnthropic, _anthropic_params), 77 | "openai": (ChatOpenAI, _openai_params), 78 | "deepseek": (ChatOpenAI, _deepseek_params), 79 | "gemini": (ChatGoogleGenerativeAI, _gemini_params), 80 | "ollama": (ChatOllama, _ollama_params), 81 | "azure_openai": (AzureChatOpenAI, _azure_openai_params), 82 | } 83 | 84 | 85 | def get_llm_model(provider: str, **kwargs) -> Any: 86 | """ 87 | Return an initialized language model client based on the given provider name. 88 | 89 | :param provider: The name of the LLM provider (e.g., "anthropic", "openai", "azure_openai"). 90 | :param kwargs: Additional parameters (model_name, temperature, base_url, api_key, etc.). 91 | :return: An instance of a ChatLLM from the relevant langchain_* library. 
92 | :raises ValueError: If the provider is unsupported. 93 | """ 94 | 95 | try: 96 | llm_class, params_builder = LLM_PROVIDERS[provider] 97 | except KeyError as error: 98 | raise ValueError(f"Unsupported provider: {provider}") from error 99 | 100 | provider_kwargs = params_builder(kwargs) 101 | return llm_class(**provider_kwargs) 102 | 103 | 104 | # Commonly used model names for quick reference 105 | model_names = { 106 | "anthropic": ["claude-3-5-sonnet-20240620", "claude-3-opus-20240229"], 107 | "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"], 108 | "deepseek": ["deepseek-chat"], 109 | "gemini": [ 110 | "gemini-2.0-flash-exp", 111 | "gemini-2.0-flash-thinking-exp", 112 | "gemini-1.5-flash-latest", 113 | "gemini-1.5-flash-8b-latest", 114 | "gemini-2.0-flash-thinking-exp-1219", 115 | ], 116 | "ollama": ["deepseek-r1:671b", "qwen2.5:7b", "llama3.3", "phi4"], 117 | "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"], 118 | } 119 | 120 | 121 | def encode_image(img_path: Optional[str]) -> Optional[str]: 122 | """ 123 | Convert an image at `img_path` into a base64-encoded string. 124 | Returns None if `img_path` is None or empty. 125 | Raises FileNotFoundError if the file doesn't exist. 126 | """ 127 | if not img_path: 128 | return None 129 | 130 | try: 131 | with open(img_path, "rb") as image_file: 132 | image_data = base64.b64encode(image_file.read()).decode("utf-8") 133 | return image_data 134 | except FileNotFoundError as error: 135 | logger.error(f"Image not found at path {img_path}: {error}") 136 | raise 137 | except Exception as error: 138 | logger.error(f"Error encoding image at {img_path}: {error}") 139 | raise 140 | 141 | 142 | def get_latest_files( 143 | directory: str, file_types: List[str] = [".webm", ".zip"] 144 | ) -> Dict[str, Optional[str]]: 145 | """ 146 | Find the latest file for each extension in `file_types` under `directory`. 147 | Returns a dict {file_extension: latest_file_path or None}. 148 | 149 | :param directory: The directory to search. 
150 | :param file_types: List of file extensions (e.g., [".webm", ".zip"]). 151 | :return: dict mapping each extension to the path of the newest file or None if not found. 152 | """ 153 | latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types} 154 | 155 | if not os.path.exists(directory): 156 | logger.debug(f"Directory '{directory}' does not exist. Creating it.") 157 | os.makedirs(directory, exist_ok=True) 158 | return latest_files 159 | 160 | for file_type in file_types: 161 | try: 162 | matching_files = list(Path(directory).rglob(f"*{file_type}")) 163 | if matching_files: 164 | # Sort or use max() by modified time 165 | most_recent_file = max( 166 | matching_files, key=lambda path: path.stat().st_mtime 167 | ) 168 | # Check if file is not actively being written 169 | if time.time() - most_recent_file.stat().st_mtime > 1.0: 170 | latest_files[file_type] = str(most_recent_file) 171 | else: 172 | logger.debug( 173 | f"Skipping file {most_recent_file} - possibly still being written." 
174 | ) 175 | except Exception as error: 176 | logger.error( 177 | f"Error getting latest {file_type} file in '{directory}': {error}" 178 | ) 179 | 180 | return latest_files 181 | 182 | 183 | async def capture_screenshot(browser_session) -> Optional[str]: 184 | """Capture a screenshot of the current page using the browser-use event bus.""" 185 | 186 | if not hasattr(browser_session, "event_bus"): 187 | logger.error("Browser session does not have an event_bus.") 188 | return None 189 | 190 | try: 191 | event = browser_session.event_bus.dispatch(ScreenshotEvent(full_page=False)) 192 | await event 193 | result = await event.event_result(raise_if_any=True, raise_if_none=True) 194 | return result 195 | except Exception as error: 196 | logger.error(f"Failed to capture screenshot via event bus: {error}") 197 | return None 198 | -------------------------------------------------------------------------------- /src/mcp_browser_use/agent/custom_prompts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import List, Optional 4 | 5 | from browser_use.agent.prompts import SystemPrompt 6 | from browser_use.agent.views import ActionResult 7 | from browser_use.browser.views import BrowserState 8 | from langchain_core.messages import HumanMessage, SystemMessage 9 | 10 | from mcp_browser_use.agent.custom_views import CustomAgentStepInfo 11 | 12 | 13 | class CustomSystemPrompt(SystemPrompt): 14 | """ 15 | Custom system prompt that extends SystemPrompt to inject additional 16 | formatting rules and instructions for the AI agent. 17 | """ 18 | 19 | def important_rules(self) -> str: 20 | """ 21 | Return a detailed multiline string describing how the agent 22 | must format its JSON response, handle multiple actions, forms, 23 | navigation, and the maximum actions per step. 

        The text includes guidelines for:
        - JSON response format
        - Action sequences
        - Element interaction
        - Navigation & error handling
        - Task completion
        - Visual context usage
        - Handling form filling and suggestions
        """
        text = r"""
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{
"current_state": {
"prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
"important_contents": "Output important contents closely related to user's instruction or task on the current page. If there is, please output the contents. If not, please output empty string ''.",
"completed_contents": "Update the input Task Progress. Completed contents is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the current page and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button",
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If the output of prev_action_evaluation is 'Failed', please reflect and output your reflection here. If you think you have entered the wrong page, consider to go back to the previous page in next action.",
"summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
},
"action": [
{
"action_name": {
// action-specific parameters
}
},
// ... more actions in sequence
]
}

2. ACTIONS: You can specify multiple actions to be executed in sequence.
Common action sequences:
- Form filling: [
{"input_text": {"index": 1, "text": "username"}},
{"input_text": {"index": 2, "text": "password"}},
{"click_element": {"index": 3}}
]
- Navigation and extraction: [
{"open_new_tab": {}},
{"go_to_url": {"url": "https://example.com"}},
{"extract_page_content": {}}
]

3. ELEMENT INTERACTION:
- Only use indexes that exist in the provided element list
- Each element has a unique index number (e.g., "33[:]
_[:] Non-interactive text


Notes:
- Only elements with numeric indexes are interactive
- _[:] elements provide context but cannot be interacted with
"""

    # NOTE(review): this region of the file appears truncated in this dump —
    # original lines ~70-127 are missing, which by all appearances contained
    # the remainder of the rules text, the `return text` statement of
    # important_rules, and the `input_format` method (whose example output and
    # closing notes end just above; get_system_message calls
    # self.input_format() below). Verify against the full custom_prompts.py
    # before relying on or editing this prompt text.

    def get_system_message(self) -> SystemMessage:
        """
        Build and return a SystemMessage containing all system-level instructions,
        rules, and function references for the agent.
        """
        # The current timestamp is embedded in the prompt so the model knows "now".
        time_str = self.current_date.strftime("%Y-%m-%d %H:%M")

        AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
1. Analyze the provided webpage elements and structure
2. Plan a sequence of actions to accomplish the given task
3.
Respond with valid JSON containing your action sequence and state assessment 147 | 148 | Current date and time: {time_str} 149 | 150 | {self.input_format()} 151 | 152 | {self.important_rules()} 153 | 154 | Functions: 155 | {self.default_action_description} 156 | 157 | Remember: Your responses must be valid JSON matching the specified format. Each action in the sequence must be valid.""" 158 | 159 | return SystemMessage(content=AGENT_PROMPT) 160 | 161 | 162 | class CustomAgentMessagePrompt: 163 | """ 164 | Builds a user-facing prompt (HumanMessage) from the current browser state, 165 | task step info, and any results or errors from previous actions. 166 | """ 167 | 168 | def __init__( 169 | self, 170 | state: BrowserState, 171 | result: Optional[List[ActionResult]] = None, 172 | include_attributes: Optional[List[str]] = None, 173 | max_error_length: int = 400, 174 | step_info: Optional[CustomAgentStepInfo] = None, 175 | ): 176 | """ 177 | :param state: The current BrowserState, including URL, tabs, elements, etc. 178 | :param result: A list of ActionResults from the previous step(s). 179 | :param include_attributes: A list of HTML attributes to show in element strings. 180 | :param max_error_length: Maximum characters of error output to include. 181 | :param step_info: Holds metadata like the current step number, memory, task details, etc. 182 | """ 183 | self.state = state 184 | self.result = result or [] 185 | self.include_attributes = include_attributes or [] 186 | self.max_error_length = max_error_length 187 | self.step_info = step_info 188 | 189 | def get_user_message(self) -> HumanMessage: 190 | """ 191 | Construct and return a HumanMessage containing: 192 | 1. Task and hints from step_info 193 | 2. Memory and task progress 194 | 3. Current URL and available tabs 195 | 4. A string representation of interactive elements 196 | 5. Any results or errors from previous actions 197 | 6. 
An inline base64 screenshot if available 198 | 199 | :return: A HumanMessage object for the agent to process. 200 | """ 201 | step_info = self.step_info 202 | if not step_info: 203 | # Fallback if no step_info is provided 204 | step_info_text = "" 205 | task = "" 206 | add_infos = "" 207 | memory = "" 208 | task_progress = "" 209 | else: 210 | step_info_text = f"Step {step_info.step_number}/{step_info.max_steps}" 211 | task = step_info.task 212 | add_infos = step_info.add_infos 213 | memory = step_info.memory 214 | task_progress = step_info.task_progress 215 | 216 | state_description = f""" 217 | {step_info_text} 218 | 1. Task: {task} 219 | 2. Hints(Optional): 220 | {add_infos} 221 | 3. Memory: 222 | {memory} 223 | 4. Task Progress: 224 | {task_progress} 225 | 5. Current url: {self.state.url} 226 | 6. Available tabs: 227 | {self.state.tabs} 228 | 7. Interactive elements: 229 | {self.state.element_tree.clickable_elements_to_string( 230 | include_attributes=self.include_attributes 231 | )} 232 | """ 233 | 234 | # Append action results or errors 235 | for i, r in enumerate(self.result): 236 | if r.extracted_content: 237 | state_description += f"\nResult of action {i + 1}/{len(self.result)}: {r.extracted_content}" 238 | if r.error: 239 | truncated_error = r.error[-self.max_error_length :] 240 | state_description += f"\nError of action {i + 1}/{len(self.result)}: ...{truncated_error}" 241 | 242 | # If a screenshot is available, embed it as an image URL 243 | if self.state.screenshot: 244 | # Format message for vision model or multi-part message 245 | return HumanMessage( 246 | content=[ 247 | {"type": "text", "text": state_description}, 248 | { 249 | "type": "image_url", 250 | "image_url": { 251 | "url": f"data:image/png;base64,{self.state.screenshot}" 252 | }, 253 | }, 254 | ] 255 | ) 256 | else: 257 | # Otherwise, just return text 258 | return HumanMessage(content=state_description) 259 | -------------------------------------------------------------------------------- 
/documentation/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security 2 | 3 | > Below is a comprehensive security audit of your Browser-Use + MCP project using all the prior conversations and standard best practices for security. This is not an exhaustive penetration test but a systematic review of the major scripts and common pitfalls. We also provide suggestions for how to mitigate identified risks. 4 | 5 | 1. Project Structure & High-Level Summary 6 | 7 | The code layout is: 8 | 9 | 1. Main server code server.py that runs an async event loop (loop = asyncio.new_event_loop()) within __main__): 10 | - Runs a FastMCP (Model Context Protocol) server. 11 | - Exposes a tool endpoint to run a single “browser agent.” 12 | 2. Custom Agent under the agent directory and Related Classes: 13 | - custom_agent.py: Inherits from a base Agent and implements logic to parse LLM output, execute browser actions, handle vision, and create history GIFs. 14 | - custom_massage_manager.py: Handles LLM output parsing and conversion to browser actions. 15 | - custom_prompts.py: Contains system-level instructions for the LLM to produce a structured JSON output. 16 | - custom_views.py: Data classes (CustomAgentStepInfo, CustomAgentBrain) are used to store the agent’s state and output schema. 17 | 3. Custom Browser Components under the browser directory: 18 | - config.py: Holds dataclasses for configuring Chrome (persistent sessions, debugging port). 19 | - custom_browser.py: Subclass of Browser that handles launching or connecting to Chrome over a debugging port. It may disable some security flags or run headless. 20 | - custom_context.py: Subclass of BrowserContext that can reuse an existing context or create new ones, load cookies, start traces, etc. 21 | 4. Controllers & Actions: 22 | - custom_controller.py: Registers custom actions (copy/paste from clipboard). 23 | 5. 
Utilities: 24 | - agent_state.py: Tracks a stop_requested event (via asyncio.Event) and optional “last valid state.” Implemented as a singleton (only one agent at a time). 25 | - utils.py: offers a get_llm_model function to create different LLM clients (OpenAI, Anthropic, Azure, etc.), as well as image encoding and file-tracking utilities. 26 | 27 | The project runs a single agent simultaneously, hooking an LLM to actual browser actions. Let’s go through significant security aspects. 28 | 29 | 2. Identified Security Risks & Recommendations 30 | 31 | Below are the main areas of concern based on the code we’ve seen and typical usage patterns. 32 | 33 | 2.1 Disabling Browser Security & Remote Debug Port 34 | 35 | Where 36 | 37 | - custom_browser.py: 38 | - Allows launching Chrome with flags like --disable-web-security. 39 | - Launches Chrome with --remote-debugging-port=9222. 40 | 41 | Risks 42 | 43 | 1. Cross-Origin Attacks: Disabling web security (--disable-web-security, --disable-features=IsolateOrigins) allows malicious pages to read cross-origin data in the same browser instance. If the agent visits untrusted websites, it could inadvertently exfiltrate data from other open tabs or sessions. 44 | 2. Debug Port Exposure: A remote debugging port on 9222 (if bound to 0.0.0.0 or otherwise accessible externally) gives anyone who can connect full control of the browser. If not behind a firewall, an attacker can hijack the session. 45 | 46 | Recommendations 47 | 48 | 1. Limit the usage of disable-web-security and related flags. Restrict this to internal/test scenarios or run it inside a hardened container or ephemeral environment. 49 | 2. Restrict Access to Port 9222: 50 | 51 | - Bind to 127.0.0.1 only (--remote-debugging-address=127.0.0.1) so external hosts cannot connect. 52 | - Use a firewall or security group to block external access. 53 | - If remote access is required, use SSH tunneling rather than publicly exposing the port. 54 | 55 | 3. 
If you must open untrusted pages, create separate browser instances. This means not reusing the same “user data dir” or disabling security for critical tasks. 56 | 57 | 2.2 Global Singleton AgentState 58 | 59 | Where 60 | 61 | - agent_state.py implements a singleton that shares `stop_requested` and `last_valid_state` across all agent references. 62 | 63 | Risks 64 | 65 | 1. Concurrent Agents: If you (in the future) attempt to run multiple agents, the single AgentState object might cause cross-talk or unpredictable behavior (e.g., one agent’s stop request stops another). 66 | 2. Potential Race Conditions: If the code evolves to multi-thread or multi-process, the concurrency might not behave as expected. 67 | 68 | Recommendations 69 | 70 | 1. Ensure Only One Agent: If that’s your design (a single agent at a time), the singleton is acceptable. Document it. 71 | 2. Remove Singleton for multi-agent scenarios. Each agent can have its own state object. 72 | 73 | 2.3 Clipboard Actions 74 | 75 | Where 76 | 77 | - custom_controller.py registers actions like “Copy text to clipboard” and “Paste from clipboard.” 78 | 79 | Risks 80 | 81 | 1. System Clipboard: Copy/paste uses the OS-level clipboard (pyperclip). This can leak sensitive data if other apps or remote sessions see the same clipboard. 82 | 2. Overwrite: The agent can overwrite a user’s clipboard or read from it unexpectedly. 83 | 84 | Recommendations 85 | 86 | 1. Run in a Controlled Environment: It may be okay if you only do local development or a dedicated environment. 87 | 2. Use an In-Memory Clipboard: Instead of the actual system clipboard, implement a local memory store for copying and pasting within the agent’s session. This prevents overwriting the user’s system clipboard. 88 | 3. Disable or Restrict these actions if you run in multi-user or production mode. 89 | 90 | 2.4 Logging Sensitive Data 91 | 92 | Where 93 | 94 | - Various scripts log LLM responses or user tasks.
95 | - utils.py and other files read environment variables for API keys. 96 | 97 | Risks 98 | 99 | 1. API Keys in Logs: If you ever log environment variables, they might contain secrets (e.g., OPENAI_API_KEY, ANTHROPIC_API_KEY). 100 | 2. Conversation Logs: LLM or browser actions might contain personal info or private data from pages the agent visits. 101 | 102 | Recommendations 103 | 104 | 1. Scrub Sensitive Info: Use partial redaction to log environment variables or user data. 105 | 2. Control Log Levels: Keep debug logs for local dev; avoid them in production or store them in a secure location. 106 | 3. Never commit or print raw API keys or user credentials. 107 | 108 | 2.5 Environment Variables for API Keys 109 | 110 | Where 111 | 112 | - utils.py reads OPENAI_API_KEY, ANTHROPIC_API_KEY, AZURE_OPENAI_API_KEY, etc. 113 | 114 | Risks 115 | 116 | 1. Credentials Leak: Others might read if environment variables are insecurely stored or the machine is multi-tenant. 117 | 2. Rotation & Auditing: It is harder to rotate if you embed them in environment variables in multiple places. 118 | 119 | Recommendations 120 | 121 | 1. Use a Secret Manager: For production, store keys in Vault, AWS Secrets Manager, or a similar service, injecting them at runtime with minimal exposure. 122 | 2. Lock Down or Mask your environment variables in logs. 123 | 124 | 2.6 Handling of Cookies & Persisted Sessions 125 | 126 | Where 127 | 128 | - custom_context.py loads cookies from a file and reuses them if cookies_file is set. 129 | 130 | Risks 131 | 132 | 1. Cookie Theft: Cookies containing session tokens can be used to impersonate or access accounts. 133 | 2. Insecure Storage: If cookies_file is not locked down or is in a publicly accessible directory, attackers could read it. 134 | 135 | Recommendations 136 | 137 | 1. Encrypt or Secure the cookie file if it’s sensitive. 138 | 2. Use ephemeral sessions if you don’t need persistence (this mitigates the risk of session hijacking). 139 | 3. 
Handle JSON Errors gracefully. The code might crash if the cookie file is corrupted or maliciously edited. Currently, you catch some exceptions, but be sure they are robust. 140 | 141 | 2.7 LLM Output Execution 142 | 143 | Where 144 | 145 | - custom_agent.py uses the LLM output to determine subsequent actions in the browser. This is effectively arbitrary remote code controlling the browser if the LLM’s output is invalid. 146 | 147 | Risks 148 | 149 | 1. Prompt Injection or Malicious LLM Output: If an attacker can manipulate the prompt or the LLM’s instructions, they might cause harmful browsing actions (e.g., navigating to malicious pages, downloading malicious content, or exfiltrating data). 150 | 2. Excessive Trust: The agent automatically performs actions the LLM says. If the LLM is compromised or intentionally producing malicious JSON, your system might become an attack vector. 151 | 152 | Recommendations 153 | 154 | 1. Policy Layer: Before executing each action, you can add checks to ensure it’s within a set of “allowed” domains or “allowed action types.” 155 | 2. Safe Browsing: You could block navigation to known malicious or undesired domains. 156 | 3. Sandboxes: Run the browser in a locked-down Docker container or VM so the environment is contained even if the LLM instructs to visit a malicious link. 157 | 158 | 2.8 Untrusted Web Content & Vision 159 | 160 | Where 161 | 162 | - The agent uses optional “vision-based element detection” or page screenshots. 163 | 164 | Risks 165 | 166 | 1. Malicious Images: If the agent processes images from untrusted sources, ensure it’s safe from typical image library exploits (PIL is relatively safe, but keep it updated). 167 | 2. Screenshot capturing: If you store or send screenshots, you risk inadvertently capturing personal data or content. 168 | 169 | Recommendations 170 | 171 | 1. Use the Latest Libraries: Keep PIL (pillow) updated to avoid known vulnerabilities in image parsing. 172 | 2. 
Handle Storage: If you store screenshots, do so in secure, short-lived storage with restricted access. 173 | 174 | 3. Summary of Key Security Practices 175 | 176 | Based on the potential issues above, here’s a short checklist to ensure your system remains secure: 177 | 178 | 1. Networking & Ports: 179 | 180 | - Bind remote debugging to 127.0.0.1 only. 181 | - Use firewalls or SSH tunnels if remote access is necessary. 182 | 183 | 2. Sandboxing: 184 | 185 | - Use Docker or a VM for your automation environment. 186 | - Avoid --disable-web-security in production, or keep it in an isolated environment if you must use it. 187 | 188 | 3. Logging & Secrets: 189 | 190 | - Never log API keys or raw environment variables. 191 | - Redact sensitive info in logs. 192 | - Use a secret manager to store credentials. 193 | 194 | 4. Clipboard & Persistence: 195 | 196 | - Limit usage of system clipboard actions or implement an in-memory approach. 197 | - If session data/cookies are reused, ensure the file and directory permissions are locked down. 198 | 199 | 5. LLM Output Validation: 200 | 201 | - Consider a “policy layer” that checks which actions are allowed before executing them. 202 | - Consider domain safelisting or an interactive approval step in critical scenarios. 203 | 204 | 6. Error Handling: 205 | • - Gracefully handle invalid JSON, cookies, or environment variables. 206 | 207 | - Decide if you want to continue or fail fast with an error message. 208 | 209 | 7. Document your single-agent approach: 210 | 211 | - The singleton approach is fine if you never plan multiple concurrent agents. 212 | - Otherwise, remove it or ensure concurrency safety. 213 | 214 | 4. Verifying Project Structure 215 | 216 | From a structural standpoint: 217 | 218 | 1. Modular & Readable: Your project is decently modular: custom_agent, custom_browser, custom_context, custom_controller, custom_prompts, etc. 219 | 2. Dependencies: You rely on Playwright. 
async_api, pyperclip, requests, and custom browser_use and langchain_* modules. Ensure they are pinned to known-safe versions (e.g., in a requirements.txt) and kept updated. 220 | 3. Single vs. Multi Agent: In your README or main docs, clarify whether you run only one agent at a time or whether concurrency is in scope. 221 | 4. Deployment: If you distribute or deploy this server, outline the usage of environment variables, the required ports, and the recommended containerization approach. 222 | 223 | 5. Conclusion 224 | 225 | Your codebase is well-organized and functionally robust. The main security concerns revolve around: 226 | 227 | - **Remote Debugging & Disabling Security** in Chrome. 228 | - Clipboard & Cookie usage. 229 | - LLM output leading to potentially dangerous actions if not validated. 230 | - Logging & environment variables containing sensitive data. 231 | 232 | You can mitigate most of these risks by containerizing or VM-isolating your environment, restricting your debugging port to localhost, carefully handling credentials and logs, and implementing a minimal policy layer for LLM-driven actions. 233 | 234 | The project is in good shape, but you should document these security measures and carefully configure them, especially in environments other than internal development. 235 | 236 | Next Steps: 237 | 238 | - Implement or strengthen the recommended mitigation steps above. 239 | - Periodically review dependencies for security patches. 240 | - If this is a production-grade service, consider formal penetration testing or a threat model exercise to identify additional risks. 241 | - Keep documentation clear about the single-agent design and environment variables, and recommend using a container or ephemeral environment to prevent lateral movement or data exfiltration.

--------------------------------------------------------------------------------
/src/mcp_browser_use/agent/custom_agent.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import json
import logging
import traceback
from typing import Any, List, Optional, Type

import base64
import io
import os
from PIL import Image, ImageDraw, ImageFont

from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.service import Agent
from browser_use.agent.views import (
    ActionResult,
    AgentHistoryList,
    AgentOutput,
    AgentHistory,
)
from browser_use import BrowserSession
from browser_use.browser.views import BrowserStateHistory
from browser_use.controller.service import Controller
from browser_use.telemetry.views import AgentEndTelemetryEvent, AgentRunTelemetryEvent
from browser_use.utils import time_execution_async
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import BaseMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI

# NOTE(review): _convert_message_to_dict is a private langchain-openai helper
# (leading underscore); it may move or disappear between releases — confirm it
# still exists when upgrading langchain-openai.
from langchain_openai.chat_models.base import _convert_message_to_dict

from mcp_browser_use.utils.agent_state import AgentState
from mcp_browser_use.agent.custom_massage_manager import CustomMassageManager
from mcp_browser_use.agent.custom_views import CustomAgentOutput, CustomAgentStepInfo

logger = logging.getLogger(__name__)


class CustomAgent(Agent):
    """
    An AI-driven Agent that uses a language model to determine browser actions,
    interacts with a browser/page handle, and manages conversation history and
    state.
    """

    def __init__(
        self,
        task: str,
        llm: BaseChatModel,
        add_infos: str = "",
        browser_session: Optional[BrowserSession] = None,
        browser: Optional[BrowserSession] = None,
        browser_context: Optional[Any] = None,
        controller: Optional[Controller] = None,
        use_vision: bool = True,
        save_conversation_path: Optional[str] = None,
        max_failures: int = 5,
        retry_delay: int = 10,
        system_prompt_class: Type[SystemPrompt] = SystemPrompt,
        max_input_tokens: int = 13000,
        validate_output: bool = False,
        include_attributes: tuple[str, str, str, str, str, str, str, str, str, str] = (
            "title",
            "type",
            "name",
            "role",
            "tabindex",
            "aria-label",
            "placeholder",
            "value",
            "alt",
            "aria-expanded",
        ),
        max_error_length: int = 400,
        max_actions_per_step: int = 10,
        tool_call_in_content: bool = True,
        agent_state: Optional[AgentState] = None,
    ):
        """
        :param task: Main instruction or goal for the agent.
        :param llm: The large language model (BaseChatModel) used for reasoning.
        :param add_infos: Additional information or context to pass to the agent.
        :param browser_session: Optional browser/session instance (legacy name).
        :param browser: Preferred browser object for ``browser-use`` >= 0.7.
        :param browser_context: Optional active page/context to reuse.
        :param controller: Optional controller for handling multi-step actions. A new
            controller is created when not provided.
        :param use_vision: Whether to use vision-based element detection.
        :param save_conversation_path: File path to store conversation logs.
        :param max_failures: Max consecutive failures allowed before aborting.
        :param retry_delay: Delay between retries (not currently used).
        :param system_prompt_class: System prompt class for the agent.
        :param max_input_tokens: Token limit for model input.
        :param validate_output: Whether to validate final output at each step.
        :param include_attributes: HTML attributes to include in vision logic.
        :param max_error_length: Max length for error messages.
        :param max_actions_per_step: Limit the number of actions agent can perform per step.
        :param tool_call_in_content: Whether tool calls are in the raw model content.
        :param agent_state: Shared state to detect external stop signals, store last valid state, etc.
        """
        # Reuse a caller-supplied controller, otherwise create a fresh one.
        controller = controller or Controller()
        self.controller = controller

        # Preferred new-style "browser" wins over the legacy "browser_session".
        browser_handle = browser or browser_session

        init_kwargs: dict[str, Any] = {
            "task": task,
            "llm": llm,
            "controller": controller,
            "use_vision": use_vision,
            "save_conversation_path": save_conversation_path,
            "max_failures": max_failures,
            "retry_delay": retry_delay,
            "system_prompt_class": system_prompt_class,
            "max_input_tokens": max_input_tokens,
            "validate_output": validate_output,
            "include_attributes": include_attributes,
            "max_error_length": max_error_length,
            "max_actions_per_step": max_actions_per_step,
            "tool_call_in_content": tool_call_in_content,
        }

        if browser_handle is not None:
            init_kwargs["browser"] = browser_handle

        # The active page/context (if any) is forwarded under the "page" kwarg.
        if browser_context is not None:
            init_kwargs["page"] = browser_context

        # Compatibility shim: the base Agent's constructor kwargs have been
        # renamed across browser-use releases ("browser" <-> "browser_session",
        # "controller" <-> "tools", and "page" is not always accepted). Each
        # TypeError naming an unexpected kwarg triggers exactly one rename (or
        # removal, for "page") and a retry; 4 iterations bound the possible
        # translations. Any other TypeError is re-raised unchanged.
        for _ in range(4):
            try:
                super().__init__(**init_kwargs)
                break
            except TypeError as exc:  # pragma: no cover - defensive compatibility
                message = str(exc)
                if (
                    "unexpected keyword argument 'browser'" in message
                    and "browser" in init_kwargs
                ):
                    browser_value = init_kwargs.pop("browser")
                    if browser_value is not None:
                        init_kwargs.setdefault("browser_session", browser_value)
                    continue
                if (
                    "unexpected keyword argument 'browser_session'" in message
                    and "browser_session" in init_kwargs
                ):
                    browser_value = init_kwargs.pop("browser_session")
                    if browser_value is not None:
                        init_kwargs.setdefault("browser", browser_value)
                    continue
                if (
                    "unexpected keyword argument 'page'" in message
                    and "page" in init_kwargs
                ):
                    # No alternate name for "page": simply drop it and retry.
                    init_kwargs.pop("page")
                    continue
                if (
                    "unexpected keyword argument 'controller'" in message
                    and "controller" in init_kwargs
                ):
                    controller_value = init_kwargs.pop("controller")
                    init_kwargs.setdefault("tools", controller_value)
                    continue
                if (
                    "unexpected keyword argument 'tools'" in message
                    and "tools" in init_kwargs
                ):
                    controller_value = init_kwargs.pop("tools")
                    init_kwargs.setdefault("controller", controller_value)
                    continue
                raise
        else:  # pragma: no cover - should never happen
            # for/else: reached only when the loop never hit `break`, i.e. no
            # kwarg translation made the base constructor accept our arguments.
            raise TypeError("Unable to initialise base Agent with provided arguments")
        self.add_infos = add_infos
        self.agent_state = agent_state

        # Custom message manager (note: "Massage" is the project's spelling).
        # Replaces the base class's manager so prompt construction uses this
        # agent's controller actions and truncation settings.
        self.message_manager = CustomMassageManager(
            llm=self.llm,
            task=self.task,
            action_descriptions=self.controller.registry.get_prompt_description(),
            system_prompt_class=self.system_prompt_class,
            max_input_tokens=self.max_input_tokens,
            include_attributes=self.include_attributes,
            max_error_length=self.max_error_length,
            max_actions_per_step=self.max_actions_per_step,
            tool_call_in_content=tool_call_in_content,
        )

    def _setup_action_models(self) -> None:
        """
        Setup dynamic action models from the controller's registry.
        This ensures the agent's output schema matches all possible actions.
194 | """ 195 | # Get the dynamic action model from controller's registry 196 | self.ActionModel = self.controller.registry.create_action_model() 197 | # Create output model with the dynamic actions 198 | self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel) 199 | 200 | def _log_response(self, response: CustomAgentOutput) -> None: 201 | """ 202 | Log the model's response in a human-friendly way. 203 | Shows success/fail state, memory, thought, summary, etc. 204 | """ 205 | evaluation = response.current_state.prev_action_evaluation or "" 206 | if "Success" in evaluation: 207 | emoji = "✅" 208 | elif "Failed" in evaluation: 209 | emoji = "❌" 210 | else: 211 | emoji = "🤷" 212 | 213 | logger.info(f"{emoji} Eval: {evaluation}") 214 | logger.info(f"🧠 New Memory: {response.current_state.important_contents}") 215 | logger.info(f"⏳ Task Progress: {response.current_state.completed_contents}") 216 | logger.info(f"🤔 Thought: {response.current_state.thought}") 217 | logger.info(f"🎯 Summary: {response.current_state.summary}") 218 | 219 | for i, action in enumerate(response.action): 220 | logger.info( 221 | f"🛠️ Action {i + 1}/{len(response.action)}: " 222 | f"{action.model_dump_json(exclude_unset=True)}" 223 | ) 224 | 225 | def update_step_info( 226 | self, 227 | model_output: CustomAgentOutput, 228 | step_info: Optional[CustomAgentStepInfo] = None, 229 | ) -> None: 230 | """ 231 | Update the current step with new memory and completed contents. 232 | 233 | :param model_output: Parsed output from the LLM. 234 | :param step_info: Step information object, if any. 
235 | """ 236 | if step_info is None: 237 | return 238 | 239 | step_info.step_number += 1 240 | important_contents = model_output.current_state.important_contents 241 | if ( 242 | important_contents 243 | and "None" not in important_contents 244 | and important_contents not in step_info.memory 245 | ): 246 | step_info.memory += important_contents + "\n" 247 | 248 | completed_contents = model_output.current_state.completed_contents 249 | if completed_contents and "None" not in completed_contents: 250 | step_info.task_progress = completed_contents 251 | 252 | @time_execution_async("--get_next_action") 253 | async def get_next_action(self, input_messages: List[BaseMessage]) -> AgentOutput: 254 | """ 255 | Get the next action from the LLM, attempting structured output parsing. 256 | Falls back to manual JSON parsing if structured parse fails. 257 | """ 258 | logger.info("Getting next action from LLM") 259 | logger.debug(f"Input messages: {input_messages}") 260 | 261 | try: 262 | if isinstance(self.llm, ChatOpenAI): 263 | # For OpenAI, attempt structured parse with "instructor" first 264 | parsed_output = await self._handle_openai_structured_output( 265 | input_messages 266 | ) 267 | else: 268 | logger.info(f"Using non-OpenAI model: {type(self.llm).__name__}") 269 | parsed_output = await self._handle_non_openai_structured_output( 270 | input_messages 271 | ) 272 | 273 | self._truncate_and_log_actions(parsed_output) 274 | self.n_steps += 1 275 | return parsed_output 276 | 277 | except Exception as e: 278 | logger.warning(f"Error getting structured output: {str(e)}") 279 | logger.info("Attempting fallback to manual parsing") 280 | return await self._fallback_parse(input_messages) 281 | 282 | async def _handle_openai_structured_output( 283 | self, input_messages: List[BaseMessage] 284 | ) -> AgentOutput: 285 | """ 286 | Attempt to get structured output from an OpenAI LLM 287 | using the 'instructor' library. 
If that fails, fallback 288 | to the default structured output approach. 289 | """ 290 | logger.info("Using OpenAI chat model") 291 | # Usually safe to import here to avoid circular import issues 292 | from instructor import from_openai 293 | 294 | try: 295 | client = from_openai(self.llm.root_async_client) 296 | logger.debug(f"Using model: {self.llm.model_name}") 297 | messages = [_convert_message_to_dict(msg) for msg in input_messages] 298 | 299 | parsed_response = await client.chat.completions.create( 300 | messages=messages, 301 | model=self.llm.model_name, 302 | response_model=self.AgentOutput, 303 | ) 304 | logger.debug(f"Raw OpenAI response: {parsed_response}") 305 | 306 | return parsed_response 307 | 308 | except Exception as e: 309 | # Attempt default structured output if instructor fails 310 | logger.error(f"Error with 'instructor' approach: {str(e)}") 311 | logger.info("Using default structured output approach.") 312 | 313 | structured_llm = self.llm.with_structured_output( 314 | self.AgentOutput, include_raw=True 315 | ) 316 | response: dict[str, Any] = await structured_llm.ainvoke(input_messages) 317 | logger.debug(f"Raw LLM response (default approach): {response}") 318 | return response["parsed"] # type: ignore 319 | 320 | async def _handle_non_openai_structured_output( 321 | self, input_messages: List[BaseMessage] 322 | ) -> AgentOutput: 323 | """ 324 | For non-OpenAI models, we directly use the structured LLM approach. 325 | """ 326 | structured_llm = self.llm.with_structured_output( 327 | self.AgentOutput, include_raw=True 328 | ) 329 | response: dict[str, Any] = await structured_llm.ainvoke(input_messages) 330 | logger.debug(f"Raw LLM response: {response}") 331 | return response["parsed"] # type: ignore 332 | 333 | async def _fallback_parse(self, input_messages: List[BaseMessage]) -> AgentOutput: 334 | """ 335 | Manual JSON parsing fallback if structured parse fails. 336 | Tries to extract JSON from the raw text and parse into AgentOutput. 
337 | """ 338 | try: 339 | ret = await self.llm.ainvoke(input_messages) 340 | logger.debug(f"Raw fallback response: {ret}") 341 | 342 | content = ret.content 343 | if isinstance(content, list): 344 | # If content is a list, parse from the first element 345 | parsed_json = json.loads( 346 | content[0].replace("```json", "").replace("```", "") 347 | ) 348 | else: 349 | # Otherwise parse from the string 350 | parsed_json = json.loads( 351 | content.replace("```json", "").replace("```", "") 352 | ) 353 | 354 | parsed_output: AgentOutput = self.AgentOutput(**parsed_json) 355 | if parsed_output is None: 356 | raise ValueError("Could not parse fallback response.") 357 | 358 | self._truncate_and_log_actions(parsed_output) 359 | self.n_steps += 1 360 | logger.info( 361 | f"Successfully got next action via fallback. Step count: {self.n_steps}" 362 | ) 363 | return parsed_output 364 | 365 | except Exception as parse_error: 366 | logger.error(f"Fallback parsing failed: {str(parse_error)}") 367 | raise 368 | 369 | def _truncate_and_log_actions(self, parsed_output: AgentOutput) -> None: 370 | """ 371 | Enforce the max_actions_per_step limit and log the response. 372 | """ 373 | original_action_count = len(parsed_output.action) 374 | parsed_output.action = parsed_output.action[: self.max_actions_per_step] 375 | if original_action_count > self.max_actions_per_step: 376 | logger.warning( 377 | f"Truncated actions from {original_action_count} to {self.max_actions_per_step}" 378 | ) 379 | self._log_response(parsed_output) 380 | 381 | def summarize_messages(self) -> bool: 382 | """ 383 | Summarize message history if it exceeds 5 messages. 384 | Returns True if summarization occurred, False otherwise. 
385 | """ 386 | stored_messages = self.message_manager.get_messages() 387 | message_count = len(stored_messages) 388 | 389 | if message_count <= 5: 390 | logger.debug("Message count <= 5, skipping summarization") 391 | return False 392 | 393 | logger.info(f"Summarizing {message_count} messages") 394 | try: 395 | summarization_prompt = ChatPromptTemplate.from_messages( 396 | [ 397 | MessagesPlaceholder(variable_name="chat_history"), 398 | ( 399 | "user", 400 | "Distill the above chat messages into a single summary message. " 401 | "Include as many specific details as you can.", 402 | ), 403 | ] 404 | ) 405 | summarization_chain = summarization_prompt | self.llm 406 | 407 | summary_message = summarization_chain.invoke( 408 | {"chat_history": stored_messages} 409 | ) 410 | logger.debug(f"Generated summary: {summary_message}") 411 | 412 | self.message_manager.reset_history() 413 | self.message_manager._add_message_with_tokens( 414 | summary_message 415 | ) # Consider creating a public method for this 416 | return True 417 | 418 | except Exception as e: 419 | logger.error(f"Error during message summarization: {str(e)}") 420 | logger.debug(f"Full traceback: {traceback.format_exc()}") 421 | return False 422 | 423 | @time_execution_async("--execute-agent-step") 424 | async def execute_agent_step( 425 | self, step_info: Optional[CustomAgentStepInfo] = None 426 | ) -> None: 427 | """ 428 | Execute a single agent step of the task: 429 | 1) Capture browser state 430 | 2) Query LLM for next action 431 | 3) Execute that action(s) 432 | 4) Update logs/history 433 | """ 434 | logger.info(f"\n📍 Step {self.n_steps}") 435 | logger.info(f"History token count: {self.message_manager.history.total_tokens}") 436 | 437 | # Optionally summarize to reduce token usage 438 | # self.summarize_messages() 439 | 440 | state = None 441 | model_output = None 442 | result: List[ActionResult] = [] 443 | 444 | try: 445 | try: 446 | state = await 
self.browser_context.get_state(use_vision=self.use_vision) 447 | except TypeError: 448 | logger.warning( 449 | "get_state does not support 'use_vision' argument, falling back." 450 | ) 451 | state = await self.browser_context.get_state() 452 | self.message_manager.add_state_message(state, self._last_result, step_info) 453 | input_messages = self.message_manager.get_messages() 454 | 455 | model_output = await self.get_next_action(input_messages) 456 | self.update_step_info(model_output, step_info) 457 | logger.info(f"🧠 All Memory: {getattr(step_info, 'memory', '')}") 458 | 459 | self._save_conversation(input_messages, model_output) 460 | # Remove the last state message from chat history to prevent bloat 461 | self.message_manager._remove_last_state_message() 462 | self.message_manager.add_model_output(model_output) 463 | 464 | # Execute the requested actions 465 | result = await self.controller.multi_act( 466 | model_output.action, self.browser_context 467 | ) 468 | self._last_result = result 469 | 470 | # If the last action indicates "is_done", we can log the extracted content 471 | if len(result) > 0 and result[-1].is_done: 472 | logger.info(f"📄 Result: {result[-1].extracted_content}") 473 | 474 | self.consecutive_failures = 0 475 | 476 | except Exception as e: 477 | result = self._handle_step_error(e) 478 | self._last_result = result 479 | 480 | finally: 481 | if not result: 482 | return 483 | 484 | for r in result: 485 | logger.warning(f"🔧 Action result: {r}") 486 | 487 | if state: 488 | self._make_history_item(model_output, state, result) 489 | 490 | def create_history_gif( 491 | self, 492 | output_path: str = "agent_history.gif", 493 | duration: int = 3000, 494 | show_goals: bool = True, 495 | show_task: bool = True, 496 | show_logo: bool = False, 497 | font_size: int = 40, 498 | title_font_size: int = 56, 499 | goal_font_size: int = 44, 500 | margin: int = 40, 501 | line_spacing: float = 1.5, 502 | ) -> None: 503 | """ 504 | Create a GIF from the agent's 
history using the captured screenshots. 505 | Overlays text for tasks/goals. Optionally includes a logo. 506 | """ 507 | if not self.history.history: 508 | logger.warning("No history to create GIF from") 509 | return 510 | 511 | if not self.history.history[0].state.screenshot: 512 | logger.warning( 513 | "No screenshots in the first history item; cannot create GIF" 514 | ) 515 | return 516 | 517 | images = [] 518 | try: 519 | # Attempt to load some preferred fonts 520 | font_options = ["Helvetica", "Arial", "DejaVuSans", "Verdana"] 521 | regular_font, title_font, goal_font = None, None, None 522 | font_loaded = False 523 | 524 | for font_name in font_options: 525 | try: 526 | import platform 527 | 528 | if platform.system() == "Windows": 529 | # On Windows, we may need absolute font paths 530 | font_name = os.path.join( 531 | os.getenv("WIN_FONT_DIR", "C:\\Windows\\Fonts"), 532 | font_name + ".ttf", 533 | ) 534 | 535 | regular_font = ImageFont.truetype(font_name, font_size) 536 | title_font = ImageFont.truetype(font_name, title_font_size) 537 | goal_font = ImageFont.truetype(font_name, goal_font_size) 538 | font_loaded = True 539 | break 540 | except OSError: 541 | continue 542 | 543 | if not font_loaded: 544 | raise OSError("No preferred fonts found") 545 | 546 | except OSError: 547 | # Fallback to default 548 | regular_font = ImageFont.load_default() 549 | title_font = regular_font 550 | goal_font = regular_font 551 | 552 | logo = None 553 | if show_logo: 554 | try: 555 | logo = Image.open("./static/browser-use.png") 556 | # Resize logo 557 | logo_height = 150 558 | aspect_ratio = logo.width / logo.height 559 | logo_width = int(logo_height * aspect_ratio) 560 | logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS) 561 | except Exception as e: 562 | logger.warning(f"Could not load logo: {e}") 563 | 564 | # If requested, create an initial frame with the entire task 565 | if show_task and self.task: 566 | task_frame = self._create_task_frame( 567 | 
self.task, 568 | self.history.history[0].state.screenshot, 569 | title_font, 570 | regular_font, 571 | logo, 572 | line_spacing, 573 | ) 574 | images.append(task_frame) 575 | 576 | # Convert each step’s screenshot 577 | for i, item in enumerate(self.history.history, 1): 578 | if not item.state.screenshot: 579 | continue 580 | 581 | img_data = base64.b64decode(item.state.screenshot) 582 | image = Image.open(io.BytesIO(img_data)) 583 | 584 | if show_goals and item.model_output: 585 | image = self._add_overlay_to_image( 586 | image=image, 587 | step_number=i, 588 | goal_text=item.model_output.current_state.thought, 589 | regular_font=regular_font, 590 | title_font=title_font, 591 | margin=margin, 592 | logo=logo, 593 | line_spacing=line_spacing, 594 | ) 595 | 596 | images.append(image) 597 | 598 | if images: 599 | images[0].save( 600 | output_path, 601 | save_all=True, 602 | append_images=images[1:], 603 | duration=duration, 604 | loop=0, 605 | optimize=False, 606 | ) 607 | logger.info(f"Created GIF at {output_path}") 608 | else: 609 | logger.warning("No images found in history to create GIF") 610 | 611 | def _create_task_frame( 612 | self, 613 | task_text: str, 614 | screenshot_b64: str, 615 | title_font: ImageFont.FreeTypeFont, 616 | regular_font: ImageFont.FreeTypeFont, 617 | logo: Image.Image | None, 618 | line_spacing: float, 619 | ) -> Image.Image: 620 | """Return an image with the task text overlaid on the screenshot.""" 621 | 622 | margin = 40 623 | img = Image.open(io.BytesIO(base64.b64decode(screenshot_b64))).convert("RGBA") 624 | 625 | overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) 626 | draw = ImageDraw.Draw(overlay) 627 | 628 | max_width = img.width - margin * 2 629 | text_lines: list[str] = self._wrap_text_to_lines( 630 | draw, task_text, regular_font, max_width 631 | ) 632 | 633 | y = margin 634 | title_bbox = draw.textbbox((margin, y), "Task", font=title_font) 635 | title_height = title_bbox[3] - title_bbox[1] 636 | total_height = title_height + 
int(margin * 0.5) 637 | for t in text_lines: 638 | bbox = draw.textbbox((margin, 0), t, font=regular_font) 639 | total_height += int((bbox[3] - bbox[1]) * line_spacing) 640 | 641 | if logo: 642 | total_height = max(total_height, logo.height + margin * 2) 643 | 644 | draw.rectangle( 645 | [(0, 0), (img.width, total_height)], 646 | fill=(0, 0, 0, 180), 647 | ) 648 | 649 | draw.text((margin, y), "Task", font=title_font, fill="white") 650 | y += title_height + int(margin * 0.5) 651 | for t in text_lines: 652 | draw.text((margin, y), t, font=regular_font, fill="white") 653 | bbox = draw.textbbox((margin, y), t, font=regular_font) 654 | y += int((bbox[3] - bbox[1]) * line_spacing) 655 | 656 | if logo: 657 | overlay.paste( 658 | logo, 659 | (img.width - logo.width - margin, margin), 660 | logo if logo.mode == "RGBA" else None, 661 | ) 662 | 663 | img.alpha_composite(overlay) 664 | return img.convert("RGB") 665 | 666 | def _wrap_text_to_lines( 667 | self, 668 | draw: ImageDraw.ImageDraw, 669 | text: str, 670 | font: ImageFont.FreeTypeFont, 671 | max_width: int, 672 | ) -> list[str]: 673 | """Split ``text`` into lines that fit within ``max_width`` pixels.""" 674 | 675 | if not text: 676 | return [] 677 | 678 | if max_width <= 0: 679 | return [text] 680 | 681 | wrapped_lines: list[str] = [] 682 | 683 | lines = text.splitlines() 684 | if not lines: 685 | lines = [text] 686 | 687 | for raw_line in lines: 688 | words = raw_line.split() 689 | if not words: 690 | wrapped_lines.append("") 691 | continue 692 | 693 | current_line = words[0] 694 | for word in words[1:]: 695 | candidate = f"{current_line} {word}" if current_line else word 696 | if draw.textlength(candidate, font=font) <= max_width: 697 | current_line = candidate 698 | else: 699 | wrapped_lines.append(current_line) 700 | current_line = word 701 | 702 | wrapped_lines.append(current_line) 703 | 704 | return wrapped_lines 705 | 706 | def _add_overlay_to_image( 707 | self, 708 | image: Image.Image, 709 | step_number: int, 
710 | goal_text: str, 711 | regular_font: ImageFont.FreeTypeFont, 712 | title_font: ImageFont.FreeTypeFont, 713 | margin: int, 714 | logo: Image.Image | None, 715 | line_spacing: float, # Added line_spacing parameter 716 | ) -> Image.Image: 717 | """Overlay the step number and goal text onto a screenshot image.""" 718 | 719 | image = image.convert("RGBA") 720 | overlay = Image.new("RGBA", image.size, (0, 0, 0, 0)) 721 | draw = ImageDraw.Draw(overlay) 722 | 723 | step_text = f"Step {step_number}" 724 | max_width = image.width - margin * 2 725 | 726 | lines: list[str] = [] 727 | words = goal_text.split() 728 | line = "" 729 | for word in words: 730 | test = f"{line} {word}".strip() 731 | if draw.textlength(test, font=regular_font) <= max_width: 732 | line = test 733 | else: 734 | lines.append(line) 735 | line = word 736 | if line: 737 | lines.append(line) 738 | 739 | y = margin 740 | step_bbox = draw.textbbox((margin, y), step_text, font=title_font) 741 | step_height = step_bbox[3] - step_bbox[1] 742 | total_height = step_height + int(margin * 0.5) 743 | for l in lines: 744 | bbox = draw.textbbox((margin, 0), l, font=regular_font) 745 | total_height += bbox[3] - bbox[1] 746 | 747 | if logo: 748 | total_height = max(total_height, logo.height + margin * 2) 749 | 750 | draw.rectangle( 751 | [(0, 0), (image.width, total_height)], 752 | fill=(0, 0, 0, 180), 753 | ) 754 | 755 | draw.text((margin, y), step_text, font=title_font, fill="white") 756 | y += step_height + int(margin * 0.5) 757 | for l in lines: 758 | draw.text((margin, y), l, font=regular_font, fill="white") 759 | bbox = draw.textbbox((margin, y), l, font=regular_font) 760 | y += bbox[3] - bbox[1] 761 | 762 | if logo: 763 | overlay.paste( 764 | logo, 765 | (image.width - logo.width - margin, margin), 766 | logo if logo.mode == "RGBA" else None, 767 | ) 768 | 769 | image.alpha_composite(overlay) 770 | return image.convert("RGB") 771 | 772 | async def execute_agent_task(self, max_steps: int = 100) -> 
AgentHistoryList: 773 | """ 774 | Execute the entire agent task for up to max_steps or until 'done'. 775 | Checks for external stop signals and logs each step in self.history. 776 | """ 777 | try: 778 | logger.info(f"🚀 Starting task: {self.task}") 779 | self.telemetry.capture( 780 | AgentRunTelemetryEvent( 781 | agent_id=self.agent_id, 782 | task=self.task, 783 | ) 784 | ) 785 | 786 | step_info = CustomAgentStepInfo( 787 | task=self.task, 788 | add_infos=self.add_infos, 789 | step_number=1, 790 | max_steps=max_steps, 791 | memory="", 792 | task_progress="", 793 | ) 794 | 795 | for step in range(max_steps): 796 | # 1) Check if stop requested externally 797 | if self.agent_state and self.agent_state.is_stop_requested(): 798 | logger.info("🛑 Stop requested by user") 799 | self._create_stop_history_item() 800 | break 801 | 802 | # 2) Store last valid state 803 | if self.browser_context and self.agent_state: 804 | state = await self.browser_context.get_state( 805 | use_vision=self.use_vision 806 | ) 807 | self.agent_state.set_last_valid_state(state) 808 | 809 | # 3) Check for too many failures 810 | if self._too_many_failures(): 811 | break 812 | 813 | # 4) Execute one detailed agent step 814 | await self.execute_agent_step(step_info) 815 | 816 | if self.history.is_done(): 817 | if self.validate_output and step < max_steps - 1: 818 | # Optionally validate final output 819 | if not await self._validate_output(): 820 | continue 821 | logger.info("✅ Task completed successfully") 822 | break 823 | else: 824 | logger.info("❌ Failed to complete task within maximum steps") 825 | 826 | return self.history 827 | 828 | finally: 829 | self.telemetry.capture( 830 | AgentEndTelemetryEvent( 831 | agent_id=self.agent_id, 832 | task=self.task, 833 | success=self.history.is_done(), 834 | steps=len(self.history.history), 835 | ) 836 | ) 837 | # Close the browser context if we created it here (not injected) 838 | if not self.injected_browser_context and self.browser_context: 839 | await 
self.browser_context.close() 840 | 841 | # Close the browser instance if it wasn't injected 842 | if not self.injected_browser and self.browser: 843 | await self.browser.close() 844 | 845 | # Generate a GIF of the agent's run if enabled 846 | if self.generate_gif: 847 | self.create_history_gif() 848 | 849 | def _create_stop_history_item(self) -> None: 850 | """ 851 | Create a final 'stop' history item indicating the agent has halted by request. 852 | """ 853 | try: 854 | state = None 855 | if self.agent_state: 856 | last_state = self.agent_state.get_last_valid_state() 857 | if last_state: 858 | state = self._convert_to_browser_state_history(last_state) 859 | else: 860 | state = self._create_empty_state() 861 | else: 862 | state = self._create_empty_state() 863 | 864 | stop_history = AgentHistory( 865 | model_output=None, 866 | state=state, 867 | result=[ActionResult(extracted_content=None, error=None, is_done=True)], 868 | ) 869 | self.history.history.append(stop_history) 870 | 871 | except Exception as e: 872 | logger.error(f"Error creating stop history item: {e}") 873 | state = self._create_empty_state() 874 | stop_history = AgentHistory( 875 | model_output=None, 876 | state=state, 877 | result=[ActionResult(extracted_content=None, error=None, is_done=True)], 878 | ) 879 | self.history.history.append(stop_history) 880 | 881 | def _convert_to_browser_state_history( 882 | self, browser_state: Any 883 | ) -> BrowserStateHistory: 884 | """ 885 | Convert a raw browser_state object into a BrowserStateHistory dataclass. 886 | """ 887 | return BrowserStateHistory( 888 | url=getattr(browser_state, "url", ""), 889 | title=getattr(browser_state, "title", ""), 890 | tabs=getattr(browser_state, "tabs", []), 891 | interacted_element=[None], 892 | screenshot=getattr(browser_state, "screenshot", None), 893 | ) 894 | 895 | def _create_empty_state(self) -> BrowserStateHistory: 896 | """ 897 | Create a basic empty state for fallback or stop-history usage. 
898 | """ 899 | return BrowserStateHistory( 900 | url="", title="", tabs=[], interacted_element=[None], screenshot=None 901 | ) 902 | --------------------------------------------------------------------------------