├── src └── mcp_browser_use │ ├── agent │ ├── __init__.py │ ├── custom_views.py │ ├── custom_massage_manager.py │ ├── custom_prompts.py │ └── custom_agent.py │ ├── browser │ ├── __init__.py │ └── browser_manager.py │ ├── utils │ ├── __init__.py │ ├── logging.py │ ├── agent_state.py │ └── utils.py │ ├── controller │ ├── __init__.py │ └── custom_controller.py │ ├── mcp_browser_use.py │ ├── __init__.py │ ├── client.py │ └── server.py ├── tests ├── stubs │ ├── browser_use │ │ ├── agent │ │ │ ├── prompts.py │ │ │ ├── message_manager │ │ │ │ ├── service.py │ │ │ │ └── views.py │ │ │ ├── service.py │ │ │ └── views.py │ │ ├── browser │ │ │ ├── browser.py │ │ │ ├── events.py │ │ │ ├── __init__.py │ │ │ ├── profile.py │ │ │ ├── context.py │ │ │ └── views.py │ │ ├── controller │ │ │ ├── registry │ │ │ │ └── views.py │ │ │ └── service.py │ │ ├── telemetry │ │ │ └── views.py │ │ ├── utils.py │ │ └── __init__.py │ ├── langchain_openai │ │ ├── chat_models │ │ │ ├── base.py │ │ │ └── __init__.py │ │ └── __init__.py │ ├── langchain_core │ │ ├── messages │ │ │ └── __init__.py │ │ ├── language_models │ │ │ ├── __init__.py │ │ │ └── chat_models.py │ │ └── prompts │ │ │ └── __init__.py │ └── PIL │ │ └── __init__.py ├── test_agent_state.py ├── test_logging_configuration.py ├── conftest.py ├── test_gif_creation.py ├── test_browser_manager.py ├── test_custom_agent_controller.py ├── test_summarize_messages.py ├── test_client_session.py └── test_utils.py ├── .gitattributes ├── renovate.json ├── .editorconfig ├── pyproject.toml ├── Dockerfile ├── .gitignore ├── sample.env.env ├── smithery.yaml ├── README.md └── documentation ├── CONFIGURATION.md └── SECURITY.md /src/mcp_browser_use/agent/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/mcp_browser_use/browser/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/mcp_browser_use/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /src/mcp_browser_use/controller/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/prompts.py: -------------------------------------------------------------------------------- 1 | class SystemPrompt: 2 | pass 3 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/browser.py: -------------------------------------------------------------------------------- 1 | class Browser: 2 | pass 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/controller/registry/views.py: -------------------------------------------------------------------------------- 1 | class ActionModel: 2 | pass 3 | -------------------------------------------------------------------------------- /tests/stubs/langchain_openai/chat_models/base.py: -------------------------------------------------------------------------------- 1 | _convert_message_to_dict = lambda x: {} 2 | -------------------------------------------------------------------------------- /renovate.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": ["config:recommended"] 4 | } 5 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/message_manager/service.py: -------------------------------------------------------------------------------- 1 | class MessageManager: 2 | def __init__(self, *args, **kwargs): 3 | pass 4 | -------------------------------------------------------------------------------- /tests/stubs/langchain_openai/__init__.py: -------------------------------------------------------------------------------- 1 | from .chat_models import AzureChatOpenAI, ChatOpenAI 2 | 3 | __all__ = ["ChatOpenAI", "AzureChatOpenAI"] 4 | -------------------------------------------------------------------------------- /tests/stubs/langchain_core/messages/__init__.py: -------------------------------------------------------------------------------- 1 | class BaseMessage: pass 2 | class HumanMessage: pass 3 | class AIMessage: pass 4 | class SystemMessage: pass 5 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/service.py: -------------------------------------------------------------------------------- 1 | class Agent: 2 | def __init__(self, *args, **kwargs): 3 | self.history = kwargs.get('history', None) 4 | self.generate_gif = False 5 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/telemetry/views.py: -------------------------------------------------------------------------------- 1 | class AgentEndTelemetryEvent: 2 | def __init__(self, *args, **kwargs): 3 | pass 4 | class AgentRunTelemetryEvent: 5 | def __init__(self, *args, **kwargs): 6 | pass 7 | -------------------------------------------------------------------------------- 
/tests/stubs/langchain_core/language_models/__init__.py: -------------------------------------------------------------------------------- 1 | class BaseChatModel: 2 | def with_structured_output(self, *args, **kwargs): 3 | return self 4 | async def ainvoke(self, *args, **kwargs): 5 | return {} 6 | -------------------------------------------------------------------------------- /tests/stubs/langchain_core/language_models/chat_models.py: -------------------------------------------------------------------------------- 1 | class BaseChatModel: 2 | async def ainvoke(self, *args, **kwargs): 3 | return {} 4 | def with_structured_output(self, *args, **kwargs): 5 | return self 6 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/utils.py: -------------------------------------------------------------------------------- 1 | def time_execution_async(name): 2 | def decorator(func): 3 | async def wrapper(*args, **kwargs): 4 | return await func(*args, **kwargs) 5 | 6 | return wrapper 7 | 8 | return decorator 9 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/events.py: -------------------------------------------------------------------------------- 1 | class SendKeysEvent: 2 | def __init__(self, keys: str): 3 | self.keys = keys 4 | 5 | 6 | class ScreenshotEvent: 7 | def __init__(self, full_page: bool = False): 8 | self.full_page = full_page 9 | -------------------------------------------------------------------------------- /src/mcp_browser_use/mcp_browser_use.py: -------------------------------------------------------------------------------- 1 | """Public entry-points for backwards compatible imports.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .client import AgentNotRegisteredError, create_client_session 6 | 7 | __all__ = ["AgentNotRegisteredError", "create_client_session"] 8 | 
-------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/__init__.py: -------------------------------------------------------------------------------- 1 | from .. import BrowserSession as Browser # noqa: F401 2 | from .events import SendKeysEvent # noqa: F401 3 | from .profile import BrowserProfile, ProxySettings # noqa: F401 4 | 5 | __all__ = ["Browser", "BrowserProfile", "ProxySettings", "SendKeysEvent"] 6 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/message_manager/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, List 3 | 4 | @dataclass 5 | class MessageHistory: 6 | messages: List[Any] = field(default_factory=list) 7 | total_tokens: int = 0 8 | 9 | @dataclass 10 | class ManagedMessage: 11 | message: Any 12 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/profile.py: -------------------------------------------------------------------------------- 1 | class ProxySettings: 2 | def __init__(self, **kwargs): 3 | for key, value in kwargs.items(): 4 | setattr(self, key, value) 5 | 6 | 7 | class BrowserProfile: 8 | def __init__(self, **kwargs): 9 | for key, value in kwargs.items(): 10 | setattr(self, key, value) 11 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/browser/context.py: -------------------------------------------------------------------------------- 1 | class BrowserContextConfig: 2 | def __init__(self, **kwargs): 3 | for key, value in kwargs.items(): 4 | setattr(self, key, value) 5 | 6 | 7 | class BrowserContext: 8 | async def get_state(self, *args, **kwargs): 9 | pass 10 | 11 | async def close(self): 12 | pass 13 | -------------------------------------------------------------------------------- 
/tests/stubs/browser_use/browser/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class BrowserStateHistory: 5 | url: str = "" 6 | title: str = "" 7 | tabs: list = None 8 | interacted_element: list = None 9 | screenshot: str | None = None 10 | 11 | @dataclass 12 | class BrowserState: 13 | screenshot: str | None = None 14 | -------------------------------------------------------------------------------- /tests/stubs/langchain_core/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | class ChatPromptTemplate: 2 | @staticmethod 3 | def from_messages(msgs): 4 | return ChatPromptTemplate() 5 | def __or__(self, other): 6 | return self 7 | def invoke(self, data): 8 | return '' 9 | 10 | class MessagesPlaceholder: 11 | def __init__(self, variable_name=''): 12 | pass 13 | -------------------------------------------------------------------------------- /src/mcp_browser_use/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """MCP server for browser-use.""" 4 | 5 | from mcp_browser_use.mcp_browser_use import ( # noqa: F401 6 | AgentNotRegisteredError, 7 | create_client_session, 8 | ) 9 | from mcp_browser_use.server import app, launch_mcp_browser_use_server 10 | 11 | __all__ = [ 12 | "app", 13 | "launch_mcp_browser_use_server", 14 | "create_client_session", 15 | "AgentNotRegisteredError", 16 | ] 17 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # Check http://editorconfig.org for more information 2 | # This is the main config file for this project: 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | insert_final_newline = true 9 | indent_style = space 10 | indent_size = 2 11 | 
trim_trailing_whitespace = true 12 | 13 | [*.{py, pyi}] 14 | indent_style = space 15 | indent_size = 4 16 | 17 | [Makefile] 18 | indent_style = tab 19 | 20 | [*.md] 21 | trim_trailing_whitespace = false -------------------------------------------------------------------------------- /tests/stubs/browser_use/controller/service.py: -------------------------------------------------------------------------------- 1 | class _Registry: 2 | def get_prompt_description(self): 3 | return "" 4 | 5 | def create_action_model(self): 6 | return type("ActionModel", (), {}) 7 | 8 | def action(self, *_args, **_kwargs): 9 | def decorator(func): 10 | return func 11 | 12 | return decorator 13 | 14 | 15 | class Controller: 16 | def __init__(self): 17 | self.registry = _Registry() 18 | 19 | async def multi_act(self, actions, context): # pragma: no cover - stub 20 | return [] 21 | -------------------------------------------------------------------------------- /tests/stubs/langchain_openai/chat_models/__init__.py: -------------------------------------------------------------------------------- 1 | class Base: 2 | pass 3 | 4 | class ChatOpenAI: 5 | def __init__(self, *args, **kwargs): 6 | pass 7 | 8 | root_async_client = None 9 | model_name = 'mock' 10 | def with_structured_output(self, *args, **kwargs): 11 | return self 12 | async def ainvoke(self, *args, **kwargs): 13 | return {} 14 | 15 | 16 | class AzureChatOpenAI(ChatOpenAI): 17 | """Minimal stub mirroring the OpenAI chat client API.""" 18 | 19 | def __init__(self, *args, **kwargs): 20 | super().__init__(*args, **kwargs) 21 | 22 | -------------------------------------------------------------------------------- /tests/test_agent_state.py: -------------------------------------------------------------------------------- 1 | from mcp_browser_use.utils.agent_state import AgentState 2 | 3 | 4 | def test_agent_state_stop_flow(): 5 | state = AgentState() 6 | 7 | assert state.is_stop_requested() is False 8 | 9 | state.request_stop() 10 | 
assert state.is_stop_requested() is True 11 | 12 | state.clear_stop() 13 | assert state.is_stop_requested() is False 14 | 15 | 16 | def test_agent_state_last_valid_state_reset(): 17 | state = AgentState() 18 | 19 | marker = {"url": "https://example.com"} 20 | state.set_last_valid_state(marker) 21 | 22 | assert state.get_last_valid_state() == marker 23 | 24 | state.clear_stop() 25 | 26 | assert state.get_last_valid_state() is None 27 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/agent/views.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, List, Optional 3 | 4 | @dataclass 5 | class ActionResult: 6 | extracted_content: Optional[str] = None 7 | error: Optional[str] = None 8 | is_done: bool = False 9 | include_in_memory: bool = False 10 | 11 | @dataclass 12 | class AgentHistory: 13 | model_output: Any 14 | state: Any 15 | result: List[ActionResult] 16 | 17 | @dataclass 18 | class AgentHistoryList: 19 | history: List[AgentHistory] = field(default_factory=list) 20 | def is_done(self) -> bool: 21 | for h in self.history: 22 | for r in h.result: 23 | if r.is_done: 24 | return True 25 | return False 26 | 27 | @dataclass 28 | class AgentStepInfo: 29 | step_number: int = 0 30 | 31 | class AgentOutput: 32 | pass 33 | -------------------------------------------------------------------------------- /src/mcp_browser_use/utils/logging.py: -------------------------------------------------------------------------------- 1 | """Centralised logging configuration utilities for the MCP browser agent.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import os 7 | from typing import Optional 8 | 9 | 10 | _DEFAULT_FORMAT = "%(asctime)s | %(levelname)s | %(name)s | %(message)s" 11 | 12 | 13 | def _resolve_level(level_name: Optional[str]) -> int: 14 | """Translate a string level name into a numeric 
logging level.""" 15 | 16 | if not level_name: 17 | return logging.INFO 18 | 19 | try: 20 | return int(level_name) 21 | except ValueError: 22 | resolved = logging.getLevelName(level_name.upper()) 23 | if isinstance(resolved, int): 24 | return resolved 25 | return logging.INFO 26 | def configure_logging() -> None: 27 | """Configure the root logger once for the application.""" 28 | 29 | level = _resolve_level(os.getenv("LOG_LEVEL")) 30 | 31 | root_logger = logging.getLogger() 32 | if not root_logger.handlers: 33 | logging.basicConfig(level=level, format=_DEFAULT_FORMAT) 34 | else: 35 | root_logger.setLevel(level) 36 | -------------------------------------------------------------------------------- /src/mcp_browser_use/utils/agent_state.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | If we plan to scale or have multiple agents, we might remove the singleton pattern or differentiate them by agent ID. 6 | """ 7 | 8 | import asyncio 9 | from typing import Any, Optional 10 | 11 | 12 | class AgentState: 13 | """ 14 | Tracks an asynchronous stop signal and stores the last valid browser state. 15 | 16 | request_stop() sets an asyncio.Event, is_stop_requested() checks if it's set, 17 | clear_stop() resets the event and last_valid_state. 
18 | """ 19 | 20 | def __init__(self) -> None: 21 | self._stop_requested = asyncio.Event() 22 | self._last_valid_state: Optional[Any] = None 23 | 24 | def request_stop(self) -> None: 25 | self._stop_requested.set() 26 | 27 | def clear_stop(self) -> None: 28 | self._stop_requested.clear() 29 | self._last_valid_state = None 30 | 31 | def is_stop_requested(self) -> bool: 32 | return self._stop_requested.is_set() 33 | 34 | def set_last_valid_state(self, state: Any) -> None: 35 | self._last_valid_state = state 36 | 37 | def get_last_valid_state(self) -> Optional[Any]: 38 | return self._last_valid_state 39 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "mcp_browser_use" 3 | version = "0.1.0" 4 | description = "This Python project is a FastAPI server implementing MCP Server protocol Browser automation via browser-use library." 5 | readme = "README.md" 6 | requires-python = ">=3.11" 7 | license = { text = "MIT" } 8 | classifiers = [ 9 | "Development Status :: 4 - Beta", 10 | "Programming Language :: Python :: 3", 11 | "Programming Language :: Python :: 3.11", 12 | "Operating System :: OS Independent", 13 | ] 14 | 15 | dependencies = [ 16 | "pydantic>=2.11.9", 17 | "uvicorn>=0.37.0", 18 | "browser-use>=0.7.9", 19 | "fastapi>=0.117.1", 20 | "fastmcp>=2.12.4", 21 | "instructor>=1.11.3", 22 | "langchain>=0.3.27", 23 | "langchain-google-genai>=2.1.1", 24 | "langchain-openai>=0.2.14", 25 | "langchain-anthropic>=0.3.20", 26 | "langchain-ollama>=0.2.2", 27 | "openai>=1.109.1", 28 | "pillow>=11.3.0", 29 | "python-dotenv>=1.1.1", 30 | "pyperclip>=1.11.0", 31 | ] 32 | 33 | [build-system] 34 | requires = ["hatchling"] 35 | build-backend = "hatchling.build" 36 | 37 | [tool.hatch.build.targets.wheel] 38 | packages = ["src/mcp_browser_use"] 39 | 40 | [project.scripts] 41 | mcp-browser-use = 
"mcp_browser_use.server:launch_mcp_browser_use_server" 42 | -------------------------------------------------------------------------------- /tests/test_logging_configuration.py: -------------------------------------------------------------------------------- 1 | """Smoke tests around module imports and logging configuration.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib 6 | import logging 7 | import sys 8 | from typing import Iterable 9 | 10 | import pytest 11 | 12 | 13 | MODULES_TO_TEST: Iterable[str] = ( 14 | "mcp_browser_use.controller.custom_controller", 15 | "mcp_browser_use.utils.utils", 16 | "mcp_browser_use.agent.custom_agent", 17 | "mcp_browser_use.agent.custom_message_manager", 18 | ) 19 | 20 | 21 | @pytest.mark.parametrize("module_name", MODULES_TO_TEST) 22 | def test_module_import_does_not_call_basic_config(module_name: str, monkeypatch) -> None: 23 | """Ensure importing project modules does not invoke ``logging.basicConfig``.""" 24 | 25 | # Import once so that shared third-party dependencies are cached. 26 | importlib.import_module(module_name) 27 | sys.modules.pop(module_name, None) 28 | 29 | calls: list[tuple[tuple[object, ...], dict[str, object]]] = [] 30 | 31 | def record_basic_config(*args: object, **kwargs: object) -> None: 32 | calls.append((args, kwargs)) 33 | 34 | monkeypatch.setattr(logging, "basicConfig", record_basic_config) 35 | 36 | importlib.import_module(module_name) 37 | 38 | assert calls == [], f"Module {module_name} should not call logging.basicConfig during import" 39 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Generated by https://smithery.ai. 
See: https://smithery.ai/docs/config#dockerfile 2 | # Use a Python image with uv pre-installed 3 | FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS uv 4 | 5 | # Install the project into /app 6 | WORKDIR /app 7 | 8 | # Enable bytecode compilation 9 | ENV UV_COMPILE_BYTECODE=1 10 | 11 | # Copy from the cache instead of linking since it's a mounted volume 12 | ENV UV_LINK_MODE=copy 13 | 14 | # Install the project's dependencies using the lockfile and settings 15 | RUN --mount=type=cache,target=/root/.cache/uv \ 16 | --mount=type=bind,source=uv.lock,target=uv.lock \ 17 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 18 | uv sync --frozen --no-install-project --no-dev --no-editable 19 | 20 | # Then, add the rest of the project source code and install it 21 | # Installing separately from its dependencies allows optimal layer caching 22 | ADD . /app 23 | RUN --mount=type=cache,target=/root/.cache/uv \ 24 | uv sync --frozen --no-dev --no-editable 25 | 26 | FROM python:3.13-slim-bookworm 27 | 28 | WORKDIR /app 29 | 30 | COPY --from=uv /root/.local /root/.local 31 | COPY --from=uv --chown=app:app /app/.venv /app/.venv 32 | 33 | # Place executables in the environment at the front of the path 34 | ENV PATH="/app/.venv/bin:$PATH" 35 | 36 | # when running the container, add --db-path and a bind mount to the host's db file 37 | ENTRYPOINT ["mcp-browser-use"] 38 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test fixtures and environment setup for the test suite.""" 2 | 3 | import importlib 4 | import os 5 | import sys 6 | import types 7 | 8 | BASE_DIR = os.path.dirname(__file__) 9 | STUBS_DIR = os.path.join(BASE_DIR, "stubs") 10 | SRC_DIR = os.path.join(os.path.dirname(BASE_DIR), "src") 11 | 12 | for path in (STUBS_DIR, SRC_DIR): 13 | if path not in sys.path: 14 | sys.path.insert(0, path) 15 | 16 | if 
"langchain_openai" not in sys.modules: 17 | importlib.import_module("langchain_openai") 18 | 19 | if "langchain_anthropic" not in sys.modules: 20 | module = types.ModuleType("langchain_anthropic") 21 | 22 | class ChatAnthropic: # type: ignore[too-many-ancestors] 23 | def __init__(self, *args, **kwargs): 24 | pass 25 | 26 | module.ChatAnthropic = ChatAnthropic 27 | sys.modules["langchain_anthropic"] = module 28 | 29 | if "langchain_google_genai" not in sys.modules: 30 | module = types.ModuleType("langchain_google_genai") 31 | 32 | class ChatGoogleGenerativeAI: # type: ignore[too-many-ancestors] 33 | def __init__(self, *args, **kwargs): 34 | pass 35 | 36 | module.ChatGoogleGenerativeAI = ChatGoogleGenerativeAI 37 | sys.modules["langchain_google_genai"] = module 38 | 39 | if "langchain_ollama" not in sys.modules: 40 | module = types.ModuleType("langchain_ollama") 41 | 42 | class ChatOllama: # type: ignore[too-many-ancestors] 43 | def __init__(self, *args, **kwargs): 44 | pass 45 | 46 | module.ChatOllama = ChatOllama 47 | sys.modules["langchain_ollama"] = module 48 | 49 | -------------------------------------------------------------------------------- /tests/test_gif_creation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import base64 4 | import io 5 | 6 | # Add stub package path before importing CustomAgent 7 | BASE_DIR = os.path.dirname(__file__) 8 | sys.path.insert(0, os.path.join(BASE_DIR, "stubs")) 9 | sys.path.insert(0, os.path.join(os.path.dirname(BASE_DIR), "src")) 10 | 11 | from PIL import Image 12 | 13 | from mcp_browser_use.agent.custom_agent import CustomAgent 14 | from browser_use.agent.views import AgentHistoryList, AgentHistory, ActionResult 15 | from browser_use.browser.views import BrowserStateHistory 16 | 17 | 18 | class DummyState: 19 | def __init__(self, thought: str): 20 | self.current_state = type("Brain", (), {"thought": thought})() 21 | 22 | 23 | def create_screenshot() -> 
str: 24 | img = Image.new("RGB", (100, 100), color="white") 25 | buf = io.BytesIO() 26 | img.save(buf, format="PNG") 27 | return base64.b64encode(buf.getvalue()).decode("utf-8") 28 | 29 | 30 | def test_create_history_gif(tmp_path): 31 | screenshot = create_screenshot() 32 | hist = AgentHistoryList( 33 | history=[ 34 | AgentHistory( 35 | model_output=DummyState("step one"), 36 | state=BrowserStateHistory(screenshot=screenshot), 37 | result=[ActionResult(is_done=False)], 38 | ), 39 | AgentHistory( 40 | model_output=DummyState("step two"), 41 | state=BrowserStateHistory(screenshot=screenshot), 42 | result=[ActionResult(is_done=True)], 43 | ), 44 | ] 45 | ) 46 | 47 | agent = CustomAgent.__new__(CustomAgent) 48 | agent.history = hist 49 | agent.task = "My Task" 50 | 51 | output_gif = tmp_path / "out.gif" 52 | agent.create_history_gif(output_path=str(output_gif)) 53 | 54 | assert output_gif.exists() 55 | -------------------------------------------------------------------------------- /tests/test_browser_manager.py: -------------------------------------------------------------------------------- 1 | """Tests for browser manager environment configuration helpers.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib 6 | 7 | import pytest 8 | 9 | 10 | browser_manager = importlib.import_module( 11 | "mcp_browser_use.browser.browser_manager" 12 | ) 13 | 14 | 15 | @pytest.fixture(autouse=True) 16 | def clear_browser_env(monkeypatch): 17 | """Ensure browser-related environment variables do not leak between tests.""" 18 | 19 | for key in ( 20 | "BROWSER_USE_CDP_URL", 21 | "CHROME_DEBUGGING_HOST", 22 | "CHROME_DEBUGGING_PORT", 23 | ): 24 | monkeypatch.delenv(key, raising=False) 25 | 26 | 27 | def test_from_env_derives_cdp_url_from_debugging(monkeypatch): 28 | """When only debugging env vars are set, derive a CDP URL automatically.""" 29 | 30 | monkeypatch.setenv("CHROME_DEBUGGING_HOST", "debug.example") 31 | monkeypatch.setenv("CHROME_DEBUGGING_PORT", "1337") 32 
| 33 | config = browser_manager.BrowserEnvironmentConfig.from_env() 34 | 35 | assert config.cdp_url == "http://debug.example:1337" 36 | 37 | 38 | def test_create_browser_session_preserves_computed_cdp_url(monkeypatch): 39 | """Computed CDP URL is passed to BrowserSession when overrides omit it.""" 40 | 41 | monkeypatch.setenv("CHROME_DEBUGGING_HOST", "localhost") 42 | monkeypatch.setenv("CHROME_DEBUGGING_PORT", "9000") 43 | 44 | captured_kwargs: dict[str, object] = {} 45 | 46 | class DummyBrowserSession: 47 | def __init__(self, **kwargs): 48 | captured_kwargs.update(kwargs) 49 | 50 | monkeypatch.setattr(browser_manager, "BrowserSession", DummyBrowserSession) 51 | 52 | session = browser_manager.create_browser_session() 53 | 54 | assert isinstance(session, DummyBrowserSession) 55 | assert captured_kwargs["cdp_url"] == "http://localhost:9000" 56 | -------------------------------------------------------------------------------- /tests/stubs/PIL/__init__.py: -------------------------------------------------------------------------------- 1 | class DummyImage: 2 | def __init__(self, width=100, height=100): 3 | self.width = width 4 | self.height = height 5 | self.mode = "RGBA" 6 | 7 | @property 8 | def size(self): 9 | return (self.width, self.height) 10 | 11 | def convert(self, mode): 12 | self.mode = mode 13 | return self 14 | 15 | def resize(self, size, resample=None): 16 | self.width, self.height = size 17 | return self 18 | 19 | def save(self, fp, *args, **kwargs): 20 | if hasattr(fp, "write"): 21 | fp.write(b"dummy") 22 | else: 23 | with open(fp, "wb") as f: 24 | f.write(b"dummy") 25 | 26 | def alpha_composite(self, other): 27 | pass 28 | 29 | def paste(self, img, pos, mask=None): 30 | pass 31 | 32 | 33 | class Image: 34 | @staticmethod 35 | def open(fp): 36 | return DummyImage() 37 | 38 | @staticmethod 39 | def new(mode, size, color=(0, 0, 0, 0)): 40 | return DummyImage(*size) 41 | 42 | Resampling = type("Resampling", (), {"LANCZOS": 0}) 43 | Image = DummyImage 44 | 
45 | 46 | class ImageDraw: 47 | class Draw: 48 | def __init__(self, img): 49 | pass 50 | 51 | def text(self, *args, **kwargs): 52 | pass 53 | 54 | def rectangle(self, *args, **kwargs): 55 | pass 56 | 57 | def textbbox(self, xy, text, font=None): 58 | # return left, top, right, bottom 59 | return (0, 0, len(text) * 10, 10) 60 | 61 | def textlength(self, text, font=None): 62 | return len(text) * 10 63 | 64 | ImageDraw = Draw 65 | 66 | 67 | class ImageFont: 68 | class FreeTypeFont: 69 | pass 70 | 71 | @staticmethod 72 | def truetype(font, size): 73 | return ImageFont.FreeTypeFont() 74 | 75 | @staticmethod 76 | def load_default(): 77 | return ImageFont.FreeTypeFont() 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ 116 | 117 | # ignore the database 118 | *.db 119 | 120 | # ignore vscode settings 121 | .vscode/ 122 | 123 | # Project Files 124 | /*.json 125 | target/ 126 | dbt_packages/ 127 | dbt_packages/* 128 | logs/ 129 | /secrets/* 130 | #mac pc specific - system configuration files 131 | .DS_Store 132 | -------------------------------------------------------------------------------- /tests/test_custom_agent_controller.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | BASE_DIR = os.path.dirname(__file__) 5 | sys.path.insert(0, os.path.join(BASE_DIR, "stubs")) 6 | sys.path.insert(0, os.path.join(os.path.dirname(BASE_DIR), "src")) 7 | 
8 | import pytest 9 | from langchain_core.language_models.chat_models import BaseChatModel 10 | from unittest.mock import Mock 11 | 12 | import mcp_browser_use.agent.custom_agent as custom_agent_module 13 | 14 | 15 | @pytest.fixture 16 | def custom_agent(monkeypatch): 17 | class DummyMessageManager: 18 | def __init__(self, *args, **kwargs): 19 | pass 20 | 21 | monkeypatch.setattr( 22 | custom_agent_module, 23 | "CustomMassageManager", 24 | DummyMessageManager, 25 | ) 26 | 27 | def fake_agent_init(self, *args, **kwargs): 28 | for key, value in kwargs.items(): 29 | setattr(self, key, value) 30 | # Set attributes not passed in kwargs that are needed 31 | self.n_steps = 0 32 | self._last_result = None 33 | self.message_manager = None 34 | self.history = None 35 | self.generate_gif = False 36 | 37 | monkeypatch.setattr(custom_agent_module.Agent, "__init__", fake_agent_init) 38 | 39 | return custom_agent_module 40 | 41 | 42 | def test_custom_agent_creates_independent_default_controllers( 43 | custom_agent, monkeypatch 44 | ): 45 | controllers = [] 46 | 47 | class TrackingController(custom_agent.Controller): 48 | def __init__(self): 49 | super().__init__() 50 | controllers.append(self) 51 | 52 | monkeypatch.setattr(custom_agent, "Controller", TrackingController) 53 | 54 | llm = Mock(spec=BaseChatModel) 55 | agent_one = custom_agent.CustomAgent(task="Task one", llm=llm) 56 | agent_two = custom_agent.CustomAgent(task="Task two", llm=llm) 57 | 58 | assert agent_one.controller is not agent_two.controller 59 | assert controllers == [agent_one.controller, agent_two.controller] 60 | 61 | 62 | def test_custom_agent_uses_supplied_controller(custom_agent): 63 | llm = Mock(spec=BaseChatModel) 64 | provided_controller = custom_agent.Controller() 65 | 66 | agent = custom_agent.CustomAgent( 67 | task="Task with supplied controller", 68 | llm=llm, 69 | controller=provided_controller, 70 | ) 71 | 72 | assert agent.controller is provided_controller 73 | 
-------------------------------------------------------------------------------- /sample.env.env: -------------------------------------------------------------------------------- 1 | # --------------------------- 2 | # API Keys (Replace as needed) 3 | # --------------------------- 4 | OPENAI_API_KEY=your_openai_api_key_here 5 | ANTHROPIC_API_KEY=your_anthropic_api_key_here 6 | GOOGLE_API_KEY=your_google_api_key_here 7 | AZURE_OPENAI_API_KEY=your_azure_api_key_here 8 | DEEPSEEK_API_KEY=your_deepseek_api_key_here 9 | 10 | # ---------------------------------- 11 | # Model Provider & Endpoint Settings 12 | # ---------------------------------- 13 | # Typical endpoints; change to match your usage. 14 | OPENAI_ENDPOINT=https://api.openai.com/v1 15 | ANTHROPIC_API_ENDPOINT=https://api.anthropic.com 16 | AZURE_OPENAI_ENDPOINT=https://your-azure-openai-endpoint 17 | DEEPSEEK_ENDPOINT=https://api.deepseek.com 18 | 19 | # --------------------------- 20 | # Model & Agent Configuration 21 | # --------------------------- 22 | # Choose one provider: "openai", "anthropic", "azure_openai", "deepseek", "gemini", "ollama". 23 | MCP_MODEL_PROVIDER=anthropic 24 | MCP_MODEL_NAME=claude-3-5-sonnet-20241022 25 | MCP_TEMPERATURE=0.3 26 | MCP_MAX_STEPS=30 27 | MCP_MAX_ACTIONS_PER_STEP=5 28 | MCP_USE_VISION=true 29 | MCP_TOOL_CALL_IN_CONTENT=true 30 | 31 | # --------------------------------- 32 | # Chrome / Playwright Configuration 33 | # --------------------------------- 34 | # If CHROME_PATH is set, the code will attempt to launch a locally installed Chrome 35 | # with remote debugging on port 9222. 36 | # If left empty, it will launch a standard Chromium instance via Playwright. 
37 | 38 | CHROME_PATH=/path/to/your/chrome/binary 39 | CHROME_USER_DATA=/path/to/your/chrome-profile 40 | CHROME_DEBUGGING_PORT=9222 41 | CHROME_DEBUGGING_HOST=localhost 42 | CHROME_PERSISTENT_SESSION=false 43 | 44 | # You can add extra flags in your code if needed: 45 | # Example: export CHROME_EXTRA_ARGS="--some-chrome-flag" 46 | 47 | # -------------- 48 | # Other Settings 49 | # -------------- 50 | # Adjust HEADLESS or DISABLE_SECURITY if your code checks them. 51 | # By default, you might keep them out or set them in the code itself. 52 | 53 | # HEADLESS=false 54 | # DISABLE_SECURITY=false 55 | 56 | # ------------- 57 | # Example Usage 58 | # ------------- 59 | # Load this file with: 60 | # source .env 61 | # or use a library like python-dotenv or uv to manage environment variables. 62 | 63 | # Note: In production or multi-user environments, never commit real API keys 64 | # or share them publicly. Instead use a secrets manager or encrypted storage. 65 | -------------------------------------------------------------------------------- /src/mcp_browser_use/client.py: -------------------------------------------------------------------------------- 1 | """Client helpers for interacting with the in-process FastMCP server.""" 2 | 3 | from __future__ import annotations 4 | 5 | from contextlib import asynccontextmanager 6 | from typing import Any, AsyncIterator, Callable, Optional 7 | 8 | from fastmcp.client import Client 9 | 10 | from .server import app 11 | 12 | 13 | class AgentNotRegisteredError(RuntimeError): 14 | """Error raised when attempting to control an agent that is not running.""" 15 | 16 | 17 | @asynccontextmanager 18 | async def create_client_session( 19 | client: Optional[Client] = None, 20 | *, 21 | client_factory: Optional[Callable[[], Client]] = None, 22 | **client_kwargs: Any, 23 | ) -> AsyncIterator[Client]: 24 | """Create an asynchronous context manager for interacting with the server. 
25 | 26 | Parameters 27 | ---------- 28 | client: 29 | An existing :class:`fastmcp.client.Client` instance. If provided, the 30 | caller is responsible for its configuration. ``client_kwargs`` must not 31 | be supplied in this case. 32 | client_factory: 33 | Optional callable used to lazily construct a client. This is useful in 34 | testing where a lightweight stub client might be injected. If provided, 35 | the callable is invoked with no arguments and ``client_kwargs`` must not 36 | be supplied. 37 | **client_kwargs: 38 | Additional keyword arguments forwarded to :class:`fastmcp.client.Client` 39 | when neither ``client`` nor ``client_factory`` is provided. 40 | 41 | Yields 42 | ------ 43 | Client 44 | A connected FastMCP client ready for use within the context manager. 45 | """ 46 | 47 | if client is not None and client_factory is not None: 48 | raise ValueError("Provide either 'client' or 'client_factory', not both.") 49 | 50 | if client is not None and client_kwargs: 51 | raise ValueError( 52 | "'client_kwargs' cannot be used when an explicit client instance is provided." 
53 | ) 54 | 55 | if client_factory is not None and client_kwargs: 56 | raise ValueError("'client_kwargs' cannot be combined with 'client_factory'.") 57 | 58 | if client is not None: 59 | session_client = client 60 | elif client_factory is not None: 61 | session_client = client_factory() 62 | else: 63 | session_client = Client(app, **client_kwargs) 64 | 65 | async with session_client as connected_client: 66 | yield connected_client 67 | -------------------------------------------------------------------------------- /tests/stubs/browser_use/__init__.py: -------------------------------------------------------------------------------- 1 | class _DummyEvent: 2 | def __await__(self): 3 | async def _noop(): 4 | return None 5 | 6 | return _noop().__await__() 7 | 8 | async def event_result(self, *args, **kwargs): # pragma: no cover - stub method 9 | return None 10 | 11 | 12 | class _DummyEventBus: 13 | def dispatch(self, event): # noqa: D401 - simple stub 14 | return _DummyEvent() 15 | 16 | 17 | class BrowserPage: 18 | def __init__(self, **kwargs): 19 | for key, value in kwargs.items(): 20 | setattr(self, key, value) 21 | self.event_bus = _DummyEventBus() 22 | 23 | async def close(self) -> None: # pragma: no cover - stub method 24 | return None 25 | 26 | 27 | class Browser: 28 | """Lightweight stub mirroring the public Browser API used in tests.""" 29 | 30 | def __init__(self, **kwargs): 31 | for key, value in kwargs.items(): 32 | setattr(self, key, value) 33 | self._pages: list[BrowserPage] = [] 34 | self._started = False 35 | 36 | async def start(self): # pragma: no cover - stub method 37 | self._started = True 38 | return self 39 | 40 | async def stop(self): # pragma: no cover - stub method 41 | self._started = False 42 | return None 43 | 44 | async def new_page(self, **kwargs): 45 | page = BrowserPage(**kwargs) 46 | self._pages.append(page) 47 | return page 48 | 49 | async def close(self): # pragma: no cover - compatibility alias 50 | return await self.stop() 51 | 52 | 
53 | class BrowserSession(Browser): # pragma: no cover - stub class 54 | async def kill(self): # pragma: no cover - stub method 55 | return await self.stop() 56 | 57 | 58 | class BrowserProfile: # pragma: no cover - stub class 59 | def __init__(self, **kwargs): 60 | for key, value in kwargs.items(): 61 | setattr(self, key, value) 62 | self.event_bus = _DummyEventBus() 63 | 64 | async def kill(self) -> None: # pragma: no cover - stub method 65 | return None 66 | 67 | 68 | class BrowserProfile: # pragma: no cover - stub class 69 | def __init__(self, **kwargs): 70 | for key, value in kwargs.items(): 71 | setattr(self, key, value) 72 | 73 | 74 | class ProxySettings: # pragma: no cover - stub class 75 | def __init__(self, **kwargs): 76 | for key, value in kwargs.items(): 77 | setattr(self, key, value) 78 | 79 | 80 | # Alias maintained for compatibility with production package 81 | Browser = BrowserSession 82 | -------------------------------------------------------------------------------- /src/mcp_browser_use/controller/custom_controller.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import sys 5 | 6 | import pyperclip 7 | from browser_use import BrowserSession 8 | from browser_use.agent.views import ActionResult 9 | from browser_use.controller.service import Controller 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class CustomController(Controller): 15 | """ 16 | A custom controller registering two clipboard actions: copy and paste. 17 | """ 18 | 19 | def __init__(self): 20 | super().__init__() 21 | self._register_custom_actions() 22 | 23 | def _register_custom_actions(self) -> None: 24 | """Register all custom browser actions for this controller.""" 25 | 26 | @self.registry.action("Copy text to clipboard") 27 | def copy_to_clipboard(text: str) -> ActionResult: 28 | """ 29 | Copy the given text to the system's clipboard. 
30 | Returns an ActionResult with the same text as extracted_content. 31 | """ 32 | try: 33 | pyperclip.copy(text) 34 | # Be cautious about logging the actual text, if sensitive 35 | logger.debug("Copied text to clipboard.") 36 | return ActionResult(extracted_content=text) 37 | except Exception as e: 38 | logger.error(f"Error copying text to clipboard: {e}") 39 | return ActionResult(error=str(e), extracted_content=None) 40 | 41 | @self.registry.action("Paste text from clipboard", requires_browser=True) 42 | async def paste_from_clipboard(browser_session: BrowserSession) -> ActionResult: 43 | """ 44 | Paste whatever is currently in the system's clipboard 45 | into the active browser page by using the send_keys tool. 46 | """ 47 | try: 48 | text = pyperclip.paste() 49 | except Exception as e: 50 | logger.error(f"Error reading text from clipboard: {e}") 51 | return ActionResult(error=str(e), extracted_content=None) 52 | 53 | try: 54 | modifier = "meta" if sys.platform == "darwin" else "ctrl" 55 | # Use the documented tool via the registry 56 | await self.registry.execute_action( 57 | "send_keys", 58 | {"keys": f"{modifier}+v"}, 59 | browser_session=browser_session, 60 | ) 61 | logger.debug("Triggered paste shortcut inside the browser session.") 62 | return ActionResult(extracted_content=text) 63 | except Exception as e: 64 | logger.error(f"Error pasting text into the browser session: {e}") 65 | return ActionResult(error=str(e), extracted_content=None) 66 | -------------------------------------------------------------------------------- /src/mcp_browser_use/agent/custom_views.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from dataclasses import dataclass 4 | from typing import List, Type 5 | 6 | from browser_use.agent.views import AgentOutput 7 | from browser_use.controller.registry.views import ActionModel 8 | from pydantic import BaseModel, ConfigDict, Field, create_model 9 | 10 | 11 | 
@dataclass 12 | class CustomAgentStepInfo: 13 | """ 14 | Holds metadata about a single step of the agent's execution. 15 | 16 | :param step_number: Which step number we're currently on. 17 | :param max_steps: Total maximum steps before we stop. 18 | :param task: The primary task assigned to the agent. 19 | :param add_infos: Additional contextual info or instructions. 20 | :param memory: Cumulative memory or context from previous steps. 21 | :param task_progress: Text describing progress toward the task goal. 22 | """ 23 | 24 | step_number: int 25 | max_steps: int 26 | task: str 27 | add_infos: str 28 | memory: str 29 | task_progress: str 30 | 31 | 32 | class CustomAgentBrain(BaseModel): 33 | """ 34 | Represents the agent's 'thinking' or ephemeral state during processing. 35 | 36 | :param prev_action_evaluation: String evaluation of the last action performed (success/failure). 37 | :param important_contents: Key points or memory extracted from the environment. 38 | :param completed_contents: Completed portion of the task so far. 39 | :param thought: Agent's internal reasoning or thought process text. 40 | :param summary: Short summary of the agent's current state or progress. 41 | """ 42 | 43 | prev_action_evaluation: str 44 | important_contents: str 45 | completed_contents: str 46 | thought: str 47 | summary: str 48 | 49 | 50 | class CustomAgentOutput(AgentOutput): 51 | """ 52 | Output model for the agent. Extended at runtime with custom actions 53 | by 'type_with_custom_actions'. 54 | """ 55 | 56 | model_config = ConfigDict(arbitrary_types_allowed=True) 57 | 58 | current_state: CustomAgentBrain 59 | action: List[ActionModel] 60 | 61 | @staticmethod 62 | def type_with_custom_actions( 63 | custom_actions: Type[ActionModel], 64 | ) -> Type["CustomAgentOutput"]: 65 | """ 66 | Create a new Pydantic model that inherits from CustomAgentOutput 67 | but redefines the 'action' field to be a list of the given 68 | custom action model. 
69 | 70 | :param custom_actions: The action model type from the controller registry. 71 | :return: A new Pydantic model class based on CustomAgentOutput. 72 | """ 73 | return create_model( 74 | # Could rename to something more specific if needed 75 | "AgentOutput", 76 | __base__=CustomAgentOutput, 77 | action=(List[custom_actions], Field(...)), 78 | __module__=CustomAgentOutput.__module__, 79 | ) 80 | -------------------------------------------------------------------------------- /smithery.yaml: -------------------------------------------------------------------------------- 1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml 2 | 3 | startCommand: 4 | type: stdio 5 | configSchema: 6 | # JSON Schema defining the configuration options for the MCP. 7 | type: object 8 | required: 9 | - openaiApiKey 10 | - anthropicApiKey 11 | - mcpModelProvider 12 | - mcpModelName 13 | properties: 14 | openaiApiKey: 15 | type: string 16 | description: API key for OpenAI services. 17 | anthropicApiKey: 18 | type: string 19 | description: API key for Anthropic services. 20 | googleApiKey: 21 | type: string 22 | description: API key for Google services (optional). 23 | azureOpenaiEndpoint: 24 | type: string 25 | description: Azure OpenAI endpoint (optional). 26 | azureOpenaiApiKey: 27 | type: string 28 | description: Azure OpenAI API key (optional). 29 | chromePath: 30 | type: string 31 | description: Path to Chrome executable (optional). 32 | chromeUserData: 33 | type: string 34 | description: Path to Chrome user data directory (optional). 35 | chromeDebuggingPort: 36 | type: string 37 | default: "9222" 38 | description: Chrome debugging port. Default is 9222. 39 | chromeDebuggingHost: 40 | type: string 41 | default: localhost 42 | description: Chrome debugging host. Default is localhost. 43 | chromePersistentSession: 44 | type: boolean 45 | default: false 46 | description: Keep browser open between tasks. 
47 | mcpModelProvider: 48 | type: string 49 | description: Model provider (e.g., anthropic, openai). 50 | mcpModelName: 51 | type: string 52 | description: Model name. 53 | mcpTemperature: 54 | type: number 55 | default: 0.3 56 | description: Model temperature. 57 | mcpMaxSteps: 58 | type: number 59 | default: 30 60 | description: Max steps for model. 61 | mcpUseVision: 62 | type: boolean 63 | default: true 64 | description: Use vision capabilities. 65 | mcpMaxActionsPerStep: 66 | type: number 67 | default: 5 68 | description: Max actions per step. 69 | commandFunction: 70 | # A function that produces the CLI command to start the MCP on stdio. 71 | |- 72 | (config) => ({ command: 'uv', args: ['run', 'mcp-browser-use'], env: { OPENAI_API_KEY: config.openaiApiKey, ANTHROPIC_API_KEY: config.anthropicApiKey, GOOGLE_API_KEY: config.googleApiKey, AZURE_OPENAI_ENDPOINT: config.azureOpenaiEndpoint, AZURE_OPENAI_API_KEY: config.azureOpenaiApiKey, CHROME_PATH: config.chromePath, CHROME_USER_DATA: config.chromeUserData, CHROME_DEBUGGING_PORT: config.chromeDebuggingPort || '9222', CHROME_DEBUGGING_HOST: config.chromeDebuggingHost || 'localhost', CHROME_PERSISTENT_SESSION: config.chromePersistentSession, MCP_MODEL_PROVIDER: config.mcpModelProvider, MCP_MODEL_NAME: config.mcpModelName, MCP_TEMPERATURE: config.mcpTemperature || 0.3, MCP_MAX_STEPS: config.mcpMaxSteps || 30, MCP_USE_VISION: config.mcpUseVision, MCP_MAX_ACTIONS_PER_STEP: config.mcpMaxActionsPerStep || 5 } }) 73 | -------------------------------------------------------------------------------- /tests/test_summarize_messages.py: -------------------------------------------------------------------------------- 1 | from langchain_core.messages import AIMessage, HumanMessage, SystemMessage 2 | 3 | import mcp_browser_use.agent.custom_agent as custom_agent_module 4 | from mcp_browser_use.agent.custom_agent import CustomAgent 5 | from browser_use.agent.message_manager.views import MessageHistory, ManagedMessage 6 | 7 | 8 | 
class FakeLLM: 9 | def __init__(self, content: str = "Conversation summary"): 10 | self.calls = [] 11 | self._content = content 12 | 13 | def invoke(self, input, **kwargs): 14 | self.calls.append(input) 15 | message = AIMessage(content=self._content) 16 | return message 17 | 18 | def __call__(self, input, **kwargs): 19 | return self.invoke(input, **kwargs) 20 | 21 | 22 | class DummyMessageManager: 23 | def __init__(self, extra_messages: int = 6): 24 | self.system_prompt = SystemMessage(content="System instructions") 25 | self.example_tool_call = AIMessage(content="[]") 26 | self.example_tool_call.tool_calls = [] 27 | self.reset_calls = 0 28 | self.history = MessageHistory() 29 | self.reset_history() 30 | for idx in range(extra_messages): 31 | human = HumanMessage(content=f"User message {idx}") 32 | self._add_message_with_tokens(human) 33 | 34 | def get_messages(self): 35 | return [managed.message for managed in self.history.messages] 36 | 37 | def reset_history(self) -> None: 38 | self.reset_calls += 1 39 | self.history = MessageHistory() 40 | self.history.messages = [] 41 | if hasattr(self.history, "total_tokens"): 42 | self.history.total_tokens = 0 43 | self._add_message_with_tokens(self.system_prompt) 44 | self._add_message_with_tokens(self.example_tool_call) 45 | 46 | def _add_message_with_tokens(self, message): 47 | self.history.messages.append(ManagedMessage(message=message)) 48 | if hasattr(self.history, "total_tokens"): 49 | self.history.total_tokens += 1 50 | 51 | 52 | def test_summarize_messages_preserves_system_prompt(monkeypatch): 53 | class StubChain: 54 | def __init__(self, llm): 55 | self.llm = llm 56 | 57 | def invoke(self, data): 58 | return self.llm.invoke(data) 59 | 60 | class StubPrompt: 61 | def __or__(self, llm): 62 | return StubChain(llm) 63 | 64 | class StubChatPromptTemplate: 65 | @staticmethod 66 | def from_messages(messages): 67 | return StubPrompt() 68 | 69 | monkeypatch.setattr( 70 | custom_agent_module, 71 | "ChatPromptTemplate", 72 | 
StubChatPromptTemplate, 73 | ) 74 | 75 | agent = CustomAgent.__new__(CustomAgent) 76 | agent.llm = FakeLLM() 77 | agent.message_manager = DummyMessageManager() 78 | 79 | assert len(agent.message_manager.get_messages()) > 5 80 | # Ensure the initial reset was performed 81 | assert agent.message_manager.reset_calls == 1 82 | 83 | result = agent.summarize_messages() 84 | 85 | assert result is True 86 | assert agent.message_manager.reset_calls == 2 87 | 88 | history_messages = agent.message_manager.history.messages 89 | assert len(history_messages) == 3 90 | assert [entry.message for entry in history_messages[:2]] == [ 91 | agent.message_manager.system_prompt, 92 | agent.message_manager.example_tool_call, 93 | ] 94 | assert history_messages[2].message.content == "Conversation summary" 95 | if hasattr(agent.message_manager.history, "total_tokens"): 96 | assert agent.message_manager.history.total_tokens == len(history_messages) 97 | 98 | # Ensure the LLM was called with the conversation 99 | assert len(agent.llm.calls) == 1 100 | prompt_value = agent.llm.calls[0] 101 | assert isinstance(prompt_value, dict) 102 | assert "chat_history" in prompt_value 103 | -------------------------------------------------------------------------------- /tests/test_client_session.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | 5 | from mcp_browser_use import client as client_module 6 | from mcp_browser_use.client import AgentNotRegisteredError, create_client_session 7 | 8 | 9 | @pytest.fixture 10 | def anyio_backend(): 11 | return "asyncio" 12 | 13 | 14 | @pytest.mark.anyio("asyncio") 15 | async def test_create_client_session_uses_supplied_client(): 16 | events = [] 17 | 18 | class DummyClient: 19 | def __init__(self): 20 | self.connected = False 21 | 22 | async def __aenter__(self): 23 | events.append("enter") 24 | self.connected = True 25 | return self 26 | 27 | async def __aexit__(self, exc_type, exc, 
tb): 28 | events.append("exit") 29 | self.connected = False 30 | 31 | dummy = DummyClient() 32 | async with create_client_session(client=dummy) as session: 33 | assert session is dummy 34 | assert dummy.connected 35 | 36 | assert events == ["enter", "exit"] 37 | assert dummy.connected is False 38 | 39 | 40 | @pytest.mark.anyio("asyncio") 41 | async def test_create_client_session_accepts_factory(): 42 | events = [] 43 | 44 | class DummyClient: 45 | async def __aenter__(self): 46 | events.append("enter") 47 | return self 48 | 49 | async def __aexit__(self, exc_type, exc, tb): 50 | events.append("exit") 51 | 52 | async with create_client_session(client_factory=DummyClient) as session: 53 | assert isinstance(session, DummyClient) 54 | 55 | assert events == ["enter", "exit"] 56 | 57 | 58 | @pytest.mark.anyio("asyncio") 59 | async def test_create_client_session_rejects_mixed_arguments(): 60 | class DummyClient: 61 | async def __aenter__(self): 62 | return self 63 | 64 | async def __aexit__(self, exc_type, exc, tb): 65 | pass 66 | 67 | dummy = DummyClient() 68 | 69 | with pytest.raises(ValueError): 70 | async with create_client_session(client=dummy, timeout=5): 71 | pass 72 | 73 | with pytest.raises(ValueError): 74 | async with create_client_session(client_factory=DummyClient, timeout=5): 75 | pass 76 | 77 | with pytest.raises(ValueError): 78 | async with create_client_session(client=dummy, client_factory=DummyClient): 79 | pass 80 | 81 | 82 | @pytest.mark.anyio("asyncio") 83 | async def test_create_client_session_constructs_default_client(monkeypatch): 84 | created = {} 85 | 86 | class DummyClient: 87 | def __init__(self, app, **kwargs): 88 | created["app"] = app 89 | created["kwargs"] = kwargs 90 | 91 | async def __aenter__(self): 92 | created["entered"] = True 93 | return self 94 | 95 | async def __aexit__(self, exc_type, exc, tb): 96 | created["exited"] = True 97 | 98 | monkeypatch.setattr("mcp_browser_use.client.Client", DummyClient) 99 | 100 | async with 
create_client_session(timeout=5) as session: 101 | assert isinstance(session, DummyClient) 102 | 103 | assert created["app"] is client_module.app 104 | assert created["kwargs"] == {"timeout": 5} 105 | assert created["entered"] is True 106 | assert created["exited"] is True 107 | 108 | 109 | @pytest.mark.anyio("asyncio") 110 | async def test_create_client_session_kwargs_with_factory_raise(): 111 | class DummyClient: 112 | async def __aenter__(self): 113 | return self 114 | 115 | async def __aexit__(self, exc_type, exc, tb): 116 | pass 117 | 118 | kwargs = {"client_factory": DummyClient, "timeout": 10} 119 | 120 | with pytest.raises(ValueError): 121 | async with create_client_session(**kwargs): 122 | pass 123 | 124 | 125 | @pytest.mark.parametrize( 126 | "legacy_module", 127 | [ 128 | "mcp_browser", 129 | "mcp_browser.use", 130 | "mcp_browser.use.mcp_browser_use", 131 | ], 132 | ) 133 | def test_legacy_namespace_is_removed(legacy_module): 134 | with pytest.raises(ModuleNotFoundError): 135 | importlib.import_module(legacy_module) 136 | 137 | 138 | def test_exception_type(): 139 | assert issubclass(AgentNotRegisteredError, RuntimeError) 140 | -------------------------------------------------------------------------------- /src/mcp_browser_use/agent/custom_massage_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import annotations 4 | 5 | import copy 6 | import logging 7 | from typing import List, Optional, Type 8 | 9 | from browser_use.agent.message_manager.service import MessageManager 10 | from browser_use.agent.message_manager.views import MessageHistory 11 | from browser_use.agent.prompts import SystemPrompt 12 | from browser_use.agent.views import ActionResult, AgentStepInfo 13 | from browser_use.browser.views import BrowserState 14 | from langchain_core.language_models import BaseChatModel 15 | from langchain_core.messages import HumanMessage, AIMessage 16 | 17 | from 
mcp_browser_use.agent.custom_prompts import CustomAgentMessagePrompt 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class CustomMassageManager(MessageManager): 23 | def __init__( 24 | self, 25 | llm: BaseChatModel, 26 | task: str, 27 | action_descriptions: str, 28 | system_prompt_class: Type[SystemPrompt], 29 | max_input_tokens: int = 128000, 30 | estimated_tokens_per_character: int = 3, 31 | image_tokens: int = 800, 32 | include_attributes: list[str] = [], 33 | max_error_length: int = 400, 34 | max_actions_per_step: int = 10, 35 | tool_call_in_content: bool = False, 36 | ): 37 | super().__init__( 38 | llm=llm, 39 | task=task, 40 | action_descriptions=action_descriptions, 41 | system_prompt_class=system_prompt_class, 42 | max_input_tokens=max_input_tokens, 43 | estimated_tokens_per_character=estimated_tokens_per_character, 44 | image_tokens=image_tokens, 45 | include_attributes=include_attributes, 46 | max_error_length=max_error_length, 47 | max_actions_per_step=max_actions_per_step, 48 | tool_call_in_content=tool_call_in_content, 49 | ) 50 | 51 | # Store template for example tool call so we can rebuild the history when needed 52 | self.tool_call_in_content = tool_call_in_content 53 | self._example_tool_call_template = [ 54 | { 55 | "name": "CustomAgentOutput", 56 | "args": { 57 | "current_state": { 58 | "prev_action_evaluation": "Unknown - No previous actions to evaluate.", 59 | "important_contents": "", 60 | "completed_contents": "", 61 | "thought": "Now Google is open. 
Need to type OpenAI to search.", 62 | "summary": "Type OpenAI to search.", 63 | }, 64 | "action": [], 65 | }, 66 | "id": "", 67 | "type": "tool_call", 68 | } 69 | ] 70 | self.reset_history() 71 | 72 | def _create_example_tool_call_message(self) -> AIMessage: 73 | tool_calls = copy.deepcopy(self._example_tool_call_template) 74 | if self.tool_call_in_content: 75 | # openai throws error if tool_calls are not responded -> move to content 76 | return AIMessage( 77 | content=f"{tool_calls}", 78 | tool_calls=[], 79 | ) 80 | return AIMessage( 81 | content="", 82 | tool_calls=tool_calls, 83 | ) 84 | 85 | def reset_history(self) -> None: 86 | """Reset the message history to the initial seeded state.""" 87 | 88 | self.history = MessageHistory() 89 | if hasattr(self.history, "total_tokens"): 90 | self.history.total_tokens = 0 91 | 92 | self._add_message_with_tokens(self.system_prompt) 93 | self._add_message_with_tokens(self._create_example_tool_call_message()) 94 | 95 | def add_state_message( 96 | self, 97 | state: BrowserState, 98 | result: Optional[List[ActionResult]] = None, 99 | step_info: Optional[AgentStepInfo] = None, 100 | ) -> None: 101 | """Add browser state as human message""" 102 | 103 | # if keep in memory, add to directly to history and add state without result 104 | if result: 105 | for r in result: 106 | if r.include_in_memory: 107 | if r.extracted_content: 108 | msg = HumanMessage(content=str(r.extracted_content)) 109 | self._add_message_with_tokens(msg) 110 | if r.error: 111 | msg = HumanMessage( 112 | content=str(r.error)[-self.max_error_length :] 113 | ) 114 | self._add_message_with_tokens(msg) 115 | result = None # if result in history, we dont want to add it again 116 | 117 | # otherwise add state message and result to next message (which will not stay in memory) 118 | state_message = CustomAgentMessagePrompt( 119 | state, 120 | result, 121 | include_attributes=self.include_attributes, 122 | max_error_length=self.max_error_length, 123 | 
step_info=step_info, 124 | ).get_user_message() 125 | self._add_message_with_tokens(state_message) 126 | -------------------------------------------------------------------------------- /src/mcp_browser_use/server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from mcp_browser_use.utils.logging import configure_logging 4 | 5 | # It is critical to configure logging before any other modules are imported, 6 | # as they might initialize logging themselves. 7 | configure_logging() 8 | 9 | import asyncio 10 | import logging 11 | import os 12 | import sys 13 | import traceback 14 | from typing import Any, Optional 15 | 16 | from browser_use import Browser 17 | from fastmcp import FastMCP 18 | from mcp_browser_use.agent.custom_agent import CustomAgent 19 | from mcp_browser_use.controller.custom_controller import CustomController 20 | from mcp_browser_use.browser.browser_manager import create_browser_session 21 | from mcp_browser_use.utils import utils 22 | from mcp_browser_use.utils.agent_state import AgentState 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | app = FastMCP("mcp_browser_use") 27 | 28 | 29 | @app.tool() 30 | async def run_browser_agent(task: str, add_infos: str = "") -> str: 31 | """ 32 | This is the entrypoint for running a browser-based agent. 33 | 34 | :param task: The main instruction or goal for the agent. 35 | :param add_infos: Additional information or context for the agent. 36 | :return: The final result string from the agent run. 37 | """ 38 | 39 | browser_session: Optional[Browser] = None 40 | agent_state = AgentState() 41 | 42 | try: 43 | # Clear any previous agent stop signals 44 | agent_state.clear_stop() 45 | 46 | # Read environment variables with defaults and parse carefully 47 | # Fallback to defaults if parsing fails. 
48 | model_provider = os.getenv("MCP_MODEL_PROVIDER", "anthropic") 49 | model_name = os.getenv("MCP_MODEL_NAME", "claude-3-5-sonnet-20241022") 50 | 51 | def safe_float(env_var: str, default: float) -> float: 52 | """Safely parse a float from an environment variable.""" 53 | try: 54 | return float(os.getenv(env_var, str(default))) 55 | except ValueError: 56 | logger.warning(f"Invalid float for {env_var}, using default={default}") 57 | return default 58 | 59 | def safe_int(env_var: str, default: int) -> int: 60 | """Safely parse an int from an environment variable.""" 61 | try: 62 | return int(os.getenv(env_var, str(default))) 63 | except ValueError: 64 | logger.warning(f"Invalid int for {env_var}, using default={default}") 65 | return default 66 | 67 | # Get environment variables with defaults 68 | temperature = safe_float("MCP_TEMPERATURE", 0.3) 69 | max_steps = safe_int("MCP_MAX_STEPS", 30) 70 | use_vision = os.getenv("MCP_USE_VISION", "true").lower() == "true" 71 | max_actions_per_step = safe_int("MCP_MAX_ACTIONS_PER_STEP", 5) 72 | tool_call_in_content = ( 73 | os.getenv("MCP_TOOL_CALL_IN_CONTENT", "true").lower() == "true" 74 | ) 75 | 76 | # Prepare LLM 77 | llm = utils.get_llm_model( 78 | provider=model_provider, model_name=model_name, temperature=temperature 79 | ) 80 | 81 | # Create a fresh browser session for this run 82 | browser_session = create_browser_session() 83 | await browser_session.start() 84 | 85 | # Create controller and agent 86 | controller = CustomController() 87 | agent = CustomAgent( 88 | task=task, 89 | add_infos=add_infos, 90 | use_vision=use_vision, 91 | llm=llm, 92 | browser_session=browser_session, 93 | controller=controller, 94 | max_actions_per_step=max_actions_per_step, 95 | tool_call_in_content=tool_call_in_content, 96 | agent_state=agent_state, 97 | ) 98 | 99 | # Execute the agent task lifecycle 100 | history = await agent.execute_agent_task(max_steps=max_steps) 101 | 102 | # Extract final result from the agent's history 103 | 
final_result = history.final_result() 104 | if not final_result: 105 | final_result = f"No final result. Possibly incomplete. {history}" 106 | 107 | return final_result 108 | 109 | except Exception as e: 110 | logger.error("run-browser-agent error: %s", str(e)) 111 | raise ValueError(f"run-browser-agent error: {e}\n{traceback.format_exc()}") 112 | 113 | finally: 114 | # Always ensure cleanup, even if no error. 115 | try: 116 | agent_state.request_stop() 117 | except Exception as stop_error: 118 | logger.warning("Error stopping agent state: %s", stop_error) 119 | 120 | if browser_session: 121 | try: 122 | await browser_session.stop() 123 | except Exception as browser_error: 124 | logger.warning( 125 | "Failed to stop browser session gracefully, killing it: %s", 126 | browser_error, 127 | ) 128 | if hasattr(browser_session, "kill"): 129 | await browser_session.kill() 130 | 131 | 132 | def launch_mcp_browser_use_server() -> None: 133 | """ 134 | Entry point for running the FastMCP application. 135 | Handles server start and final resource cleanup. 136 | """ 137 | try: 138 | app.run() 139 | except Exception as e: 140 | logger.error("Error running MCP server: %s\n%s", e, traceback.format_exc()) 141 | 142 | 143 | if __name__ == "__main__": 144 | launch_mcp_browser_use_server() 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MCP Browser Use Server 2 | 3 | [![smithery badge](https://smithery.ai/badge/@JovaniPink/mcp-browser-use)](https://smithery.ai/server/@JovaniPink/mcp-browser-use) 4 | 5 | > Model Context Protocol (MCP) server that wires [browser-use](https://github.com/browser-use/browser-use) into Claude Desktop and other MCP compatible clients. 6 | 7 | Browser Use Server MCP server 8 | 9 | ## Overview 10 | 11 | This repository provides a production-ready wrapper around the `browser-use` automation engine. 
It exposes a single MCP tool (`run_browser_agent`) that orchestrates a browser session, executes the `browser-use` agent, and returns the final result back to the client. The refactored layout focuses on keeping configuration in one place, improving testability, and keeping `browser-use` upgrades isolated from MCP specific code. 12 | 13 | ### Key Capabilities 14 | 15 | - **Automated browsing** – Navigate, interact with forms, control tabs, capture screenshots, and read page content through natural-language instructions executed by `browser-use`. 16 | - **Agent lifecycle management** – `CustomAgent` wraps `browser-use`'s base agent to add history export, richer prompts, and consistent error handling across runs. 17 | - **Centralised browser configuration** – `create_browser_session` translates environment variables into a ready-to-use `BrowserSession`, enabling persistent profiles, proxies, and custom Chromium flags without touching the agent logic. 18 | - **FastMCP integration** – `server.py` registers the MCP tool, normalises configuration, and ensures the browser session is always cleaned up. 19 | - **Client helpers** – `client.py` includes async helpers for tests or other Python processes that wish to exercise the MCP server in-process. 20 | 21 | ### Project Structure 22 | 23 | ``` 24 | . 
25 | ├── documentation/ 26 | │ ├── CONFIGURATION.md # Detailed configuration reference 27 | │ └── SECURITY.md # Security considerations for running the server 28 | ├── .env.example # Example environment variables for local development 29 | ├── src/mcp_browser_use/ 30 | │ ├── agent/ # Custom agent, prompts, message history, and views 31 | │ ├── browser/ # Browser session factory and persistence helpers 32 | │ ├── controller/ # Custom controller extensions for clipboard actions 33 | │ ├── utils/ # LLM factory, agent state helpers, encoding utilities 34 | │ ├── client.py # Async helper for connecting to the FastMCP app 35 | │ └── server.py # FastMCP app and the `run_browser_agent` tool 36 | └── tests/ # Unit tests covering server helpers and agent features 37 | ``` 38 | 39 | ## Getting Started 40 | 41 | ### Requirements 42 | 43 | - Python 3.11+ 44 | - Google Chrome or Chromium (for local automation) 45 | - [`uv`](https://github.com/astral-sh/uv) for dependency management (recommended) 46 | - Optional: Claude Desktop or another MCP-compatible client for integration testing 47 | 48 | ### Installation 49 | 50 | ```bash 51 | git clone https://github.com/JovaniPink/mcp-browser-use.git 52 | cd mcp-browser-use 53 | uv sync 54 | ``` 55 | 56 | Copy `sample.env` to `.env` (or export the variables in another way) and update the values for the providers you plan to use. 57 | 58 | ### Launching the server 59 | 60 | ```bash 61 | uv run mcp-browser-use 62 | ``` 63 | 64 | The command invokes the console script defined in `pyproject.toml`, starts the FastMCP application, and registers the `run_browser_agent` tool. 
65 | 66 | #### Using with Claude Desktop 67 | 68 | Once the server is running you can register it inside Claude Desktop, for example: 69 | 70 | ```json 71 | "mcpServers": { 72 | "mcp_server_browser_use": { 73 | "command": "uvx", 74 | "args": ["mcp-browser-use"], 75 | "env": { 76 | "MCP_MODEL_PROVIDER": "anthropic", 77 | "MCP_MODEL_NAME": "claude-3-5-sonnet-20241022" 78 | } 79 | } 80 | } 81 | ``` 82 | 83 | ### Debugging 84 | 85 | For interactive debugging, use the [MCP Inspector](https://github.com/modelcontextprotocol/inspector): 86 | 87 | ```bash 88 | npx @modelcontextprotocol/inspector uv --directory /path/to/project run mcp-browser-use 89 | ``` 90 | 91 | The inspector prints a URL that can be opened in the browser to watch tool calls and responses in real time. 92 | 93 | ## Configuration 94 | 95 | A full list of environment variables and their defaults is available in [documentation/CONFIGURATION.md](documentation/CONFIGURATION.md). Highlights include: 96 | 97 | - `MCP_MODEL_PROVIDER`, `MCP_MODEL_NAME`, `MCP_TEMPERATURE`, `MCP_MAX_STEPS`, `MCP_MAX_ACTIONS_PER_STEP`, and `MCP_USE_VISION` control the LLM and agent run. 98 | - Provider-specific API keys and endpoints (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `DEEPSEEK_API_KEY`, `GOOGLE_API_KEY`, `AZURE_OPENAI_API_KEY`, etc.). 99 | - Browser runtime flags (`BROWSER_USE_HEADLESS`, `BROWSER_USE_EXTRA_CHROMIUM_ARGS`, `CHROME_PERSISTENT_SESSION`, `BROWSER_USE_PROXY_URL`, ...). 100 | 101 | Use `.env` + [`python-dotenv`](https://pypi.org/project/python-dotenv/) or your preferred secrets manager to keep credentials out of source control. 102 | 103 | ## Running Tests 104 | 105 | ```bash 106 | uv run pytest 107 | ``` 108 | 109 | The tests cover the custom agent behaviour, browser session factory, and other utility helpers. 110 | 111 | ## Security 112 | 113 | Controlling a full browser instance remotely can grant broad access to the host machine. 
Review [documentation/SECURITY.md](documentation/SECURITY.md) before exposing the server to untrusted environments. 114 | 115 | ## Contributing 116 | 117 | 1. Fork the repository 118 | 2. Create your feature branch: `git checkout -b my-new-feature` 119 | 3. Commit your changes: `git commit -m 'Add some feature'` 120 | 4. Push to the branch: `git push origin my-new-feature` 121 | 5. Open a pull request 122 | 123 | Bug reports and feature suggestions are welcome—please include logs and reproduction steps when applicable. 124 | -------------------------------------------------------------------------------- /documentation/CONFIGURATION.md: -------------------------------------------------------------------------------- 1 | # Configuration Guide 2 | 3 | This guide describes every configuration option recognised by the MCP Browser Use server. All settings can be supplied as environment variables (e.g. via a `.env` file loaded with [`python-dotenv`](https://pypi.org/project/python-dotenv/)) or injected by your MCP client. 4 | 5 | The sample file at [`sample.env.example`](../sample.env.example) contains a ready-to-copy template with placeholders for secrets. 6 | 7 | ## How configuration is loaded 8 | 9 | 1. **Model & Agent settings** are read in [`server.py`](../src/mcp_browser_use/server.py). They control the language model as well as the agent run loop. 10 | 2. **Browser runtime settings** are parsed in [`browser/browser_manager.py`](../src/mcp_browser_use/browser/browser_manager.py) which returns a configured `BrowserSession` instance. 11 | 3. **Provider specific credentials** are consumed by the LLM factory in [`utils/utils.py`](../src/mcp_browser_use/utils/utils.py). 12 | 13 | Unless otherwise noted, boolean flags treat any of `1`, `true`, `yes`, `on` (case insensitive) as **true**. Any other value is considered **false**. 
14 | 15 | ## Core Agent Options 16 | 17 | | Variable | Default | Description | 18 | | --- | --- | --- | 19 | | `MCP_MODEL_PROVIDER` | `anthropic` | LLM provider name passed to the LangChain factory. Supported values: `anthropic`, `openai`, `deepseek`, `gemini`, `ollama`, `azure_openai`. | 20 | | `MCP_MODEL_NAME` | `claude-3-5-sonnet-20241022` | Model identifier sent to the provider. Each provider supports its own model list. | 21 | | `MCP_TEMPERATURE` | `0.3` | Sampling temperature for the model. Parsed as float. | 22 | | `MCP_MAX_STEPS` | `30` | Maximum number of reasoning/action steps before aborting the run. Parsed as integer. | 23 | | `MCP_MAX_ACTIONS_PER_STEP` | `5` | Limits how many tool invocations the agent may issue in a single step. Parsed as integer. | 24 | | `MCP_USE_VISION` | `true` | Enables vision features within the agent (element snapshots). | 25 | | `MCP_TOOL_CALL_IN_CONTENT` | `true` | Whether tool call payloads are expected inside the model response content. | 26 | 27 | ## Provider Credentials & Endpoints 28 | 29 | The LLM factory reads the following variables when initialising clients. Only set the values for the provider(s) you actively use. 30 | 31 | | Variable | Purpose | 32 | | --- | --- | 33 | | `ANTHROPIC_API_KEY` | API key for Anthropic Claude models. | 34 | | `OPENAI_API_KEY` | API key for OpenAI models. | 35 | | `DEEPSEEK_API_KEY` | API key for DeepSeek hosted models. | 36 | | `GOOGLE_API_KEY` | API key for Google Gemini via LangChain Google Generative AI. | 37 | | `AZURE_OPENAI_API_KEY` | API key for Azure OpenAI deployments. | 38 | | `AZURE_OPENAI_ENDPOINT` | Endpoint URL for the Azure OpenAI deployment. | 39 | | `OPENAI_ENDPOINT` | Override the OpenAI base URL (useful for proxies). | 40 | | `DEEPSEEK_ENDPOINT` | Base URL for the DeepSeek-compatible endpoint. | 41 | | `ANTHROPIC_API_ENDPOINT` | Alternative base URL for Anthropic (rarely needed). 
| 42 | 43 | When pointing to self-hosted or compatible services you may also override the defaults using `base_url` specific variables in your own code. See [`utils/utils.py`](../src/mcp_browser_use/utils/utils.py) for the full mapping. 44 | 45 | ## Browser Runtime Options 46 | 47 | These options are parsed by [`BrowserEnvironmentConfig.from_env`](../src/mcp_browser_use/browser/browser_manager.py) and control Chromium launch behaviour. 48 | 49 | | Variable | Default | Description | 50 | | --- | --- | --- | 51 | | `CHROME_PATH` | _unset_ | Absolute path to a Chrome/Chromium executable. Leave unset to let `browser-use` manage Chromium via Playwright. | 52 | | `CHROME_USER_DATA` | _unset_ | Directory to store user data (profiles, cookies). Required when `CHROME_PERSISTENT_SESSION` is true. | 53 | | `CHROME_PERSISTENT_SESSION` | `false` | Keeps the browser profile between runs by mounting `CHROME_USER_DATA`. | 54 | | `CHROME_DEBUGGING_PORT` | _unset_ | Remote debugging port for attaching to an existing Chrome instance. Must be an integer. | 55 | | `CHROME_DEBUGGING_HOST` | _unset_ | Hostname/IP for remote debugging (e.g. `localhost`). | 56 | | `BROWSER_USE_HEADLESS` | `false` | Launch Chromium in headless mode. | 57 | | `BROWSER_USE_DISABLE_SECURITY` | `false` | Disables web security features (CORS, sandbox). Use with caution. | 58 | | `BROWSER_USE_EXTRA_CHROMIUM_ARGS` | _unset_ | Comma-separated list of additional Chromium command-line flags. | 59 | | `BROWSER_USE_ALLOWED_DOMAINS` | _unset_ | Comma-separated allowlist limiting which domains the agent may open. | 60 | | `BROWSER_USE_PROXY_URL` | _unset_ | HTTP/HTTPS proxy URL. | 61 | | `BROWSER_USE_NO_PROXY` | _unset_ | Hosts to bypass in proxy mode. | 62 | | `BROWSER_USE_PROXY_USERNAME` | _unset_ | Username for proxy authentication. | 63 | | `BROWSER_USE_PROXY_PASSWORD` | _unset_ | Password for proxy authentication. 
| 64 | | `BROWSER_USE_CDP_URL` | _unset_ | Connect to an existing Chrome DevTools Protocol endpoint instead of launching a new browser. | 65 | 66 | ### Persistence hints 67 | 68 | - When `CHROME_PERSISTENT_SESSION` is true and `CHROME_USER_DATA` is not provided, the server logs a warning and the session falls back to ephemeral storage. 69 | - Remote debugging settings (`CHROME_DEBUGGING_HOST` / `CHROME_DEBUGGING_PORT`) are optional and ignored if invalid values are supplied. The server logs a warning and continues with defaults. 70 | 71 | ## Additional Environment Variables 72 | 73 | Some ancillary features inspect the following variables: 74 | 75 | | Variable | Purpose | 76 | | --- | --- | 77 | | `WIN_FONT_DIR` | Custom Windows font directory used when generating GIF summaries of browsing sessions. | 78 | 79 | ## Tips for managing configuration 80 | 81 | - Store secrets outside of version control. When sharing an `.env` file, redact or rotate keys immediately. 82 | - Keep provider-specific settings grouped so you can switch model providers quickly when testing. 83 | - Start with the defaults, confirm the agent behaves as expected, then tighten security by restricting `BROWSER_USE_ALLOWED_DOMAINS` and enabling headless mode. 84 | - When experimenting locally, keep `CHROME_PERSISTENT_SESSION=false` to avoid stale cookies interfering with automation runs. 85 | 86 | For any options not covered here, consult the upstream [`browser-use` documentation](https://github.com/browser-use/browser-use) which explains additional environment variables recognised by the underlying library. 
87 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import importlib 3 | import importlib.util 4 | import os 5 | import sys 6 | import time 7 | import types 8 | 9 | import pytest 10 | 11 | # Path to utils module 12 | ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 13 | UTILS_PATH = os.path.join(ROOT, "src", "mcp_browser_use", "utils", "utils.py") 14 | 15 | # Provide dummy langchain modules if they are not installed 16 | if "langchain_openai" not in sys.modules: 17 | module = types.ModuleType("langchain_openai") 18 | 19 | class ChatOpenAI: 20 | def __init__(self, *args, **kwargs): 21 | pass 22 | 23 | class AzureChatOpenAI: 24 | def __init__(self, *args, **kwargs): 25 | pass 26 | 27 | module.ChatOpenAI = ChatOpenAI 28 | module.AzureChatOpenAI = AzureChatOpenAI 29 | sys.modules["langchain_openai"] = module 30 | 31 | if "langchain_anthropic" not in sys.modules: 32 | module = types.ModuleType("langchain_anthropic") 33 | 34 | class ChatAnthropic: 35 | def __init__(self, *args, **kwargs): 36 | pass 37 | 38 | module.ChatAnthropic = ChatAnthropic 39 | sys.modules["langchain_anthropic"] = module 40 | 41 | if "langchain_google_genai" not in sys.modules: 42 | module = types.ModuleType("langchain_google_genai") 43 | 44 | class ChatGoogleGenerativeAI: 45 | def __init__(self, *args, **kwargs): 46 | pass 47 | 48 | module.ChatGoogleGenerativeAI = ChatGoogleGenerativeAI 49 | sys.modules["langchain_google_genai"] = module 50 | 51 | if "langchain_ollama" not in sys.modules: 52 | module = types.ModuleType("langchain_ollama") 53 | 54 | class ChatOllama: 55 | def __init__(self, *args, **kwargs): 56 | pass 57 | 58 | module.ChatOllama = ChatOllama 59 | sys.modules["langchain_ollama"] = module 60 | 61 | if "browser_use" not in sys.modules: 62 | browser_use_module = types.ModuleType("browser_use") 63 | 
browser_module = types.ModuleType("browser_use.browser") 64 | events_module = types.ModuleType("browser_use.browser.events") 65 | 66 | class ScreenshotEvent: 67 | def __init__(self, full_page: bool = False): 68 | self.full_page = full_page 69 | 70 | events_module.ScreenshotEvent = ScreenshotEvent 71 | browser_module.events = events_module 72 | browser_use_module.browser = browser_module 73 | 74 | sys.modules["browser_use"] = browser_use_module 75 | sys.modules["browser_use.browser"] = browser_module 76 | sys.modules["browser_use.browser.events"] = events_module 77 | 78 | # Import utils module directly from file after stubbing dependencies 79 | spec = importlib.util.spec_from_file_location("mcp_browser_use.utils.utils", UTILS_PATH) 80 | utils = importlib.util.module_from_spec(spec) 81 | spec.loader.exec_module(utils) 82 | 83 | 84 | @pytest.fixture 85 | def anyio_backend(): 86 | return "asyncio" 87 | 88 | 89 | def test_get_llm_model_returns_chatopenai(): 90 | model = utils.get_llm_model("openai") 91 | assert isinstance(model, utils.ChatOpenAI) 92 | 93 | 94 | def test_get_llm_model_unknown_provider_raises(): 95 | with pytest.raises(ValueError): 96 | utils.get_llm_model("unknown") 97 | 98 | 99 | def test_encode_image_handles_empty_path(): 100 | assert utils.encode_image(None) is None 101 | assert utils.encode_image("") is None 102 | 103 | 104 | def test_encode_image_roundtrip(tmp_path): 105 | image_path = tmp_path / "image.bin" 106 | payload = b"test-bytes" 107 | image_path.write_bytes(payload) 108 | 109 | encoded = utils.encode_image(str(image_path)) 110 | 111 | assert encoded == base64.b64encode(payload).decode("utf-8") 112 | 113 | 114 | def test_encode_image_missing_file(tmp_path): 115 | with pytest.raises(FileNotFoundError): 116 | utils.encode_image(str(tmp_path / "missing.bin")) 117 | 118 | 119 | def test_get_latest_files_creates_directory(tmp_path): 120 | target = tmp_path / "captures" 121 | 122 | result = utils.get_latest_files(str(target), file_types=[".webm", 
".zip"]) 123 | 124 | assert target.exists() 125 | assert result == {".webm": None, ".zip": None} 126 | 127 | 128 | def test_get_latest_files_skips_recent_files(tmp_path, monkeypatch): 129 | directory = tmp_path / "captures" 130 | directory.mkdir() 131 | 132 | recent_path = directory / "recent.webm" 133 | recent_path.write_text("recent") 134 | 135 | now = time.time() 136 | os.utime(recent_path, (now, now)) 137 | 138 | monkeypatch.setattr(utils.time, "time", lambda: now) 139 | 140 | result = utils.get_latest_files(str(directory), file_types=[".webm"]) 141 | 142 | assert result == {".webm": None} 143 | 144 | 145 | @pytest.mark.anyio("asyncio") 146 | async def test_capture_screenshot_uses_event_bus(): 147 | screenshot_payload = base64.b64encode(b"payload").decode("utf-8") 148 | 149 | class DummyEvent: 150 | def __init__(self, result): 151 | self._result = result 152 | self.awaited = False 153 | 154 | def __await__(self): 155 | async def _wait(): 156 | self.awaited = True 157 | return self 158 | 159 | return _wait().__await__() 160 | 161 | async def event_result(self, raise_if_any=True, raise_if_none=True): 162 | return self._result 163 | 164 | class DummyEventBus: 165 | def __init__(self, dispatched_event): 166 | self._event = dispatched_event 167 | self.dispatched = [] 168 | 169 | def dispatch(self, event): 170 | self.dispatched.append(event) 171 | return self._event 172 | 173 | class DummyBrowserSession: 174 | def __init__(self, event_bus): 175 | self.event_bus = event_bus 176 | 177 | dummy_event = DummyEvent(screenshot_payload) 178 | event_bus = DummyEventBus(dummy_event) 179 | session = DummyBrowserSession(event_bus) 180 | 181 | encoded = await utils.capture_screenshot(session) 182 | 183 | assert encoded == screenshot_payload 184 | assert dummy_event.awaited is True 185 | assert len(event_bus.dispatched) == 1 186 | assert isinstance(event_bus.dispatched[0], utils.ScreenshotEvent) 187 | 188 | 189 | @pytest.mark.anyio("asyncio") 190 | async def 
test_capture_screenshot_returns_none_on_error(): 191 | class DummyErrorEvent: 192 | def __await__(self): 193 | async def _wait(): 194 | return self 195 | 196 | return _wait().__await__() 197 | 198 | async def event_result(self, raise_if_any=True, raise_if_none=True): 199 | raise RuntimeError("boom") 200 | 201 | class DummyEventBus: 202 | def dispatch(self, event): 203 | return DummyErrorEvent() 204 | 205 | class DummyBrowserSession: 206 | def __init__(self): 207 | self.event_bus = DummyEventBus() 208 | 209 | session = DummyBrowserSession() 210 | 211 | result = await utils.capture_screenshot(session) 212 | 213 | assert result is None 214 | -------------------------------------------------------------------------------- /src/mcp_browser_use/browser/browser_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utility helpers for configuring and creating :class:`BrowserSession` instances. 3 | 4 | This module consolidates the thin wrappers that previously lived in 5 | ``custom_browser.py``, ``custom_context.py``, and ``config.py``. The new structure 6 | centralises environment parsing so ``server.py`` can simply request a configured 7 | browser session without re-implementing the translation from environment 8 | variables to ``BrowserSession`` keyword arguments. 
9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import logging 14 | import os 15 | from dataclasses import dataclass 16 | from typing import Any, Dict, Optional 17 | 18 | from browser_use import BrowserSession 19 | from browser_use.browser.profile import ProxySettings 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | _BOOL_TRUE = {"1", "true", "yes", "on"} 24 | 25 | 26 | @dataclass(slots=True) 27 | class BrowserPersistenceConfig: 28 | """Configuration for browser persistence and remote debugging settings.""" 29 | 30 | persistent_session: bool = False 31 | user_data_dir: Optional[str] = None 32 | debugging_port: Optional[int] = None 33 | debugging_host: Optional[str] = None 34 | 35 | @classmethod 36 | def from_env(cls) -> "BrowserPersistenceConfig": 37 | persistent_session = ( 38 | os.getenv("CHROME_PERSISTENT_SESSION", "").lower() in _BOOL_TRUE 39 | ) 40 | user_data_dir = os.getenv("CHROME_USER_DATA") or None 41 | 42 | debugging_port: Optional[int] 43 | port_value = os.getenv("CHROME_DEBUGGING_PORT") 44 | if port_value: 45 | try: 46 | debugging_port = int(port_value) 47 | except ValueError: 48 | logger.warning( 49 | "Invalid CHROME_DEBUGGING_PORT=%r, ignoring debug port setting.", 50 | port_value, 51 | ) 52 | debugging_port = None 53 | else: 54 | debugging_port = None 55 | 56 | debugging_host = os.getenv("CHROME_DEBUGGING_HOST") or None 57 | 58 | return cls( 59 | persistent_session=persistent_session, 60 | user_data_dir=user_data_dir, 61 | debugging_port=debugging_port, 62 | debugging_host=debugging_host, 63 | ) 64 | 65 | 66 | @dataclass(slots=True) 67 | class BrowserEnvironmentConfig: 68 | """All runtime settings required for instantiating ``BrowserSession``.""" 69 | 70 | headless: bool = False 71 | disable_security: bool = False 72 | executable_path: Optional[str] = None 73 | args: Optional[list[str]] = None 74 | allowed_domains: Optional[list[str]] = None 75 | proxy: Optional[ProxySettings] = None 76 | cdp_url: Optional[str] = None 77 | 
user_data_dir: Optional[str] = None 78 | 79 | def to_kwargs(self) -> Dict[str, Any]: 80 | """Convert to keyword arguments understood by :class:`BrowserSession`.""" 81 | 82 | kwargs: Dict[str, Any] = { 83 | "headless": self.headless, 84 | "disable_security": self.disable_security, 85 | "executable_path": self.executable_path, 86 | "args": self.args, 87 | "allowed_domains": self.allowed_domains, 88 | "proxy": self.proxy, 89 | "cdp_url": self.cdp_url, 90 | "user_data_dir": self.user_data_dir, 91 | } 92 | # Remove ``None`` values so BrowserSession can rely on its defaults. 93 | return {key: value for key, value in kwargs.items() if value is not None} 94 | 95 | @classmethod 96 | def from_env(cls) -> "BrowserEnvironmentConfig": 97 | persistence = BrowserPersistenceConfig.from_env() 98 | 99 | headless = os.getenv("BROWSER_USE_HEADLESS", "false").lower() in _BOOL_TRUE 100 | disable_security = ( 101 | os.getenv("BROWSER_USE_DISABLE_SECURITY", "false").lower() in _BOOL_TRUE 102 | ) 103 | executable_path = os.getenv("CHROME_PATH") or None 104 | 105 | extra_args_env = os.getenv("BROWSER_USE_EXTRA_CHROMIUM_ARGS") 106 | args = None 107 | if extra_args_env: 108 | args = [arg.strip() for arg in extra_args_env.split(",") if arg.strip()] 109 | 110 | allowed_domains_env = os.getenv("BROWSER_USE_ALLOWED_DOMAINS") 111 | allowed_domains = None 112 | if allowed_domains_env: 113 | allowed_domains = [ 114 | domain.strip() 115 | for domain in allowed_domains_env.split(",") 116 | if domain.strip() 117 | ] 118 | 119 | proxy_url = os.getenv("BROWSER_USE_PROXY_URL") 120 | proxy: Optional[ProxySettings] = None 121 | if proxy_url: 122 | proxy = ProxySettings( 123 | server=proxy_url, 124 | bypass=os.getenv("BROWSER_USE_NO_PROXY"), 125 | username=os.getenv("BROWSER_USE_PROXY_USERNAME"), 126 | password=os.getenv("BROWSER_USE_PROXY_PASSWORD"), 127 | ) 128 | 129 | cdp_url = os.getenv("BROWSER_USE_CDP_URL") or None 130 | if not cdp_url and (persistence.debugging_host or persistence.debugging_port): 131 
| host = persistence.debugging_host or "127.0.0.1" 132 | port = persistence.debugging_port or 9222 133 | cdp_url = f"http://{host}:{port}" 134 | 135 | user_data_dir = None 136 | if persistence.persistent_session: 137 | if persistence.user_data_dir: 138 | user_data_dir = persistence.user_data_dir 139 | else: 140 | logger.warning( 141 | "CHROME_PERSISTENT_SESSION requested but CHROME_USER_DATA was not provided." 142 | ) 143 | 144 | return cls( 145 | headless=headless, 146 | disable_security=disable_security, 147 | executable_path=executable_path, 148 | args=args, 149 | allowed_domains=allowed_domains, 150 | proxy=proxy, 151 | cdp_url=cdp_url, 152 | user_data_dir=user_data_dir, 153 | ) 154 | 155 | 156 | def create_browser_session( 157 | overrides: Optional[Dict[str, Any]] = None, 158 | ) -> BrowserSession: 159 | """Instantiate a :class:`BrowserSession` using environment defaults. 160 | 161 | ``overrides`` can be supplied to fine-tune the resulting session. Any keys 162 | set to ``None`` are ignored so callers can override only a subset of values. 163 | """ 164 | 165 | config = BrowserEnvironmentConfig.from_env() 166 | kwargs = config.to_kwargs() 167 | 168 | if overrides: 169 | for key, value in overrides.items(): 170 | if value is not None: 171 | kwargs[key] = value 172 | elif key in kwargs: 173 | # Explicit ``None`` removes the override letting BrowserSession 174 | # fall back to its internal default. 
175 | kwargs.pop(key) 176 | 177 | logger.debug( 178 | "Creating BrowserSession with kwargs: %s", 179 | {k: v for k, v in kwargs.items() if k != "proxy"}, 180 | ) 181 | return BrowserSession(**kwargs) 182 | -------------------------------------------------------------------------------- /src/mcp_browser_use/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import base64 4 | import logging 5 | import os 6 | import time 7 | from pathlib import Path 8 | from typing import Any, Callable, Dict, List, Optional, Tuple, Type 9 | 10 | from browser_use.browser.events import ScreenshotEvent 11 | from langchain_anthropic import ChatAnthropic 12 | from langchain_google_genai import ChatGoogleGenerativeAI 13 | from langchain_ollama import ChatOllama 14 | from langchain_openai import AzureChatOpenAI, ChatOpenAI 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def _anthropic_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 20 | return { 21 | "model_name": kwargs.get("model_name", "claude-3-5-sonnet-20240620"), 22 | "temperature": kwargs.get("temperature", 0.0), 23 | "base_url": kwargs.get("base_url") or "https://api.anthropic.com", 24 | "api_key": kwargs.get("api_key") or os.getenv("ANTHROPIC_API_KEY", ""), 25 | } 26 | 27 | 28 | def _openai_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 29 | return { 30 | "model": kwargs.get("model_name", "gpt-4o"), 31 | "temperature": kwargs.get("temperature", 0.0), 32 | "base_url": kwargs.get("base_url") 33 | or os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1"), 34 | "api_key": kwargs.get("api_key") or os.getenv("OPENAI_API_KEY", ""), 35 | } 36 | 37 | 38 | def _deepseek_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 39 | return { 40 | "model": kwargs.get("model_name", "deepseek-chat"), 41 | "temperature": kwargs.get("temperature", 0.0), 42 | "base_url": kwargs.get("base_url") or os.getenv("DEEPSEEK_ENDPOINT", ""), 43 | "api_key": 
kwargs.get("api_key") or os.getenv("DEEPSEEK_API_KEY", ""), 44 | } 45 | 46 | 47 | def _gemini_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 48 | return { 49 | "model": kwargs.get("model_name", "gemini-2.0-flash-exp"), 50 | "temperature": kwargs.get("temperature", 0.0), 51 | "google_api_key": kwargs.get("api_key") or os.getenv("GOOGLE_API_KEY", ""), 52 | } 53 | 54 | 55 | def _ollama_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 56 | return { 57 | "model": kwargs.get("model_name", "phi4"), 58 | "temperature": kwargs.get("temperature", 0.0), 59 | "num_ctx": kwargs.get("num_ctx", 128000), 60 | "base_url": kwargs.get("base_url", "http://localhost:11434"), 61 | } 62 | 63 | 64 | def _azure_openai_params(kwargs: Dict[str, Any]) -> Dict[str, Any]: 65 | return { 66 | "model": kwargs.get("model_name", "gpt-4o"), 67 | "temperature": kwargs.get("temperature", 0.0), 68 | "api_version": kwargs.get("api_version", "2024-05-01-preview"), 69 | "azure_endpoint": kwargs.get("base_url") 70 | or os.getenv("AZURE_OPENAI_ENDPOINT", ""), 71 | "api_key": kwargs.get("api_key") or os.getenv("AZURE_OPENAI_API_KEY", ""), 72 | } 73 | 74 | 75 | LLM_PROVIDERS: Dict[str, Tuple[Type, Callable[[Dict[str, Any]], Dict[str, Any]]]] = { 76 | "anthropic": (ChatAnthropic, _anthropic_params), 77 | "openai": (ChatOpenAI, _openai_params), 78 | "deepseek": (ChatOpenAI, _deepseek_params), 79 | "gemini": (ChatGoogleGenerativeAI, _gemini_params), 80 | "ollama": (ChatOllama, _ollama_params), 81 | "azure_openai": (AzureChatOpenAI, _azure_openai_params), 82 | } 83 | 84 | 85 | def get_llm_model(provider: str, **kwargs) -> Any: 86 | """ 87 | Return an initialized language model client based on the given provider name. 88 | 89 | :param provider: The name of the LLM provider (e.g., "anthropic", "openai", "azure_openai"). 90 | :param kwargs: Additional parameters (model_name, temperature, base_url, api_key, etc.). 91 | :return: An instance of a ChatLLM from the relevant langchain_* library. 
92 | :raises ValueError: If the provider is unsupported. 93 | """ 94 | 95 | try: 96 | llm_class, params_builder = LLM_PROVIDERS[provider] 97 | except KeyError as error: 98 | raise ValueError(f"Unsupported provider: {provider}") from error 99 | 100 | provider_kwargs = params_builder(kwargs) 101 | return llm_class(**provider_kwargs) 102 | 103 | 104 | # Commonly used model names for quick reference 105 | model_names = { 106 | "anthropic": ["claude-3-5-sonnet-20240620", "claude-3-opus-20240229"], 107 | "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"], 108 | "deepseek": ["deepseek-chat"], 109 | "gemini": [ 110 | "gemini-2.0-flash-exp", 111 | "gemini-2.0-flash-thinking-exp", 112 | "gemini-1.5-flash-latest", 113 | "gemini-1.5-flash-8b-latest", 114 | "gemini-2.0-flash-thinking-exp-1219", 115 | ], 116 | "ollama": ["deepseek-r1:671b", "qwen2.5:7b", "llama3.3", "phi4"], 117 | "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"], 118 | } 119 | 120 | 121 | def encode_image(img_path: Optional[str]) -> Optional[str]: 122 | """ 123 | Convert an image at `img_path` into a base64-encoded string. 124 | Returns None if `img_path` is None or empty. 125 | Raises FileNotFoundError if the file doesn't exist. 126 | """ 127 | if not img_path: 128 | return None 129 | 130 | try: 131 | with open(img_path, "rb") as image_file: 132 | image_data = base64.b64encode(image_file.read()).decode("utf-8") 133 | return image_data 134 | except FileNotFoundError as error: 135 | logger.error(f"Image not found at path {img_path}: {error}") 136 | raise 137 | except Exception as error: 138 | logger.error(f"Error encoding image at {img_path}: {error}") 139 | raise 140 | 141 | 142 | def get_latest_files( 143 | directory: str, file_types: List[str] = [".webm", ".zip"] 144 | ) -> Dict[str, Optional[str]]: 145 | """ 146 | Find the latest file for each extension in `file_types` under `directory`. 147 | Returns a dict {file_extension: latest_file_path or None}. 148 | 149 | :param directory: The directory to search. 
150 | :param file_types: List of file extensions (e.g., [".webm", ".zip"]). 151 | :return: dict mapping each extension to the path of the newest file or None if not found. 152 | """ 153 | latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types} 154 | 155 | if not os.path.exists(directory): 156 | logger.debug(f"Directory '{directory}' does not exist. Creating it.") 157 | os.makedirs(directory, exist_ok=True) 158 | return latest_files 159 | 160 | for file_type in file_types: 161 | try: 162 | matching_files = list(Path(directory).rglob(f"*{file_type}")) 163 | if matching_files: 164 | # Sort or use max() by modified time 165 | most_recent_file = max( 166 | matching_files, key=lambda path: path.stat().st_mtime 167 | ) 168 | # Check if file is not actively being written 169 | if time.time() - most_recent_file.stat().st_mtime > 1.0: 170 | latest_files[file_type] = str(most_recent_file) 171 | else: 172 | logger.debug( 173 | f"Skipping file {most_recent_file} - possibly still being written." 
174 | ) 175 | except Exception as error: 176 | logger.error( 177 | f"Error getting latest {file_type} file in '{directory}': {error}" 178 | ) 179 | 180 | return latest_files 181 | 182 | 183 | async def capture_screenshot(browser_session) -> Optional[str]: 184 | """Capture a screenshot of the current page using the browser-use event bus.""" 185 | 186 | if not hasattr(browser_session, "event_bus"): 187 | logger.error("Browser session does not have an event_bus.") 188 | return None 189 | 190 | try: 191 | event = browser_session.event_bus.dispatch(ScreenshotEvent(full_page=False)) 192 | await event 193 | result = await event.event_result(raise_if_any=True, raise_if_none=True) 194 | return result 195 | except Exception as error: 196 | logger.error(f"Failed to capture screenshot via event bus: {error}") 197 | return None 198 | -------------------------------------------------------------------------------- /src/mcp_browser_use/agent/custom_prompts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import List, Optional 4 | 5 | from browser_use.agent.prompts import SystemPrompt 6 | from browser_use.agent.views import ActionResult 7 | from browser_use.browser.views import BrowserState 8 | from langchain_core.messages import HumanMessage, SystemMessage 9 | 10 | from mcp_browser_use.agent.custom_views import CustomAgentStepInfo 11 | 12 | 13 | class CustomSystemPrompt(SystemPrompt): 14 | """ 15 | Custom system prompt that extends SystemPrompt to inject additional 16 | formatting rules and instructions for the AI agent. 17 | """ 18 | 19 | def important_rules(self) -> str: 20 | """ 21 | Return a detailed multiline string describing how the agent 22 | must format its JSON response, handle multiple actions, forms, 23 | navigation, and the maximum actions per step. 

        The text includes guidelines for:
        - JSON response format
        - Action sequences
        - Element interaction
        - Navigation & error handling
        - Task completion
        - Visual context usage
        - Handling form filling and suggestions
        """
        text = r"""
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{
"current_state": {
"prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
"important_contents": "Output important contents closely related to user's instruction or task on the current page. If there is, please output the contents. If not, please output empty string ''.",
"completed_contents": "Update the input Task Progress. Completed contents is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the current page and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button",
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If the output of prev_action_evaluation is 'Failed', please reflect and output your reflection here. If you think you have entered the wrong page, consider to go back to the previous page in next action.",
"summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
},
"action": [
{
"action_name": {
// action-specific parameters
}
},
// ... more actions in sequence
]
}

2. ACTIONS: You can specify multiple actions to be executed in sequence.
Common action sequences:
- Form filling: [
{"input_text": {"index": 1, "text": "username"}},
{"input_text": {"index": 2, "text": "password"}},
{"click_element": {"index": 3}}
]
- Navigation and extraction: [
{"open_new_tab": {}},
{"go_to_url": {"url": "https://example.com"}},
{"extract_page_content": {}}
]

3. ELEMENT INTERACTION:
- Only use indexes that exist in the provided element list
- Each element has a unique index number (e.g., "33[:]
_[:] Non-interactive text


Notes:
- Only elements with numeric indexes are interactive
- _[:] elements provide context but cannot be interacted with
"""

    # NOTE(review): this region of the file appears truncated in this dump —
    # original lines ~70-127 are missing, which by all appearances contained
    # the remainder of the rules text, the `return text` statement of
    # important_rules, and the `input_format` method (whose example output and
    # closing notes end just above; get_system_message calls
    # self.input_format() below). Verify against the full custom_prompts.py
    # before relying on or editing this prompt text.

    def get_system_message(self) -> SystemMessage:
        """
        Build and return a SystemMessage containing all system-level instructions,
        rules, and function references for the agent.
        """
        # The current timestamp is embedded in the prompt so the model knows "now".
        time_str = self.current_date.strftime("%Y-%m-%d %H:%M")

        AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
1. Analyze the provided webpage elements and structure
2. Plan a sequence of actions to accomplish the given task
3.
Respond with valid JSON containing your action sequence and state assessment 147 | 148 | Current date and time: {time_str} 149 | 150 | {self.input_format()} 151 | 152 | {self.important_rules()} 153 | 154 | Functions: 155 | {self.default_action_description} 156 | 157 | Remember: Your responses must be valid JSON matching the specified format. Each action in the sequence must be valid.""" 158 | 159 | return SystemMessage(content=AGENT_PROMPT) 160 | 161 | 162 | class CustomAgentMessagePrompt: 163 | """ 164 | Builds a user-facing prompt (HumanMessage) from the current browser state, 165 | task step info, and any results or errors from previous actions. 166 | """ 167 | 168 | def __init__( 169 | self, 170 | state: BrowserState, 171 | result: Optional[List[ActionResult]] = None, 172 | include_attributes: Optional[List[str]] = None, 173 | max_error_length: int = 400, 174 | step_info: Optional[CustomAgentStepInfo] = None, 175 | ): 176 | """ 177 | :param state: The current BrowserState, including URL, tabs, elements, etc. 178 | :param result: A list of ActionResults from the previous step(s). 179 | :param include_attributes: A list of HTML attributes to show in element strings. 180 | :param max_error_length: Maximum characters of error output to include. 181 | :param step_info: Holds metadata like the current step number, memory, task details, etc. 182 | """ 183 | self.state = state 184 | self.result = result or [] 185 | self.include_attributes = include_attributes or [] 186 | self.max_error_length = max_error_length 187 | self.step_info = step_info 188 | 189 | def get_user_message(self) -> HumanMessage: 190 | """ 191 | Construct and return a HumanMessage containing: 192 | 1. Task and hints from step_info 193 | 2. Memory and task progress 194 | 3. Current URL and available tabs 195 | 4. A string representation of interactive elements 196 | 5. Any results or errors from previous actions 197 | 6. 
An inline base64 screenshot if available 198 | 199 | :return: A HumanMessage object for the agent to process. 200 | """ 201 | step_info = self.step_info 202 | if not step_info: 203 | # Fallback if no step_info is provided 204 | step_info_text = "" 205 | task = "" 206 | add_infos = "" 207 | memory = "" 208 | task_progress = "" 209 | else: 210 | step_info_text = f"Step {step_info.step_number}/{step_info.max_steps}" 211 | task = step_info.task 212 | add_infos = step_info.add_infos 213 | memory = step_info.memory 214 | task_progress = step_info.task_progress 215 | 216 | state_description = f""" 217 | {step_info_text} 218 | 1. Task: {task} 219 | 2. Hints(Optional): 220 | {add_infos} 221 | 3. Memory: 222 | {memory} 223 | 4. Task Progress: 224 | {task_progress} 225 | 5. Current url: {self.state.url} 226 | 6. Available tabs: 227 | {self.state.tabs} 228 | 7. Interactive elements: 229 | {self.state.element_tree.clickable_elements_to_string( 230 | include_attributes=self.include_attributes 231 | )} 232 | """ 233 | 234 | # Append action results or errors 235 | for i, r in enumerate(self.result): 236 | if r.extracted_content: 237 | state_description += f"\nResult of action {i + 1}/{len(self.result)}: {r.extracted_content}" 238 | if r.error: 239 | truncated_error = r.error[-self.max_error_length :] 240 | state_description += f"\nError of action {i + 1}/{len(self.result)}: ...{truncated_error}" 241 | 242 | # If a screenshot is available, embed it as an image URL 243 | if self.state.screenshot: 244 | # Format message for vision model or multi-part message 245 | return HumanMessage( 246 | content=[ 247 | {"type": "text", "text": state_description}, 248 | { 249 | "type": "image_url", 250 | "image_url": { 251 | "url": f"data:image/png;base64,{self.state.screenshot}" 252 | }, 253 | }, 254 | ] 255 | ) 256 | else: 257 | # Otherwise, just return text 258 | return HumanMessage(content=state_description) 259 | -------------------------------------------------------------------------------- 
/documentation/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security 2 | 3 | > Below is a comprehensive security audit of your Browser-Use + MCP project using all the prior conversations and standard best practices for security. This is not an exhaustive penetration test but a systematic review of the major scripts and common pitfalls. We also provide suggestions for how to mitigate identified risks. 4 | 5 | 1. Project Structure & High-Level Summary 6 | 7 | The code layout is: 8 | 9 | 1. Main server code server.py that runs an async event loop (loop = asyncio.new_event_loop()) within __main__): 10 | - Runs a FastMCP (Model Context Protocol) server. 11 | - Exposes a tool endpoint to run a single “browser agent.” 12 | 2. Custom Agent under the agent directory and Related Classes: 13 | - custom_agent.py: Inherits from a base Agent and implements logic to parse LLM output, execute browser actions, handle vision, and create history GIFs. 14 | - custom_massage_manager.py: Handles LLM output parsing and conversion to browser actions. 15 | - custom_prompts.py: Contains system-level instructions for the LLM to produce a structured JSON output. 16 | - custom_views.py: Data classes (CustomAgentStepInfo, CustomAgentBrain) are used to store the agent’s state and output schema. 17 | 3. Custom Browser Components under the browser directory: 18 | - config.py: Holds dataclasses for configuring Chrome (persistent sessions, debugging port). 19 | - custom_browser.py: Subclass of Browser that handles launching or connecting to Chrome over a debugging port. It may disable some security flags or run headless. 20 | - custom_context.py: Subclass of BrowserContext that can reuse an existing context or create new ones, load cookies, start traces, etc. 21 | 4. Controllers & Actions: 22 | - custom_controller.py: Registers custom actions (copy/paste from clipboard). 23 | 5. 
Utilities: 24 | - agent_state.py: Tracks a stop_requested event (via asyncio.Event) and optional “last valid state.” Implemented as a singleton (only one agent at a time). 25 | - utils.py: offers a get_llm_model function to create different LLM clients (OpenAI, Anthropic, Azure, etc.), as well as image encoding and file-tracking utilities. 26 | 27 | The project runs a single agent simultaneously, hooking an LLM to actual browser actions. Let’s go through significant security aspects. 28 | 29 | 2. Identified Security Risks & Recommendations 30 | 31 | Below are the main areas of concern based on the code we’ve seen and typical usage patterns. 32 | 33 | 2.1 Disabling Browser Security & Remote Debug Port 34 | 35 | Where 36 | 37 | - custom_browser.py: 38 | - Allows launching Chrome with flags like --disable-web-security. 39 | - Launches Chrome with --remote-debugging-port=9222. 40 | 41 | Risks 42 | 43 | 1. Cross-Origin Attacks: Disabling web security (--disable-web-security, --disable-features=IsolateOrigins) allows malicious pages to read cross-origin data in the same browser instance. If the agent visits untrusted websites, it could inadvertently exfiltrate data from other open tabs or sessions. 44 | 2. Debug Port Exposure: A remote debugging port on 9222 (if bound to 0.0.0.0 or otherwise accessible externally) gives anyone who can connect full control of the browser. If not behind a firewall, an attacker can hijack the session. 45 | 46 | Recommendations 47 | 48 | 1. Limit the usage of disable-web-security and related flags. Restrict this to internal/test scenarios or run it inside a hardened container or ephemeral environment. 49 | 2. Restrict Access to Port 9222: 50 | 51 | - Bind to 127.0.0.1 only (--remote-debugging-address=127.0.0.1) so external hosts cannot connect. 52 | - Use a firewall or security group to block external access. 53 | - If remote access is required, use SSH tunneling rather than publicly exposing the port. 54 | 55 | 3. 
If you must open untrusted pages, create separate browser instances. This means not reusing the same “user data dir” or disabling security for critical tasks. 56 | 57 | 2.2 Global Singleton AgentState 58 | 59 | Where 60 | 61 | - agent_state.py implements a singleton that shares `stop_requested` and `last_valid_state` across all agent references. 62 | 63 | Risks 64 | 65 | 1. Concurrent Agents: If you (in the future) attempt to run multiple agents, the single AgentState object might cause cross-talk or unpredictable behavior (e.g., one agent’s stop request stops another). 66 | 2. Potential Race Conditions: If the code evolves to multi-thread or multi-process, the concurrency might not behave as expected. 67 | 68 | Recommendations 69 | 70 | 1. Ensure Only One Agent: If that’s your design (a single agent at a time), the singleton is acceptable. Document it. 71 | 2. Remove Singleton for multi-agent scenarios. Each agent can have its own state object. 72 | 73 | 2.3 Clipboard Actions 74 | 75 | Where 76 | 77 | - custom_controller.py registers actions like “Copy text to clipboard” and “Paste from clipboard.” 78 | 79 | Risks 80 | 81 | 1. System Clipboard: Copy/paste uses the OS-level clipboard (pyperclip). This can leak sensitive data if other apps or remote sessions see the same clipboard. 82 | 2. Overwrite: The agent can overwrite a user’s clipboard or read from it unexpectedly. 83 | 84 | Recommendations 85 | 86 | 1. Run in a Controlled Environment: It may be okay if you only do local development or a dedicated environment. 87 | 2. Use an In-Memory Clipboard: Instead of the actual system clipboard, implement a local memory store for copying and pasting within the agent’s session. This prevents overwriting the user’s system clipboard. 88 | 3. Disable or Restrict these actions if you run in multi-user or production mode. 89 | 90 | 2.4 Logging Sensitive Data 91 | 92 | Where 93 | 94 | - Various scripts log LLM responses or user tasks.
95 | - utils.py and other files read environment variables for API keys. 96 | 97 | Risks 98 | 99 | 1. API Keys in Logs: If you ever log environment variables, they might contain secrets (e.g., OPENAI_API_KEY, ANTHROPIC_API_KEY). 100 | 2. Conversation Logs: LLM or browser actions might contain personal info or private data from pages the agent visits. 101 | 102 | Recommendations 103 | 104 | 1. Scrub Sensitive Info: Use partial redaction to log environment variables or user data. 105 | 2. Control Log Levels: Keep debug logs for local dev; avoid them in production or store them in a secure location. 106 | 3. Never commit or print raw API keys or user credentials. 107 | 108 | 2.5 Environment Variables for API Keys 109 | 110 | Where 111 | 112 | - utils.py reads OPENAI_API_KEY, ANTHROPIC_API_KEY, AZURE_OPENAI_API_KEY, etc. 113 | 114 | Risks 115 | 116 | 1. Credentials Leak: Others might read if environment variables are insecurely stored or the machine is multi-tenant. 117 | 2. Rotation & Auditing: It is harder to rotate if you embed them in environment variables in multiple places. 118 | 119 | Recommendations 120 | 121 | 1. Use a Secret Manager: For production, store keys in Vault, AWS Secrets Manager, or a similar service, injecting them at runtime with minimal exposure. 122 | 2. Lock Down or Mask your environment variables in logs. 123 | 124 | 2.6 Handling of Cookies & Persisted Sessions 125 | 126 | Where 127 | 128 | - custom_context.py loads cookies from a file and reuses them if cookies_file is set. 129 | 130 | Risks 131 | 132 | 1. Cookie Theft: Cookies containing session tokens can be used to impersonate or access accounts. 133 | 2. Insecure Storage: If cookies_file is not locked down or is in a publicly accessible directory, attackers could read it. 134 | 135 | Recommendations 136 | 137 | 1. Encrypt or Secure the cookie file if it’s sensitive. 138 | 2. Use ephemeral sessions if you don’t need persistence (this mitigates the risk of session hijacking). 139 | 3. 
Handle JSON Errors gracefully. The code might crash if the cookie file is corrupted or maliciously edited. Currently, you catch some exceptions, but be sure they are robust. 140 | 141 | 2.7 LLM Output Execution 142 | 143 | Where 144 | 145 | - custom_agent.py uses the LLM output to determine subsequent actions in the browser. This is effectively arbitrary remote code controlling the browser if the LLM’s output is invalid. 146 | 147 | Risks 148 | 149 | 1. Prompt Injection or Malicious LLM Output: If an attacker can manipulate the prompt or the LLM’s instructions, they might cause harmful browsing actions (e.g., navigating to malicious pages, downloading malicious content, or exfiltrating data). 150 | 2. Excessive Trust: The agent automatically performs actions the LLM says. If the LLM is compromised or intentionally producing malicious JSON, your system might become an attack vector. 151 | 152 | Recommendations 153 | 154 | 1. Policy Layer: Before executing each action, you can add checks to ensure it’s within a set of “allowed” domains or “allowed action types.” 155 | 2. Safe Browsing: You could block navigation to known malicious or undesired domains. 156 | 3. Sandboxes: Run the browser in a locked-down Docker container or VM so the environment is contained even if the LLM instructs to visit a malicious link. 157 | 158 | 2.8 Untrusted Web Content & Vision 159 | 160 | Where 161 | 162 | - The agent uses optional “vision-based element detection” or page screenshots. 163 | 164 | Risks 165 | 166 | 1. Malicious Images: If the agent processes images from untrusted sources, ensure it’s safe from typical image library exploits (PIL is relatively safe, but keep it updated). 167 | 2. Screenshot capturing: If you store or send screenshots, you risk inadvertently capturing personal data or content. 168 | 169 | Recommendations 170 | 171 | 1. Use the Latest Libraries: Keep PIL (pillow) updated to avoid known vulnerabilities in image parsing. 172 | 2. 
Handle Storage: If you store screenshots, do so in secure, short-lived storage with restricted access. 173 | 174 | 3. Summary of Key Security Practices 175 | 176 | Based on the potential issues above, here’s a short checklist to ensure your system remains secure: 177 | 178 | 1. Networking & Ports: 179 | 180 | - Bind remote debugging to 127.0.0.1 only. 181 | - Use firewalls or SSH tunnels if remote access is necessary. 182 | 183 | 2. Sandboxing: 184 | 185 | - Use Docker or a VM for your automation environment. 186 | - Avoid --disable-web-security in production, or keep it in an isolated environment if you must use it. 187 | 188 | 3. Logging & Secrets: 189 | 190 | - Never log API keys or raw environment variables. 191 | - Redact sensitive info in logs. 192 | - Use a secret manager to store credentials. 193 | 194 | 4. Clipboard & Persistence: 195 | 196 | - Limit usage of system clipboard actions or implement an in-memory approach. 197 | - If session data/cookies are reused, ensure the file and directory permissions are locked down. 198 | 199 | 5. LLM Output Validation: 200 | 201 | - Consider a “policy layer” that checks which actions are allowed before executing them. 202 | - Consider domain safelisting or an interactive approval step in critical scenarios. 203 | 204 | 6. Error Handling: 205 | • - Gracefully handle invalid JSON, cookies, or environment variables. 206 | 207 | - Decide if you want to continue or fail fast with an error message. 208 | 209 | 7. Document your single-agent approach: 210 | 211 | - The singleton approach is fine if you never plan multiple concurrent agents. 212 | - Otherwise, remove it or ensure concurrency safety. 213 | 214 | 4. Verifying Project Structure 215 | 216 | From a structural standpoint: 217 | 218 | 1. Modular & Readable: Your project is decently modular: custom_agent, custom_browser, custom_context, custom_controller, custom_prompts, etc. 219 | 2. Dependencies: You rely on Playwright. 
async_api, pyperclip, requests, and custom browser_use and langchain_* modules. Ensure they are pinned to known-safe versions (e.g., in a requirements.txt) and kept updated. 220 | 3. Single vs. Multi Agent: In your README or main docs, clarify whether you run only one agent at a time or whether concurrency is in scope. 221 | 4. Deployment: If you distribute or deploy this server, outline the usage of environment variables, the required ports, and the recommended containerization approach. 222 | 223 | 5. Conclusion 224 | 225 | Your codebase is well-organized and functionally robust. The main security concerns revolve around: 226 | 227 | - **Remote Debugging & Disabling Security** in Chrome. 228 | - Clipboard & Cookie usage. 229 | - LLM output leading to potentially dangerous actions if not validated. 230 | - Logging & environment variables containing sensitive data. 231 | 232 | You can mitigate most of these risks by containerizing or VM-isolating your environment, restricting your debugging port to localhost, carefully handling credentials and logs, and implementing a minimal policy layer for LLM-driven actions. 233 | 234 | The project is in good shape, but you should document these security measures and carefully configure them, especially in environments other than internal development. 235 | 236 | Next Steps: 237 | 238 | - Implement or strengthen the recommended mitigation steps above. 239 | - Periodically review dependencies for security patches. 240 | - If this is a production-grade service, consider formal penetration testing or a threat model exercise to identify additional risks. 241 | - Keep documentation clear about the single-agent design and environment variables, and recommend using a container or ephemeral environment to prevent lateral movement or data exfiltration.

--------------------------------------------------------------------------------
/src/mcp_browser_use/agent/custom_agent.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import json
import logging
import traceback
from typing import Any, List, Optional, Type

import base64
import io
import os
from PIL import Image, ImageDraw, ImageFont

from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.service import Agent
from browser_use.agent.views import (
    ActionResult,
    AgentHistoryList,
    AgentOutput,
    AgentHistory,
)
from browser_use import BrowserSession
from browser_use.browser.views import BrowserStateHistory
from browser_use.controller.service import Controller
from browser_use.telemetry.views import AgentEndTelemetryEvent, AgentRunTelemetryEvent
from browser_use.utils import time_execution_async
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import BaseMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI

# NOTE(review): _convert_message_to_dict is a private langchain-openai helper
# (leading underscore); it may move or disappear between releases — confirm it
# still exists when upgrading langchain-openai.
from langchain_openai.chat_models.base import _convert_message_to_dict

from mcp_browser_use.utils.agent_state import AgentState
from mcp_browser_use.agent.custom_massage_manager import CustomMassageManager
from mcp_browser_use.agent.custom_views import CustomAgentOutput, CustomAgentStepInfo

logger = logging.getLogger(__name__)


class CustomAgent(Agent):
    """
    An AI-driven Agent that uses a language model to determine browser actions,
    interacts with a browser/page handle, and manages conversation history and
    state.
    """

    def __init__(
        self,
        task: str,
        llm: BaseChatModel,
        add_infos: str = "",
        browser_session: Optional[BrowserSession] = None,
        browser: Optional[BrowserSession] = None,
        browser_context: Optional[Any] = None,
        controller: Optional[Controller] = None,
        use_vision: bool = True,
        save_conversation_path: Optional[str] = None,
        max_failures: int = 5,
        retry_delay: int = 10,
        system_prompt_class: Type[SystemPrompt] = SystemPrompt,
        max_input_tokens: int = 13000,
        validate_output: bool = False,
        include_attributes: tuple[str, str, str, str, str, str, str, str, str, str] = (
            "title",
            "type",
            "name",
            "role",
            "tabindex",
            "aria-label",
            "placeholder",
            "value",
            "alt",
            "aria-expanded",
        ),
        max_error_length: int = 400,
        max_actions_per_step: int = 10,
        tool_call_in_content: bool = True,
        agent_state: Optional[AgentState] = None,
    ):
        """
        :param task: Main instruction or goal for the agent.
        :param llm: The large language model (BaseChatModel) used for reasoning.
        :param add_infos: Additional information or context to pass to the agent.
        :param browser_session: Optional browser/session instance (legacy name).
        :param browser: Preferred browser object for ``browser-use`` >= 0.7.
        :param browser_context: Optional active page/context to reuse.
        :param controller: Optional controller for handling multi-step actions. A new
            controller is created when not provided.
        :param use_vision: Whether to use vision-based element detection.
        :param save_conversation_path: File path to store conversation logs.
        :param max_failures: Max consecutive failures allowed before aborting.
        :param retry_delay: Delay between retries (not currently used).
        :param system_prompt_class: System prompt class for the agent.
        :param max_input_tokens: Token limit for model input.
        :param validate_output: Whether to validate final output at each step.
        :param include_attributes: HTML attributes to include in vision logic.
        :param max_error_length: Max length for error messages.
        :param max_actions_per_step: Limit the number of actions agent can perform per step.
        :param tool_call_in_content: Whether tool calls are in the raw model content.
        :param agent_state: Shared state to detect external stop signals, store last valid state, etc.
        """
        # Reuse a caller-supplied controller, otherwise create a fresh one.
        controller = controller or Controller()
        self.controller = controller

        # Preferred new-style "browser" wins over the legacy "browser_session".
        browser_handle = browser or browser_session

        init_kwargs: dict[str, Any] = {
            "task": task,
            "llm": llm,
            "controller": controller,
            "use_vision": use_vision,
            "save_conversation_path": save_conversation_path,
            "max_failures": max_failures,
            "retry_delay": retry_delay,
            "system_prompt_class": system_prompt_class,
            "max_input_tokens": max_input_tokens,
            "validate_output": validate_output,
            "include_attributes": include_attributes,
            "max_error_length": max_error_length,
            "max_actions_per_step": max_actions_per_step,
            "tool_call_in_content": tool_call_in_content,
        }

        if browser_handle is not None:
            init_kwargs["browser"] = browser_handle

        # The active page/context (if any) is forwarded under the "page" kwarg.
        if browser_context is not None:
            init_kwargs["page"] = browser_context

        # Compatibility shim: the base Agent's constructor kwargs have been
        # renamed across browser-use releases ("browser" <-> "browser_session",
        # "controller" <-> "tools", and "page" is not always accepted). Each
        # TypeError naming an unexpected kwarg triggers exactly one rename (or
        # removal, for "page") and a retry; 4 iterations bound the possible
        # translations. Any other TypeError is re-raised unchanged.
        for _ in range(4):
            try:
                super().__init__(**init_kwargs)
                break
            except TypeError as exc:  # pragma: no cover - defensive compatibility
                message = str(exc)
                if (
                    "unexpected keyword argument 'browser'" in message
                    and "browser" in init_kwargs
                ):
                    browser_value = init_kwargs.pop("browser")
                    if browser_value is not None:
                        init_kwargs.setdefault("browser_session", browser_value)
                    continue
                if (
                    "unexpected keyword argument 'browser_session'" in message
                    and "browser_session" in init_kwargs
                ):
                    browser_value = init_kwargs.pop("browser_session")
                    if browser_value is not None:
                        init_kwargs.setdefault("browser", browser_value)
                    continue
                if (
                    "unexpected keyword argument 'page'" in message
                    and "page" in init_kwargs
                ):
                    # No alternate name for "page": simply drop it and retry.
                    init_kwargs.pop("page")
                    continue
                if (
                    "unexpected keyword argument 'controller'" in message
                    and "controller" in init_kwargs
                ):
                    controller_value = init_kwargs.pop("controller")
                    init_kwargs.setdefault("tools", controller_value)
                    continue
                if (
                    "unexpected keyword argument 'tools'" in message
                    and "tools" in init_kwargs
                ):
                    controller_value = init_kwargs.pop("tools")
                    init_kwargs.setdefault("controller", controller_value)
                    continue
                raise
        else:  # pragma: no cover - should never happen
            # for/else: reached only when the loop never hit `break`, i.e. no
            # kwarg translation made the base constructor accept our arguments.
            raise TypeError("Unable to initialise base Agent with provided arguments")
        self.add_infos = add_infos
        self.agent_state = agent_state

        # Custom message manager (note: "Massage" is the project's spelling).
        # Replaces the base class's manager so prompt construction uses this
        # agent's controller actions and truncation settings.
        self.message_manager = CustomMassageManager(
            llm=self.llm,
            task=self.task,
            action_descriptions=self.controller.registry.get_prompt_description(),
            system_prompt_class=self.system_prompt_class,
            max_input_tokens=self.max_input_tokens,
            include_attributes=self.include_attributes,
            max_error_length=self.max_error_length,
            max_actions_per_step=self.max_actions_per_step,
            tool_call_in_content=tool_call_in_content,
        )

    def _setup_action_models(self) -> None:
        """
        Setup dynamic action models from the controller's registry.
        This ensures the agent's output schema matches all possible actions.
194 | """ 195 | # Get the dynamic action model from controller's registry 196 | self.ActionModel = self.controller.registry.create_action_model() 197 | # Create output model with the dynamic actions 198 | self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel) 199 | 200 | def _log_response(self, response: CustomAgentOutput) -> None: 201 | """ 202 | Log the model's response in a human-friendly way. 203 | Shows success/fail state, memory, thought, summary, etc. 204 | """ 205 | evaluation = response.current_state.prev_action_evaluation or "" 206 | if "Success" in evaluation: 207 | emoji = "✅" 208 | elif "Failed" in evaluation: 209 | emoji = "❌" 210 | else: 211 | emoji = "🤷" 212 | 213 | logger.info(f"{emoji} Eval: {evaluation}") 214 | logger.info(f"🧠 New Memory: {response.current_state.important_contents}") 215 | logger.info(f"⏳ Task Progress: {response.current_state.completed_contents}") 216 | logger.info(f"🤔 Thought: {response.current_state.thought}") 217 | logger.info(f"🎯 Summary: {response.current_state.summary}") 218 | 219 | for i, action in enumerate(response.action): 220 | logger.info( 221 | f"🛠️ Action {i + 1}/{len(response.action)}: " 222 | f"{action.model_dump_json(exclude_unset=True)}" 223 | ) 224 | 225 | def update_step_info( 226 | self, 227 | model_output: CustomAgentOutput, 228 | step_info: Optional[CustomAgentStepInfo] = None, 229 | ) -> None: 230 | """ 231 | Update the current step with new memory and completed contents. 232 | 233 | :param model_output: Parsed output from the LLM. 234 | :param step_info: Step information object, if any. 
235 | """ 236 | if step_info is None: 237 | return 238 | 239 | step_info.step_number += 1 240 | important_contents = model_output.current_state.important_contents 241 | if ( 242 | important_contents 243 | and "None" not in important_contents 244 | and important_contents not in step_info.memory 245 | ): 246 | step_info.memory += important_contents + "\n" 247 | 248 | completed_contents = model_output.current_state.completed_contents 249 | if completed_contents and "None" not in completed_contents: 250 | step_info.task_progress = completed_contents 251 | 252 | @time_execution_async("--get_next_action") 253 | async def get_next_action(self, input_messages: List[BaseMessage]) -> AgentOutput: 254 | """ 255 | Get the next action from the LLM, attempting structured output parsing. 256 | Falls back to manual JSON parsing if structured parse fails. 257 | """ 258 | logger.info("Getting next action from LLM") 259 | logger.debug(f"Input messages: {input_messages}") 260 | 261 | try: 262 | if isinstance(self.llm, ChatOpenAI): 263 | # For OpenAI, attempt structured parse with "instructor" first 264 | parsed_output = await self._handle_openai_structured_output( 265 | input_messages 266 | ) 267 | else: 268 | logger.info(f"Using non-OpenAI model: {type(self.llm).__name__}") 269 | parsed_output = await self._handle_non_openai_structured_output( 270 | input_messages 271 | ) 272 | 273 | self._truncate_and_log_actions(parsed_output) 274 | self.n_steps += 1 275 | return parsed_output 276 | 277 | except Exception as e: 278 | logger.warning(f"Error getting structured output: {str(e)}") 279 | logger.info("Attempting fallback to manual parsing") 280 | return await self._fallback_parse(input_messages) 281 | 282 | async def _handle_openai_structured_output( 283 | self, input_messages: List[BaseMessage] 284 | ) -> AgentOutput: 285 | """ 286 | Attempt to get structured output from an OpenAI LLM 287 | using the 'instructor' library. 
If that fails, fallback 288 | to the default structured output approach. 289 | """ 290 | logger.info("Using OpenAI chat model") 291 | # Usually safe to import here to avoid circular import issues 292 | from instructor import from_openai 293 | 294 | try: 295 | client = from_openai(self.llm.root_async_client) 296 | logger.debug(f"Using model: {self.llm.model_name}") 297 | messages = [_convert_message_to_dict(msg) for msg in input_messages] 298 | 299 | parsed_response = await client.chat.completions.create( 300 | messages=messages, 301 | model=self.llm.model_name, 302 | response_model=self.AgentOutput, 303 | ) 304 | logger.debug(f"Raw OpenAI response: {parsed_response}") 305 | 306 | return parsed_response 307 | 308 | except Exception as e: 309 | # Attempt default structured output if instructor fails 310 | logger.error(f"Error with 'instructor' approach: {str(e)}") 311 | logger.info("Using default structured output approach.") 312 | 313 | structured_llm = self.llm.with_structured_output( 314 | self.AgentOutput, include_raw=True 315 | ) 316 | response: dict[str, Any] = await structured_llm.ainvoke(input_messages) 317 | logger.debug(f"Raw LLM response (default approach): {response}") 318 | return response["parsed"] # type: ignore 319 | 320 | async def _handle_non_openai_structured_output( 321 | self, input_messages: List[BaseMessage] 322 | ) -> AgentOutput: 323 | """ 324 | For non-OpenAI models, we directly use the structured LLM approach. 325 | """ 326 | structured_llm = self.llm.with_structured_output( 327 | self.AgentOutput, include_raw=True 328 | ) 329 | response: dict[str, Any] = await structured_llm.ainvoke(input_messages) 330 | logger.debug(f"Raw LLM response: {response}") 331 | return response["parsed"] # type: ignore 332 | 333 | async def _fallback_parse(self, input_messages: List[BaseMessage]) -> AgentOutput: 334 | """ 335 | Manual JSON parsing fallback if structured parse fails. 336 | Tries to extract JSON from the raw text and parse into AgentOutput. 
337 | """ 338 | try: 339 | ret = await self.llm.ainvoke(input_messages) 340 | logger.debug(f"Raw fallback response: {ret}") 341 | 342 | content = ret.content 343 | if isinstance(content, list): 344 | # If content is a list, parse from the first element 345 | parsed_json = json.loads( 346 | content[0].replace("```json", "").replace("```", "") 347 | ) 348 | else: 349 | # Otherwise parse from the string 350 | parsed_json = json.loads( 351 | content.replace("```json", "").replace("```", "") 352 | ) 353 | 354 | parsed_output: AgentOutput = self.AgentOutput(**parsed_json) 355 | if parsed_output is None: 356 | raise ValueError("Could not parse fallback response.") 357 | 358 | self._truncate_and_log_actions(parsed_output) 359 | self.n_steps += 1 360 | logger.info( 361 | f"Successfully got next action via fallback. Step count: {self.n_steps}" 362 | ) 363 | return parsed_output 364 | 365 | except Exception as parse_error: 366 | logger.error(f"Fallback parsing failed: {str(parse_error)}") 367 | raise 368 | 369 | def _truncate_and_log_actions(self, parsed_output: AgentOutput) -> None: 370 | """ 371 | Enforce the max_actions_per_step limit and log the response. 372 | """ 373 | original_action_count = len(parsed_output.action) 374 | parsed_output.action = parsed_output.action[: self.max_actions_per_step] 375 | if original_action_count > self.max_actions_per_step: 376 | logger.warning( 377 | f"Truncated actions from {original_action_count} to {self.max_actions_per_step}" 378 | ) 379 | self._log_response(parsed_output) 380 | 381 | def summarize_messages(self) -> bool: 382 | """ 383 | Summarize message history if it exceeds 5 messages. 384 | Returns True if summarization occurred, False otherwise. 
385 | """ 386 | stored_messages = self.message_manager.get_messages() 387 | message_count = len(stored_messages) 388 | 389 | if message_count <= 5: 390 | logger.debug("Message count <= 5, skipping summarization") 391 | return False 392 | 393 | logger.info(f"Summarizing {message_count} messages") 394 | try: 395 | summarization_prompt = ChatPromptTemplate.from_messages( 396 | [ 397 | MessagesPlaceholder(variable_name="chat_history"), 398 | ( 399 | "user", 400 | "Distill the above chat messages into a single summary message. " 401 | "Include as many specific details as you can.", 402 | ), 403 | ] 404 | ) 405 | summarization_chain = summarization_prompt | self.llm 406 | 407 | summary_message = summarization_chain.invoke( 408 | {"chat_history": stored_messages} 409 | ) 410 | logger.debug(f"Generated summary: {summary_message}") 411 | 412 | self.message_manager.reset_history() 413 | self.message_manager._add_message_with_tokens( 414 | summary_message 415 | ) # Consider creating a public method for this 416 | return True 417 | 418 | except Exception as e: 419 | logger.error(f"Error during message summarization: {str(e)}") 420 | logger.debug(f"Full traceback: {traceback.format_exc()}") 421 | return False 422 | 423 | @time_execution_async("--execute-agent-step") 424 | async def execute_agent_step( 425 | self, step_info: Optional[CustomAgentStepInfo] = None 426 | ) -> None: 427 | """ 428 | Execute a single agent step of the task: 429 | 1) Capture browser state 430 | 2) Query LLM for next action 431 | 3) Execute that action(s) 432 | 4) Update logs/history 433 | """ 434 | logger.info(f"\n📍 Step {self.n_steps}") 435 | logger.info(f"History token count: {self.message_manager.history.total_tokens}") 436 | 437 | # Optionally summarize to reduce token usage 438 | # self.summarize_messages() 439 | 440 | state = None 441 | model_output = None 442 | result: List[ActionResult] = [] 443 | 444 | try: 445 | try: 446 | state = await 
self.browser_context.get_state(use_vision=self.use_vision) 447 | except TypeError: 448 | logger.warning( 449 | "get_state does not support 'use_vision' argument, falling back." 450 | ) 451 | state = await self.browser_context.get_state() 452 | self.message_manager.add_state_message(state, self._last_result, step_info) 453 | input_messages = self.message_manager.get_messages() 454 | 455 | model_output = await self.get_next_action(input_messages) 456 | self.update_step_info(model_output, step_info) 457 | logger.info(f"🧠 All Memory: {getattr(step_info, 'memory', '')}") 458 | 459 | self._save_conversation(input_messages, model_output) 460 | # Remove the last state message from chat history to prevent bloat 461 | self.message_manager._remove_last_state_message() 462 | self.message_manager.add_model_output(model_output) 463 | 464 | # Execute the requested actions 465 | result = await self.controller.multi_act( 466 | model_output.action, self.browser_context 467 | ) 468 | self._last_result = result 469 | 470 | # If the last action indicates "is_done", we can log the extracted content 471 | if len(result) > 0 and result[-1].is_done: 472 | logger.info(f"📄 Result: {result[-1].extracted_content}") 473 | 474 | self.consecutive_failures = 0 475 | 476 | except Exception as e: 477 | result = self._handle_step_error(e) 478 | self._last_result = result 479 | 480 | finally: 481 | if not result: 482 | return 483 | 484 | for r in result: 485 | logger.warning(f"🔧 Action result: {r}") 486 | 487 | if state: 488 | self._make_history_item(model_output, state, result) 489 | 490 | def create_history_gif( 491 | self, 492 | output_path: str = "agent_history.gif", 493 | duration: int = 3000, 494 | show_goals: bool = True, 495 | show_task: bool = True, 496 | show_logo: bool = False, 497 | font_size: int = 40, 498 | title_font_size: int = 56, 499 | goal_font_size: int = 44, 500 | margin: int = 40, 501 | line_spacing: float = 1.5, 502 | ) -> None: 503 | """ 504 | Create a GIF from the agent's 
history using the captured screenshots. 505 | Overlays text for tasks/goals. Optionally includes a logo. 506 | """ 507 | if not self.history.history: 508 | logger.warning("No history to create GIF from") 509 | return 510 | 511 | if not self.history.history[0].state.screenshot: 512 | logger.warning( 513 | "No screenshots in the first history item; cannot create GIF" 514 | ) 515 | return 516 | 517 | images = [] 518 | try: 519 | # Attempt to load some preferred fonts 520 | font_options = ["Helvetica", "Arial", "DejaVuSans", "Verdana"] 521 | regular_font, title_font, goal_font = None, None, None 522 | font_loaded = False 523 | 524 | for font_name in font_options: 525 | try: 526 | import platform 527 | 528 | if platform.system() == "Windows": 529 | # On Windows, we may need absolute font paths 530 | font_name = os.path.join( 531 | os.getenv("WIN_FONT_DIR", "C:\\Windows\\Fonts"), 532 | font_name + ".ttf", 533 | ) 534 | 535 | regular_font = ImageFont.truetype(font_name, font_size) 536 | title_font = ImageFont.truetype(font_name, title_font_size) 537 | goal_font = ImageFont.truetype(font_name, goal_font_size) 538 | font_loaded = True 539 | break 540 | except OSError: 541 | continue 542 | 543 | if not font_loaded: 544 | raise OSError("No preferred fonts found") 545 | 546 | except OSError: 547 | # Fallback to default 548 | regular_font = ImageFont.load_default() 549 | title_font = regular_font 550 | goal_font = regular_font 551 | 552 | logo = None 553 | if show_logo: 554 | try: 555 | logo = Image.open("./static/browser-use.png") 556 | # Resize logo 557 | logo_height = 150 558 | aspect_ratio = logo.width / logo.height 559 | logo_width = int(logo_height * aspect_ratio) 560 | logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS) 561 | except Exception as e: 562 | logger.warning(f"Could not load logo: {e}") 563 | 564 | # If requested, create an initial frame with the entire task 565 | if show_task and self.task: 566 | task_frame = self._create_task_frame( 567 | 
self.task, 568 | self.history.history[0].state.screenshot, 569 | title_font, 570 | regular_font, 571 | logo, 572 | line_spacing, 573 | ) 574 | images.append(task_frame) 575 | 576 | # Convert each step’s screenshot 577 | for i, item in enumerate(self.history.history, 1): 578 | if not item.state.screenshot: 579 | continue 580 | 581 | img_data = base64.b64decode(item.state.screenshot) 582 | image = Image.open(io.BytesIO(img_data)) 583 | 584 | if show_goals and item.model_output: 585 | image = self._add_overlay_to_image( 586 | image=image, 587 | step_number=i, 588 | goal_text=item.model_output.current_state.thought, 589 | regular_font=regular_font, 590 | title_font=title_font, 591 | margin=margin, 592 | logo=logo, 593 | line_spacing=line_spacing, 594 | ) 595 | 596 | images.append(image) 597 | 598 | if images: 599 | images[0].save( 600 | output_path, 601 | save_all=True, 602 | append_images=images[1:], 603 | duration=duration, 604 | loop=0, 605 | optimize=False, 606 | ) 607 | logger.info(f"Created GIF at {output_path}") 608 | else: 609 | logger.warning("No images found in history to create GIF") 610 | 611 | def _create_task_frame( 612 | self, 613 | task_text: str, 614 | screenshot_b64: str, 615 | title_font: ImageFont.FreeTypeFont, 616 | regular_font: ImageFont.FreeTypeFont, 617 | logo: Image.Image | None, 618 | line_spacing: float, 619 | ) -> Image.Image: 620 | """Return an image with the task text overlaid on the screenshot.""" 621 | 622 | margin = 40 623 | img = Image.open(io.BytesIO(base64.b64decode(screenshot_b64))).convert("RGBA") 624 | 625 | overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) 626 | draw = ImageDraw.Draw(overlay) 627 | 628 | max_width = img.width - margin * 2 629 | text_lines: list[str] = self._wrap_text_to_lines( 630 | draw, task_text, regular_font, max_width 631 | ) 632 | 633 | y = margin 634 | title_bbox = draw.textbbox((margin, y), "Task", font=title_font) 635 | title_height = title_bbox[3] - title_bbox[1] 636 | total_height = title_height + 
int(margin * 0.5) 637 | for t in text_lines: 638 | bbox = draw.textbbox((margin, 0), t, font=regular_font) 639 | total_height += int((bbox[3] - bbox[1]) * line_spacing) 640 | 641 | if logo: 642 | total_height = max(total_height, logo.height + margin * 2) 643 | 644 | draw.rectangle( 645 | [(0, 0), (img.width, total_height)], 646 | fill=(0, 0, 0, 180), 647 | ) 648 | 649 | draw.text((margin, y), "Task", font=title_font, fill="white") 650 | y += title_height + int(margin * 0.5) 651 | for t in text_lines: 652 | draw.text((margin, y), t, font=regular_font, fill="white") 653 | bbox = draw.textbbox((margin, y), t, font=regular_font) 654 | y += int((bbox[3] - bbox[1]) * line_spacing) 655 | 656 | if logo: 657 | overlay.paste( 658 | logo, 659 | (img.width - logo.width - margin, margin), 660 | logo if logo.mode == "RGBA" else None, 661 | ) 662 | 663 | img.alpha_composite(overlay) 664 | return img.convert("RGB") 665 | 666 | def _wrap_text_to_lines( 667 | self, 668 | draw: ImageDraw.ImageDraw, 669 | text: str, 670 | font: ImageFont.FreeTypeFont, 671 | max_width: int, 672 | ) -> list[str]: 673 | """Split ``text`` into lines that fit within ``max_width`` pixels.""" 674 | 675 | if not text: 676 | return [] 677 | 678 | if max_width <= 0: 679 | return [text] 680 | 681 | wrapped_lines: list[str] = [] 682 | 683 | lines = text.splitlines() 684 | if not lines: 685 | lines = [text] 686 | 687 | for raw_line in lines: 688 | words = raw_line.split() 689 | if not words: 690 | wrapped_lines.append("") 691 | continue 692 | 693 | current_line = words[0] 694 | for word in words[1:]: 695 | candidate = f"{current_line} {word}" if current_line else word 696 | if draw.textlength(candidate, font=font) <= max_width: 697 | current_line = candidate 698 | else: 699 | wrapped_lines.append(current_line) 700 | current_line = word 701 | 702 | wrapped_lines.append(current_line) 703 | 704 | return wrapped_lines 705 | 706 | def _add_overlay_to_image( 707 | self, 708 | image: Image.Image, 709 | step_number: int, 
710 | goal_text: str, 711 | regular_font: ImageFont.FreeTypeFont, 712 | title_font: ImageFont.FreeTypeFont, 713 | margin: int, 714 | logo: Image.Image | None, 715 | line_spacing: float, # Added line_spacing parameter 716 | ) -> Image.Image: 717 | """Overlay the step number and goal text onto a screenshot image.""" 718 | 719 | image = image.convert("RGBA") 720 | overlay = Image.new("RGBA", image.size, (0, 0, 0, 0)) 721 | draw = ImageDraw.Draw(overlay) 722 | 723 | step_text = f"Step {step_number}" 724 | max_width = image.width - margin * 2 725 | 726 | lines: list[str] = [] 727 | words = goal_text.split() 728 | line = "" 729 | for word in words: 730 | test = f"{line} {word}".strip() 731 | if draw.textlength(test, font=regular_font) <= max_width: 732 | line = test 733 | else: 734 | lines.append(line) 735 | line = word 736 | if line: 737 | lines.append(line) 738 | 739 | y = margin 740 | step_bbox = draw.textbbox((margin, y), step_text, font=title_font) 741 | step_height = step_bbox[3] - step_bbox[1] 742 | total_height = step_height + int(margin * 0.5) 743 | for l in lines: 744 | bbox = draw.textbbox((margin, 0), l, font=regular_font) 745 | total_height += bbox[3] - bbox[1] 746 | 747 | if logo: 748 | total_height = max(total_height, logo.height + margin * 2) 749 | 750 | draw.rectangle( 751 | [(0, 0), (image.width, total_height)], 752 | fill=(0, 0, 0, 180), 753 | ) 754 | 755 | draw.text((margin, y), step_text, font=title_font, fill="white") 756 | y += step_height + int(margin * 0.5) 757 | for l in lines: 758 | draw.text((margin, y), l, font=regular_font, fill="white") 759 | bbox = draw.textbbox((margin, y), l, font=regular_font) 760 | y += bbox[3] - bbox[1] 761 | 762 | if logo: 763 | overlay.paste( 764 | logo, 765 | (image.width - logo.width - margin, margin), 766 | logo if logo.mode == "RGBA" else None, 767 | ) 768 | 769 | image.alpha_composite(overlay) 770 | return image.convert("RGB") 771 | 772 | async def execute_agent_task(self, max_steps: int = 100) -> 
AgentHistoryList: 773 | """ 774 | Execute the entire agent task for up to max_steps or until 'done'. 775 | Checks for external stop signals and logs each step in self.history. 776 | """ 777 | try: 778 | logger.info(f"🚀 Starting task: {self.task}") 779 | self.telemetry.capture( 780 | AgentRunTelemetryEvent( 781 | agent_id=self.agent_id, 782 | task=self.task, 783 | ) 784 | ) 785 | 786 | step_info = CustomAgentStepInfo( 787 | task=self.task, 788 | add_infos=self.add_infos, 789 | step_number=1, 790 | max_steps=max_steps, 791 | memory="", 792 | task_progress="", 793 | ) 794 | 795 | for step in range(max_steps): 796 | # 1) Check if stop requested externally 797 | if self.agent_state and self.agent_state.is_stop_requested(): 798 | logger.info("🛑 Stop requested by user") 799 | self._create_stop_history_item() 800 | break 801 | 802 | # 2) Store last valid state 803 | if self.browser_context and self.agent_state: 804 | state = await self.browser_context.get_state( 805 | use_vision=self.use_vision 806 | ) 807 | self.agent_state.set_last_valid_state(state) 808 | 809 | # 3) Check for too many failures 810 | if self._too_many_failures(): 811 | break 812 | 813 | # 4) Execute one detailed agent step 814 | await self.execute_agent_step(step_info) 815 | 816 | if self.history.is_done(): 817 | if self.validate_output and step < max_steps - 1: 818 | # Optionally validate final output 819 | if not await self._validate_output(): 820 | continue 821 | logger.info("✅ Task completed successfully") 822 | break 823 | else: 824 | logger.info("❌ Failed to complete task within maximum steps") 825 | 826 | return self.history 827 | 828 | finally: 829 | self.telemetry.capture( 830 | AgentEndTelemetryEvent( 831 | agent_id=self.agent_id, 832 | task=self.task, 833 | success=self.history.is_done(), 834 | steps=len(self.history.history), 835 | ) 836 | ) 837 | # Close the browser context if we created it here (not injected) 838 | if not self.injected_browser_context and self.browser_context: 839 | await 
self.browser_context.close() 840 | 841 | # Close the browser instance if it wasn't injected 842 | if not self.injected_browser and self.browser: 843 | await self.browser.close() 844 | 845 | # Generate a GIF of the agent's run if enabled 846 | if self.generate_gif: 847 | self.create_history_gif() 848 | 849 | def _create_stop_history_item(self) -> None: 850 | """ 851 | Create a final 'stop' history item indicating the agent has halted by request. 852 | """ 853 | try: 854 | state = None 855 | if self.agent_state: 856 | last_state = self.agent_state.get_last_valid_state() 857 | if last_state: 858 | state = self._convert_to_browser_state_history(last_state) 859 | else: 860 | state = self._create_empty_state() 861 | else: 862 | state = self._create_empty_state() 863 | 864 | stop_history = AgentHistory( 865 | model_output=None, 866 | state=state, 867 | result=[ActionResult(extracted_content=None, error=None, is_done=True)], 868 | ) 869 | self.history.history.append(stop_history) 870 | 871 | except Exception as e: 872 | logger.error(f"Error creating stop history item: {e}") 873 | state = self._create_empty_state() 874 | stop_history = AgentHistory( 875 | model_output=None, 876 | state=state, 877 | result=[ActionResult(extracted_content=None, error=None, is_done=True)], 878 | ) 879 | self.history.history.append(stop_history) 880 | 881 | def _convert_to_browser_state_history( 882 | self, browser_state: Any 883 | ) -> BrowserStateHistory: 884 | """ 885 | Convert a raw browser_state object into a BrowserStateHistory dataclass. 886 | """ 887 | return BrowserStateHistory( 888 | url=getattr(browser_state, "url", ""), 889 | title=getattr(browser_state, "title", ""), 890 | tabs=getattr(browser_state, "tabs", []), 891 | interacted_element=[None], 892 | screenshot=getattr(browser_state, "screenshot", None), 893 | ) 894 | 895 | def _create_empty_state(self) -> BrowserStateHistory: 896 | """ 897 | Create a basic empty state for fallback or stop-history usage. 
898 | """ 899 | return BrowserStateHistory( 900 | url="", title="", tabs=[], interacted_element=[None], screenshot=None 901 | ) 902 | --------------------------------------------------------------------------------