├── src
└── mcp_browser_use
│ ├── agent
│ ├── __init__.py
│ ├── custom_views.py
│ ├── custom_massage_manager.py
│ ├── custom_prompts.py
│ └── custom_agent.py
│ ├── browser
│ ├── __init__.py
│ └── browser_manager.py
│ ├── utils
│ ├── __init__.py
│ ├── logging.py
│ ├── agent_state.py
│ └── utils.py
│ ├── controller
│ ├── __init__.py
│ └── custom_controller.py
│ ├── mcp_browser_use.py
│ ├── __init__.py
│ ├── client.py
│ └── server.py
├── tests
├── stubs
│ ├── browser_use
│ │ ├── agent
│ │ │ ├── prompts.py
│ │ │ ├── message_manager
│ │ │ │ ├── service.py
│ │ │ │ └── views.py
│ │ │ ├── service.py
│ │ │ └── views.py
│ │ ├── browser
│ │ │ ├── browser.py
│ │ │ ├── events.py
│ │ │ ├── __init__.py
│ │ │ ├── profile.py
│ │ │ ├── context.py
│ │ │ └── views.py
│ │ ├── controller
│ │ │ ├── registry
│ │ │ │ └── views.py
│ │ │ └── service.py
│ │ ├── telemetry
│ │ │ └── views.py
│ │ ├── utils.py
│ │ └── __init__.py
│ ├── langchain_openai
│ │ ├── chat_models
│ │ │ ├── base.py
│ │ │ └── __init__.py
│ │ └── __init__.py
│ ├── langchain_core
│ │ ├── messages
│ │ │ └── __init__.py
│ │ ├── language_models
│ │ │ ├── __init__.py
│ │ │ └── chat_models.py
│ │ └── prompts
│ │ │ └── __init__.py
│ └── PIL
│ │ └── __init__.py
├── test_agent_state.py
├── test_logging_configuration.py
├── conftest.py
├── test_gif_creation.py
├── test_browser_manager.py
├── test_custom_agent_controller.py
├── test_summarize_messages.py
├── test_client_session.py
└── test_utils.py
├── .gitattributes
├── renovate.json
├── .editorconfig
├── pyproject.toml
├── Dockerfile
├── .gitignore
├── sample.env.env
├── smithery.yaml
├── README.md
└── documentation
├── CONFIGURATION.md
└── SECURITY.md
/src/mcp_browser_use/agent/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/browser/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/controller/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/agent/prompts.py:
--------------------------------------------------------------------------------
class SystemPrompt:
    """Empty test stub that only provides the ``SystemPrompt`` name."""
3 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/browser/browser.py:
--------------------------------------------------------------------------------
class Browser:
    """Empty test stub that only provides the ``Browser`` name."""
3 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/controller/registry/views.py:
--------------------------------------------------------------------------------
class ActionModel:
    """Empty test stub that only provides the ``ActionModel`` name."""
3 |
--------------------------------------------------------------------------------
/tests/stubs/langchain_openai/chat_models/base.py:
--------------------------------------------------------------------------------
def _convert_message_to_dict(x):
    """Stub converter: ignore the message ``x`` and return a new empty dict.

    Written as a ``def`` rather than a lambda bound to a name (PEP 8, E731)
    so the function carries a real ``__name__`` in tracebacks.
    """
    return {}
2 |
--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 | "extends": ["config:recommended"]
4 | }
5 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/agent/message_manager/service.py:
--------------------------------------------------------------------------------
class MessageManager:
    """Test stub whose constructor accepts any arguments and stores nothing."""

    def __init__(self, *args, **kwargs):
        """Accept arbitrary positional and keyword arguments and discard them."""
        return None
4 |
--------------------------------------------------------------------------------
/tests/stubs/langchain_openai/__init__.py:
--------------------------------------------------------------------------------
1 | from .chat_models import AzureChatOpenAI, ChatOpenAI
2 |
3 | __all__ = ["ChatOpenAI", "AzureChatOpenAI"]
4 |
--------------------------------------------------------------------------------
/tests/stubs/langchain_core/messages/__init__.py:
--------------------------------------------------------------------------------
class BaseMessage:
    """Empty stub message class."""


class HumanMessage:
    """Empty stub message class (does not derive from ``BaseMessage``)."""


class AIMessage:
    """Empty stub message class (does not derive from ``BaseMessage``)."""


class SystemMessage:
    """Empty stub message class (does not derive from ``BaseMessage``)."""
5 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/agent/service.py:
--------------------------------------------------------------------------------
class Agent:
    """Test stub agent exposing only ``history`` and ``generate_gif``."""

    def __init__(self, *args, **kwargs):
        """Ignore every argument except an optional ``history`` keyword."""
        # GIF generation is always off in the stub.
        self.generate_gif = False
        # ``dict.get`` already yields None when the keyword is absent.
        self.history = kwargs.get("history")
5 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/telemetry/views.py:
--------------------------------------------------------------------------------
class AgentEndTelemetryEvent:
    """Test stub telemetry event; constructor arguments are discarded."""

    def __init__(self, *args, **kwargs):
        """Accept anything and store nothing."""
        return None
class AgentRunTelemetryEvent:
    """Test stub telemetry event; constructor arguments are discarded."""

    def __init__(self, *args, **kwargs):
        """Accept anything and store nothing."""
        return None
7 |
--------------------------------------------------------------------------------
/tests/stubs/langchain_core/language_models/__init__.py:
--------------------------------------------------------------------------------
class BaseChatModel:
    """Test stub chat model with no-op structured-output and invoke hooks."""

    async def ainvoke(self, *args, **kwargs):
        """Ignore all arguments and resolve to an empty dict."""
        return {}

    def with_structured_output(self, *args, **kwargs):
        """Ignore all arguments and hand back this same instance."""
        return self
6 |
--------------------------------------------------------------------------------
/tests/stubs/langchain_core/language_models/chat_models.py:
--------------------------------------------------------------------------------
class BaseChatModel:
    """Test stub chat model with no-op invoke and structured-output hooks."""

    def with_structured_output(self, *args, **kwargs):
        """Ignore all arguments and hand back this same instance."""
        return self

    async def ainvoke(self, *args, **kwargs):
        """Ignore all arguments and resolve to an empty dict."""
        return {}
6 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/utils.py:
--------------------------------------------------------------------------------
def time_execution_async(name):
    """Return a pass-through decorator for coroutine functions.

    Stub version: ``name`` is accepted but unused, and no timing is done.
    The wrapper awaits the decorated coroutine with the caller's arguments
    and returns its result.

    The wrapper is built with ``functools.wraps`` so the decorated coroutine
    keeps its ``__name__``, ``__doc__`` and ``__wrapped__`` metadata instead
    of showing up as ``wrapper``.
    """
    # Local import: this stub module has no import block of its own.
    import functools

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            return await func(*args, **kwargs)

        return wrapper

    return decorator
9 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/browser/events.py:
--------------------------------------------------------------------------------
class SendKeysEvent:
    """Test stub event that records the key string it was created with."""

    def __init__(self, keys: str):
        """Store ``keys`` unchanged on the instance."""
        setattr(self, "keys", keys)
4 |
5 |
class ScreenshotEvent:
    """Test stub event that records whether a full-page capture was asked for."""

    def __init__(self, full_page: bool = False):
        """Store ``full_page`` unchanged on the instance (defaults to False)."""
        setattr(self, "full_page", full_page)
9 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/mcp_browser_use.py:
--------------------------------------------------------------------------------
1 | """Public entry-points for backwards compatible imports."""
2 |
3 | from __future__ import annotations
4 |
5 | from .client import AgentNotRegisteredError, create_client_session
6 |
7 | __all__ = ["AgentNotRegisteredError", "create_client_session"]
8 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/browser/__init__.py:
--------------------------------------------------------------------------------
1 | from .. import BrowserSession as Browser # noqa: F401
2 | from .events import SendKeysEvent # noqa: F401
3 | from .profile import BrowserProfile, ProxySettings # noqa: F401
4 |
5 | __all__ = ["Browser", "BrowserProfile", "ProxySettings", "SendKeysEvent"]
6 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/agent/message_manager/views.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Any, List
3 |
@dataclass
class MessageHistory:
    """Test stub holding a message list and a token counter."""

    # Each instance gets its own fresh list.
    messages: List[Any] = field(default_factory=list)
    # Token counter; starts at zero.
    total_tokens: int = 0
8 |
@dataclass
class ManagedMessage:
    """Test stub wrapping a single message object."""

    # Required; the stub places no constraint on its type.
    message: Any
12 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/browser/profile.py:
--------------------------------------------------------------------------------
class ProxySettings:
    """Test stub that exposes every keyword argument as an instance attribute."""

    def __init__(self, **kwargs):
        """Copy each ``name=value`` keyword onto the instance."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])
5 |
6 |
class BrowserProfile:
    """Test stub that exposes every keyword argument as an instance attribute."""

    def __init__(self, **kwargs):
        """Copy each ``name=value`` keyword onto the instance."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])
11 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/browser/context.py:
--------------------------------------------------------------------------------
class BrowserContextConfig:
    """Test stub that exposes every keyword argument as an instance attribute."""

    def __init__(self, **kwargs):
        """Copy each ``name=value`` keyword onto the instance."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])
5 |
6 |
class BrowserContext:
    """Test stub browser context whose coroutines do nothing."""

    async def close(self):
        """No-op; resolves to None."""
        return None

    async def get_state(self, *args, **kwargs):
        """Ignore all arguments; resolves to None."""
        return None
13 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/browser/views.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | @dataclass
4 | class BrowserStateHistory:
5 | url: str = ""
6 | title: str = ""
7 | tabs: list = None
8 | interacted_element: list = None
9 | screenshot: str | None = None
10 |
11 | @dataclass
12 | class BrowserState:
13 | screenshot: str | None = None
14 |
--------------------------------------------------------------------------------
/tests/stubs/langchain_core/prompts/__init__.py:
--------------------------------------------------------------------------------
class ChatPromptTemplate:
    """Test stub prompt template: builds, pipes and invokes as no-ops."""

    def invoke(self, data):
        """Ignore ``data`` and return an empty string."""
        return ""

    def __or__(self, other):
        """Support ``template | other`` by returning this template unchanged."""
        return self

    @staticmethod
    def from_messages(msgs):
        """Ignore ``msgs`` and return a fresh template."""
        return ChatPromptTemplate()
9 |
class MessagesPlaceholder:
    """Test stub placeholder; the variable name is accepted but not stored."""

    def __init__(self, variable_name=""):
        """Accept ``variable_name`` and discard it."""
        return None
13 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """MCP server for browser-use."""
4 |
5 | from mcp_browser_use.mcp_browser_use import ( # noqa: F401
6 | AgentNotRegisteredError,
7 | create_client_session,
8 | )
9 | from mcp_browser_use.server import app, launch_mcp_browser_use_server
10 |
11 | __all__ = [
12 | "app",
13 | "launch_mcp_browser_use_server",
14 | "create_client_session",
15 | "AgentNotRegisteredError",
16 | ]
17 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # Check http://editorconfig.org for more information
2 | # This is the main config file for this project:
3 | root = true
4 |
5 | [*]
6 | charset = utf-8
7 | end_of_line = lf
8 | insert_final_newline = true
9 | indent_style = space
10 | indent_size = 2
11 | trim_trailing_whitespace = true
12 |
13 | [*.{py,pyi}]
14 | indent_style = space
15 | indent_size = 4
16 |
17 | [Makefile]
18 | indent_style = tab
19 |
20 | [*.md]
21 | trim_trailing_whitespace = false
--------------------------------------------------------------------------------
/tests/stubs/browser_use/controller/service.py:
--------------------------------------------------------------------------------
class _Registry:
    """Test stub action registry with no registered actions."""

    def action(self, *_args, **_kwargs):
        """Return a decorator that hands the decorated function back untouched."""

        def passthrough(func):
            return func

        return passthrough

    def create_action_model(self):
        """Create and return a new empty class named ``ActionModel``."""
        action_model = type("ActionModel", (), {})
        return action_model

    def get_prompt_description(self):
        """Return an empty description string."""
        return ""
13 |
14 |
class Controller:
    """Test stub controller that owns a stub registry and runs no actions."""

    def __init__(self):
        """Give each controller its own ``_Registry`` instance."""
        self.registry = _Registry()

    async def multi_act(self, actions, context):  # pragma: no cover - stub
        """Ignore ``actions`` and ``context``; resolve to a new empty list."""
        no_results = []
        return no_results
21 |
--------------------------------------------------------------------------------
/tests/stubs/langchain_openai/chat_models/__init__.py:
--------------------------------------------------------------------------------
class Base:
    """Empty placeholder class."""
3 |
class ChatOpenAI:
    """Test stub OpenAI chat client with placeholder attributes and no-op calls."""

    # Class-level placeholders shared by every instance.
    root_async_client = None
    model_name = "mock"

    def __init__(self, *args, **kwargs):
        """Accept any construction arguments and discard them."""
        return None

    async def ainvoke(self, *args, **kwargs):
        """Ignore all arguments and resolve to an empty dict."""
        return {}

    def with_structured_output(self, *args, **kwargs):
        """Ignore all arguments and hand back this same instance."""
        return self
14 |
15 |
class AzureChatOpenAI(ChatOpenAI):
    """Minimal stub mirroring the OpenAI chat client API."""

    def __init__(self, *args, **kwargs):
        """Forward every argument unchanged to ``ChatOpenAI.__init__``."""
        # The explicit override is kept so the subclass has its own ``__init__``.
        super().__init__(*args, **kwargs)
21 |
22 |
--------------------------------------------------------------------------------
/tests/test_agent_state.py:
--------------------------------------------------------------------------------
1 | from mcp_browser_use.utils.agent_state import AgentState
2 |
3 |
def test_agent_state_stop_flow():
    """The stop flag starts cleared, is set by a request and cleared by a reset."""
    agent_state = AgentState()

    # A fresh state has no pending stop request.
    assert agent_state.is_stop_requested() is False

    agent_state.request_stop()
    assert agent_state.is_stop_requested() is True

    agent_state.clear_stop()
    assert agent_state.is_stop_requested() is False
14 |
15 |
def test_agent_state_last_valid_state_reset():
    """``clear_stop()`` also discards the stored last valid state."""
    snapshot = {"url": "https://example.com"}
    agent_state = AgentState()

    agent_state.set_last_valid_state(snapshot)
    assert agent_state.get_last_valid_state() == snapshot

    # Clearing the stop flag is expected to wipe the snapshot as well.
    agent_state.clear_stop()
    assert agent_state.get_last_valid_state() is None
27 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/agent/views.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Any, List, Optional
3 |
4 | @dataclass
5 | class ActionResult:
6 | extracted_content: Optional[str] = None
7 | error: Optional[str] = None
8 | is_done: bool = False
9 | include_in_memory: bool = False
10 |
11 | @dataclass
12 | class AgentHistory:
13 | model_output: Any
14 | state: Any
15 | result: List[ActionResult]
16 |
17 | @dataclass
18 | class AgentHistoryList:
19 | history: List[AgentHistory] = field(default_factory=list)
20 | def is_done(self) -> bool:
21 | for h in self.history:
22 | for r in h.result:
23 | if r.is_done:
24 | return True
25 | return False
26 |
27 | @dataclass
28 | class AgentStepInfo:
29 | step_number: int = 0
30 |
31 | class AgentOutput:
32 | pass
33 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/utils/logging.py:
--------------------------------------------------------------------------------
1 | """Centralised logging configuration utilities for the MCP browser agent."""
2 |
3 | from __future__ import annotations
4 |
5 | import logging
6 | import os
7 | from typing import Optional
8 |
9 |
10 | _DEFAULT_FORMAT = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
11 |
12 |
def _resolve_level(level_name: Optional[str]) -> int:
    """Translate a level given as text into a numeric logging level.

    Args:
        level_name: A numeric string (``"10"``), a level name in any case
            (``"debug"``, ``"WARNING"``), or ``None``/empty.

    Returns:
        The integer level. ``logging.INFO`` is returned when the value is
        missing, blank, or not a level name known to ``logging``.

    Surrounding whitespace is ignored for both forms. ``int()`` already
    tolerates it for numeric strings, so stripping up front makes names such
    as ``" debug "`` resolve too instead of silently becoming INFO.
    """

    if not level_name:
        return logging.INFO

    candidate = level_name.strip()
    if not candidate:
        return logging.INFO

    try:
        return int(candidate)
    except ValueError:
        pass

    # For an unknown name getLevelName returns a string ("Level X"), not an int.
    resolved = logging.getLevelName(candidate.upper())
    return resolved if isinstance(resolved, int) else logging.INFO


def configure_logging() -> None:
    """Apply the ``LOG_LEVEL`` environment setting to the root logger.

    If the root logger already has handlers, only its level is updated.
    Otherwise ``logging.basicConfig`` is called with the module's default
    format and the resolved level.
    """

    resolved_level = _resolve_level(os.getenv("LOG_LEVEL"))
    root = logging.getLogger()

    if root.handlers:
        # Handlers already exist: adjust the level and leave them alone.
        root.setLevel(resolved_level)
        return

    logging.basicConfig(level=resolved_level, format=_DEFAULT_FORMAT)
36 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/utils/agent_state.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """
5 | If we plan to scale or have multiple agents, we might remove the singleton pattern or differentiate them by agent ID.
6 | """
7 |
8 | import asyncio
9 | from typing import Any, Optional
10 |
11 |
class AgentState:
    """
    Holds a stop flag (an ``asyncio.Event``) and the most recent valid state.

    ``request_stop()`` sets the event, ``is_stop_requested()`` reports whether
    it is set, and ``clear_stop()`` resets both the event and the stored state.
    """

    def __init__(self) -> None:
        """Start with the stop flag cleared and no stored state."""
        self._last_valid_state: Optional[Any] = None
        self._stop_requested = asyncio.Event()

    def is_stop_requested(self) -> bool:
        """Return True while a stop request is pending."""
        return self._stop_requested.is_set()

    def request_stop(self) -> None:
        """Raise the stop flag."""
        self._stop_requested.set()

    def clear_stop(self) -> None:
        """Lower the stop flag and forget the last valid state."""
        self._last_valid_state = None
        self._stop_requested.clear()

    def get_last_valid_state(self) -> Optional[Any]:
        """Return the stored state, or None if nothing is stored."""
        return self._last_valid_state

    def set_last_valid_state(self, state: Any) -> None:
        """Remember ``state`` as the last valid state."""
        self._last_valid_state = state
39 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "mcp_browser_use"
3 | version = "0.1.0"
4 | description = "A FastAPI server implementing the MCP server protocol for browser automation via the browser-use library."
5 | readme = "README.md"
6 | requires-python = ">=3.11"
7 | license = { text = "MIT" }
8 | classifiers = [
9 | "Development Status :: 4 - Beta",
10 | "Programming Language :: Python :: 3",
11 | "Programming Language :: Python :: 3.11",
12 | "Operating System :: OS Independent",
13 | ]
14 |
15 | dependencies = [
16 | "pydantic>=2.11.9",
17 | "uvicorn>=0.37.0",
18 | "browser-use>=0.7.9",
19 | "fastapi>=0.117.1",
20 | "fastmcp>=2.12.4",
21 | "instructor>=1.11.3",
22 | "langchain>=0.3.27",
23 | "langchain-google-genai>=2.1.1",
24 | "langchain-openai>=0.2.14",
25 | "langchain-anthropic>=0.3.20",
26 | "langchain-ollama>=0.2.2",
27 | "openai>=1.109.1",
28 | "pillow>=11.3.0",
29 | "python-dotenv>=1.1.1",
30 | "pyperclip>=1.11.0",
31 | ]
32 |
33 | [build-system]
34 | requires = ["hatchling"]
35 | build-backend = "hatchling.build"
36 |
37 | [tool.hatch.build.targets.wheel]
38 | packages = ["src/mcp_browser_use"]
39 |
40 | [project.scripts]
41 | mcp-browser-use = "mcp_browser_use.server:launch_mcp_browser_use_server"
42 |
--------------------------------------------------------------------------------
/tests/test_logging_configuration.py:
--------------------------------------------------------------------------------
1 | """Smoke tests around module imports and logging configuration."""
2 |
3 | from __future__ import annotations
4 |
5 | import importlib
6 | import logging
7 | import sys
8 | from typing import Iterable
9 |
10 | import pytest
11 |
12 |
# Dotted module names that the parametrized import test below imports one by one.
# NOTE(review): the repository tree lists ``agent/custom_massage_manager.py`` and
# another test patches a ``CustomMassageManager`` attribute, while the last entry
# here is spelled ``custom_message_manager`` — confirm which module name exists.
MODULES_TO_TEST: Iterable[str] = (
    "mcp_browser_use.controller.custom_controller",
    "mcp_browser_use.utils.utils",
    "mcp_browser_use.agent.custom_agent",
    "mcp_browser_use.agent.custom_message_manager",
)
19 |
20 |
@pytest.mark.parametrize("module_name", MODULES_TO_TEST)
def test_module_import_does_not_call_basic_config(module_name: str, monkeypatch) -> None:
    """Ensure importing project modules does not invoke ``logging.basicConfig``."""

    # Warm-up import so shared third-party dependencies are already cached,
    # then evict only the module under test so the next import re-executes it.
    importlib.import_module(module_name)
    sys.modules.pop(module_name, None)

    recorded_calls: list[tuple[tuple[object, ...], dict[str, object]]] = []

    # Swap basicConfig for a recorder; ``list.append`` returns None, as basicConfig does.
    monkeypatch.setattr(
        logging,
        "basicConfig",
        lambda *args, **kwargs: recorded_calls.append((args, kwargs)),
    )

    importlib.import_module(module_name)

    assert recorded_calls == [], f"Module {module_name} should not call logging.basicConfig during import"
39 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Generated by https://smithery.ai. See: https://smithery.ai/docs/config#dockerfile
# Use a Python image with uv pre-installed
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS uv

# Install the project into /app
WORKDIR /app

# Enable bytecode compilation
ENV UV_COMPILE_BYTECODE=1

# Copy from the cache instead of linking since it's a mounted volume
ENV UV_LINK_MODE=copy

# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --frozen --no-install-project --no-dev --no-editable

# Then, add the rest of the project source code and install it
# Installing separately from its dependencies allows optimal layer caching
ADD . /app
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen --no-dev --no-editable

# Runtime stage. Keep the Python minor version identical to the builder stage:
# the copied virtual environment is tied to the interpreter version that created
# it (interpreter link and lib/pythonX.Y/site-packages path), so a 3.12-built
# .venv does not work on a 3.13 base image.
FROM python:3.12-slim-bookworm

WORKDIR /app

COPY --from=uv /root/.local /root/.local
COPY --from=uv --chown=app:app /app/.venv /app/.venv

# Place executables in the environment at the front of the path
ENV PATH="/app/.venv/bin:$PATH"

# Start the server through the console script declared in pyproject.toml ([project.scripts])
ENTRYPOINT ["mcp-browser-use"]
38 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Test fixtures and environment setup for the test suite."""
2 |
3 | import importlib
4 | import os
5 | import sys
6 | import types
7 |
BASE_DIR = os.path.dirname(__file__)
STUBS_DIR = os.path.join(BASE_DIR, "stubs")
SRC_DIR = os.path.join(os.path.dirname(BASE_DIR), "src")

# Put the stub packages and the project sources at the front of the import
# path. Each insert goes to index 0, so SRC_DIR ends up ahead of STUBS_DIR.
for path in (STUBS_DIR, SRC_DIR):
    if path not in sys.path:
        sys.path.insert(0, path)

# Import langchain_openai now that the stub directory is on the path.
if "langchain_openai" not in sys.modules:
    importlib.import_module("langchain_openai")


def _install_stub_module(module_name: str, class_name: str) -> None:
    """Register an in-memory module exposing one permissive stub class.

    Does nothing when ``module_name`` is already in ``sys.modules``. Otherwise
    it creates a module holding a class named ``class_name`` whose constructor
    accepts and ignores any arguments, and registers it under ``module_name``.
    """
    if module_name in sys.modules:
        return

    def _accept_anything(self, *args, **kwargs):
        pass

    stub_class = type(class_name, (), {"__init__": _accept_anything})
    stub_module = types.ModuleType(module_name)
    setattr(stub_module, class_name, stub_class)
    sys.modules[module_name] = stub_module


# Provider packages that have no stub directory get a synthetic module.
for _module_name, _class_name in (
    ("langchain_anthropic", "ChatAnthropic"),
    ("langchain_google_genai", "ChatGoogleGenerativeAI"),
    ("langchain_ollama", "ChatOllama"),
):
    _install_stub_module(_module_name, _class_name)
48 |
49 |
--------------------------------------------------------------------------------
/tests/test_gif_creation.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import base64
4 | import io
5 |
6 | # Add stub package path before importing CustomAgent
BASE_DIR = os.path.dirname(__file__)
# Each insert goes to index 0, so the later "src" entry ends up ahead of "stubs".
for _extra_path in (
    os.path.join(BASE_DIR, "stubs"),
    os.path.join(os.path.dirname(BASE_DIR), "src"),
):
    sys.path.insert(0, _extra_path)
10 |
11 | from PIL import Image
12 |
13 | from mcp_browser_use.agent.custom_agent import CustomAgent
14 | from browser_use.agent.views import AgentHistoryList, AgentHistory, ActionResult
15 | from browser_use.browser.views import BrowserStateHistory
16 |
17 |
class DummyState:
    """Minimal model-output stand-in exposing ``current_state.thought``."""

    def __init__(self, thought: str):
        """Attach an instance of a throwaway ``Brain`` class carrying ``thought``."""
        brain_cls = type("Brain", (), {"thought": thought})
        self.current_state = brain_cls()
21 |
22 |
def create_screenshot() -> str:
    """Return a base64 string for a 100x100 white image saved with format PNG."""
    buffer = io.BytesIO()
    Image.new("RGB", (100, 100), color="white").save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue())
    return encoded.decode("utf-8")
28 |
29 |
def test_create_history_gif(tmp_path):
    """``create_history_gif`` writes a file at the requested output path."""
    screenshot = create_screenshot()

    # Two steps sharing one screenshot; only the second is marked done.
    steps = (("step one", False), ("step two", True))
    hist = AgentHistoryList(
        history=[
            AgentHistory(
                model_output=DummyState(thought),
                state=BrowserStateHistory(screenshot=screenshot),
                result=[ActionResult(is_done=done)],
            )
            for thought, done in steps
        ]
    )

    # Bypass __init__ and set only the attributes this test assigns.
    agent = CustomAgent.__new__(CustomAgent)
    agent.history = hist
    agent.task = "My Task"

    output_gif = tmp_path / "out.gif"
    agent.create_history_gif(output_path=str(output_gif))

    assert output_gif.exists()
55 |
--------------------------------------------------------------------------------
/tests/test_browser_manager.py:
--------------------------------------------------------------------------------
1 | """Tests for browser manager environment configuration helpers."""
2 |
3 | from __future__ import annotations
4 |
5 | import importlib
6 |
7 | import pytest
8 |
9 |
10 | browser_manager = importlib.import_module(
11 | "mcp_browser_use.browser.browser_manager"
12 | )
13 |
14 |
@pytest.fixture(autouse=True)
def clear_browser_env(monkeypatch):
    """Ensure browser-related environment variables do not leak between tests."""

    browser_env_names = (
        "BROWSER_USE_CDP_URL",
        "CHROME_DEBUGGING_HOST",
        "CHROME_DEBUGGING_PORT",
    )
    for env_name in browser_env_names:
        # raising=False: an unset variable is not an error.
        monkeypatch.delenv(env_name, raising=False)
25 |
26 |
def test_from_env_derives_cdp_url_from_debugging(monkeypatch):
    """When only debugging env vars are set, derive a CDP URL automatically."""

    debugging_env = {
        "CHROME_DEBUGGING_HOST": "debug.example",
        "CHROME_DEBUGGING_PORT": "1337",
    }
    for env_name, env_value in debugging_env.items():
        monkeypatch.setenv(env_name, env_value)

    env_config = browser_manager.BrowserEnvironmentConfig.from_env()

    assert env_config.cdp_url == "http://debug.example:1337"
36 |
37 |
def test_create_browser_session_preserves_computed_cdp_url(monkeypatch):
    """Computed CDP URL is passed to BrowserSession when overrides omit it."""

    for env_name, env_value in (
        ("CHROME_DEBUGGING_HOST", "localhost"),
        ("CHROME_DEBUGGING_PORT", "9000"),
    ):
        monkeypatch.setenv(env_name, env_value)

    captured_kwargs: dict[str, object] = {}

    class DummyBrowserSession:
        """Records the keyword arguments it is constructed with."""

        def __init__(self, **kwargs):
            captured_kwargs.update(kwargs)

    monkeypatch.setattr(browser_manager, "BrowserSession", DummyBrowserSession)

    created_session = browser_manager.create_browser_session()

    assert isinstance(created_session, DummyBrowserSession)
    assert captured_kwargs["cdp_url"] == "http://localhost:9000"
56 |
--------------------------------------------------------------------------------
/tests/stubs/PIL/__init__.py:
--------------------------------------------------------------------------------
class DummyImage:
    """In-memory image stand-in that tracks only width, height and mode."""

    def __init__(self, width=100, height=100):
        """Create an ``RGBA`` image of the given size (100x100 by default)."""
        self.width, self.height = width, height
        self.mode = "RGBA"

    @property
    def size(self):
        """Current ``(width, height)`` tuple."""
        return (self.width, self.height)

    def convert(self, mode):
        """Record ``mode`` and return this same image."""
        self.mode = mode
        return self

    def resize(self, size, resample=None):
        """Adopt the new ``(width, height)``; ``resample`` is ignored."""
        new_width, new_height = size
        self.width = new_width
        self.height = new_height
        return self

    def save(self, fp, *args, **kwargs):
        """Write the bytes ``b"dummy"`` to a file-like object or to a path."""
        if hasattr(fp, "write"):
            fp.write(b"dummy")
            return
        # No ``write`` attribute: treat ``fp`` as a path to open in binary mode.
        with open(fp, "wb") as out_file:
            out_file.write(b"dummy")

    def alpha_composite(self, other):
        """No-op."""
        return None

    def paste(self, img, pos, mask=None):
        """No-op."""
        return None
31 |
32 |
class Image:
    """Stub namespace exposing ``open``, ``new``, ``Resampling`` and ``Image``."""

    @staticmethod
    def new(mode, size, color=(0, 0, 0, 0)):
        """Return a ``DummyImage`` of ``size``; ``mode`` and ``color`` are ignored."""
        width, height = size
        return DummyImage(width, height)

    @staticmethod
    def open(fp):
        """Return a default-sized ``DummyImage``; ``fp`` is never read."""
        return DummyImage()

    # Attribute aliases referenced as ``Image.Resampling`` and ``Image.Image``.
    Resampling = type("Resampling", (), {"LANCZOS": 0})
    Image = DummyImage
44 |
45 |
class ImageDraw:
    """Stub namespace whose ``Draw`` object draws nothing and fakes text metrics."""

    class Draw:
        """Drawing stand-in; text is measured at 10 units per character."""

        def __init__(self, img):
            """Accept the target image and ignore it."""
            return None

        def rectangle(self, *args, **kwargs):
            """No-op."""
            return None

        def text(self, *args, **kwargs):
            """No-op."""
            return None

        def textlength(self, text, font=None):
            """Return the fake width: 10 units per character of ``text``."""
            return len(text) * 10

        def textbbox(self, xy, text, font=None):
            """Return ``(left, top, right, bottom)`` for a 10-unit-high text box."""
            right_edge = len(text) * 10
            return (0, 0, right_edge, 10)

    # Alias so ``ImageDraw.ImageDraw`` resolves to the same class as ``Draw``.
    ImageDraw = Draw
65 |
66 |
class ImageFont:
    """Stub namespace whose loaders always return an empty ``FreeTypeFont``."""

    class FreeTypeFont:
        """Empty font placeholder."""

    @staticmethod
    def load_default():
        """Return a new ``FreeTypeFont``."""
        return ImageFont.FreeTypeFont()

    @staticmethod
    def truetype(font, size):
        """Return a new ``FreeTypeFont``; ``font`` and ``size`` are ignored."""
        return ImageFont.FreeTypeFont()
78 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # IPython
78 | profile_default/
79 | ipython_config.py
80 |
81 | # pyenv
82 | .python-version
83 |
84 | # celery beat schedule file
85 | celerybeat-schedule
86 |
87 | # SageMath parsed files
88 | *.sage.py
89 |
90 | # Environments
91 | .env
92 | .venv
93 | env/
94 | venv/
95 | ENV/
96 | env.bak/
97 | venv.bak/
98 |
99 | # Spyder project settings
100 | .spyderproject
101 | .spyproject
102 |
103 | # Rope project settings
104 | .ropeproject
105 |
106 | # mkdocs documentation
107 | /site
108 |
109 | # mypy
110 | .mypy_cache/
111 | .dmypy.json
112 | dmypy.json
113 |
114 | # Pyre type checker
115 | .pyre/
116 |
117 | # ignore the database
118 | *.db
119 |
120 | # ignore vscode settings
121 | .vscode/
122 |
123 | # Project Files
124 | /*.json
125 | target/
126 | dbt_packages/
127 | dbt_packages/*
128 | logs/
129 | /secrets/*
130 | #mac pc specific - system configuration files
131 | .DS_Store
132 |
--------------------------------------------------------------------------------
/tests/test_custom_agent_controller.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | BASE_DIR = os.path.dirname(__file__)
5 | sys.path.insert(0, os.path.join(BASE_DIR, "stubs"))
6 | sys.path.insert(0, os.path.join(os.path.dirname(BASE_DIR), "src"))
7 |
8 | import pytest
9 | from langchain_core.language_models.chat_models import BaseChatModel
10 | from unittest.mock import Mock
11 |
12 | import mcp_browser_use.agent.custom_agent as custom_agent_module
13 |
14 |
@pytest.fixture
def custom_agent(monkeypatch):
    """Return the ``custom_agent`` module with its collaborators stubbed out.

    ``CustomMassageManager`` is replaced by an inert class and
    ``Agent.__init__`` by a lightweight initializer, so ``CustomAgent`` can be
    constructed in tests without running the real base-class setup.
    """

    class _InertMessageManager:
        """Stand-in that accepts and ignores every constructor argument."""

        def __init__(self, *_args, **_kwargs):
            pass

    def _stub_agent_init(self, *_args, **kwargs):
        # Mirror each keyword argument onto the instance first ...
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)
        # ... then set the attributes that are needed but not passed in
        # kwargs (these deliberately win over same-named kwargs).
        self.n_steps = 0
        self._last_result = None
        self.message_manager = None
        self.history = None
        self.generate_gif = False

    monkeypatch.setattr(
        custom_agent_module, "CustomMassageManager", _InertMessageManager
    )
    monkeypatch.setattr(custom_agent_module.Agent, "__init__", _stub_agent_init)

    return custom_agent_module
40 |
41 |
def test_custom_agent_creates_independent_default_controllers(
    custom_agent, monkeypatch
):
    """Agents built without a controller must each get their own instance."""
    created = []

    class RecordingController(custom_agent.Controller):
        """Controller subclass that records every instance it creates."""

        def __init__(self):
            super().__init__()
            created.append(self)

    monkeypatch.setattr(custom_agent, "Controller", RecordingController)

    shared_llm = Mock(spec=BaseChatModel)
    # The generator is consumed left to right, so "Task one" is built first.
    first_agent, second_agent = (
        custom_agent.CustomAgent(task=label, llm=shared_llm)
        for label in ("Task one", "Task two")
    )

    assert first_agent.controller is not second_agent.controller
    assert created == [first_agent.controller, second_agent.controller]
60 |
61 |
def test_custom_agent_uses_supplied_controller(custom_agent):
    """A controller passed by the caller must be kept as-is by the agent."""
    supplied = custom_agent.Controller()

    agent = custom_agent.CustomAgent(
        task="Task with supplied controller",
        llm=Mock(spec=BaseChatModel),
        controller=supplied,
    )

    assert agent.controller is supplied
73 |
--------------------------------------------------------------------------------
/sample.env.env:
--------------------------------------------------------------------------------
1 | # ---------------------------
2 | # API Keys (Replace as needed)
3 | # ---------------------------
4 | OPENAI_API_KEY=your_openai_api_key_here
5 | ANTHROPIC_API_KEY=your_anthropic_api_key_here
6 | GOOGLE_API_KEY=your_google_api_key_here
7 | AZURE_OPENAI_API_KEY=your_azure_api_key_here
8 | DEEPSEEK_API_KEY=your_deepseek_api_key_here
9 |
10 | # ----------------------------------
11 | # Model Provider & Endpoint Settings
12 | # ----------------------------------
13 | # Typical endpoints; change to match your usage.
14 | OPENAI_ENDPOINT=https://api.openai.com/v1
15 | ANTHROPIC_API_ENDPOINT=https://api.anthropic.com
16 | AZURE_OPENAI_ENDPOINT=https://your-azure-openai-endpoint
17 | DEEPSEEK_ENDPOINT=https://api.deepseek.com
18 |
19 | # ---------------------------
20 | # Model & Agent Configuration
21 | # ---------------------------
22 | # Choose one provider: "openai", "anthropic", "azure_openai", "deepseek", "gemini", "ollama".
23 | MCP_MODEL_PROVIDER=anthropic
24 | MCP_MODEL_NAME=claude-3-5-sonnet-20241022
25 | MCP_TEMPERATURE=0.3
26 | MCP_MAX_STEPS=30
27 | MCP_MAX_ACTIONS_PER_STEP=5
28 | MCP_USE_VISION=true
29 | MCP_TOOL_CALL_IN_CONTENT=true
30 |
31 | # ---------------------------------
32 | # Chrome / Playwright Configuration
33 | # ---------------------------------
34 | # If CHROME_PATH is set, the code will attempt to launch a locally installed Chrome
35 | # with remote debugging on port 9222.
36 | # If left empty, it will launch a standard Chromium instance via Playwright.
37 |
38 | CHROME_PATH=/path/to/your/chrome/binary
39 | CHROME_USER_DATA=/path/to/your/chrome-profile
40 | CHROME_DEBUGGING_PORT=9222
41 | CHROME_DEBUGGING_HOST=localhost
42 | CHROME_PERSISTENT_SESSION=false
43 |
44 | # You can add extra flags in your code if needed:
45 | # Example: export CHROME_EXTRA_ARGS="--some-chrome-flag"
46 |
47 | # --------------
48 | # Other Settings
49 | # --------------
50 | # Adjust HEADLESS or DISABLE_SECURITY if your code checks them.
51 | # By default, you might keep them out or set them in the code itself.
52 |
53 | # HEADLESS=false
54 | # DISABLE_SECURITY=false
55 |
56 | # -------------
57 | # Example Usage
58 | # -------------
59 | # Copy this file to .env, then load it so the variables are exported:
60 | # set -a; source .env; set +a
61 | # or use a library like python-dotenv or uv to manage environment variables.
62 |
63 | # Note: In production or multi-user environments, never commit real API keys
64 | # or share them publicly. Instead use a secrets manager or encrypted storage.
65 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/client.py:
--------------------------------------------------------------------------------
1 | """Client helpers for interacting with the in-process FastMCP server."""
2 |
3 | from __future__ import annotations
4 |
5 | from contextlib import asynccontextmanager
6 | from typing import Any, AsyncIterator, Callable, Optional
7 |
8 | from fastmcp.client import Client
9 |
10 | from .server import app
11 |
12 |
class AgentNotRegisteredError(RuntimeError):
    """Error raised when attempting to control an agent that is not running.

    Subclasses :class:`RuntimeError`, so existing ``except RuntimeError``
    handlers also catch it.
    """
15 |
16 |
@asynccontextmanager
async def create_client_session(
    client: Optional[Client] = None,
    *,
    client_factory: Optional[Callable[[], Client]] = None,
    **client_kwargs: Any,
) -> AsyncIterator[Client]:
    """Open a client session against the in-process server.

    Exactly one source for the client is used, in this order of preference:
    the explicit ``client``, the result of ``client_factory()``, or a new
    ``Client(app, **client_kwargs)``.

    Parameters
    ----------
    client:
        A ready-made :class:`fastmcp.client.Client`. The caller owns its
        configuration, so ``client_kwargs`` may not be passed alongside it.
    client_factory:
        Zero-argument callable that builds the client on demand (handy for
        injecting a stub in tests). ``client_kwargs`` may not be passed
        alongside it.
    **client_kwargs:
        Keyword arguments forwarded to :class:`fastmcp.client.Client` when the
        default client is constructed.

    Yields
    ------
    Client
        The object produced by entering the client's async context.

    Raises
    ------
    ValueError
        If ``client`` and ``client_factory`` are both given, or if
        ``client_kwargs`` is combined with either of them.
    """
    has_client = client is not None
    has_factory = client_factory is not None

    # Reject contradictory combinations before anything is constructed.
    if has_client and has_factory:
        raise ValueError("Provide either 'client' or 'client_factory', not both.")

    if client_kwargs:
        if has_client:
            raise ValueError(
                "'client_kwargs' cannot be used when an explicit client instance is provided."
            )
        if has_factory:
            raise ValueError(
                "'client_kwargs' cannot be combined with 'client_factory'."
            )

    if has_client:
        session_client = client
    elif has_factory:
        session_client = client_factory()
    else:
        session_client = Client(app, **client_kwargs)

    # Entering/exiting the client is delegated to its own async context.
    async with session_client as connected_client:
        yield connected_client
67 |
--------------------------------------------------------------------------------
/tests/stubs/browser_use/__init__.py:
--------------------------------------------------------------------------------
1 | class _DummyEvent:
2 | def __await__(self):
3 | async def _noop():
4 | return None
5 |
6 | return _noop().__await__()
7 |
8 | async def event_result(self, *args, **kwargs): # pragma: no cover - stub method
9 | return None
10 |
11 |
class _DummyEventBus:
    """Event bus stub: every dispatch ignores the event and returns a new ``_DummyEvent``."""

    def dispatch(self, event):  # noqa: D401 - simple stub
        placeholder = _DummyEvent()
        return placeholder
15 |
16 |
class BrowserPage:
    """Page stub that stores its keyword arguments and owns a dummy event bus."""

    def __init__(self, **kwargs):
        # Keyword arguments become plain instance attributes.
        vars(self).update(kwargs)
        # Assigned last so it replaces any ``event_bus`` keyword argument.
        self.event_bus = _DummyEventBus()

    async def close(self) -> None:  # pragma: no cover - stub method
        return None
25 |
26 |
class Browser:
    """Lightweight stub mirroring the public Browser API used in tests."""

    def __init__(self, **kwargs):
        # Keyword arguments become plain instance attributes.
        vars(self).update(kwargs)
        # Internal bookkeeping is set last so it replaces same-named kwargs.
        self._pages: list[BrowserPage] = []
        self._started = False

    async def start(self):  # pragma: no cover - stub method
        self._started = True
        return self

    async def stop(self):  # pragma: no cover - stub method
        self._started = False
        return None

    async def new_page(self, **kwargs):
        # Every created page is remembered on the browser.
        created_page = BrowserPage(**kwargs)
        self._pages.append(created_page)
        return created_page

    async def close(self):  # pragma: no cover - compatibility alias
        outcome = await self.stop()
        return outcome
51 |
52 |
class BrowserSession(Browser):  # pragma: no cover - stub class
    """Browser stub that additionally exposes ``kill`` as an alias for ``stop``."""

    async def kill(self):  # pragma: no cover - stub method
        outcome = await self.stop()
        return outcome
56 |
57 |
class BrowserProfile:  # pragma: no cover - stub class
    """Profile stub that stores every keyword argument as an attribute.

    This is the single definition of the class. An earlier duplicate
    declaration (with an ``event_bus`` attribute and a ``kill`` method) was
    shadowed by this one and therefore never reachable, so it was removed;
    runtime behavior is unchanged.
    """

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)
72 |
73 |
class ProxySettings:  # pragma: no cover - stub class
    """Proxy settings stub that stores every keyword argument as an attribute."""

    def __init__(self, **kwargs):
        vars(self).update(kwargs)
78 |
79 |
# Alias maintained for compatibility with production package.
# NOTE: this rebinds the module-level name ``Browser`` to ``BrowserSession``.
# ``BrowserSession`` was already created from the original ``Browser`` class,
# so its base class is unaffected by the rebinding.
Browser = BrowserSession
82 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/controller/custom_controller.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import logging
4 | import sys
5 |
6 | import pyperclip
7 | from browser_use import BrowserSession
8 | from browser_use.agent.views import ActionResult
9 | from browser_use.controller.service import Controller
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
class CustomController(Controller):
    """
    A custom controller registering two clipboard actions: copy and paste.

    Both actions are registered on ``self.registry`` when the controller is
    constructed. They report failures through ``ActionResult.error`` instead
    of raising.
    """

    def __init__(self) -> None:
        # Initialize the base controller before registering actions;
        # presumably this is what provides ``self.registry`` (confirm against
        # the base class).
        super().__init__()
        self._register_custom_actions()

    def _register_custom_actions(self) -> None:
        """Register all custom browser actions for this controller."""

        @self.registry.action("Copy text to clipboard")
        def copy_to_clipboard(text: str) -> ActionResult:
            """
            Copy the given text to the system's clipboard.
            Returns an ActionResult with the same text as extracted_content.
            """
            try:
                pyperclip.copy(text)
                # Be cautious about logging the actual text, if sensitive
                logger.debug("Copied text to clipboard.")
                return ActionResult(extracted_content=text)
            except Exception as e:
                # Any clipboard failure is logged and returned as an error result.
                logger.error(f"Error copying text to clipboard: {e}")
                return ActionResult(error=str(e), extracted_content=None)

        @self.registry.action("Paste text from clipboard", requires_browser=True)
        async def paste_from_clipboard(browser_session: BrowserSession) -> ActionResult:
            """
            Paste whatever is currently in the system's clipboard
            into the active browser page by using the send_keys tool.
            """
            # Step 1: read the clipboard. The text is only used for the
            # returned result; the browser receives a paste shortcut.
            try:
                text = pyperclip.paste()
            except Exception as e:
                logger.error(f"Error reading text from clipboard: {e}")
                return ActionResult(error=str(e), extracted_content=None)

            # Step 2: trigger the platform paste shortcut in the browser.
            try:
                # macOS uses the Command (meta) key; other platforms use Ctrl.
                modifier = "meta" if sys.platform == "darwin" else "ctrl"
                # Use the documented tool via the registry
                await self.registry.execute_action(
                    "send_keys",
                    {"keys": f"{modifier}+v"},
                    browser_session=browser_session,
                )
                logger.debug("Triggered paste shortcut inside the browser session.")
                return ActionResult(extracted_content=text)
            except Exception as e:
                logger.error(f"Error pasting text into the browser session: {e}")
                return ActionResult(error=str(e), extracted_content=None)
66 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/agent/custom_views.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from dataclasses import dataclass
4 | from typing import List, Type
5 |
6 | from browser_use.agent.views import AgentOutput
7 | from browser_use.controller.registry.views import ActionModel
8 | from pydantic import BaseModel, ConfigDict, Field, create_model
9 |
10 |
@dataclass
class CustomAgentStepInfo:
    """
    Holds metadata about a single step of the agent's execution.

    All fields are required and have no defaults.

    :param step_number: Which step number we're currently on.
    :param max_steps: Total maximum steps before we stop.
    :param task: The primary task assigned to the agent.
    :param add_infos: Additional contextual info or instructions.
    :param memory: Cumulative memory or context from previous steps.
    :param task_progress: Text describing progress toward the task goal.
    """

    step_number: int
    max_steps: int
    task: str
    add_infos: str
    memory: str
    task_progress: str
30 |
31 |
class CustomAgentBrain(BaseModel):
    """
    Represents the agent's 'thinking' or ephemeral state during processing.

    All fields are required strings.

    :param prev_action_evaluation: String evaluation of the last action performed (success/failure).
    :param important_contents: Key points or memory extracted from the environment.
    :param completed_contents: Completed portion of the task so far.
    :param thought: Agent's internal reasoning or thought process text.
    :param summary: Short summary of the agent's current state or progress.
    """

    prev_action_evaluation: str
    important_contents: str
    completed_contents: str
    thought: str
    summary: str
48 |
49 |
class CustomAgentOutput(AgentOutput):
    """
    Agent output model: the agent's current state plus the actions to run.

    ``type_with_custom_actions`` derives a specialised subclass at runtime
    whose ``action`` field is typed with the controller's action model.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    current_state: CustomAgentBrain
    action: List[ActionModel]

    @staticmethod
    def type_with_custom_actions(
        custom_actions: Type[ActionModel],
    ) -> Type["CustomAgentOutput"]:
        """
        Build a subclass of CustomAgentOutput whose 'action' field is a
        required list of the given custom action model.

        :param custom_actions: The action model type from the controller registry.
        :return: A new Pydantic model class based on CustomAgentOutput.
        """
        # Required field (``...``) holding a list of the custom action type.
        action_field = (List[custom_actions], Field(...))
        specialised_model = create_model(
            # Could rename to something more specific if needed
            "AgentOutput",
            __base__=CustomAgentOutput,
            __module__=CustomAgentOutput.__module__,
            action=action_field,
        )
        return specialised_model
80 |
--------------------------------------------------------------------------------
/smithery.yaml:
--------------------------------------------------------------------------------
1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml
2 |
3 | startCommand:
4 | type: stdio
5 | configSchema:
6 | # JSON Schema defining the configuration options for the MCP.
7 | type: object
8 | required:
9 | - openaiApiKey
10 | - anthropicApiKey
11 | - mcpModelProvider
12 | - mcpModelName
13 | properties:
14 | openaiApiKey:
15 | type: string
16 | description: API key for OpenAI services.
17 | anthropicApiKey:
18 | type: string
19 | description: API key for Anthropic services.
20 | googleApiKey:
21 | type: string
22 | description: API key for Google services (optional).
23 | azureOpenaiEndpoint:
24 | type: string
25 | description: Azure OpenAI endpoint (optional).
26 | azureOpenaiApiKey:
27 | type: string
28 | description: Azure OpenAI API key (optional).
29 | chromePath:
30 | type: string
31 | description: Path to Chrome executable (optional).
32 | chromeUserData:
33 | type: string
34 | description: Path to Chrome user data directory (optional).
35 | chromeDebuggingPort:
36 | type: string
37 | default: "9222"
38 | description: Chrome debugging port. Default is 9222.
39 | chromeDebuggingHost:
40 | type: string
41 | default: localhost
42 | description: Chrome debugging host. Default is localhost.
43 | chromePersistentSession:
44 | type: boolean
45 | default: false
46 | description: Keep browser open between tasks.
47 | mcpModelProvider:
48 | type: string
49 | description: Model provider (e.g., anthropic, openai).
50 | mcpModelName:
51 | type: string
52 | description: Model name.
53 | mcpTemperature:
54 | type: number
55 | default: 0.3
56 | description: Model temperature.
57 | mcpMaxSteps:
58 | type: number
59 | default: 30
60 | description: Max steps for model.
61 | mcpUseVision:
62 | type: boolean
63 | default: true
64 | description: Use vision capabilities.
65 | mcpMaxActionsPerStep:
66 | type: number
67 | default: 5
68 | description: Max actions per step.
69 | commandFunction:
70 | # A function that produces the CLI command to start the MCP on stdio.
71 | |-
72 | (config) => ({ command: 'uv', args: ['run', 'mcp-browser-use'], env: { OPENAI_API_KEY: config.openaiApiKey, ANTHROPIC_API_KEY: config.anthropicApiKey, GOOGLE_API_KEY: config.googleApiKey, AZURE_OPENAI_ENDPOINT: config.azureOpenaiEndpoint, AZURE_OPENAI_API_KEY: config.azureOpenaiApiKey, CHROME_PATH: config.chromePath, CHROME_USER_DATA: config.chromeUserData, CHROME_DEBUGGING_PORT: config.chromeDebuggingPort || '9222', CHROME_DEBUGGING_HOST: config.chromeDebuggingHost || 'localhost', CHROME_PERSISTENT_SESSION: config.chromePersistentSession, MCP_MODEL_PROVIDER: config.mcpModelProvider, MCP_MODEL_NAME: config.mcpModelName, MCP_TEMPERATURE: config.mcpTemperature || 0.3, MCP_MAX_STEPS: config.mcpMaxSteps || 30, MCP_USE_VISION: config.mcpUseVision, MCP_MAX_ACTIONS_PER_STEP: config.mcpMaxActionsPerStep || 5 } })
73 |
--------------------------------------------------------------------------------
/tests/test_summarize_messages.py:
--------------------------------------------------------------------------------
1 | from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
2 |
3 | import mcp_browser_use.agent.custom_agent as custom_agent_module
4 | from mcp_browser_use.agent.custom_agent import CustomAgent
5 | from browser_use.agent.message_manager.views import MessageHistory, ManagedMessage
6 |
7 |
class FakeLLM:
    """LLM double that records each input and replies with fixed content."""

    def __init__(self, content: str = "Conversation summary"):
        self._content = content
        self.calls = []

    def invoke(self, input, **kwargs):
        # Remember what was asked, then answer with the canned content.
        self.calls.append(input)
        return AIMessage(content=self._content)

    def __call__(self, input, **kwargs):
        # Calling the object behaves exactly like ``invoke``.
        return self.invoke(input, **kwargs)
20 |
21 |
class DummyMessageManager:
    """Message-manager double with a seeded, resettable history.

    A reset leaves exactly two messages (system prompt and example tool
    call); construction then appends ``extra_messages`` human messages.
    ``reset_calls`` counts resets, including the one done at construction.
    """

    def __init__(self, extra_messages: int = 6):
        self.system_prompt = SystemMessage(content="System instructions")
        seeded_call = AIMessage(content="[]")
        seeded_call.tool_calls = []
        self.example_tool_call = seeded_call
        self.reset_calls = 0
        self.history = MessageHistory()
        self.reset_history()
        for position in range(extra_messages):
            self._add_message_with_tokens(
                HumanMessage(content=f"User message {position}")
            )

    def get_messages(self):
        unwrapped = []
        for managed in self.history.messages:
            unwrapped.append(managed.message)
        return unwrapped

    def reset_history(self) -> None:
        self.reset_calls += 1
        fresh_history = MessageHistory()
        fresh_history.messages = []
        # The token counter is optional on the history type in use.
        if hasattr(fresh_history, "total_tokens"):
            fresh_history.total_tokens = 0
        self.history = fresh_history
        for seed in (self.system_prompt, self.example_tool_call):
            self._add_message_with_tokens(seed)

    def _add_message_with_tokens(self, message):
        history = self.history
        history.messages.append(ManagedMessage(message=message))
        # Each message counts as one token in this double.
        if hasattr(history, "total_tokens"):
            history.total_tokens += 1
50 |
51 |
def test_summarize_messages_preserves_system_prompt(monkeypatch):
    """Summarizing must reset the history and keep the two seeded messages."""

    class _Chain:
        """Result of ``prompt | llm``; forwards ``invoke`` to the LLM."""

        def __init__(self, llm):
            self.llm = llm

        def invoke(self, data):
            return self.llm.invoke(data)

    class _Prompt:
        def __or__(self, llm):
            return _Chain(llm)

    class _PromptTemplate:
        @staticmethod
        def from_messages(messages):
            return _Prompt()

    monkeypatch.setattr(custom_agent_module, "ChatPromptTemplate", _PromptTemplate)

    # Bypass CustomAgent.__init__; only the attributes under test are set.
    agent = CustomAgent.__new__(CustomAgent)
    agent.llm = FakeLLM()
    agent.message_manager = DummyMessageManager()

    assert len(agent.message_manager.get_messages()) > 5
    # Ensure the initial reset was performed
    assert agent.message_manager.reset_calls == 1

    outcome = agent.summarize_messages()

    assert outcome is True
    manager = agent.message_manager
    assert manager.reset_calls == 2

    remaining = manager.history.messages
    assert len(remaining) == 3
    seeded = [entry.message for entry in remaining[:2]]
    assert seeded == [manager.system_prompt, manager.example_tool_call]
    assert remaining[2].message.content == "Conversation summary"
    if hasattr(manager.history, "total_tokens"):
        assert manager.history.total_tokens == len(remaining)

    # Ensure the LLM was called with the conversation
    recorded_calls = agent.llm.calls
    assert len(recorded_calls) == 1
    prompt_value = recorded_calls[0]
    assert isinstance(prompt_value, dict)
    assert "chat_history" in prompt_value
103 |
--------------------------------------------------------------------------------
/tests/test_client_session.py:
--------------------------------------------------------------------------------
1 | import importlib
2 |
3 | import pytest
4 |
5 | from mcp_browser_use import client as client_module
6 | from mcp_browser_use.client import AgentNotRegisteredError, create_client_session
7 |
8 |
@pytest.fixture
def anyio_backend():
    """Return ``"asyncio"`` as the backend name for the anyio-marked tests."""
    return "asyncio"
12 |
13 |
@pytest.mark.anyio("asyncio")
async def test_create_client_session_uses_supplied_client():
    """A supplied client is entered, yielded unchanged, and exited."""
    lifecycle = []

    class RecordingClient:
        def __init__(self):
            self.connected = False

        async def __aenter__(self):
            lifecycle.append("enter")
            self.connected = True
            return self

        async def __aexit__(self, exc_type, exc, tb):
            lifecycle.append("exit")
            self.connected = False

    supplied = RecordingClient()
    async with create_client_session(client=supplied) as session:
        assert session is supplied
        assert supplied.connected

    assert supplied.connected is False
    assert lifecycle == ["enter", "exit"]
38 |
39 |
@pytest.mark.anyio("asyncio")
async def test_create_client_session_accepts_factory():
    """The factory is called to build the client, which is entered and exited."""
    lifecycle = []

    class FactoryBuiltClient:
        async def __aenter__(self):
            lifecycle.append("enter")
            return self

        async def __aexit__(self, exc_type, exc, tb):
            lifecycle.append("exit")

    async with create_client_session(client_factory=FactoryBuiltClient) as session:
        assert isinstance(session, FactoryBuiltClient)

    assert lifecycle == ["enter", "exit"]
56 |
57 |
@pytest.mark.anyio("asyncio")
async def test_create_client_session_rejects_mixed_arguments():
    """Each contradictory argument combination raises ``ValueError``."""

    class DummyClient:
        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            pass

    dummy = DummyClient()

    # Same three combinations, checked in the same order as before.
    conflicting_arguments = (
        {"client": dummy, "timeout": 5},
        {"client_factory": DummyClient, "timeout": 5},
        {"client": dummy, "client_factory": DummyClient},
    )
    for arguments in conflicting_arguments:
        with pytest.raises(ValueError):
            async with create_client_session(**arguments):
                pass
80 |
81 |
@pytest.mark.anyio("asyncio")
async def test_create_client_session_constructs_default_client(monkeypatch):
    """Without client or factory, ``Client(app, **kwargs)`` is built and used."""
    observed = {}

    class DummyClient:
        def __init__(self, app, **kwargs):
            observed.update(app=app, kwargs=kwargs)

        async def __aenter__(self):
            observed["entered"] = True
            return self

        async def __aexit__(self, exc_type, exc, tb):
            observed["exited"] = True

    monkeypatch.setattr("mcp_browser_use.client.Client", DummyClient)

    async with create_client_session(timeout=5) as session:
        assert isinstance(session, DummyClient)

    assert observed["app"] is client_module.app
    assert observed["kwargs"] == {"timeout": 5}
    assert observed["entered"] is True
    assert observed["exited"] is True
107 |
108 |
@pytest.mark.anyio("asyncio")
async def test_create_client_session_kwargs_with_factory_raise():
    """Passing client kwargs together with a factory raises ``ValueError``."""

    class DummyClient:
        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            pass

    with pytest.raises(ValueError):
        async with create_client_session(client_factory=DummyClient, timeout=10):
            pass
123 |
124 |
@pytest.mark.parametrize(
    "legacy_module",
    [
        "mcp_browser",
        "mcp_browser.use",
        "mcp_browser.use.mcp_browser_use",
    ],
)
def test_legacy_namespace_is_removed(legacy_module):
    """Importing any of the listed legacy module paths must fail."""
    with pytest.raises(ModuleNotFoundError):
        importlib.import_module(legacy_module)
136 |
137 |
def test_exception_type():
    """``AgentNotRegisteredError`` must remain a ``RuntimeError`` subclass."""
    assert issubclass(AgentNotRegisteredError, RuntimeError)
140 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/agent/custom_massage_manager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import annotations
4 |
5 | import copy
6 | import logging
7 | from typing import List, Optional, Type
8 |
9 | from browser_use.agent.message_manager.service import MessageManager
10 | from browser_use.agent.message_manager.views import MessageHistory
11 | from browser_use.agent.prompts import SystemPrompt
12 | from browser_use.agent.views import ActionResult, AgentStepInfo
13 | from browser_use.browser.views import BrowserState
14 | from langchain_core.language_models import BaseChatModel
15 | from langchain_core.messages import HumanMessage, AIMessage
16 |
17 | from mcp_browser_use.agent.custom_prompts import CustomAgentMessagePrompt
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 |
class CustomMassageManager(MessageManager):
    """Message manager that seeds the history with a custom example tool call.

    After the base class is initialized, the history is rebuilt so that it
    contains the system prompt followed by an example ``CustomAgentOutput``
    tool call. ``reset_history`` restores that seeded state on demand.
    """

    def __init__(
        self,
        llm: BaseChatModel,
        task: str,
        action_descriptions: str,
        system_prompt_class: Type[SystemPrompt],
        max_input_tokens: int = 128000,
        estimated_tokens_per_character: int = 3,
        image_tokens: int = 800,
        include_attributes: Optional[list[str]] = None,
        max_error_length: int = 400,
        max_actions_per_step: int = 10,
        tool_call_in_content: bool = False,
    ) -> None:
        """Initialize the base manager and seed the message history.

        :param include_attributes: Attribute names forwarded to the base
            manager and to the state prompt. ``None`` (the default) means an
            empty list; a new list is created per instance.
        :param tool_call_in_content: When true, the example tool call is
            rendered into the message content instead of ``tool_calls``.

        The remaining parameters are forwarded unchanged to ``MessageManager``.
        """
        # A literal ``[]`` default would be one list shared by every instance
        # (and handed to the base class), so build a fresh one per call.
        if include_attributes is None:
            include_attributes = []

        super().__init__(
            llm=llm,
            task=task,
            action_descriptions=action_descriptions,
            system_prompt_class=system_prompt_class,
            max_input_tokens=max_input_tokens,
            estimated_tokens_per_character=estimated_tokens_per_character,
            image_tokens=image_tokens,
            include_attributes=include_attributes,
            max_error_length=max_error_length,
            max_actions_per_step=max_actions_per_step,
            tool_call_in_content=tool_call_in_content,
        )

        # Store template for example tool call so we can rebuild the history when needed
        self.tool_call_in_content = tool_call_in_content
        self._example_tool_call_template = [
            {
                "name": "CustomAgentOutput",
                "args": {
                    "current_state": {
                        "prev_action_evaluation": "Unknown - No previous actions to evaluate.",
                        "important_contents": "",
                        "completed_contents": "",
                        "thought": "Now Google is open. Need to type OpenAI to search.",
                        "summary": "Type OpenAI to search.",
                    },
                    "action": [],
                },
                "id": "",
                "type": "tool_call",
            }
        ]
        # Replace whatever history the base class built with the seeded one.
        self.reset_history()

    def _create_example_tool_call_message(self) -> AIMessage:
        """Build a fresh example tool-call message from the stored template."""
        # Deep copy so the stored template is never shared with a message.
        tool_calls = copy.deepcopy(self._example_tool_call_template)
        if self.tool_call_in_content:
            # openai throws error if tool_calls are not responded -> move to content
            return AIMessage(
                content=f"{tool_calls}",
                tool_calls=[],
            )
        return AIMessage(
            content="",
            tool_calls=tool_calls,
        )

    def reset_history(self) -> None:
        """Reset the message history to the initial seeded state."""

        self.history = MessageHistory()
        # The token counter is optional on the history type in use.
        if hasattr(self.history, "total_tokens"):
            self.history.total_tokens = 0

        self._add_message_with_tokens(self.system_prompt)
        self._add_message_with_tokens(self._create_example_tool_call_message())

    def add_state_message(
        self,
        state: BrowserState,
        result: Optional[List[ActionResult]] = None,
        step_info: Optional[AgentStepInfo] = None,
    ) -> None:
        """Add the browser state (and possibly action results) as a human message.

        Results flagged ``include_in_memory`` are added to the history as
        their own messages. As soon as one such result is seen, ``result`` is
        not passed to the state prompt at all.

        :param state: Current browser state rendered into the prompt.
        :param result: Results of the previous actions, if any.
        :param step_info: Step metadata forwarded to the prompt.
        """

        # if keep in memory, add to directly to history and add state without result
        if result:
            for r in result:
                if r.include_in_memory:
                    if r.extracted_content:
                        msg = HumanMessage(content=str(r.extracted_content))
                        self._add_message_with_tokens(msg)
                    if r.error:
                        # Keep only the tail of long error texts.
                        msg = HumanMessage(
                            content=str(r.error)[-self.max_error_length :]
                        )
                        self._add_message_with_tokens(msg)
                    result = None  # if result in history, we dont want to add it again

        # otherwise add state message and result to next message (which will not stay in memory)
        state_message = CustomAgentMessagePrompt(
            state,
            result,
            include_attributes=self.include_attributes,
            max_error_length=self.max_error_length,
            step_info=step_info,
        ).get_user_message()
        self._add_message_with_tokens(state_message)
126 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from mcp_browser_use.utils.logging import configure_logging
4 |
5 | # It is critical to configure logging before any other modules are imported,
6 | # as they might initialize logging themselves.
7 | configure_logging()
8 |
9 | import asyncio
10 | import logging
11 | import os
12 | import sys
13 | import traceback
14 | from typing import Any, Optional
15 |
16 | from browser_use import Browser
17 | from fastmcp import FastMCP
18 | from mcp_browser_use.agent.custom_agent import CustomAgent
19 | from mcp_browser_use.controller.custom_controller import CustomController
20 | from mcp_browser_use.browser.browser_manager import create_browser_session
21 | from mcp_browser_use.utils import utils
22 | from mcp_browser_use.utils.agent_state import AgentState
23 |
24 | logger = logging.getLogger(__name__)
25 |
26 | app = FastMCP("mcp_browser_use")
27 |
28 |
@app.tool()
async def run_browser_agent(task: str, add_infos: str = "") -> str:
    """
    Run a browser-based agent for a single task and return its final result.

    Model and agent settings are read from the ``MCP_*`` environment variables
    on every call; numeric values that fail to parse fall back to their
    defaults with a warning. A new browser session is created for the run and
    is always stopped afterwards, whether the run succeeds or fails.

    :param task: The main instruction or goal for the agent.
    :param add_infos: Additional information or context for the agent.
    :return: The final result string from the agent run.
    :raises ValueError: If any step of the run fails. The message embeds the
        original error and its traceback; the original exception is chained
        as ``__cause__``.
    """

    browser_session: Optional[Browser] = None
    agent_state = AgentState()

    def safe_float(env_var: str, default: float) -> float:
        """Safely parse a float from an environment variable."""
        try:
            return float(os.getenv(env_var, str(default)))
        except ValueError:
            logger.warning("Invalid float for %s, using default=%s", env_var, default)
            return default

    def safe_int(env_var: str, default: int) -> int:
        """Safely parse an int from an environment variable."""
        try:
            return int(os.getenv(env_var, str(default)))
        except ValueError:
            logger.warning("Invalid int for %s, using default=%s", env_var, default)
            return default

    def env_flag(env_var: str, default: bool) -> bool:
        """Parse a boolean flag; ``1``/``true``/``yes``/``on`` (any case) mean true."""
        raw = os.getenv(env_var)
        if raw is None:
            return default
        # Same truthy spellings as the browser flags, so every boolean setting
        # follows the rule stated in the configuration guide.
        return raw.strip().lower() in {"1", "true", "yes", "on"}

    try:
        # Clear any previous agent stop signals
        agent_state.clear_stop()

        # Read environment variables, falling back to defaults when unset
        model_provider = os.getenv("MCP_MODEL_PROVIDER", "anthropic")
        model_name = os.getenv("MCP_MODEL_NAME", "claude-3-5-sonnet-20241022")
        temperature = safe_float("MCP_TEMPERATURE", 0.3)
        max_steps = safe_int("MCP_MAX_STEPS", 30)
        use_vision = env_flag("MCP_USE_VISION", True)
        max_actions_per_step = safe_int("MCP_MAX_ACTIONS_PER_STEP", 5)
        tool_call_in_content = env_flag("MCP_TOOL_CALL_IN_CONTENT", True)

        # Prepare LLM
        llm = utils.get_llm_model(
            provider=model_provider, model_name=model_name, temperature=temperature
        )

        # Create a fresh browser session for this run
        browser_session = create_browser_session()
        await browser_session.start()

        # Create controller and agent
        controller = CustomController()
        agent = CustomAgent(
            task=task,
            add_infos=add_infos,
            use_vision=use_vision,
            llm=llm,
            browser_session=browser_session,
            controller=controller,
            max_actions_per_step=max_actions_per_step,
            tool_call_in_content=tool_call_in_content,
            agent_state=agent_state,
        )

        # Execute the agent task lifecycle
        history = await agent.execute_agent_task(max_steps=max_steps)

        # Extract final result from the agent's history
        final_result = history.final_result()
        if not final_result:
            final_result = f"No final result. Possibly incomplete. {history}"

        return final_result

    except Exception as e:
        logger.error("run-browser-agent error: %s", str(e))
        # Chain the cause so the original exception stays reachable.
        raise ValueError(
            f"run-browser-agent error: {e}\n{traceback.format_exc()}"
        ) from e

    finally:
        # Always ensure cleanup, even if no error. Nothing in this block may
        # raise: an exception here would replace the run's result or error.
        try:
            agent_state.request_stop()
        except Exception as stop_error:
            logger.warning("Error stopping agent state: %s", stop_error)

        if browser_session is not None:
            try:
                await browser_session.stop()
            except Exception as browser_error:
                logger.warning(
                    "Failed to stop browser session gracefully, killing it: %s",
                    browser_error,
                )
                if hasattr(browser_session, "kill"):
                    try:
                        await browser_session.kill()
                    except Exception as kill_error:
                        logger.warning(
                            "Failed to kill browser session: %s", kill_error
                        )
131 |
def launch_mcp_browser_use_server() -> None:
    """
    Console entry point: run the FastMCP application.

    Any exception escaping ``app.run()`` is logged together with its
    traceback and is not re-raised.
    """
    try:
        app.run()
    except Exception as exc:
        trace_text = traceback.format_exc()
        logger.error("Error running MCP server: %s\n%s", exc, trace_text)
141 |
142 |
143 | if __name__ == "__main__":
144 | launch_mcp_browser_use_server()
145 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MCP Browser Use Server
2 |
3 | [](https://smithery.ai/server/@JovaniPink/mcp-browser-use)
4 |
5 | > Model Context Protocol (MCP) server that wires [browser-use](https://github.com/browser-use/browser-use) into Claude Desktop and other MCP compatible clients.
6 |
7 |
8 |
9 | ## Overview
10 |
11 | This repository provides a production-ready wrapper around the `browser-use` automation engine. It exposes a single MCP tool (`run_browser_agent`) that orchestrates a browser session, executes the `browser-use` agent, and returns the final result back to the client. The refactored layout focuses on keeping configuration in one place, improving testability, and keeping `browser-use` upgrades isolated from MCP specific code.
12 |
13 | ### Key Capabilities
14 |
15 | - **Automated browsing** – Navigate, interact with forms, control tabs, capture screenshots, and read page content through natural-language instructions executed by `browser-use`.
16 | - **Agent lifecycle management** – `CustomAgent` wraps `browser-use`'s base agent to add history export, richer prompts, and consistent error handling across runs.
17 | - **Centralised browser configuration** – `create_browser_session` translates environment variables into a ready-to-use `BrowserSession`, enabling persistent profiles, proxies, and custom Chromium flags without touching the agent logic.
18 | - **FastMCP integration** – `server.py` registers the MCP tool, normalises configuration, and ensures the browser session is always cleaned up.
19 | - **Client helpers** – `client.py` includes async helpers for tests or other Python processes that wish to exercise the MCP server in-process.
20 |
21 | ### Project Structure
22 |
23 | ```
24 | .
25 | ├── documentation/
26 | │ ├── CONFIGURATION.md # Detailed configuration reference
27 | │ └── SECURITY.md # Security considerations for running the server
28 | ├── .env.example # Example environment variables for local development
29 | ├── src/mcp_browser_use/
30 | │ ├── agent/ # Custom agent, prompts, message history, and views
31 | │ ├── browser/ # Browser session factory and persistence helpers
32 | │ ├── controller/ # Custom controller extensions for clipboard actions
33 | │ ├── utils/ # LLM factory, agent state helpers, encoding utilities
34 | │ ├── client.py # Async helper for connecting to the FastMCP app
35 | │ └── server.py # FastMCP app and the `run_browser_agent` tool
36 | └── tests/ # Unit tests covering server helpers and agent features
37 | ```
38 |
39 | ## Getting Started
40 |
41 | ### Requirements
42 |
43 | - Python 3.11+
44 | - Google Chrome or Chromium (for local automation)
45 | - [`uv`](https://github.com/astral-sh/uv) for dependency management (recommended)
46 | - Optional: Claude Desktop or another MCP-compatible client for integration testing
47 |
48 | ### Installation
49 |
50 | ```bash
51 | git clone https://github.com/JovaniPink/mcp-browser-use.git
52 | cd mcp-browser-use
53 | uv sync
54 | ```
55 |
56 | Copy `sample.env.env` to `.env` (or export the variables in another way) and update the values for the providers you plan to use.
57 |
58 | ### Launching the server
59 |
60 | ```bash
61 | uv run mcp-browser-use
62 | ```
63 |
64 | The command invokes the console script defined in `pyproject.toml`, starts the FastMCP application, and registers the `run_browser_agent` tool.
65 |
66 | #### Using with Claude Desktop
67 |
68 | Once the server is running you can register it inside Claude Desktop, for example:
69 |
70 | ```json
71 | "mcpServers": {
72 | "mcp_server_browser_use": {
73 | "command": "uvx",
74 | "args": ["mcp-browser-use"],
75 | "env": {
76 | "MCP_MODEL_PROVIDER": "anthropic",
77 | "MCP_MODEL_NAME": "claude-3-5-sonnet-20241022"
78 | }
79 | }
80 | }
81 | ```
82 |
83 | ### Debugging
84 |
85 | For interactive debugging, use the [MCP Inspector](https://github.com/modelcontextprotocol/inspector):
86 |
87 | ```bash
88 | npx @modelcontextprotocol/inspector uv --directory /path/to/project run mcp-browser-use
89 | ```
90 |
91 | The inspector prints a URL that can be opened in the browser to watch tool calls and responses in real time.
92 |
93 | ## Configuration
94 |
95 | A full list of environment variables and their defaults is available in [documentation/CONFIGURATION.md](documentation/CONFIGURATION.md). Highlights include:
96 |
97 | - `MCP_MODEL_PROVIDER`, `MCP_MODEL_NAME`, `MCP_TEMPERATURE`, `MCP_MAX_STEPS`, `MCP_MAX_ACTIONS_PER_STEP`, and `MCP_USE_VISION` control the LLM and agent run.
98 | - Provider-specific API keys and endpoints (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `DEEPSEEK_API_KEY`, `GOOGLE_API_KEY`, `AZURE_OPENAI_API_KEY`, etc.).
99 | - Browser runtime flags (`BROWSER_USE_HEADLESS`, `BROWSER_USE_EXTRA_CHROMIUM_ARGS`, `CHROME_PERSISTENT_SESSION`, `BROWSER_USE_PROXY_URL`, ...).
100 |
101 | Use `.env` + [`python-dotenv`](https://pypi.org/project/python-dotenv/) or your preferred secrets manager to keep credentials out of source control.
102 |
103 | ## Running Tests
104 |
105 | ```bash
106 | uv run pytest
107 | ```
108 |
109 | The tests cover the custom agent behaviour, browser session factory, and other utility helpers.
110 |
111 | ## Security
112 |
113 | Controlling a full browser instance remotely can grant broad access to the host machine. Review [documentation/SECURITY.md](documentation/SECURITY.md) before exposing the server to untrusted environments.
114 |
115 | ## Contributing
116 |
117 | 1. Fork the repository
118 | 2. Create your feature branch: `git checkout -b my-new-feature`
119 | 3. Commit your changes: `git commit -m 'Add some feature'`
120 | 4. Push to the branch: `git push origin my-new-feature`
121 | 5. Open a pull request
122 |
123 | Bug reports and feature suggestions are welcome—please include logs and reproduction steps when applicable.
124 |
--------------------------------------------------------------------------------
/documentation/CONFIGURATION.md:
--------------------------------------------------------------------------------
1 | # Configuration Guide
2 |
3 | This guide describes every configuration option recognised by the MCP Browser Use server. All settings can be supplied as environment variables (e.g. via a `.env` file loaded with [`python-dotenv`](https://pypi.org/project/python-dotenv/)) or injected by your MCP client.
4 |
5 | The sample file at [`sample.env.env`](../sample.env.env) contains a ready-to-copy template with placeholders for secrets.
6 |
7 | ## How configuration is loaded
8 |
9 | 1. **Model & Agent settings** are read in [`server.py`](../src/mcp_browser_use/server.py). They control the language model as well as the agent run loop.
10 | 2. **Browser runtime settings** are parsed in [`browser/browser_manager.py`](../src/mcp_browser_use/browser/browser_manager.py) which returns a configured `BrowserSession` instance.
11 | 3. **Provider specific credentials** are consumed by the LLM factory in [`utils/utils.py`](../src/mcp_browser_use/utils/utils.py).
12 |
13 | Unless otherwise noted, boolean flags treat any of `1`, `true`, `yes`, `on` (case insensitive) as **true**. Any other value is considered **false**.
14 |
15 | ## Core Agent Options
16 |
17 | | Variable | Default | Description |
18 | | --- | --- | --- |
19 | | `MCP_MODEL_PROVIDER` | `anthropic` | LLM provider name passed to the LangChain factory. Supported values: `anthropic`, `openai`, `deepseek`, `gemini`, `ollama`, `azure_openai`. |
20 | | `MCP_MODEL_NAME` | `claude-3-5-sonnet-20241022` | Model identifier sent to the provider. Each provider supports its own model list. |
21 | | `MCP_TEMPERATURE` | `0.3` | Sampling temperature for the model. Parsed as float. |
22 | | `MCP_MAX_STEPS` | `30` | Maximum number of reasoning/action steps before aborting the run. Parsed as integer. |
23 | | `MCP_MAX_ACTIONS_PER_STEP` | `5` | Limits how many tool invocations the agent may issue in a single step. Parsed as integer. |
24 | | `MCP_USE_VISION` | `true` | Enables vision features within the agent (element snapshots). |
25 | | `MCP_TOOL_CALL_IN_CONTENT` | `true` | Whether tool call payloads are expected inside the model response content. |
26 |
27 | ## Provider Credentials & Endpoints
28 |
29 | The LLM factory reads the following variables when initialising clients. Only set the values for the provider(s) you actively use.
30 |
31 | | Variable | Purpose |
32 | | --- | --- |
33 | | `ANTHROPIC_API_KEY` | API key for Anthropic Claude models. |
34 | | `OPENAI_API_KEY` | API key for OpenAI models. |
35 | | `DEEPSEEK_API_KEY` | API key for DeepSeek hosted models. |
36 | | `GOOGLE_API_KEY` | API key for Google Gemini via LangChain Google Generative AI. |
37 | | `AZURE_OPENAI_API_KEY` | API key for Azure OpenAI deployments. |
38 | | `AZURE_OPENAI_ENDPOINT` | Endpoint URL for the Azure OpenAI deployment. |
39 | | `OPENAI_ENDPOINT` | Override the OpenAI base URL (useful for proxies). |
40 | | `DEEPSEEK_ENDPOINT` | Base URL for the DeepSeek-compatible endpoint. |
41 | | `ANTHROPIC_API_ENDPOINT` | Alternative base URL for Anthropic (rarely needed). |
42 |
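43 | When pointing to self-hosted or compatible services from your own code, you can also pass `base_url` (and `api_key`, where the provider accepts one) directly to `get_llm_model`; explicitly passed values take precedence over the environment variables above. See [`utils/utils.py`](../src/mcp_browser_use/utils/utils.py) for the full mapping.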
44 |
45 | ## Browser Runtime Options
46 |
47 | These options are parsed by [`BrowserEnvironmentConfig.from_env`](../src/mcp_browser_use/browser/browser_manager.py) and control Chromium launch behaviour.
48 |
49 | | Variable | Default | Description |
50 | | --- | --- | --- |
51 | | `CHROME_PATH` | _unset_ | Absolute path to a Chrome/Chromium executable. Leave unset to let `browser-use` manage Chromium via Playwright. |
52 | | `CHROME_USER_DATA` | _unset_ | Directory to store user data (profiles, cookies). Required when `CHROME_PERSISTENT_SESSION` is true. |
53 | | `CHROME_PERSISTENT_SESSION` | `false` | Keeps the browser profile between runs by mounting `CHROME_USER_DATA`. |
54 | | `CHROME_DEBUGGING_PORT` | _unset_ | Remote debugging port for attaching to an existing Chrome instance. Must be an integer. |
55 | | `CHROME_DEBUGGING_HOST` | _unset_ | Hostname/IP for remote debugging (e.g. `localhost`). |
56 | | `BROWSER_USE_HEADLESS` | `false` | Launch Chromium in headless mode. |
57 | | `BROWSER_USE_DISABLE_SECURITY` | `false` | Disables web security features (CORS, sandbox). Use with caution. |
58 | | `BROWSER_USE_EXTRA_CHROMIUM_ARGS` | _unset_ | Comma-separated list of additional Chromium command-line flags. |
59 | | `BROWSER_USE_ALLOWED_DOMAINS` | _unset_ | Comma-separated allowlist limiting which domains the agent may open. |
60 | | `BROWSER_USE_PROXY_URL` | _unset_ | HTTP/HTTPS proxy URL. |
61 | | `BROWSER_USE_NO_PROXY` | _unset_ | Hosts to bypass in proxy mode. |
62 | | `BROWSER_USE_PROXY_USERNAME` | _unset_ | Username for proxy authentication. |
63 | | `BROWSER_USE_PROXY_PASSWORD` | _unset_ | Password for proxy authentication. |
64 | | `BROWSER_USE_CDP_URL` | _unset_ | Connect to an existing Chrome DevTools Protocol endpoint instead of launching a new browser. |
65 |
66 | ### Persistence hints
67 |
68 | - When `CHROME_PERSISTENT_SESSION` is true and `CHROME_USER_DATA` is not provided, the server logs a warning and the session falls back to ephemeral storage.
69 | - Remote debugging settings (`CHROME_DEBUGGING_HOST` / `CHROME_DEBUGGING_PORT`) are optional and ignored if invalid values are supplied. The server logs a warning and continues with defaults.
70 |
71 | ## Additional Environment Variables
72 |
73 | Some ancillary features inspect the following variables:
74 |
75 | | Variable | Purpose |
76 | | --- | --- |
77 | | `WIN_FONT_DIR` | Custom Windows font directory used when generating GIF summaries of browsing sessions. |
78 |
79 | ## Tips for managing configuration
80 |
81 | - Store secrets outside of version control. When sharing an `.env` file, redact or rotate keys immediately.
82 | - Keep provider-specific settings grouped so you can switch model providers quickly when testing.
83 | - Start with the defaults, confirm the agent behaves as expected, then tighten security by restricting `BROWSER_USE_ALLOWED_DOMAINS` and enabling headless mode.
84 | - When experimenting locally, keep `CHROME_PERSISTENT_SESSION=false` to avoid stale cookies interfering with automation runs.
85 |
86 | For any options not covered here, consult the upstream [`browser-use` documentation](https://github.com/browser-use/browser-use) which explains additional environment variables recognised by the underlying library.
87 |
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import importlib
3 | import importlib.util
4 | import os
5 | import sys
6 | import time
7 | import types
8 |
9 | import pytest
10 |
# Absolute path to the utils module under test; it is loaded by file path
# below instead of through the package import system.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
UTILS_PATH = os.path.join(ROOT, "src", "mcp_browser_use", "utils", "utils.py")

# Register dummy langchain modules in ``sys.modules`` unless a module of that
# name has already been imported. Note the check is "already imported", not
# "installed". Each stub class accepts and ignores any constructor arguments.
if "langchain_openai" not in sys.modules:
    module = types.ModuleType("langchain_openai")

    class ChatOpenAI:
        def __init__(self, *args, **kwargs):
            pass

    class AzureChatOpenAI:
        def __init__(self, *args, **kwargs):
            pass

    module.ChatOpenAI = ChatOpenAI
    module.AzureChatOpenAI = AzureChatOpenAI
    sys.modules["langchain_openai"] = module

if "langchain_anthropic" not in sys.modules:
    module = types.ModuleType("langchain_anthropic")

    class ChatAnthropic:
        def __init__(self, *args, **kwargs):
            pass

    module.ChatAnthropic = ChatAnthropic
    sys.modules["langchain_anthropic"] = module

if "langchain_google_genai" not in sys.modules:
    module = types.ModuleType("langchain_google_genai")

    class ChatGoogleGenerativeAI:
        def __init__(self, *args, **kwargs):
            pass

    module.ChatGoogleGenerativeAI = ChatGoogleGenerativeAI
    sys.modules["langchain_google_genai"] = module

if "langchain_ollama" not in sys.modules:
    module = types.ModuleType("langchain_ollama")

    class ChatOllama:
        def __init__(self, *args, **kwargs):
            pass

    module.ChatOllama = ChatOllama
    sys.modules["langchain_ollama"] = module

# Minimal ``browser_use`` package stub exposing only
# ``browser_use.browser.events.ScreenshotEvent``, which utils.py imports.
if "browser_use" not in sys.modules:
    browser_use_module = types.ModuleType("browser_use")
    browser_module = types.ModuleType("browser_use.browser")
    events_module = types.ModuleType("browser_use.browser.events")

    class ScreenshotEvent:
        def __init__(self, full_page: bool = False):
            self.full_page = full_page

    events_module.ScreenshotEvent = ScreenshotEvent
    browser_module.events = events_module
    browser_use_module.browser = browser_module

    # Register every level of the dotted path so ``from a.b.c import X`` resolves.
    sys.modules["browser_use"] = browser_use_module
    sys.modules["browser_use.browser"] = browser_module
    sys.modules["browser_use.browser.events"] = events_module

# Import utils module directly from file after stubbing dependencies. This must
# stay below the stubs: executing utils.py runs its top-level imports.
spec = importlib.util.spec_from_file_location("mcp_browser_use.utils.utils", UTILS_PATH)
utils = importlib.util.module_from_spec(spec)
spec.loader.exec_module(utils)
82 |
83 |
@pytest.fixture
def anyio_backend():
    """Return ``"asyncio"`` as the backend name.

    NOTE(review): presumably consumed by the anyio pytest plugin that runs the
    ``@pytest.mark.anyio`` tests in this module; confirm against the plugin docs.
    """
    return "asyncio"
87 |
88 |
def test_get_llm_model_returns_chatopenai():
    """The "openai" provider yields an instance of ``utils.ChatOpenAI``."""
    assert isinstance(utils.get_llm_model("openai"), utils.ChatOpenAI)
92 |
93 |
def test_get_llm_model_unknown_provider_raises():
    """An unrecognised provider name is rejected with ``ValueError``."""
    unsupported_provider = "unknown"
    with pytest.raises(ValueError):
        utils.get_llm_model(unsupported_provider)
97 |
98 |
def test_encode_image_handles_empty_path():
    """Both ``None`` and the empty string produce ``None``."""
    for empty_path in (None, ""):
        assert utils.encode_image(empty_path) is None
102 |
103 |
def test_encode_image_roundtrip(tmp_path):
    """File contents come back as their base64 text representation."""
    raw_bytes = b"test-bytes"
    source_file = tmp_path / "image.bin"
    source_file.write_bytes(raw_bytes)
    expected = base64.b64encode(raw_bytes).decode("utf-8")

    assert utils.encode_image(str(source_file)) == expected
112 |
113 |
def test_encode_image_missing_file(tmp_path):
    """A path that does not exist raises ``FileNotFoundError``."""
    absent_path = str(tmp_path / "missing.bin")
    with pytest.raises(FileNotFoundError):
        utils.encode_image(absent_path)
117 |
118 |
def test_get_latest_files_creates_directory(tmp_path):
    """A missing directory is created and every file type maps to ``None``."""
    captures_dir = tmp_path / "captures"

    latest = utils.get_latest_files(str(captures_dir), file_types=[".webm", ".zip"])

    assert captures_dir.exists()
    assert latest == {".webm": None, ".zip": None}
126 |
127 |
def test_get_latest_files_skips_recent_files(tmp_path, monkeypatch):
    """A file modified "right now" is not reported as the latest file."""
    captures_dir = tmp_path / "captures"
    captures_dir.mkdir()
    fresh_file = captures_dir / "recent.webm"
    fresh_file.write_text("recent")

    # Pin the file's timestamps and the clock seen by utils to the same instant.
    frozen_now = time.time()
    os.utime(fresh_file, (frozen_now, frozen_now))
    monkeypatch.setattr(utils.time, "time", lambda: frozen_now)

    latest = utils.get_latest_files(str(captures_dir), file_types=[".webm"])

    assert latest == {".webm": None}
143 |
144 |
@pytest.mark.anyio("asyncio")
async def test_capture_screenshot_uses_event_bus():
    """``capture_screenshot`` dispatches one ``ScreenshotEvent``, awaits it, and returns its result."""

    class FakeEvent:
        """Awaitable that records being awaited and serves a canned result."""

        def __init__(self, result):
            self._result = result
            self.awaited = False

        def __await__(self):
            async def _mark_and_return():
                self.awaited = True
                return self

            return _mark_and_return().__await__()

        async def event_result(self, raise_if_any=True, raise_if_none=True):
            return self._result

    class FakeEventBus:
        """Records dispatched events and always hands back the prepared one."""

        def __init__(self, dispatched_event):
            self._event = dispatched_event
            self.dispatched = []

        def dispatch(self, event):
            self.dispatched.append(event)
            return self._event

    class FakeBrowserSession:
        def __init__(self, event_bus):
            self.event_bus = event_bus

    expected_payload = base64.b64encode(b"payload").decode("utf-8")
    fake_event = FakeEvent(expected_payload)
    bus = FakeEventBus(fake_event)

    encoded = await utils.capture_screenshot(FakeBrowserSession(bus))

    assert encoded == expected_payload
    assert fake_event.awaited is True
    assert len(bus.dispatched) == 1
    assert isinstance(bus.dispatched[0], utils.ScreenshotEvent)
187 |
188 |
@pytest.mark.anyio("asyncio")
async def test_capture_screenshot_returns_none_on_error():
    """A failure while reading the event result yields ``None`` instead of raising."""

    class FailingEvent:
        """Awaitable whose result lookup always raises."""

        def __await__(self):
            async def _return_self():
                return self

            return _return_self().__await__()

        async def event_result(self, raise_if_any=True, raise_if_none=True):
            raise RuntimeError("boom")

    class FailingEventBus:
        def dispatch(self, event):
            return FailingEvent()

    class FakeBrowserSession:
        def __init__(self):
            self.event_bus = FailingEventBus()

    outcome = await utils.capture_screenshot(FakeBrowserSession())

    assert outcome is None
214 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/browser/browser_manager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Utility helpers for configuring and creating :class:`BrowserSession` instances.
3 |
4 | This module consolidates the thin wrappers that previously lived in
5 | ``custom_browser.py``, ``custom_context.py``, and ``config.py``. The new structure
6 | centralises environment parsing so ``server.py`` can simply request a configured
7 | browser session without re-implementing the translation from environment
8 | variables to ``BrowserSession`` keyword arguments.
9 | """
10 |
11 | from __future__ import annotations
12 |
13 | import logging
14 | import os
15 | from dataclasses import dataclass
16 | from typing import Any, Dict, Optional
17 |
18 | from browser_use import BrowserSession
19 | from browser_use.browser.profile import ProxySettings
20 |
21 | logger = logging.getLogger(__name__)
22 |
23 | _BOOL_TRUE = {"1", "true", "yes", "on"}
24 |
25 |
26 | @dataclass(slots=True)
27 | class BrowserPersistenceConfig:
28 | """Configuration for browser persistence and remote debugging settings."""
29 |
30 | persistent_session: bool = False
31 | user_data_dir: Optional[str] = None
32 | debugging_port: Optional[int] = None
33 | debugging_host: Optional[str] = None
34 |
35 | @classmethod
36 | def from_env(cls) -> "BrowserPersistenceConfig":
37 | persistent_session = (
38 | os.getenv("CHROME_PERSISTENT_SESSION", "").lower() in _BOOL_TRUE
39 | )
40 | user_data_dir = os.getenv("CHROME_USER_DATA") or None
41 |
42 | debugging_port: Optional[int]
43 | port_value = os.getenv("CHROME_DEBUGGING_PORT")
44 | if port_value:
45 | try:
46 | debugging_port = int(port_value)
47 | except ValueError:
48 | logger.warning(
49 | "Invalid CHROME_DEBUGGING_PORT=%r, ignoring debug port setting.",
50 | port_value,
51 | )
52 | debugging_port = None
53 | else:
54 | debugging_port = None
55 |
56 | debugging_host = os.getenv("CHROME_DEBUGGING_HOST") or None
57 |
58 | return cls(
59 | persistent_session=persistent_session,
60 | user_data_dir=user_data_dir,
61 | debugging_port=debugging_port,
62 | debugging_host=debugging_host,
63 | )
64 |
65 |
66 | @dataclass(slots=True)
67 | class BrowserEnvironmentConfig:
68 | """All runtime settings required for instantiating ``BrowserSession``."""
69 |
70 | headless: bool = False
71 | disable_security: bool = False
72 | executable_path: Optional[str] = None
73 | args: Optional[list[str]] = None
74 | allowed_domains: Optional[list[str]] = None
75 | proxy: Optional[ProxySettings] = None
76 | cdp_url: Optional[str] = None
77 | user_data_dir: Optional[str] = None
78 |
79 | def to_kwargs(self) -> Dict[str, Any]:
80 | """Convert to keyword arguments understood by :class:`BrowserSession`."""
81 |
82 | kwargs: Dict[str, Any] = {
83 | "headless": self.headless,
84 | "disable_security": self.disable_security,
85 | "executable_path": self.executable_path,
86 | "args": self.args,
87 | "allowed_domains": self.allowed_domains,
88 | "proxy": self.proxy,
89 | "cdp_url": self.cdp_url,
90 | "user_data_dir": self.user_data_dir,
91 | }
92 | # Remove ``None`` values so BrowserSession can rely on its defaults.
93 | return {key: value for key, value in kwargs.items() if value is not None}
94 |
95 | @classmethod
96 | def from_env(cls) -> "BrowserEnvironmentConfig":
97 | persistence = BrowserPersistenceConfig.from_env()
98 |
99 | headless = os.getenv("BROWSER_USE_HEADLESS", "false").lower() in _BOOL_TRUE
100 | disable_security = (
101 | os.getenv("BROWSER_USE_DISABLE_SECURITY", "false").lower() in _BOOL_TRUE
102 | )
103 | executable_path = os.getenv("CHROME_PATH") or None
104 |
105 | extra_args_env = os.getenv("BROWSER_USE_EXTRA_CHROMIUM_ARGS")
106 | args = None
107 | if extra_args_env:
108 | args = [arg.strip() for arg in extra_args_env.split(",") if arg.strip()]
109 |
110 | allowed_domains_env = os.getenv("BROWSER_USE_ALLOWED_DOMAINS")
111 | allowed_domains = None
112 | if allowed_domains_env:
113 | allowed_domains = [
114 | domain.strip()
115 | for domain in allowed_domains_env.split(",")
116 | if domain.strip()
117 | ]
118 |
119 | proxy_url = os.getenv("BROWSER_USE_PROXY_URL")
120 | proxy: Optional[ProxySettings] = None
121 | if proxy_url:
122 | proxy = ProxySettings(
123 | server=proxy_url,
124 | bypass=os.getenv("BROWSER_USE_NO_PROXY"),
125 | username=os.getenv("BROWSER_USE_PROXY_USERNAME"),
126 | password=os.getenv("BROWSER_USE_PROXY_PASSWORD"),
127 | )
128 |
129 | cdp_url = os.getenv("BROWSER_USE_CDP_URL") or None
130 | if not cdp_url and (persistence.debugging_host or persistence.debugging_port):
131 | host = persistence.debugging_host or "127.0.0.1"
132 | port = persistence.debugging_port or 9222
133 | cdp_url = f"http://{host}:{port}"
134 |
135 | user_data_dir = None
136 | if persistence.persistent_session:
137 | if persistence.user_data_dir:
138 | user_data_dir = persistence.user_data_dir
139 | else:
140 | logger.warning(
141 | "CHROME_PERSISTENT_SESSION requested but CHROME_USER_DATA was not provided."
142 | )
143 |
144 | return cls(
145 | headless=headless,
146 | disable_security=disable_security,
147 | executable_path=executable_path,
148 | args=args,
149 | allowed_domains=allowed_domains,
150 | proxy=proxy,
151 | cdp_url=cdp_url,
152 | user_data_dir=user_data_dir,
153 | )
154 |
155 |
def create_browser_session(
    overrides: Optional[Dict[str, Any]] = None,
) -> BrowserSession:
    """Build a :class:`BrowserSession` from the environment, then apply ``overrides``.

    For each entry in ``overrides``: a non-``None`` value replaces (or adds)
    that keyword argument, while an explicit ``None`` removes the
    environment-derived value so BrowserSession falls back to its own default.
    """

    session_kwargs = BrowserEnvironmentConfig.from_env().to_kwargs()

    for option, override in (overrides or {}).items():
        if override is None:
            # Explicit ``None``: drop the environment-derived value, if any.
            session_kwargs.pop(option, None)
        else:
            session_kwargs[option] = override

    # The proxy entry is left out of the debug log.
    loggable = {k: v for k, v in session_kwargs.items() if k != "proxy"}
    logger.debug("Creating BrowserSession with kwargs: %s", loggable)
    return BrowserSession(**session_kwargs)
182 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/utils/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import base64
4 | import logging
5 | import os
6 | import time
7 | from pathlib import Path
8 | from typing import Any, Callable, Dict, List, Optional, Tuple, Type
9 |
10 | from browser_use.browser.events import ScreenshotEvent
11 | from langchain_anthropic import ChatAnthropic
12 | from langchain_google_genai import ChatGoogleGenerativeAI
13 | from langchain_ollama import ChatOllama
14 | from langchain_openai import AzureChatOpenAI, ChatOpenAI
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
def _anthropic_params(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Build constructor keyword arguments for ``ChatAnthropic``.

    :param kwargs: Options passed to ``get_llm_model`` (``model_name``,
        ``temperature``, ``base_url``, ``api_key``).
    :return: Keyword arguments for the client class.

    The base URL resolves in this order: explicit ``base_url``, then the
    ``ANTHROPIC_API_ENDPOINT`` environment variable, then the public
    Anthropic endpoint. An empty value at any level falls through to the next.
    """
    base_url = (
        kwargs.get("base_url")
        # Honour the documented endpoint override, like the sibling builders.
        or os.getenv("ANTHROPIC_API_ENDPOINT")
        or "https://api.anthropic.com"
    )
    return {
        "model_name": kwargs.get("model_name", "claude-3-5-sonnet-20240620"),
        "temperature": kwargs.get("temperature", 0.0),
        "base_url": base_url,
        "api_key": kwargs.get("api_key") or os.getenv("ANTHROPIC_API_KEY", ""),
    }
26 |
27 |
def _openai_params(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Build constructor keyword arguments for the OpenAI chat client.

    Explicit ``base_url`` / ``api_key`` win over the ``OPENAI_ENDPOINT`` /
    ``OPENAI_API_KEY`` environment variables.
    """
    model = kwargs.get("model_name", "gpt-4o")
    temperature = kwargs.get("temperature", 0.0)
    endpoint = kwargs.get("base_url")
    if not endpoint:
        endpoint = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
    key = kwargs.get("api_key")
    if not key:
        key = os.getenv("OPENAI_API_KEY", "")
    return {
        "model": model,
        "temperature": temperature,
        "base_url": endpoint,
        "api_key": key,
    }
36 |
37 |
def _deepseek_params(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Build constructor keyword arguments for a DeepSeek model.

    Explicit ``base_url`` / ``api_key`` win over the ``DEEPSEEK_ENDPOINT`` /
    ``DEEPSEEK_API_KEY`` environment variables; both default to ``""``.
    """
    model = kwargs.get("model_name", "deepseek-chat")
    temperature = kwargs.get("temperature", 0.0)
    endpoint = kwargs.get("base_url")
    if not endpoint:
        endpoint = os.getenv("DEEPSEEK_ENDPOINT", "")
    key = kwargs.get("api_key")
    if not key:
        key = os.getenv("DEEPSEEK_API_KEY", "")
    return {
        "model": model,
        "temperature": temperature,
        "base_url": endpoint,
        "api_key": key,
    }
45 |
46 |
def _gemini_params(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Build constructor keyword arguments for the Gemini chat client.

    The key is taken from ``api_key`` when given, otherwise from the
    ``GOOGLE_API_KEY`` environment variable (default ``""``).
    """
    key = kwargs.get("api_key")
    if not key:
        key = os.getenv("GOOGLE_API_KEY", "")
    return {
        "model": kwargs.get("model_name", "gemini-2.0-flash-exp"),
        "temperature": kwargs.get("temperature", 0.0),
        "google_api_key": key,
    }
53 |
54 |
def _ollama_params(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Build constructor keyword arguments for the Ollama chat client.

    No environment variables are consulted; each option falls back to its
    default only when the key is absent from ``kwargs``.
    """
    settings: Dict[str, Any] = {}
    settings["model"] = kwargs.get("model_name", "phi4")
    settings["temperature"] = kwargs.get("temperature", 0.0)
    settings["num_ctx"] = kwargs.get("num_ctx", 128000)
    settings["base_url"] = kwargs.get("base_url", "http://localhost:11434")
    return settings
62 |
63 |
def _azure_openai_params(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Build constructor keyword arguments for the Azure OpenAI chat client.

    ``base_url`` is passed on as ``azure_endpoint``. Explicit values win over
    the ``AZURE_OPENAI_ENDPOINT`` / ``AZURE_OPENAI_API_KEY`` environment
    variables.
    """
    endpoint = kwargs.get("base_url")
    if not endpoint:
        endpoint = os.getenv("AZURE_OPENAI_ENDPOINT", "")
    key = kwargs.get("api_key")
    if not key:
        key = os.getenv("AZURE_OPENAI_API_KEY", "")
    return {
        "model": kwargs.get("model_name", "gpt-4o"),
        "temperature": kwargs.get("temperature", 0.0),
        "api_version": kwargs.get("api_version", "2024-05-01-preview"),
        "azure_endpoint": endpoint,
        "api_key": key,
    }
73 |
74 |
# Registry used by ``get_llm_model``: provider name -> (client class, builder).
# The builder turns the caller's keyword arguments into the constructor
# keyword arguments for that client class. "deepseek" reuses ``ChatOpenAI``
# with DeepSeek-specific endpoint and key settings.
LLM_PROVIDERS: Dict[str, Tuple[Type, Callable[[Dict[str, Any]], Dict[str, Any]]]] = {
    "anthropic": (ChatAnthropic, _anthropic_params),
    "openai": (ChatOpenAI, _openai_params),
    "deepseek": (ChatOpenAI, _deepseek_params),
    "gemini": (ChatGoogleGenerativeAI, _gemini_params),
    "ollama": (ChatOllama, _ollama_params),
    "azure_openai": (AzureChatOpenAI, _azure_openai_params),
}
83 |
84 |
def get_llm_model(provider: str, **kwargs) -> Any:
    """
    Instantiate the chat model registered for `provider`.

    The provider is looked up in LLM_PROVIDERS; its parameter builder turns
    `kwargs` into constructor arguments, which are then passed to the
    registered model class.

    :param provider: Registry key of the LLM provider (e.g., "anthropic", "openai", "azure_openai").
    :param kwargs: Optional settings forwarded to the builder (model_name, temperature, base_url, api_key, etc.).
    :return: The model instance created by the registered class.
    :raises ValueError: If `provider` is not a key of LLM_PROVIDERS.
    """
    try:
        registry_entry = LLM_PROVIDERS[provider]
    except KeyError as error:
        # Surface an unknown provider as ValueError, keeping the KeyError as cause.
        raise ValueError(f"Unsupported provider: {provider}") from error
    else:
        model_cls, build_params = registry_entry
        return model_cls(**build_params(kwargs))
102 |
103 |
# Commonly used model names per provider, kept for quick reference.
# Keys match the provider names registered in LLM_PROVIDERS. This table is
# informational only: get_llm_model does not consult or validate against it.
model_names = {
    "anthropic": ["claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
    "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
    "deepseek": ["deepseek-chat"],
    "gemini": [
        "gemini-2.0-flash-exp",
        "gemini-2.0-flash-thinking-exp",
        "gemini-1.5-flash-latest",
        "gemini-1.5-flash-8b-latest",
        "gemini-2.0-flash-thinking-exp-1219",
    ],
    "ollama": ["deepseek-r1:671b", "qwen2.5:7b", "llama3.3", "phi4"],
    "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
}
119 |
120 |
def encode_image(img_path: Optional[str]) -> Optional[str]:
    """
    Read the file at `img_path` and return its bytes as a base64 string.

    :param img_path: Path of the image file; None or "" means "no image".
    :return: The base64 text (UTF-8 decoded), or None when no path was given.
    :raises FileNotFoundError: If the file does not exist (logged, then re-raised).
    :raises Exception: Any other read/encode error is logged and re-raised unchanged.
    """
    if img_path:
        try:
            with open(img_path, "rb") as handle:
                raw_bytes = handle.read()
                encoded = base64.b64encode(raw_bytes)
                return encoded.decode("utf-8")
        except FileNotFoundError as error:
            logger.error(f"Image not found at path {img_path}: {error}")
            raise
        except Exception as error:
            logger.error(f"Error encoding image at {img_path}: {error}")
            raise

    # Nothing to encode for a missing/empty path.
    return None
140 |
141 |
def get_latest_files(
    directory: str, file_types: Optional[List[str]] = None
) -> Dict[str, Optional[str]]:
    """
    Find the latest file for each extension in `file_types` under `directory`.
    Returns a dict {file_extension: latest_file_path or None}.

    The search is recursive. A newest file modified within the last second is
    treated as possibly still being written and reported as None. If
    `directory` does not exist it is created and every extension maps to None.
    Errors while scanning one extension are logged and leave that extension
    as None; they do not abort the other extensions.

    :param directory: The directory to search.
    :param file_types: List of file extensions (e.g., [".webm", ".zip"]).
        Defaults to [".webm", ".zip"] when omitted or None.
    :return: dict mapping each extension to the path of the newest file or None if not found.
    """
    # Build the default per call instead of sharing one mutable list object
    # across every invocation.
    if file_types is None:
        file_types = [".webm", ".zip"]

    latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types}

    if not os.path.exists(directory):
        logger.debug(f"Directory '{directory}' does not exist. Creating it.")
        os.makedirs(directory, exist_ok=True)
        return latest_files

    for file_type in file_types:
        try:
            # stat() each candidate exactly once so the mtime used to pick the
            # newest file is the same one used for the freshness check below.
            stamped = [
                (path.stat().st_mtime, path)
                for path in Path(directory).rglob(f"*{file_type}")
            ]
            if not stamped:
                continue

            newest_mtime, most_recent_file = max(stamped, key=lambda pair: pair[0])

            # Check if file is not actively being written
            if time.time() - newest_mtime > 1.0:
                latest_files[file_type] = str(most_recent_file)
            else:
                logger.debug(
                    f"Skipping file {most_recent_file} - possibly still being written."
                )
        except Exception as error:
            logger.error(
                f"Error getting latest {file_type} file in '{directory}': {error}"
            )

    return latest_files
181 |
182 |
async def capture_screenshot(browser_session) -> Optional[str]:
    """
    Request a viewport screenshot (full_page=False) through the session's event bus.

    :param browser_session: Object expected to expose an `event_bus` attribute
        with a `dispatch` method.
    :return: The result of the dispatched ScreenshotEvent, or None when the
        session has no event bus or any step fails (failures are logged, never raised).
        NOTE(review): the result is presumably base64 image data - confirm
        against the browser-use ScreenshotEvent handler.
    """
    if hasattr(browser_session, "event_bus"):
        try:
            pending = browser_session.event_bus.dispatch(
                ScreenshotEvent(full_page=False)
            )
            # Wait for the event itself before asking for its result.
            await pending
            screenshot = await pending.event_result(
                raise_if_any=True, raise_if_none=True
            )
        except Exception as error:
            logger.error(f"Failed to capture screenshot via event bus: {error}")
            return None
        return screenshot

    logger.error("Browser session does not have an event_bus.")
    return None
198 |
--------------------------------------------------------------------------------
/src/mcp_browser_use/agent/custom_prompts.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from typing import List, Optional
4 |
5 | from browser_use.agent.prompts import SystemPrompt
6 | from browser_use.agent.views import ActionResult
7 | from browser_use.browser.views import BrowserState
8 | from langchain_core.messages import HumanMessage, SystemMessage
9 |
10 | from mcp_browser_use.agent.custom_views import CustomAgentStepInfo
11 |
12 |
13 | class CustomSystemPrompt(SystemPrompt):
14 | """
15 | Custom system prompt that extends SystemPrompt to inject additional
16 | formatting rules and instructions for the AI agent.
17 | """
18 |
19 | def important_rules(self) -> str:
20 | """
21 | Return a detailed multiline string describing how the agent
22 | must format its JSON response, handle multiple actions, forms,
23 | navigation, and the maximum actions per step.
24 |
25 | The text includes guidelines for:
26 | - JSON response format
27 | - Action sequences
28 | - Element interaction
29 | - Navigation & error handling
30 | - Task completion
31 | - Visual context usage
32 | - Handling form filling and suggestions
33 | """
34 | text = r"""
35 | 1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
36 | {
37 | "current_state": {
38 | "prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
39 | "important_contents": "Output important contents closely related to user's instruction or task on the current page. If there is, please output the contents. If not, please output empty string ''.",
40 | "completed_contents": "Update the input Task Progress. Completed contents is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the current page and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button",
41 | "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If the output of prev_action_evaluation is 'Failed', please reflect and output your reflection here. If you think you have entered the wrong page, consider to go back to the previous page in next action.",
42 | "summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
43 | },
44 | "action": [
45 | {
46 | "action_name": {
47 | // action-specific parameters
48 | }
49 | },
50 | // ... more actions in sequence
51 | ]
52 | }
53 |
54 | 2. ACTIONS: You can specify multiple actions to be executed in sequence.
55 | Common action sequences:
56 | - Form filling: [
57 | {"input_text": {"index": 1, "text": "username"}},
58 | {"input_text": {"index": 2, "text": "password"}},
59 | {"click_element": {"index": 3}}
60 | ]
61 | - Navigation and extraction: [
62 | {"open_new_tab": {}},
63 | {"go_to_url": {"url": "https://example.com"}},
64 | {"extract_page_content": {}}
65 | ]
66 |
67 | 3. ELEMENT INTERACTION:
68 | - Only use indexes that exist in the provided element list
69 | - Each element has a unique index number (e.g., "33[:]