4 | OTA user interaction data helper
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/browser_use/controller/views_selector.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from pydantic import BaseModel, ConfigDict, Field, model_validator
4 |
class InputTextBySelectorAction(BaseModel):
    """Parameter model holding a selector, the text to input, and an optional XPath.

    NOTE(review): presumably the parameters of the ``input_text_by_selector``
    replay action -- confirm against the controller that consumes this model.
    """

    # Selector string for the target element (required).
    selector: str
    # Text value associated with the action (required).
    text: str
    # Optional XPath; defaults to None when not supplied.
    xpath: Optional[str] = None
9 |
10 |
class Position(BaseModel):
    """Pair of integer coordinates."""

    # Horizontal coordinate. NOTE(review): unit/origin not defined here -- presumably pixels.
    x: int
    # Vertical coordinate.
    y: int
14 |
--------------------------------------------------------------------------------
/chrome-extension/.editorconfig:
--------------------------------------------------------------------------------
1 | # top-most EditorConfig file
2 | root = true
3 |
4 | # Newline ending every file
5 | [*]
6 | end_of_line = lf
7 | insert_final_newline = true
8 |
9 | # Charset
10 | [*.{js,html,css,md,json}]
11 | charset = utf-8
12 |
13 | # Indentation
14 | [*.{js,html,css,json}]
15 | indent_style = space
16 | indent_size = 4
--------------------------------------------------------------------------------
/prompts/subgoal_generation/task-start.md:
--------------------------------------------------------------------------------
1 | I need your help analyzing an action in the browser and its related changes.
2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is:
3 |
4 | {{ ultimate_goal }}
5 |
6 | The user just started this task, and when he clicked "task start" button, his current page is at:
7 |
8 | {{ change_events }}
9 |
10 | Based on this information, provide a concise, formatted instruction in JSON that tells another agent which website it needs to go to, e.g.:
11 | {"next_goal": "Open allrecipes.com in a new tab to search for the recipe."}
--------------------------------------------------------------------------------
/chrome-extension/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "manifest_version": 3,
3 | "name": "WAP Browser Action Capturer",
4 | "version": "1.0",
5 | "description": "A simple tool helping you to collect the interactions with browser for WAP replay.",
6 | "icons": {
7 | "128": "ico/ota-logo-128.png",
8 | "48": "ico/ota-logo-48.png"
9 | },
10 | "permissions": [
11 | "activeTab",
12 | "webNavigation",
13 | "scripting",
14 | "storage"
15 | ],
16 | "optional_host_permissions": [
17 | "*://*/*"
18 | ],
19 | "background": {
20 | "service_worker": "js/background.js"
21 | },
22 | "devtools_page": "devtools.html",
23 | "content_security_policy": {
24 | "extension_pages": "script-src 'self'; object-src 'self'"
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/chrome-extension/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "WAP-Browser-Action-Capturer",
3 | "version": "1.3.0",
4 | "description": "A simple tool helping you to collect the interactions with browser for WAP replay",
5 | "main": "Gruntfile.js",
6 | "repository": {
7 | "type": "git",
8 | "url": "git://github.com/OTA-Tech-AI/webagentprotocol.git"
9 | },
10 | "author": "Konrad Dzwinel",
11 | "license": "GPL",
12 | "bugs": {
13 | "url": "https://github.com/OTA-Tech-AI/webagentprotocol/issues"
14 | },
15 | "homepage": "https://github.com/OTA-Tech-AI/webagentprotocol",
16 | "dependencies": {
17 | "grunt": "^1.0.3",
18 | "grunt-contrib-csslint": "^2.0.0",
19 | "grunt-contrib-jshint": "^1.1.0",
20 | "grunt-contrib-watch": "^1.1.0",
21 | "grunt-zip": "^0.17.1"
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/browser_use/__init__.py:
--------------------------------------------------------------------------------
1 | from browser_use.logging_config import setup_logging
2 |
3 | setup_logging()
4 |
5 | from browser_use.agent.prompts import SystemPrompt as SystemPrompt
6 | from browser_use.agent.service import Agent as Agent
7 | from browser_use.agent.views import ActionModel as ActionModel
8 | from browser_use.agent.views import ActionResult as ActionResult
9 | from browser_use.agent.views import AgentHistoryList as AgentHistoryList
10 | from browser_use.browser.browser import Browser as Browser
11 | from browser_use.browser.browser import BrowserConfig as BrowserConfig
12 | from browser_use.browser.context import BrowserContextConfig
13 | from browser_use.controller.service import Controller as Controller
14 | from browser_use.dom.service import DomService as DomService
15 |
16 | __all__ = [
17 | 'Agent',
18 | 'Browser',
19 | 'BrowserConfig',
20 | 'Controller',
21 | 'DomService',
22 | 'SystemPrompt',
23 | 'ActionResult',
24 | 'ActionModel',
25 | 'AgentHistoryList',
26 | 'BrowserContextConfig',
27 | ]
28 |
--------------------------------------------------------------------------------
/wap_service.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request
2 | import run_replay
3 |
4 | app = Flask(__name__)
5 |
@app.route('/replay', methods=['GET'])
async def run_replay_endpoint():
    """Run a replay described by the request's query-string parameters.

    Query parameters:
        concurrent: integer, passed to ``run_replay.main`` as its first argument.
        model: required, passed through unchanged.
        file_path: required, passed through unchanged.

    Returns:
        A dict (optionally paired with a status code): 400 when ``concurrent``
        is missing or not an integer, or when ``model``/``file_path`` is
        missing; 500 when the replay itself raises; the default status on
        success.
    """
    # Parse the integer on its own so that only a bad ``concurrent`` value is
    # reported as such. ``int(None)`` (parameter absent) raises TypeError, and
    # a ValueError coming out of the replay must not be mistaken for bad input.
    try:
        iterations = int(request.args.get('concurrent'))
    except (TypeError, ValueError):
        return {"status": "error", "message": "Invalid iterations value: must be an integer"}, 400

    model = request.args.get('model')
    file_path = request.args.get('file_path')

    # Validate required parameters
    if not model or not file_path:
        return {"status": "error", "message": "Model and file_path are required"}, 400

    try:
        await run_replay.main(iterations, model, file_path)
    except Exception as e:
        # Top-level boundary: surface the failure to the HTTP client.
        return {"status": "error", "message": str(e)}, 500
    return {"status": "success", "message": "Replay executed successfully"}
24 |
25 | if __name__ == '__main__':
26 | app.run(host='0.0.0.0', port=3089)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 OTA-Tech-AI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/data_samples/replay_list_samples/wap_smart_replay_list_y757R6w6y17LVHXl.json:
--------------------------------------------------------------------------------
1 | {
2 | "ultimate_goal": "find a top rated keyboard on amazon.ca",
3 | "task_id": "y757R6w6y17LVHXl",
4 | "type": "smart_replay",
5 | "subgoal_list": [
6 | {
7 | "index": 0,
8 | "subgoal": "task starts, go for the next sub-goal"
9 | },
10 | {
11 | "index": 1,
12 | "subgoal": "Search for 'top rated keyboard' on amazon.ca to find the best options."
13 | },
14 | {
15 | "index": 2,
16 | "subgoal": "Enter 'keyboard' as the search term in the search input field and press enter key."
17 | },
18 | {
19 | "index": 3,
20 | "subgoal": "Click on the dropdown labeled 'Sort by:' to change sorting options."
21 | },
22 | {
23 | "index": 4,
24 | "subgoal": "Click on the option labeled 'Avg. customer review' in the sort dropdown menu."
25 | },
26 | {
27 | "index": 5,
28 | "subgoal": "Click on the first product"
29 | },
30 | {
31 | "index": 6,
32 | "subgoal": "GOAL-NOT-ACHIEVED"
33 | },
34 | {
35 | "index": 7,
36 | "subgoal": "task done"
37 | }
38 | ]
39 | }
--------------------------------------------------------------------------------
/browser_use/browser/tests/screenshot_test.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import base64
3 |
4 | import pytest
5 |
6 | from browser_use.browser.browser import Browser, BrowserConfig
7 |
8 |
async def test_take_full_page_screenshot():
    """Take a full-page screenshot of example.com and check it is a non-empty base64 string."""
    config = BrowserConfig(headless=False, disable_security=True)
    browser = Browser(config=config)
    try:
        async with await browser.new_context() as ctx:
            # Navigate to a simple test page and pause before capturing.
            current_page = await ctx.get_current_page()
            await current_page.goto('https://example.com')
            await asyncio.sleep(3)

            encoded_shot = await ctx.take_screenshot(full_page=True)
            await asyncio.sleep(3)

            # The screenshot must be a non-empty string ...
            assert encoded_shot is not None
            assert isinstance(encoded_shot, str)
            assert len(encoded_shot) > 0

            # ... that base64-decodes without raising.
            try:
                base64.b64decode(encoded_shot)
            except Exception as e:
                pytest.fail(f'Failed to decode base64 screenshot: {str(e)}')
    finally:
        await browser.close()
33 |
34 |
35 | if __name__ == '__main__':
36 | asyncio.run(test_take_full_page_screenshot())
37 |
--------------------------------------------------------------------------------
/chrome-extension/Gruntfile.js:
--------------------------------------------------------------------------------
1 | module.exports = function (grunt) {
2 | "use strict";
3 |
4 | grunt.initConfig({
5 | pkg: grunt.file.readJSON('package.json'),
6 | jshint: {
7 | files: ['Gruntfile.js', 'js/**/*.js'],
8 | options: {
9 | esversion: 6,
10 | evil: true,
11 | camelcase: true,
12 | curly: true,
13 | eqeqeq: true,
14 | noempty: true,
15 | strict: true,
16 | loopfunc: true,
17 | globals: {
18 | console: true,
19 | document: true
20 | }
21 | }
22 | },
23 | csslint: {
24 | src: ['css/*.css'],
25 | options: {
26 | ids: false,
27 | 'compatible-vendor-prefixes': false,
28 | 'fallback-colors': false
29 | }
30 | },
31 | zip: {
32 | 'wap_browser_action_capturer-<%= pkg.version %>.zip': ['css/**/*', 'ico/logo_*.png', 'js/**/*', 'other/**/*', '*.html', 'manifest.json']
33 | }
34 | });
35 |
36 | grunt.loadNpmTasks('grunt-contrib-jshint');
37 | grunt.loadNpmTasks('grunt-contrib-csslint');
38 | grunt.loadNpmTasks('grunt-zip');
39 |
40 | grunt.registerTask('default', ['jshint']);
41 | grunt.registerTask('prod', ['zip']);
42 | };
43 |
--------------------------------------------------------------------------------
/prompts/subgoal_generation/task-finish.md:
--------------------------------------------------------------------------------
1 | I need your help analyzing an action in the browser and its related changes.
2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is:
3 |
4 | {{ ultimate_goal }}
5 |
6 | now the user has already finished the task, and the final page content that he submitted to complete task is:
7 |
8 | {{ page_content }}
9 |
10 | based on this content, please tell me: do you think this task is really finished?
11 | Provide a concise and formatted instruction in JSON to make another agent to know what to do, you have several options:
12 |
13 | 1. if the ultimate goal has been achieved by the current action, no further actions need to be executed, and no information needs to be delivered to the user, reply only with a 'done' message, e.g.: {"next_goal": "The ultimate task is done"}
14 |
15 | 2. if the ultimate goal has been achieved but we need to extract information from the current page content to respond user's demands, reply a content extraction message, e.g.: {"next_goal": "extract the cook time and prepare time from the page content"}
16 |
17 | 3. if the ultimate goal has NOT been achieved, please reply with a failure message, e.g.: {"next_goal": "GOAL-NOT-ACHIEVED", "reason": "the cook time is longer than expected ..."}
--------------------------------------------------------------------------------
/chrome-extension/js/ScrollHelper.js:
--------------------------------------------------------------------------------
(function () {
    "use strict";

    /**
     * Returns the element whose scrollTop reflects the document scroll offset.
     * document.body.scrollTop stays 0 in standards-mode documents, so prefer
     * document.scrollingElement and fall back to <body> where it is unavailable.
     */
    function getScroller() {
        return document.scrollingElement || document.body;
    }

    /**
     * Shows `button` while the document is scrolled down and hides it at the top.
     *
     * @param {Element} button element toggled through its 'hidden' class
     */
    function ScrollHelper(button) {
        var scrollHelper = this;

        this._button = button;

        // Last known scroll offset, refreshed on every scroll event.
        var scrollPos = 0;
        document.addEventListener('scroll', function () {
            scrollPos = getScroller().scrollTop;
        });

        // Re-applies the button visibility on every animation frame.
        function updateBtn() {
            if (scrollPos > 0) {
                scrollHelper.showButton();
            } else {
                scrollHelper.hideButton();
            }
            requestAnimationFrame(updateBtn);
        }

        updateBtn();
    }

    /** Hides the button by adding the 'hidden' class. */
    ScrollHelper.prototype.hideButton = function () {
        this._button.classList.add('hidden');
    };

    /** Shows the button by removing the 'hidden' class. */
    ScrollHelper.prototype.showButton = function () {
        this._button.classList.remove('hidden');
    };

    /**
     * Scrolls towards the top one step per animation frame: a quarter of the
     * remaining distance while more than 10px remain, otherwise 10px.
     */
    ScrollHelper.prototype.scrollToTheTop = function () {
        var scroller = getScroller();
        var scrollPos = scroller.scrollTop;

        if (scrollPos > 0) {
            scroller.scrollTop -= (scrollPos > 10) ? (scrollPos / 4) : 10;
            requestAnimationFrame(this.scrollToTheTop.bind(this));
        }
    };

    window.ScrollHelper = ScrollHelper;
})();
45 |
--------------------------------------------------------------------------------
/browser_use/dom/tests/process_dom_test.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import time
4 |
5 | from browser_use.browser.browser import Browser, BrowserConfig
6 |
7 |
async def test_process_dom():
    """Run buildDomTree.js against a live page, time it, and dump the result to ./tmp/dom.json.

    This is a manual test: it opens a visible browser and waits for Enter on
    stdin before finishing.
    """
    # Local import: asyncio is not imported at the top of this module.
    import asyncio

    browser = Browser(config=BrowserConfig(headless=False))
    try:
        async with await browser.new_context() as context:
            page = await context.get_current_page()
            await page.goto('https://kayak.com/flights')
            # Alternative pages used for manual runs:
            # await page.goto('https://google.com/flights')
            # await page.goto('https://immobilienscout24.de')
            # await page.goto('https://seleniumbase.io/w3schools/iframes')

            # Use the awaitable sleep: time.sleep() would block the event loop
            # while the page is still loading.
            await asyncio.sleep(3)

            with open('browser_use/dom/buildDomTree.js', 'r', encoding='utf-8') as f:
                js_code = f.read()

            start = time.time()
            dom_tree = await page.evaluate(js_code)
            end = time.time()

            # print(dom_tree)
            print(f'Time: {end - start:.2f}s')

            os.makedirs('./tmp', exist_ok=True)
            with open('./tmp/dom.json', 'w', encoding='utf-8') as f:
                json.dump(dom_tree, f, indent=1)

            # both of these work for immobilienscout24.de
            # await page.click('.sc-dcJsrY.ezjNCe')
            # await page.click(
            # 	'div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div > div > button:nth-of-type(2)'
            # )

            input('Press Enter to continue...')
    finally:
        # Always release the browser, even when navigation or evaluation fails.
        await browser.close()
41 |
--------------------------------------------------------------------------------
/prompts/subgoal_generation/submit.md:
--------------------------------------------------------------------------------
1 | I need your help analyzing an action in the browser and its related changes.
2 | We recorded an action of 'submit' by the user with his browser for the current task. His ultimate goal is:
3 |
4 | {{ ultimate_goal }}
5 |
6 | So here is the basic information of the action for the 'submit' in current sub-task:
7 |
8 | {{ action }}
9 |
10 | note that "target" is the targeted element for this action.
11 | Here is the detailed information about the form values that the user submitted:
12 |
13 | {{ change_events }}
14 |
15 | Note that in some "nodeinfo" entries, #rme means there are more children inside this tag pair, but we hide them to shorten the context.
16 | You should think about what is the purpose of this action by the user, and think about what is the goal this user is trying to achieve in the current sub-task. You don't need to tell me your thought process. You only need to give me a final reply which is a concise and formatted instruction in JSON to make another agent understand this sub-goal and reproduce the action. Do not mention the details of the target. If the submission is a search, you need to provide two actions, an input change and pressing the Enter key, e.g.:
17 | {"next_goal": "Enter 'Singapore' as the destination in the search input field and press enter key."}
18 | {"next_goal": "Click on the button with text 'Dinners' to view more options for cooking dinners at home"}
19 |
--------------------------------------------------------------------------------
/action_collect_server.py:
--------------------------------------------------------------------------------
1 | import os
2 | import datetime
3 | import argparse
4 | from flask import Flask, request, jsonify
5 | from flask_cors import CORS
6 |
7 | app = Flask(__name__)
8 | CORS(app)
9 |
def mkdir_n_define_file_name(data_root_dir, task_name):
    """Create the output folder for a task and return the path for a new event file.

    The folder is ``<data_root_dir>/<YYYYMMDD>/<task_name>`` and the file name is
    ``summary_event_<YYYYMMDD>_<HHMMSS>.json``, both derived from the current
    local time. Only the folder is created; the file itself is not written.

    Args:
        data_root_dir: root directory under which dated folders are created.
        task_name: name of the per-task sub-folder.

    Returns:
        The full path of the event file to write.
    """
    now = datetime.datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    date_folder = now.strftime("%Y%m%d")
    # Include task_name after the date folder
    folderpath = os.path.join(data_root_dir, date_folder, task_name)
    # exist_ok avoids the check-then-create race when two requests for the
    # same task arrive at the same time.
    os.makedirs(folderpath, exist_ok=True)
    return os.path.join(folderpath, f"summary_event_{timestamp}.json")
20 |
@app.route('/action-data', methods=['POST'])
def handle_event():
    """Store a posted JSON event on disk under ``data/<date>/<taskId>/``.

    Returns:
        A JSON response with status 200 and the saved file path on success,
        or status 400 when the request is not JSON, is not a JSON object, or
        carries a missing or unusable ``taskId``.
    """
    # Local import: json is not imported at the top of this module.
    import json

    if not request.is_json:
        return jsonify({"status": "error", "message": "Request must be JSON"}), 400

    event_data = request.get_json()
    task_id = event_data.get("taskId") if isinstance(event_data, dict) else None
    if not isinstance(task_id, str) or not task_id:
        return jsonify({"status": "error", "message": "Missing or invalid 'taskId'"}), 400

    # taskId becomes a directory name, so refuse anything that could escape
    # the data folder (path separators, '.' or '..').
    if task_id in (".", "..") or "/" in task_id or "\\" in task_id or os.path.basename(task_id) != task_id:
        return jsonify({"status": "error", "message": "Missing or invalid 'taskId'"}), 400

    filepath = mkdir_n_define_file_name("data", task_id)

    with open(filepath, "w", encoding='utf-8') as json_file:
        json.dump(event_data, json_file, indent=2)

    return jsonify({"status": "success", "message": f"Event received and saved as {filepath}"}), 200
35 |
if __name__ == '__main__':
    # Run the Flask app. Host and port keep their previous values as defaults.
    # Debug mode is now opt-in: the Werkzeug debugger allows code execution
    # from the browser and must not be exposed while listening on 0.0.0.0.
    parser = argparse.ArgumentParser(description="Collect browser action events and save them as JSON files.")
    parser.add_argument("--host", default="0.0.0.0", help="interface to bind (default: 0.0.0.0)")
    parser.add_argument("--port", type=int, default=4934, help="port to listen on (default: 4934)")
    parser.add_argument("--debug", action="store_true", help="enable Flask debug mode (local development only)")
    args = parser.parse_args()
    app.run(debug=args.debug, host=args.host, port=args.port)
--------------------------------------------------------------------------------
/prompts/subgoal_generation/go-back-or-forward.md:
--------------------------------------------------------------------------------
1 | I need your help analyzing an action in the browser and its related changes.
2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is:
3 |
4 | {{ ultimate_goal }}
5 |
6 | In the current sub-task, the user clicked on "go back" or "go forward" button of the browser, it is possible that he didn't find the information he needed in the current page, or he may want to
7 | confirm information in the previous page. This is the information of this action:
8 |
9 | {{ action }}
10 |
11 | The content before he goes back or forward is:
12 |
13 | {{ page_content }}
14 |
15 | Note that sometimes in the page content you will see #rme; it means there are more children inside this tag, but we hide them to shorten the context.
16 | You should think about what is the purpose of this action by the user. You don't need to tell me your thought process. You only need to give me a final reply which is a concise and formatted instruction in JSON to make another agent to understand this sub-goal and reproduce the action.
17 | Do not mention "go back" or "go forward" because it is unclear. Tell which URL it should navigate to. e.g.:
18 | {"next_goal": "Navigate to https://www.allrecipes.com/search?q=baked+salmon to review search results for 'baked salmon' recipes on Allrecipes."}
19 | {"next_goal": "Navigate to google.com to search for keywords spanish restaurants."}
--------------------------------------------------------------------------------
/prompts/subgoal_generation/common.md:
--------------------------------------------------------------------------------
1 | I need your help analyzing an action in the browser and its related changes.
2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is:
3 |
4 | {{ ultimate_goal }}
5 |
6 | So here is the basic information of the action for the current sub-task that a user takes in browser:
7 |
8 | {{ action }}
9 |
10 | note that "target" is the targeted element for this action.
11 | This is what happened in the browser DOM before and after this action:
12 |
13 | {{ change_events }}
14 |
15 | Note that in some "nodeinfo" entries, #rme means there are more children inside this tag pair, but we hide them to shorten the context.
16 | You should think about what is the purpose of this action by the user, and think about what is the goal this user is trying to achieve in the current sub-task. You don't need to tell me your thought process. You only need to give me a final reply which is a concise and formatted instruction in JSON to make another agent to understand this sub-goal and reproduce the action. Subgoal should be generalized and fit the ultimate goal. Only include one action in the subgoal, do not explain action. Only include one action (verb) in the subgoal! If you want to use "and", only keep the first action. e.g.:
17 | {"next_goal": "Enter 'Singapore' as the destination in the search input field."}
18 | {"next_goal": "Click on the button with text 'Dinners'"}
19 | {"next_goal": "Click on the first item"}
20 |
--------------------------------------------------------------------------------
/mcp_servers/find_top_rated_keyboard_amazon_ca_y757R6w6y17LVHXl_mcp_server.py:
--------------------------------------------------------------------------------
1 |
2 | from mcp.server.fastmcp import FastMCP
3 | import httpx
4 |
5 | mcp = FastMCP("find a top rated keyboard on amazon.ca")
6 |
async def _request_replay(file_path: str) -> str:
    """Ask the local replay service to run *file_path* and return the response body.

    Returns the string "FAILED" when the HTTP request itself fails (connection
    error, timeout, ...). The original tools contained this return value but
    could never reach it.
    """
    try:
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.get(
                "http://localhost:3089/replay",
                params={
                    "concurrent": 1,
                    "model": "openai",
                    "file_path": file_path,
                },
            )
    except httpx.HTTPError:
        return "FAILED"
    return response.text


@mcp.tool()
async def find_top_rated_keyboard_amazon_ca_smart_replay() -> str:
    """smart replay: find a top rated keyboard on amazon.ca"""
    return await _request_replay('data_processed/smart_replay/wap_smart_replay_list_y757R6w6y17LVHXl.json')


@mcp.tool()
async def find_top_rated_keyboard_amazon_ca_exact_replay() -> str:
    """exact replay: find a top rated keyboard on amazon.ca"""
    return await _request_replay('data_processed/exact_replay/wap_exact_replay_list_y757R6w6y17LVHXl.json')
36 |
37 | if __name__ == "__main__":
38 | mcp.run(transport="stdio")
39 |
--------------------------------------------------------------------------------
/browser_use/browser/utils/screen_resolution.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
def get_screen_resolution():
    """Return the primary screen size as ``{'width': ..., 'height': ...}``.

    On macOS the size is read through AppKit, elsewhere through ``screeninfo``.
    When detection fails, a message is printed and a fixed fallback is
    returned: 2560x1664 on macOS, 1920x1080 otherwise.
    """
    if sys.platform != 'darwin':
        # Windows & Linux
        try:
            from screeninfo import get_monitors

            detected = get_monitors()
            if not detected:
                raise Exception('No monitors detected.')
            primary = detected[0]
            return {'width': primary.width, 'height': primary.height}
        except ImportError:
            print("screeninfo package not found. Install it using 'pip install screeninfo'.")
        except Exception as err:
            print(f'Error retrieving screen resolution: {err}')
        return {'width': 1920, 'height': 1080}

    # macOS
    try:
        from AppKit import NSScreen

        frame_size = NSScreen.mainScreen().frame().size
        return {'width': int(frame_size.width), 'height': int(frame_size.height)}
    except ImportError:
        print('AppKit is not available. Make sure you are running this on macOS with pyobjc installed.')
    except Exception as err:
        print(f'Error retrieving macOS screen resolution: {err}')
    return {'width': 2560, 'height': 1664}
32 |
33 |
def get_window_adjustments():
    """Returns recommended x, y offsets for window positioning"""
    offsets_by_platform = {
        'darwin': (-4, 24),  # macOS has a small title bar, no border
        'win32': (-8, 0),  # Windows has a border on the left
    }
    # Any other platform (Linux) needs no adjustment.
    return offsets_by_platform.get(sys.platform, (0, 0))
42 |
--------------------------------------------------------------------------------
/chrome-extension/js/ContentScriptProxy.js:
--------------------------------------------------------------------------------
(function () {
    "use strict";

    /**
     * Evaluates `cmd` in the content-script context of the inspected window
     * and logs to the console when the evaluation fails.
     */
    function callCommand(cmd) {
        chrome.devtools.inspectedWindow.eval(
            cmd,
            {useContentScriptContext: true},
            // The API invokes the callback as (result, exceptionInfo); the
            // second argument is only set when the evaluation failed.
            function (result, exceptionInfo) {
                if (exceptionInfo || chrome.runtime.lastError) {
                    console.error('Content script command call failed.', cmd, exceptionInfo, chrome.runtime.lastError);
                }
            }
        );
    }

    function jsArg(str) {
        // safely quote argument for eval
        return JSON.stringify(str);
    }

    function nodeIdArg(nodeId) {
        // Node ids are spliced into evaluated code, so force them to a number
        // literal (non-numeric input becomes `null`) rather than concatenating
        // arbitrary text.
        return jsArg(Number(nodeId));
    }

    // Thin wrappers that forward panel actions to the `domListenerExtension`
    // object living in the content script.
    window.ContentScriptProxy = {
        inspectNode: function (nodeId) {
            callCommand('inspect(domListenerExtension.getNode(' + nodeIdArg(nodeId) + '))');
        },
        highlightNode: function (nodeId) {
            callCommand('domListenerExtension.highlightNode(' + nodeIdArg(nodeId) + ')');
        },
        startRecording: function (desc) {
            callCommand(`domListenerExtension.startTaskRecording(${jsArg(desc)})`);
        },
        pauseRecording: function () {
            callCommand('domListenerExtension.pauseTaskRecording()');
        },
        resumeRecording: function (desc) {
            callCommand(`domListenerExtension.resumeTaskRecording(${jsArg(desc)})`);
        },
        finishRecording: function () {
            callCommand('domListenerExtension.finishTaskRecording()');
        }
    };
})();
42 |
--------------------------------------------------------------------------------
/browser_use/telemetry/views.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from dataclasses import asdict, dataclass
3 | from typing import Any, Dict, Sequence
4 |
5 |
@dataclass
class BaseTelemetryEvent(ABC):
    """Abstract base for telemetry events: a ``name`` plus a dict of properties."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Event name; concrete events must provide it."""
        pass

    @property
    def properties(self) -> Dict[str, Any]:
        """All dataclass fields of the event as a dict, without the ``name`` entry."""
        payload = asdict(self)
        payload.pop('name', None)
        return payload
16 |
17 |
@dataclass
class RegisteredFunction:
    """Name and parameter mapping describing one registered function."""

    name: str
    # String-keyed parameter mapping; the value schema is not defined in this module.
    params: dict[str, Any]
22 |
23 |
@dataclass
class ControllerRegisteredFunctionsTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named 'controller_registered_functions' carrying a list of registered functions."""

    registered_functions: list[RegisteredFunction]
    # Overrides the abstract ``name`` property with a fixed default value.
    name: str = 'controller_registered_functions'
28 |
29 |
@dataclass
class AgentStepTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named 'agent_step'.

    NOTE(review): field meanings are inferred from their names only; confirm
    against the code that emits this event.
    """

    agent_id: str
    step: int
    step_error: list[str]
    consecutive_failures: int
    actions: list[dict]
    # Overrides the abstract ``name`` property with a fixed default value.
    name: str = 'agent_step'
38 |
39 |
@dataclass
class AgentRunTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named 'agent_run'.

    NOTE(review): field meanings are inferred from their names only; confirm
    against the code that emits this event.
    """

    agent_id: str
    use_vision: bool
    task: str
    model_name: str
    chat_model_library: str
    version: str
    source: str
    # Overrides the abstract ``name`` property with a fixed default value.
    name: str = 'agent_run'
50 |
51 |
@dataclass
class AgentEndTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named 'agent_end'.

    NOTE(review): field meanings are inferred from their names only; confirm
    against the code that emits this event.
    """

    agent_id: str
    steps: int
    max_steps_reached: bool
    is_done: bool
    # May be None as well as True/False.
    success: bool | None
    total_input_tokens: int
    total_duration_seconds: float

    # Entries may be None.
    errors: Sequence[str | None]
    # Overrides the abstract ``name`` property with a fixed default value.
    name: str = 'agent_end'
64 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate==1.6.0
2 | annotated-types==0.7.0
3 | anthropic==0.49.0
4 | anyio==4.9.0
5 | backoff==2.2.1
6 | beautifulsoup4==4.13.3
7 | certifi==2025.1.31
8 | charset-normalizer==3.4.1
9 | colorama==0.4.6
10 | defusedxml==0.7.1
11 | distro==1.9.0
12 | filelock==3.18.0
13 | fsspec==2025.3.2
14 | greenlet==3.1.1
15 | h11==0.14.0
16 | httpcore==1.0.7
17 | httpx==0.28.1
18 | huggingface-hub==0.30.1
19 | idna==3.10
20 | Jinja2==3.1.6
21 | jiter==0.9.0
22 | joblib==1.4.2
23 | jsonpatch==1.33
24 | jsonpointer==3.0.0
25 | langchain-core>=0.3.58,<0.4.0
26 | langchain-anthropic==0.3.3
27 | langchain-ollama==0.2.2
28 | langchain-openai==0.3.1
29 | langsmith==0.3.24
30 | markdownify==0.14.1
31 | MarkupSafe==3.0.2
32 | monotonic==1.6
33 | mpmath==1.3.0
34 | networkx==3.4.2
35 | numpy==2.2.4
36 | ollama==0.4.7
37 | openai==1.70.0
38 | orjson==3.10.16
39 | packaging==24.2
40 | pillow==11.1.0
41 | playwright==1.51.0
42 | posthog==3.23.0
43 | psutil==7.0.0
44 | pydantic==2.11.2
45 | pydantic_core==2.33.1
46 | pyee==12.1.1
47 | python-dateutil==2.9.0.post0
48 | python-dotenv==1.1.0
49 | PyYAML==6.0.2
50 | regex==2024.11.6
51 | requests==2.32.3
52 | requests-toolbelt==1.0.0
53 | safetensors==0.5.3
54 | scikit-learn==1.6.1
55 | scipy==1.15.2
56 | sentence-transformers==4.0.2
57 | six==1.17.0
58 | sniffio==1.3.1
59 | soupsieve==2.6
60 | sympy==1.13.1
61 | tenacity==9.1.2
62 | threadpoolctl==3.6.0
63 | tiktoken==0.9.0
64 | tokenizers==0.21.1
65 | torch==2.6.0
66 | tqdm==4.67.1
67 | transformers==4.51.0
68 | typing-inspection==0.4.0
69 | typing_extensions==4.13.1
70 | urllib3==2.3.0
71 | zstandard==0.23.0
72 | flask==3.1.0
73 | flask_cors==5.0.1
74 | mem0ai==0.1.96
75 | faiss-cpu==1.11.0
76 | screeninfo==0.8.1
77 | mcp==1.7.1
78 | flask[async]
79 | langchain==0.3.25
80 | html_sanitizer==2.5.0
81 |
--------------------------------------------------------------------------------
/browser_use/browser/views.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Any, Optional
3 |
4 | from pydantic import BaseModel
5 |
6 | from browser_use.dom.history_tree_processor.service import DOMHistoryElement
7 | from browser_use.dom.views import DOMState
8 |
9 |
10 | # Pydantic
class TabInfo(BaseModel):
    """Represents information about a browser tab"""

    # Integer identifier of the page/tab.
    page_id: int
    url: str
    title: str
    # None when the tab has no parent page.
    parent_page_id: Optional[int] = None  # parent page that contains this popup or cross-origin iframe
18 |
19 |
class GroupTabsAction(BaseModel):
    """Parameter model listing tab ids together with a group title and colour."""

    tab_ids: list[int]
    title: str
    # Defaults to 'blue'; None is also accepted.
    color: Optional[str] = 'blue'
24 |
25 |
class UngroupTabsAction(BaseModel):
    """Parameter model listing tab ids."""

    # NOTE(review): presumably the tabs to remove from their group -- confirm with the consumer.
    tab_ids: list[int]
28 |
29 |
@dataclass
class BrowserState(DOMState):
    """DOMState extended with page URL, title, tab list and optional extras."""

    url: str
    title: str
    tabs: list[TabInfo]
    # Optional screenshot string; None when absent. NOTE(review): encoding not defined here.
    screenshot: Optional[str] = None
    pixels_above: int = 0
    pixels_below: int = 0
    # default_factory gives every instance its own list.
    browser_errors: list[str] = field(default_factory=list)
39 |
40 |
@dataclass
class BrowserStateHistory:
    """Recorded browser state: URL, title, tabs, interacted elements and an optional screenshot."""

    url: str
    title: str
    tabs: list[TabInfo]
    interacted_element: list[DOMHistoryElement | None] | list[None]
    screenshot: Optional[str] = None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of this record.

        Tabs are dumped with ``model_dump()``; each interacted element is
        converted with ``to_dict()``, and falsy entries are emitted as None.
        """
        return {
            'tabs': [tab_info.model_dump() for tab_info in self.tabs],
            'screenshot': self.screenshot,
            'interacted_element': [
                element.to_dict() if element else None for element in self.interacted_element
            ],
            'url': self.url,
            'title': self.title,
        }
57 |
58 |
class BrowserError(Exception):
    """Base class for all browser errors"""
61 |
62 |
class URLNotAllowedError(BrowserError):
    """Error raised when a URL is not allowed

    Subclasses BrowserError, so handlers catching BrowserError also catch it.
    """
65 |
--------------------------------------------------------------------------------
/data_samples/replay_list_samples/wap_exact_replay_list_l8vZDGTfw3qu3GBs.json:
--------------------------------------------------------------------------------
1 | {
2 | "ultimate_goal": "Find the best sold keyboard in BestBuy",
3 | "task_id": "l8vZDGTfw3qu3GBs",
4 | "type": "exact_replay",
5 | "action_list": [
6 | {
7 | "action": "open_tab",
8 | "action_params": {
9 | "url": "https://www.bestbuy.ca/en-ca"
10 | }
11 | },
12 | {
13 | "action": "wait_for_element",
14 | "action_params": {
15 | "selector": "INPUT.style-module_textField__MdLzL",
16 | "timeout": 5000
17 | }
18 | },
19 | {
20 | "action": "input_text_by_selector",
21 | "action_params": {
22 | "selector": "INPUT.style-module_textField__MdLzL",
23 | "text": "keyboard"
24 | }
25 | },
26 | {
27 | "action": "send_keys",
28 | "action_params": {
29 | "keys": "Enter"
30 | }
31 | },
32 | {
33 | "action": "wait_for_element",
34 | "action_params": {
35 | "selector": "#Sort",
36 | "timeout": 5000
37 | }
38 | },
39 | {
40 | "action": "select_option_by_selector",
41 | "action_params": {
42 | "css_selector": "#Sort",
43 | "value": "highestRated"
44 | }
45 | },
46 | {
47 | "action": "wait_for_element",
48 | "action_params": {
49 | "selector": "h3[data-automation=\"productItemName\"]",
50 | "timeout": 5000
51 | }
52 | },
53 | {
54 | "action": "click_element_by_selector",
55 | "action_params": {
56 | "css_selector": "h3[data-automation=\"productItemName\"]"
57 | }
58 | },
59 | {
60 | "action": "extract_content",
61 | "action_params": {
62 | "goal": "Find the best sold keyboard in BestBuy",
63 | "should_strip_link_urls": false
64 | }
65 | },
66 | {
67 | "action": "done",
68 | "action_params": {
69 | "text": "task executed successfully",
70 | "success": true
71 | }
72 | }
73 | ]
74 | }
--------------------------------------------------------------------------------
/browser_use/dom/history_tree_processor/view.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Optional
3 |
4 | from pydantic import BaseModel
5 |
6 |
@dataclass
class HashedDomElement:
    """
    Hash of the dom element to be used as a unique identifier
    """

    branch_path_hash: str  # presumably a hash of the element's parent branch path — confirm
    attributes_hash: str  # presumably a hash of the element's attributes — confirm
    xpath_hash: str  # presumably a hash of the element's xpath — confirm
    # text_hash: str
17 |
18 |
class Coordinates(BaseModel):
    # An integer (x, y) pair; used for the corner and center fields of CoordinateSet.
    x: int
    y: int
22 |
23 |
class CoordinateSet(BaseModel):
    # Four corner points, a center point and the integer width/height of a box.
    top_left: Coordinates
    top_right: Coordinates
    bottom_left: Coordinates
    bottom_right: Coordinates
    center: Coordinates
    width: int
    height: int
32 |
33 |
class ViewportInfo(BaseModel):
    # Scroll offsets and size of a viewport; units are presumably pixels — TODO confirm.
    scroll_x: int
    scroll_y: int
    width: int
    height: int
39 |
40 |
@dataclass
class DOMHistoryElement:
    """Description of a DOM element that can be converted to a plain dict."""

    tag_name: str
    xpath: str
    highlight_index: Optional[int]
    entire_parent_branch_path: list[str]
    attributes: dict[str, str]
    shadow_root: bool = False
    css_selector: Optional[str] = None
    page_coordinates: Optional[CoordinateSet] = None
    viewport_coordinates: Optional[CoordinateSet] = None
    viewport_info: Optional[ViewportInfo] = None

    def to_dict(self) -> dict:
        """Return all fields as a dict; nested models are dumped, falsy ones become None."""

        def dumped(model):
            # Anything falsy (e.g. None) is reported as None instead of being dumped.
            return model.model_dump() if model else None

        return {
            'tag_name': self.tag_name,
            'xpath': self.xpath,
            'highlight_index': self.highlight_index,
            'entire_parent_branch_path': self.entire_parent_branch_path,
            'attributes': self.attributes,
            'shadow_root': self.shadow_root,
            'css_selector': self.css_selector,
            'page_coordinates': dumped(self.page_coordinates),
            'viewport_coordinates': dumped(self.viewport_coordinates),
            'viewport_info': dumped(self.viewport_info),
        }
71 |
--------------------------------------------------------------------------------
/utils/html_cleaner.py:
--------------------------------------------------------------------------------
1 | from html_sanitizer import Sanitizer
2 |
3 |
def run_html_sanitizer(html: str, action_type: str):
    """Sanitize *html* with ``html_sanitizer.Sanitizer`` and return the result.

    When ``action_type == "task-finish"`` a custom configuration is built (an
    explicit tag allow-list plus a per-tag attribute table). For every other
    action type an empty settings dict is handed to ``Sanitizer``.
    """
    settings = {}
    if action_type == "task-finish":
        allowed_tags = [
            'a', 'address', 'article', 'aside', 'b', 'blockquote', 'button', 'caption', 'cite', 'code', 'col', 'colgroup',
            'data', 'datalist', 'dd', 'del', 'details', 'div', 'dl', 'dt', 'em',
            'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4',
            'h5', 'h6', 'header', 'hr', 'i', 'img', 'input', 'label', 'legend',
            'li', 'main', 'menu', 'nav', 'ol', 'option', 'output', 'p', 'pre',
            'q', 's', 'section', 'select', 'small', 'span', 'strong', 'sub', 'summary',
            'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'textarea', 'time', 'tr', 'ul', 'video',
            'title'
        ]

        common_attrs = ["id", "aria-label", "role"]
        # NOTE(review): whether the sanitizer honours "data-*" as a wildcard is not
        # visible here — confirm against the html-sanitizer settings documentation.
        wildcard_data_attrs = "data-*"

        # Tags that carry extra attributes of their own, registered first.
        # ("button" repeats "aria-label", which is also in common_attrs.)
        tag_specific = {
            "a": ["rel", "target"],
            "img": ["alt"],
            "button": ["aria-label"],
        }
        attributes = {
            tag: extra + common_attrs + [wildcard_data_attrs]
            for tag, extra in tag_specific.items()
        }
        # Every remaining allowed tag gets the common attributes plus the data-* entry.
        for tag in allowed_tags:
            attributes.setdefault(tag, common_attrs + [wildcard_data_attrs])

        settings = {
            "tags": allowed_tags,
            "attributes": attributes,
            "empty": ["a", "img"],
            "separate": ["p", "div", "h1", "h2", "h3", "article", "main"],
            "keep_typographic_whitespace": True
        }

    return Sanitizer(settings).sanitize(html)
--------------------------------------------------------------------------------
/utils/llm.py:
--------------------------------------------------------------------------------
1 | """Sub-goal generator helper
2 |
3 | This tiny helper takes a text prompt, sends it to OpenAI via LangChain,
4 | and returns the assistant's plain-text reply.
5 | """
6 | from __future__ import annotations
7 |
8 | import os
9 | from typing import Optional
10 |
11 | from langchain_openai import ChatOpenAI
12 | from langchain.schema import AIMessage, HumanMessage, SystemMessage
13 |
14 | __all__ = ["ask_llm"]
15 |
16 | # ---------------------------------------------------------------------------
17 | # Basic LLM wrapper
18 | # ---------------------------------------------------------------------------
19 |
def _build_llm(model: str = "gpt-4o", temperature: float = 0) -> ChatOpenAI:  # type: ignore
    """Create a LangChain ChatOpenAI client with sane defaults.

    Parameters
    ----------
    model : str
        OpenAI model name. Defaults to "gpt-4o".
    temperature : float
        Sampling temperature. Defaults to 0.

    Returns
    -------
    ChatOpenAI
        Client constructed with ``model_name=model`` and the given temperature.

    Raises
    ------
    RuntimeError
        If ``OPENAI_API_KEY`` is not present in ``os.environ``.
    """
    # The key must be available in the environment. (Raise a clear error if not.)
    if "OPENAI_API_KEY" not in os.environ:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set.")

    return ChatOpenAI(model_name=model, temperature=temperature)
36 |
37 |
def ask_llm(prompt: str,
            system_prompt: Optional[str] = None,
            model: str = "gpt-4o",
            temperature: float = 0) -> str:
    """Send *prompt* to OpenAI and return the assistant text.

    Parameters
    ----------
    prompt : str
        User prompt / question.
    system_prompt : str | None
        Optional system message to steer model behaviour. When falsy, no
        system message is sent.
    model : str
        OpenAI model name (default: gpt-4o).
    temperature : float
        Sampling temperature (default 0).

    Returns
    -------
    str
        Assistant's plain-text reply with surrounding whitespace stripped.

    Raises
    ------
    RuntimeError
        If the model returns something other than an ``AIMessage`` (or, via
        ``_build_llm``, if ``OPENAI_API_KEY`` is not set).
    """
    llm = _build_llm(model=model, temperature=temperature)

    messages = []
    if system_prompt:
        messages.append(SystemMessage(content=system_prompt))
    messages.append(HumanMessage(content=prompt))

    # Use the Runnable API; calling the model object directly (`llm(messages)`)
    # is the deprecated form of the same request.
    response = llm.invoke(messages)  # -> AIMessage

    if not isinstance(response, AIMessage):
        raise RuntimeError("Unexpected response type from LLM")

    return response.content.strip()
74 |
--------------------------------------------------------------------------------
/data_samples/replay_list_samples/wap_exact_replay_list_GqMnZeKFxvePGKGA.json:
--------------------------------------------------------------------------------
1 | {
2 | "ultimate_goal": "search for a recipe of baked salmon which takes less than 1 hour to cook",
3 | "task_id": "GqMnZeKFxvePGKGA",
4 | "type": "exact_replay",
5 | "action_list": [
6 | {
7 | "action": "open_tab",
8 | "action_params": {
9 | "url": "https://www.allrecipes.com/"
10 | }
11 | },
12 | {
13 | "action": "wait_for_element",
14 | "action_params": {
15 | "selector": "#mntl-search-form--open__search-input",
16 | "timeout": 5000
17 | }
18 | },
19 | {
20 | "action": "input_text_by_selector",
21 | "action_params": {
22 | "selector": "#mntl-search-form--open__search-input",
23 | "text": "baked salmon"
24 | }
25 | },
26 | {
27 | "action": "send_keys",
28 | "action_params": {
29 | "keys": "Enter"
30 | }
31 | },
32 | {
33 | "action": "wait_for_element",
34 | "action_params": {
35 | "selector": "img[alt=\"Breaded, baked salmon fillets topped with lemon slices, served alongside asparagus slices and rice pilaf on blue plates\"]",
36 | "timeout": 5000
37 | }
38 | },
39 | {
40 | "action": "click_element_by_selector",
41 | "action_params": {
42 | "css_selector": "img[alt=\"Breaded, baked salmon fillets topped with lemon slices, served alongside asparagus slices and rice pilaf on blue plates\"]"
43 | }
44 | },
45 | {
46 | "action": "go_to_url",
47 | "action_params": {
48 | "url": "https://www.allrecipes.com/search?q=baked+salmon"
49 | }
50 | },
51 | {
52 | "action": "wait_for_element",
53 | "action_params": {
54 | "selector": "body",
55 | "timeout": 8000
56 | }
57 | },
58 | {
59 | "action": "wait_for_element",
60 | "action_params": {
61 | "selector": "img[alt=\"Filet of salmon topped with melted cheese on aluminum foil\"]",
62 | "timeout": 5000
63 | }
64 | },
65 | {
66 | "action": "click_element_by_selector",
67 | "action_params": {
68 | "css_selector": "img[alt=\"Filet of salmon topped with melted cheese on aluminum foil\"]"
69 | }
70 | },
71 | {
72 | "action": "extract_content",
73 | "action_params": {
74 | "goal": "search for a recipe of baked salmon which takes less than 1 hour to cook",
75 | "should_strip_link_urls": false
76 | }
77 | },
78 | {
79 | "action": "done",
80 | "action_params": {
81 | "text": "task executed successfully",
82 | "success": true
83 | }
84 | }
85 | ]
86 | }
--------------------------------------------------------------------------------
/data_samples/replay_list_samples/wap_exact_replay_list_LhTyE4ie0s5a1W6J.json:
--------------------------------------------------------------------------------
1 | {
2 | "ultimate_goal": "search for the best sold keyboard on Amazon",
3 | "task_id": "LhTyE4ie0s5a1W6J",
4 | "type": "exact_replay",
5 | "action_list": [
6 | {
7 | "action": "open_tab",
8 | "action_params": {
9 | "url": "https://www.amazon.ca/"
10 | }
11 | },
12 | {
13 | "action": "wait_for_element",
14 | "action_params": {
15 | "selector": "#searchDropdownBox",
16 | "timeout": 5000
17 | }
18 | },
19 | {
20 | "action": "select_option_by_selector",
21 | "action_params": {
22 | "css_selector": "#searchDropdownBox",
23 | "value": "search-alias=aps"
24 | }
25 | },
26 | {
27 | "action": "wait_for_element",
28 | "action_params": {
29 | "selector": "#twotabsearchtextbox",
30 | "timeout": 5000
31 | }
32 | },
33 | {
34 | "action": "input_text_by_selector",
35 | "action_params": {
36 | "selector": "#twotabsearchtextbox",
37 | "text": "keyboard"
38 | }
39 | },
40 | {
41 | "action": "send_keys",
42 | "action_params": {
43 | "keys": "Enter"
44 | }
45 | },
46 | {
47 | "action": "wait_for_element",
48 | "action_params": {
49 | "selector": ".a-dropdown-prompt",
50 | "timeout": 5000
51 | }
52 | },
53 | {
54 | "action": "click_element_by_selector",
55 | "action_params": {
56 | "css_selector": ".a-dropdown-prompt"
57 | }
58 | },
59 | {
60 | "action": "wait_for_element",
61 | "action_params": {
62 | "selector": "#s-result-sort-select_5",
63 | "timeout": 5000
64 | }
65 | },
66 | {
67 | "action": "click_element_by_selector",
68 | "action_params": {
69 | "css_selector": "#s-result-sort-select_5"
70 | }
71 | },
72 | {
73 | "action": "wait_for_element",
74 | "action_params": {
75 | "selector": "span:text(\"Lenovo 300 USB Keyboard, Wired, Adjustable Tilt, Ergonomic, Windows 7/8/10, GX30M39655, Black\")",
76 | "timeout": 5000
77 | }
78 | },
79 | {
80 | "action": "click_element_by_text",
81 | "action_params": {
82 | "text": "Lenovo 300 USB Keyboard, Wired, Adjustable Tilt, Ergonomic, Windows 7/8/10, GX30M39655, Black",
83 | "element_type": "span",
84 | "nth": 0
85 | }
86 | },
87 | {
88 | "action": "extract_content",
89 | "action_params": {
90 | "goal": "search for the best sold keyboard on Amazon",
91 | "should_strip_link_urls": false
92 | }
93 | },
94 | {
95 | "action": "done",
96 | "action_params": {
97 | "text": "task executed successfully",
98 | "success": true
99 | }
100 | }
101 | ]
102 | }
--------------------------------------------------------------------------------
/browser_use/browser/tests/test_clicks.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 |
4 | import pytest
5 |
6 | from browser_use.browser.browser import Browser, BrowserConfig
7 | from browser_use.dom.views import DOMBaseNode, DOMElementNode, DOMTextNode
8 | from browser_use.utils import time_execution_sync
9 |
10 |
class ElementTreeSerializer:
    """Turns a DOM element tree into nested, JSON-serialisable dicts."""

    @staticmethod
    def dom_element_node_to_json(element_tree: DOMElementNode) -> dict:
        """Return *element_tree* and all of its descendants as nested dicts."""

        def convert(current: DOMBaseNode) -> dict:
            if isinstance(current, DOMTextNode):
                return {'type': 'text', 'text': current.text}
            if not isinstance(current, DOMElementNode):
                # Any other node kind is represented by an empty dict.
                return {}
            return {
                'type': 'element',
                'tag_name': current.tag_name,
                'attributes': current.attributes,
                'highlight_index': current.highlight_index,
                'children': [convert(child) for child in current.children],
            }

        return convert(element_tree)
28 |
29 |
30 | # run with: pytest browser_use/browser/tests/test_clicks.py
@pytest.mark.asyncio
async def test_highlight_elements():
    """Manual, interactive harness for clicking highlighted elements.

    Opens a visible browser, loads a page, then loops forever: dump the element
    tree to ./tmp/page.json, print duplicate XPaths and the clickable elements,
    read an index from stdin and click that element. It contains no assertions
    and never leaves the loop on its own.
    """
    browser = Browser(config=BrowserConfig(headless=False, disable_security=True))

    async with await browser.new_context() as context:
        page = await context.get_current_page()
        # await page.goto('https://immobilienscout24.de')
        # await page.goto('https://help.sap.com/docs/sap-ai-core/sap-ai-core-service-guide/service-plans')
        # await page.goto('https://google.com/search?q=elon+musk')
        # await page.goto('https://kayak.com')
        # await page.goto('https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe')
        # await page.goto('https://dictionary.cambridge.org')
        # await page.goto('https://github.com')
        await page.goto('https://huggingface.co/')

        await asyncio.sleep(1)

        while True:
            try:
                # await asyncio.sleep(10)
                state = await context.get_state()

                # NOTE(review): assumes ./tmp already exists; open() does not create directories.
                with open('./tmp/page.json', 'w') as f:
                    json.dump(
                        ElementTreeSerializer.dom_element_node_to_json(state.element_tree),
                        f,
                        indent=1,
                    )

                # await time_execution_sync('highlight_selector_map_elements')(
                # 	browser.highlight_selector_map_elements
                # )(state.selector_map)

                # Find and print duplicate XPaths
                xpath_counts = {}
                if not state.selector_map:
                    # Empty selector map: go straight back to fetching the state (no delay in between).
                    continue
                for selector in state.selector_map.values():
                    xpath = selector.xpath
                    if xpath in xpath_counts:
                        xpath_counts[xpath] += 1
                    else:
                        xpath_counts[xpath] = 1

                print('\nDuplicate XPaths found:')
                for xpath, count in xpath_counts.items():
                    if count > 1:
                        print(f'XPath: {xpath}')
                        print(f'Count: {count}\n')

                print(list(state.selector_map.keys()), 'Selector map keys')
                print(state.element_tree.clickable_elements_to_string())
                # Blocks on stdin; the entered text is later parsed with int() as a selector-map key.
                action = input('Select next action: ')

                await time_execution_sync('remove_highlight_elements')(context.remove_highlights)()

                node_element = state.selector_map[int(action)]

                # check if index of selector map are the same as index of items in dom_items

                await context._click_element_node(node_element)

            except Exception as e:
                # Any failure (bad input, missing key, click error) is printed and the loop continues.
                print(e)
95 |
--------------------------------------------------------------------------------
/browser_use/telemetry/service.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import uuid
4 | from pathlib import Path
5 |
6 | from dotenv import load_dotenv
7 | from posthog import Posthog
8 |
9 | from browser_use.telemetry.views import BaseTelemetryEvent
10 | from browser_use.utils import singleton
11 |
12 | load_dotenv()
13 |
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | POSTHOG_EVENT_SETTINGS = {
19 | 'process_person_profile': True,
20 | }
21 |
22 |
@singleton
class ProductTelemetry:
    """
    Service for capturing anonymized telemetry data.

    Telemetry is currently forced off in `__init__`: no PostHog client is
    created, so `capture` returns immediately. The `ANONYMIZED_TELEMETRY`
    environment variable is not consulted.
    """

    USER_ID_PATH = str(Path.home() / '.cache' / 'browser_use' / 'telemetry_user_id')
    PROJECT_API_KEY = 'phc_F8JMNjW1i2KbGUTaW1unnDdLSPCoyc52SGRU0JecaUh'
    HOST = 'https://eu.i.posthog.com'
    UNKNOWN_USER_ID = 'UNKNOWN'  # fallback id when the id file cannot be read or written

    _curr_user_id = None  # cached id, filled lazily by `user_id`

    def __init__(self) -> None:
        """Set up logging flags and, only if telemetry is enabled, the PostHog client."""
        # Telemetry is forced off here. A real boolean is used because any
        # non-empty string (including 'false') is truthy. To drive this from the
        # environment again, use:
        #   os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false'
        telemetry_disabled = True
        self.debug_logging = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() == 'debug'

        if telemetry_disabled:
            self._posthog_client = None
        else:
            logger.info(
                'Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.'
            )
            self._posthog_client = Posthog(
                project_api_key=self.PROJECT_API_KEY,
                host=self.HOST,
                disable_geoip=False,
            )

            # Silence posthog's logging
            if not self.debug_logging:
                posthog_logger = logging.getLogger('posthog')
                posthog_logger.disabled = True

        if self._posthog_client is None:
            logger.debug('Telemetry disabled')

    def capture(self, event: BaseTelemetryEvent) -> None:
        """Send *event* if a PostHog client exists; otherwise do nothing."""
        if self._posthog_client is None:
            return

        if self.debug_logging:
            logger.debug(f'Telemetry event: {event.name} {event.properties}')
        self._direct_capture(event)

    def _direct_capture(self, event: BaseTelemetryEvent) -> None:
        """
        Should not be thread blocking because posthog magically handles it
        """
        if self._posthog_client is None:
            return

        try:
            self._posthog_client.capture(
                self.user_id,
                event.name,
                {**event.properties, **POSTHOG_EVENT_SETTINGS},
            )
        except Exception as e:
            # Telemetry must never break the caller: log and carry on.
            logger.error(f'Failed to send telemetry event {event.name}: {e}')

    @property
    def user_id(self) -> str:
        """Return the persisted anonymous user id, creating it on first use.

        The id is cached on the instance. If the id file cannot be read or
        written, `UNKNOWN_USER_ID` is cached and returned instead.
        """
        if self._curr_user_id:
            return self._curr_user_id

        # File access may fail due to permissions or other reasons. We don't want to
        # crash so we catch all exceptions.
        try:
            if not os.path.exists(self.USER_ID_PATH):
                os.makedirs(os.path.dirname(self.USER_ID_PATH), exist_ok=True)
                with open(self.USER_ID_PATH, 'w') as f:
                    new_user_id = str(uuid.uuid4())
                    f.write(new_user_id)
                self._curr_user_id = new_user_id
            else:
                with open(self.USER_ID_PATH, 'r') as f:
                    self._curr_user_id = f.read()
        except Exception:
            # Use the class constant rather than a string spelling out its name.
            self._curr_user_id = self.UNKNOWN_USER_ID
        return self._curr_user_id
106 |
--------------------------------------------------------------------------------
/wap_replay/generate_smart_replay_list.py:
--------------------------------------------------------------------------------
1 | """
2 | Batch-convert recorded event-JSON files into the canonical “smart-replay”
3 | action list by generating an LLM sub-goal for each recorded event.
4 |
5 | Usage
6 | -----
7 | python wap_replay/generate_smart_replay_list.py --data_dir_path <data_dir> \
8 | [--output_dir_path data_processed/smart_replay]
9 |
10 | Example
11 | -----
12 | python wap_replay/generate_smart_replay_list.py --data_dir_path data/20250423/Allrecipes--4 \
13 | --output_dir_path data_processed/smart_replay
14 | """
15 | import argparse
16 | from pathlib import Path
17 | from dotenv import load_dotenv
18 | from utils.action_processing import generate_subgoal_speculate_prompt, find_task_prompt, load_event_json
19 | from utils.subgoal_generator import generate_subgoals_from_dir, wap_subgoal_list_generation
20 | load_dotenv()
21 |
22 |
def subgoal_prompt_generation(path: str, output_path: str, ultimate_goal: str) -> None:
    """Generate one sub-goal prompt for every event JSON file found under *path*.

    The folder is searched recursively. Each file is loaded with
    ``load_event_json`` and handed to ``generate_subgoal_speculate_prompt``
    together with *ultimate_goal*, the file's stem and *output_path*.
    Returns early, after printing a notice, when no JSON file is found.
    """
    root = Path(path)
    json_paths = list(root.rglob("*.json"))  # recursive search
    total = len(json_paths)
    if total == 0:
        print(f"[OTA Info] No JSON files found under {root}")
        return

    print(f"[OTA Info] Found {total} event files.")
    for position, event_file in enumerate(json_paths, start=1):
        print(f"\n[{position}/{total}] Loading {event_file}")
        recorded_event = load_event_json(event_file)

        print(" Generating sub-goal …")
        generate_subgoal_speculate_prompt(recorded_event, ultimate_goal, event_file.stem, output_path)

    print("\n[OTA Info] All done.")
41 |
42 |
def subgoal_llm_generation(folder, jsonl_name):
    """Run ``generate_subgoals_from_dir`` over *folder* and return its result.

    Parameters
    ----------
    folder
        Directory passed through as the first argument.
    jsonl_name
        Passed through as ``save_jsonl``.

    Returns
    -------
    Whatever ``generate_subgoals_from_dir`` returns. Callers that ignore the
    return value are unaffected.
    """
    # NOTE(review): "fot" looks like a typo for "for"; left untouched because
    # this string is part of the prompt sent to the model.
    return generate_subgoals_from_dir(
        folder,
        system_prompt="You are a concise sub-goal assistant fot analysis of actions in browser.",
        model="gpt-4o",
        temperature=0,
        save_jsonl=jsonl_name,
    )
51 |
def main() -> None:
    """Parse the command line and run the three smart-replay stages in order.

    Stages: write one sub-goal prompt per event file, run the LLM over those
    prompts, then assemble the smart-replay list from the LLM output.
    """
    cli = argparse.ArgumentParser(description="Smart-replay pipeline")
    cli.add_argument(
        "--data_dir_path",
        required=True,
        help="Directory containing recorded event JSON files",
    )
    cli.add_argument(
        "--output_dir_path",
        default="data_processed/smart_replay",
        help="Directory where all output will be placed "
             "(default: data_processed/smart_replay)",
    )
    options = cli.parse_args()

    data_dir = Path(options.data_dir_path)
    output_dir = Path(options.output_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    task_prompt, task_id = find_task_prompt(data_dir)
    print("[OTA Info] Using task prompt =>", task_prompt)
    print("[OTA Info] taskId =>", task_id)

    # Every artefact of this task is named after its task id.
    subgoals_dir = output_dir / f"subgoals_{task_id}"
    subgoals_dir.mkdir(parents=True, exist_ok=True)
    subgoals_jsonl = subgoals_dir / "subgoals_output.jsonl"
    wap_json = output_dir / f"wap_smart_replay_list_{task_id}.json"

    subgoal_prompt_generation(data_dir, subgoals_dir, task_prompt)
    subgoal_llm_generation(subgoals_dir, subgoals_jsonl)
    wap_subgoal_list_generation(task_prompt, task_id, subgoals_jsonl, wap_json)
92 |
93 | if __name__ == "__main__":
94 | main()
--------------------------------------------------------------------------------
/chrome-extension/js/specialEventHandler.js:
--------------------------------------------------------------------------------
1 | (function () {
2 | /******************************************************************
3 | * specialEventHandler.js
4 | * --------------------------------------------------------------
5 | * Registers and runs domain–specific listeners that the generic
6 | * DOMListener cannot reliably cover
7 | *
8 | * Usage from DOMListener.js
9 | * --------------------------------------------------------------
10 | * // This file is an IIFE, not an ES module: it publishes itself as
11 | * // window.SpecialEvents, and init() expects the helper functions as `deps`.
12 | * window.SpecialEvents.init({ nodeToHTMLString, trimTarget, getEventHash, getCurrentHTMLSanitized, taskId });
13 | ******************************************************************/
14 |
15 | /* ---------- simple registry ------------------------------------------------ */
16 | const _handlers = [];
17 |
/**
 * Register a new handler.
 * @param {RegExp} hostPattern – tested against location.hostname
 * @param {Function} initFn – called if the pattern matches
 */
function register(hostPattern, initFn) {
    const entry = { hostPattern: hostPattern, initFn: initFn };
    _handlers.push(entry);
}
22 |
23 | /* ---------- H A N D L E R S --------------------------------------------- */
// Google search handler: reports a 'submit' event when the user presses Enter
// in the search box or clicks the Search button. The helper functions and
// taskId are supplied by the caller of init() via `deps`.
register(/(^|\.)google\.[a-z.]+$/, ({
    nodeToHTMLString,
    trimTarget,
    getEventHash,
    getCurrentHTMLSanitized,
    taskId
}) => {

    const BOX = 'textarea[name="q"][role="combobox"]';
    const BTN = 'button[aria-label="Search"][type="submit"]';

    // Build the summary event for one submit and send it to the background script.
    // `value` is the query text; `originEl` is the element that triggered the submit.
    function report(value, originEl) {
        const evHash = getEventHash();


        const actionTarget = {
            type : 'submit',
            target : nodeToHTMLString(originEl), // full raw HTML
            targetId : originEl.id,
            targetClass: originEl.className,
            value : value // the user query text
        };

        // highlight element just like other flows
        originEl.setAttribute('ota-use-interactive-target', '1');
        // NOTE(review): this overwrites the raw HTML stored in `target` above.
        actionTarget.target = trimTarget(originEl); // prettified / trimmed
        // (optional) remove the mark after trimming
        originEl.removeAttribute('ota-use-interactive-target');

        const summaryEvent = {
            taskId : taskId,
            eventHash : evHash,
            type : 'submit',
            actionTimestamp: Date.now(),
            eventTarget : actionTarget,
            allEvents : {}, // nothing to diff for a submit
            pageHTMLContent: getCurrentHTMLSanitized()
        };

        /* ---- ship it to the background ------------------------------------ */
        chrome.runtime.sendMessage({
            type : 'submit', // pick any type name you handle in bg.js
            summaryEvent
        });
    }

    /* enter key (Shift+Enter is ignored) */
    document.addEventListener('keydown', e => {
        if (e.key === 'Enter' && !e.shiftKey && e.target.matches(BOX)) {
            report(e.target.value, e.target);
        }
    }, /*capture*/ true);

    /* blue Search button: the query is read from the search box, the button is the origin element */
    document.addEventListener('click', e => {
        const btn = e.target.closest(BTN);
        if (!btn) return;
        const box = document.querySelector(BOX);
        if (box) report(box.value, btn);
    }, true);

    console.debug('[specialHandler] Google search attached');
});
87 |
88 | /* -------------------------------------------------------------------------- */
/**
 * Call once from DOMListener. Attaches every handler that matches
 * the current hostname.
 *
 * A handler that throws while initialising is logged and skipped, so the
 * remaining handlers still get their turn.
 *
 * @param {Object} deps – passed unchanged to each matching handler's initFn
 */
function init (deps) {
    const host = location.hostname;
    _handlers.forEach(({hostPattern, initFn}) => {
        if (hostPattern.test(host)) {
            // Prefixed debug output, in line with the file's other log messages.
            console.debug('[specialHandler] matched', hostPattern);
            try { initFn(deps); }
            catch (err) {
                console.error('[specialHandler] failed for', hostPattern, err);
            }
        }
    });
}
103 |
104 | window.SpecialEvents = { init };
105 |
106 | })();
--------------------------------------------------------------------------------
/browser_use/controller/views.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from pydantic import BaseModel, ConfigDict, Field, model_validator
4 |
5 |
6 | # Action Input Models
class SearchGoogleAction(BaseModel):
    # Input model with one required string field.
    query: str  # presumably the text to search for — confirm against the consuming action
9 |
10 |
class GoToUrlAction(BaseModel):
    # Input model with one required string field.
    url: str  # the replay-list samples pass a full "https://…" URL under this key for go_to_url
13 |
14 |
class WaitForElementAction(BaseModel):
    selector: str  # the replay-list samples pass CSS-style selectors here, e.g. "#Sort"
    timeout: Optional[int] = 10000  # Timeout in milliseconds
18 |
19 |
class ClickElementAction(BaseModel):
    index: int  # presumably the element's highlight index — TODO confirm
    xpath: Optional[str] = None  # optional; None when omitted
23 |
24 |
class ClickElementByXpathAction(BaseModel):
    # Input model with one required string field.
    xpath: str
27 |
28 |
class ClickElementBySelectorAction(BaseModel):
    # Input model with one required string field.
    css_selector: str  # e.g. ".a-dropdown-prompt" in the replay-list samples
31 |
class SelectOptionBySelectorAction(BaseModel):
    # `value` and `label` both default to None; this model does not itself
    # enforce that at least one of them is given.
    css_selector: str  # e.g. "#searchDropdownBox"
    value: str | None = None  # preferred (unique)
    label: str | None = None  # visible text fallback
36 |
class ClickElementByTextAction(BaseModel):
    text: str
    # NOTE(review): Optional[...] without a default — under pydantic v2 the field is
    # still required (None is merely an accepted value); confirm that is intended.
    element_type: Optional[str]  # e.g. "span" in the replay-list samples
    nth: int = 0  # presumably picks which match to click when several share the text — confirm
41 |
42 |
class InputTextAction(BaseModel):
    index: int  # presumably the element's highlight index — TODO confirm
    text: str
    xpath: Optional[str] = None  # optional; None when omitted
47 |
48 |
class DoneAction(BaseModel):
    # Both fields are required; the replay-list samples end with
    # {"text": "task executed successfully", "success": true}.
    text: str
    success: bool
52 |
53 |
class SwitchTabAction(BaseModel):
    # Input model with one required integer field.
    page_id: int  # presumably the id of the tab to switch to — confirm
56 |
57 |
class OpenTabAction(BaseModel):
    # Input model with one required string field.
    url: str  # e.g. "https://www.amazon.ca/" in the replay-list samples
60 |
61 |
class CloseTabAction(BaseModel):
    # Input model with one required integer field.
    page_id: int  # presumably the id of the tab to close — confirm
64 |
65 |
class ScrollAction(BaseModel):
    # Input model with one optional integer field.
    amount: Optional[int] = None  # The number of pixels to scroll. If None, scroll down/up one page
68 |
69 |
class SendKeysAction(BaseModel):
    # Input model with one required string field.
    keys: str  # e.g. "Enter" in the replay-list samples
72 |
73 |
class GroupTabsAction(BaseModel):
    # `Field(...)` marks tab_ids and title as required; color defaults to 'blue'.
    tab_ids: list[int] = Field(..., description='List of tab IDs to group')
    title: str = Field(..., description='Name for the tab group')
    color: Optional[str] = Field(
        'blue',
        description='Color for the group (grey/blue/red/yellow/green/pink/purple/cyan)',
    )
81 |
82 |
class UngroupTabsAction(BaseModel):
    # `Field(...)` marks tab_ids as required.
    tab_ids: list[int] = Field(..., description='List of tab IDs to ungroup')
85 |
86 |
class ExtractPageContentAction(BaseModel):
    # NOTE(review): the replay-list samples call extract_content with `goal` and
    # `should_strip_link_urls`, neither of which is declared here — presumably
    # another model serves that action; confirm.
    value: str
89 |
90 |
class NoParamsAction(BaseModel):
    """
    Accepts absolutely anything in the incoming data
    and discards it, so the final parsed model is empty.
    """

    # extra='allow' lets unknown keys through field validation.
    model_config = ConfigDict(extra='allow')

    # mode='before': runs on the raw input before field validation.
    @model_validator(mode='before')
    def ignore_all_inputs(cls, values):
        # No matter what the user sends, discard it and return empty.
        return {}
103 |
104 |
class Position(BaseModel):
    # An integer (x, y) pair; used by DragDropAction for its element offsets.
    x: int
    y: int
108 |
109 |
class DragDropAction(BaseModel):
    # Every field is optional. Source and target can be given as elements
    # (selector/XPath plus optional offsets) or as absolute page coordinates.

    # Element-based approach
    element_source: Optional[str] = Field(None, description='CSS selector or XPath of the element to drag from')
    element_target: Optional[str] = Field(None, description='CSS selector or XPath of the element to drop onto')
    element_source_offset: Optional[Position] = Field(
        None, description='Precise position within the source element to start drag (in pixels from top-left corner)'
    )
    element_target_offset: Optional[Position] = Field(
        None, description='Precise position within the target element to drop (in pixels from top-left corner)'
    )

    # Coordinate-based approach (used if selectors not provided)
    coord_source_x: Optional[int] = Field(None, description='Absolute X coordinate on page to start drag from (in pixels)')
    coord_source_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to start drag from (in pixels)')
    coord_target_x: Optional[int] = Field(None, description='Absolute X coordinate on page to drop at (in pixels)')
    coord_target_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to drop at (in pixels)')

    # Common options (defaults: 10 steps, 5 ms delay)
    steps: Optional[int] = Field(10, description='Number of intermediate points for smoother movement (5-20 recommended)')
    delay_ms: Optional[int] = Field(5, description='Delay in milliseconds between steps (0 for fastest, 10-20 for more natural)')
130 |
--------------------------------------------------------------------------------
/wap_replay/generate_exact_replay_list.py:
--------------------------------------------------------------------------------
1 | """
2 | Batch-convert recorded event-JSON files into the canonical “exact-replay”
3 | action list by calling `record_metadata_to_actions` from browser-use.
4 |
5 | Usage
6 | -----
7 | python wap_replay/generate_exact_replay_list.py --data_dir_path DATA_DIR \
8 | [--output_dir_path data_processed/exact_replay]
9 |
10 | Example
11 | -----
12 | python wap_replay/generate_exact_replay_list.py --data_dir_path data/20250423/Allrecipes--4 \
13 | --output_dir_path data_processed/exact_replay
14 | """
15 | from __future__ import annotations
16 |
17 | import argparse
18 | import json
19 | from pathlib import Path
20 | from typing import List, Dict, Any
21 | from browser_use.wap.exact_replay import record_metadata_to_actions
22 | from utils.action_processing import find_task_prompt, load_event_json
23 |
24 | # ---------------------------------------------------------------------------#
25 | # core function #
26 | # ---------------------------------------------------------------------------#
def folder_to_actions(folder_path: str | Path) -> List[Dict[str, Any]]:
    """
    Convert every recorded event file under *folder_path* into replay actions.

    The folder is searched recursively for ``*.json`` files. Each file is
    loaded with ``load_event_json`` and converted with
    ``record_metadata_to_actions``; the per-file results are concatenated in
    sorted path order so that repeated runs produce the same list.

    A file that fails to load or convert is reported on stdout and skipped
    (best effort); it does not abort the batch.

    Args:
        folder_path: Directory containing the recorded event files.

    Returns:
        The concatenated action list; empty when no JSON file is found.

    Raises:
        NotADirectoryError: If *folder_path* is not an existing directory.
    """
    folder_path = Path(folder_path)

    if not folder_path.is_dir():
        raise NotADirectoryError(folder_path)

    # rglob yields entries in whatever order the filesystem returns them;
    # sorting makes the order of the concatenated actions reproducible.
    json_paths = sorted(folder_path.rglob("*.json"))
    if not json_paths:
        print(f"[OTA Info] No JSON files found under {folder_path}")
        return []

    total = len(json_paths)
    print(f"[OTA Info] Found {total} event files.")

    all_actions: List[Dict[str, Any]] = []

    for idx, event_path in enumerate(json_paths, 1):
        print(f"[{idx}/{total}] Loading {event_path}")
        try:
            event_json = load_event_json(event_path)
            actions = record_metadata_to_actions([event_json])
            all_actions.extend(actions)
        except Exception as exc:
            # Deliberate best effort: one broken recording must not stop the rest.
            print(f"[warn] could not process {event_path.name}: {exc}")

    print("[OTA Info] All done.")
    return all_actions
58 |
59 |
def save_exact_replay_bundle(
    path: Path,
    *,
    ultimate_goal: str,
    task_id: str,
    actions: List[Dict[str, Any]],
) -> None:
    """
    Write one task's replay actions to *path* as UTF-8 encoded JSON.

    The file contains a single object with the keys ``ultimate_goal``,
    ``task_id``, ``type`` (always ``"exact_replay"``) and ``action_list``.
    A one-line confirmation is printed once the file has been written.
    """
    payload = json.dumps(
        dict(
            ultimate_goal=ultimate_goal,
            task_id=task_id,
            type="exact_replay",
            action_list=actions,
        ),
        ensure_ascii=False,
        indent=2,
    )
    path.write_text(payload, encoding="utf-8")
    print(f"[OTA info] wrote {len(actions)} actions → {path}")
78 |
79 |
80 | # ---------------------------------------------------------------------------#
81 | # command-line interface #
82 | # ---------------------------------------------------------------------------#
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the exact-replay generator.

    ``--data_dir_path`` is mandatory; ``--output_dir_path`` falls back to
    ``data_processed/exact_replay``.
    """
    cli = argparse.ArgumentParser(
        description="Create exact-replay action list from a folder of event JSON files.",
    )
    option_table = (
        ("--data_dir_path", {"required": True, "help": "Folder containing recorded *.json files."}),
        ("--output_dir_path", {"default": "data_processed/exact_replay", "help": "Directory to store result file."}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
88 |
def main() -> None:
    """CLI entry point: turn one recording folder into an exact-replay bundle.

    The output directory is created if needed, and the bundle is written to
    ``wap_exact_replay_list_<task_id>.json`` inside it.
    """
    cli = parse_args()

    source_dir = Path(cli.data_dir_path)
    target_dir = Path(cli.output_dir_path)
    target_dir.mkdir(parents=True, exist_ok=True)

    goal, task_id = find_task_prompt(source_dir)

    save_exact_replay_bundle(
        target_dir / f"wap_exact_replay_list_{task_id}.json",
        ultimate_goal=goal,
        task_id=task_id,
        actions=folder_to_actions(source_dir),
    )


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/browser_use/dom/tests/debug_page_structure.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import sys
4 |
5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6 |
7 | from browser_use.browser.browser import Browser, BrowserConfig
8 | from browser_use.browser.context import BrowserContext
9 |
10 |
async def analyze_page_structure(url: str):
    """Open *url* in a visible browser and print layout debugging details.

    Printed, in order: the viewport size and scroll offsets, every element
    whose id or class contains "cookie" or "consent", every element with
    ``position: fixed`` or ``sticky``, and the result of
    ``ctx.get_page_structure()``. The function then waits for Enter on stdin;
    the browser is closed afterwards, even when an error occurred.
    """
    # JavaScript evaluated in the page: viewport size and scroll position.
    viewport_js = """() => {
        return {
            viewport: {
                width: window.innerWidth,
                height: window.innerHeight,
                scrollX: window.scrollX,
                scrollY: window.scrollY
            }
        }
    }"""

    # JavaScript evaluated in the page: cookie/consent and fixed/sticky elements.
    overlay_js = """() => {
        function getElementInfo(element) {
            const rect = element.getBoundingClientRect();
            const style = window.getComputedStyle(element);
            return {
                tag: element.tagName.toLowerCase(),
                id: element.id,
                className: element.className,
                position: style.position,
                rect: {
                    top: rect.top,
                    right: rect.right,
                    bottom: rect.bottom,
                    left: rect.left,
                    width: rect.width,
                    height: rect.height
                },
                isFixed: style.position === 'fixed',
                isSticky: style.position === 'sticky',
                zIndex: style.zIndex,
                visibility: style.visibility,
                display: style.display,
                opacity: style.opacity
            };
        }

        // Find cookie-related elements
        const cookieElements = Array.from(document.querySelectorAll('[id*="cookie"], [id*="consent"], [class*="cookie"], [class*="consent"]'));
        const fixedElements = Array.from(document.querySelectorAll('*')).filter(el => {
            const style = window.getComputedStyle(el);
            return style.position === 'fixed' || style.position === 'sticky';
        });

        return {
            cookieElements: cookieElements.map(getElementInfo),
            fixedElements: fixedElements.map(getElementInfo)
        };
    }"""

    def _print_elements(elements, fields):
        # One header line per element, then one "Label: value" line per field.
        for elem in elements:
            print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
            for label, key in fields:
                print(f'{label}: {elem[key]}')

    browser = Browser(
        config=BrowserConfig(
            headless=False,  # Set to True if you don't need to see the browser
        )
    )
    context = BrowserContext(browser=browser)

    try:
        async with context as ctx:
            # Load the page and wait until network activity settles.
            page = await ctx.get_current_page()
            await page.goto(url)
            await page.wait_for_load_state('networkidle')

            viewport = (await page.evaluate(viewport_js))['viewport']
            print('\nViewport Information:')
            for label, key in (
                ('Width', 'width'),
                ('Height', 'height'),
                ('ScrollX', 'scrollX'),
                ('ScrollY', 'scrollY'),
            ):
                print(f'{label}: {viewport[key]}')

            debug_info = await page.evaluate(overlay_js)

            print('\nCookie-related Elements:')
            _print_elements(
                debug_info['cookieElements'],
                (
                    ('Position', 'position'),
                    ('Rect', 'rect'),
                    ('Z-Index', 'zIndex'),
                    ('Visibility', 'visibility'),
                    ('Display', 'display'),
                    ('Opacity', 'opacity'),
                ),
            )

            print('\nFixed/Sticky Position Elements:')
            _print_elements(
                debug_info['fixedElements'],
                (
                    ('Position', 'position'),
                    ('Rect', 'rect'),
                    ('Z-Index', 'zIndex'),
                ),
            )

            print(f'\nPage Structure for {url}:\n')
            print(await ctx.get_page_structure())

            # Keep the window open until the user has finished inspecting it.
            input('Press Enter to close the browser...')
    finally:
        await browser.close()
110 |
111 |
if __name__ == '__main__':
    # Pages analyzed one after another; edit this list to inspect other sites.
    # (A second, identical zeiss.com entry was dropped: it only repeated the
    # same analysis.)
    urls = [
        'https://www.mlb.com/yankees/stats/',
        'https://immobilienscout24.de',
        'https://www.zeiss.com/career/en/job-search.html?page=1',
        'https://reddit.com',
    ]
    # Each call creates its own Browser instance inside analyze_page_structure.
    for url in urls:
        asyncio.run(analyze_page_structure(url))
124 |
--------------------------------------------------------------------------------
/browser_use/agent/system_prompt_wap_replay.md:
--------------------------------------------------------------------------------
1 | You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task by replaying the sub-goals for each step that we provided.
2 |
3 | # Input Format
4 | Task
5 | Previous steps
6 | Current URL
7 | Open Tabs
8 | Sub-goal List
9 | Interactive Elements
10 | [index]text
11 | - index: Numeric identifier for interaction
12 | - type: HTML element type (button, input, etc.)
13 | - text: Element description
14 | Example:
15 | [33]
16 |
17 | - Only elements with numeric indexes in [] are interactive
18 | - elements without [] provide only context
19 |
20 | # Response Rules
21 | 1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
22 | {{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not",
23 | "memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz",
24 | "subgoal_index": 1}},
25 | "action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}}
26 |
27 | 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
28 | Common action sequences:
29 | - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
30 | - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
31 | - Actions are executed in the given order
32 | - If the page changes after an action, the sequence is interrupted and you get the new state.
33 | - only use multiple actions if it makes sense.
34 |
35 | 3. ELEMENT INTERACTION:
36 | - Only use indexes of the interactive elements
37 | - Elements marked with "[]Non-interactive text" are non-interactive
38 |
39 | 4. NAVIGATION & ERROR HANDLING:
40 | - If no suitable elements exist, use other functions to complete the task
41 | - Handle popups/cookies by accepting or closing them
42 | - Use scroll to find elements you are looking for
43 | - If you want to research something, open a new tab instead of using the current tab
44 | - If captcha pops up, try to solve it - else try a different approach
45 | - If the page is not fully loaded, use wait action
46 |
47 | 5. (MANDATORY) ACTIONS BASED ON SUB-GOAL LIST
48 | - You are provided with the previous and current sub-goals.
49 | - Check the action results in task history and the current page content to see whether the current status has already been satisfied:
50 | -> if YES, specify actions for the current sub-goal in this step
51 | -> if NO, try more approaches to achieve the previous sub-goals, e.g.: scroll to find elements
52 | - In your response, fill the value of subgoal_index with the index of the sub-goal that you work on in this step
53 | - Keep track of the status and subresults in the memory.
54 |
55 | 6. TASK COMPLETION:
56 | - Use the done action as the last action as soon as the ultimate task is complete
57 | - Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps.
58 | - If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false!
59 | - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
60 | - Don't hallucinate actions
61 |
62 | 7. Form filling:
63 | - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
64 |
65 | 8. Extraction:
66 | - If your task is to find information - call extract_content on the specific pages to get and store the information.
67 | Your responses must be always JSON with the specified format.
68 |
--------------------------------------------------------------------------------
/browser_use/agent/memory/service.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | from typing import List, Optional
5 |
6 | from langchain_core.language_models.chat_models import BaseChatModel
7 | from langchain_core.messages import (
8 | BaseMessage,
9 | HumanMessage,
10 | )
11 | from langchain_core.messages.utils import convert_to_openai_messages
12 | from mem0 import Memory as Mem0Memory
13 | from pydantic import BaseModel
14 |
15 | from browser_use.agent.message_manager.service import MessageManager
16 | from browser_use.agent.message_manager.views import ManagedMessage, MessageMetadata
17 | from browser_use.utils import time_execution_sync
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 |
class MemorySettings(BaseModel):
    """Settings for procedural memory."""

    # Identifier passed to Mem0 as ``agent_id`` when memories are added.
    agent_id: str
    # presumably the number of agent steps between memory creations — not
    # enforced anywhere in this class; confirm against the caller.
    interval: int = 10
    # Mem0 configuration; when unset, Memory falls back to a FAISS vector store.
    # ``Optional[dict]`` already includes None, so the former ``| None`` was
    # redundant (and ``|`` on typing constructs needs Python 3.10+ to evaluate).
    config: Optional[dict] = None
28 |
29 |
class Memory:
    """
    Procedural-memory manager for an agent, backed by Mem0.

    On request it collapses the agent's message history into a single summary
    message so that long-running tasks use less of the context window.
    Messages tagged ``init`` or ``memory`` are kept as they are; everything
    else is replaced by the summary.
    """

    def __init__(
        self,
        message_manager: MessageManager,
        llm: BaseChatModel,
        settings: MemorySettings,
    ):
        self.message_manager = message_manager
        self.llm = llm
        self.settings = settings
        # Fall back to a FAISS vector store when no Mem0 config was supplied.
        self._memory_config = self.settings.config or {'vector_store': {'provider': 'faiss'}}
        self.mem0 = Mem0Memory.from_config(config_dict=self._memory_config)

    @time_execution_sync('--create_procedural_memory')
    def create_procedural_memory(self, current_step: int) -> None:
        """
        Summarize the non-preserved part of the history into one memory message.

        Nothing is changed when at most one message is eligible for
        summarization or when no summary could be produced.

        Args:
            current_step: The current step number of the agent
        """
        logger.info(f'Creating procedural memory at step {current_step}')

        history = self.message_manager.state.history
        preserved_types = {'init', 'memory'}

        # Split the history: preserved messages stay, the rest gets summarized.
        kept = []
        to_summarize = []
        for entry in history.messages:
            is_preserved = isinstance(entry, ManagedMessage) and entry.metadata.message_type in preserved_types
            (kept if is_preserved else to_summarize).append(entry)

        if len(to_summarize) <= 1:
            logger.info('Not enough non-memory messages to summarize')
            return

        summary = self._create([entry.message for entry in to_summarize], current_step)
        if not summary:
            logger.warning('Failed to create summary')
            return

        # Wrap the summary as a human message tagged as 'memory'.
        summary_message = HumanMessage(content=summary)
        summary_tokens = self.message_manager._count_tokens(summary_message)
        summary_metadata = MessageMetadata(tokens=summary_tokens, message_type='memory')

        removed_tokens = sum(entry.metadata.tokens for entry in to_summarize)

        kept.append(ManagedMessage(message=summary_message, metadata=summary_metadata))

        # Swap in the shortened history and rebalance the token counter.
        history.messages = kept
        history.current_tokens += summary_tokens - removed_tokens

        logger.info(f'Memories summarized: {len(to_summarize)} messages converted to procedural memory')
        logger.info(f'Token reduction: {removed_tokens - summary_tokens} tokens')

    def _create(self, messages: List[BaseMessage], current_step: int) -> Optional[str]:
        """Store *messages* in Mem0 as procedural memory and return the summary text.

        Returns the ``memory`` value of the first result, or None when Mem0
        reports no results or raises (the error is logged).
        """
        parsed_messages = convert_to_openai_messages(messages)
        try:
            response = self.mem0.add(
                messages=parsed_messages,
                agent_id=self.settings.agent_id,
                llm=self.llm,
                memory_type='procedural_memory',
                metadata={'step': current_step},
            )
            entries = response.get('results', [])
            if len(entries):
                return entries[0].get('memory')
            return None
        except Exception as e:
            logger.error(f'Error creating procedural memory: {e}')
            return None
121 |
--------------------------------------------------------------------------------
/browser_use/dom/history_tree_processor/service.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | from typing import Optional
3 |
4 | from browser_use.dom.history_tree_processor.view import DOMHistoryElement, HashedDomElement
5 | from browser_use.dom.views import DOMElementNode
6 |
7 |
class HistoryTreeProcessor:
    """
    Helpers for matching recorded (history) DOM elements against a live DOM tree.

    Elements are compared through three SHA-256 hashes: the tag path from the
    tree root down to the element, the element's attributes, and its xpath.

    @dev be careful - text nodes can change even if elements stay the same
    """

    @staticmethod
    def convert_dom_element_to_history_element(dom_element: DOMElementNode) -> DOMHistoryElement:
        """Build the history record for a live DOM element."""
        # Imported here rather than at module level (presumably to avoid a
        # circular import — confirm).
        from browser_use.browser.context import BrowserContext

        branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
        selector = BrowserContext._enhanced_css_selector_for_element(dom_element)
        return DOMHistoryElement(
            dom_element.tag_name,
            dom_element.xpath,
            dom_element.highlight_index,
            branch_path,
            dom_element.attributes,
            dom_element.shadow_root,
            css_selector=selector,
            page_coordinates=dom_element.page_coordinates,
            viewport_coordinates=dom_element.viewport_coordinates,
            viewport_info=dom_element.viewport_info,
        )

    @staticmethod
    def find_history_element_in_tree(dom_history_element: DOMHistoryElement, tree: DOMElementNode) -> Optional[DOMElementNode]:
        """Return the first highlighted node of *tree* (pre-order) whose hash matches, else None."""
        wanted = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)

        # Explicit stack; children are pushed reversed so they pop in document order.
        pending = [tree]
        while pending:
            node = pending.pop()
            if node.highlight_index is not None and HistoryTreeProcessor._hash_dom_element(node) == wanted:
                return node
            pending.extend(child for child in reversed(node.children) if isinstance(child, DOMElementNode))
        return None

    @staticmethod
    def compare_history_element_and_dom_element(dom_history_element: DOMHistoryElement, dom_element: DOMElementNode) -> bool:
        """Tell whether the history record and the live element hash identically."""
        recorded = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)
        live = HistoryTreeProcessor._hash_dom_element(dom_element)
        return recorded == live

    @staticmethod
    def _hash_dom_history_element(dom_history_element: DOMHistoryElement) -> HashedDomElement:
        """Hash a history record using its stored branch path, attributes and xpath."""
        cls = HistoryTreeProcessor
        return HashedDomElement(
            cls._parent_branch_path_hash(dom_history_element.entire_parent_branch_path),
            cls._attributes_hash(dom_history_element.attributes),
            cls._xpath_hash(dom_history_element.xpath),
        )

    @staticmethod
    def _hash_dom_element(dom_element: DOMElementNode) -> HashedDomElement:
        """Hash a live element; the branch path is recomputed from its ancestors."""
        cls = HistoryTreeProcessor
        branch_path = cls._get_parent_branch_path(dom_element)
        # The element's text is deliberately not part of the hash.
        return HashedDomElement(
            cls._parent_branch_path_hash(branch_path),
            cls._attributes_hash(dom_element.attributes),
            cls._xpath_hash(dom_element.xpath),
        )

    @staticmethod
    def _get_parent_branch_path(dom_element: DOMElementNode) -> list[str]:
        """Return tag names from just below the root down to *dom_element* itself.

        The parentless root node is not included.
        """
        tags: list[str] = []
        node = dom_element
        while node.parent is not None:
            tags.append(node.tag_name)
            node = node.parent
        tags.reverse()
        return tags

    @staticmethod
    def _parent_branch_path_hash(parent_branch_path: list[str]) -> str:
        """SHA-256 hex digest of the tag names joined with '/'."""
        joined = '/'.join(parent_branch_path)
        return hashlib.sha256(joined.encode()).hexdigest()

    @staticmethod
    def _attributes_hash(attributes: dict[str, str]) -> str:
        """SHA-256 hex digest of the ``key=value`` pairs concatenated in dict order."""
        pairs = [f'{key}={value}' for key, value in attributes.items()]
        return hashlib.sha256(''.join(pairs).encode()).hexdigest()

    @staticmethod
    def _xpath_hash(xpath: str) -> str:
        """SHA-256 hex digest of the xpath string."""
        return hashlib.sha256(xpath.encode()).hexdigest()

    @staticmethod
    def _text_hash(dom_element: DOMElementNode) -> str:
        """SHA-256 hex digest of the element's text up to the next clickable element."""
        text = dom_element.get_all_text_till_next_clickable_element()
        return hashlib.sha256(text.encode()).hexdigest()
108 |
--------------------------------------------------------------------------------
/browser_use/logging_config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 |
5 | from dotenv import load_dotenv
6 |
7 | load_dotenv()
8 |
9 |
def addLoggingLevel(levelName, levelNum, methodName=None):
    """
    Register a new logging level with the `logging` module and with the
    currently configured logger class.

    After the call, `levelName` is an attribute of the `logging` module whose
    value is `levelNum`, and `methodName` is available as a convenience
    method both on `logging` itself and on the class returned by
    `logging.getLoggerClass()` (usually just `logging.Logger`). When
    `methodName` is omitted, `levelName.lower()` is used.

    To avoid clobbering existing attributes, an `AttributeError` is raised if
    the level name is already an attribute of the `logging` module or if the
    method name is already present on the module or the logger class.

    Example
    -------
    >>> addLoggingLevel('TRACE', logging.DEBUG - 5)
    >>> logging.getLogger(__name__).setLevel('TRACE')
    >>> logging.getLogger(__name__).trace('that worked')
    >>> logging.trace('so did this')
    >>> logging.TRACE
    5

    """
    methodName = methodName or levelName.lower()
    logger_cls = logging.getLoggerClass()

    # Refuse to overwrite anything that already exists.
    if hasattr(logging, levelName):
        raise AttributeError(f'{levelName} already defined in logging module')
    if hasattr(logging, methodName):
        raise AttributeError(f'{methodName} already defined in logging module')
    if hasattr(logger_cls, methodName):
        raise AttributeError(f'{methodName} already defined in logger class')

    # This approach was inspired by the answers to Stack Overflow post
    # http://stackoverflow.com/q/2183233/2988730, especially
    # http://stackoverflow.com/a/13638084/2988730
    def logForLevel(self, message, *args, **kwargs):
        # Logger method: emit only when the level is enabled for this logger.
        if self.isEnabledFor(levelNum):
            self._log(levelNum, message, args, **kwargs)

    def logToRoot(message, *args, **kwargs):
        # Module-level function: log through the root logger.
        logging.log(levelNum, message, *args, **kwargs)

    logging.addLevelName(levelNum, levelName)
    setattr(logging, levelName, levelNum)
    setattr(logger_cls, methodName, logForLevel)
    setattr(logging, methodName, logToRoot)
59 |
60 |
def setup_logging():
    """Configure console logging for browser_use.

    The verbosity comes from the ``BROWSER_USE_LOGGING_LEVEL`` environment
    variable (``result``, ``debug``, anything else means info; default
    ``info``). A single stdout handler is attached to the root logger and to
    the non-propagating ``browser_use`` logger, and a fixed list of
    third-party loggers is limited to ERROR.

    Does nothing beyond registering the RESULT level when the root logger
    already has handlers.
    """
    # Register the custom RESULT level (35: above WARNING, below ERROR);
    # an AttributeError means it is already registered, which is fine.
    try:
        addLoggingLevel('RESULT', 35)  # This allows ERROR, FATAL and CRITICAL
    except AttributeError:
        pass  # Level already exists, which is fine

    log_type = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower()

    # Leave an existing logging configuration untouched. Past this point the
    # root logger is known to have no handlers, so nothing needs clearing.
    root = logging.getLogger()
    if root.hasHandlers():
        return

    class BrowserUseFormatter(logging.Formatter):
        """Shortens ``browser_use.<...>`` logger names to their second-to-last component."""

        def format(self, record):
            if isinstance(record.name, str) and record.name.startswith('browser_use.'):
                record.name = record.name.split('.')[-2]
            return super().format(record)

    # Setup single handler for all loggers
    console = logging.StreamHandler(sys.stdout)

    # Additional setLevel here to filter logs
    if log_type == 'result':
        console.setLevel('RESULT')
        console.setFormatter(BrowserUseFormatter('%(message)s'))
    else:
        console.setFormatter(BrowserUseFormatter('%(levelname)-8s [%(name)s] %(message)s'))

    # Configure root logger only
    root.addHandler(console)

    if log_type == 'result':
        root.setLevel('RESULT')  # string usage to avoid syntax error
    elif log_type == 'debug':
        root.setLevel(logging.DEBUG)
    else:
        root.setLevel(logging.INFO)

    # Configure browser_use logger
    browser_use_logger = logging.getLogger('browser_use')
    browser_use_logger.propagate = False  # Don't propagate to root logger
    browser_use_logger.addHandler(console)
    browser_use_logger.setLevel(root.level)  # Set same level as root logger

    browser_use_logger.info('BrowserUse logging setup complete with level %s', log_type)

    # Silence third-party loggers
    for third_party_name in (
        'WDM',
        'httpx',
        'selenium',
        'playwright',
        'urllib3',
        'asyncio',
        'langchain',
        'openai',
        'httpcore',
        'charset_normalizer',
        'anthropic._base_client',
        'PIL.PngImagePlugin',
        'trafilatura.htmlprocessing',
        'trafilatura',
    ):
        third_party = logging.getLogger(third_party_name)
        third_party.setLevel(logging.ERROR)
        third_party.propagate = False
133 |
--------------------------------------------------------------------------------
/browser_use/agent/message_manager/views.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING, Any
4 | from warnings import filterwarnings
5 |
6 | from langchain_core._api import LangChainBetaWarning
7 | from langchain_core.load import dumpd, load
8 | from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
9 | from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator
10 |
11 | filterwarnings('ignore', category=LangChainBetaWarning)
12 |
13 | if TYPE_CHECKING:
14 | from browser_use.agent.views import AgentOutput
15 |
16 |
class MessageMetadata(BaseModel):
    """Bookkeeping data attached to a single message."""

    # Token count attributed to the message; 0 when not supplied.
    tokens: int = 0
    # Optional free-form tag for the message; None when not supplied.
    message_type: str | None = None
22 |
23 |
class ManagedMessage(BaseModel):
    """A LangChain message paired with its metadata."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    message: BaseMessage
    metadata: MessageMetadata = Field(default_factory=MessageMetadata)

    # https://github.com/pydantic/pydantic/discussions/7558
    @model_serializer(mode='wrap')
    def to_json(self, original_dump):
        """
        Serialize the model, delegating the `message` field to langchain.

        The default dump is produced first; its `message` entry is then
        replaced by the result of langchain's `dumpd`.
        """
        serialized = original_dump(self)

        # NOTE: The message field is overridden with langchain's serialization.
        serialized['message'] = dumpd(self.message)

        return serialized

    @model_validator(mode='before')
    @classmethod
    def validate(
        cls,
        value: Any,
        *,
        strict: bool | None = None,
        from_attributes: bool | None = None,
        context: Any | None = None,
    ) -> Any:
        """
        Pre-validation hook that runs langchain's `load` on the `message` entry.

        Only dict input containing a `message` key is touched (the dict is
        updated in place); any other input is returned unchanged.
        """
        if not isinstance(value, dict) or 'message' not in value:
            return value

        # NOTE: langchain's load turns the serialized form back into a BaseMessage object.
        filterwarnings('ignore', category=LangChainBetaWarning)
        value['message'] = load(value['message'])
        return value
67 |
68 |
class MessageHistory(BaseModel):
    """Ordered list of managed messages plus a running token total."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    messages: list[ManagedMessage] = Field(default_factory=list)
    current_tokens: int = 0

    def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
        """Append the message, or insert it at *position*, and add its tokens to the total."""
        managed = ManagedMessage(message=message, metadata=metadata)
        if position is not None:
            self.messages.insert(position, managed)
        else:
            self.messages.append(managed)
        self.current_tokens += metadata.tokens

    def add_model_output(self, output: 'AgentOutput') -> None:
        """Record the model output as an AI tool call followed by an empty tool response."""
        tool_call = {
            'name': 'AgentOutput',
            'args': output.model_dump(mode='json', exclude_unset=True),
            'id': '1',
            'type': 'tool_call',
        }

        # Token counts below are fixed estimates, not measured values.
        self.add_message(
            AIMessage(content='', tool_calls=[tool_call]),
            MessageMetadata(tokens=100),
        )

        # Empty tool response answering the call above.
        self.add_message(
            ToolMessage(content='', tool_call_id='1'),
            MessageMetadata(tokens=10),
        )

    def get_messages(self) -> list[BaseMessage]:
        """Return the bare messages, without metadata."""
        return [managed.message for managed in self.messages]

    def get_total_tokens(self) -> int:
        """Return the running token total."""
        return self.current_tokens

    def remove_oldest_message(self) -> None:
        """Drop the first message that is not a system message, if there is one."""
        oldest = next(
            (index for index, managed in enumerate(self.messages) if not isinstance(managed.message, SystemMessage)),
            None,
        )
        if oldest is None:
            return
        self.current_tokens -= self.messages[oldest].metadata.tokens
        self.messages.pop(oldest)

    def remove_last_state_message(self) -> None:
        """Drop the final message when it is a human message and more than two messages exist."""
        if len(self.messages) <= 2:
            return
        if not isinstance(self.messages[-1].message, HumanMessage):
            return
        self.current_tokens -= self.messages[-1].metadata.tokens
        self.messages.pop()
127 |
128 |
class MessageManagerState(BaseModel):
    """State container for MessageManager: the message history and a tool id counter."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Starts as an empty MessageHistory.
    history: MessageHistory = Field(default_factory=MessageHistory)
    # Starts at 1.
    tool_id: int = 1
136 |
--------------------------------------------------------------------------------
/chrome-extension/panel.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | OTA user interaction data helper
8 |
9 |
10 |
11 |
12 |
13 |
14 |
(disconnected)
15 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | Event (0)
56 |
57 |
58 |
59 | Target
60 |
61 |
62 | Details
63 |
64 |
65 |
66 | Saved ✔
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
OTA Action Capturer - Settings
77 |
78 |
82 |
83 |
87 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/browser_use/agent/message_manager/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import json
4 | import logging
5 | import os
6 | from typing import Any, Optional, Type
7 |
8 | from langchain_core.messages import (
9 | AIMessage,
10 | BaseMessage,
11 | HumanMessage,
12 | SystemMessage,
13 | ToolMessage,
14 | )
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
def extract_json_from_model_output(content: str) -> dict:
    """Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON.

    Args:
        content: Raw model output; either bare JSON or JSON inside a ``` fence,
            optionally preceded by a language identifier line such as ``json``.

    Returns:
        The value produced by ``json.loads`` for the extracted text.

    Raises:
        ValueError: If the extracted text is not valid JSON.
    """
    try:
        if '```' in content:
            # Keep only the text between the first pair of fences.
            content = content.split('```')[1]
            # Drop the first line only when it is a language tag (e.g. 'json') or
            # empty. JSON that starts right after the fence and spans several
            # lines must keep its opening line.
            first_line, newline, remainder = content.partition('\n')
            if newline and not first_line.lstrip().startswith(('{', '[')):
                content = remainder
        return json.loads(content)
    except json.JSONDecodeError as e:
        logger.warning(f'Failed to parse model output: {content} {str(e)}')
        # Chain the decode error so the root cause stays visible in tracebacks.
        raise ValueError('Could not parse response.') from e
34 |
35 |
def convert_input_messages(input_messages: list[BaseMessage], model_name: Optional[str]) -> list[BaseMessage]:
    """Adapt input messages for planner models that cannot take them as-is.

    Only 'deepseek-reasoner' and models whose name contains 'deepseek-r1' are
    converted; for any other model name, or no name at all, the input list is
    returned unchanged.
    """
    needs_conversion = model_name is not None and (model_name == 'deepseek-reasoner' or 'deepseek-r1' in model_name)
    if not needs_conversion:
        return input_messages

    messages = _convert_messages_for_non_function_calling_models(input_messages)
    # Collapse runs of human messages first, then runs of AI messages.
    for message_class in (HumanMessage, AIMessage):
        messages = _merge_successive_messages(messages, message_class)
    return messages
46 |
47 |
def _convert_messages_for_non_function_calling_models(input_messages: list[BaseMessage]) -> list[BaseMessage]:
    """Rewrite tool-related messages into plain human/AI messages.

    Human and system messages pass through untouched, tool messages become
    human messages with the same content, and AI messages carrying tool calls
    become AI messages whose content is the JSON dump of those calls.

    Raises:
        ValueError: If a message is none of the four handled message types.
    """
    converted = []
    for msg in input_messages:
        if isinstance(msg, (HumanMessage, SystemMessage)):
            converted.append(msg)
            continue
        if isinstance(msg, ToolMessage):
            converted.append(HumanMessage(content=msg.content))
            continue
        if not isinstance(msg, AIMessage):
            raise ValueError(f'Unknown message type: {type(msg)}')

        # AI message: serialise tool calls into the content when there are any.
        if msg.tool_calls:
            serialized_calls = json.dumps(msg.tool_calls)
            converted.append(AIMessage(content=serialized_calls))
        else:
            converted.append(msg)
    return converted
68 |
69 |
def _merge_successive_messages(messages: list[BaseMessage], class_to_merge: Type[BaseMessage]) -> list[BaseMessage]:
    """Merge each run of consecutive ``class_to_merge`` messages into a single message.

    Some models like deepseek-reasoner do not allow multiple human messages in a
    row, so successive messages of the given class are concatenated.

    Args:
        messages: Messages in conversation order.
        class_to_merge: Message class whose consecutive instances are merged.

    Returns:
        A new list. Messages that needed no merging are the original objects; a
        merged message is a shallow copy of the first message of its run, so the
        caller's messages are never modified.
    """
    import copy

    merged_messages = []
    streak = 0
    for message in messages:
        if not isinstance(message, class_to_merge):
            merged_messages.append(message)
            streak = 0
            continue

        streak += 1
        if streak == 1:
            merged_messages.append(message)
            continue

        if streak == 2:
            # First merge of this run: switch to a copy so the caller's
            # message object is left untouched.
            merged_messages[-1] = copy.copy(merged_messages[-1])

        target = merged_messages[-1]
        content = message.content
        # For list content only the text of the first part is merged.
        addition = content[0]['text'] if isinstance(content, list) else content  # type:ignore
        if isinstance(target.content, list):
            # Build a new list instead of `+=`: in-place extension would split the
            # string into characters and modify the original message's list.
            target.content = [*target.content, {'type': 'text', 'text': addition}]
        else:
            target.content = target.content + addition
    return merged_messages
88 |
89 |
def save_conversation(input_messages: list[BaseMessage], response: Any, target: str, encoding: Optional[str] = None) -> None:
    """Write the input messages followed by the model response to ``target``.

    Missing parent directories of ``target`` are created first; the file is
    opened in write mode, so existing content is replaced.
    """
    parent_dir = os.path.dirname(target)
    if parent_dir:
        # A bare filename has no directory part, so there is nothing to create.
        os.makedirs(parent_dir, exist_ok=True)

    with open(target, 'w', encoding=encoding) as out_file:
        _write_messages_to_file(out_file, input_messages)
        _write_response_to_file(out_file, response)
104 |
105 |
def _write_messages_to_file(f: Any, messages: list[BaseMessage]) -> None:
    """Dump each message to ``f`` as a class-name header followed by its content.

    List content contributes only its text parts; string content is
    pretty-printed when it parses as JSON and written stripped otherwise.
    Every message is followed by a blank line.
    """
    for msg in messages:
        f.write(f' {msg.__class__.__name__} \n')
        body = msg.content

        if isinstance(body, list):
            # Lazily pick out the text parts; non-dict and non-text parts are skipped.
            text_parts = (part['text'] for part in body if isinstance(part, dict) and part.get('type') == 'text')
            for text in text_parts:
                f.write(text.strip() + '\n')
        elif isinstance(body, str):
            try:
                rendered = json.dumps(json.loads(body), indent=2)
            except json.JSONDecodeError:
                rendered = body.strip()
            f.write(rendered + '\n')

        f.write('\n')
123 |
124 |
def _write_response_to_file(f: Any, response: Any) -> None:
    """Append a RESPONSE header and the response as indented JSON to ``f``."""
    f.write(' RESPONSE\n')
    # Round-trip through JSON so the dump of the explicitly set fields is re-indented.
    payload = json.loads(response.model_dump_json(exclude_unset=True))
    f.write(json.dumps(payload, indent=2))
129 |
--------------------------------------------------------------------------------
/browser_use/agent/system_prompt.md:
--------------------------------------------------------------------------------
1 | You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules.
2 |
3 | # Input Format
4 | Task
5 | Previous steps
6 | Current URL
7 | Open Tabs
8 | Interactive Elements
9 | [index]text
10 | - index: Numeric identifier for interaction
11 | - type: HTML element type (button, input, etc.)
12 | - text: Element description
13 | Example:
14 | [33]
15 |
16 | - Only elements with numeric indexes in [] are interactive
17 | - elements without [] provide only context
18 |
19 | # Response Rules
20 | 1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
21 | {{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not",
22 | "memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz",
23 | "next_goal": "What needs to be done with the next immediate action"}},
24 | "action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}}
25 |
26 | 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
27 | Common action sequences:
28 | - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
29 | - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
30 | - Actions are executed in the given order
31 | - If the page changes after an action, the sequence is interrupted and you get the new state.
32 | - Only provide the action sequence until an action which changes the page state significantly.
33 | - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
34 | - only use multiple actions if it makes sense.
35 |
36 | 3. ELEMENT INTERACTION:
37 | - Only use indexes of the interactive elements
38 | - Elements marked with "[]Non-interactive text" are non-interactive
39 |
40 | 4. NAVIGATION & ERROR HANDLING:
41 | - If no suitable elements exist, use other functions to complete the task
42 | - If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.
43 | - Handle popups/cookies by accepting or closing them
44 | - Use scroll to find elements you are looking for
45 | - If you want to research something, open a new tab instead of using the current tab
46 | - If captcha pops up, try to solve it - else try a different approach
47 | - If the page is not fully loaded, use wait action
48 |
49 | 5. TASK COMPLETION:
50 | - Use the done action as the last action as soon as the ultimate task is complete
51 | - Don't use "done" before you are done with everything the user asked you, unless you have reached the last step of max_steps.
52 | - If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false!
53 | - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
54 | - Don't hallucinate actions
55 | - Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.
56 |
57 | 6. VISUAL CONTEXT:
58 | - When an image is provided, use it to understand the page layout
59 | - Bounding boxes with labels on their top right corner correspond to element indexes
60 |
61 | 7. Form filling:
62 | - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
63 |
64 | 8. Long tasks:
65 | - Keep track of the status and subresults in the memory.
66 | - You are provided with procedural memory summaries that condense previous task history (every N steps). Use these summaries to maintain context about completed actions, current progress, and next steps. The summaries appear in chronological order and contain key information about navigation history, findings, errors encountered, and current state. Refer to these summaries to avoid repeating actions and to ensure consistent progress toward the task goal.
67 |
68 | 9. Extraction:
69 | - If your task is to find information - call extract_content on the specific pages to get and store the information.
70 | Your responses must be always JSON with the specified format.
71 |
--------------------------------------------------------------------------------
/browser_use/controller/registry/views.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Dict, Type
2 |
3 | from playwright.async_api import Page
4 | from pydantic import BaseModel, ConfigDict
5 |
6 |
class RegisteredAction(BaseModel):
    """A single registered action: its name, description, callable and parameter model."""

    name: str
    description: str
    function: Callable
    param_model: Type[BaseModel]

    # Optional filters: a list of domain patterns and/or a predicate on the page
    # that decide whether the action should be available on a given page.
    domains: list[str] | None = None  # e.g. ['*.google.com', 'www.bing.com', 'yahoo.*']
    page_filter: Callable[[Page], bool] | None = None

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def prompt_description(self) -> str:
        """Render the action for the prompt: its description, then name and parameter schema."""
        hidden_keys = ['title']
        properties = self.param_model.model_json_schema()['properties']

        # Copy every parameter's schema entry without the hidden keys.
        params = {}
        for field_name, field_schema in properties.items():
            params[field_name] = {key: value for key, value in field_schema.items() if key not in hidden_keys}

        header = f'{self.description}: \n'
        return header + '{' + str(self.name) + ': ' + str(params) + '}'
34 |
35 |
class ActionModel(BaseModel):
    """Base model for dynamically created action models"""

    # this will have all the registered actions, e.g.
    # click_element = param_model = ClickElementParams
    # done = param_model = None
    #
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def get_index(self) -> int | None:
        """Return the 'index' of the first set action whose params contain one.

        Returns:
            The index value, or None when no action is set or none has an index.
        """
        # Dumped shape looks like {'clicked_element': {'index': 5}}
        for param in self.model_dump(exclude_unset=True).values():
            # Only dict params can carry an 'index' key; skip None and other values
            # instead of running a membership test on them.
            if isinstance(param, dict) and 'index' in param:
                return param['index']
        return None

    def set_index(self, index: int):
        """Overwrite the index on the params of the first set action.

        Does nothing when no action is set or its params have no ``index`` attribute.
        """
        # The first explicitly set field is treated as the action name.
        action_name = next(iter(self.model_dump(exclude_unset=True)), None)
        if action_name is None:
            # No action set: return quietly instead of leaking StopIteration.
            return
        action_params = getattr(self, action_name)

        # Update the index directly on the params model
        if hasattr(action_params, 'index'):
            action_params.index = index
66 |
67 |
class ActionRegistry(BaseModel):
    """Model representing the action registry"""

    actions: Dict[str, RegisteredAction] = {}

    @staticmethod
    def _match_domains(domains: list[str] | None, url: str) -> bool:
        """
        Match a list of domain glob patterns against a URL.

        Args:
            domains: A list of domain patterns that can include glob patterns (* wildcard).
                None means "no domain restriction".
            url: The URL to match against

        Returns:
            True if there is no restriction (domains is None) or url is empty,
            or if the URL's host matches one of the patterns; False otherwise,
            including when the URL has no host or cannot be parsed.
        """

        if domains is None or not url:
            return True

        import fnmatch
        from urllib.parse import urlparse

        try:
            # Use the parsed hostname rather than splitting netloc on ':'.
            # netloc may carry userinfo ('user:pw@host') or a bracketed IPv6
            # address; a naive split would then match the wrong text, e.g.
            # 'https://www.google.com:pw@evil.com/' would pass as google.com.
            host = urlparse(url).hostname
            if not host:
                return False

            # hostname is already lower-cased; lower-case the patterns as well so
            # matching is case-insensitive, as host names are.
            return any(fnmatch.fnmatchcase(host, domain_pattern.lower()) for domain_pattern in domains)
        except Exception:
            # Fail closed: anything unparsable is treated as not matching.
            return False

    @staticmethod
    def _match_page_filter(page_filter: Callable[[Page], bool] | None, page: Page) -> bool:
        """Match a page filter against a page; a missing filter always matches."""
        if page_filter is None:
            return True
        return page_filter(page)

    def get_prompt_description(self, page: Page | None = None) -> str:
        """Get a description of all actions for the prompt

        Args:
            page: If provided, filter actions by page using page_filter and domains.

        Returns:
            A string description of available actions.
            - If page is None: return only actions with no page_filter and no domains (for system prompt)
            - If page is provided: return only filtered actions that match the current page (excluding unfiltered actions)
        """
        if page is None:
            # For system prompt (no page provided), include only actions with no filters
            return '\n'.join(
                action.prompt_description()
                for action in self.actions.values()
                if action.page_filter is None and action.domains is None
            )

        # only include filtered actions for the current page
        filtered_actions = []
        for action in self.actions.values():
            if not (action.domains or action.page_filter):
                # skip actions with no filters, they are already included in the system prompt
                continue

            domain_is_allowed = self._match_domains(action.domains, page.url)
            page_is_allowed = self._match_page_filter(action.page_filter, page)

            if domain_is_allowed and page_is_allowed:
                filtered_actions.append(action)

        return '\n'.join(action.prompt_description() for action in filtered_actions)
150 |
--------------------------------------------------------------------------------
/chrome-extension/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
12 |
14 |
16 |
17 |
18 |
19 |
20 | OTA WAP Browser Action Capturer
21 | ======================
22 |
23 | OTA browser action capturer is a simple tool that helps you collect browser interactions such as clicking and typing, and transforms them into well-structured data for generating LLM-powered "record and replay" instructions. The action data is organized in JSON format and sent to your local WAP data-collection server.
24 |
25 | To deploy the local WAP server, please refer to: https://github.com/OTA-Tech-AI/webagentprotocol
26 | WAP (Web Agent protocol) is our standard protocol for AI Agent record-and-play inferencing.
27 |
28 |
29 | Installation
30 | -----
31 |
32 | Install our public **Chrome extension** at [WAP Browser Action Capturer](https://chromewebstore.google.com/detail/wap-browser-action-captur/chikiefojkdpmijbhepipdkadcljlbmh).
33 |
34 | If you want to install this extension locally, please refer to: https://developer.chrome.com/docs/extensions/get-started/tutorial/hello-world
35 |
36 | Usage of Action Capturer
37 | -----
38 |
39 | ### Prepare
40 |
41 | Open Chrome DevTools and navigate to the **"OTA Action Capturer"** panel and you will see:
42 |
43 |
44 |
45 | Make sure the IP address and port in Settings are correct:
46 |
47 |
48 |
49 | ### Start a record
50 |
51 | Clearly describe the task you will be working on and click "START RECORD":
52 |
53 |
54 |
55 | The capturer will record actions such as clicking, typing, navigating etc. only in the current page.
56 |
57 | If the HTML content of the page changes, the event table will show the added/removed/changed nodes. The change information will be collected and sent to your local WAP server.
58 |
59 |
60 |
61 | An example of the formatted data that you will receive in the WAP backend server looks like:
62 |
63 | ```json
64 | {
65 | "taskId": "MkCAhQsHgXn7YgaK",
66 | "type": "click",
67 | "actionTimestamp": 1746325231479,
68 | "eventTarget": {
69 | "type": "click",
70 | "target": "\n
18 |
19 | # Web Agent Protocol
20 |
21 | ## Overview
22 |
23 | The Web Agent Protocol (WAP) is a standardized framework designed to enable seamless interaction between users, web agents, and browsers by recording and replaying browser actions. It separates the concerns of action recording and execution, allowing for efficient automation and reusability. The Python SDK for WAP implements the full specification, making it easy to:
24 |
25 | 1. **Collect** user‑interaction data with the [OTA‑WAP Chrome extension](https://github.com/OTA-Tech-AI/webagentprotocol/tree/main/chrome-extension).
26 | 2. **Convert** the raw event stream into either **_exact‑replay_** or **_smart‑replay_** action lists.
27 | 3. **Convert** recorded actions into **_MCP_** servers for reuse by any agent or user
28 | 4. **Replay** those lists using the **_WAP-Replay_** protocol to ensure accurate browser operations.
29 |
30 | ### WAP FULL DEMO
31 |
32 | [](https://www.youtube.com/watch?v=joh9FXJfnwk)
33 |
34 | ### Without WAP
35 | 
36 |
37 | ### WAP Record
38 | 
39 |
40 | ### WAP Replay
41 | 
42 |
43 | ## Example using WAP
44 | 
45 |
46 | ## Setup
47 | Install the dependencies with the following command:
48 |
49 | Create a conda env
50 |
51 | ```bash
52 | conda create -n WAP python=3.11
53 | ```
54 |
55 | Activate the conda env
56 |
57 | ```bash
58 | conda activate WAP
59 | ```
60 |
61 | Install the dependencies
62 |
63 | ```bash
64 | pip install -r requirements.txt
65 | ```
66 |
67 | Set up your repo source path:
68 | ```
69 | set PYTHONPATH=C:/path/to/webagentprotocol # for Windows
70 | export PYTHONPATH=/path/to/webagentprotocol # for Linux
71 | ```
72 |
73 | Create **.env** file under the repo root directory with your own API keys:
74 | ```
75 | OPENAI_API_KEY=sk-proj-...
76 | DEEPSEEK_API_KEY=sk-...
77 | ```
78 |
79 | ## Record
80 |
81 | ### WAP record extension
82 | Please refer to [OTA‑WAP Chrome Extension](https://github.com/OTA-Tech-AI/webagentprotocol/tree/main/chrome-extension) to set up the action capturer in your Chrome browser.
83 |
84 | ### Start data‑collection server
85 |
86 | Run the following command to start the server to collect data from the extension:
87 | ```bash
88 | python action_collect_server.py
89 | ```
90 | **Once the server is up, you can start to record from the page using WAP Chrome extension.**
91 |
92 | The server listens on http://localhost:4934/action-data by default. Please make sure the Host and Port in the extension settings match this server config.
93 | Each session will be saved to:
94 |
95 | ```bash
96 | data/YYYYMMDD/taskid/summary_event_.json
97 | ```
98 |
99 | An example of the formatted data that you will receive in the WAP backend server looks like:
100 |
101 | ```json
102 | {
103 | "taskId": "MkCAhQsHgXn7YgaK",
104 | "type": "click",
105 | "actionTimestamp": 1746325231479,
106 | "eventTarget": {
107 | "type": "click",
108 | "target": "\n