├── utils
    ├── __init__.py
    ├── html_cleaner.py
    ├── llm.py
    ├── action_processing.py
    └── subgoal_generator.py
├── browser_use
    ├── dom
    │   ├── __init__.py
    │   ├── tests
    │   │   ├── process_dom_test.py
    │   │   ├── debug_page_structure.py
    │   │   └── extraction_test.py
    │   ├── history_tree_processor
    │   │   ├── view.py
    │   │   └── service.py
    │   ├── views.py
    │   └── service.py
    ├── agent
    │   ├── memory
    │   │   ├── __init__.py
    │   │   └── service.py
    │   ├── system_prompt_wap_replay.md
    │   ├── message_manager
    │   │   ├── views.py
    │   │   ├── utils.py
    │   │   └── tests.py
    │   ├── system_prompt.md
    │   ├── tests.py
    │   └── prompts.py
    ├── exceptions.py
    ├── controller
    │   ├── views_selector.py
    │   ├── views.py
    │   └── registry
    │   │   ├── views.py
    │   │   └── service.py
    ├── __init__.py
    ├── browser
    │   ├── tests
    │   │   ├── screenshot_test.py
    │   │   └── test_clicks.py
    │   ├── utils
    │   │   └── screen_resolution.py
    │   └── views.py
    ├── telemetry
    │   ├── views.py
    │   └── service.py
    └── logging_config.py
├── chrome-extension
    ├── .gitignore
    ├── assets
    │   ├── pause.gif
    │   ├── panelUI.png
    │   ├── settings.gif
    │   ├── recording.gif
    │   ├── start-record.gif
    │   └── beholder-tool-kit-long.png
    ├── ico
    │   ├── ota-logo-48.png
    │   └── ota-logo-128.png
    ├── other
    │   └── Raleway.woff2
    ├── js
    │   ├── devtools.js
    │   ├── ScrollHelper.js
    │   ├── ContentScriptProxy.js
    │   ├── specialEventHandler.js
    │   └── EventTable.js
    ├── devtools.html
    ├── .editorconfig
    ├── manifest.json
    ├── package.json
    ├── Gruntfile.js
    ├── panel.html
    ├── README.md
    └── css
    │   ├── panel.css
    │   └── normalize.css
├── assets
    └── wap_replay_tool_demo.gif
├── .gitignore
├── prompts
    └── subgoal_generation
    │   ├── task-start.md
    │   ├── task-finish.md
    │   ├── submit.md
    │   ├── go-back-or-forward.md
    │   └── common.md
├── wap_service.py
├── LICENSE
├── data_samples
    └── replay_list_samples
    │   ├── wap_smart_replay_list_y757R6w6y17LVHXl.json
    │   ├── wap_exact_replay_list_l8vZDGTfw3qu3GBs.json
    │   ├── wap_exact_replay_list_GqMnZeKFxvePGKGA.json
    │   └── wap_exact_replay_list_LhTyE4ie0s5a1W6J.json
├── action_collect_server.py
├── mcp_servers
    └── find_top_rated_keyboard_amazon_ca_y757R6w6y17LVHXl_mcp_server.py
├── requirements.txt
├── wap_replay
    ├── generate_smart_replay_list.py
    ├── generate_exact_replay_list.py
    └── generate_mcp_server.py
└── README.md


/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/browser_use/dom/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/chrome-extension/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea
3 | wap_browser_action_capturer-*.zip
4 | 


--------------------------------------------------------------------------------
/assets/wap_replay_tool_demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/assets/wap_replay_tool_demo.gif


--------------------------------------------------------------------------------
/chrome-extension/assets/pause.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/assets/pause.gif


--------------------------------------------------------------------------------
/chrome-extension/assets/panelUI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/assets/panelUI.png


--------------------------------------------------------------------------------
/chrome-extension/assets/settings.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/assets/settings.gif


--------------------------------------------------------------------------------
/chrome-extension/ico/ota-logo-48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/ico/ota-logo-48.png


--------------------------------------------------------------------------------
/chrome-extension/other/Raleway.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/other/Raleway.woff2


--------------------------------------------------------------------------------
/chrome-extension/assets/recording.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/assets/recording.gif


--------------------------------------------------------------------------------
/chrome-extension/ico/ota-logo-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/ico/ota-logo-128.png


--------------------------------------------------------------------------------
/chrome-extension/assets/start-record.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/assets/start-record.gif


--------------------------------------------------------------------------------
/chrome-extension/js/devtools.js:
--------------------------------------------------------------------------------
// Register the DevTools panel that hosts panel.html. The icon path points at
// the extension's bundled 128px logo (the same file manifest.json declares).
chrome.devtools.panels.create("OTA Action Capturer", "ico/ota-logo-128.png", "panel.html", function (panel) {});
2 | 


--------------------------------------------------------------------------------
/browser_use/agent/memory/__init__.py:
--------------------------------------------------------------------------------
1 | from browser_use.agent.memory.service import Memory, MemorySettings
2 | 
3 | __all__ = ['Memory', 'MemorySettings']
4 | 


--------------------------------------------------------------------------------
/chrome-extension/assets/beholder-tool-kit-long.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/assets/beholder-tool-kit-long.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | 
 3 | cookies*.json
 4 | element_tree.txt
 5 | html/
 6 | data/
 7 | results/
 8 | ota_collect_data/
 9 | subgoals/
10 | .env
11 | data_processed/


--------------------------------------------------------------------------------
/browser_use/exceptions.py:
--------------------------------------------------------------------------------
class LLMException(Exception):
	"""Exception carrying a status code and a message.

	Both values are kept as attributes, and the exception's string form is
	``Error <status_code>: <message>``.
	"""

	def __init__(self, status_code, message):
		description = f'Error {status_code}: {message}'
		super().__init__(description)
		self.status_code = status_code
		self.message = message
6 | 


--------------------------------------------------------------------------------
/chrome-extension/devtools.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <title>OTA user interaction data helper</title>
 5 |     <script src='js/devtools.js'></script>
 6 | </head>
 7 | <body>
 8 | </body>
 9 | </html>
10 | 


--------------------------------------------------------------------------------
/browser_use/controller/views_selector.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from pydantic import BaseModel, ConfigDict, Field, model_validator
 4 | 
 5 | class InputTextBySelectorAction(BaseModel):
 6 | 	selector: str
 7 | 	text: str
 8 | 	xpath: Optional[str] = None
 9 | 
10 | 
class Position(BaseModel):
	# Pair of required integer coordinates.
	# NOTE(review): units/origin (presumably page or viewport pixels) are not
	# visible here -- confirm against the caller.
	x: int
	y: int
14 | 


--------------------------------------------------------------------------------
/chrome-extension/.editorconfig:
--------------------------------------------------------------------------------
 1 | # top-most EditorConfig file
 2 | root = true
 3 | 
 4 | # Newline ending every file
 5 | [*]
 6 | end_of_line = lf
 7 | insert_final_newline = true
 8 | 
 9 | # Charset
10 | [*.{js,html,css,md,json}]
11 | charset = utf-8
12 | 
13 | # Indentation
14 | [*.{js,html,css,json}]
15 | indent_style = space
16 | indent_size = 4


--------------------------------------------------------------------------------
/prompts/subgoal_generation/task-start.md:
--------------------------------------------------------------------------------
 1 | I need your help with an analysis to an action in browser and its related changes.
 2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is:
 3 | 
 4 | {{ ultimate_goal }}
 5 | 
 6 | The user just started this task, and when he clicked "task start" button, his current page is at:
 7 | 
 8 | {{ change_events }}
 9 | 
10 | based on this information, provide a concise and formatted instruction in JSON to make another agent to know which website it needs to go to, e.g.:
11 | {"next_goal": "Open allrecipes.com in a new tab to search for the recipe."}


--------------------------------------------------------------------------------
/chrome-extension/manifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"manifest_version": 3,
 3 | 	"name": "WAP Browser Action Capturer",
 4 | 	"version": "1.0",
 5 | 	"description": "A simple tool helping you to collect the interactions with browser for WAP replay.",
 6 | 	"icons": {
 7 | 		"128": "ico/ota-logo-128.png",
 8 | 		"48": "ico/ota-logo-48.png"
 9 | 	},
10 | 	"permissions": [
11 | 		"activeTab",
12 | 		"webNavigation",
13 | 		"scripting",
14 | 		"storage"
15 | 	],
16 | 	"optional_host_permissions": [
17 | 		"*://*/*"
18 | 	],
19 | 	"background": {
20 | 		"service_worker": "js/background.js"
21 | 	},
22 | 	"devtools_page": "devtools.html",
23 | 	"content_security_policy": {
24 | 		"extension_pages": "script-src 'self'; object-src 'self'"
25 | 	}
26 |   }
27 |   


--------------------------------------------------------------------------------
/chrome-extension/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "WAP-Browser-Action-Capturer",
 3 |   "version": "1.3.0",
 4 |   "description": "A simple tool helping you to collect the interactions with browser for WAP replay",
 5 |   "main": "Gruntfile.js",
 6 |   "repository": {
 7 |     "type": "git",
 8 |     "url": "git://github.com/OTA-Tech-AI/webagentprotocol.git"
 9 |   },
10 |   "author": "Konrad Dzwinel",
11 |   "license": "GPL",
12 |   "bugs": {
13 |     "url": "https://github.com/OTA-Tech-AI/webagentprotocol/issues"
14 |   },
15 |   "homepage": "https://github.com/OTA-Tech-AI/webagentprotocol",
16 |   "dependencies": {
17 |     "grunt": "^1.0.3",
18 |     "grunt-contrib-csslint": "^2.0.0",
19 |     "grunt-contrib-jshint": "^1.1.0",
20 |     "grunt-contrib-watch": "^1.1.0",
21 |     "grunt-zip": "^0.17.1"
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/browser_use/__init__.py:
--------------------------------------------------------------------------------
 1 | from browser_use.logging_config import setup_logging
 2 | 
 3 | setup_logging()
 4 | 
 5 | from browser_use.agent.prompts import SystemPrompt as SystemPrompt
 6 | from browser_use.agent.service import Agent as Agent
 7 | from browser_use.agent.views import ActionModel as ActionModel
 8 | from browser_use.agent.views import ActionResult as ActionResult
 9 | from browser_use.agent.views import AgentHistoryList as AgentHistoryList
10 | from browser_use.browser.browser import Browser as Browser
11 | from browser_use.browser.browser import BrowserConfig as BrowserConfig
12 | from browser_use.browser.context import BrowserContextConfig
13 | from browser_use.controller.service import Controller as Controller
14 | from browser_use.dom.service import DomService as DomService
15 | 
16 | __all__ = [
17 | 	'Agent',
18 | 	'Browser',
19 | 	'BrowserConfig',
20 | 	'Controller',
21 | 	'DomService',
22 | 	'SystemPrompt',
23 | 	'ActionResult',
24 | 	'ActionModel',
25 | 	'AgentHistoryList',
26 | 	'BrowserContextConfig',
27 | ]
28 | 


--------------------------------------------------------------------------------
/wap_service.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, request
 2 | import run_replay
 3 | 
 4 | app = Flask(__name__)
 5 | 
 6 | @app.route('/replay', methods=['GET'])
 7 | async def run_replay_endpoint():
 8 |     try:
 9 |         # Get parameters from query string
10 |         iterations = int(request.args.get('concurrent')) 
11 |         model = request.args.get('model') 
12 |         file_path = request.args.get('file_path') 
13 | 
14 |         # Validate required parameters
15 |         if not model or not file_path:
16 |             return {"status": "error", "message": "Model and file_path are required"}, 400
17 | 
18 |         await run_replay.main(iterations, model, file_path)
19 |         return {"status": "success", "message": "Replay executed successfully"}
20 |     except ValueError as ve:
21 |         return {"status": "error", "message": "Invalid iterations value: must be an integer"}, 400
22 |     except Exception as e:
23 |         return {"status": "error", "message": str(e)}, 500
24 | 
if __name__ == '__main__':
    # When executed as a script, serve the app on every interface at port 3089.
    app.run(host='0.0.0.0', port=3089)


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 OTA-Tech-AI
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/data_samples/replay_list_samples/wap_smart_replay_list_y757R6w6y17LVHXl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ultimate_goal": "find a top rated keyboard on amazon.ca",
 3 |   "task_id": "y757R6w6y17LVHXl",
 4 |   "type": "smart_replay",
 5 |   "subgoal_list": [
 6 |     {
 7 |       "index": 0,
 8 |       "subgoal": "task starts, go for the next sub-goal"
 9 |     },
10 |     {
11 |       "index": 1,
12 |       "subgoal": "Search for 'top rated keyboard' on amazon.ca to find the best options."
13 |     },
14 |     {
15 |       "index": 2,
16 |       "subgoal": "Enter 'keyboard' as the search term in the search input field and press enter key."
17 |     },
18 |     {
19 |       "index": 3,
20 |       "subgoal": "Click on the dropdown labeled 'Sort by:' to change sorting options."
21 |     },
22 |     {
23 |       "index": 4,
24 |       "subgoal": "Click on the option labeled 'Avg. customer review' in the sort dropdown menu."
25 |     },
26 |     {
27 |       "index": 5,
28 |       "subgoal": "Click on the first product"
29 |     },
30 |     {
31 |       "index": 6,
32 |       "subgoal": "GOAL-NOT-ACHIEVED"
33 |     },
34 |     {
35 |       "index": 7,
36 |       "subgoal": "task done"
37 |     }
38 |   ]
39 | }


--------------------------------------------------------------------------------
/browser_use/browser/tests/screenshot_test.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import base64
 3 | 
 4 | import pytest
 5 | 
 6 | from browser_use.browser.browser import Browser, BrowserConfig
 7 | 
 8 | 
 9 | async def test_take_full_page_screenshot():
10 | 	browser = Browser(config=BrowserConfig(headless=False, disable_security=True))
11 | 	try:
12 | 		async with await browser.new_context() as context:
13 | 			page = await context.get_current_page()
14 | 			# Go to a test page
15 | 			await page.goto('https://example.com')
16 | 
17 | 			await asyncio.sleep(3)
18 | 			# Take full page screenshot
19 | 			screenshot_b64 = await context.take_screenshot(full_page=True)
20 | 			await asyncio.sleep(3)
21 | 			# Verify screenshot is not empty and is valid base64
22 | 			assert screenshot_b64 is not None
23 | 			assert isinstance(screenshot_b64, str)
24 | 			assert len(screenshot_b64) > 0
25 | 
26 | 			# Test we can decode the base64 string
27 | 			try:
28 | 				base64.b64decode(screenshot_b64)
29 | 			except Exception as e:
30 | 				pytest.fail(f'Failed to decode base64 screenshot: {str(e)}')
31 | 	finally:
32 | 		await browser.close()
33 | 
34 | 
if __name__ == '__main__':
	# Allow running this check directly, without a test runner.
	asyncio.run(test_take_full_page_screenshot())
37 | 


--------------------------------------------------------------------------------
/chrome-extension/Gruntfile.js:
--------------------------------------------------------------------------------
 1 | module.exports = function (grunt) {
 2 |   "use strict";
 3 | 
 4 |   grunt.initConfig({
 5 |     pkg: grunt.file.readJSON('package.json'),
 6 |     jshint: {
 7 |       files: ['Gruntfile.js', 'js/**/*.js'],
 8 |       options: {
 9 |         esversion: 6,
10 |         evil: true,
11 |         camelcase: true,
12 |         curly: true,
13 |         eqeqeq: true,
14 |         noempty: true,
15 |         strict: true,
16 |         loopfunc: true,
17 |         globals: {
18 |           console: true,
19 |           document: true
20 |         }
21 |       }
22 |     },
23 |     csslint: {
24 |       src: ['css/*.css'],
25 |       options: {
26 |         ids: false,
27 |         'compatible-vendor-prefixes': false,
28 |         'fallback-colors': false
29 |       }
30 |     },
31 |     zip: {
32 |       'wap_browser_action_capturer-<%= pkg.version %>.zip': ['css/**/*', 'ico/logo_*.png', 'js/**/*', 'other/**/*', '*.html', 'manifest.json']
33 |     }
34 |   });
35 | 
36 |   grunt.loadNpmTasks('grunt-contrib-jshint');
37 |   grunt.loadNpmTasks('grunt-contrib-csslint');
38 |   grunt.loadNpmTasks('grunt-zip');
39 | 
40 |   grunt.registerTask('default', ['jshint']);
41 |   grunt.registerTask('prod', ['zip']);
42 | };
43 | 


--------------------------------------------------------------------------------
/prompts/subgoal_generation/task-finish.md:
--------------------------------------------------------------------------------
 1 | I need your help with an analysis to an action in browser and its related changes.
 2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is:
 3 | 
 4 | {{ ultimate_goal }}
 5 | 
 6 | now the user has already finished the task, and the final page content that he submitted to complete task is: 
 7 | 
 8 | {{ page_content }}
 9 | 
10 | based on this content, please tell me: do you think this task is really finished?
11 | Provide a concise and formatted instruction in JSON to make another agent to know what to do, you have several options:
12 | 
13 | 1. if the ultimate goal has been achieved by the current action and no other actions need to be executed or no information needs to be delivered to the user, only reply a 'done' message, e.g.: {"next_goal": "The ultimate task is done"}
14 | 
15 | 2. if the ultimate goal has been achieved but we need to extract information from the current page content to respond to the user's demands, reply a content extraction message, e.g.: {"next_goal": "extract the cook time and prepare time from the page content"}
16 | 
17 | 3. if the ultimate goal has NOT been achieved, please reply a failure message, e.g.: {"next_goal": "GOAL-NOT-ACHIEVED", "reason": "the cook time is longer than expected ..."}


--------------------------------------------------------------------------------
/chrome-extension/js/ScrollHelper.js:
--------------------------------------------------------------------------------
 1 | (function () {
 2 |     "use strict";
 3 | 
 4 |     function ScrollHelper(button) {
 5 |         var scrollHelper = this;
 6 | 
 7 |         this._button = button;
 8 | 
 9 |         var scrollPos = 0;
10 |         document.addEventListener('scroll', function () {
11 |             scrollPos = document.body.scrollTop;
12 |         });
13 | 
14 |         function updateBtn() {
15 |             if (scrollPos > 0) {
16 |                 scrollHelper.showButton();
17 |             } else {
18 |                 scrollHelper.hideButton();
19 |             }
20 |             requestAnimationFrame(updateBtn);
21 |         }
22 | 
23 |         updateBtn();
24 |     }
25 | 
26 |     ScrollHelper.prototype.hideButton = function () {
27 |         this._button.classList.add('hidden');
28 |     };
29 | 
30 |     ScrollHelper.prototype.showButton = function () {
31 |         this._button.classList.remove('hidden');
32 |     };
33 | 
34 |     ScrollHelper.prototype.scrollToTheTop = function () {
35 |         var scrollPos = document.body.scrollTop;
36 | 
37 |         if (scrollPos > 0) {
38 |             document.body.scrollTop -= (scrollPos > 10) ? (scrollPos / 4) : 10;
39 |             requestAnimationFrame(this.scrollToTheTop.bind(this));
40 |         }
41 |     };
42 | 
43 |     window.ScrollHelper = ScrollHelper;
44 | })();
45 | 


--------------------------------------------------------------------------------
/browser_use/dom/tests/process_dom_test.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import time
 4 | 
 5 | from browser_use.browser.browser import Browser, BrowserConfig
 6 | 
 7 | 
 8 | async def test_process_dom():
 9 | 	browser = Browser(config=BrowserConfig(headless=False))
10 | 
11 | 	async with await browser.new_context() as context:
12 | 		page = await context.get_current_page()
13 | 		await page.goto('https://kayak.com/flights')
14 | 		# await page.goto('https://google.com/flights')
15 | 		# await page.goto('https://immobilienscout24.de')
16 | 		# await page.goto('https://seleniumbase.io/w3schools/iframes')
17 | 
18 | 		time.sleep(3)
19 | 
20 | 		with open('browser_use/dom/buildDomTree.js', 'r') as f:
21 | 			js_code = f.read()
22 | 
23 | 		start = time.time()
24 | 		dom_tree = await page.evaluate(js_code)
25 | 		end = time.time()
26 | 
27 | 		# print(dom_tree)
28 | 		print(f'Time: {end - start:.2f}s')
29 | 
30 | 		os.makedirs('./tmp', exist_ok=True)
31 | 		with open('./tmp/dom.json', 'w') as f:
32 | 			json.dump(dom_tree, f, indent=1)
33 | 
34 | 		# both of these work for immobilienscout24.de
35 | 		# await page.click('.sc-dcJsrY.ezjNCe')
36 | 		# await page.click(
37 | 		# 	'div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div > div > button:nth-of-type(2)'
38 | 		# )
39 | 
40 | 		input('Press Enter to continue...')
41 | 


--------------------------------------------------------------------------------
/prompts/subgoal_generation/submit.md:
--------------------------------------------------------------------------------
 1 | I need your help with an analysis to an action in browser and its related changes.
 2 | We recorded an action of 'submit' by the user with his browser for the current task. His ultimate goal is:
 3 | 
 4 | {{ ultimate_goal }}
 5 | 
 6 | So here is the basic information of the action for the 'submit' in current sub-task:
 7 | 
 8 | {{ action }}
 9 | 
10 | note that "target" is the targeted element for this action.
11 | Here is the detailed information about the form values that the user submitted:
12 | 
13 | {{ change_events }}
14 | 
15 | note that in some "nodeinfo", #rme means there are more children inside this tag pair, but we hide them to shorten the context.
16 | You should think about what is the purpose of this action by the user, and think about what is the goal this user is trying to achieve in the current sub-task. You don't need to tell me your thought process. You only need to give me a final reply which is a concise and formatted instruction in JSON to make another agent to understand this sub-goal and reproduce the action. Do not mention the details of the target. If the submission is a search, you need to provide two actions, input change and press enter key, e.g.:
17 | {"next_goal": "Enter 'Singapore' as the destination in the search input field and press enter key."}
18 | {"next_goal": "Click on the button with text 'Dinners' to view more options for cooking dinners at home"}
19 | 


--------------------------------------------------------------------------------
/action_collect_server.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import datetime
 3 | import argparse
 4 | from flask import Flask, request, jsonify
 5 | from flask_cors import CORS
 6 | 
 7 | app = Flask(__name__)
 8 | CORS(app)
 9 | 
def mkdir_n_define_file_name(data_root_dir, task_name):
    """Create the folder for an event file and return the file's path.

    The folder is ``<data_root_dir>/<YYYYMMDD>/<task_name>`` and the file is
    named ``summary_event_<YYYYMMDD_HHMMSS>.json``, both taken from the
    current local time. Only the folder is created; the file is not.

    Args:
        data_root_dir: directory under which the dated folders live.
        task_name: sub-folder name placed after the date folder.

    Returns:
        The path of the (not yet written) event file.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    date_folder = timestamp.split('_')[0]
    # Include task_name after the date folder
    folderpath = os.path.join(data_root_dir, date_folder, task_name)
    # exist_ok avoids the check-then-create race when two requests for the
    # same task arrive at once.
    os.makedirs(folderpath, exist_ok=True)
    return os.path.join(folderpath, f"summary_event_{timestamp}.json")
20 | 
@app.route('/action-data', methods=['POST'])
def handle_event():
    """Save one posted JSON event to a file under ``data/<date>/<taskId>/``.

    Returns:
        A JSON response with status 400 when the request is not JSON or its
        ``taskId`` is missing or unusable as a folder name, and 200 with the
        saved path otherwise.
    """
    import json

    if not request.is_json:
        return jsonify({"status": "error", "message": "Request must be JSON"}), 400

    event_data = request.get_json()
    task_id = event_data.get("taskId") if isinstance(event_data, dict) else None

    # taskId comes from the client and becomes a folder name, so accept only a
    # single plain path component: no separators, no drive prefix, and not
    # "." / "..", which would let a request write outside the data directory.
    is_plain_name = (
        isinstance(task_id, str)
        and task_id not in ("", ".", "..")
        and os.path.basename(task_id) == task_id
    )
    if not is_plain_name:
        return jsonify({"status": "error", "message": "taskId must be a non-empty name without path separators"}), 400

    filepath = mkdir_n_define_file_name("data", task_id)

    with open(filepath, "w", encoding='utf-8') as json_file:
        json.dump(event_data, json_file, indent=2)

    return jsonify({"status": "success", "message": f"Event received and saved as {filepath}"}), 200
35 | 
if __name__ == '__main__':
    # Run the Flask app. The server listens on every interface, and Flask's
    # debug mode exposes an interactive debugger that can execute arbitrary
    # code, so debug mode is opt-in (FLASK_DEBUG=1) instead of always on.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1", host='0.0.0.0', port=4934)


--------------------------------------------------------------------------------
/prompts/subgoal_generation/go-back-or-forward.md:
--------------------------------------------------------------------------------
 1 | I need your help with an analysis to an action in browser and its related changes.
 2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is:
 3 | 
 4 | {{ ultimate_goal }}
 5 | 
 6 | In the current sub-task, the user clicked on "go back" or "go forward" button of the browser, it is possible that he didn't find the information he needed in the current page, or he may want to 
 7 | confirm information in the previous page. This is the information of this action:
 8 | 
 9 | {{ action }}
10 | 
11 | The content before he goes back or forward is:
12 | 
13 | {{ page_content }}
14 | 
15 | note that sometimes in the page content, you will see #rme and it means there are more children inside this tag but we hide it for shortening contexts.
16 | You should think about what is the purpose of this action by the user. You don't need to tell me your thought process. You only need to give me a final reply which is a concise and formatted instruction in JSON to make another agent to understand this sub-goal and reproduce the action.
17 | Do not mention "go back" or "go forward" because it is unclear. Tell which URL it should navigate to. e.g.:
18 | {"next_goal": "Navigate to https://www.allrecipes.com/search?q=baked+salmon to review search results for 'baked salmon' recipes on Allrecipes."}
19 | {"next_goal": "Navigate to google.com to search for keywords spanish restaurants."}


--------------------------------------------------------------------------------
/prompts/subgoal_generation/common.md:
--------------------------------------------------------------------------------
 1 | I need your help with an analysis to an action in browser and its related changes.
 2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is:
 3 | 
 4 | {{ ultimate_goal }}
 5 | 
 6 | So here is the basic information of the action for the current sub-task that a user takes in browser:
 7 | 
 8 | {{ action }}
 9 | 
10 | note that "target" is the targeted element for this action.
11 | This is what happened in the browser DOM before and after this action:
12 | 
13 | {{ change_events }}
14 | 
15 | note that in some "nodeinfo", #rme means there are more children inside this tag pair, but we hide them to shorten the context.
16 | You should think about what is the purpose of this action by the user, and think about what is the goal this user is trying to achieve in the current sub-task. You don't need to tell me your thought process. You only need to give me a final reply which is a concise and formatted instruction in JSON to make another agent to understand this sub-goal and reproduce the action. Subgoal should be generalized and fit the ultimate goal. Only include one action in the subgoal, do not explain action. Only include one action (verb) in the subgoal! If you want to use "and", only keep the first action. e.g.:
17 | {"next_goal": "Enter 'Singapore' as the destination in the search input field."}
18 | {"next_goal": "Click on the button with text 'Dinners'"}
19 | {"next_goal": "Click on the first item"}
20 | 


--------------------------------------------------------------------------------
/mcp_servers/find_top_rated_keyboard_amazon_ca_y757R6w6y17LVHXl_mcp_server.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from mcp.server.fastmcp import FastMCP
 3 | import httpx
 4 | 
 5 | mcp = FastMCP("find a top rated keyboard on amazon.ca")
 6 | 
 7 | @mcp.tool()
 8 | async def find_top_rated_keyboard_amazon_ca_smart_replay() -> str:
 9 |     """smart replay: find a top rated keyboard on amazon.ca"""
10 |     async with httpx.AsyncClient(timeout=600.0) as client:
11 |         response = await client.get(
12 |             "http://localhost:3089/replay",
13 |             params={
14 |                 "concurrent": 1,
15 |                 "model": "openai",
16 |                 "file_path": 'data_processed/smart_replay/wap_smart_replay_list_y757R6w6y17LVHXl.json'
17 |             }
18 |         )
19 |         return response.text
20 |     return "FAILED"
21 | 
@mcp.tool()
async def find_top_rated_keyboard_amazon_ca_exact_replay() -> str:
    """exact replay: find a top rated keyboard on amazon.ca"""
    # NOTE(review): the docstring above is deliberately left byte-identical;
    # presumably FastMCP exposes it as the tool description - confirm.
    #
    # Sends one GET to the local replay service with the exact-replay action
    # list for this task and returns the response body as text. The generous
    # 600 s timeout is kept from the original.
    query = {
        "concurrent": 1,
        "model": "openai",
        "file_path": 'data_processed/exact_replay/wap_exact_replay_list_y757R6w6y17LVHXl.json'
    }
    try:
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.get("http://localhost:3089/replay", params=query)
    except httpx.HTTPError:
        # Connection failures and timeouts now produce the "FAILED" marker
        # that the original tried to return from an unreachable statement.
        return "FAILED"
    return response.text
36 | 
# When executed as a script, run the server using the "stdio" transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")
39 | 


--------------------------------------------------------------------------------
/browser_use/browser/utils/screen_resolution.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | 
 4 | def get_screen_resolution():
 5 | 	if sys.platform == 'darwin':  # macOS
 6 | 		try:
 7 | 			from AppKit import NSScreen
 8 | 
 9 | 			screen = NSScreen.mainScreen().frame()
10 | 			return {'width': int(screen.size.width), 'height': int(screen.size.height)}
11 | 		except ImportError:
12 | 			print('AppKit is not available. Make sure you are running this on macOS with pyobjc installed.')
13 | 		except Exception as e:
14 | 			print(f'Error retrieving macOS screen resolution: {e}')
15 | 		return {'width': 2560, 'height': 1664}
16 | 
17 | 	else:  # Windows & Linux
18 | 		try:
19 | 			from screeninfo import get_monitors
20 | 
21 | 			monitors = get_monitors()
22 | 			if not monitors:
23 | 				raise Exception('No monitors detected.')
24 | 			monitor = monitors[0]
25 | 			return {'width': monitor.width, 'height': monitor.height}
26 | 		except ImportError:
27 | 			print("screeninfo package not found. Install it using 'pip install screeninfo'.")
28 | 		except Exception as e:
29 | 			print(f'Error retrieving screen resolution: {e}')
30 | 
31 | 		return {'width': 1920, 'height': 1080}
32 | 
33 | 
def get_window_adjustments():
	"""Returns recommended x, y offsets for window positioning"""
	offsets_by_platform = {
		'darwin': (-4, 24),  # macOS has a small title bar, no border
		'win32': (-8, 0),  # Windows has a border on the left
	}
	# Anything else (Linux) needs no correction.
	return offsets_by_platform.get(sys.platform, (0, 0))
42 | 


--------------------------------------------------------------------------------
/chrome-extension/js/ContentScriptProxy.js:
--------------------------------------------------------------------------------
 1 | (function () {
 2 |     "use strict";
 3 | 
 4 |     function callCommand(cmd) {
 5 |         chrome.devtools.inspectedWindow.eval(
 6 |             cmd,
 7 |             {useContentScriptContext: true},
 8 |             function (isException, result) {
 9 |                 if (isException || chrome.runtime.lastError) {
10 |                     console.error('Content script command call failed.', cmd, result, chrome.runtime.lastError);
11 |                 }
12 |             }
13 |         );
14 |     }
15 | 
16 | 	function jsArg(str) {
17 | 		// safely quote argument for eval
18 | 		return JSON.stringify(str);
19 | 	}
20 | 
21 |     window.ContentScriptProxy = {
22 |         inspectNode: function (nodeId) {
23 |             callCommand('inspect(domListenerExtension.getNode(' + nodeId + '))');
24 |         },
25 |         highlightNode: function (nodeId) {
26 |             callCommand('domListenerExtension.highlightNode(' + nodeId + ')');
27 |         },
28 |         startRecording: function (desc) {
29 |             callCommand(`domListenerExtension.startTaskRecording(${jsArg(desc)})`);
30 |         },
31 |         pauseRecording: function () {
32 |             callCommand('domListenerExtension.pauseTaskRecording()');
33 |         },
34 | 		resumeRecording: function (desc) {
35 | 			callCommand(`domListenerExtension.resumeTaskRecording(${jsArg(desc)})`);
36 | 		},
37 | 		finishRecording: function () {
38 | 			callCommand('domListenerExtension.finishTaskRecording()');
39 | 		}
40 |     };
41 | })();
42 | 


--------------------------------------------------------------------------------
/browser_use/telemetry/views.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from dataclasses import asdict, dataclass
 3 | from typing import Any, Dict, Sequence
 4 | 
 5 | 
 6 | @dataclass
 7 | class BaseTelemetryEvent(ABC):
 8 | 	@property
 9 | 	@abstractmethod
10 | 	def name(self) -> str:
11 | 		pass
12 | 
13 | 	@property
14 | 	def properties(self) -> Dict[str, Any]:
15 | 		return {k: v for k, v in asdict(self).items() if k != 'name'}
16 | 
17 | 
@dataclass
class RegisteredFunction:
	"""A function name together with its parameter mapping.

	Used as the element type of
	``ControllerRegisteredFunctionsTelemetryEvent.registered_functions``.
	"""

	name: str
	params: dict[str, Any]
22 | 
23 | 
@dataclass
class ControllerRegisteredFunctionsTelemetryEvent(BaseTelemetryEvent):
	"""Telemetry event named ``controller_registered_functions`` carrying a list of RegisteredFunction."""

	registered_functions: list[RegisteredFunction]
	# Defaulted field that satisfies the abstract ``name`` property of the base class.
	name: str = 'controller_registered_functions'
28 | 
29 | 
@dataclass
class AgentStepTelemetryEvent(BaseTelemetryEvent):
	"""Telemetry event named ``agent_step``: step number, errors, failure count and actions for one agent."""

	agent_id: str
	step: int
	step_error: list[str]
	consecutive_failures: int
	actions: list[dict]
	# Defaulted field that satisfies the abstract ``name`` property of the base class.
	name: str = 'agent_step'
38 | 
39 | 
@dataclass
class AgentRunTelemetryEvent(BaseTelemetryEvent):
	"""Telemetry event named ``agent_run``: task, model and version details for one agent."""

	agent_id: str
	use_vision: bool
	task: str
	model_name: str
	chat_model_library: str
	version: str
	source: str
	# Defaulted field that satisfies the abstract ``name`` property of the base class.
	name: str = 'agent_run'
50 | 
51 | 
@dataclass
class AgentEndTelemetryEvent(BaseTelemetryEvent):
	"""Telemetry event named ``agent_end``: step count, outcome flags, token and duration totals, and errors."""

	agent_id: str
	steps: int
	max_steps_reached: bool
	is_done: bool
	# Tri-state: None is allowed in addition to True/False.
	success: bool | None
	total_input_tokens: int
	total_duration_seconds: float

	# Entries may be None.
	errors: Sequence[str | None]
	# Defaulted field that satisfies the abstract ``name`` property of the base class.
	name: str = 'agent_end'
64 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | accelerate==1.6.0
 2 | annotated-types==0.7.0
 3 | anthropic==0.49.0
 4 | anyio==4.9.0
 5 | backoff==2.2.1
 6 | beautifulsoup4==4.13.3
 7 | certifi==2025.1.31
 8 | charset-normalizer==3.4.1
 9 | colorama==0.4.6
10 | defusedxml==0.7.1
11 | distro==1.9.0
12 | filelock==3.18.0
13 | fsspec==2025.3.2
14 | greenlet==3.1.1
15 | h11==0.14.0
16 | httpcore==1.0.7
17 | httpx==0.28.1
18 | huggingface-hub==0.30.1
19 | idna==3.10
20 | Jinja2==3.1.6
21 | jiter==0.9.0
22 | joblib==1.4.2
23 | jsonpatch==1.33
24 | jsonpointer==3.0.0
25 | langchain-core>=0.3.58,<0.4.0
26 | langchain-anthropic==0.3.3
27 | langchain-ollama==0.2.2
28 | langchain-openai==0.3.1
29 | langsmith==0.3.24
30 | markdownify==0.14.1
31 | MarkupSafe==3.0.2
32 | monotonic==1.6
33 | mpmath==1.3.0
34 | networkx==3.4.2
35 | numpy==2.2.4
36 | ollama==0.4.7
37 | openai==1.70.0
38 | orjson==3.10.16
39 | packaging==24.2
40 | pillow==11.1.0
41 | playwright==1.51.0
42 | posthog==3.23.0
43 | psutil==7.0.0
44 | pydantic==2.11.2
45 | pydantic_core==2.33.1
46 | pyee==12.1.1
47 | python-dateutil==2.9.0.post0
48 | python-dotenv==1.1.0
49 | PyYAML==6.0.2
50 | regex==2024.11.6
51 | requests==2.32.3
52 | requests-toolbelt==1.0.0
53 | safetensors==0.5.3
54 | scikit-learn==1.6.1
55 | scipy==1.15.2
56 | sentence-transformers==4.0.2
57 | six==1.17.0
58 | sniffio==1.3.1
59 | soupsieve==2.6
60 | sympy==1.13.1
61 | tenacity==9.1.2
62 | threadpoolctl==3.6.0
63 | tiktoken==0.9.0
64 | tokenizers==0.21.1
65 | torch==2.6.0
66 | tqdm==4.67.1
67 | transformers==4.51.0
68 | typing-inspection==0.4.0
69 | typing_extensions==4.13.1
70 | urllib3==2.3.0
71 | zstandard==0.23.0
72 | flask==3.1.0
73 | flask_cors==5.0.1
74 | mem0ai==0.1.96
75 | faiss-cpu==1.11.0
76 | screeninfo==0.8.1
77 | mcp==1.7.1
78 | flask[async]
79 | langchain==0.3.25
80 | html_sanitizer==2.5.0
81 | 


--------------------------------------------------------------------------------
/browser_use/browser/views.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from typing import Any, Optional
 3 | 
 4 | from pydantic import BaseModel
 5 | 
 6 | from browser_use.dom.history_tree_processor.service import DOMHistoryElement
 7 | from browser_use.dom.views import DOMState
 8 | 
 9 | 
10 | # Pydantic
# Pydantic model (not a dataclass): BrowserStateHistory.to_dict serialises
# instances with model_dump().
class TabInfo(BaseModel):
	"""Represents information about a browser tab"""

	page_id: int
	url: str
	title: str
	parent_page_id: Optional[int] = None  # parent page that contains this popup or cross-origin iframe
18 | 
19 | 
# Parameters for grouping tabs: the tab ids, the group title and an optional
# colour that defaults to 'blue'. Documented with comments rather than a
# docstring so the model's generated schema is unchanged.
class GroupTabsAction(BaseModel):
	tab_ids: list[int]
	title: str
	color: Optional[str] = 'blue'
24 | 
25 | 
# Parameters for ungrouping tabs: only the tab ids are needed. Documented with
# a comment rather than a docstring so the model's generated schema is unchanged.
class UngroupTabsAction(BaseModel):
	tab_ids: list[int]
28 | 
29 | 
@dataclass
class BrowserState(DOMState):
	"""DOMState extended with page-level data: url, title, open tabs and optional extras."""

	url: str
	title: str
	tabs: list[TabInfo]
	screenshot: Optional[str] = None
	pixels_above: int = 0
	pixels_below: int = 0
	# default_factory gives every instance its own list instead of a shared default.
	browser_errors: list[str] = field(default_factory=list)
39 | 
40 | 
@dataclass
class BrowserStateHistory:
	"""Recorded browser state: url, title, tabs, interacted elements and an optional screenshot."""

	url: str
	title: str
	tabs: list[TabInfo]
	interacted_element: list[DOMHistoryElement | None] | list[None]
	screenshot: Optional[str] = None

	def to_dict(self) -> dict[str, Any]:
		"""Return a plain-dict form; tabs and elements are converted, falsy elements become None."""
		return {
			'tabs': [tab.model_dump() for tab in self.tabs],
			'screenshot': self.screenshot,
			'interacted_element': [el.to_dict() if el else None for el in self.interacted_element],
			'url': self.url,
			'title': self.title,
		}
57 | 
58 | 
# Root of the browser exception hierarchy in this module.
class BrowserError(Exception):
	"""Base class for all browser errors"""
61 | 
62 | 
# Derives from BrowserError, so handlers catching BrowserError also catch this.
class URLNotAllowedError(BrowserError):
	"""Error raised when a URL is not allowed"""
65 | 


--------------------------------------------------------------------------------
/data_samples/replay_list_samples/wap_exact_replay_list_l8vZDGTfw3qu3GBs.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ultimate_goal": "Find the best sold keyboard in BestBuy",
 3 |   "task_id": "l8vZDGTfw3qu3GBs",
 4 |   "type": "exact_replay",
 5 |   "action_list": [
 6 |     {
 7 |       "action": "open_tab",
 8 |       "action_params": {
 9 |         "url": "https://www.bestbuy.ca/en-ca"
10 |       }
11 |     },
12 |     {
13 |       "action": "wait_for_element",
14 |       "action_params": {
15 |         "selector": "INPUT.style-module_textField__MdLzL",
16 |         "timeout": 5000
17 |       }
18 |     },
19 |     {
20 |       "action": "input_text_by_selector",
21 |       "action_params": {
22 |         "selector": "INPUT.style-module_textField__MdLzL",
23 |         "text": "keyboard"
24 |       }
25 |     },
26 |     {
27 |       "action": "send_keys",
28 |       "action_params": {
29 |         "keys": "Enter"
30 |       }
31 |     },
32 |     {
33 |       "action": "wait_for_element",
34 |       "action_params": {
35 |         "selector": "#Sort",
36 |         "timeout": 5000
37 |       }
38 |     },
39 |     {
40 |       "action": "select_option_by_selector",
41 |       "action_params": {
42 |         "css_selector": "#Sort",
43 |         "value": "highestRated"
44 |       }
45 |     },
46 |     {
47 |       "action": "wait_for_element",
48 |       "action_params": {
49 |         "selector": "h3[data-automation=\"productItemName\"]",
50 |         "timeout": 5000
51 |       }
52 |     },
53 |     {
54 |       "action": "click_element_by_selector",
55 |       "action_params": {
56 |         "css_selector": "h3[data-automation=\"productItemName\"]"
57 |       }
58 |     },
59 |     {
60 |       "action": "extract_content",
61 |       "action_params": {
62 |         "goal": "Find the best sold keyboard in BestBuy",
63 |         "should_strip_link_urls": false
64 |       }
65 |     },
66 |     {
67 |       "action": "done",
68 |       "action_params": {
69 |         "text": "task executed successfully",
70 |         "success": true
71 |       }
72 |     }
73 |   ]
74 | }


--------------------------------------------------------------------------------
/browser_use/dom/history_tree_processor/view.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Optional
 3 | 
 4 | from pydantic import BaseModel
 5 | 
 6 | 
@dataclass
class HashedDomElement:
	"""
	Hash of the dom element to be used as a unique identifier
	"""

	# Three independent string hashes; presumably derived from the element's
	# parent branch path, attributes and xpath respectively - verify in the
	# code that builds this object.
	branch_path_hash: str
	attributes_hash: str
	xpath_hash: str
	# text_hash: str
17 | 
18 | 
# Integer x/y pair; used for the corner and centre points of CoordinateSet.
class Coordinates(BaseModel):
	x: int
	y: int
22 | 
23 | 
# Box description: four corner points and a centre point (each a Coordinates)
# plus integer width and height.
class CoordinateSet(BaseModel):
	top_left: Coordinates
	top_right: Coordinates
	bottom_left: Coordinates
	bottom_right: Coordinates
	center: Coordinates
	width: int
	height: int
32 | 
33 | 
# Integer scroll offsets (scroll_x, scroll_y) together with width and height.
class ViewportInfo(BaseModel):
	scroll_x: int
	scroll_y: int
	width: int
	height: int
39 | 
40 | 
@dataclass
class DOMHistoryElement:
	"""Record of a DOM element: tag, xpath, highlight index, parent path, attributes and optional geometry."""

	tag_name: str
	xpath: str
	highlight_index: Optional[int]
	entire_parent_branch_path: list[str]
	attributes: dict[str, str]
	shadow_root: bool = False
	css_selector: Optional[str] = None
	page_coordinates: Optional[CoordinateSet] = None
	viewport_coordinates: Optional[CoordinateSet] = None
	viewport_info: Optional[ViewportInfo] = None

	def to_dict(self) -> dict:
		"""Return a plain-dict form; the optional pydantic members are dumped, or None when unset."""

		def dumped(model):
			# Same truthiness test the fields were originally checked with.
			return model.model_dump() if model else None

		return {
			'tag_name': self.tag_name,
			'xpath': self.xpath,
			'highlight_index': self.highlight_index,
			'entire_parent_branch_path': self.entire_parent_branch_path,
			'attributes': self.attributes,
			'shadow_root': self.shadow_root,
			'css_selector': self.css_selector,
			'page_coordinates': dumped(self.page_coordinates),
			'viewport_coordinates': dumped(self.viewport_coordinates),
			'viewport_info': dumped(self.viewport_info),
		}
71 | 


--------------------------------------------------------------------------------
/utils/html_cleaner.py:
--------------------------------------------------------------------------------
 1 | from html_sanitizer import Sanitizer
 2 | 
 3 | 
 4 | def run_html_sanitizer(html: str, action_type: str):
 5 |     def sanitize_html(html: str, config: dict) -> str:
 6 |         sanitizer = Sanitizer(config)
 7 |         return sanitizer.sanitize(html)
 8 | 
 9 |     config = {}
10 |     if action_type == "task-finish":
11 |         allowed_tags = [
12 |             'a', 'address', 'article', 'aside', 'b', 'blockquote', 'button', 'caption', 'cite', 'code', 'col', 'colgroup', 
13 | 			'data', 'datalist', 'dd', 'del', 'details', 'div', 'dl', 'dt', 'em', 
14 | 			'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 
15 | 			'h5', 'h6', 'header', 'hr', 'i', 'img', 'input', 'label', 'legend', 
16 | 			'li', 'main', 'menu', 'nav', 'ol', 'option', 'output', 'p', 'pre', 
17 | 			'q', 's', 'section', 'select', 'small', 'span', 'strong', 'sub', 'summary',
18 | 			'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'textarea', 'time', 'tr', 'ul', 'video',
19 | 			'title'
20 |         ]
21 | 
22 |         common_attrs = ["id", "aria-label", "role"]
23 |         wildcard_data_attrs = "data-*"
24 | 
25 |         # Start with tag-specific attributes
26 |         attributes = {
27 |             "a": ["rel", "target"] + common_attrs,
28 |             "img": ["alt"] + common_attrs,
29 |             "button": ["aria-label"] + common_attrs,
30 |         }
31 | 
32 |         # Add the common attributes to all other tags
33 |         for tag in allowed_tags:
34 |             if tag not in attributes:
35 |                 attributes[tag] = common_attrs.copy()
36 |             # Add wildcard attributes for data-* only if supported by your sanitizer config
37 |             attributes[tag].append(wildcard_data_attrs)
38 | 
39 |         config = {
40 |             "tags": allowed_tags,
41 |             "attributes": attributes,
42 |             "empty": ["a", "img"],
43 |             "separate": ["p", "div", "h1", "h2", "h3", "article", "main"],
44 |             "keep_typographic_whitespace": True
45 |         }
46 | 
47 |     return sanitize_html(html, config)


--------------------------------------------------------------------------------
/utils/llm.py:
--------------------------------------------------------------------------------
 1 | """Sub-goal generator helper
 2 | 
 3 | This tiny helper takes a text prompt, sends it to OpenAI via LangChain,
 4 | and returns the assistant's plain-text reply.
 5 | """
 6 | from __future__ import annotations
 7 | 
 8 | import os
 9 | from typing import Optional
10 | 
11 | from langchain_openai import ChatOpenAI
12 | from langchain.schema import AIMessage, HumanMessage, SystemMessage
13 | 
14 | __all__ = ["ask_llm"]
15 | 
16 | # ---------------------------------------------------------------------------
17 | # Basic LLM wrapper
18 | # ---------------------------------------------------------------------------
19 | 
def _build_llm(model: str = "gpt-4o", temperature: float = 0) -> ChatOpenAI:  # type: ignore
    """Create a LangChain ChatOpenAI client.

    Parameters
    ----------
    model : str
        OpenAI model name.  Defaults to "gpt-4o".
    temperature : float
        Sampling temperature.  Defaults to 0.

    Raises
    ------
    RuntimeError
        If the OPENAI_API_KEY environment variable is not set.
    """
    # The key must be available in the environment.  (Raise a clear error if not.)
    if "OPENAI_API_KEY" not in os.environ:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set.")

    return ChatOpenAI(model_name=model, temperature=temperature)
36 | 
37 | 
def ask_llm(prompt: str,
            system_prompt: Optional[str] = None,
            model: str = "gpt-4o",
            temperature: float = 0) -> str:
    """Send *prompt* to OpenAI and return the assistant text.

    Parameters
    ----------
    prompt : str
        User prompt / question.
    system_prompt : str | None
        Optional system message to steer model behaviour; skipped when empty.
    model : str
        OpenAI model name (default: gpt-4o).
    temperature : float
        Sampling temperature (default 0).

    Returns
    -------
    str
        Assistant's plain-text reply with surrounding whitespace stripped.

    Raises
    ------
    RuntimeError
        If OPENAI_API_KEY is not set (raised by ``_build_llm``) or the model
        returns something other than an ``AIMessage``.
    """
    llm = _build_llm(model=model, temperature=temperature)

    messages = []
    if system_prompt:
        messages.append(SystemMessage(content=system_prompt))
    messages.append(HumanMessage(content=prompt))

    # Use the Runnable API; calling the model object directly (``llm(messages)``)
    # is deprecated in current LangChain releases.
    response = llm.invoke(messages)  # -> AIMessage

    if not isinstance(response, AIMessage):
        raise RuntimeError("Unexpected response type from LLM")

    return response.content.strip()
74 | 


--------------------------------------------------------------------------------
/data_samples/replay_list_samples/wap_exact_replay_list_GqMnZeKFxvePGKGA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ultimate_goal": "search for a recipe of baked salmon which takes less than 1 hour to cook",
 3 |   "task_id": "GqMnZeKFxvePGKGA",
 4 |   "type": "exact_replay",
 5 |   "action_list": [
 6 |     {
 7 |       "action": "open_tab",
 8 |       "action_params": {
 9 |         "url": "https://www.allrecipes.com/"
10 |       }
11 |     },
12 |     {
13 |       "action": "wait_for_element",
14 |       "action_params": {
15 |         "selector": "#mntl-search-form--open__search-input",
16 |         "timeout": 5000
17 |       }
18 |     },
19 |     {
20 |       "action": "input_text_by_selector",
21 |       "action_params": {
22 |         "selector": "#mntl-search-form--open__search-input",
23 |         "text": "baked salmon"
24 |       }
25 |     },
26 |     {
27 |       "action": "send_keys",
28 |       "action_params": {
29 |         "keys": "Enter"
30 |       }
31 |     },
32 |     {
33 |       "action": "wait_for_element",
34 |       "action_params": {
35 |         "selector": "img[alt=\"Breaded, baked salmon fillets topped with lemon slices, served alongside asparagus slices and rice pilaf on blue plates\"]",
36 |         "timeout": 5000
37 |       }
38 |     },
39 |     {
40 |       "action": "click_element_by_selector",
41 |       "action_params": {
42 |         "css_selector": "img[alt=\"Breaded, baked salmon fillets topped with lemon slices, served alongside asparagus slices and rice pilaf on blue plates\"]"
43 |       }
44 |     },
45 |     {
46 |       "action": "go_to_url",
47 |       "action_params": {
48 |         "url": "https://www.allrecipes.com/search?q=baked+salmon"
49 |       }
50 |     },
51 |     {
52 |       "action": "wait_for_element",
53 |       "action_params": {
54 |         "selector": "body",
55 |         "timeout": 8000
56 |       }
57 |     },
58 |     {
59 |       "action": "wait_for_element",
60 |       "action_params": {
61 |         "selector": "img[alt=\"Filet of salmon topped with melted cheese on aluminum foil\"]",
62 |         "timeout": 5000
63 |       }
64 |     },
65 |     {
66 |       "action": "click_element_by_selector",
67 |       "action_params": {
68 |         "css_selector": "img[alt=\"Filet of salmon topped with melted cheese on aluminum foil\"]"
69 |       }
70 |     },
71 |     {
72 |       "action": "extract_content",
73 |       "action_params": {
74 |         "goal": "search for a recipe of baked salmon which takes less than 1 hour to cook",
75 |         "should_strip_link_urls": false
76 |       }
77 |     },
78 |     {
79 |       "action": "done",
80 |       "action_params": {
81 |         "text": "task executed successfully",
82 |         "success": true
83 |       }
84 |     }
85 |   ]
86 | }


--------------------------------------------------------------------------------
/data_samples/replay_list_samples/wap_exact_replay_list_LhTyE4ie0s5a1W6J.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "ultimate_goal": "search for the best sold keyboard on Amazon",
  3 |   "task_id": "LhTyE4ie0s5a1W6J",
  4 |   "type": "exact_replay",
  5 |   "action_list": [
  6 |     {
  7 |       "action": "open_tab",
  8 |       "action_params": {
  9 |         "url": "https://www.amazon.ca/"
 10 |       }
 11 |     },
 12 |     {
 13 |       "action": "wait_for_element",
 14 |       "action_params": {
 15 |         "selector": "#searchDropdownBox",
 16 |         "timeout": 5000
 17 |       }
 18 |     },
 19 |     {
 20 |       "action": "select_option_by_selector",
 21 |       "action_params": {
 22 |         "css_selector": "#searchDropdownBox",
 23 |         "value": "search-alias=aps"
 24 |       }
 25 |     },
 26 |     {
 27 |       "action": "wait_for_element",
 28 |       "action_params": {
 29 |         "selector": "#twotabsearchtextbox",
 30 |         "timeout": 5000
 31 |       }
 32 |     },
 33 |     {
 34 |       "action": "input_text_by_selector",
 35 |       "action_params": {
 36 |         "selector": "#twotabsearchtextbox",
 37 |         "text": "keyboard"
 38 |       }
 39 |     },
 40 |     {
 41 |       "action": "send_keys",
 42 |       "action_params": {
 43 |         "keys": "Enter"
 44 |       }
 45 |     },
 46 |     {
 47 |       "action": "wait_for_element",
 48 |       "action_params": {
 49 |         "selector": ".a-dropdown-prompt",
 50 |         "timeout": 5000
 51 |       }
 52 |     },
 53 |     {
 54 |       "action": "click_element_by_selector",
 55 |       "action_params": {
 56 |         "css_selector": ".a-dropdown-prompt"
 57 |       }
 58 |     },
 59 |     {
 60 |       "action": "wait_for_element",
 61 |       "action_params": {
 62 |         "selector": "#s-result-sort-select_5",
 63 |         "timeout": 5000
 64 |       }
 65 |     },
 66 |     {
 67 |       "action": "click_element_by_selector",
 68 |       "action_params": {
 69 |         "css_selector": "#s-result-sort-select_5"
 70 |       }
 71 |     },
 72 |     {
 73 |       "action": "wait_for_element",
 74 |       "action_params": {
 75 |         "selector": "span:text(\"Lenovo 300 USB Keyboard, Wired, Adjustable Tilt, Ergonomic, Windows 7/8/10, GX30M39655, Black\")",
 76 |         "timeout": 5000
 77 |       }
 78 |     },
 79 |     {
 80 |       "action": "click_element_by_text",
 81 |       "action_params": {
 82 |         "text": "Lenovo 300 USB Keyboard, Wired, Adjustable Tilt, Ergonomic, Windows 7/8/10, GX30M39655, Black",
 83 |         "element_type": "span",
 84 |         "nth": 0
 85 |       }
 86 |     },
 87 |     {
 88 |       "action": "extract_content",
 89 |       "action_params": {
 90 |         "goal": "search for the best sold keyboard on Amazon",
 91 |         "should_strip_link_urls": false
 92 |       }
 93 |     },
 94 |     {
 95 |       "action": "done",
 96 |       "action_params": {
 97 |         "text": "task executed successfully",
 98 |         "success": true
 99 |       }
100 |     }
101 |   ]
102 | }


--------------------------------------------------------------------------------
/browser_use/browser/tests/test_clicks.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import json
 3 | 
 4 | import pytest
 5 | 
 6 | from browser_use.browser.browser import Browser, BrowserConfig
 7 | from browser_use.dom.views import DOMBaseNode, DOMElementNode, DOMTextNode
 8 | from browser_use.utils import time_execution_sync
 9 | 
10 | 
class ElementTreeSerializer:
	"""Helper that turns a DOM element tree into nested plain dicts for JSON dumping."""

	@staticmethod
	def dom_element_node_to_json(element_tree: DOMElementNode) -> dict:
		"""Recursively convert *element_tree*; nodes of an unknown kind become ``{}``."""

		def serialize(node: DOMBaseNode) -> dict:
			if isinstance(node, DOMTextNode):
				return {'type': 'text', 'text': node.text}
			if not isinstance(node, DOMElementNode):
				return {}
			return {
				'type': 'element',
				'tag_name': node.tag_name,
				'attributes': node.attributes,
				'highlight_index': node.highlight_index,
				'children': [serialize(child) for child in node.children],
			}

		return serialize(element_tree)
28 | 
29 | 
30 | # run with: pytest browser_use/browser/tests/test_clicks.py
@pytest.mark.asyncio
async def test_highlight_elements():
	"""Interactive manual check of element highlighting and clicking.

	Opens a visible browser, then loops forever: dumps the element tree to
	./tmp/page.json, reports XPaths shared by several selector-map entries,
	prints the clickable elements and clicks the index typed on stdin.
	Errors inside an iteration are printed and the loop continues.
	"""
	import os
	from collections import Counter

	browser = Browser(config=BrowserConfig(headless=False, disable_security=True))

	# The dump below is written into ./tmp; create it up front, otherwise
	# open() fails on every iteration and the loop spins printing the error.
	os.makedirs('./tmp', exist_ok=True)

	async with await browser.new_context() as context:
		page = await context.get_current_page()
		# Alternative pages for manual runs:
		# await page.goto('https://immobilienscout24.de')
		# await page.goto('https://help.sap.com/docs/sap-ai-core/sap-ai-core-service-guide/service-plans')
		# await page.goto('https://google.com/search?q=elon+musk')
		# await page.goto('https://kayak.com')
		# await page.goto('https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe')
		# await page.goto('https://dictionary.cambridge.org')
		# await page.goto('https://github.com')
		await page.goto('https://huggingface.co/')

		await asyncio.sleep(1)

		while True:
			try:
				state = await context.get_state()

				with open('./tmp/page.json', 'w') as f:
					json.dump(
						ElementTreeSerializer.dom_element_node_to_json(state.element_tree),
						f,
						indent=1,
					)

				if not state.selector_map:
					# Nothing to select yet: wait before polling again instead of busy-looping.
					await asyncio.sleep(1)
					continue

				# Find and print duplicate XPaths
				xpath_counts = Counter(selector.xpath for selector in state.selector_map.values())

				print('\nDuplicate XPaths found:')
				for xpath, count in xpath_counts.items():
					if count > 1:
						print(f'XPath: {xpath}')
						print(f'Count: {count}\n')

				print(list(state.selector_map.keys()), 'Selector map keys')
				print(state.element_tree.clickable_elements_to_string())
				action = input('Select next action: ')

				await time_execution_sync('remove_highlight_elements')(context.remove_highlights)()

				# A non-numeric or unknown index raises here and is reported by the handler below.
				node_element = state.selector_map[int(action)]

				await context._click_element_node(node_element)

			except Exception as e:
				print(e)


--------------------------------------------------------------------------------
/browser_use/telemetry/service.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import uuid
  4 | from pathlib import Path
  5 | 
  6 | from dotenv import load_dotenv
  7 | from posthog import Posthog
  8 | 
  9 | from browser_use.telemetry.views import BaseTelemetryEvent
 10 | from browser_use.utils import singleton
 11 | 
 12 | load_dotenv()
 13 | 
 14 | 
 15 | logger = logging.getLogger(__name__)
 16 | 
 17 | 
# Extra properties merged into every captured event's properties
# (see ProductTelemetry._direct_capture).
POSTHOG_EVENT_SETTINGS = {
	'process_person_profile': True,
}
 21 | 
 22 | 
 23 | @singleton
 24 | class ProductTelemetry:
 25 | 	"""
 26 | 	Service for capturing anonymized telemetry data.
 27 | 
 28 | 	If the environment variable `ANONYMIZED_TELEMETRY=False`, anonymized telemetry will be disabled.
 29 | 	"""
 30 | 
 31 | 	USER_ID_PATH = str(Path.home() / '.cache' / 'browser_use' / 'telemetry_user_id')
 32 | 	PROJECT_API_KEY = 'phc_F8JMNjW1i2KbGUTaW1unnDdLSPCoyc52SGRU0JecaUh'
 33 | 	HOST = 'https://eu.i.posthog.com'
 34 | 	UNKNOWN_USER_ID = 'UNKNOWN'
 35 | 
 36 | 	_curr_user_id = None
 37 | 
 38 | 	def __init__(self) -> None:
 39 | 		telemetry_disabled = 'false' #os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false'
 40 | 		self.debug_logging = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() == 'debug'
 41 | 
 42 | 		if telemetry_disabled:
 43 | 			self._posthog_client = None
 44 | 		else:
 45 | 			logger.info(
 46 | 				'Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.'
 47 | 			)
 48 | 			self._posthog_client = Posthog(
 49 | 				project_api_key=self.PROJECT_API_KEY,
 50 | 				host=self.HOST,
 51 | 				disable_geoip=False,
 52 | 			)
 53 | 
 54 | 			# Silence posthog's logging
 55 | 			if not self.debug_logging:
 56 | 				posthog_logger = logging.getLogger('posthog')
 57 | 				posthog_logger.disabled = True
 58 | 
 59 | 		if self._posthog_client is None:
 60 | 			logger.debug('Telemetry disabled')
 61 | 
 62 | 	def capture(self, event: BaseTelemetryEvent) -> None:
 63 | 		if self._posthog_client is None:
 64 | 			return
 65 | 
 66 | 		if self.debug_logging:
 67 | 			logger.debug(f'Telemetry event: {event.name} {event.properties}')
 68 | 		self._direct_capture(event)
 69 | 
 70 | 	def _direct_capture(self, event: BaseTelemetryEvent) -> None:
 71 | 		"""
 72 | 		Should not be thread blocking because posthog magically handles it
 73 | 		"""
 74 | 		if self._posthog_client is None:
 75 | 			return
 76 | 
 77 | 		try:
 78 | 			self._posthog_client.capture(
 79 | 				self.user_id,
 80 | 				event.name,
 81 | 				{**event.properties, **POSTHOG_EVENT_SETTINGS},
 82 | 			)
 83 | 		except Exception as e:
 84 | 			logger.error(f'Failed to send telemetry event {event.name}: {e}')
 85 | 
 86 | 	@property
 87 | 	def user_id(self) -> str:
 88 | 		if self._curr_user_id:
 89 | 			return self._curr_user_id
 90 | 
 91 | 		# File access may fail due to permissions or other reasons. We don't want to
 92 | 		# crash so we catch all exceptions.
 93 | 		try:
 94 | 			if not os.path.exists(self.USER_ID_PATH):
 95 | 				os.makedirs(os.path.dirname(self.USER_ID_PATH), exist_ok=True)
 96 | 				with open(self.USER_ID_PATH, 'w') as f:
 97 | 					new_user_id = str(uuid.uuid4())
 98 | 					f.write(new_user_id)
 99 | 				self._curr_user_id = new_user_id
100 | 			else:
101 | 				with open(self.USER_ID_PATH, 'r') as f:
102 | 					self._curr_user_id = f.read()
103 | 		except Exception:
104 | 			self._curr_user_id = 'UNKNOWN_USER_ID'
105 | 		return self._curr_user_id
106 | 


--------------------------------------------------------------------------------
/wap_replay/generate_smart_replay_list.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Build the canonical “smart-replay” sub-goal list from recorded event-JSON files:
 3 | write a sub-goal prompt per event, generate the sub-goals with an LLM, and assemble the final list.
 4 | 
 5 | Usage
 6 | -----
 7 | python wap_replay/generate_smart_replay_list.py --data_dir_path <folder_with_json_files> \
 8 |                              [--output_dir_path data_processed/smart_replay]
 9 |                              
10 | Example
11 | -----
12 | python wap_replay/generate_smart_replay_list.py --data_dir_path data/20250423/Allrecipes--4 \
13 |                              --output_dir_path data_processed/smart_replay
14 | """
15 | import argparse
16 | from pathlib import Path
17 | from dotenv import load_dotenv
18 | from utils.action_processing import generate_subgoal_speculate_prompt, find_task_prompt, load_event_json
19 | from utils.subgoal_generator import generate_subgoals_from_dir, wap_subgoal_list_generation
20 | load_dotenv()
21 | 
22 | 
def subgoal_prompt_generation(path: str, output_path: str, ultimate_goal: str) -> None:
    """Write one sub-goal prompt for every event JSON file found under ``path``.

    The folder is searched recursively for ``*.json``. Each file is loaded with
    ``load_event_json`` and passed, together with ``ultimate_goal``, the file's
    stem and ``output_path``, to ``generate_subgoal_speculate_prompt``.
    Progress is printed; nothing is returned.
    """
    source_root = Path(path)
    event_files = [*source_root.rglob("*.json")]
    total = len(event_files)

    # Nothing to do: report it and stop early.
    if total == 0:
        print(f"[OTA Info] No JSON files found under {source_root}")
        return

    print(f"[OTA Info] Found {total} event files.")

    position = 0
    for event_file in event_files:
        position += 1
        print(f"\n[{position}/{total}] Loading {event_file}")
        loaded_event = load_event_json(event_file)

        print("   Generating sub-goal …")
        generate_subgoal_speculate_prompt(
            loaded_event,
            ultimate_goal,
            event_file.stem,
            output_path,
        )

    print("\n[OTA Info] All done.")
41 | 
42 | 
def subgoal_llm_generation(folder, jsonl_name) -> None:
    """Run LLM sub-goal generation over ``folder`` via ``generate_subgoals_from_dir``.

    ``jsonl_name`` is forwarded as ``save_jsonl``. The helper's return value is
    not needed here, so it is no longer bound to an unused local.
    """
    generate_subgoals_from_dir(
        folder,
        system_prompt="You are a concise sub-goal assistant fot analysis of actions in browser.",
        model="gpt-4o",
        temperature=0,
        save_jsonl=jsonl_name,
    )
51 | 
def main() -> None:
    """Command-line entry point for the smart-replay pipeline.

    Parses ``--data_dir_path`` / ``--output_dir_path``, looks up the task prompt
    and task id for the data folder, then runs the three stages in order:
    per-event prompt generation, LLM sub-goal generation, and final list assembly.
    """
    cli = argparse.ArgumentParser(description="Smart-replay pipeline")
    cli.add_argument(
        "--data_dir_path",
        required=True,
        help="Directory containing recorded event JSON files",
    )
    cli.add_argument(
        "--output_dir_path",
        default="data_processed/smart_replay",
        help="Directory where all output will be placed "
             "(default: data_processed/smart_replay)",
    )
    options = cli.parse_args()

    recordings_dir = Path(options.data_dir_path)
    results_dir = Path(options.output_dir_path)
    results_dir.mkdir(parents=True, exist_ok=True)

    task_prompt, task_id = find_task_prompt(recordings_dir)
    print("[OTA Info] Using task prompt =>", task_prompt)
    print("[OTA Info] taskId           =>", task_id)

    # Per-task working folder for the prompts and the raw LLM output.
    prompts_dir = results_dir / f"subgoals_{task_id}"
    prompts_dir.mkdir(parents=True, exist_ok=True)

    llm_output_file = prompts_dir / "subgoals_output.jsonl"
    replay_list_file = results_dir / f"wap_smart_replay_list_{task_id}.json"

    # Stage 1: one prompt per recorded event.
    subgoal_prompt_generation(recordings_dir, prompts_dir, task_prompt)

    # Stage 2: turn the prompts into sub-goals.
    subgoal_llm_generation(prompts_dir, llm_output_file)

    # Stage 3: assemble the final smart-replay list.
    wap_subgoal_list_generation(task_prompt, task_id, llm_output_file, replay_list_file)


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/chrome-extension/js/specialEventHandler.js:
--------------------------------------------------------------------------------
  1 | (function () {
  2 | /******************************************************************
  3 |  *  specialEventHandler.js
  4 |  *  --------------------------------------------------------------
  5 |  *  Registers and runs domain–specific listeners that the generic
  6 |  *  DOMListener cannot reliably cover
  7 |  *
  8 |  *  Usage from DOMListener.js
  9 |  *  --------------------------------------------------------------
 10 |  *  import(chrome.runtime.getURL('js/specialEventHandler.js'))
 11 |  *        .then(mod => mod.init())
 12 |  *        .catch(err => console.warn('[specialHandler] load failed', err));
 13 |  ******************************************************************/
 14 | 
 15 | /* ---------- simple registry ------------------------------------------------ */
 16 | const _handlers = [];
 17 | 
 18 | /** Register a new handler.
 19 |  *  @param {RegExp}   hostPattern  – tested against location.hostname
 20 |  *  @param {Function} initFn       – called if the pattern matches        */
 21 | function register(hostPattern, initFn) { _handlers.push({hostPattern, initFn}); }
 22 | 
 23 | /* ----------  H A N D L E R S  --------------------------------------------- */
 24 | register(/(^|\.)google\.[a-z.]+$/, ({
 25 | 	nodeToHTMLString,
 26 | 	trimTarget,
 27 | 	getEventHash,
 28 | 	getCurrentHTMLSanitized,
 29 | 	taskId
 30 | }) => {
 31 | 
 32 |   const BOX = 'textarea[name="q"][role="combobox"]';
 33 |   const BTN = 'button[aria-label="Search"][type="submit"]';
 34 | 
 35 |   function report(value, originEl) {
 36 | 	const evHash = getEventHash();
 37 | 
 38 |   
 39 | 	const actionTarget = {
 40 | 	  type       : 'submit',
 41 | 	  target     : nodeToHTMLString(originEl),      // full raw HTML
 42 | 	  targetId   : originEl.id,
 43 | 	  targetClass: originEl.className,
 44 | 	  value      : value                            // the user query text
 45 | 	};
 46 |   
 47 | 	// highlight element just like other flows
 48 | 	originEl.setAttribute('ota-use-interactive-target', '1');
 49 | 	actionTarget.target = trimTarget(originEl);     // prettified / trimmed
 50 | 	// (optional) remove the mark after trimming
 51 | 	originEl.removeAttribute('ota-use-interactive-target');
 52 |   
 53 | 	const summaryEvent = {
 54 | 	  taskId         : taskId,
 55 | 	  eventHash      : evHash,
 56 | 	  type           : 'submit',
 57 | 	  actionTimestamp: Date.now(),
 58 | 	  eventTarget    : actionTarget,
 59 | 	  allEvents      : {},                          // nothing to diff for a submit
 60 | 	  pageHTMLContent: getCurrentHTMLSanitized()
 61 | 	};
 62 |   
 63 | 	/* ---- ship it to the background  ------------------------------------ */
 64 | 	chrome.runtime.sendMessage({
 65 | 	  type : 'submit',     // pick any type name you handle in bg.js
 66 | 	  summaryEvent
 67 | 	});
 68 |   }
 69 | 
 70 |   /*  enter key  */
 71 |   document.addEventListener('keydown', e => {
 72 |     if (e.key === 'Enter' && !e.shiftKey && e.target.matches(BOX)) {
 73 |       report(e.target.value, e.target);
 74 |     }
 75 |   }, /*capture*/ true);
 76 | 
 77 |   /*  blue Search button  */
 78 |   document.addEventListener('click', e => {
 79 |     const btn = e.target.closest(BTN);
 80 |     if (!btn) return;
 81 |     const box = document.querySelector(BOX);
 82 |     if (box) report(box.value, btn);
 83 |   }, true);
 84 | 
 85 |   console.debug('[specialHandler] Google search attached');
 86 | });
 87 | 
 88 | /* -------------------------------------------------------------------------- */
 89 | /** Call once from DOMListener.  Attaches every handler that matches
 90 |  *  the current hostname.                                                     */
 91 | function init (deps) {
 92 |   const host = location.hostname;
 93 |   _handlers.forEach(({hostPattern, initFn}) => {
 94 |     if (hostPattern.test(host)) {
 95 | 		console.log(hostPattern)
 96 |       try { initFn(deps); }
 97 |       catch (err) {
 98 |         console.error('[specialHandler] failed for', hostPattern, err);
 99 |       }
100 |     }
101 |   });
102 | }
103 | 
104 |   window.SpecialEvents = { init };
105 | 
106 | })();


--------------------------------------------------------------------------------
/browser_use/controller/views.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional
  2 | 
  3 | from pydantic import BaseModel, ConfigDict, Field, model_validator
  4 | 
  5 | 
  6 | # Action Input Models
# Input for a Google-search action (per the class name): the query string.
# Kept as a `#` comment so the pydantic schema description is unchanged.
class SearchGoogleAction(BaseModel):
	query: str
  9 | 
 10 | 
# Input for a go-to-URL action (per the class name): the URL as a string.
class GoToUrlAction(BaseModel):
	url: str
 13 | 
 14 | 
# Input for a wait-for-element action: a selector string and an optional timeout.
# NOTE(review): presumably a CSS selector — confirm against the controller.
class WaitForElementAction(BaseModel):
	selector: str
	timeout: Optional[int] = 10000  # Timeout in milliseconds
 18 | 
 19 | 
# Input for a click action that addresses the element by integer index.
# `xpath` is optional and defaults to None.
class ClickElementAction(BaseModel):
	index: int
	xpath: Optional[str] = None
 23 | 
 24 | 
# Input for a click action that addresses the element by an XPath string.
class ClickElementByXpathAction(BaseModel):
	xpath: str
 27 | 
 28 | 
# Input for a click action that addresses the element by a CSS selector string.
class ClickElementBySelectorAction(BaseModel):
	css_selector: str
 31 | 
 32 | class SelectOptionBySelectorAction(BaseModel):
 33 |     css_selector: str               # e.g. "#searchDropdownBox"
 34 |     value: str | None = None        # preferred (unique)
 35 |     label: str | None = None        # visible text fallback
 36 | 
 37 | class ClickElementByTextAction(BaseModel):
 38 | 	text: str
 39 | 	element_type: Optional[str]
 40 | 	nth: int = 0
 41 | 
 42 | 
# Input for a text-entry action: the target element's integer index and the text.
# `xpath` is optional and defaults to None.
class InputTextAction(BaseModel):
	index: int
	text: str
	xpath: Optional[str] = None
 47 | 
 48 | 
# Input for the "done" action: a text payload plus a success flag.
class DoneAction(BaseModel):
	text: str
	success: bool
 52 | 
 53 | 
# Input for a switch-tab action: the integer id of the page to switch to.
# NOTE(review): how page ids are assigned is not visible here — confirm.
class SwitchTabAction(BaseModel):
	page_id: int
 56 | 
 57 | 
# Input for an open-tab action: the URL as a string.
class OpenTabAction(BaseModel):
	url: str
 60 | 
 61 | 
# Input for a close-tab action: the integer id of the page to close.
class CloseTabAction(BaseModel):
	page_id: int
 64 | 
 65 | 
# Input for a scroll action; `amount` is optional and defaults to None.
class ScrollAction(BaseModel):
	amount: Optional[int] = None  # The number of pixels to scroll. If None, scroll down/up one page
 68 | 
 69 | 
# Input for a send-keys action: the keys as a single string.
# NOTE(review): the accepted key syntax is defined by the consumer — confirm.
class SendKeysAction(BaseModel):
	keys: str
 72 | 
 73 | 
# Input for a group-tabs action. `tab_ids` and `title` are required (`...`);
# `color` defaults to 'blue'.
class GroupTabsAction(BaseModel):
	tab_ids: list[int] = Field(..., description='List of tab IDs to group')
	title: str = Field(..., description='Name for the tab group')
	color: Optional[str] = Field(
		'blue',
		description='Color for the group (grey/blue/red/yellow/green/pink/purple/cyan)',
	)
 81 | 
 82 | 
# Input for an ungroup-tabs action: the required list of tab ids.
class UngroupTabsAction(BaseModel):
	tab_ids: list[int] = Field(..., description='List of tab IDs to ungroup')
 85 | 
 86 | 
# Input for a page-content extraction action: a single string field.
# NOTE(review): what `value` means (goal text? selector?) is not visible here — confirm.
class ExtractPageContentAction(BaseModel):
	value: str
 89 | 
 90 | 
 91 | class NoParamsAction(BaseModel):
 92 | 	"""
 93 | 	Accepts absolutely anything in the incoming data
 94 | 	and discards it, so the final parsed model is empty.
 95 | 	"""
 96 | 
 97 | 	model_config = ConfigDict(extra='allow')
 98 | 
 99 | 	@model_validator(mode='before')
100 | 	def ignore_all_inputs(cls, values):
101 | 		# No matter what the user sends, discard it and return empty.
102 | 		return {}
103 | 
104 | 
# Integer x/y pair; used below as the type of DragDropAction's element offsets.
class Position(BaseModel):
	x: int
	y: int
108 | 
109 | 
# Input for a drag-and-drop action. Every field is optional. Source and target can be
# given as element selectors (with optional offsets) or as absolute page coordinates.
# This model does not check that a complete source/target pair was supplied.
class DragDropAction(BaseModel):
	# Element-based approach
	element_source: Optional[str] = Field(None, description='CSS selector or XPath of the element to drag from')
	element_target: Optional[str] = Field(None, description='CSS selector or XPath of the element to drop onto')
	element_source_offset: Optional[Position] = Field(
		None, description='Precise position within the source element to start drag (in pixels from top-left corner)'
	)
	element_target_offset: Optional[Position] = Field(
		None, description='Precise position within the target element to drop (in pixels from top-left corner)'
	)

	# Coordinate-based approach (used if selectors not provided)
	coord_source_x: Optional[int] = Field(None, description='Absolute X coordinate on page to start drag from (in pixels)')
	coord_source_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to start drag from (in pixels)')
	coord_target_x: Optional[int] = Field(None, description='Absolute X coordinate on page to drop at (in pixels)')
	coord_target_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to drop at (in pixels)')

	# Common options
	steps: Optional[int] = Field(10, description='Number of intermediate points for smoother movement (5-20 recommended)')
	delay_ms: Optional[int] = Field(5, description='Delay in milliseconds between steps (0 for fastest, 10-20 for more natural)')
130 | 


--------------------------------------------------------------------------------
/wap_replay/generate_exact_replay_list.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Batch-convert recorded event-JSON files into the canonical “exact-replay”
  3 | action list by calling `record_metadata_to_actions` from browser-use.
  4 | 
  5 | Usage
  6 | -----
  7 | python wap_replay/generate_exact_replay_list.py --data_dir_path <folder_with_json_files> \
  8 |                              [--output_dir_path data_processed/exact_replay]
  9 |                              
 10 | Example
 11 | -----
 12 | python wap_replay/generate_exact_replay_list.py --data_dir_path data/20250423/Allrecipes--4 \
 13 |                              --output_dir_path data_processed/exact_replay
 14 | """
 15 | from __future__ import annotations
 16 | 
 17 | import argparse
 18 | import json
 19 | from pathlib import Path
 20 | from typing import List, Dict, Any
 21 | from browser_use.wap.exact_replay import record_metadata_to_actions
 22 | from utils.action_processing import find_task_prompt, load_event_json
 23 | 
 24 | # ---------------------------------------------------------------------------#
 25 | # core function                                                              #
 26 | # ---------------------------------------------------------------------------#
 27 | def folder_to_actions(folder_path: str | Path) -> List[Dict[str, Any]]:
 28 |     """
 29 |     Walk sub-directories recursively, load every *.json file, convert each
 30 |     to replay actions via `record_metadata_to_actions`, and return the
 31 |     concatenated list.
 32 |     """
 33 |     folder_path = Path(folder_path)
 34 | 
 35 |     if not folder_path.is_dir():
 36 |         raise NotADirectoryError(folder_path)
 37 | 
 38 |     json_paths = list(folder_path.rglob("*.json"))   # recursive search
 39 |     if not json_paths:
 40 |         print(f"[OTA Info] No JSON files found under {folder_path}")
 41 |         return []
 42 | 
 43 |     print(f"[OTA Info] Found {len(json_paths)} event files.")
 44 | 
 45 |     all_actions: List[Dict[str, Any]] = []
 46 | 
 47 |     for idx, event_path in enumerate(json_paths, 1):
 48 |         print(f"[{idx}/{len(json_paths)}] Loading {event_path}")
 49 |         try:
 50 |             event_json = load_event_json(event_path)
 51 |             actions = record_metadata_to_actions([event_json])
 52 |             all_actions.extend(actions)
 53 |         except Exception as exc:
 54 |             print(f"[warn] could not process {event_path.name}: {exc}")
 55 | 
 56 |     print("[OTA Info] All done.")
 57 |     return all_actions
 58 | 
 59 | 
 60 | def save_exact_replay_bundle(
 61 |     path: Path,
 62 |     *,
 63 |     ultimate_goal: str,
 64 |     task_id: str,
 65 |     actions: List[Dict[str, Any]],
 66 | ) -> None:
 67 |     """
 68 |     Write a JSON file shaped like
 69 |     """
 70 |     bundle = {
 71 |         "ultimate_goal": ultimate_goal,
 72 |         "task_id": task_id,
 73 |         "type": "exact_replay",
 74 |         "action_list": actions,
 75 |     }
 76 |     path.write_text(json.dumps(bundle, ensure_ascii=False, indent=2), encoding="utf-8")
 77 |     print(f"[OTA info] wrote {len(actions)} actions → {path}")
 78 | 
 79 | 
 80 | # ---------------------------------------------------------------------------#
 81 | # command-line interface                                                     #
 82 | # ---------------------------------------------------------------------------#
 83 | def parse_args() -> argparse.Namespace:
 84 |     parser = argparse.ArgumentParser(description="Create exact-replay action list from a folder of event JSON files.")
 85 |     parser.add_argument("--data_dir_path", required=True, help="Folder containing recorded *.json files.")
 86 |     parser.add_argument("--output_dir_path", default="data_processed/exact_replay", help="Directory to store result file.")
 87 |     return parser.parse_args()
 88 | 
 89 | def main() -> None:
 90 |     args = parse_args()
 91 | 
 92 |     input_folder = Path(args.data_dir_path)
 93 |     output_dir = Path(args.output_dir_path)
 94 |     output_dir.mkdir(parents=True, exist_ok=True)
 95 |     task_prompt, task_id = find_task_prompt(input_folder)
 96 |     output_path = output_dir / f"wap_exact_replay_list_{task_id}.json"
 97 | 
 98 |     actions = folder_to_actions(input_folder)
 99 |     save_exact_replay_bundle(
100 |         output_path,
101 |         ultimate_goal=task_prompt,
102 |         task_id=task_id,
103 |         actions=actions,
104 |     )
105 | 
106 | if __name__ == "__main__":
107 |     main()


--------------------------------------------------------------------------------
/browser_use/dom/tests/debug_page_structure.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import os
  3 | import sys
  4 | 
  5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  6 | 
  7 | from browser_use.browser.browser import Browser, BrowserConfig
  8 | from browser_use.browser.context import BrowserContext
  9 | 
 10 | 
async def analyze_page_structure(url: str):
	"""
	Open `url` in a visible browser and print debugging information about the page.

	Printed, in order: the viewport size and scroll offsets, details of elements whose
	id/class contains "cookie" or "consent", details of fixed/sticky positioned elements,
	and the result of `ctx.get_page_structure()`. The coroutine then blocks on `input()`
	until Enter is pressed; the browser is closed in all cases.

	Args:
	    url: Address of the page to analyze.
	"""
	browser = Browser(
		config=BrowserConfig(
			headless=False,  # Set to True if you don't need to see the browser
		)
	)

	context = BrowserContext(browser=browser)

	try:
		async with context as ctx:
			# Navigate to the URL
			page = await ctx.get_current_page()
			await page.goto(url)
			# Wait for the 'networkidle' load state before inspecting the page.
			await page.wait_for_load_state('networkidle')

			# Get viewport dimensions
			viewport_info = await page.evaluate("""() => {
				return {
					viewport: {
						width: window.innerWidth,
						height: window.innerHeight,
						scrollX: window.scrollX,
						scrollY: window.scrollY
					}
				}
			}""")

			print('\nViewport Information:')
			print(f'Width: {viewport_info["viewport"]["width"]}')
			print(f'Height: {viewport_info["viewport"]["height"]}')
			print(f'ScrollX: {viewport_info["viewport"]["scrollX"]}')
			print(f'ScrollY: {viewport_info["viewport"]["scrollY"]}')

			# Enhanced debug information for cookie consent and fixed position elements
			# The in-page script returns two lists of plain objects (tag, id, className,
			# position, rect, isFixed, isSticky, zIndex, visibility, display, opacity).
			debug_info = await page.evaluate("""() => {
				function getElementInfo(element) {
					const rect = element.getBoundingClientRect();
					const style = window.getComputedStyle(element);
					return {
						tag: element.tagName.toLowerCase(),
						id: element.id,
						className: element.className,
						position: style.position,
						rect: {
							top: rect.top,
							right: rect.right,
							bottom: rect.bottom,
							left: rect.left,
							width: rect.width,
							height: rect.height
						},
						isFixed: style.position === 'fixed',
						isSticky: style.position === 'sticky',
						zIndex: style.zIndex,
						visibility: style.visibility,
						display: style.display,
						opacity: style.opacity
					};
				}

				// Find cookie-related elements
				const cookieElements = Array.from(document.querySelectorAll('[id*="cookie"], [id*="consent"], [class*="cookie"], [class*="consent"]'));
				const fixedElements = Array.from(document.querySelectorAll('*')).filter(el => {
					const style = window.getComputedStyle(el);
					return style.position === 'fixed' || style.position === 'sticky';
				});

				return {
					cookieElements: cookieElements.map(getElementInfo),
					fixedElements: fixedElements.map(getElementInfo)
				};
			}""")

			print('\nCookie-related Elements:')
			for elem in debug_info['cookieElements']:
				print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
				print(f'Position: {elem["position"]}')
				print(f'Rect: {elem["rect"]}')
				print(f'Z-Index: {elem["zIndex"]}')
				print(f'Visibility: {elem["visibility"]}')
				print(f'Display: {elem["display"]}')
				print(f'Opacity: {elem["opacity"]}')

			# Fixed/sticky elements get a shorter report (no visibility/display/opacity).
			print('\nFixed/Sticky Position Elements:')
			for elem in debug_info['fixedElements']:
				print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
				print(f'Position: {elem["position"]}')
				print(f'Rect: {elem["rect"]}')
				print(f'Z-Index: {elem["zIndex"]}')

			print(f'\nPage Structure for {url}:\n')
			structure = await ctx.get_page_structure()
			print(structure)

			# Keep the browser open until the user has finished inspecting the page.
			input('Press Enter to close the browser...')
	finally:
		# Close the browser even if navigation or evaluation failed.
		await browser.close()
110 | 
111 | 
if __name__ == '__main__':
	# You can modify this URL to analyze different pages

	# NOTE(review): the zeiss.com URL is listed twice, so it is analyzed twice — confirm intended.
	urls = [
		'https://www.mlb.com/yankees/stats/',
		'https://immobilienscout24.de',
		'https://www.zeiss.com/career/en/job-search.html?page=1',
		'https://www.zeiss.com/career/en/job-search.html?page=1',
		'https://reddit.com',
	]
	# Each URL gets its own asyncio.run() call, and analyze_page_structure creates
	# (and closes) its own Browser, so the pages are analyzed one after another.
	for url in urls:
		asyncio.run(analyze_page_structure(url))
124 | 


--------------------------------------------------------------------------------
/browser_use/agent/system_prompt_wap_replay.md:
--------------------------------------------------------------------------------
 1 | You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task by replaying the sub-goals for each step that we provided.
 2 | 
 3 | # Input Format
 4 | Task
 5 | Previous steps
 6 | Current URL
 7 | Open Tabs
 8 | Sub-goal List
 9 | Interactive Elements
10 | [index]<type>text</type>
11 | - index: Numeric identifier for interaction
12 | - type: HTML element type (button, input, etc.)
13 | - text: Element description
14 | Example:
15 | [33]<button>Submit Form</button>
16 | 
17 | - Only elements with numeric indexes in [] are interactive
18 | - elements without [] provide only context
19 | 
20 | # Response Rules
21 | 1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
22 | {{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not",
23 | "memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz",
24 | "subgoal_index": 1}},
25 | "action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}}
26 | 
27 | 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
28 | Common action sequences:
29 | - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
30 | - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
31 | - Actions are executed in the given order
32 | - If the page changes after an action, the sequence is interrupted and you get the new state.
33 | - only use multiple actions if it makes sense.
34 | 
35 | 3. ELEMENT INTERACTION:
36 | - Only use indexes of the interactive elements
37 | - Elements marked with "[]Non-interactive text" are non-interactive
38 | 
39 | 4. NAVIGATION & ERROR HANDLING:
40 | - If no suitable elements exist, use other functions to complete the task
41 | - Handle popups/cookies by accepting or closing them
42 | - Use scroll to find elements you are looking for
43 | - If you want to research something, open a new tab instead of using the current tab
44 | - If captcha pops up, try to solve it - else try a different approach
45 | - If the page is not fully loaded, use wait action
46 | 
47 | 5. (MANDATORY) ACTIONS BASED ON SUB-GOAL LIST
 48 | - You are provided with the previous and current sub-goals.
 49 | - Check the action results in the task history and the current page content to see whether the previous sub-goal has already been satisfied:
 50 |       -> if YES, specify actions for the current sub-goal in this step
 51 |       -> if NO, try more approaches to achieve the previous sub-goals, e.g. scroll to find elements
52 | - In your response, fill the value of subgoal_index with the index of the sub-goal that you work on in this step
53 | - Keep track of the status and subresults in the memory.
54 | 
55 | 6. TASK COMPLETION:
56 | - Use the done action as the last action as soon as the ultimate task is complete
 57 | - Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps.
58 | - If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false!
59 | - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
60 | - Don't hallucinate actions
61 | 
62 | 7. Form filling:
63 | - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
64 | 
65 | 8. Extraction:
66 | - If your task is to find information - call extract_content on the specific pages to get and store the information.
67 | Your responses must be always JSON with the specified format.
68 | 


--------------------------------------------------------------------------------
/browser_use/agent/memory/service.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import logging
  4 | from typing import List, Optional
  5 | 
  6 | from langchain_core.language_models.chat_models import BaseChatModel
  7 | from langchain_core.messages import (
  8 | 	BaseMessage,
  9 | 	HumanMessage,
 10 | )
 11 | from langchain_core.messages.utils import convert_to_openai_messages
 12 | from mem0 import Memory as Mem0Memory
 13 | from pydantic import BaseModel
 14 | 
 15 | from browser_use.agent.message_manager.service import MessageManager
 16 | from browser_use.agent.message_manager.views import ManagedMessage, MessageMetadata
 17 | from browser_use.utils import time_execution_sync
 18 | 
 19 | logger = logging.getLogger(__name__)
 20 | 
 21 | 
 22 | class MemorySettings(BaseModel):
 23 | 	"""Settings for procedural memory."""
 24 | 
 25 | 	agent_id: str
 26 | 	interval: int = 10
 27 | 	config: Optional[dict] | None = None
 28 | 
 29 | 
 30 | class Memory:
 31 | 	"""
 32 | 	Manages procedural memory for agents.
 33 | 
 34 | 	This class implements a procedural memory management system using Mem0 that transforms agent interaction history
 35 | 	into concise, structured representations at specified intervals. It serves to optimize context window
 36 | 	utilization during extended task execution by converting verbose historical information into compact,
 37 | 	yet comprehensive memory constructs that preserve essential operational knowledge.
 38 | 	"""
 39 | 
 40 | 	def __init__(
 41 | 		self,
 42 | 		message_manager: MessageManager,
 43 | 		llm: BaseChatModel,
 44 | 		settings: MemorySettings,
 45 | 	):
 46 | 		self.message_manager = message_manager
 47 | 		self.llm = llm
 48 | 		self.settings = settings
 49 | 		self._memory_config = self.settings.config or {'vector_store': {'provider': 'faiss'}}
 50 | 		self.mem0 = Mem0Memory.from_config(config_dict=self._memory_config)
 51 | 
	@time_execution_sync('--create_procedural_memory')
	def create_procedural_memory(self, current_step: int) -> None:
		"""
		Condense the message history into a single procedural-memory message.

		Messages tagged 'init' or 'memory' are kept untouched; every other
		message is summarized and replaced by one 'memory' message. Nothing is
		changed when there is at most one message to summarize or when no
		summary could be produced.

		Args:
		    current_step: The current step number of the agent
		"""
		logger.info(f'Creating procedural memory at step {current_step}')

		history = self.message_manager.state.history

		# Split the history: entries that survive as-is vs. entries to fold into the summary.
		preserved_types = {'init', 'memory'}
		kept = []
		to_summarize = []
		for entry in history.messages:
			is_preserved = isinstance(entry, ManagedMessage) and entry.metadata.message_type in preserved_types
			(kept if is_preserved else to_summarize).append(entry)

		if len(to_summarize) <= 1:
			logger.info('Not enough non-memory messages to summarize')
			return

		summary = self._create([entry.message for entry in to_summarize], current_step)
		if not summary:
			logger.warning('Failed to create summary')
			return

		# Wrap the summary as a human message tagged 'memory' so later runs preserve it.
		summary_message = HumanMessage(content=summary)
		summary_tokens = self.message_manager._count_tokens(summary_message)
		summary_entry = ManagedMessage(
			message=summary_message,
			metadata=MessageMetadata(tokens=summary_tokens, message_type='memory'),
		)

		# Token budget freed by dropping the summarized entries.
		removed_tokens = sum(entry.metadata.tokens for entry in to_summarize)

		kept.append(summary_entry)
		history.messages = kept
		history.current_tokens -= removed_tokens
		history.current_tokens += summary_tokens

		logger.info(f'Memories summarized: {len(to_summarize)} messages converted to procedural memory')
		logger.info(f'Token reduction: {removed_tokens - summary_tokens} tokens')
104 | 
105 | 	def _create(self, messages: List[BaseMessage], current_step: int) -> Optional[str]:
106 | 		parsed_messages = convert_to_openai_messages(messages)
107 | 		try:
108 | 			results = self.mem0.add(
109 | 				messages=parsed_messages,
110 | 				agent_id=self.settings.agent_id,
111 | 				llm=self.llm,
112 | 				memory_type='procedural_memory',
113 | 				metadata={'step': current_step},
114 | 			)
115 | 			if len(results.get('results', [])):
116 | 				return results.get('results', [])[0].get('memory')
117 | 			return None
118 | 		except Exception as e:
119 | 			logger.error(f'Error creating procedural memory: {e}')
120 | 			return None
121 | 


--------------------------------------------------------------------------------
/browser_use/dom/history_tree_processor/service.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | from typing import Optional
  3 | 
  4 | from browser_use.dom.history_tree_processor.view import DOMHistoryElement, HashedDomElement
  5 | from browser_use.dom.views import DOMElementNode
  6 | 
  7 | 
  8 | class HistoryTreeProcessor:
  9 | 	""" "
 10 | 	Operations on the DOM elements
 11 | 
 12 | 	@dev be careful - text nodes can change even if elements stay the same
 13 | 	"""
 14 | 
 15 | 	@staticmethod
 16 | 	def convert_dom_element_to_history_element(dom_element: DOMElementNode) -> DOMHistoryElement:
 17 | 		from browser_use.browser.context import BrowserContext
 18 | 
 19 | 		parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
 20 | 		css_selector = BrowserContext._enhanced_css_selector_for_element(dom_element)
 21 | 		return DOMHistoryElement(
 22 | 			dom_element.tag_name,
 23 | 			dom_element.xpath,
 24 | 			dom_element.highlight_index,
 25 | 			parent_branch_path,
 26 | 			dom_element.attributes,
 27 | 			dom_element.shadow_root,
 28 | 			css_selector=css_selector,
 29 | 			page_coordinates=dom_element.page_coordinates,
 30 | 			viewport_coordinates=dom_element.viewport_coordinates,
 31 | 			viewport_info=dom_element.viewport_info,
 32 | 		)
 33 | 
 34 | 	@staticmethod
 35 | 	def find_history_element_in_tree(dom_history_element: DOMHistoryElement, tree: DOMElementNode) -> Optional[DOMElementNode]:
 36 | 		hashed_dom_history_element = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)
 37 | 
 38 | 		def process_node(node: DOMElementNode):
 39 | 			if node.highlight_index is not None:
 40 | 				hashed_node = HistoryTreeProcessor._hash_dom_element(node)
 41 | 				if hashed_node == hashed_dom_history_element:
 42 | 					return node
 43 | 			for child in node.children:
 44 | 				if isinstance(child, DOMElementNode):
 45 | 					result = process_node(child)
 46 | 					if result is not None:
 47 | 						return result
 48 | 			return None
 49 | 
 50 | 		return process_node(tree)
 51 | 
 52 | 	@staticmethod
 53 | 	def compare_history_element_and_dom_element(dom_history_element: DOMHistoryElement, dom_element: DOMElementNode) -> bool:
 54 | 		hashed_dom_history_element = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)
 55 | 		hashed_dom_element = HistoryTreeProcessor._hash_dom_element(dom_element)
 56 | 
 57 | 		return hashed_dom_history_element == hashed_dom_element
 58 | 
 59 | 	@staticmethod
 60 | 	def _hash_dom_history_element(dom_history_element: DOMHistoryElement) -> HashedDomElement:
 61 | 		branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(dom_history_element.entire_parent_branch_path)
 62 | 		attributes_hash = HistoryTreeProcessor._attributes_hash(dom_history_element.attributes)
 63 | 		xpath_hash = HistoryTreeProcessor._xpath_hash(dom_history_element.xpath)
 64 | 
 65 | 		return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)
 66 | 
 67 | 	@staticmethod
 68 | 	def _hash_dom_element(dom_element: DOMElementNode) -> HashedDomElement:
 69 | 		parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
 70 | 		branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(parent_branch_path)
 71 | 		attributes_hash = HistoryTreeProcessor._attributes_hash(dom_element.attributes)
 72 | 		xpath_hash = HistoryTreeProcessor._xpath_hash(dom_element.xpath)
 73 | 		# text_hash = DomTreeProcessor._text_hash(dom_element)
 74 | 
 75 | 		return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)
 76 | 
 77 | 	@staticmethod
 78 | 	def _get_parent_branch_path(dom_element: DOMElementNode) -> list[str]:
 79 | 		parents: list[DOMElementNode] = []
 80 | 		current_element: DOMElementNode = dom_element
 81 | 		while current_element.parent is not None:
 82 | 			parents.append(current_element)
 83 | 			current_element = current_element.parent
 84 | 
 85 | 		parents.reverse()
 86 | 
 87 | 		return [parent.tag_name for parent in parents]
 88 | 
 89 | 	@staticmethod
 90 | 	def _parent_branch_path_hash(parent_branch_path: list[str]) -> str:
 91 | 		parent_branch_path_string = '/'.join(parent_branch_path)
 92 | 		return hashlib.sha256(parent_branch_path_string.encode()).hexdigest()
 93 | 
 94 | 	@staticmethod
 95 | 	def _attributes_hash(attributes: dict[str, str]) -> str:
 96 | 		attributes_string = ''.join(f'{key}={value}' for key, value in attributes.items())
 97 | 		return hashlib.sha256(attributes_string.encode()).hexdigest()
 98 | 
 99 | 	@staticmethod
100 | 	def _xpath_hash(xpath: str) -> str:
101 | 		return hashlib.sha256(xpath.encode()).hexdigest()
102 | 
103 | 	@staticmethod
104 | 	def _text_hash(dom_element: DOMElementNode) -> str:
105 | 		""" """
106 | 		text_string = dom_element.get_all_text_till_next_clickable_element()
107 | 		return hashlib.sha256(text_string.encode()).hexdigest()
108 | 


--------------------------------------------------------------------------------
/browser_use/logging_config.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import sys
  4 | 
  5 | from dotenv import load_dotenv
  6 | 
  7 | load_dotenv()
  8 | 
  9 | 
 10 | def addLoggingLevel(levelName, levelNum, methodName=None):
 11 | 	"""
 12 | 	Comprehensively adds a new logging level to the `logging` module and the
 13 | 	currently configured logging class.
 14 | 
 15 | 	`levelName` becomes an attribute of the `logging` module with the value
 16 | 	`levelNum`. `methodName` becomes a convenience method for both `logging`
 17 | 	itself and the class returned by `logging.getLoggerClass()` (usually just
 18 | 	`logging.Logger`). If `methodName` is not specified, `levelName.lower()` is
 19 | 	used.
 20 | 
 21 | 	To avoid accidental clobberings of existing attributes, this method will
 22 | 	raise an `AttributeError` if the level name is already an attribute of the
 23 | 	`logging` module or if the method name is already present
 24 | 
 25 | 	Example
 26 | 	-------
 27 | 	>>> addLoggingLevel('TRACE', logging.DEBUG - 5)
 28 | 	>>> logging.getLogger(__name__).setLevel('TRACE')
 29 | 	>>> logging.getLogger(__name__).trace('that worked')
 30 | 	>>> logging.trace('so did this')
 31 | 	>>> logging.TRACE
 32 | 	5
 33 | 
 34 | 	"""
 35 | 	if not methodName:
 36 | 		methodName = levelName.lower()
 37 | 
 38 | 	if hasattr(logging, levelName):
 39 | 		raise AttributeError('{} already defined in logging module'.format(levelName))
 40 | 	if hasattr(logging, methodName):
 41 | 		raise AttributeError('{} already defined in logging module'.format(methodName))
 42 | 	if hasattr(logging.getLoggerClass(), methodName):
 43 | 		raise AttributeError('{} already defined in logger class'.format(methodName))
 44 | 
 45 | 	# This method was inspired by the answers to Stack Overflow post
 46 | 	# http://stackoverflow.com/q/2183233/2988730, especially
 47 | 	# http://stackoverflow.com/a/13638084/2988730
 48 | 	def logForLevel(self, message, *args, **kwargs):
 49 | 		if self.isEnabledFor(levelNum):
 50 | 			self._log(levelNum, message, args, **kwargs)
 51 | 
 52 | 	def logToRoot(message, *args, **kwargs):
 53 | 		logging.log(levelNum, message, *args, **kwargs)
 54 | 
 55 | 	logging.addLevelName(levelNum, levelName)
 56 | 	setattr(logging, levelName, levelNum)
 57 | 	setattr(logging.getLoggerClass(), methodName, logForLevel)
 58 | 	setattr(logging, methodName, logToRoot)
 59 | 
 60 | 
 61 | def setup_logging():
 62 | 	# Try to add RESULT level, but ignore if it already exists
 63 | 	try:
 64 | 		addLoggingLevel('RESULT', 35)  # This allows ERROR, FATAL and CRITICAL
 65 | 	except AttributeError:
 66 | 		pass  # Level already exists, which is fine
 67 | 
 68 | 	log_type = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower()
 69 | 
 70 | 	# Check if handlers are already set up
 71 | 	if logging.getLogger().hasHandlers():
 72 | 		return
 73 | 
 74 | 	# Clear existing handlers
 75 | 	root = logging.getLogger()
 76 | 	root.handlers = []
 77 | 
 78 | 	class BrowserUseFormatter(logging.Formatter):
 79 | 		def format(self, record):
 80 | 			if isinstance(record.name, str) and record.name.startswith('browser_use.'):
 81 | 				record.name = record.name.split('.')[-2]
 82 | 			return super().format(record)
 83 | 
 84 | 	# Setup single handler for all loggers
 85 | 	console = logging.StreamHandler(sys.stdout)
 86 | 
 87 | 	# adittional setLevel here to filter logs
 88 | 	if log_type == 'result':
 89 | 		console.setLevel('RESULT')
 90 | 		console.setFormatter(BrowserUseFormatter('%(message)s'))
 91 | 	else:
 92 | 		console.setFormatter(BrowserUseFormatter('%(levelname)-8s [%(name)s] %(message)s'))
 93 | 
 94 | 	# Configure root logger only
 95 | 	root.addHandler(console)
 96 | 
 97 | 	# switch cases for log_type
 98 | 	if log_type == 'result':
 99 | 		root.setLevel('RESULT')  # string usage to avoid syntax error
100 | 	elif log_type == 'debug':
101 | 		root.setLevel(logging.DEBUG)
102 | 	else:
103 | 		root.setLevel(logging.INFO)
104 | 
105 | 	# Configure browser_use logger
106 | 	browser_use_logger = logging.getLogger('browser_use')
107 | 	browser_use_logger.propagate = False  # Don't propagate to root logger
108 | 	browser_use_logger.addHandler(console)
109 | 	browser_use_logger.setLevel(root.level)  # Set same level as root logger
110 | 
111 | 	logger = logging.getLogger('browser_use')
112 | 	logger.info('BrowserUse logging setup complete with level %s', log_type)
113 | 	# Silence third-party loggers
114 | 	for logger in [
115 | 		'WDM',
116 | 		'httpx',
117 | 		'selenium',
118 | 		'playwright',
119 | 		'urllib3',
120 | 		'asyncio',
121 | 		'langchain',
122 | 		'openai',
123 | 		'httpcore',
124 | 		'charset_normalizer',
125 | 		'anthropic._base_client',
126 | 		'PIL.PngImagePlugin',
127 | 		'trafilatura.htmlprocessing',
128 | 		'trafilatura',
129 | 	]:
130 | 		third_party = logging.getLogger(logger)
131 | 		third_party.setLevel(logging.ERROR)
132 | 		third_party.propagate = False
133 | 


--------------------------------------------------------------------------------
/browser_use/agent/message_manager/views.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from typing import TYPE_CHECKING, Any
  4 | from warnings import filterwarnings
  5 | 
  6 | from langchain_core._api import LangChainBetaWarning
  7 | from langchain_core.load import dumpd, load
  8 | from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
  9 | from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator
 10 | 
 11 | filterwarnings('ignore', category=LangChainBetaWarning)
 12 | 
 13 | if TYPE_CHECKING:
 14 | 	from browser_use.agent.views import AgentOutput
 15 | 
 16 | 
class MessageMetadata(BaseModel):
	"""Metadata for a message"""

	# Token count attributed to the message; 0 when not supplied.
	tokens: int = 0
	# Optional tag for the message; None when the message is untagged.
	message_type: str | None = None
 22 | 
 23 | 
 24 | class ManagedMessage(BaseModel):
 25 | 	"""A message with its metadata"""
 26 | 
 27 | 	message: BaseMessage
 28 | 	metadata: MessageMetadata = Field(default_factory=MessageMetadata)
 29 | 
 30 | 	model_config = ConfigDict(arbitrary_types_allowed=True)
 31 | 
 32 | 	# https://github.com/pydantic/pydantic/discussions/7558
 33 | 	@model_serializer(mode='wrap')
 34 | 	def to_json(self, original_dump):
 35 | 		"""
 36 | 		Returns the JSON representation of the model.
 37 | 
 38 | 		It uses langchain's `dumps` function to serialize the `message`
 39 | 		property before encoding the overall dict with json.dumps.
 40 | 		"""
 41 | 		data = original_dump(self)
 42 | 
 43 | 		# NOTE: We override the message field to use langchain JSON serialization.
 44 | 		data['message'] = dumpd(self.message)
 45 | 
 46 | 		return data
 47 | 
 48 | 	@model_validator(mode='before')
 49 | 	@classmethod
 50 | 	def validate(
 51 | 		cls,
 52 | 		value: Any,
 53 | 		*,
 54 | 		strict: bool | None = None,
 55 | 		from_attributes: bool | None = None,
 56 | 		context: Any | None = None,
 57 | 	) -> Any:
 58 | 		"""
 59 | 		Custom validator that uses langchain's `loads` function
 60 | 		to parse the message if it is provided as a JSON string.
 61 | 		"""
 62 | 		if isinstance(value, dict) and 'message' in value:
 63 | 			# NOTE: We use langchain's load to convert the JSON string back into a BaseMessage object.
 64 | 			filterwarnings('ignore', category=LangChainBetaWarning)
 65 | 			value['message'] = load(value['message'])
 66 | 		return value
 67 | 
 68 | 
 69 | class MessageHistory(BaseModel):
 70 | 	"""History of messages with metadata"""
 71 | 
 72 | 	messages: list[ManagedMessage] = Field(default_factory=list)
 73 | 	current_tokens: int = 0
 74 | 
 75 | 	model_config = ConfigDict(arbitrary_types_allowed=True)
 76 | 
 77 | 	def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
 78 | 		"""Add message with metadata to history"""
 79 | 		if position is None:
 80 | 			self.messages.append(ManagedMessage(message=message, metadata=metadata))
 81 | 		else:
 82 | 			self.messages.insert(position, ManagedMessage(message=message, metadata=metadata))
 83 | 		self.current_tokens += metadata.tokens
 84 | 
 85 | 	def add_model_output(self, output: 'AgentOutput') -> None:
 86 | 		"""Add model output as AI message"""
 87 | 		tool_calls = [
 88 | 			{
 89 | 				'name': 'AgentOutput',
 90 | 				'args': output.model_dump(mode='json', exclude_unset=True),
 91 | 				'id': '1',
 92 | 				'type': 'tool_call',
 93 | 			}
 94 | 		]
 95 | 
 96 | 		msg = AIMessage(
 97 | 			content='',
 98 | 			tool_calls=tool_calls,
 99 | 		)
100 | 		self.add_message(msg, MessageMetadata(tokens=100))  # Estimate tokens for tool calls
101 | 
102 | 		# Empty tool response
103 | 		tool_message = ToolMessage(content='', tool_call_id='1')
104 | 		self.add_message(tool_message, MessageMetadata(tokens=10))  # Estimate tokens for empty response
105 | 
106 | 	def get_messages(self) -> list[BaseMessage]:
107 | 		"""Get all messages"""
108 | 		return [m.message for m in self.messages]
109 | 
110 | 	def get_total_tokens(self) -> int:
111 | 		"""Get total tokens in history"""
112 | 		return self.current_tokens
113 | 
114 | 	def remove_oldest_message(self) -> None:
115 | 		"""Remove oldest non-system message"""
116 | 		for i, msg in enumerate(self.messages):
117 | 			if not isinstance(msg.message, SystemMessage):
118 | 				self.current_tokens -= msg.metadata.tokens
119 | 				self.messages.pop(i)
120 | 				break
121 | 
122 | 	def remove_last_state_message(self) -> None:
123 | 		"""Remove last state message from history"""
124 | 		if len(self.messages) > 2 and isinstance(self.messages[-1].message, HumanMessage):
125 | 			self.current_tokens -= self.messages[-1].metadata.tokens
126 | 			self.messages.pop()
127 | 
128 | 
class MessageManagerState(BaseModel):
	"""Holds the state for MessageManager"""

	# Message history; a fresh, empty MessageHistory is created per instance.
	history: MessageHistory = Field(default_factory=MessageHistory)
	# Integer counter starting at 1 (presumably the next tool-call id; confirm against MessageManager).
	tool_id: int = 1

	# Allows field types pydantic has no built-in schema for.
	model_config = ConfigDict(arbitrary_types_allowed=True)
136 | 


--------------------------------------------------------------------------------
/chrome-extension/panel.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="utf-8">
  5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
  6 | 
  7 |     <title>OTA user interaction data helper</title>
  8 | 
  9 |     <link rel="stylesheet" href="css/normalize.css">
 10 |     <link rel="stylesheet" href="css/skeleton.css">
 11 |     <link rel="stylesheet" href="css/panel.css">
 12 | </head>
 13 | <body>
 14 | <p class="status" title="Unable to contact content script. Try reloading the page.">(disconnected)</p>
 15 | <nav>
 16 | 	<section class="task-description-section">
 17 | 		<textarea
 18 | 			class="task-description-input"
 19 | 			placeholder="Describe your task here…"
 20 | 			rows="1"></textarea>
 21 | 		<span class="task-description-start" style="display:none;">Current task: </span>
 22 | 		<span class="task-description-task-id" style="display:none;">...</span>
 23 | 		<span class="task-description-label" style="display:none;"></span>
 24 | 	</section>
 25 |     <button class="button-primary record">Start Record</button>
 26 | 	<button class="record-pause" id="record-pause-button-1" disabled>Pause</button>
 27 |     <button class="clear">Clear</button>
 28 | 	<button class="task-visibility-toggle"
 29 |         id="task-visibility-toggle-button"
 30 |         hidden>Hide Task</button>
 31 | </nav>
 32 | <main>
 33 |     <table class="events u-full-width">
 34 |         <thead>
 35 |         <tr>
 36 |             <th>
 37 |                 <div class="type-filters">
 38 |                     <div class="checkbox nodes-added" title="nodes added">
 39 |                         <input type="checkbox" id="nodes-added-filter" checked/>
 40 |                         <label for="nodes-added-filter"></label>
 41 |                     </div>
 42 |                     <div class="checkbox nodes-removed" title="nodes removed">
 43 |                         <input type="checkbox" id="nodes-removed-filter" checked/>
 44 |                         <label for="nodes-removed-filter"></label>
 45 |                     </div>
 46 |                     <div class="checkbox attribute-changed" title="attribute changed">
 47 |                         <input type="checkbox" id="attribute-changed-filter" checked/>
 48 |                         <label for="attribute-changed-filter"></label>
 49 |                     </div>
 50 |                     <div class="checkbox text-changed" title="text changed">
 51 |                         <input type="checkbox" id="text-changed-filter" checked/>
 52 |                         <label for="text-changed-filter"></label>
 53 |                     </div>
 54 |                 </div>
 55 |                 Event <span class="counter">(0)</span>
 56 |             </th>
 57 |             <th>
 58 |                 <input type="text" placeholder="filter" class="target-filter"/>
 59 |                 Target
 60 |             </th>
 61 |             <th>
 62 | 				Details
 63 | 			</th>
 64 | 			<th>
 65 | 				<button class="settings-btn">Settings</button>
 66 | 				<span id="settings-status" style="display:none; color:green;">Saved ✔</span>
 67 | 			</th>
 68 |         </tr>
 69 |         </thead>
 70 |         <tbody></tbody>
 71 |     </table>
 72 | </main>
 73 | 
 74 | <!-- Settings panel (initially hidden) -->
 75 | <section class="settings-panel hidden">
 76 | 	<h5>OTA Action Capturer - Settings</h5>
 77 |   
 78 | 	<label>
 79 | 	  Host
 80 | 	  <input type="text" id="collector-host" placeholder="127.0.0.1">
 81 | 	</label>
 82 |   
 83 | 	<label>
 84 | 	  Port
 85 | 	  <input type="number" id="collector-port" min="1" max="65535" placeholder="4934">
 86 | 	</label>
 87 | 	<label>
 88 | 		<input type="checkbox" id="collector-mask">
 89 | 		Hide sensitive data before sending
 90 | 	</label>
 91 |   
 92 | 	<button id="settings-save"  class="button-primary">Save</button>
 93 | 	<button id="settings-cancel"               >Cancel</button>
 94 | 	<hr>
 95 |   </section>
 96 |   
 97 | <aside class="intro">
 98 | 	<img src="ico/ota-logo-128.png"
 99 | 	alt="OTA logo"
100 | 	style="display:block; margin:0 auto 12px auto; width:64px; height:64px; border-radius:12px;"/>
101 |     <p>Click <em>"Start Record"</em> to start recording your WAP workflow.</p>
102 | 
103 |     <div class="social-buttons">
104 |         <iframe
105 |             src="http://ghbtns.com/github-btn.html?user=OTA-Tech-AI&amp;repo=webagentprotocol&amp;type=watch&amp;count=true&amp;size=small"
106 |             style="background-color: transparent; border: none; overflow:hidden" width="90" height="20"></iframe>
107 | 		<a href="https://otatech.ai"
108 | 			target="_blank"
109 | 			rel="noopener"
110 | 			class="website-btn">
111 | 				Visit OTA
112 | 		</a>
113 |     </div>
114 | </aside>
115 | 
116 | <script src="js/ContentScriptProxy.js"></script>
117 | <script src="js/EventTable.js"></script>
118 | <script src="js/ScrollHelper.js"></script>
119 | <script src="js/panel.js"></script>
120 | 
121 | </body>
122 | </html>
123 | 


--------------------------------------------------------------------------------
/browser_use/agent/message_manager/utils.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import json
  4 | import logging
  5 | import os
  6 | from typing import Any, Optional, Type
  7 | 
  8 | from langchain_core.messages import (
  9 | 	AIMessage,
 10 | 	BaseMessage,
 11 | 	HumanMessage,
 12 | 	SystemMessage,
 13 | 	ToolMessage,
 14 | )
 15 | 
 16 | logger = logging.getLogger(__name__)
 17 | 
 18 | 
 19 | def extract_json_from_model_output(content: str) -> dict:
 20 | 	"""Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON."""
 21 | 	try:
 22 | 		# If content is wrapped in code blocks, extract just the JSON part
 23 | 		if '```' in content:
 24 | 			# Find the JSON content between code blocks
 25 | 			content = content.split('```')[1]
 26 | 			# Remove language identifier if present (e.g., 'json\n')
 27 | 			if '\n' in content:
 28 | 				content = content.split('\n', 1)[1]
 29 | 		# Parse the cleaned content
 30 | 		return json.loads(content)
 31 | 	except json.JSONDecodeError as e:
 32 | 		logger.warning(f'Failed to parse model output: {content} {str(e)}')
 33 | 		raise ValueError('Could not parse response.')
 34 | 
 35 | 
 36 | def convert_input_messages(input_messages: list[BaseMessage], model_name: Optional[str]) -> list[BaseMessage]:
 37 | 	"""Convert input messages to a format that is compatible with the planner model"""
 38 | 	if model_name is None:
 39 | 		return input_messages
 40 | 	if model_name == 'deepseek-reasoner' or 'deepseek-r1' in model_name:
 41 | 		converted_input_messages = _convert_messages_for_non_function_calling_models(input_messages)
 42 | 		merged_input_messages = _merge_successive_messages(converted_input_messages, HumanMessage)
 43 | 		merged_input_messages = _merge_successive_messages(merged_input_messages, AIMessage)
 44 | 		return merged_input_messages
 45 | 	return input_messages
 46 | 
 47 | 
 48 | def _convert_messages_for_non_function_calling_models(input_messages: list[BaseMessage]) -> list[BaseMessage]:
 49 | 	"""Convert messages for non-function-calling models"""
 50 | 	output_messages = []
 51 | 	for message in input_messages:
 52 | 		if isinstance(message, HumanMessage):
 53 | 			output_messages.append(message)
 54 | 		elif isinstance(message, SystemMessage):
 55 | 			output_messages.append(message)
 56 | 		elif isinstance(message, ToolMessage):
 57 | 			output_messages.append(HumanMessage(content=message.content))
 58 | 		elif isinstance(message, AIMessage):
 59 | 			# check if tool_calls is a valid JSON object
 60 | 			if message.tool_calls:
 61 | 				tool_calls = json.dumps(message.tool_calls)
 62 | 				output_messages.append(AIMessage(content=tool_calls))
 63 | 			else:
 64 | 				output_messages.append(message)
 65 | 		else:
 66 | 			raise ValueError(f'Unknown message type: {type(message)}')
 67 | 	return output_messages
 68 | 
 69 | 
 70 | def _merge_successive_messages(messages: list[BaseMessage], class_to_merge: Type[BaseMessage]) -> list[BaseMessage]:
 71 | 	"""Some models like deepseek-reasoner dont allow multiple human messages in a row. This function merges them into one."""
 72 | 	merged_messages = []
 73 | 	streak = 0
 74 | 	for message in messages:
 75 | 		if isinstance(message, class_to_merge):
 76 | 			streak += 1
 77 | 			if streak > 1:
 78 | 				if isinstance(message.content, list):
 79 | 					merged_messages[-1].content += message.content[0]['text']  # type:ignore
 80 | 				else:
 81 | 					merged_messages[-1].content += message.content
 82 | 			else:
 83 | 				merged_messages.append(message)
 84 | 		else:
 85 | 			merged_messages.append(message)
 86 | 			streak = 0
 87 | 	return merged_messages
 88 | 
 89 | 
 90 | def save_conversation(input_messages: list[BaseMessage], response: Any, target: str, encoding: Optional[str] = None) -> None:
 91 | 	"""Save conversation history to file."""
 92 | 
 93 | 	# create folders if not exists
 94 | 	if dirname := os.path.dirname(target):
 95 | 		os.makedirs(dirname, exist_ok=True)
 96 | 
 97 | 	with open(
 98 | 		target,
 99 | 		'w',
100 | 		encoding=encoding,
101 | 	) as f:
102 | 		_write_messages_to_file(f, input_messages)
103 | 		_write_response_to_file(f, response)
104 | 
105 | 
def _write_messages_to_file(f: Any, messages: list[BaseMessage]) -> None:
	"""Write each message as a class-name header, its text, and a blank line.

	List content: only dict items of type 'text' are written (stripped).
	String content: pretty-printed when it parses as JSON, otherwise written
	stripped. Any other content type produces no body lines.
	"""
	for message in messages:
		f.write(f' {message.__class__.__name__} \n')

		body = message.content
		if isinstance(body, str):
			try:
				rendered = json.dumps(json.loads(body), indent=2)
			except json.JSONDecodeError:
				# Not JSON: fall back to the raw text.
				rendered = body.strip()
			f.write(rendered + '\n')
		elif isinstance(body, list):
			for part in body:
				is_text_part = isinstance(part, dict) and part.get('type') == 'text'
				if is_text_part:
					f.write(part['text'].strip() + '\n')

		f.write('\n')
123 | 
124 | 
def _write_response_to_file(f: Any, response: Any) -> None:
	"""Write a ' RESPONSE' header followed by the response as indented JSON (unset fields excluded)."""
	f.write(' RESPONSE\n')
	# Round-trip through json to re-indent the dumped model.
	payload = json.loads(response.model_dump_json(exclude_unset=True))
	pretty = json.dumps(payload, indent=2)
	f.write(pretty)
129 | 


--------------------------------------------------------------------------------
/browser_use/agent/system_prompt.md:
--------------------------------------------------------------------------------
 1 | You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules.
 2 | 
 3 | # Input Format
 4 | Task
 5 | Previous steps
 6 | Current URL
 7 | Open Tabs
 8 | Interactive Elements
 9 | [index]<type>text</type>
10 | - index: Numeric identifier for interaction
11 | - type: HTML element type (button, input, etc.)
12 | - text: Element description
13 | Example:
14 | [33]<button>Submit Form</button>
15 | 
16 | - Only elements with numeric indexes in [] are interactive
17 | - elements without [] provide only context
18 | 
19 | # Response Rules
20 | 1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
21 | {{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not",
22 | "memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz",
23 | "next_goal": "What needs to be done with the next immediate action"}},
24 | "action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}}
25 | 
26 | 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
27 | Common action sequences:
28 | - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
29 | - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
30 | - Actions are executed in the given order
31 | - If the page changes after an action, the sequence is interrupted and you get the new state.
32 | - Only provide the action sequence until an action which changes the page state significantly.
33 | - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
34 | - only use multiple actions if it makes sense.
35 | 
36 | 3. ELEMENT INTERACTION:
37 | - Only use indexes of the interactive elements
38 | - Elements marked with "[]Non-interactive text" are non-interactive
39 | 
40 | 4. NAVIGATION & ERROR HANDLING:
41 | - If no suitable elements exist, use other functions to complete the task
42 | - If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.
43 | - Handle popups/cookies by accepting or closing them
44 | - Use scroll to find elements you are looking for
45 | - If you want to research something, open a new tab instead of using the current tab
46 | - If captcha pops up, try to solve it - else try a different approach
47 | - If the page is not fully loaded, use wait action
48 | 
49 | 5. TASK COMPLETION:
50 | - Use the done action as the last action as soon as the ultimate task is complete
51 | - Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps.
52 | - If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false!
53 | - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
54 | - Don't hallucinate actions
55 | - Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.
56 | 
57 | 6. VISUAL CONTEXT:
58 | - When an image is provided, use it to understand the page layout
59 | - Bounding boxes with labels on their top right corner correspond to element indexes
60 | 
61 | 7. Form filling:
62 | - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
63 | 
64 | 8. Long tasks:
65 | - Keep track of the status and subresults in the memory.
66 | - You are provided with procedural memory summaries that condense previous task history (every N steps). Use these summaries to maintain context about completed actions, current progress, and next steps. The summaries appear in chronological order and contain key information about navigation history, findings, errors encountered, and current state. Refer to these summaries to avoid repeating actions and to ensure consistent progress toward the task goal.
67 | 
68 | 9. Extraction:
69 | - If your task is to find information - call extract_content on the specific pages to get and store the information.
70 | Your responses must always be JSON in the specified format.
71 | 


--------------------------------------------------------------------------------
/browser_use/controller/registry/views.py:
--------------------------------------------------------------------------------
  1 | from typing import Callable, Dict, Type
  2 | 
  3 | from playwright.async_api import Page
  4 | from pydantic import BaseModel, ConfigDict
  5 | 
  6 | 
  7 | class RegisteredAction(BaseModel):
  8 | 	"""Model for a registered action"""
  9 | 
 10 | 	name: str
 11 | 	description: str
 12 | 	function: Callable
 13 | 	param_model: Type[BaseModel]
 14 | 
 15 | 	# filters: provide specific domains or a function to determine whether the action should be available on the given page or not
 16 | 	domains: list[str] | None = None  # e.g. ['*.google.com', 'www.bing.com', 'yahoo.*]
 17 | 	page_filter: Callable[[Page], bool] | None = None
 18 | 
 19 | 	model_config = ConfigDict(arbitrary_types_allowed=True)
 20 | 
 21 | 	def prompt_description(self) -> str:
 22 | 		"""Get a description of the action for the prompt"""
 23 | 		skip_keys = ['title']
 24 | 		s = f'{self.description}: \n'
 25 | 		s += '{' + str(self.name) + ': '
 26 | 		s += str(
 27 | 			{
 28 | 				k: {sub_k: sub_v for sub_k, sub_v in v.items() if sub_k not in skip_keys}
 29 | 				for k, v in self.param_model.model_json_schema()['properties'].items()
 30 | 			}
 31 | 		)
 32 | 		s += '}'
 33 | 		return s
 34 | 
 35 | 
 36 | class ActionModel(BaseModel):
 37 | 	"""Base model for dynamically created action models"""
 38 | 
 39 | 	# this will have all the registered actions, e.g.
 40 | 	# click_element = param_model = ClickElementParams
 41 | 	# done = param_model = None
 42 | 	#
 43 | 	model_config = ConfigDict(arbitrary_types_allowed=True)
 44 | 
 45 | 	def get_index(self) -> int | None:
 46 | 		"""Get the index of the action"""
 47 | 		# {'clicked_element': {'index':5}}
 48 | 		params = self.model_dump(exclude_unset=True).values()
 49 | 		if not params:
 50 | 			return None
 51 | 		for param in params:
 52 | 			if param is not None and 'index' in param:
 53 | 				return param['index']
 54 | 		return None
 55 | 
 56 | 	def set_index(self, index: int):
 57 | 		"""Overwrite the index of the action"""
 58 | 		# Get the action name and params
 59 | 		action_data = self.model_dump(exclude_unset=True)
 60 | 		action_name = next(iter(action_data.keys()))
 61 | 		action_params = getattr(self, action_name)
 62 | 
 63 | 		# Update the index directly on the model
 64 | 		if hasattr(action_params, 'index'):
 65 | 			action_params.index = index
 66 | 
 67 | 
 68 | class ActionRegistry(BaseModel):
 69 | 	"""Model representing the action registry"""
 70 | 
 71 | 	actions: Dict[str, RegisteredAction] = {}
 72 | 
 73 | 	@staticmethod
 74 | 	def _match_domains(domains: list[str] | None, url: str) -> bool:
 75 | 		"""
 76 | 		Match a list of domain glob patterns against a URL.
 77 | 
 78 | 		Args:
 79 | 			domain_patterns: A list of domain patterns that can include glob patterns (* wildcard)
 80 | 			url: The URL to match against
 81 | 
 82 | 		Returns:
 83 | 			True if the URL's domain matches the pattern, False otherwise
 84 | 		"""
 85 | 
 86 | 		if domains is None or not url:
 87 | 			return True
 88 | 
 89 | 		import fnmatch
 90 | 		from urllib.parse import urlparse
 91 | 
 92 | 		# Parse the URL to get the domain
 93 | 		try:
 94 | 			parsed_url = urlparse(url)
 95 | 			if not parsed_url.netloc:
 96 | 				return False
 97 | 
 98 | 			domain = parsed_url.netloc
 99 | 			# Remove port if present
100 | 			if ':' in domain:
101 | 				domain = domain.split(':')[0]
102 | 
103 | 			for domain_pattern in domains:
104 | 				if fnmatch.fnmatch(domain, domain_pattern):  # Perform glob *.matching.*
105 | 					return True
106 | 			return False
107 | 		except Exception:
108 | 			return False
109 | 
110 | 	@staticmethod
111 | 	def _match_page_filter(page_filter: Callable[[Page], bool] | None, page: Page) -> bool:
112 | 		"""Match a page filter against a page"""
113 | 		if page_filter is None:
114 | 			return True
115 | 		return page_filter(page)
116 | 
117 | 	def get_prompt_description(self, page: Page | None = None) -> str:
118 | 		"""Get a description of all actions for the prompt
119 | 
120 | 		Args:
121 | 			page: If provided, filter actions by page using page_filter and domains.
122 | 
123 | 		Returns:
124 | 			A string description of available actions.
125 | 			- If page is None: return only actions with no page_filter and no domains (for system prompt)
126 | 			- If page is provided: return only filtered actions that match the current page (excluding unfiltered actions)
127 | 		"""
128 | 		if page is None:
129 | 			# For system prompt (no page provided), include only actions with no filters
130 | 			return '\n'.join(
131 | 				action.prompt_description()
132 | 				for action in self.actions.values()
133 | 				if action.page_filter is None and action.domains is None
134 | 			)
135 | 
136 | 		# only include filtered actions for the current page
137 | 		filtered_actions = []
138 | 		for action in self.actions.values():
139 | 			if not (action.domains or action.page_filter):
140 | 				# skip actions with no filters, they are already included in the system prompt
141 | 				continue
142 | 
143 | 			domain_is_allowed = self._match_domains(action.domains, page.url)
144 | 			page_is_allowed = self._match_page_filter(action.page_filter, page)
145 | 
146 | 			if domain_is_allowed and page_is_allowed:
147 | 				filtered_actions.append(action)
148 | 
149 | 		return '\n'.join(action.prompt_description() for action in filtered_actions)
150 | 


--------------------------------------------------------------------------------
/chrome-extension/README.md:
--------------------------------------------------------------------------------
  1 | <!-- markdownlint-disable first-line-h1 -->
  2 | <!-- markdownlint-disable html -->
  3 | <!-- markdownlint-disable no-duplicate-header -->
  4 | 
  5 | <div align="center">
  6 |   <img src="assets/beholder-tool-kit-long.png" width="100%" alt="OTA-tool-kits" style="border-radius: 10px;" />
  7 | </div>
  8 | <br>
  9 | <div align="center" style="line-height: 1;">
 10 |   <a href="https://www.otatech.ai/"><img alt="Homepage"
 11 |     src="https://img.shields.io/badge/Visit-otatech.ai-blue"/></a>
 12 |   <a href="https://huggingface.co/OTA-AI/OTA-v1"><img alt="Hugging Face"
 13 |     src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-OTA%20AI-ffc107?color=ffc107&logoColor=white"/></a>
 14 |   <a href="https://github.com/OTA-Tech-AI/webagentprotocol/tree/main/chrome-extension/LICENSE"><img alt="Code License"
 15 |     src="https://img.shields.io/badge/Code_License-GNU-f5de53?&color=f5deff"/></a>
 16 |   <br><br><br>
 17 | </div>
 18 | 
 19 | 
 20 | OTA WAP Browser Action Capturer
 21 | ======================
 22 | 
 23 | OTA browser action capturer is a simple tool that helps you collect browser interactions such as clicking and typing, and transforms them into well-structured data for generating LLM-powered "record and replay" instructions. The action data is organized in JSON format and sent to your local WAP data-collection server.
 24 | 
 25 | To deploy the local WAP server, please refer to: https://github.com/OTA-Tech-AI/webagentprotocol
 26 | WAP (Web Agent protocol) is our standard protocol for AI Agent record-and-play inferencing.
 27 | 
 28 | 
 29 | Installation
 30 | -----
 31 | 
 32 | Install our public **Chrome extension** at [WAP Browser Action Capturer](https://chromewebstore.google.com/detail/wap-browser-action-captur/chikiefojkdpmijbhepipdkadcljlbmh).
 33 | 
 34 | If you want to install this extension locally, please refer to: https://developer.chrome.com/docs/extensions/get-started/tutorial/hello-world
 35 | 
 36 | Usage of Action Capturer
 37 | -----
 38 | 
 39 | ### Prepare
 40 | 
 41 | Open Chrome DevTools and navigate to the **"OTA Action Capturer"** panel and you will see:
 42 | 
 43 | <img src="assets/panelUI.png" alt="Panel UI" width="700"/>
 44 | 
 45 | Make sure the IP address and port in Settings are correct:
 46 | 
 47 | <img src="assets/settings.gif" alt="Settings GIF" width="700"/>
 48 | 
 49 | ### Start a record
 50 | 
 51 | Clearly describe the task you will be working on and click "START RECORD":
 52 | 
 53 | <img src="assets/start-record.gif" alt="Start Record" width="700"/>
 54 | 
 55 | The capturer will record actions such as clicking, typing, navigating etc. only in the current page.
 56 | 
 57 | If the HTML content in the page changed, the event table will present the added/removed/changed nodes. The information of changing will be collected and sent to your local WAP server.
 58 | 
 59 | <img src="assets/recording.gif" alt="Recording" width="700"/>
 60 | 
 61 | An example of the formatted data that you will receive in the WAP backend server looks like this:
 62 | 
 63 | ```json
 64 | {
 65 |   "taskId": "MkCAhQsHgXn7YgaK",
 66 |   "type": "click",
 67 |   "actionTimestamp": 1746325231479,
 68 |   "eventTarget": {
 69 |     "type": "click",
 70 |     "target": "<a ota-use-interactive-target=\"1\" data-ordinal=\"3\" href=\"https://www.allrecipes.com/recipe/68925/cheesy-baked-salmon/\" data-tax-levels=\"\" data-doc-id=\"6592066\" class=\"comp mntl-card-list-card--extendable mntl-universal-card mntl-document-card mntl-card card card--no-image\" id=\"mntl-card-list-card--extendable_3-0\">\n<div class=\"loc card__top\"><div class=\"card__media mntl-image card__media universal-image__container\">...",
 71 |     "targetId": "mntl-card-list-card--extendable_3-0",
 72 |     "targetClass": "comp mntl-card-list-card--extendable mntl-universal-card mntl-document-card mntl-card card card--no-image"
 73 |   },
 74 |   "allEvents": {},
 75 |   "pageHTMLContent": "<header data-tracking-container=\"true\" data-collapsible=\"true\" class=\"comp header mntl-header mntl-header--magazine mntl-header--open-search-bar mntl-header--myr\" id=\"header_1-0\"><a data-tracking-container=\"true\" id=\"mntl-skip-to-content_1-0\" class=\"mntl-skip-to-content mntl-text-link\" rel=\"nocaes\" href=\"#main\"></a><div class=\"mntl-header__menu-top\">..."
 76 | }
 77 | ```
 78 | 
 79 | The extension uses MutationObserver to capture any node changes in the page; please refer to [MutationObserver](https://developer.mozilla.org/en/docs/Web/API/MutationObserver) for more details.
 80 | 
 81 | ### Pause
 82 | During the record, you can click on "PAUSE" to pause the capturer so that no actions will be recorded until you hit "RESUME":
 83 | 
 84 | <img src="assets/pause.gif" alt="Pause" width="700"/>
 85 | 
 86 | Thanks to
 87 | ------
 88 | 
 89 | OTA action capturer is built on top of [DOMListenerExtension](https://github.com/kdzwinel/DOMListenerExtension).
 90 | 
 91 | License
 92 | -------
 93 | 
 94 | This program is free software: you can redistribute it and/or modify
 95 | it under the terms of the GNU General Public License as published by
 96 | the Free Software Foundation, either version 3 of the License, or
 97 | (at your option) any later version.
 98 | 
 99 | This program is distributed in the hope that it will be useful,
100 | but WITHOUT ANY WARRANTY; without even the implied warranty of
101 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
102 | GNU General Public License for more details.
103 | 
104 | You should have received a copy of the GNU General Public License
105 | along with this program.  If not, see <http://www.gnu.org/licenses/>.
106 | 


--------------------------------------------------------------------------------
/utils/action_processing.py:
--------------------------------------------------------------------------------
  1 | import json, re, sys
  2 | from typing import Any, Dict
  3 | from utils.html_cleaner import run_html_sanitizer
  4 | from jinja2 import Template
  5 | from pathlib import Path
  6 | 
  7 | TEMPLATE_DIR = Path("prompts/subgoal_generation")
  8 | 
  9 | def choose_template(action_type: str) -> Path:
 10 |     """Return the correct template file for a given action_type."""
 11 |     match action_type:
 12 |         case "submit":
 13 |             return TEMPLATE_DIR / "submit.md"
 14 |         case "go-back-or-forward":
 15 |             return TEMPLATE_DIR / "go-back-or-forward.md"
 16 |         case "task-start":
 17 |             return TEMPLATE_DIR / "task-start.md"
 18 |         case "task-finish":
 19 |             return TEMPLATE_DIR / "task-finish.md"
 20 |         case _:
 21 |             return TEMPLATE_DIR / "common.md"
 22 | 
 23 | 
 24 | def extract_action_bundle(raw: Dict[str, Any], sanitize: bool = False) -> Dict[str, Any]:
 25 |     """
 26 |     Split the incoming JSON dict into:
 27 |         action         {type, eventTarget}
 28 |         change_events  list from `allEvents`
 29 |         page_content   sanitized or raw HTML
 30 | 
 31 |     Returns a dict with those keys.
 32 |     """
 33 |     # 1. Top‑level type + eventTarget
 34 |     action = {
 35 |         "type": raw.get("type"),
 36 |         "eventTarget": raw.get("eventTarget")
 37 |     }
 38 | 
 39 |     # 2. Change events
 40 |     change_events = raw.get("allEvents", [])
 41 | 
 42 |     if change_events == {}:
 43 |         change_events = "[changes not available]"
 44 | 
 45 |     # 3. Page HTML
 46 |     page_html = raw.get("pageHTMLContent", "")
 47 | 
 48 |     # Use prettify to format the HTML.
 49 |     if sanitize:
 50 |         # Use the top‑level type to decide how to sanitize, if desired.
 51 |         page_html = run_html_sanitizer(page_html, action["type"] or "")
 52 | 
 53 |     page_html = re.sub(r'[\n\r\t\\]+', '', page_html)
 54 |     return {
 55 |         "action_type": raw.get("type"),
 56 |         "action": action,
 57 |         "change_events": change_events,
 58 |         "page_content": page_html
 59 |     }
 60 | 
 61 | 
 62 | def generate_subgoal_speculate_prompt(summary_event: Dict[str, Any], ultimate_goal: str, subtask_name: str, output_path: str) -> None:
 63 |     # 1) bundle relevant pieces
 64 |     grouped_items = extract_action_bundle(summary_event, True)
 65 |     # 2) prepare the data that the template expects
 66 |     context = {
 67 |         "ultimate_goal": ultimate_goal,
 68 |         "action":         grouped_items["action"],
 69 |         "change_events":  grouped_items["change_events"],
 70 |         "page_content": grouped_items["page_content"],
 71 |     }
 72 | 
 73 |     template = choose_template(grouped_items["action_type"])
 74 |     template_text = template.read_text(encoding="utf-8")
 75 |     filled_markdown = Template(template_text).render(**context)
 76 | 
 77 |     # 4) save to  subgoals/subgoal_<YYYYMMDD_HHMMSS>.md
 78 |     output_dir = Path(output_path)
 79 |     output_dir.mkdir(parents=True, exist_ok=True)
 80 |     out_path = output_dir / f"subgoal_{subtask_name}_{grouped_items['action_type']}.md"
 81 |     out_path.write_text(filled_markdown, encoding="utf-8")
 82 | 
 83 |     return out_path
 84 | 
 85 | # ---------------------------------------------------------------------------
 86 | # helper: locate exactly one task-start file and return its taskDescription
 87 | # ---------------------------------------------------------------------------
 88 | def find_task_prompt(data_dir: str | Path) -> str:
 89 |     data_dir = Path(data_dir)
 90 | 
 91 |     # 0️⃣  Does the path exist?
 92 |     if not data_dir.exists():
 93 |         sys.exit(f"[OTA error] given path does not exist: {data_dir}")
 94 | 
 95 |     # 0️⃣b Is it a directory?
 96 |     if not data_dir.is_dir():
 97 |         sys.exit(f"[OTA error] path is not a directory: {data_dir}")
 98 | 
 99 |     # 1️⃣  Gather every *.json recursively (sub-folders included)
100 |     json_paths = sorted(data_dir.rglob("*.json"))
101 |     if not json_paths:
102 |         sys.exit(f"[OTA error] no *.json files found under {data_dir}")
103 | 
104 |     # 2️⃣  First file (by name) must be task-start
105 |     first = json.loads(json_paths[0].read_text(encoding="utf-8"))
106 |     if first.get("type") != "task-start":
107 |         sys.exit("[OTA error] first JSON file is not a task-start record")
108 | 
109 |     # 3️⃣  Collect *all* task-start files
110 |     task_start_files = [
111 |         p for p in json_paths
112 |         if json.loads(p.read_text(encoding="utf-8")).get("type") == "task-start"
113 |     ]
114 |     if len(task_start_files) == 0:
115 |         sys.exit("[OTA error] no task-start file found")
116 |     if len(task_start_files) > 1:
117 |         names = ", ".join(p.name for p in task_start_files)
118 |         sys.exit(f"[OTA error] multiple task-start files detected: {names}")
119 | 
120 |     # 4️⃣  Extract taskDescription
121 |     task_json = json.loads(task_start_files[0].read_text(encoding="utf-8"))
122 |     task_id   = task_json.get("taskId")
123 |     task_desc = task_json.get("taskDescription")
124 |     if not task_desc or not task_id:
125 |         sys.exit(f"[OTA error] task-start file {task_start_files[0].name} "
126 |                  "has no taskDescription or taskId")
127 | 
128 |     return task_desc, task_id
129 | 
130 | 
131 | def load_event_json(path: str | Path) -> Dict[str, Any]:
132 |     """Read the given JSON file and return it as a Python dict."""
133 |     path = Path(path)
134 |     if not path.is_file():
135 |         raise FileNotFoundError(f"Cannot find JSON file: {path}")
136 |     with path.open("r", encoding="utf-8") as f:
137 |         return json.load(f)


--------------------------------------------------------------------------------
/utils/subgoal_generator.py:
--------------------------------------------------------------------------------
  1 | """Sub‑goal batch generator.
  2 | 
  3 | Given a directory path, this helper loads every `*.md` file (each file is
  4 | assumed to contain a prompt) and sends the content to OpenAI (via
  5 | `ask_llm`).  It returns a list of dicts with filename, prompt, and reply.
  6 | """
  7 | from __future__ import annotations
  8 | 
  9 | import json, re
 10 | from pathlib import Path
 11 | from typing import List, Dict, Any, Optional
 12 | 
 13 | from utils.llm import ask_llm
 14 | 
 15 | __all__ = ["generate_subgoals_from_dir"]
 16 | 
 17 | 
 18 | def _load_prompts(dir_path: str | Path) -> List[tuple[Path, str]]:
 19 |     """Return a list of (path, text) for every .md file in *dir_path*."""
 20 |     dir_path = Path(dir_path)
 21 |     paths = sorted(dir_path.glob("*.md"))
 22 |     prompts: List[tuple[Path, str]] = []
 23 |     for p in paths:
 24 |         text = p.read_text(encoding="utf-8").strip()
 25 |         if text:
 26 |             prompts.append((p, text))
 27 |     return prompts
 28 | 
 29 | 
 30 | def generate_subgoals_from_dir(
 31 |     dir_path: str | Path,
 32 |     *,
 33 |     system_prompt: Optional[str] = None,
 34 |     model: str = "gpt-4o-mini",
 35 |     temperature: float = 0.2,
 36 |     save_jsonl: Optional[str | Path] = None,
 37 | ) -> List[Dict[str, Any]]:
 38 |     """Load all .md files under *dir_path*, query the LLM, and return results.
 39 | 
 40 |     Parameters
 41 |     ----------
 42 |     dir_path : str | Path
 43 |         Directory containing `*.md` prompt files.
 44 |     system_prompt : str | None
 45 |         Optional system message for the LLM.
 46 |     model : str
 47 |         OpenAI model name.
 48 |     temperature : float
 49 |         Sampling temperature.
 50 |     save_jsonl : str | Path | None
 51 |         If given, write a JSON‑lines file with each result.
 52 | 
 53 |     Returns
 54 |     -------
 55 |     list[dict]
 56 |         Each dict contains {"file", "prompt", "reply"}.
 57 |     """
 58 |     prompts = _load_prompts(dir_path)
 59 |     if not prompts:
 60 |         raise FileNotFoundError(f"No .md files found in {dir_path}")
 61 | 
 62 |     if save_jsonl:
 63 |         save_path = Path(save_jsonl)
 64 |         if save_path.exists():
 65 |             save_path.unlink()
 66 | 
 67 |     results: List[Dict[str, Any]] = []
 68 | 
 69 |     for idx, (path, prompt_text) in enumerate(prompts, 1):
 70 |         print(f"[{idx}/{len(prompts)}] Querying LLM for {path.name} …")
 71 |         reply = ask_llm(
 72 |             prompt_text,
 73 |             system_prompt=system_prompt,
 74 |             model=model,
 75 |             temperature=temperature,
 76 |         )
 77 |         result = {
 78 |             "file": path.name,
 79 |             "prompt": prompt_text,
 80 |             "reply": reply,
 81 |         }
 82 |         results.append(result)
 83 | 
 84 |         # Optionally append to JSONL file incrementally
 85 |         if save_jsonl:
 86 |             with Path(save_jsonl).open("a", encoding="utf-8") as f:
 87 |                 f.write(json.dumps(result, ensure_ascii=False) + "\n")
 88 | 
 89 |     return results
 90 | 
 91 | # ---------------------------------------------------------------------------
 92 | # JSONL "reply" → next_goal extractor
 93 | # ---------------------------------------------------------------------------
 94 | 
 95 | def _clean_reply(raw_reply: str) -> str:
 96 |     """Remove markdown fences and whitespace from a reply string."""
 97 |     # strip triple back‑tick blocks if present
 98 |     fenced = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)
 99 |     m = fenced.search(raw_reply)
100 |     if m:
101 |         return m.group(1).strip()
102 |     return raw_reply.strip()
103 | 
104 | 
def wap_subgoal_list_generation(
    ultimate_goal: str,
    task_id: str,
    jsonl_path: str | Path,
    out_path: str | Path = "wap_subgoal.json",
) -> List[Dict[str, Any]]:
    """Build the smart-replay sub-goal list from a JSONL file of LLM replies.

    Each line of *jsonl_path* is parsed as JSON and the `next_goal` of its
    `reply` is collected. Lines that are blank, malformed, or lack a usable
    `next_goal` are skipped with a printed note. The list starts with a
    fixed "task starts" entry and ends with a fixed "task done" entry.

    The result is written to *out_path* as a JSON object with the keys
    `ultimate_goal`, `task_id`, `type` ("smart_replay") and `subgoal_list`.

    Returns:
        The sub-goal list: dicts of the form {"index": int, "subgoal": str}.

    Raises:
        FileNotFoundError: when *jsonl_path* is not an existing file.
    """
    jsonl_path = Path(jsonl_path)
    if not jsonl_path.is_file():
        raise FileNotFoundError(jsonl_path)

    goals: List[Dict[str, Any]] = [{"index": 0, "subgoal": "task starts, go for the next sub-goal"}]
    with jsonl_path.open("r", encoding="utf-8") as fh:
        for line_no, line in enumerate(fh, 1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f"[extract] line {line_no}: malformed JSON – skipped")
                continue
            # Valid JSON that is not an object (e.g. a list) has no .get().
            if not isinstance(record, dict):
                print(f"[extract] line {line_no}: record is not a JSON object – skipped")
                continue

            raw_reply = str(record.get("reply", ""))
            cleaned = _clean_reply(raw_reply)
            try:
                reply_json = json.loads(cleaned)
            except json.JSONDecodeError:
                print(f"[extract] line {line_no}: reply not valid JSON – skipped")
                continue
            if not isinstance(reply_json, dict):
                print(f"[extract] line {line_no}: reply is not a JSON object – skipped")
                continue

            goal_text = reply_json.get("next_goal")
            if goal_text:
                # Index follows the position in the output list, not the input line.
                goals.append({"index": len(goals), "subgoal": goal_text})
            else:
                print(f"[extract] line {line_no}: no 'next_goal' key – skipped")

    goals.append({"index": len(goals), "subgoal": "task done"})

    final_output = {
        "ultimate_goal": ultimate_goal,
        "task_id": task_id,
        "type": "smart_replay",
        "subgoal_list": goals
    }
    Path(out_path).write_text(json.dumps(final_output, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[extract] wrote {len(goals)} sub‑goals → {out_path}")
    return goals
156 | 


--------------------------------------------------------------------------------
/wap_replay/generate_mcp_server.py:
--------------------------------------------------------------------------------
  1 | from mcp.server.fastmcp import FastMCP
  2 | import httpx
  3 | import os
  4 | import json
  5 | import argparse
  6 | from typing import Optional
  7 | import utils.llm
  8 | import glob
  9 | from dotenv import load_dotenv
 10 | 
 11 | load_dotenv()
 12 | 
 13 | def extract_ultimate_goal(task_id: str) -> str:
 14 |     """
 15 |     Try to extract ultimate_goal from either exact_replay or smart_replay file.
 16 |     
 17 |     Args:
 18 |         task_id: The task ID to look for in the replay files
 19 |         
 20 |     Returns:
 21 |         The ultimate_goal string if found
 22 |         
 23 |     Raises:
 24 |         FileNotFoundError: If neither replay file exists
 25 |         ValueError: If ultimate_goal field is not found
 26 |     """
 27 |     exact_replay_path = os.path.join(".", "data_processed", "exact_replay", f"wap_exact_replay_list_{task_id}.json")
 28 |     smart_replay_path = os.path.join(".", "data_processed", "smart_replay", f"wap_smart_replay_list_{task_id}.json")
 29 |     
 30 |     for file_path in [exact_replay_path, smart_replay_path]:
 31 |         if os.path.exists(file_path):
 32 |             try:
 33 |                 with open(file_path, 'r') as f:
 34 |                     data = json.load(f)
 35 |                     return data['ultimate_goal']
 36 |             except (json.JSONDecodeError, KeyError):
 37 |                 continue
 38 |     
 39 |     raise ValueError(f"Could not find ultimate_goal in replay files for task_id {task_id}")
 40 | 
 41 | def summarize_goal(ultimate_goal: str) -> str:
 42 |     """
 43 |     Generate a function name from the ultimate goal.
 44 |     This is a placeholder - in practice you would call OpenAI API here.
 45 |     """
 46 |     return utils.llm.ask_llm(f"Summarize the following to a single function name with underscore in plaintext: {ultimate_goal}")
 47 | 
 48 | def create_mcp_server(ultimate_goal: str, function_name: str, task_id: str) -> str:
 49 |     """
 50 |     Creates an MCP server file with the specified parameters.
 51 |     
 52 |     Args:
 53 |         ultimate_goal: The goal description for the MCP
 54 |         function_name: The base name for the functions
 55 |         task_id: The task ID used in file paths
 56 |         
 57 |     Returns:
 58 |         The complete Python code as a string
 59 |     """
 60 |     exact_replay_path = os.path.join(".", "data_processed", "exact_replay", f"wap_exact_replay_list_{task_id}.json")
 61 |     smart_replay_path = os.path.join(".", "data_processed", "smart_replay", f"wap_smart_replay_list_{task_id}.json")
 62 |     
 63 |     smart_docstring = f"smart replay: {ultimate_goal}"
 64 |     exact_docstring = f"exact replay: {ultimate_goal}"
 65 |     
 66 |     code = f'''
 67 | from mcp.server.fastmcp import FastMCP
 68 | import httpx
 69 | 
 70 | mcp = FastMCP("{ultimate_goal}")
 71 | '''
 72 |     
 73 |     # Only include the tool function for the existing replay file
 74 |     if os.path.exists(smart_replay_path):
 75 |         code += f'''
 76 | @mcp.tool()
 77 | async def {function_name}_smart_replay() -> str:
 78 |     """{smart_docstring}"""
 79 |     async with httpx.AsyncClient(timeout=600.0) as client:
 80 |         response = await client.get(
 81 |             "http://localhost:3089/replay",
 82 |             params={{
 83 |                 "concurrent": 1,
 84 |                 "model": "openai",
 85 |                 "file_path": 'data_processed/smart_replay/wap_smart_replay_list_{task_id}.json'
 86 |             }}
 87 |         )
 88 |         return response.text
 89 |     return "FAILED"
 90 | '''
 91 |     if os.path.exists(exact_replay_path):
 92 |         code += f'''
 93 | @mcp.tool()
 94 | async def {function_name}_exact_replay() -> str:
 95 |     """{exact_docstring}"""
 96 |     async with httpx.AsyncClient(timeout=600.0) as client:
 97 |         response = await client.get(
 98 |             "http://localhost:3089/replay",
 99 |             params={{
100 |                 "concurrent": 1,
101 |                 "model": "openai",
102 |                 "file_path": 'data_processed/exact_replay/wap_exact_replay_list_{task_id}.json'
103 |             }}
104 |         )
105 |         return response.text
106 |     return "FAILED"
107 | '''
108 |     
109 |     code += '''
110 | if __name__ == "__main__":
111 |     mcp.run(transport="stdio")
112 | '''
113 |     return code
114 | 
def main():
    """Command-line entry point: generate the MCP server file for `--task_id`.

    The goal is read from the replay files, a function name is derived from
    it, and the generated code is written to
    `mcp_servers/<function_name>_<task_id>_mcp_server.py`. Files previously
    generated for the same task id are removed first. Any failure is
    printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Create MCP server file from replay data')
    parser.add_argument('--task_id', required=True, help='Task ID to process')
    args = parser.parse_args()

    try:
        # Extract ultimate_goal from replay files
        ultimate_goal = extract_ultimate_goal(args.task_id)

        # Generate function name
        function_name = summarize_goal(ultimate_goal)

        # Generate the code
        server_code = create_mcp_server(ultimate_goal, function_name, args.task_id)

        # Create mcp_servers directory if it doesn't exist
        os.makedirs("mcp_servers", exist_ok=True)

        # The function name can differ between runs, so match earlier output
        # for this task by task id and remove it.
        existing_files = glob.glob(os.path.join("mcp_servers", f"*_{args.task_id}_mcp_server.py"))
        for existing_file in existing_files:
            os.remove(existing_file)
            print(f"Removed duplicate: {existing_file}")

        # Save to file in the mcp_servers folder. UTF-8 is the default source
        # encoding for Python files, so write it explicitly.
        filename = os.path.join("mcp_servers", f"{function_name}_{args.task_id}_mcp_server.py")
        with open(filename, "w", encoding="utf-8") as file:
            file.write(server_code)

        print(f"Successfully created/updated {filename}")
    except Exception as e:
        print(f"Error: {e}")
        # The bare exit() helper is injected by the site module and is not
        # guaranteed to exist; raising SystemExit always works.
        raise SystemExit(1)

if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/browser_use/dom/tests/extraction_test.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import time
  3 | 
  4 | from langchain_openai import ChatOpenAI
  5 | 
  6 | from browser_use.browser.browser import Browser, BrowserConfig
  7 | from browser_use.browser.context import BrowserContext, BrowserContextConfig
  8 | from browser_use.dom.service import DomService
  9 | from browser_use.utils import time_execution_sync
 10 | 
 11 | 
 12 | def count_string_tokens(string: str, model: str) -> int:
 13 | 	"""Count the number of tokens in a string using a specified model."""
 14 | 	llm = ChatOpenAI(model=model)
 15 | 	return llm.count_tokens(string)
 16 | 
 17 | 
 18 | async def test_process_html_file():
 19 | 	config = BrowserContextConfig(
 20 | 		cookies_file='cookies3.json',
 21 | 		disable_security=True,
 22 | 		wait_for_network_idle_page_load_time=2,
 23 | 	)
 24 | 
 25 | 	browser = Browser(
 26 | 		config=BrowserConfig(
 27 | 			# chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
 28 | 		)
 29 | 	)
 30 | 	context = BrowserContext(browser=browser, config=config)  # noqa: F821
 31 | 
 32 | 	websites = [
 33 | 		'https://kayak.com/flights',
 34 | 		'https://immobilienscout24.de',
 35 | 		'https://google.com',
 36 | 		'https://amazon.com',
 37 | 		'https://github.com',
 38 | 	]
 39 | 
 40 | 	async with context as context:
 41 | 		page = await context.get_current_page()
 42 | 		dom_service = DomService(page)
 43 | 
 44 | 		for website in websites:
 45 | 			print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')
 46 | 			await page.goto(website)
 47 | 			time.sleep(2)  # Additional wait for dynamic content
 48 | 
 49 | 			async def test_viewport(expansion: int, description: str):
 50 | 				print(f'\n{description}:')
 51 | 				dom_state = await time_execution_sync(f'get_clickable_elements ({description})')(
 52 | 					dom_service.get_clickable_elements
 53 | 				)(highlight_elements=True, viewport_expansion=expansion)
 54 | 
 55 | 				elements = dom_state.element_tree
 56 | 				selector_map = dom_state.selector_map
 57 | 				element_count = len(selector_map.keys())
 58 | 				token_count = count_string_tokens(elements.clickable_elements_to_string(), model='gpt-4o')
 59 | 
 60 | 				print(f'Number of elements: {element_count}')
 61 | 				print(f'Token count: {token_count}')
 62 | 				return element_count, token_count
 63 | 
 64 | 			expansions = [0, 100, 200, 300, 400, 500, 600, 1000, -1, -200]
 65 | 			results = []
 66 | 
 67 | 			for i, expansion in enumerate(expansions):
 68 | 				description = (
 69 | 					f'{i + 1}. Expansion {expansion}px' if expansion >= 0 else f'{i + 1}. All elements ({expansion} expansion)'
 70 | 				)
 71 | 				count, tokens = await test_viewport(expansion, description)
 72 | 				results.append((count, tokens))
 73 | 				input('Press Enter to continue...')
 74 | 				await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
 75 | 
 76 | 			# Print comparison summary
 77 | 			print('\nComparison Summary:')
 78 | 			for i, (count, tokens) in enumerate(results):
 79 | 				expansion = expansions[i]
 80 | 				description = f'Expansion {expansion}px' if expansion >= 0 else 'All elements (-1)'
 81 | 				initial_count, initial_tokens = results[0]
 82 | 				print(f'{description}: {count} elements (+{count - initial_count}), {tokens} tokens')
 83 | 
 84 | 			input('\nPress Enter to continue to next website...')
 85 | 
 86 | 			# Clear highlights before next website
 87 | 			await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
 88 | 
 89 | 
 90 | async def test_focus_vs_all_elements():
 91 | 	config = BrowserContextConfig(
 92 | 		# cookies_file='cookies3.json',
 93 | 		disable_security=True,
 94 | 		wait_for_network_idle_page_load_time=2,
 95 | 	)
 96 | 
 97 | 	browser = Browser(
 98 | 		config=BrowserConfig(
 99 | 			# browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
100 | 		)
101 | 	)
102 | 	context = BrowserContext(browser=browser, config=config)  # noqa: F821
103 | 
104 | 	websites = [
105 | 		'https://en.wikipedia.org/wiki/Humanist_Party_of_Ontario',
106 | 		'https://www.google.com/travel/flights?tfs=CBwQARoJagcIARIDTEpVGglyBwgBEgNMSlVAAUgBcAGCAQsI____________AZgBAQ&tfu=KgIIAw&hl=en-US&gl=US',
107 | 		# 'https://www.concur.com/?&cookie_preferences=cpra',
108 | 		'https://immobilienscout24.de',
109 | 		'https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit',
110 | 		'https://www.zeiss.com/career/en/job-search.html?page=1',
111 | 		'https://www.mlb.com/yankees/stats/',
112 | 		'https://www.amazon.com/s?k=laptop&s=review-rank&crid=1RZCEJ289EUSI&qid=1740202453&sprefix=laptop%2Caps%2C166&ref=sr_st_review-rank&ds=v1%3A4EnYKXVQA7DIE41qCvRZoNB4qN92Jlztd3BPsTFXmxU',
113 | 		'https://codepen.io/geheimschriftstift/pen/mPLvQz',
114 | 		'https://reddit.com',
115 | 		'https://www.google.com/search?q=google+hi&oq=google+hi&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQRRhA0gEIMjI2NmowajSoAgCwAgE&sourceid=chrome&ie=UTF-8',
116 | 		'https://kayak.com/flights',
117 | 		'https://google.com',
118 | 		'https://amazon.com',
119 | 		'https://github.com',
120 | 	]
121 | 
122 | 	async with context as context:
123 | 		page = await context.get_current_page()
124 | 		dom_service = DomService(page)
125 | 
126 | 		for website in websites:
127 | 			# sleep 2
128 | 			await page.goto(website)
129 | 			time.sleep(2)
130 | 
131 | 			while True:
132 | 				try:
133 | 					print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')
134 | 					# time.sleep(2)  # Additional wait for dynamic content
135 | 
136 | 					# First get all elements
137 | 					print('\nGetting all elements:')
138 | 					all_elements_state = await time_execution_sync('get_all_elements')(dom_service.get_clickable_elements)(
139 | 						highlight_elements=True, viewport_expansion=1000
140 | 					)
141 | 
142 | 					selector_map = all_elements_state.selector_map
143 | 					total_elements = len(selector_map.keys())
144 | 					print(f'Total number of elements: {total_elements}')
145 | 
146 | 					print(all_elements_state.element_tree.clickable_elements_to_string())
147 | 
148 | 					answer = input('Press Enter to clear highlights and continue...')
149 | 					if answer == 'q':
150 | 						break
151 | 
152 | 					await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
153 | 
154 | 				except Exception as e:
155 | 					print(f'Error: {e}')
156 | 					pass
157 | 
158 | 
# Manual entry point: runs both interactive checks one after the other, each in its
# own event loop. Both wait on input() and are driven from the terminal.
if __name__ == '__main__':
	asyncio.run(test_focus_vs_all_elements())
	asyncio.run(test_process_html_file())
162 | 


--------------------------------------------------------------------------------
/browser_use/agent/tests.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from browser_use.agent.views import (
  4 | 	ActionResult,
  5 | 	AgentBrain,
  6 | 	AgentHistory,
  7 | 	AgentHistoryList,
  8 | 	AgentOutput,
  9 | )
 10 | from browser_use.browser.views import BrowserState, BrowserStateHistory, TabInfo
 11 | from browser_use.controller.registry.service import Registry
 12 | from browser_use.controller.views import ClickElementAction, DoneAction, ExtractPageContentAction
 13 | from browser_use.dom.views import DOMElementNode
 14 | 
 15 | 
 16 | @pytest.fixture
 17 | def sample_browser_state():
 18 | 	return BrowserState(
 19 | 		url='https://example.com',
 20 | 		title='Example Page',
 21 | 		tabs=[TabInfo(url='https://example.com', title='Example Page', page_id=1)],
 22 | 		screenshot='screenshot1.png',
 23 | 		element_tree=DOMElementNode(
 24 | 			tag_name='root',
 25 | 			is_visible=True,
 26 | 			parent=None,
 27 | 			xpath='',
 28 | 			attributes={},
 29 | 			children=[],
 30 | 		),
 31 | 		selector_map={},
 32 | 	)
 33 | 
 34 | 
 35 | @pytest.fixture
 36 | def action_registry():
 37 | 	registry = Registry()
 38 | 
 39 | 	# Register the actions we need for testing
 40 | 	@registry.action(description='Click an element', param_model=ClickElementAction)
 41 | 	def click_element(params: ClickElementAction, browser=None):
 42 | 		pass
 43 | 
 44 | 	@registry.action(
 45 | 		description='Extract page content',
 46 | 		param_model=ExtractPageContentAction,
 47 | 	)
 48 | 	def extract_page_content(params: ExtractPageContentAction, browser=None):
 49 | 		pass
 50 | 
 51 | 	@registry.action(description='Mark task as done', param_model=DoneAction)
 52 | 	def done(params: DoneAction):
 53 | 		pass
 54 | 
 55 | 	# Create the dynamic ActionModel with all registered actions
 56 | 	return registry.create_action_model()
 57 | 
 58 | 
 59 | @pytest.fixture
 60 | def sample_history(action_registry):
 61 | 	# Create actions with nested params structure
 62 | 	click_action = action_registry(click_element={'index': 1})
 63 | 
 64 | 	extract_action = action_registry(extract_page_content={'value': 'text'})
 65 | 
 66 | 	done_action = action_registry(done={'text': 'Task completed'})
 67 | 
 68 | 	histories = [
 69 | 		AgentHistory(
 70 | 			model_output=AgentOutput(
 71 | 				current_state=AgentBrain(
 72 | 					evaluation_previous_goal='None',
 73 | 					memory='Started task',
 74 | 					next_goal='Click button',
 75 | 				),
 76 | 				action=[click_action],
 77 | 			),
 78 | 			result=[ActionResult(is_done=False)],
 79 | 			state=BrowserStateHistory(
 80 | 				url='https://example.com',
 81 | 				title='Page 1',
 82 | 				tabs=[TabInfo(url='https://example.com', title='Page 1', page_id=1)],
 83 | 				screenshot='screenshot1.png',
 84 | 				interacted_element=[{'xpath': '//button[1]'}],
 85 | 			),
 86 | 		),
 87 | 		AgentHistory(
 88 | 			model_output=AgentOutput(
 89 | 				current_state=AgentBrain(
 90 | 					evaluation_previous_goal='Clicked button',
 91 | 					memory='Button clicked',
 92 | 					next_goal='Extract content',
 93 | 				),
 94 | 				action=[extract_action],
 95 | 			),
 96 | 			result=[
 97 | 				ActionResult(
 98 | 					is_done=False,
 99 | 					extracted_content='Extracted text',
100 | 					error='Failed to extract completely',
101 | 				)
102 | 			],
103 | 			state=BrowserStateHistory(
104 | 				url='https://example.com/page2',
105 | 				title='Page 2',
106 | 				tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)],
107 | 				screenshot='screenshot2.png',
108 | 				interacted_element=[{'xpath': '//div[1]'}],
109 | 			),
110 | 		),
111 | 		AgentHistory(
112 | 			model_output=AgentOutput(
113 | 				current_state=AgentBrain(
114 | 					evaluation_previous_goal='Extracted content',
115 | 					memory='Content extracted',
116 | 					next_goal='Finish task',
117 | 				),
118 | 				action=[done_action],
119 | 			),
120 | 			result=[ActionResult(is_done=True, extracted_content='Task completed', error=None)],
121 | 			state=BrowserStateHistory(
122 | 				url='https://example.com/page2',
123 | 				title='Page 2',
124 | 				tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)],
125 | 				screenshot='screenshot3.png',
126 | 				interacted_element=[{'xpath': '//div[1]'}],
127 | 			),
128 | 		),
129 | 	]
130 | 	return AgentHistoryList(history=histories)
131 | 
132 | 
def test_last_model_output(sample_history: AgentHistoryList):
	"""The last action of the history is the final ``done`` action."""
	expected_action = {'done': {'text': 'Task completed'}}
	final_action = sample_history.last_action()
	print(final_action)
	assert final_action == expected_action
137 | 
138 | 
def test_get_errors(sample_history: AgentHistoryList):
	"""Exactly one error is reported: the one attached to the extract step."""
	reported = sample_history.errors()
	error_count = len(reported)
	assert error_count == 1
	first_error = reported[0]
	assert first_error == 'Failed to extract completely'
143 | 
144 | 
def test_final_result(sample_history: AgentHistoryList):
	"""The final result is the extracted content of the last step."""
	outcome = sample_history.final_result()
	assert outcome == 'Task completed'
147 | 
148 | 
def test_is_done(sample_history: AgentHistoryList):
	"""A history whose last result is marked done reports ``True``."""
	finished = sample_history.is_done()
	assert finished is True
151 | 
152 | 
def test_urls(sample_history: AgentHistoryList):
	"""Both visited URLs appear in the collected URL list."""
	visited = sample_history.urls()
	for expected_url in ('https://example.com', 'https://example.com/page2'):
		assert expected_url in visited
157 | 
158 | 
def test_all_screenshots(sample_history: AgentHistoryList):
	"""Screenshots are returned for every step, in step order."""
	expected_shots = ['screenshot1.png', 'screenshot2.png', 'screenshot3.png']
	captured = sample_history.screenshots()
	assert len(captured) == 3
	assert captured == expected_shots
163 | 
164 | 
def test_all_model_outputs(sample_history: AgentHistoryList):
	"""Each step's first action entry matches the action the fixture recorded."""
	actions = sample_history.model_actions()
	print(f'DEBUG: {actions[0]}')
	assert len(actions) == 3

	# Only the first key/value pair of each action dict is compared.
	expected_first_pairs = (
		('click_element', {'index': 1}),
		('extract_page_content', {'value': 'text'}),
		('done', {'text': 'Task completed'}),
	)
	for action, expected_pair in zip(actions, expected_first_pairs):
		first_pair = next(iter(action.items()))
		assert first_pair == expected_pair
173 | 
174 | 
def test_all_model_outputs_filtered(sample_history: AgentHistoryList):
	"""Filtering by ``click_element`` keeps only the single click action."""
	clicks_only = sample_history.model_actions_filtered(include=['click_element'])
	assert len(clicks_only) == 1
	click_params = clicks_only[0]['click_element']
	assert click_params['index'] == 1
179 | 
180 | 
def test_empty_history():
	"""An empty history has no last action, no result, is not done and has no URLs."""
	history = AgentHistoryList(history=[])

	last_action = history.last_action()
	assert last_action is None

	final_result = history.final_result()
	assert final_result is None

	done_flag = history.is_done()
	assert done_flag is False

	url_count = len(history.urls())
	assert url_count == 0
187 | 
188 | 
189 | # Add a test to verify action creation
def test_action_creation(action_registry):
	"""An action built from the dynamic model dumps back to its nested params."""
	params = {'click_element': {'index': 1}}
	created = action_registry(click_element={'index': 1})
	dumped = created.model_dump(exclude_none=True)
	assert dumped == params
194 | 
195 | 
196 | # run this with:
197 | # pytest browser_use/agent/tests.py
198 | 


--------------------------------------------------------------------------------
/browser_use/dom/views.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | from functools import cached_property
  3 | from typing import TYPE_CHECKING, Dict, List, Optional
  4 | 
  5 | from browser_use.dom.history_tree_processor.view import CoordinateSet, HashedDomElement, ViewportInfo
  6 | from browser_use.utils import time_execution_sync
  7 | 
  8 | # Avoid circular import issues
  9 | if TYPE_CHECKING:
 10 | 	from .views import DOMElementNode
 11 | 
 12 | 
@dataclass(frozen=False)
class DOMBaseNode:
	"""Fields shared by every node of the parsed DOM tree (text and element nodes)."""

	# Visibility flag for the node.
	is_visible: bool
	# Use None as default and set parent later to avoid circular reference issues
	parent: Optional['DOMElementNode']
 18 | 
 19 | 
 20 | @dataclass(frozen=False)
 21 | class DOMTextNode(DOMBaseNode):
 22 | 	text: str
 23 | 	type: str = 'TEXT_NODE'
 24 | 
 25 | 	def has_parent_with_highlight_index(self) -> bool:
 26 | 		current = self.parent
 27 | 		while current is not None:
 28 | 			# stop if the element has a highlight index (will be handled separately)
 29 | 			if current.highlight_index is not None:
 30 | 				return True
 31 | 
 32 | 			current = current.parent
 33 | 		return False
 34 | 
 35 | 	def is_parent_in_viewport(self) -> bool:
 36 | 		if self.parent is None:
 37 | 			return False
 38 | 		return self.parent.is_in_viewport
 39 | 
 40 | 	def is_parent_top_element(self) -> bool:
 41 | 		if self.parent is None:
 42 | 			return False
 43 | 		return self.parent.is_top_element
 44 | 
 45 | 
 46 | @dataclass(frozen=False)
 47 | class DOMElementNode(DOMBaseNode):
 48 | 	"""
 49 | 	xpath: the xpath of the element from the last root node (shadow root or iframe OR document if no shadow root or iframe).
 50 | 	To properly reference the element we need to recursively switch the root node until we find the element (work you way up the tree with `.parent`)
 51 | 	"""
 52 | 
 53 | 	tag_name: str
 54 | 	xpath: str
 55 | 	attributes: Dict[str, str]
 56 | 	children: List[DOMBaseNode]
 57 | 	is_interactive: bool = False
 58 | 	is_top_element: bool = False
 59 | 	is_in_viewport: bool = False
 60 | 	is_ota_interactive_element: bool = False
 61 | 	shadow_root: bool = False
 62 | 	highlight_index: Optional[int] = None
 63 | 	viewport_coordinates: Optional[CoordinateSet] = None
 64 | 	page_coordinates: Optional[CoordinateSet] = None
 65 | 	viewport_info: Optional[ViewportInfo] = None
 66 | 
 67 | 	def __repr__(self) -> str:
 68 | 		tag_str = f'<{self.tag_name}'
 69 | 
 70 | 		# Add attributes
 71 | 		for key, value in self.attributes.items():
 72 | 			tag_str += f' {key}="{value}"'
 73 | 		tag_str += '>'
 74 | 
 75 | 		# Add extra info
 76 | 		extras = []
 77 | 		if self.is_interactive:
 78 | 			extras.append('interactive')
 79 | 		if self.is_top_element:
 80 | 			extras.append('top')
 81 | 		if self.shadow_root:
 82 | 			extras.append('shadow-root')
 83 | 		if self.highlight_index is not None:
 84 | 			extras.append(f'highlight:{self.highlight_index}')
 85 | 		if self.is_in_viewport:
 86 | 			extras.append('in-viewport')
 87 | 		if self.is_ota_interactive_element:
 88 | 			extras.append('ota-interactive-element')
 89 | 
 90 | 		if extras:
 91 | 			tag_str += f' [{", ".join(extras)}]'
 92 | 
 93 | 		return tag_str
 94 | 
 95 | 	@cached_property
 96 | 	def hash(self) -> HashedDomElement:
 97 | 		from browser_use.dom.history_tree_processor.service import (
 98 | 			HistoryTreeProcessor,
 99 | 		)
100 | 
101 | 		return HistoryTreeProcessor._hash_dom_element(self)
102 | 
103 | 	def get_all_text_till_next_clickable_element(self, max_depth: int = -1) -> str:
104 | 		text_parts = []
105 | 
106 | 		def collect_text(node: DOMBaseNode, current_depth: int) -> None:
107 | 			if max_depth != -1 and current_depth > max_depth:
108 | 				return
109 | 
110 | 			# Skip this branch if we hit a highlighted element (except for the current node)
111 | 			if isinstance(node, DOMElementNode) and node != self and node.highlight_index is not None:
112 | 				return
113 | 
114 | 			if isinstance(node, DOMTextNode):
115 | 				text_parts.append(node.text)
116 | 			elif isinstance(node, DOMElementNode):
117 | 				for child in node.children:
118 | 					collect_text(child, current_depth + 1)
119 | 
120 | 		collect_text(self, 0)
121 | 		return '\n'.join(text_parts).strip()
122 | 
123 | 	@time_execution_sync('--clickable_elements_to_string')
124 | 	def clickable_elements_to_string(self, include_attributes: list[str] | None = None) -> str:
125 | 		"""Convert the processed DOM content to HTML."""
126 | 		formatted_text = []
127 | 
128 | 		def process_node(node: DOMBaseNode, depth: int) -> None:
129 | 			if isinstance(node, DOMElementNode):
130 | 				# Add element with highlight_index
131 | 				if node.highlight_index is not None:
132 | 					attributes_str = ''
133 | 					text = node.get_all_text_till_next_clickable_element()
134 | 					if include_attributes:
135 | 						attributes = list(
136 | 							set(
137 | 								[
138 | 									f"{str(key)}={str(value)}"
139 | 									for key, value in node.attributes.items()
140 | 									if key in include_attributes and value != node.tag_name
141 | 								]
142 | 							)
143 | 						)
144 | 						if text in attributes:
145 | 							attributes.remove(text)
146 | 						attributes_str = ';'.join(attributes)
147 | 					line = f'[{node.highlight_index}]<{node.tag_name} '
148 | 					if attributes_str:
149 | 						line += f'{attributes_str}'
150 | 					if text:
151 | 						if attributes_str:
152 | 							line += f'>{text}'
153 | 						else:
154 | 							line += f'{text}'
155 | 					line += '/>'
156 | 					formatted_text.append(line)
157 | 
158 | 				# Process children regardless
159 | 				for child in node.children:
160 | 					process_node(child, depth + 1)
161 | 
162 | 			elif isinstance(node, DOMTextNode):
163 | 				# Add text only if it doesn't have a highlighted parent
164 | 				if not node.has_parent_with_highlight_index() and node.is_visible:  # and node.is_parent_top_element()
165 | 					formatted_text.append(f'{node.text}')
166 | 
167 | 		process_node(self, 0)
168 | 		return '\n'.join(formatted_text)
169 | 
170 | 	def get_file_upload_element(self, check_siblings: bool = True) -> Optional['DOMElementNode']:
171 | 		# Check if current element is a file input
172 | 		if self.tag_name == 'input' and self.attributes.get('type') == 'file':
173 | 			return self
174 | 
175 | 		# Check children
176 | 		for child in self.children:
177 | 			if isinstance(child, DOMElementNode):
178 | 				result = child.get_file_upload_element(check_siblings=False)
179 | 				if result:
180 | 					return result
181 | 
182 | 		# Check siblings only for the initial call
183 | 		if check_siblings and self.parent:
184 | 			for sibling in self.parent.children:
185 | 				if sibling is not self and isinstance(sibling, DOMElementNode):
186 | 					result = sibling.get_file_upload_element(check_siblings=False)
187 | 					if result:
188 | 						return result
189 | 
190 | 		return None
191 | 
192 | 
# Maps an integer key to its element node; DomService keys it by highlight index.
SelectorMap = dict[int, DOMElementNode]
194 | 
195 | 
@dataclass
class DOMState:
	"""Pairs a parsed DOM tree with the selector map of its elements."""

	# Root element of the parsed tree.
	element_tree: DOMElementNode
	# Integer-keyed lookup of element nodes (see SelectorMap).
	selector_map: SelectorMap
200 | 


--------------------------------------------------------------------------------
/browser_use/dom/service.py:
--------------------------------------------------------------------------------
  1 | import gc
  2 | import json
  3 | import logging
  4 | from dataclasses import dataclass
  5 | from importlib import resources
  6 | from typing import TYPE_CHECKING, Optional
  7 | from urllib.parse import urlparse
  8 | 
  9 | if TYPE_CHECKING:
 10 | 	from playwright.async_api import Page
 11 | 
 12 | from browser_use.dom.views import (
 13 | 	DOMBaseNode,
 14 | 	DOMElementNode,
 15 | 	DOMState,
 16 | 	DOMTextNode,
 17 | 	SelectorMap,
 18 | )
 19 | from browser_use.utils import time_execution_async
 20 | 
 21 | logger = logging.getLogger(__name__)
 22 | 
 23 | 
@dataclass
class ViewportInfo:
	"""Viewport dimensions attached to element nodes when the node data has a 'viewport' entry."""

	# Taken from node_data['viewport']['width'].
	width: int
	# Taken from node_data['viewport']['height'].
	height: int
 28 | 
 29 | 
 30 | class DomService:
 31 | 	def __init__(self, page: 'Page'):
 32 | 		self.page = page
 33 | 		self.xpath_cache = {}
 34 | 
 35 | 		self.js_code = resources.files('browser_use.dom').joinpath('buildDomTree.js').read_text()
 36 | 
 37 | 	# region - Clickable elements
 38 | 	@time_execution_async('--get_clickable_elements')
 39 | 	async def get_clickable_elements(
 40 | 		self,
 41 | 		highlight_elements: bool = True,
 42 | 		focus_element: int = -1,
 43 | 		viewport_expansion: int = 0,
 44 | 	) -> DOMState:
 45 | 		element_tree, selector_map = await self._build_dom_tree(highlight_elements, focus_element, viewport_expansion)
 46 | 		return DOMState(element_tree=element_tree, selector_map=selector_map)
 47 | 
 48 | 	@time_execution_async('--get_cross_origin_iframes')
 49 | 	async def get_cross_origin_iframes(self) -> list[str]:
 50 | 		# invisible cross-origin iframes are used for ads and tracking, dont open those
 51 | 		hidden_frame_urls = await self.page.locator('iframe').filter(visible=False).evaluate_all('e => e.map(e => e.src)')
 52 | 
 53 | 		is_ad_url = lambda url: any(
 54 | 			domain in urlparse(url).netloc for domain in ('doubleclick.net', 'adroll.com', 'googletagmanager.com')
 55 | 		)
 56 | 
 57 | 		return [
 58 | 			frame.url
 59 | 			for frame in self.page.frames
 60 | 			if urlparse(frame.url).netloc  # exclude data:urls and about:blank
 61 | 			and urlparse(frame.url).netloc != urlparse(self.page.url).netloc  # exclude same-origin iframes
 62 | 			and frame.url not in hidden_frame_urls  # exclude hidden frames
 63 | 			and not is_ad_url(frame.url)  # exclude most common ad network tracker frame URLs
 64 | 		]
 65 | 
 66 | 	@time_execution_async('--build_dom_tree')
 67 | 	async def _build_dom_tree(
 68 | 		self,
 69 | 		highlight_elements: bool,
 70 | 		focus_element: int,
 71 | 		viewport_expansion: int,
 72 | 	) -> tuple[DOMElementNode, SelectorMap]:
 73 | 		if await self.page.evaluate('1+1') != 2:
 74 | 			raise ValueError('The page cannot evaluate javascript code properly')
 75 | 
 76 | 		if self.page.url == 'about:blank':
 77 | 			# short-circuit if the page is a new empty tab for speed, no need to inject buildDomTree.js
 78 | 			return (
 79 | 				DOMElementNode(
 80 | 					tag_name='body',
 81 | 					xpath='',
 82 | 					attributes={},
 83 | 					children=[],
 84 | 					is_visible=False,
 85 | 					parent=None,
 86 | 				),
 87 | 				{},
 88 | 			)
 89 | 
 90 | 		# NOTE: We execute JS code in the browser to extract important DOM information.
 91 | 		#       The returned hash map contains information about the DOM tree and the
 92 | 		#       relationship between the DOM elements.
 93 | 		debug_mode = logger.getEffectiveLevel() == logging.DEBUG
 94 | 		args = {
 95 | 			'doHighlightElements': highlight_elements,
 96 | 			'focusHighlightIndex': focus_element,
 97 | 			'viewportExpansion': viewport_expansion,
 98 | 			'debugMode': debug_mode,
 99 | 		}
100 | 
101 | 		try:
102 | 			eval_page: dict = await self.page.evaluate(self.js_code, args)
103 | 		except Exception as e:
104 | 			logger.error('Error evaluating JavaScript: %s', e)
105 | 			raise
106 | 
107 | 		# Only log performance metrics in debug mode
108 | 		if debug_mode and 'perfMetrics' in eval_page:
109 | 			logger.debug(
110 | 				'DOM Tree Building Performance Metrics for: %s\n%s',
111 | 				self.page.url,
112 | 				json.dumps(eval_page['perfMetrics'], indent=2),
113 | 			)
114 | 
115 | 		return await self._construct_dom_tree(eval_page)
116 | 
117 | 	@time_execution_async('--construct_dom_tree')
118 | 	async def _construct_dom_tree(
119 | 		self,
120 | 		eval_page: dict,
121 | 	) -> tuple[DOMElementNode, SelectorMap]:
122 | 		js_node_map = eval_page['map']
123 | 		js_root_id = eval_page['rootId']
124 | 
125 | 		selector_map = {}
126 | 		node_map = {}
127 | 
128 | 		for id, node_data in js_node_map.items():
129 | 			node, children_ids = self._parse_node(node_data)
130 | 			if node is None:
131 | 				continue
132 | 
133 | 			node_map[id] = node
134 | 
135 | 			if isinstance(node, DOMElementNode) and node.highlight_index is not None:
136 | 				selector_map[node.highlight_index] = node
137 | 
138 | 			# NOTE: We know that we are building the tree bottom up
139 | 			#       and all children are already processed.
140 | 			if isinstance(node, DOMElementNode):
141 | 				for child_id in children_ids:
142 | 					if child_id not in node_map:
143 | 						continue
144 | 
145 | 					child_node = node_map[child_id]
146 | 
147 | 					child_node.parent = node
148 | 					node.children.append(child_node)
149 | 
150 | 		html_to_dict = node_map[str(js_root_id)]
151 | 
152 | 		del node_map
153 | 		del js_node_map
154 | 		del js_root_id
155 | 
156 | 		gc.collect()
157 | 
158 | 		if html_to_dict is None or not isinstance(html_to_dict, DOMElementNode):
159 | 			raise ValueError('Failed to parse HTML to dictionary')
160 | 
161 | 		return html_to_dict, selector_map
162 | 
163 | 	def _parse_node(
164 | 		self,
165 | 		node_data: dict,
166 | 	) -> tuple[Optional[DOMBaseNode], list[int]]:
167 | 		if not node_data:
168 | 			return None, []
169 | 
170 | 		# Process text nodes immediately
171 | 		if node_data.get('type') == 'TEXT_NODE':
172 | 			text_node = DOMTextNode(
173 | 				text=node_data['text'],
174 | 				is_visible=node_data['isVisible'],
175 | 				parent=None,
176 | 			)
177 | 			return text_node, []
178 | 
179 | 		# Process coordinates if they exist for element nodes
180 | 
181 | 		viewport_info = None
182 | 
183 | 		if 'viewport' in node_data:
184 | 			viewport_info = ViewportInfo(
185 | 				width=node_data['viewport']['width'],
186 | 				height=node_data['viewport']['height'],
187 | 			)
188 | 
189 | 		element_node = DOMElementNode(
190 | 			tag_name=node_data['tagName'],
191 | 			xpath=node_data['xpath'],
192 | 			attributes=node_data.get('attributes', {}),
193 | 			children=[],
194 | 			is_visible=node_data.get('isVisible', False),
195 | 			is_interactive=node_data.get('isInteractive', False),
196 | 			is_top_element=node_data.get('isTopElement', False),
197 | 			is_in_viewport=node_data.get('isInViewport', False),
198 | 			is_ota_interactive_element=node_data.get('isOTAInteractiveElement', False),
199 | 			highlight_index=node_data.get('highlightIndex'),
200 | 			shadow_root=node_data.get('shadowRoot', False),
201 | 			parent=None,
202 | 			viewport_info=viewport_info,
203 | 		)
204 | 
205 | 		children_ids = node_data.get('children', [])
206 | 
207 | 		return element_node, children_ids
208 | 


--------------------------------------------------------------------------------
/browser_use/agent/prompts.py:
--------------------------------------------------------------------------------
  1 | import importlib.resources
  2 | from datetime import datetime
  3 | from typing import TYPE_CHECKING, List, Optional, Union
  4 | 
  5 | from langchain_core.messages import HumanMessage, SystemMessage
  6 | 
  7 | if TYPE_CHECKING:
  8 | 	from browser_use.agent.views import ActionResult, AgentStepInfo
  9 | 	from browser_use.browser.views import BrowserState
 10 | 
 11 | 
 12 | class SystemPrompt:
 13 | 	def __init__(
 14 | 		self,
 15 | 		action_description: str,
 16 | 		max_actions_per_step: int = 10,
 17 | 		override_system_message: Optional[str] = None,
 18 | 		extend_system_message: Optional[str] = None,
 19 | 		replay_mode: str = None
 20 | 	):
 21 | 		self.default_action_description = action_description
 22 | 		self.max_actions_per_step = max_actions_per_step
 23 | 		self.system_prompt_path = 'system_prompt_wap_replay.md' if replay_mode == "smart_replay" else 'system_prompt.md'
 24 | 		prompt = ''
 25 | 		if override_system_message:
 26 | 			prompt = override_system_message
 27 | 		else:
 28 | 			self._load_prompt_template()
 29 | 			prompt = self.prompt_template.format(max_actions=self.max_actions_per_step)
 30 | 
 31 | 		if extend_system_message:
 32 | 			prompt += f'\n{extend_system_message}'
 33 | 
 34 | 		self.system_message = SystemMessage(content=prompt)
 35 | 
 36 | 	def _load_prompt_template(self) -> None:
 37 | 		"""Load the prompt template from the markdown file."""
 38 | 		try:
 39 | 			# This works both in development and when installed as a package
 40 | 			with importlib.resources.files('browser_use.agent').joinpath(self.system_prompt_path).open('r') as f:
 41 | 				self.prompt_template = f.read()
 42 | 		except Exception as e:
 43 | 			raise RuntimeError(f'Failed to load system prompt template: {e}')
 44 | 
 45 | 	def get_system_message(self) -> SystemMessage:
 46 | 		"""
 47 | 		Get the system prompt for the agent.
 48 | 
 49 | 		Returns:
 50 | 		    SystemMessage: Formatted system prompt
 51 | 		"""
 52 | 		return self.system_message
 53 | 
 54 | 
 55 | # Functions:
 56 | # {self.default_action_description}
 57 | 
 58 | # Example:
 59 | # {self.example_response()}
 60 | # Your AVAILABLE ACTIONS:
 61 | # {self.default_action_description}
 62 | 
 63 | 
 64 | class AgentMessagePrompt:
 65 | 	def __init__(
 66 | 		self,
 67 | 		state: 'BrowserState',
 68 | 		result: Optional[List['ActionResult']] = None,
 69 | 		include_attributes: list[str] = [],
 70 | 		step_info: Optional['AgentStepInfo'] = None,
 71 | 		subgoals: Optional[list] = None,
 72 | 	):
 73 | 		self.state = state
 74 | 		self.result = result
 75 | 		self.include_attributes = include_attributes
 76 | 		self.step_info = step_info
 77 | 		self.subgoals = subgoals or []
 78 | 
 79 | 	def get_user_message(self, use_vision: bool = True, is_smart_replay: bool = False) -> HumanMessage:
 80 | 		elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)
 81 | 
 82 | 		has_content_above = (self.state.pixels_above or 0) > 0
 83 | 		has_content_below = (self.state.pixels_below or 0) > 0
 84 | 
 85 | 		if elements_text != '':
 86 | 			if has_content_above:
 87 | 				elements_text = (
 88 | 					f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
 89 | 				)
 90 | 			else:
 91 | 				elements_text = f'[Start of page]\n{elements_text}'
 92 | 			if has_content_below:
 93 | 				elements_text = (
 94 | 					f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...'
 95 | 				)
 96 | 			else:
 97 | 				elements_text = f'{elements_text}\n[End of page]'
 98 | 		else:
 99 | 			elements_text = 'empty page'
100 | 
101 | 		if self.step_info:
102 | 			step_info_description = f'Current step: {self.step_info.step_number + 1}/{self.step_info.max_steps}'
103 | 		else:
104 | 			step_info_description = ''
105 | 		time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
106 | 		step_info_description += f'Current date and time: {time_str}'
107 | 
108 | 
109 | 		state_description = ""
110 | 
111 | 		if not is_smart_replay:
112 | 			state_description = f"""
113 | [Task history memory ends]
114 | [Current state starts here]
115 | The following is one-time information - if you need to remember it write it to memory:
116 | Current url: {self.state.url}
117 | Available tabs:
118 | {self.state.tabs}
119 | Interactive elements from top layer of the current page inside the viewport:
120 | {elements_text}
121 | {step_info_description}
122 | """
123 | 		else:
124 | 			state_description = f"""
125 | [Task history memory ends]
126 | [sub-goals start here]
127 | Previous sub-goal: {self.subgoals[0]}
128 | Current sub-goal: {self.subgoals[1]}
129 | [sub-goals end]
130 | [Current state starts here]
131 | The following is one-time information - if you need to remember it write it to memory:
132 | Current url: {self.state.url}
133 | Available tabs:
134 | {self.state.tabs}
135 | Interactive elements from top layer of the current page inside the viewport:
136 | {elements_text}
137 | {step_info_description}
138 | """
139 | 
140 | 		if self.result:
141 | 			for i, result in enumerate(self.result):
142 | 				if result.extracted_content:
143 | 					state_description += f'\nAction result {i + 1}/{len(self.result)}: {result.extracted_content}'
144 | 				if result.error:
145 | 					# only use last line of error
146 | 					error = result.error.split('\n')[-1]
147 | 					state_description += f'\nAction error {i + 1}/{len(self.result)}: ...{error}'
148 | 
149 | 		if self.state.screenshot and use_vision is True:
150 | 			# Format message for vision model
151 | 			return HumanMessage(
152 | 				content=[
153 | 					{'type': 'text', 'text': state_description},
154 | 					{
155 | 						'type': 'image_url',
156 | 						'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},  # , 'detail': 'low'
157 | 					},
158 | 				]
159 | 			)
160 | 
161 | 		return HumanMessage(content=state_description)
162 | 
163 | 
164 | class PlannerPrompt(SystemPrompt):
165 | 	def get_system_message(self, is_planner_reasoning) -> Union[SystemMessage, HumanMessage]:
166 | 		planner_prompt_text = """You are a planning agent that helps break down tasks into smaller steps and reason about the current state.
167 | Your role is to:
168 | 1. Analyze the current state and history
169 | 2. Evaluate progress towards the ultimate goal
170 | 3. Identify potential challenges or roadblocks
171 | 4. Suggest the next high-level steps to take
172 | 
173 | Inside your messages, there will be AI messages from different agents with different formats.
174 | 
175 | Your output format should be always a JSON object with the following fields:
176 | {
177 |     "state_analysis": "Brief analysis of the current state and what has been done so far",
178 |     "progress_evaluation": "Evaluation of progress towards the ultimate goal (as percentage and description)",
179 |     "challenges": "List any potential challenges or roadblocks",
180 |     "next_steps": "List 2-3 concrete next steps to take",
181 |     "reasoning": "Explain your reasoning for the suggested next steps"
182 | }
183 | 
184 | Ignore the other AI messages output structures.
185 | 
186 | Keep your responses concise and focused on actionable insights."""
187 | 
188 | 		if is_planner_reasoning:
189 | 			return HumanMessage(content=planner_prompt_text)
190 | 		else:
191 | 			return SystemMessage(content=planner_prompt_text)
192 | 


--------------------------------------------------------------------------------
/chrome-extension/css/panel.css:
--------------------------------------------------------------------------------
  1 | /* Styles for the extension's events panel. */
  2 | 
  3 | @font-face {
  4 |     font-family: 'Raleway';
  5 |     font-style: normal;
  6 |     font-weight: 400;
  7 |     src: local('Raleway'), url(../other/Raleway.woff2) format('woff2');
  8 |     unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000;
  9 | }
 10 | 
 11 | * {
 12 |     box-sizing: border-box;
 13 | }
 14 | 
 15 | body {
 16 |     padding: 10px;
 17 |     background: white;
 18 | }
 19 | 
 20 | /* Centered in its containing block */
 21 | .intro {
 22 |     position: absolute;
 23 |     left: 50%;
 24 |     top: 50%;
 25 |     transform: translate(-50%, -50%);
 26 | 
 27 |     display: flex;
 28 |     flex-direction: column;
 29 |     align-items: center;
 30 |     text-align: center;
 31 | }
 32 | 
 33 | /* Translucent bar pinned to the bottom of the viewport */
 34 | nav {
 35 |     position: fixed;
 36 |     z-index: 1;
 37 |     bottom: 0;
 38 |     left: 0;
 39 |     right: 0;
 40 |     background: rgba(255, 255, 255, 0.7);
 41 |     padding: 10px;
 42 | }
 43 | 
 44 | nav > button {
 45 |     margin-bottom: 0;
 46 |     background: white;
 47 | }
 48 | 
 49 | .top {
 50 |     opacity: 1;
 51 |     transition: opacity 300ms;
 52 | }
 53 | 
 54 | .top.hidden {
 55 |     pointer-events: none;
 56 |     opacity: 0;
 57 | }
 58 | 
 59 | /* Status hint in the top-right corner; not displayed while connected */
 60 | .status {
 61 |     position: fixed;
 62 |     top: 10px;
 63 |     right: 10px;
 64 |     color: rgba(139, 0, 0, 0.5);
 65 |     font-size: small;
 66 |     z-index: 1;
 67 |     cursor: help;
 68 | }
 69 | 
 70 | .status.connected {
 71 |     display: none;
 72 | }
 73 | 
 74 | /*
 75 | Events table
 76 | */
 77 | 
 78 | .events {
 79 |     margin-bottom: 50px;
 80 |     table-layout: fixed;
 81 | }
 82 | 
 83 | .events thead th {
 84 |     vertical-align: bottom;
 85 | }
 86 | 
 87 | .events thead th input[type=text] {
 88 |     margin-bottom: 9px;
 89 |     height: 30px;
 90 |     font-weight: normal;
 91 |     width: 100%;
 92 |     min-width: 55px;
 93 | }
 94 | 
 95 | .events thead th .checkbox {
 96 |     margin-bottom: 9px;
 97 | }
 98 | 
 99 | .events td {
100 |     overflow-wrap: break-word; /* standard name of the legacy `word-wrap` property */
101 | }
102 | 
103 | .events .time {
104 |     display: block;
105 |     color: #b6b6b6;
106 |     font-size: small;
107 | }
108 | 
109 | /* Left border of the first cell is colored per event type */
110 | .events td:first-child {
111 |     border-left: solid transparent 5px;
112 |     padding-left: 4px;
113 | }
114 | 
115 | .events tr.nodes-added td:first-child {
116 |     border-left-color: rgb(138,219,246);
117 | }
118 | 
119 | .events tr.nodes-removed td:first-child {
120 |     border-left-color: rgb(255,198,139);
121 | }
122 | 
123 | .events tr.text-changed td:first-child {
124 |     border-left-color: rgb(254,239,139);
125 | }
126 | 
127 | .events tr.attribute-changed td:first-child {
128 |     border-left-color: rgb(179,146,248);
129 | }
130 | 
131 | .events th:nth-child(1), .events td:nth-child(1) {
132 |     min-width: 155px;
133 | }
134 | 
135 | /* Currently declares nothing: the only declaration is commented out */
136 | .events th:nth-child(3), .events td:nth-child(3) {
137 |     /* width: 60%; */
138 | }
139 | 
140 | /* Cell content scrolls once it exceeds the maximum height */
141 | .events td > div {
142 |     max-height: 85px;
143 |     overflow: auto;
144 | }
145 | 
146 | .events td:nth-child(3) hr {
147 |     margin-top: 2px;
148 |     margin-bottom: 2px;
149 | }
150 | 
151 | .events thead .counter {
152 |     color: #b6b6b6;
153 | }
154 | 
155 | .node:hover {
156 |     text-decoration: underline;
157 |     cursor: pointer;
158 | }
159 | 
160 | /*
161 | Filter
162 | Rows are hidden by default; a row is displayed only when it has `target-match`
163 | and the tbody carries the `-visible` class for the row's event type.
164 | */
165 | 
166 | .events tbody tr {
167 |     display: none;
168 | }
169 | 
170 | .events tbody.nodes-added-visible tr.target-match.nodes-added,
171 | .events tbody.nodes-removed-visible tr.target-match.nodes-removed,
172 | .events tbody.text-changed-visible tr.target-match.text-changed,
173 | .events tbody.attribute-changed-visible tr.target-match.attribute-changed {
174 |     display: table-row;
175 | }
176 | 
177 | /*
178 | Checkbox
179 | The native input is invisible; the label draws the box and its ::after draws the tick.
180 | */
181 | 
182 | .checkbox {
183 |     width: 20px;
184 |     position: relative;
185 |     display: inline-block;
186 | }
187 | 
188 | .checkbox label {
189 |     cursor: pointer;
190 |     position: absolute;
191 |     width: 20px;
192 |     height: 20px;
193 |     top: 0;
194 |     border-radius: 4px;
195 |     margin: 0;
196 | }
197 | 
198 | .checkbox label::after {
199 |     opacity: 0;
200 |     content: '';
201 |     position: absolute;
202 |     width: 9px;
203 |     height: 5px;
204 |     background: transparent;
205 |     top: 4px;
206 |     left: 4px;
207 |     border: 3px solid #fcfff4;
208 |     border-top: none;
209 |     border-right: none;
210 | 
211 |     transform: rotate(-45deg);
212 | }
213 | 
214 | .checkbox label:hover::after {
215 |     opacity: 0.3;
216 | }
217 | 
218 | .checkbox input {
219 |     visibility: hidden;
220 |     margin: 0;
221 | }
222 | 
223 | .checkbox input[type=checkbox]:checked + label::after {
224 |     opacity: 1;
225 | }
226 | 
227 | .checkbox.nodes-added label {
228 |     background: rgb(138,219,246);
229 | }
230 | 
231 | .checkbox.nodes-removed label {
232 |     background: rgb(255,198,139);
233 | }
234 | 
235 | .checkbox.text-changed label {
236 |     background: rgb(254,239,139);
237 | }
238 | 
239 | .checkbox.attribute-changed label {
240 |     background: rgb(179,146,248);
241 | }
242 | 
243 | /*
244 | Task description
245 | */
246 | 
247 | .task-description-input {
248 |     width: 100%;
249 |     padding: 8px;
250 |     font-size: 14px;
251 |     min-height: 20px;
252 |     margin-bottom: 0;
253 |     resize: none;
254 |     overflow: hidden;
255 |     box-sizing: border-box;
256 |     line-height: 1.4;
257 | }
258 | 
259 | .task-description-input.invalid {
260 |     border: 1px solid #e74c3c;
261 | }
262 | 
263 | .task-description-section {
264 |     padding-bottom: 10px;
265 | }
266 | 
267 | /* The label and the task id share everything except their background color */
268 | .task-description-label,
269 | .task-description-task-id {
270 |     display: none; /* Default hidden */
271 |     font-size: 16px;
272 |     font-weight: bold;
273 |     margin-top: 10px;
274 |     margin-bottom: 100px;
275 |     padding: 5px 10px;
276 |     color: #333;
277 |     border: 1px solid #ccc;
278 |     border-radius: 5px;
279 |     white-space: pre-wrap; /* Allow line breaks if task description is long */
280 | }
281 | 
282 | .task-description-label {
283 |     background-color: #f0f0f0;
284 | }
285 | 
286 | .task-description-task-id {
287 |     background-color: hsl(52, 100%, 57%);
288 | }
289 | 
290 | .hidden { display: none !important; }
291 | 
292 | /*
293 | Settings
294 | */
295 | 
296 | .settings-panel {
297 |     padding: 10px;
298 |     border-top: 1px solid #ddd;
299 | }
300 | 
301 | .settings-panel label {
302 |     display: block;
303 |     margin: 6px 0;
304 | }
305 | 
306 | /* Make the header a positioning context */
307 | table.events thead {
308 |     position: relative;
309 | }
310 | 
311 | /* Pin the button */
312 | .settings-btn {
313 |     position: absolute;
314 |     top: 4px;
315 |     right: 8px;
316 |     background-color: #e5e5e5;
317 | }
318 | 
319 | /*
320 | Link buttons
321 | */
322 | 
323 | .social-buttons {
324 |     display: flex;
325 |     align-items: center;
326 |     gap: 8px;
327 | }
328 | 
329 | .website-btn {
330 |     font-family: inherit;
331 |     font-size: 10px;
332 |     font-weight: 600;
333 |     line-height: 1;
334 |     padding: 5px 10px;
335 |     color: #fff;
336 |     background: #000d1d;
337 |     border-radius: 15px;
338 |     text-decoration: none;
339 | }
340 | 
341 | .website-btn:hover,
342 | .website-btn:focus {
343 |     color: #fff;
344 |     background: #00223d;
345 |     outline: none;
346 | }
347 | 
348 | .website-btn:active {
349 |     color: #fff;
350 |     background: #00294e;
351 | }


--------------------------------------------------------------------------------
/chrome-extension/js/EventTable.js:
--------------------------------------------------------------------------------
  1 | (function () {
  2 |     "use strict";
  3 | 
  4 |     function formatNode(node) {
  5 |         return '<span class="node" data-nodeid="' + node.nodeId + '">' + node.selector + '</span>';
  6 |     }
  7 | 
  8 |     function formatValue(value) {
  9 |         if (value === null) {
 10 |             return 'null';
 11 |         } else if (value === undefined) {
 12 |             return 'undefined';
 13 |         } else {
 14 |             return '"' + value + '"';
 15 |         }
 16 |     }
 17 | 
 18 |     function momentJS(date, timeOnly) {
 19 |         let hours = date.getHours();
 20 |         let minutes = date.getMinutes();
 21 |         let seconds = date.getSeconds();
 22 |         hours = hours % 24;
 23 |         minutes = minutes < 10 ? '0' + minutes : minutes;
 24 |         seconds = seconds < 10 ? '0' + seconds : seconds;
 25 |         const strTime = hours + ':' + minutes + ':' + seconds;
 26 | 
 27 |         if (timeOnly) {
 28 |             return strTime;
 29 |         }
 30 | 
 31 |         return date.getMonth()+1 + "/" + date.getDate() + "/" + date.getFullYear() + " " + strTime + ":" + date.getMilliseconds();
 32 |       }
 33 | 
 34 |     function formatDate(timestamp) {
 35 |         const date = new Date(timestamp);
 36 | 
 37 |         return `<span class="time" title="${momentJS(date, false)}">${momentJS(date, true)}</span>`;
 38 |     }
 39 | 
 40 |     function formatEventDetails(event) {
 41 |         var details = "";
 42 |         switch (event.type) {
 43 |             case "nodes added":
 44 |                 details = event.nodes.length + ' node(s) added: ' +
 45 |                 '<em>' + (event.nodes.map(formatNode)).join('</em>, <em>') + '</em>';
 46 |                 break;
 47 |             case "nodes removed":
 48 |                 details = event.nodes.length + ' node(s) removed: ' +
 49 |                 '<em>' + (event.nodes.map(formatNode)).join('</em>, <em>') + '</em>';
 50 |                 break;
 51 |             case "attribute changed":
 52 |                 details = '<em>"' + event.attribute + '"</em> ';
 53 | 
 54 |                 if (event.oldValue === null && event.newValue === "") {
 55 |                     details += ' was added';
 56 |                 } else if (event.newValue === null && event.oldValue === "") {
 57 |                     details += ' was removed';
 58 |                 } else {
 59 |                     details += 'changed from <em>' + formatValue(event.oldValue) + '</em> ' +
 60 |                     'to <em>' + formatValue(event.newValue) + '</em>';
 61 |                 }
 62 | 
 63 |                 break;
 64 |             case "text changed":
 65 |                 details = 'text changed ' +
 66 |                 'from <em>' + formatValue(event.oldValue) + '</em> ' +
 67 |                 'to <em>' + formatValue(event.newValue) + '</em>';
 68 |                 break;
 69 |         }
 70 | 
 71 |         return details;
 72 |     }
 73 | 
 74 |     function EventTable(table) {
 75 |         this._tableHead = table.tHead;
 76 |         this._tableBody = table.tBodies[0];
 77 |         this._counter = this._tableHead.querySelector('.counter');
 78 |         this._targetFilter = (this._tableHead).querySelector('.target-filter');
 79 |         this._count = this._tableBody.children.length;
 80 | 
 81 |         //FILTERS
 82 |         var thead = this._tableHead;
 83 |         var tbody = this._tableBody;
 84 |         var targetFilter = this._targetFilter;
 85 |         var typeFilters = (this._tableHead).querySelectorAll('.type-filters input');
 86 | 
 87 |         function updateTypeFilters() {
 88 |             var nodesAdded = thead.querySelector('.nodes-added input').checked;
 89 |             var nodesRemoved = thead.querySelector('.nodes-removed input').checked;
 90 |             var textChanged = thead.querySelector('.text-changed input').checked;
 91 |             var attributeChanged = thead.querySelector('.attribute-changed input').checked;
 92 | 
 93 |             if (nodesAdded) {
 94 |                 tbody.classList.add('nodes-added-visible');
 95 |             } else {
 96 |                 tbody.classList.remove('nodes-added-visible');
 97 |             }
 98 | 
 99 |             if (nodesRemoved) {
100 |                 tbody.classList.add('nodes-removed-visible');
101 |             } else {
102 |                 tbody.classList.remove('nodes-removed-visible');
103 |             }
104 | 
105 |             if (textChanged) {
106 |                 tbody.classList.add('text-changed-visible');
107 |             } else {
108 |                 tbody.classList.remove('text-changed-visible');
109 |             }
110 | 
111 |             if (attributeChanged) {
112 |                 tbody.classList.add('attribute-changed-visible');
113 |             } else {
114 |                 tbody.classList.remove('attribute-changed-visible');
115 |             }
116 |         }
117 | 
118 |         updateTypeFilters();
119 | 
120 |         function updateTargetFilter() {
121 |             var query = (targetFilter.value).trim();
122 | 
123 |             for (var i = 0, l = tbody.children.length; i < l; i++) {
124 |                 var tr = tbody.children[i];
125 |                 var targetTd = tr.children[1];
126 | 
127 |                 if (!query || targetTd.innerText.indexOf(query) > -1) {
128 |                     tr.classList.add('target-match');
129 |                 } else {
130 |                     tr.classList.remove('target-match');
131 |                 }
132 |             }
133 |         }
134 | 
135 |         updateTargetFilter();
136 | 
137 |         targetFilter.addEventListener('keyup', updateTargetFilter);
138 | 
139 |         for (var i = 0, l = typeFilters.length; i < l; i++) {
140 |             typeFilters[i].addEventListener('change', updateTypeFilters);
141 |         }
142 |     }
143 | 
144 |     EventTable.prototype._updateEventCounter = function () {
145 |         (this._counter).innerText = '(' + this._count + ')';
146 |     };
147 | 
148 |     EventTable.prototype.clear = function () {
149 |         (this._tableBody).innerHTML = '';
150 | 
151 |         this._count = 0;
152 |         this._updateEventCounter();
153 |     };
154 | 
155 |     EventTable.prototype.addEvent = function (event) {
156 |         var tr = (this._tableBody).firstChild;
157 |         var tdAction, tdDetails, tdTarget;
158 | 
159 |         //check if events should be grouped together
160 |         if (tr && parseInt(tr.dataset.targetNodeId, 10) === event.target.nodeId && tr.dataset.eventType === event.type) {
161 |             tdAction = tr.querySelector('td:nth-child(1)');
162 |             tdDetails = tr.querySelector('td:nth-child(3)');
163 | 
164 |             tr.dataset.count = parseInt(tr.dataset.count || "1", 10) + 1;
165 | 
166 |             tdAction.innerHTML = tr.dataset.count + ' x ' + event.type + formatDate(event.date);
167 | 
168 |             tdDetails.querySelector('div').innerHTML += '<hr/>' + formatEventDetails(event);
169 | 
170 |             this._count++;
171 |             return;
172 |         }
173 | 
174 |         tr = document.createElement('tr');
175 |         tdAction = document.createElement('td');
176 |         tdTarget = document.createElement('td');
177 |         tdDetails = document.createElement('td');
178 | 
179 |         tr.dataset.targetNodeId = event.target.nodeId;
180 |         tr.dataset.eventType = event.type;
181 | 
182 |         tr.classList.add(event.type.replace(' ', '-'));
183 | 
184 |         tdAction.innerHTML = event.type + formatDate(event.date);
185 |         tdTarget.innerHTML = '<div>' + formatNode(event.target) + '</div>';
186 |         tdDetails.innerHTML = '<div>' + formatEventDetails(event) + '</div>';
187 | 
188 |         tr.appendChild(tdAction);
189 |         tr.appendChild(tdTarget);
190 |         tr.appendChild(tdDetails);
191 | 
192 |         //check if it matches current query
193 |         var query = ((this._targetFilter).value).trim();
194 |         if (!query || tdTarget.innerText.indexOf(query) > -1) {
195 |             tr.classList.add('target-match');
196 |         }
197 | 
198 |         //insert at the top/beginning
199 |         (this._tableBody).insertBefore(tr, this._tableBody.firstChild);
200 | 
201 |         tr.animate([
202 |             {opacity: 0},
203 |             {opacity: 1}
204 |         ], 300);
205 | 
206 |         this._count++;
207 |         this._updateEventCounter();
208 |     };
209 | 
210 |     window.EventTable = EventTable;
211 | })();
212 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <!-- markdownlint-disable first-line-h1 -->
  2 | <!-- markdownlint-disable html -->
  3 | <!-- markdownlint-disable no-duplicate-header -->
  4 | 
  5 | <div align="center">
  6 |   <img src="chrome-extension/assets/beholder-tool-kit-long.png" width="100%" alt="OTA-tool-kits" style="border-radius: 10px;" />
  7 | </div>
  8 | <br>
  9 | <div align="center" style="line-height: 1;">
 10 |   <a href="https://www.otatech.ai/"><img alt="Homepage"
 11 | 	src="https://img.shields.io/badge/Visit-otatech.ai-blue"/></a>
 12 |   <a href="https://huggingface.co/OTA-AI/OTA-v1"><img alt="Hugging Face"
 13 | 	src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-OTA%20AI-ffc107?color=ffc107&logoColor=white"/></a>
 14 |   <a href="https://github.com/OTA-Tech-AI/webagentprotocol/blob/main/LICENSE"><img alt="Code License"
 15 | 	src="https://img.shields.io/badge/Code_License-MIT-f5de53?&color=f5deff"/></a>
 16 |   <br><br><br>
 17 | </div>
 18 | 
 19 | # Web Agent Protocol
 20 | 
 21 | ## Overview
 22 | 
 23 | The Web Agent Protocol (WAP) is a standardized framework designed to enable seamless interaction between users, web agents, and browsers by recording and replaying browser actions. It separates the concerns of action recording and execution, allowing for efficient automation and reusability. The Python SDK for WAP implements the full specification, making it easy to:
 24 | 
 25 | 1. **Collect** user‑interaction data with the [OTA‑WAP Chrome extension](https://github.com/OTA-Tech-AI/webagentprotocol/tree/main/chrome-extension).
 26 | 2. **Convert** the raw event stream into either **_exact‑replay_** or **_smart‑replay_** action lists.
 27 | 3. **Convert** recorded actions into **_MCP_** servers for reuse by any agent or user
 28 | 4. **Replay** those lists using the **_WAP-Replay_** protocol to ensure accurate browser operations.
 29 | 
 30 | ### WAP FULL DEMO
 31 | 
 32 | [![Watch the video](https://img.youtube.com/vi/joh9FXJfnwk/0.jpg)](https://www.youtube.com/watch?v=joh9FXJfnwk)
 33 | 
 34 | ### Without WAP
 35 | ![image](https://github.com/user-attachments/assets/843ea9da-45c0-48e9-8a25-44f5bfb31786)
 36 | 
 37 | ### WAP Record
 38 | ![image](https://github.com/user-attachments/assets/3d041f56-9e76-4b61-9b56-0686070723a3)
 39 | 
 40 | ### WAP Replay
 41 | ![image](https://github.com/user-attachments/assets/e13ca7c7-3cc1-4952-8a79-3bd1e9e98580)
 42 | 
 43 | ## Example using WAP
 44 | ![image](https://github.com/user-attachments/assets/ccb7387b-0677-498c-b4ad-a10590e37e27)
 45 | 
 46 | ## Setup
 47 | Set up a Python environment and install the dependencies with the following steps:
 48 | 
 49 | Create a conda env
 50 | 
 51 | ```bash
 52 | conda create -n WAP python=3.11
 53 | ```
 54 | 
 55 | Activate the conda env
 56 | 
 57 | ```bash
 58 | conda activate WAP
 59 | ```
 60 | 
 61 | Install the dependencies
 62 | 
 63 | ```bash
 64 | pip install -r requirements.txt
 65 | ```
 66 | 
 67 | Setup your repo source path:
 68 | ```
 69 | set PYTHONPATH=C:/path/to/webagentprotocol # for Windows
 70 | export PYTHONPATH=/path/to/webagentprotocol # for Linux
 71 | ```
 72 | 
 73 | Create **.env** file under the repo root directory with your own API keys:
 74 | ```
 75 | OPENAI_API_KEY=sk-proj-...
 76 | DEEPSEEK_API_KEY=sk-...
 77 | ```
 78 | 
 79 | ## Record
 80 | 
 81 | ### WAP record extension
 82 | Please refer to [OTA‑WAP Chrome Extension](https://github.com/OTA-Tech-AI/webagentprotocol/tree/main/chrome-extension) to setup action capturer in your Chrome browser.
 83 | 
 84 | ### Start data‑collection server
 85 | 
 86 | Run the following command to start the server to collect data from the extension:
 87 | ```bash
 88 | python action_collect_server.py
 89 | ```
 90 | **Once the server is up, you can start to record from the page using WAP Chrome extension.**
 91 | 
 92 | The server listens on http://localhost:4934/action-data by default. Please make sure the Host and Port in the extension settings match this server configuration.
 93 | Each session will be saved to:
 94 | 
 95 | ```bash
 96 | data/YYYYMMDD/taskid/summary_event_<timestamp>.json
 97 | ```
 98 | 
 99 | An example of the formatted data that the WAP backend server receives:
100 | 
101 | ```json
102 | {
103 |   "taskId": "MkCAhQsHgXn7YgaK",
104 |   "type": "click",
105 |   "actionTimestamp": 1746325231479,
106 |   "eventTarget": {
107 |     "type": "click",
108 |     "target": "<a ota-use-interactive-target=\"1\" data-ordinal=\"3\" href=\"https://www.allrecipes.com/recipe/68925/cheesy-baked-salmon/\" data-tax-levels=\"\" data-doc-id=\"6592066\" class=\"comp mntl-card-list-card--extendable mntl-universal-card mntl-document-card mntl-card card card--no-image\" id=\"mntl-card-list-card--extendable_3-0\">\n<div class=\"loc card__top\"><div class=\"card__media mntl-image card__media universal-image__container\">...",
109 |     "targetId": "mntl-card-list-card--extendable_3-0",
110 |     "targetClass": "comp mntl-card-list-card--extendable mntl-universal-card mntl-document-card mntl-card card card--no-image"
111 |   },
112 |   "allEvents": {},
113 |   "pageHTMLContent": "<header data-tracking-container=\"true\" data-collapsible=\"true\" class=\"comp header mntl-header mntl-header--magazine mntl-header--open-search-bar mntl-header--myr\" id=\"header_1-0\"><a data-tracking-container=\"true\" id=\"mntl-skip-to-content_1-0\" class=\"mntl-skip-to-content mntl-text-link\" rel=\"nocaes\" href=\"#main\"></a><div class=\"mntl-header__menu-top\">..."
114 | }
115 | ```
116 | 
117 | 
118 | ## Generate replay lists
119 | 
120 | | Mode                                               | Command                                                                                                                                                                |
121 | | -------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
122 | | **Exact replay** – exactly reproduce every action | `python wap_replay/generate_exact_replay_list.py --data_dir_path data/<date>/<task_id> --output_dir_path data_processed/exact_replay` |
123 | | **Smart replay** – condensed goal‑oriented steps   | `python wap_replay/generate_smart_replay_list.py --data_dir_path data/<date>/<task_id> --output_dir_path data_processed/smart_replay` |
124 | 
125 | Replace **<task_id>** with the folder produced by the extension
126 | (e.g. em3h6UBDZykz0gnH).
127 | 
128 | Output structure:
129 | ```bash
130 | data_processed/smart_replay/
131 |  ├─ subgoals_<task_id>/                     # intermediate prompts & replies
132 |  └─ wap_smart_replay_list_<task_id>.json   # final smart replay list for the agent
133 | 
134 | data_processed/exact_replay/
135 |  └─ wap_exact_replay_list_<task_id>.json   # final exact replay list for the agent
136 | ```
137 | 
138 | ## Replay
139 | ```bash
140 | python run_replay.py --model-provider openai --wap_replay_list data_processed/exact_replay/wap_exact_replay_list_<task_id>.json --max-concurrent 1
141 | ```
142 | For **smart-replay**, replace the path with a smart‑replay JSON to test this mode.
143 | 
144 | ## Convert to MCP Server
145 | 
146 | ```bash
147 | python wap_replay/generate_mcp_server.py --task_id <task_id>
148 | ```
149 | 
150 | The converted MCP servers will be located under the `mcp_servers` folder.
151 | 
152 | ## Replay with MCP
153 | 
154 | You need two terminals to replay with MCP. In the first terminal, run:
155 | ```bash
156 | python wap_service.py
157 | ```
158 | 
159 | In the second terminal, run:
160 | ```bash
161 | python mcp_client.py
162 | ```
163 | 
164 | Then enter your prompt in the second terminal
165 | 
166 | ```bash
167 | example: find a top rated keyboard on amazon.ca using smart replay
168 | ```
169 | 
170 | ## Replay with our Desktop App
171 | 
172 | We provide an out-of-the-box desktop app for running replay lists. It is easy to install and requires no extra setup or deployment steps. Visit [WAP Replay Tool releases](https://github.com/OTA-Tech-AI/web-agent-protocol/releases) for more details.
173 | 
174 | <img src="assets/wap_replay_tool_demo.gif" alt="WAP Replay Tool Demo GIF" width="500"/>
175 | 
176 | 
177 | ## Troubleshooting
178 | 
179 | **ModuleNotFoundError** – run commands from the project root or export PYTHONPATH=. (set PYTHONPATH=. for Windows).
180 | 
181 | “no task‑start file” – ensure the extension recorded a full session;
182 | the generators require exactly one task-start and one task-finish record.
183 | 
184 | ## Acknowledgement
185 | 
186 | Browser-Use: https://github.com/browser-use/browser-use
187 | 
188 | MCP: https://github.com/modelcontextprotocol/python-sdk
189 | 
190 | DOM Extension: https://github.com/kdzwinel/DOMListenerExtension
191 | 


--------------------------------------------------------------------------------
/browser_use/agent/message_manager/tests.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from langchain_anthropic import ChatAnthropic
  3 | from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
  4 | from langchain_openai import AzureChatOpenAI, ChatOpenAI
  5 | 
  6 | from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
  7 | from browser_use.agent.views import ActionResult
  8 | from browser_use.browser.views import BrowserState, TabInfo
  9 | from browser_use.dom.views import DOMElementNode, DOMTextNode
 10 | 
 11 | 
 12 | @pytest.fixture(
 13 | 	params=[
 14 | 		ChatOpenAI(model='gpt-4o-mini'),
 15 | 		AzureChatOpenAI(model='gpt-4o', api_version='2024-02-15-preview'),
 16 | 		ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=100, temperature=0.0, stop=None),
 17 | 	],
 18 | 	ids=['gpt-4o-mini', 'gpt-4o', 'claude-3-5-sonnet'],
 19 | )
 20 | def message_manager(request: pytest.FixtureRequest):
 21 | 	task = 'Test task'
 22 | 	action_descriptions = 'Test actions'
 23 | 	return MessageManager(
 24 | 		task=task,
 25 | 		system_message=SystemMessage(content=action_descriptions),
 26 | 		settings=MessageManagerSettings(
 27 | 			max_input_tokens=1000,
 28 | 			estimated_characters_per_token=3,
 29 | 			image_tokens=800,
 30 | 		),
 31 | 	)
 32 | 
 33 | 
 34 | def test_initial_messages(message_manager: MessageManager):
 35 | 	"""Test that message manager initializes with system and task messages"""
 36 | 	messages = message_manager.get_messages()
 37 | 	assert len(messages) == 2
 38 | 	assert isinstance(messages[0], SystemMessage)
 39 | 	assert isinstance(messages[1], HumanMessage)
 40 | 	assert 'Test task' in messages[1].content
 41 | 
 42 | 
 43 | def test_add_state_message(message_manager: MessageManager):
 44 | 	"""Test adding browser state message"""
 45 | 	state = BrowserState(
 46 | 		url='https://test.com',
 47 | 		title='Test Page',
 48 | 		element_tree=DOMElementNode(
 49 | 			tag_name='div',
 50 | 			attributes={},
 51 | 			children=[],
 52 | 			is_visible=True,
 53 | 			parent=None,
 54 | 			xpath='//div',
 55 | 		),
 56 | 		selector_map={},
 57 | 		tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
 58 | 	)
 59 | 	message_manager.add_state_message(state)
 60 | 
 61 | 	messages = message_manager.get_messages()
 62 | 	assert len(messages) == 3
 63 | 	assert isinstance(messages[2], HumanMessage)
 64 | 	assert 'https://test.com' in messages[2].content
 65 | 
 66 | 
 67 | def test_add_state_with_memory_result(message_manager: MessageManager):
 68 | 	"""Test adding state with result that should be included in memory"""
 69 | 	state = BrowserState(
 70 | 		url='https://test.com',
 71 | 		title='Test Page',
 72 | 		element_tree=DOMElementNode(
 73 | 			tag_name='div',
 74 | 			attributes={},
 75 | 			children=[],
 76 | 			is_visible=True,
 77 | 			parent=None,
 78 | 			xpath='//div',
 79 | 		),
 80 | 		selector_map={},
 81 | 		tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
 82 | 	)
 83 | 	result = ActionResult(extracted_content='Important content', include_in_memory=True)
 84 | 
 85 | 	message_manager.add_state_message(state, [result])
 86 | 	messages = message_manager.get_messages()
 87 | 
 88 | 	# Should have system, task, extracted content, and state messages
 89 | 	assert len(messages) == 4
 90 | 	assert 'Important content' in messages[2].content
 91 | 	assert isinstance(messages[2], HumanMessage)
 92 | 	assert isinstance(messages[3], HumanMessage)
 93 | 	assert 'Important content' not in messages[3].content
 94 | 
 95 | 
 96 | def test_add_state_with_non_memory_result(message_manager: MessageManager):
 97 | 	"""Test adding state with result that should not be included in memory"""
 98 | 	state = BrowserState(
 99 | 		url='https://test.com',
100 | 		title='Test Page',
101 | 		element_tree=DOMElementNode(
102 | 			tag_name='div',
103 | 			attributes={},
104 | 			children=[],
105 | 			is_visible=True,
106 | 			parent=None,
107 | 			xpath='//div',
108 | 		),
109 | 		selector_map={},
110 | 		tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
111 | 	)
112 | 	result = ActionResult(extracted_content='Temporary content', include_in_memory=False)
113 | 
114 | 	message_manager.add_state_message(state, [result])
115 | 	messages = message_manager.get_messages()
116 | 
117 | 	# Should have system, task, and combined state+result message
118 | 	assert len(messages) == 3
119 | 	assert 'Temporary content' in messages[2].content
120 | 	assert isinstance(messages[2], HumanMessage)
121 | 
122 | 
@pytest.mark.skip('not sure how to fix this')
@pytest.mark.parametrize('max_tokens', [100000, 10000, 5000])
def test_token_overflow_handling_with_real_flow(message_manager: MessageManager, max_tokens):
	"""Test handling of token overflow in a realistic message flow.

	Runs up to 200 simulated agent steps. Each step adds a state message (with an
	action result on even steps, flagged include_in_memory on every 4th step),
	fetches the messages, then replaces the state message with a model output.
	After every step the token bookkeeping and the position of the system and
	task messages are verified. The test returns early once the manager raises
	the 'Max token limit reached - history is too long' error.
	"""
	# Kept as function-local imports like the original, but hoisted out of the
	# loop so the import statements are not re-executed on every step.
	from browser_use.agent.views import AgentBrain, AgentOutput
	from browser_use.controller.registry.views import ActionModel

	# Set more realistic token limit
	message_manager.settings.max_input_tokens = max_tokens

	# Create a long sequence of interactions
	for i in range(200):  # Simulate 200 steps of interaction
		# Create state with varying content length
		state = BrowserState(
			url=f'https://test{i}.com',
			title=f'Test Page {i}',
			element_tree=DOMElementNode(
				tag_name='div',
				attributes={},
				children=[
					DOMTextNode(
						text=f'Content {j} ' * (10 + i),  # Increasing content length
						is_visible=True,
						parent=None,
					)
					for j in range(5)  # Multiple DOM items
				],
				is_visible=True,
				parent=None,
				xpath='//div',
			),
			selector_map={j: f'//div[{j}]' for j in range(5)},
			tabs=[TabInfo(page_id=1, url=f'https://test{i}.com', title=f'Test Page {i}')],
		)

		# Alternate between different types of results
		result = None
		if i % 2 == 0:  # Every other iteration
			result = ActionResult(
				extracted_content=f'Important content from step {i}' * 5,
				include_in_memory=i % 4 == 0,  # Include in memory every 4th message
			)

		# Add state message
		if result:
			message_manager.add_state_message(state, [result])
		else:
			message_manager.add_state_message(state)

		try:
			messages = message_manager.get_messages()
		except ValueError as e:
			if 'Max token limit reached - history is too long' in str(e):
				return  # If error occurs, end the test
			raise

		# Verify token limit is respected (a slack of 100 tokens is tolerated)
		assert message_manager.state.history.current_tokens <= message_manager.settings.max_input_tokens + 100

		last_msg = messages[-1]
		assert isinstance(last_msg, HumanMessage)

		if i % 4 == 0:
			# Memory result was added as a separate message right before the state message
			assert isinstance(message_manager.state.history.messages[-2].message, HumanMessage)
		if i % 4 == 2:
			# Even step whose result is not kept in memory: the last message is the state message
			if isinstance(last_msg.content, list):
				assert 'Current url: https://test' in last_msg.content[0]['text']
			else:
				assert 'Current url: https://test' in last_msg.content

		# Add model output every time
		output = AgentOutput(
			current_state=AgentBrain(
				evaluation_previous_goal=f'Success in step {i}',
				memory=f'Memory from step {i}',
				next_goal=f'Goal for step {i + 1}',
			),
			action=[ActionModel()],
		)
		message_manager._remove_last_state_message()
		message_manager.add_model_output(output)

		# Get messages and verify after each addition
		messages = [m.message for m in message_manager.state.history.messages]

		# Verify essential messages are preserved
		assert isinstance(messages[0], SystemMessage)  # System prompt always first
		assert isinstance(messages[1], HumanMessage)  # Task always second
		assert 'Test task' in messages[1].content

		# Verify structure of latest messages
		assert isinstance(messages[-1], AIMessage)  # Last message should be model output
		assert f'step {i}' in messages[-1].content  # Should contain current step info

		# go through all messages and verify that the token count and total tokens is correct
		total_tokens = 0
		real_tokens = []
		stored_tokens = []
		for msg in message_manager.state.history.messages:
			total_tokens += msg.metadata.tokens
			stored_tokens.append(msg.metadata.tokens)
			real_tokens.append(message_manager._count_tokens(msg.message))
		assert total_tokens == sum(real_tokens)
		assert stored_tokens == real_tokens
		assert message_manager.state.history.current_tokens == total_tokens
235 | 
236 | 
237 | # pytest -s browser_use/agent/message_manager/tests.py
238 | 


--------------------------------------------------------------------------------
/browser_use/controller/registry/service.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | from inspect import iscoroutinefunction, signature
  3 | from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar
  4 | 
  5 | from langchain_core.language_models.chat_models import BaseChatModel
  6 | from pydantic import BaseModel, Field, create_model
  7 | 
  8 | from browser_use.browser.context import BrowserContext
  9 | from browser_use.controller.registry.views import (
 10 | 	ActionModel,
 11 | 	ActionRegistry,
 12 | 	RegisteredAction,
 13 | )
 14 | from browser_use.telemetry.service import ProductTelemetry
 15 | from browser_use.telemetry.views import (
 16 | 	ControllerRegisteredFunctionsTelemetryEvent,
 17 | 	RegisteredFunction,
 18 | )
 19 | from browser_use.utils import time_execution_async, time_execution_sync
 20 | 
# Type variable for the caller-supplied context object; Registry is generic over it
# and execute_action accepts it as the optional `context` argument.
Context = TypeVar('Context')
 22 | 
 23 | 
 24 | class Registry(Generic[Context]):
 25 | 	"""Service for registering and managing actions"""
 26 | 
 27 | 	def __init__(self, exclude_actions: list[str] | None = None):
 28 | 		self.registry = ActionRegistry()
 29 | 		self.telemetry = ProductTelemetry()
 30 | 		self.exclude_actions = exclude_actions if exclude_actions is not None else []
 31 | 
 32 | 	@time_execution_sync('--create_param_model')
 33 | 	def _create_param_model(self, function: Callable) -> Type[BaseModel]:
 34 | 		"""Creates a Pydantic model from function signature"""
 35 | 		sig = signature(function)
 36 | 		params = {
 37 | 			name: (param.annotation, ... if param.default == param.empty else param.default)
 38 | 			for name, param in sig.parameters.items()
 39 | 			if name != 'browser' and name != 'page_extraction_llm' and name != 'available_file_paths'
 40 | 		}
 41 | 		# TODO: make the types here work
 42 | 		return create_model(
 43 | 			f'{function.__name__}_parameters',
 44 | 			__base__=ActionModel,
 45 | 			**params,  # type: ignore
 46 | 		)
 47 | 
 48 | 	def action(
 49 | 		self,
 50 | 		description: str,
 51 | 		param_model: Optional[Type[BaseModel]] = None,
 52 | 		domains: Optional[list[str]] = None,
 53 | 		page_filter: Optional[Callable[[Any], bool]] = None,
 54 | 	):
 55 | 		"""Decorator for registering actions"""
 56 | 
 57 | 		def decorator(func: Callable):
 58 | 			# Skip registration if action is in exclude_actions
 59 | 			if func.__name__ in self.exclude_actions:
 60 | 				return func
 61 | 
 62 | 			# Create param model from function if not provided
 63 | 			actual_param_model = param_model or self._create_param_model(func)
 64 | 
 65 | 			# Wrap sync functions to make them async
 66 | 			if not iscoroutinefunction(func):
 67 | 
 68 | 				async def async_wrapper(*args, **kwargs):
 69 | 					return await asyncio.to_thread(func, *args, **kwargs)
 70 | 
 71 | 				# Copy the signature and other metadata from the original function
 72 | 				async_wrapper.__signature__ = signature(func)
 73 | 				async_wrapper.__name__ = func.__name__
 74 | 				async_wrapper.__annotations__ = func.__annotations__
 75 | 				wrapped_func = async_wrapper
 76 | 			else:
 77 | 				wrapped_func = func
 78 | 
 79 | 			action = RegisteredAction(
 80 | 				name=func.__name__,
 81 | 				description=description,
 82 | 				function=wrapped_func,
 83 | 				param_model=actual_param_model,
 84 | 				domains=domains,
 85 | 				page_filter=page_filter,
 86 | 			)
 87 | 			self.registry.actions[func.__name__] = action
 88 | 			return func
 89 | 
 90 | 		return decorator
 91 | 
 92 | 	@time_execution_async('--execute_action')
 93 | 	async def execute_action(
 94 | 		self,
 95 | 		action_name: str,
 96 | 		params: dict,
 97 | 		browser: Optional[BrowserContext] = None,
 98 | 		page_extraction_llm: Optional[BaseChatModel] = None,
 99 | 		sensitive_data: Optional[Dict[str, str]] = None,
100 | 		available_file_paths: Optional[list[str]] = None,
101 | 		#
102 | 		context: Context | None = None,
103 | 	) -> Any:
104 | 		"""Execute a registered action"""
105 | 		if action_name not in self.registry.actions:
106 | 			raise ValueError(f'Action {action_name} not found')
107 | 
108 | 		action = self.registry.actions[action_name]
109 | 		try:
110 | 			# Create the validated Pydantic model
111 | 			validated_params = action.param_model(**params)
112 | 
113 | 			# Check if the first parameter is a Pydantic model
114 | 			sig = signature(action.function)
115 | 			parameters = list(sig.parameters.values())
116 | 			is_pydantic = parameters and issubclass(parameters[0].annotation, BaseModel)
117 | 			parameter_names = [param.name for param in parameters]
118 | 
119 | 			if sensitive_data:
120 | 				validated_params = self._replace_sensitive_data(validated_params, sensitive_data)
121 | 
122 | 			# Check if the action requires browser
123 | 			if 'browser' in parameter_names and not browser:
124 | 				raise ValueError(f'Action {action_name} requires browser but none provided.')
125 | 			if 'page_extraction_llm' in parameter_names and not page_extraction_llm:
126 | 				raise ValueError(f'Action {action_name} requires page_extraction_llm but none provided.')
127 | 			if 'available_file_paths' in parameter_names and not available_file_paths:
128 | 				raise ValueError(f'Action {action_name} requires available_file_paths but none provided.')
129 | 
130 | 			if 'context' in parameter_names and not context:
131 | 				raise ValueError(f'Action {action_name} requires context but none provided.')
132 | 
133 | 			# Prepare arguments based on parameter type
134 | 			extra_args = {}
135 | 			if 'context' in parameter_names:
136 | 				extra_args['context'] = context
137 | 			if 'browser' in parameter_names:
138 | 				extra_args['browser'] = browser
139 | 			if 'page_extraction_llm' in parameter_names:
140 | 				extra_args['page_extraction_llm'] = page_extraction_llm
141 | 			if 'available_file_paths' in parameter_names:
142 | 				extra_args['available_file_paths'] = available_file_paths
143 | 			if action_name == 'input_text' and sensitive_data:
144 | 				extra_args['has_sensitive_data'] = True
145 | 			if is_pydantic:
146 | 				return await action.function(validated_params, **extra_args)
147 | 			return await action.function(**validated_params.model_dump(), **extra_args)
148 | 
149 | 		except Exception as e:
150 | 			raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e
151 | 
152 | 	def _replace_sensitive_data(self, params: BaseModel, sensitive_data: Dict[str, str]) -> BaseModel:
153 | 		"""Replaces the sensitive data in the params"""
154 | 		# if there are any str with <secret>placeholder</secret> in the params, replace them with the actual value from sensitive_data
155 | 
156 | 		import re
157 | 
158 | 		secret_pattern = re.compile(r'<secret>(.*?)</secret>')
159 | 
160 | 		def replace_secrets(value):
161 | 			if isinstance(value, str):
162 | 				matches = secret_pattern.findall(value)
163 | 				for placeholder in matches:
164 | 					if placeholder in sensitive_data:
165 | 						value = value.replace(f'<secret>{placeholder}</secret>', sensitive_data[placeholder])
166 | 				return value
167 | 			elif isinstance(value, dict):
168 | 				return {k: replace_secrets(v) for k, v in value.items()}
169 | 			elif isinstance(value, list):
170 | 				return [replace_secrets(v) for v in value]
171 | 			return value
172 | 
173 | 		for key, value in params.model_dump().items():
174 | 			params.__dict__[key] = replace_secrets(value)
175 | 		return params
176 | 
177 | 	@time_execution_sync('--create_action_model')
178 | 	def create_action_model(self, include_actions: Optional[list[str]] = None, page=None) -> Type[ActionModel]:
179 | 		"""Creates a Pydantic model from registered actions, used by LLM APIs that support tool calling & enforce a schema"""
180 | 
181 | 		# Filter actions based on page if provided:
182 | 		#   if page is None, only include actions with no filters
183 | 		#   if page is provided, only include actions that match the page
184 | 
185 | 		available_actions = {}
186 | 		for name, action in self.registry.actions.items():
187 | 			if include_actions is not None and name not in include_actions:
188 | 				continue
189 | 
190 | 			# If no page provided, only include actions with no filters
191 | 			if page is None:
192 | 				if action.page_filter is None and action.domains is None:
193 | 					available_actions[name] = action
194 | 				continue
195 | 
196 | 			# Check page_filter if present
197 | 			domain_is_allowed = self.registry._match_domains(action.domains, page.url)
198 | 			page_is_allowed = self.registry._match_page_filter(action.page_filter, page)
199 | 
200 | 			# Include action if both filters match (or if either is not present)
201 | 			if domain_is_allowed and page_is_allowed:
202 | 				available_actions[name] = action
203 | 
204 | 		fields = {
205 | 			name: (
206 | 				Optional[action.param_model],
207 | 				Field(default=None, description=action.description),
208 | 			)
209 | 			for name, action in available_actions.items()
210 | 		}
211 | 
212 | 		self.telemetry.capture(
213 | 			ControllerRegisteredFunctionsTelemetryEvent(
214 | 				registered_functions=[
215 | 					RegisteredFunction(name=name, params=action.param_model.model_json_schema())
216 | 					for name, action in available_actions.items()
217 | 				]
218 | 			)
219 | 		)
220 | 
221 | 		return create_model('ActionModel', __base__=ActionModel, **fields)  # type:ignore
222 | 
223 | 	def get_prompt_description(self, page=None) -> str:
224 | 		"""Get a description of all actions for the prompt
225 | 
226 | 		If page is provided, only include actions that are available for that page
227 | 		based on their filter_func
228 | 		"""
229 | 		return self.registry.get_prompt_description(page=page)
230 | 


--------------------------------------------------------------------------------
/chrome-extension/css/normalize.css:
--------------------------------------------------------------------------------
  1 | /*! normalize.css v3.0.2 | MIT License | git.io/normalize */
  2 | 
  3 | /**
  4 |  * 1. Set default font family to sans-serif.
  5 |  * 2. Prevent iOS text size adjust after orientation change, without disabling
  6 |  *    user zoom.
  7 |  */
  8 | 
  9 | html {
 10 |   font-family: sans-serif; /* 1 */
 11 |   -ms-text-size-adjust: 100%; /* 2 */
 12 |   -webkit-text-size-adjust: 100%; /* 2 */
 13 | }
 14 | 
 15 | /**
 16 |  * Remove default margin.
 17 |  */
 18 | 
 19 | body {
 20 |   margin: 0;
 21 | }
 22 | 
 23 | /* HTML5 display definitions
 24 |    ========================================================================== */
 25 | 
 26 | /**
 27 |  * Correct `block` display not defined for any HTML5 element in IE 8/9.
 28 |  * Correct `block` display not defined for `details` or `summary` in IE 10/11
 29 |  * and Firefox.
 30 |  * Correct `block` display not defined for `main` in IE 11.
 31 |  */
 32 | 
 33 | article,
 34 | aside,
 35 | details,
 36 | figcaption,
 37 | figure,
 38 | footer,
 39 | header,
 40 | hgroup,
 41 | main,
 42 | menu,
 43 | nav,
 44 | section,
 45 | summary {
 46 |   display: block;
 47 | }
 48 | 
 49 | /**
 50 |  * 1. Correct `inline-block` display not defined in IE 8/9.
 51 |  * 2. Normalize vertical alignment of `progress` in Chrome, Firefox, and Opera.
 52 |  */
 53 | 
 54 | audio,
 55 | canvas,
 56 | progress,
 57 | video {
 58 |   display: inline-block; /* 1 */
 59 |   vertical-align: baseline; /* 2 */
 60 | }
 61 | 
 62 | /**
 63 |  * Prevent modern browsers from displaying `audio` without controls.
 64 |  * Remove excess height in iOS 5 devices.
 65 |  */
 66 | 
 67 | audio:not([controls]) {
 68 |   display: none;
 69 |   height: 0;
 70 | }
 71 | 
 72 | /**
 73 |  * Address `[hidden]` styling not present in IE 8/9/10.
 74 |  * Hide the `template` element in IE 8/9/11, Safari, and Firefox < 22.
 75 |  */
 76 | 
 77 | [hidden],
 78 | template {
 79 |   display: none;
 80 | }
 81 | 
 82 | /* Links
 83 |    ========================================================================== */
 84 | 
 85 | /**
 86 |  * Remove the gray background color from active links in IE 10.
 87 |  */
 88 | 
 89 | a {
 90 |   background-color: transparent;
 91 | }
 92 | 
 93 | /**
 94 |  * Improve readability when focused and also mouse hovered in all browsers.
 95 |  */
 96 | 
 97 | a:active,
 98 | a:hover {
 99 |   outline: 0;
100 | }
101 | 
102 | /* Text-level semantics
103 |    ========================================================================== */
104 | 
105 | /**
106 |  * Address styling not present in IE 8/9/10/11, Safari, and Chrome.
107 |  */
108 | 
109 | abbr[title] {
110 |   border-bottom: 1px dotted;
111 | }
112 | 
113 | /**
114 |  * Address style set to `bolder` in Firefox 4+, Safari, and Chrome.
115 |  */
116 | 
117 | b,
118 | strong {
119 |   font-weight: bold;
120 | }
121 | 
122 | /**
123 |  * Address styling not present in Safari and Chrome.
124 |  */
125 | 
126 | dfn {
127 |   font-style: italic;
128 | }
129 | 
130 | /**
131 |  * Address variable `h1` font-size and margin within `section` and `article`
132 |  * contexts in Firefox 4+, Safari, and Chrome.
133 |  */
134 | 
135 | h1 {
136 |   font-size: 2em;
137 |   margin: 0.67em 0;
138 | }
139 | 
140 | /**
141 |  * Address styling not present in IE 8/9.
142 |  */
143 | 
144 | mark {
145 |   background: #ff0;
146 |   color: #000;
147 | }
148 | 
149 | /**
150 |  * Address inconsistent and variable font size in all browsers.
151 |  */
152 | 
153 | small {
154 |   font-size: 80%;
155 | }
156 | 
157 | /**
158 |  * Prevent `sub` and `sup` affecting `line-height` in all browsers.
159 |  */
160 | 
161 | sub,
162 | sup {
163 |   font-size: 75%;
164 |   line-height: 0;
165 |   position: relative;
166 |   vertical-align: baseline;
167 | }
168 | 
169 | sup {
170 |   top: -0.5em;
171 | }
172 | 
173 | sub {
174 |   bottom: -0.25em;
175 | }
176 | 
177 | /* Embedded content
178 |    ========================================================================== */
179 | 
180 | /**
181 |  * Remove border when inside `a` element in IE 8/9/10.
182 |  */
183 | 
184 | img {
185 |   border: 0;
186 | }
187 | 
188 | /**
189 |  * Correct overflow not hidden in IE 9/10/11.
190 |  */
191 | 
192 | svg:not(:root) {
193 |   overflow: hidden;
194 | }
195 | 
196 | /* Grouping content
197 |    ========================================================================== */
198 | 
199 | /**
200 |  * Address margin not present in IE 8/9 and Safari.
201 |  */
202 | 
203 | figure {
204 |   margin: 1em 40px;
205 | }
206 | 
207 | /**
208 |  * Address differences between Firefox and other browsers.
209 |  */
210 | 
211 | hr {
212 |   -moz-box-sizing: content-box;
213 |   box-sizing: content-box;
214 |   height: 0;
215 | }
216 | 
217 | /**
218 |  * Contain overflow in all browsers.
219 |  */
220 | 
221 | pre {
222 |   overflow: auto;
223 | }
224 | 
225 | /**
226 |  * Address odd `em`-unit font size rendering in all browsers.
227 |  */
228 | 
229 | code,
230 | kbd,
231 | pre,
232 | samp {
233 |   font-family: monospace, monospace;
234 |   font-size: 1em;
235 | }
236 | 
237 | /* Forms
238 |    ========================================================================== */
239 | 
240 | /**
241 |  * Known limitation: by default, Chrome and Safari on OS X allow very limited
242 |  * styling of `select`, unless a `border` property is set.
243 |  */
244 | 
245 | /**
246 |  * 1. Correct color not being inherited.
247 |  *    Known issue: affects color of disabled elements.
248 |  * 2. Correct font properties not being inherited.
249 |  * 3. Address margins set differently in Firefox 4+, Safari, and Chrome.
250 |  */
251 | 
252 | button,
253 | input,
254 | optgroup,
255 | select,
256 | textarea {
257 |   color: inherit; /* 1 */
258 |   font: inherit; /* 2 */
259 |   margin: 0; /* 3 */
260 | }
261 | 
262 | /**
263 |  * Address `overflow` set to `hidden` in IE 8/9/10/11.
264 |  */
265 | 
266 | button {
267 |   overflow: visible;
268 | }
269 | 
270 | /**
271 |  * Address inconsistent `text-transform` inheritance for `button` and `select`.
272 |  * All other form control elements do not inherit `text-transform` values.
273 |  * Correct `button` style inheritance in Firefox, IE 8/9/10/11, and Opera.
274 |  * Correct `select` style inheritance in Firefox.
275 |  */
276 | 
277 | button,
278 | select {
279 |   text-transform: none;
280 | }
281 | 
282 | /**
283 |  * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio`
284 |  *    and `video` controls.
285 |  * 2. Correct inability to style clickable `input` types in iOS.
286 |  * 3. Improve usability and consistency of cursor style between image-type
287 |  *    `input` and others.
288 |  */
289 | 
290 | button,
291 | html input[type="button"], /* 1 */
292 | input[type="reset"],
293 | input[type="submit"] {
294 |   -webkit-appearance: button; /* 2 */
295 |   cursor: pointer; /* 3 */
296 | }
297 | 
298 | /**
299 |  * Re-set default cursor for disabled elements.
300 |  */
301 | 
302 | button[disabled],
303 | html input[disabled] {
304 |   cursor: default;
305 | }
306 | 
307 | /**
308 |  * Remove inner padding and border in Firefox 4+.
309 |  */
310 | 
311 | button::-moz-focus-inner,
312 | input::-moz-focus-inner {
313 |   border: 0;
314 |   padding: 0;
315 | }
316 | 
317 | /**
318 |  * Address Firefox 4+ setting `line-height` on `input` using `!important` in
319 |  * the UA stylesheet.
320 |  */
321 | 
322 | input {
323 |   line-height: normal;
324 | }
325 | 
326 | /**
327 |  * It's recommended that you don't attempt to style these elements.
328 |  * Firefox's implementation doesn't respect box-sizing, padding, or width.
329 |  *
330 |  * 1. Address box sizing set to `content-box` in IE 8/9/10.
331 |  * 2. Remove excess padding in IE 8/9/10.
332 |  */
333 | 
334 | input[type="checkbox"],
335 | input[type="radio"] {
336 |   box-sizing: border-box; /* 1 */
337 |   padding: 0; /* 2 */
338 | }
339 | 
340 | /**
341 |  * Fix the cursor style for Chrome's increment/decrement buttons. For certain
342 |  * `font-size` values of the `input`, it causes the cursor style of the
343 |  * decrement button to change from `default` to `text`.
344 |  */
345 | 
346 | input[type="number"]::-webkit-inner-spin-button,
347 | input[type="number"]::-webkit-outer-spin-button {
348 |   height: auto;
349 | }
350 | 
351 | /**
352 |  * 1. Address `appearance` set to `searchfield` in Safari and Chrome.
353 |  * 2. Address `box-sizing` set to `border-box` in Safari and Chrome
354 |  *    (include `-moz` to future-proof).
355 |  */
356 | 
357 | input[type="search"] {
358 |   -webkit-appearance: textfield; /* 1 */
359 |   -moz-box-sizing: content-box;
360 |   -webkit-box-sizing: content-box; /* 2 */
361 |   box-sizing: content-box;
362 | }
363 | 
364 | /**
365 |  * Remove inner padding and search cancel button in Safari and Chrome on OS X.
366 |  * Safari (but not Chrome) clips the cancel button when the search input has
367 |  * padding (and `textfield` appearance).
368 |  */
369 | 
370 | input[type="search"]::-webkit-search-cancel-button,
371 | input[type="search"]::-webkit-search-decoration {
372 |   -webkit-appearance: none;
373 | }
374 | 
375 | /**
376 |  * Define consistent border, margin, and padding.
377 |  */
378 | 
379 | fieldset {
380 |   border: 1px solid #c0c0c0;
381 |   margin: 0 2px;
382 |   padding: 0.35em 0.625em 0.75em;
383 | }
384 | 
385 | /**
386 |  * 1. Correct `color` not being inherited in IE 8/9/10/11.
387 |  * 2. Remove padding so people aren't caught out if they zero out fieldsets.
388 |  */
389 | 
390 | legend {
391 |   border: 0; /* 1 */
392 |   padding: 0; /* 2 */
393 | }
394 | 
395 | /**
396 |  * Remove default vertical scrollbar in IE 8/9/10/11.
397 |  */
398 | 
399 | textarea {
400 |   overflow: auto;
401 | }
402 | 
403 | /**
404 |  * Don't inherit the `font-weight` (applied by a rule above).
405 |  * NOTE: the default cannot safely be changed in Chrome and Safari on OS X.
406 |  */
407 | 
408 | optgroup {
409 |   font-weight: bold;
410 | }
411 | 
412 | /* Tables
413 |    ========================================================================== */
414 | 
415 | /**
416 |  * Remove most spacing between table cells.
417 |  */
418 | 
419 | table {
420 |   border-collapse: collapse;
421 |   border-spacing: 0;
422 | }
423 | 
424 | td,
425 | th {
426 |   padding: 0;
427 | }


--------------------------------------------------------------------------------