├── utils ├── __init__.py ├── html_cleaner.py ├── llm.py ├── action_processing.py └── subgoal_generator.py ├── browser_use ├── dom │ ├── __init__.py │ ├── tests │ │ ├── process_dom_test.py │ │ ├── debug_page_structure.py │ │ └── extraction_test.py │ ├── history_tree_processor │ │ ├── view.py │ │ └── service.py │ ├── views.py │ └── service.py ├── agent │ ├── memory │ │ ├── __init__.py │ │ └── service.py │ ├── system_prompt_wap_replay.md │ ├── message_manager │ │ ├── views.py │ │ ├── utils.py │ │ └── tests.py │ ├── system_prompt.md │ ├── tests.py │ └── prompts.py ├── exceptions.py ├── controller │ ├── views_selector.py │ ├── views.py │ └── registry │ │ ├── views.py │ │ └── service.py ├── __init__.py ├── browser │ ├── tests │ │ ├── screenshot_test.py │ │ └── test_clicks.py │ ├── utils │ │ └── screen_resolution.py │ └── views.py ├── telemetry │ ├── views.py │ └── service.py └── logging_config.py ├── chrome-extension ├── .gitignore ├── assets │ ├── pause.gif │ ├── panelUI.png │ ├── settings.gif │ ├── recording.gif │ ├── start-record.gif │ └── beholder-tool-kit-long.png ├── ico │ ├── ota-logo-48.png │ └── ota-logo-128.png ├── other │ └── Raleway.woff2 ├── js │ ├── devtools.js │ ├── ScrollHelper.js │ ├── ContentScriptProxy.js │ ├── specialEventHandler.js │ └── EventTable.js ├── devtools.html ├── .editorconfig ├── manifest.json ├── package.json ├── Gruntfile.js ├── panel.html ├── README.md └── css │ ├── panel.css │ └── normalize.css ├── assets └── wap_replay_tool_demo.gif ├── .gitignore ├── prompts └── subgoal_generation │ ├── task-start.md │ ├── task-finish.md │ ├── submit.md │ ├── go-back-or-forward.md │ └── common.md ├── wap_service.py ├── LICENSE ├── data_samples └── replay_list_samples │ ├── wap_smart_replay_list_y757R6w6y17LVHXl.json │ ├── wap_exact_replay_list_l8vZDGTfw3qu3GBs.json │ ├── wap_exact_replay_list_GqMnZeKFxvePGKGA.json │ └── wap_exact_replay_list_LhTyE4ie0s5a1W6J.json ├── action_collect_server.py ├── mcp_servers └── 
find_top_rated_keyboard_amazon_ca_y757R6w6y17LVHXl_mcp_server.py ├── requirements.txt ├── wap_replay ├── generate_smart_replay_list.py ├── generate_exact_replay_list.py └── generate_mcp_server.py └── README.md /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /browser_use/dom/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chrome-extension/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .idea 3 | wap_browser_action_capturer-*.zip 4 | -------------------------------------------------------------------------------- /assets/wap_replay_tool_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/assets/wap_replay_tool_demo.gif -------------------------------------------------------------------------------- /chrome-extension/assets/pause.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/assets/pause.gif -------------------------------------------------------------------------------- /chrome-extension/assets/panelUI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/assets/panelUI.png -------------------------------------------------------------------------------- /chrome-extension/assets/settings.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OTA-Tech-AI/web-agent-protocol/HEAD/chrome-extension/assets/settings.gif 
// Register the extension's DevTools panel, rendered from panel.html.
// The icon path must point at a file that actually ships with the extension:
// the icons in ico/ are named ota-logo-48.png / ota-logo-128.png (the same
// names manifest.json uses), so the previous "ico/logo_128.png" resolved to
// nothing and the panel was created without its icon.
chrome.devtools.panels.create("OTA Action Capturer", "ico/ota-logo-128.png", "panel.html", function (panel) {});
class LLMException(Exception):
    """Error carrying a status code and a message.

    Both constructor arguments are kept as attributes (``status_code`` and
    ``message``); the exception text is ``'Error <status_code>: <message>'``.
    """

    def __init__(self, status_code, message):
        description = 'Error {}: {}'.format(status_code, message)
        super().__init__(description)
        # Keep the raw parts available so handlers need not parse the text.
        self.status_code = status_code
        self.message = message
Position(BaseModel): 12 | x: int 13 | y: int 14 | -------------------------------------------------------------------------------- /chrome-extension/.editorconfig: -------------------------------------------------------------------------------- 1 | # top-most EditorConfig file 2 | root = true 3 | 4 | # Newline ending every file 5 | [*] 6 | end_of_line = lf 7 | insert_final_newline = true 8 | 9 | # Charset 10 | [*.{js,html,css,md,json}] 11 | charset = utf-8 12 | 13 | # Indentation 14 | [*.{js,html,css,json}] 15 | indent_style = space 16 | indent_size = 4 -------------------------------------------------------------------------------- /prompts/subgoal_generation/task-start.md: -------------------------------------------------------------------------------- 1 | I need your help with an analysis to an action in browser and its related changes. 2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is: 3 | 4 | {{ ultimate_goal }} 5 | 6 | The user just started this task, and when he clicked "task start" button, his current page is at: 7 | 8 | {{ change_events }} 9 | 10 | based on this information, provide a concise and formatted instruction in JSON to make another agent to know which website it needs to go to, e.g.: 11 | {"next_goal": "Open allrecipes.com in a new tab to search for the recipe."} -------------------------------------------------------------------------------- /chrome-extension/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 3, 3 | "name": "WAP Browser Action Capturer", 4 | "version": "1.0", 5 | "description": "A simple tool helping you to collect the interactions with browser for WAP replay.", 6 | "icons": { 7 | "128": "ico/ota-logo-128.png", 8 | "48": "ico/ota-logo-48.png" 9 | }, 10 | "permissions": [ 11 | "activeTab", 12 | "webNavigation", 13 | "scripting", 14 | "storage" 15 | ], 16 | "optional_host_permissions": [ 17 | "*://*/*" 18 | ], 
19 | "background": { 20 | "service_worker": "js/background.js" 21 | }, 22 | "devtools_page": "devtools.html", 23 | "content_security_policy": { 24 | "extension_pages": "script-src 'self'; object-src 'self'" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /chrome-extension/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "WAP-Browser-Action-Capturer", 3 | "version": "1.3.0", 4 | "description": "A simple tool helping you to collect the interactions with browser for WAP replay", 5 | "main": "Gruntfile.js", 6 | "repository": { 7 | "type": "git", 8 | "url": "git://github.com/OTA-Tech-AI/webagentprotocol.git" 9 | }, 10 | "author": "Konrad Dzwinel", 11 | "license": "GPL", 12 | "bugs": { 13 | "url": "https://github.com/OTA-Tech-AI/webagentprotocol/issues" 14 | }, 15 | "homepage": "https://github.com/OTA-Tech-AI/webagentprotocol", 16 | "dependencies": { 17 | "grunt": "^1.0.3", 18 | "grunt-contrib-csslint": "^2.0.0", 19 | "grunt-contrib-jshint": "^1.1.0", 20 | "grunt-contrib-watch": "^1.1.0", 21 | "grunt-zip": "^0.17.1" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /browser_use/__init__.py: -------------------------------------------------------------------------------- 1 | from browser_use.logging_config import setup_logging 2 | 3 | setup_logging() 4 | 5 | from browser_use.agent.prompts import SystemPrompt as SystemPrompt 6 | from browser_use.agent.service import Agent as Agent 7 | from browser_use.agent.views import ActionModel as ActionModel 8 | from browser_use.agent.views import ActionResult as ActionResult 9 | from browser_use.agent.views import AgentHistoryList as AgentHistoryList 10 | from browser_use.browser.browser import Browser as Browser 11 | from browser_use.browser.browser import BrowserConfig as BrowserConfig 12 | from browser_use.browser.context import BrowserContextConfig 13 | from 
@app.route('/replay', methods=['GET'])
async def run_replay_endpoint():
    """Run a replay described by the query string and report the outcome.

    Query parameters:
        concurrent: integer passed to ``run_replay.main`` as its first argument.
        model: model identifier (required, non-empty).
        file_path: path of the replay list to run (required, non-empty).

    Returns:
        A JSON-serialisable dict, optionally paired with an HTTP status:
        400 when a parameter is missing or ``concurrent`` is not an integer,
        500 when the replay itself raises, and the default 200 on success.
    """
    # Get parameters from query string
    raw_iterations = request.args.get('concurrent')
    model = request.args.get('model')
    file_path = request.args.get('file_path')

    # Parse 'concurrent' in isolation. int(None) raises TypeError when the
    # parameter is absent, which used to surface as a 500; and a ValueError
    # raised deep inside the replay must not be reported as a bad parameter.
    try:
        iterations = int(raw_iterations)
    except (TypeError, ValueError):
        return {"status": "error", "message": "Invalid iterations value: must be an integer"}, 400

    # Validate required parameters
    if not model or not file_path:
        return {"status": "error", "message": "Model and file_path are required"}, 400

    try:
        await run_replay.main(iterations, model, file_path)
    except Exception as e:
        # Top-level boundary of the endpoint: turn any replay failure into a
        # structured 500 response instead of an HTML error page.
        return {"status": "error", "message": str(e)}, 500

    return {"status": "success", "message": "Replay executed successfully"}
merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data_samples/replay_list_samples/wap_smart_replay_list_y757R6w6y17LVHXl.json: -------------------------------------------------------------------------------- 1 | { 2 | "ultimate_goal": "find a top rated keyboard on amazon.ca", 3 | "task_id": "y757R6w6y17LVHXl", 4 | "type": "smart_replay", 5 | "subgoal_list": [ 6 | { 7 | "index": 0, 8 | "subgoal": "task starts, go for the next sub-goal" 9 | }, 10 | { 11 | "index": 1, 12 | "subgoal": "Search for 'top rated keyboard' on amazon.ca to find the best options." 13 | }, 14 | { 15 | "index": 2, 16 | "subgoal": "Enter 'keyboard' as the search term in the search input field and press enter key." 17 | }, 18 | { 19 | "index": 3, 20 | "subgoal": "Click on the dropdown labeled 'Sort by:' to change sorting options." 21 | }, 22 | { 23 | "index": 4, 24 | "subgoal": "Click on the option labeled 'Avg. customer review' in the sort dropdown menu." 
25 | }, 26 | { 27 | "index": 5, 28 | "subgoal": "Click on the first product" 29 | }, 30 | { 31 | "index": 6, 32 | "subgoal": "GOAL-NOT-ACHIEVED" 33 | }, 34 | { 35 | "index": 7, 36 | "subgoal": "task done" 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- /browser_use/browser/tests/screenshot_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | 4 | import pytest 5 | 6 | from browser_use.browser.browser import Browser, BrowserConfig 7 | 8 | 9 | async def test_take_full_page_screenshot(): 10 | browser = Browser(config=BrowserConfig(headless=False, disable_security=True)) 11 | try: 12 | async with await browser.new_context() as context: 13 | page = await context.get_current_page() 14 | # Go to a test page 15 | await page.goto('https://example.com') 16 | 17 | await asyncio.sleep(3) 18 | # Take full page screenshot 19 | screenshot_b64 = await context.take_screenshot(full_page=True) 20 | await asyncio.sleep(3) 21 | # Verify screenshot is not empty and is valid base64 22 | assert screenshot_b64 is not None 23 | assert isinstance(screenshot_b64, str) 24 | assert len(screenshot_b64) > 0 25 | 26 | # Test we can decode the base64 string 27 | try: 28 | base64.b64decode(screenshot_b64) 29 | except Exception as e: 30 | pytest.fail(f'Failed to decode base64 screenshot: {str(e)}') 31 | finally: 32 | await browser.close() 33 | 34 | 35 | if __name__ == '__main__': 36 | asyncio.run(test_take_full_page_screenshot()) 37 | -------------------------------------------------------------------------------- /chrome-extension/Gruntfile.js: -------------------------------------------------------------------------------- 1 | module.exports = function (grunt) { 2 | "use strict"; 3 | 4 | grunt.initConfig({ 5 | pkg: grunt.file.readJSON('package.json'), 6 | jshint: { 7 | files: ['Gruntfile.js', 'js/**/*.js'], 8 | options: { 9 | esversion: 6, 10 | evil: true, 11 | 
camelcase: true, 12 | curly: true, 13 | eqeqeq: true, 14 | noempty: true, 15 | strict: true, 16 | loopfunc: true, 17 | globals: { 18 | console: true, 19 | document: true 20 | } 21 | } 22 | }, 23 | csslint: { 24 | src: ['css/*.css'], 25 | options: { 26 | ids: false, 27 | 'compatible-vendor-prefixes': false, 28 | 'fallback-colors': false 29 | } 30 | }, 31 | zip: { 32 | 'wap_browser_action_capturer-<%= pkg.version %>.zip': ['css/**/*', 'ico/logo_*.png', 'js/**/*', 'other/**/*', '*.html', 'manifest.json'] 33 | } 34 | }); 35 | 36 | grunt.loadNpmTasks('grunt-contrib-jshint'); 37 | grunt.loadNpmTasks('grunt-contrib-csslint'); 38 | grunt.loadNpmTasks('grunt-zip'); 39 | 40 | grunt.registerTask('default', ['jshint']); 41 | grunt.registerTask('prod', ['zip']); 42 | }; 43 | -------------------------------------------------------------------------------- /prompts/subgoal_generation/task-finish.md: -------------------------------------------------------------------------------- 1 | I need your help with an analysis to an action in browser and its related changes. 2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is: 3 | 4 | {{ ultimate_goal }} 5 | 6 | now the user has already finished the task, and the final page content that he submitted to complete task is: 7 | 8 | {{ page_content }} 9 | 10 | based on this content, please tell me: do you think this task is really finished? 11 | Provide a concise and formatted instruction in JSON to make another agent to know what to do, you have several options: 12 | 13 | 1. if the ultimate goal has been achieved by the current action and no more other actions need to be executed or no any information needs to be delivered to the user, only reply a 'done' message, e.g.: {"next_goal": "The ultimate task is done"} 14 | 15 | 2. 
if the ultimate goal has been achieved but we need to extract information from the current page content to respond to the user's demands, reply a content extraction message, e.g.: {"next_goal": "extract the cook time and prepare time from the page content"} 16 | 17 | 3. if the ultimate goal has NOT been achieved, please reply a failure message, e.g.: {"next_goal": "GOAL-NOT-ACHIEVED", "reason": "the cook time is longer than expected ..."} -------------------------------------------------------------------------------- /chrome-extension/js/ScrollHelper.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | "use strict"; 3 | 4 | function ScrollHelper(button) { 5 | var scrollHelper = this; 6 | 7 | this._button = button; 8 | 9 | var scrollPos = 0; 10 | document.addEventListener('scroll', function () { 11 | scrollPos = document.body.scrollTop; 12 | }); 13 | 14 | function updateBtn() { 15 | if (scrollPos > 0) { 16 | scrollHelper.showButton(); 17 | } else { 18 | scrollHelper.hideButton(); 19 | } 20 | requestAnimationFrame(updateBtn); 21 | } 22 | 23 | updateBtn(); 24 | } 25 | 26 | ScrollHelper.prototype.hideButton = function () { 27 | this._button.classList.add('hidden'); 28 | }; 29 | 30 | ScrollHelper.prototype.showButton = function () { 31 | this._button.classList.remove('hidden'); 32 | }; 33 | 34 | ScrollHelper.prototype.scrollToTheTop = function () { 35 | var scrollPos = document.body.scrollTop; 36 | 37 | if (scrollPos > 0) { 38 | document.body.scrollTop -= (scrollPos > 10) ?
async def test_process_dom():
    """Manually exercise ``buildDomTree.js`` against a live page.

    Opens a visible (non-headless) browser, navigates to a flights page,
    evaluates ``browser_use/dom/buildDomTree.js`` in it, prints how long the
    evaluation took, writes the result to ``./tmp/dom.json`` and then waits
    for Enter so the page can be inspected by hand.
    """
    # Function-local so this block does not depend on the module's import list.
    import asyncio

    browser = Browser(config=BrowserConfig(headless=False))

    try:
        async with await browser.new_context() as context:
            page = await context.get_current_page()
            await page.goto('https://kayak.com/flights')
            # Other pages that have been used with this test:
            #   https://google.com/flights
            #   https://immobilienscout24.de
            #   https://seleniumbase.io/w3schools/iframes

            # Let the page settle. This must be an awaited sleep: a blocking
            # time.sleep() inside a coroutine freezes the event loop, so the
            # async browser driver cannot make progress during the wait.
            await asyncio.sleep(3)

            with open('browser_use/dom/buildDomTree.js', 'r') as f:
                js_code = f.read()

            # perf_counter() is monotonic, so the measured duration cannot be
            # skewed by wall-clock adjustments.
            start = time.perf_counter()
            dom_tree = await page.evaluate(js_code)
            end = time.perf_counter()

            print(f'Time: {end - start:.2f}s')

            os.makedirs('./tmp', exist_ok=True)
            with open('./tmp/dom.json', 'w') as f:
                json.dump(dom_tree, f, indent=1)

            # Keep the browser open for manual inspection.
            input('Press Enter to continue...')
    finally:
        # Always release the browser, even when navigation or evaluation fails.
        await browser.close()
def mkdir_n_define_file_name(data_root_dir, task_name):
    """Create the per-task output folder and return a new event-file path in it.

    The folder is ``<data_root_dir>/<YYYYMMDD>/<task_name>`` and the file name
    is ``summary_event_<YYYYMMDD_HHMMSS>.json``, both derived from one reading
    of the current local time. The file itself is not created.

    Args:
        data_root_dir: Root directory under which data is collected.
        task_name: Name of the task sub-folder. Must be a single path
            component.

    Returns:
        The path of the JSON file to write.

    Raises:
        ValueError: If ``task_name`` would resolve outside the dated folder
            (``.``, ``..``, an absolute path, or anything containing a path
            separator).
    """
    # In this file task_name is taken straight from the request JSON, and
    # os.path.join() happily follows '..' or restarts at an absolute component,
    # so refuse anything that is not one plain directory name.
    if (
        task_name in (".", "..")
        or "\\" in task_name
        or os.path.basename(task_name) != task_name
    ):
        raise ValueError(f"Invalid task name: {task_name!r}")

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    date_folder = timestamp.split('_')[0]
    # Include task_name after the date folder
    folderpath = os.path.join(data_root_dir, date_folder, task_name)
    # exist_ok avoids the check-then-create race when two requests for the same
    # task arrive at once.
    os.makedirs(folderpath, exist_ok=True)
    # NOTE(review): the timestamp has one-second resolution, so two events for
    # the same task within a second get the same file name — confirm whether
    # consumers depend on this name format before making it finer-grained.
    filename = f"summary_event_{timestamp}.json"
    return os.path.join(folderpath, filename)
You only need to give me a final reply which is a concise and formatted instruction in JSON to make another agent to understand this sub-goal and reproduce the action. 17 | Do not mention "go back" or "go forward" because it is unclear. Tell which URL it should navigate to. e.g.: 18 | {"next_goal": "Navigate to https://www.allrecipes.com/search?q=baked+salmon to review search results for 'baked salmon' recipes on Allrecipes."} 19 | {"next_goal": "Navigate to google.com to search for keywords spanish restaurants."} -------------------------------------------------------------------------------- /prompts/subgoal_generation/common.md: -------------------------------------------------------------------------------- 1 | I need your help with an analysis to an action in browser and its related changes. 2 | We recorded a user's interactions with his browser for the current task. His ultimate goal is: 3 | 4 | {{ ultimate_goal }} 5 | 6 | So here is the basic information of the action for the current sub-task that a user takes in browser: 7 | 8 | {{ action }} 9 | 10 | note that "target" is the targeted element for this action. 11 | This is what happened in the browser DOM before and after this action: 12 | 13 | {{ change_events }} 14 | 15 | note that in some "nodeinfo", #rme means there are more children inside this tag pair, but we hide them to shorten the context. 16 | You should think about what is the purpose of this action by the user, and think about what is the goal this user is trying to achieve in the current sub-task. You don't need to tell me your thought process. You only need to give me a final reply which is a concise and formatted instruction in JSON to make another agent to understand this sub-goal and reproduce the action. Subgoal should be generalized and fit the ultimate goal. Only include one action in the subgoal, do not explain action. Only include one action (verb) in the subgoal! If you want to use "and", only keep the first action.
async def _request_replay(file_path: str) -> str:
    """Ask the local replay service to run ``file_path`` and return its reply.

    Sends ``GET http://localhost:3089/replay`` with ``concurrent=1`` and
    ``model="openai"`` and a 600-second timeout.

    Returns:
        The response body as text, or ``"FAILED"`` when the request could not
        be completed (connection error, timeout, or other HTTPX error).
    """
    try:
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.get(
                "http://localhost:3089/replay",
                params={
                    "concurrent": 1,
                    "model": "openai",
                    "file_path": file_path,
                },
            )
    except httpx.HTTPError:
        # The original code had `return "FAILED"` after an unconditional
        # `return response.text`, so it could never run. This is the failure
        # path that fallback was evidently meant to cover.
        return "FAILED"
    return response.text


@mcp.tool()
async def find_top_rated_keyboard_amazon_ca_smart_replay() -> str:
    """smart replay: find a top rated keyboard on amazon.ca"""
    return await _request_replay(
        'data_processed/smart_replay/wap_smart_replay_list_y757R6w6y17LVHXl.json'
    )


@mcp.tool()
async def find_top_rated_keyboard_amazon_ca_exact_replay() -> str:
    """exact replay: find a top rated keyboard on amazon.ca"""
    return await _request_replay(
        'data_processed/exact_replay/wap_exact_replay_list_y757R6w6y17LVHXl.json'
    )
def get_window_adjustments():
    """Return the recommended ``(x, y)`` window-position offsets for this platform."""
    offsets_by_platform = {
        'darwin': (-4, 24),  # macOS has a small title bar, no border
        'win32': (-8, 0),  # Windows has a border on the left
    }
    # Every other platform (Linux) needs no adjustment.
    return offsets_by_platform.get(sys.platform, (0, 0))
inspectNode: function (nodeId) { 23 | callCommand('inspect(domListenerExtension.getNode(' + nodeId + '))'); 24 | }, 25 | highlightNode: function (nodeId) { 26 | callCommand('domListenerExtension.highlightNode(' + nodeId + ')'); 27 | }, 28 | startRecording: function (desc) { 29 | callCommand(`domListenerExtension.startTaskRecording(${jsArg(desc)})`); 30 | }, 31 | pauseRecording: function () { 32 | callCommand('domListenerExtension.pauseTaskRecording()'); 33 | }, 34 | resumeRecording: function (desc) { 35 | callCommand(`domListenerExtension.resumeTaskRecording(${jsArg(desc)})`); 36 | }, 37 | finishRecording: function () { 38 | callCommand('domListenerExtension.finishTaskRecording()'); 39 | } 40 | }; 41 | })(); 42 | -------------------------------------------------------------------------------- /browser_use/telemetry/views.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import asdict, dataclass 3 | from typing import Any, Dict, Sequence 4 | 5 | 6 | @dataclass 7 | class BaseTelemetryEvent(ABC): 8 | @property 9 | @abstractmethod 10 | def name(self) -> str: 11 | pass 12 | 13 | @property 14 | def properties(self) -> Dict[str, Any]: 15 | return {k: v for k, v in asdict(self).items() if k != 'name'} 16 | 17 | 18 | @dataclass 19 | class RegisteredFunction: 20 | name: str 21 | params: dict[str, Any] 22 | 23 | 24 | @dataclass 25 | class ControllerRegisteredFunctionsTelemetryEvent(BaseTelemetryEvent): 26 | registered_functions: list[RegisteredFunction] 27 | name: str = 'controller_registered_functions' 28 | 29 | 30 | @dataclass 31 | class AgentStepTelemetryEvent(BaseTelemetryEvent): 32 | agent_id: str 33 | step: int 34 | step_error: list[str] 35 | consecutive_failures: int 36 | actions: list[dict] 37 | name: str = 'agent_step' 38 | 39 | 40 | @dataclass 41 | class AgentRunTelemetryEvent(BaseTelemetryEvent): 42 | agent_id: str 43 | use_vision: bool 44 | task: str 45 | model_name: 
str 46 | chat_model_library: str 47 | version: str 48 | source: str 49 | name: str = 'agent_run' 50 | 51 | 52 | @dataclass 53 | class AgentEndTelemetryEvent(BaseTelemetryEvent): 54 | agent_id: str 55 | steps: int 56 | max_steps_reached: bool 57 | is_done: bool 58 | success: bool | None 59 | total_input_tokens: int 60 | total_duration_seconds: float 61 | 62 | errors: Sequence[str | None] 63 | name: str = 'agent_end' 64 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==1.6.0 2 | annotated-types==0.7.0 3 | anthropic==0.49.0 4 | anyio==4.9.0 5 | backoff==2.2.1 6 | beautifulsoup4==4.13.3 7 | certifi==2025.1.31 8 | charset-normalizer==3.4.1 9 | colorama==0.4.6 10 | defusedxml==0.7.1 11 | distro==1.9.0 12 | filelock==3.18.0 13 | fsspec==2025.3.2 14 | greenlet==3.1.1 15 | h11==0.14.0 16 | httpcore==1.0.7 17 | httpx==0.28.1 18 | huggingface-hub==0.30.1 19 | idna==3.10 20 | Jinja2==3.1.6 21 | jiter==0.9.0 22 | joblib==1.4.2 23 | jsonpatch==1.33 24 | jsonpointer==3.0.0 25 | langchain-core>=0.3.58,<0.4.0 26 | langchain-anthropic==0.3.3 27 | langchain-ollama==0.2.2 28 | langchain-openai==0.3.1 29 | langsmith==0.3.24 30 | markdownify==0.14.1 31 | MarkupSafe==3.0.2 32 | monotonic==1.6 33 | mpmath==1.3.0 34 | networkx==3.4.2 35 | numpy==2.2.4 36 | ollama==0.4.7 37 | openai==1.70.0 38 | orjson==3.10.16 39 | packaging==24.2 40 | pillow==11.1.0 41 | playwright==1.51.0 42 | posthog==3.23.0 43 | psutil==7.0.0 44 | pydantic==2.11.2 45 | pydantic_core==2.33.1 46 | pyee==12.1.1 47 | python-dateutil==2.9.0.post0 48 | python-dotenv==1.1.0 49 | PyYAML==6.0.2 50 | regex==2024.11.6 51 | requests==2.32.3 52 | requests-toolbelt==1.0.0 53 | safetensors==0.5.3 54 | scikit-learn==1.6.1 55 | scipy==1.15.2 56 | sentence-transformers==4.0.2 57 | six==1.17.0 58 | sniffio==1.3.1 59 | soupsieve==2.6 60 | sympy==1.13.1 61 | tenacity==9.1.2 62 | 
@dataclass
class BrowserStateHistory:
    """Browser state record that can be converted to a plain dict via ``to_dict``."""

    url: str
    title: str
    tabs: list[TabInfo]
    interacted_element: list[DOMHistoryElement | None] | list[None]
    screenshot: Optional[str] = None

    def to_dict(self) -> dict[str, Any]:
        """Return this record as a dict.

        Tabs are dumped with ``model_dump()``; each interacted element is
        converted with its own ``to_dict()``, and falsy entries become ``None``.
        """
        dumped_tabs = [tab_info.model_dump() for tab_info in self.tabs]
        dumped_elements = [
            element.to_dict() if element else None
            for element in self.interacted_element
        ]
        # Keys are emitted in this fixed order.
        return {
            'tabs': dumped_tabs,
            'screenshot': self.screenshot,
            'interacted_element': dumped_elements,
            'url': self.url,
            'title': self.title,
        }
@dataclass
class DOMHistoryElement:
    """Record of a DOM element that can be converted to a plain dict via ``to_dict``."""

    tag_name: str
    xpath: str
    highlight_index: Optional[int]
    entire_parent_branch_path: list[str]
    attributes: dict[str, str]
    shadow_root: bool = False
    css_selector: Optional[str] = None
    page_coordinates: Optional[CoordinateSet] = None
    viewport_coordinates: Optional[CoordinateSet] = None
    viewport_info: Optional[ViewportInfo] = None

    def to_dict(self) -> dict:
        """Return the element as a dict; the three model fields are dumped or ``None``."""

        def dump_or_none(model):
            # Unset (falsy) models serialise as None rather than an empty dict.
            return model.model_dump() if model else None

        return {
            'tag_name': self.tag_name,
            'xpath': self.xpath,
            'highlight_index': self.highlight_index,
            'entire_parent_branch_path': self.entire_parent_branch_path,
            'attributes': self.attributes,
            'shadow_root': self.shadow_root,
            'css_selector': self.css_selector,
            'page_coordinates': dump_or_none(self.page_coordinates),
            'viewport_coordinates': dump_or_none(self.viewport_coordinates),
            'viewport_info': dump_or_none(self.viewport_info),
        }
def _build_llm(model: str = "gpt-4o", temperature: float = 0) -> ChatOpenAI:  # type: ignore
    """Create a LangChain ChatOpenAI client.

    Parameters
    ----------
    model : str
        OpenAI model name (default: "gpt-4o").
    temperature : float
        Sampling temperature (default: 0).

    Returns
    -------
    ChatOpenAI
        Client configured with the given model and temperature.

    Raises
    ------
    RuntimeError
        If the OPENAI_API_KEY environment variable is not set.
    """
    # Fail early with a clear message instead of a later authentication error.
    if "OPENAI_API_KEY" not in os.environ:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set.")

    return ChatOpenAI(model_name=model, temperature=temperature)


def ask_llm(prompt: str,
            system_prompt: Optional[str] = None,
            model: str = "gpt-4o",
            temperature: float = 0) -> str:
    """Send *prompt* to OpenAI and return the assistant text.

    Parameters
    ----------
    prompt : str
        User prompt / question.
    system_prompt : str | None
        Optional system message to steer model behaviour; skipped when
        empty or None.
    model : str
        OpenAI model name (default: "gpt-4o").
    temperature : float
        Sampling temperature (default: 0).

    Returns
    -------
    str
        Assistant's reply with surrounding whitespace stripped.

    Raises
    ------
    RuntimeError
        If OPENAI_API_KEY is not set, or the model returns something other
        than an AIMessage.
    """
    llm = _build_llm(model=model, temperature=temperature)

    messages = []
    if system_prompt:
        messages.append(SystemMessage(content=system_prompt))
    messages.append(HumanMessage(content=prompt))

    # Use the Runnable API; calling the model object directly is deprecated.
    response = llm.invoke(messages)  # -> AIMessage

    if not isinstance(response, AIMessage):
        raise RuntimeError("Unexpected response type from LLM")

    return response.content.strip()
{ 61 | "selector": "img[alt=\"Filet of salmon topped with melted cheese on aluminum foil\"]", 62 | "timeout": 5000 63 | } 64 | }, 65 | { 66 | "action": "click_element_by_selector", 67 | "action_params": { 68 | "css_selector": "img[alt=\"Filet of salmon topped with melted cheese on aluminum foil\"]" 69 | } 70 | }, 71 | { 72 | "action": "extract_content", 73 | "action_params": { 74 | "goal": "search for a recipe of baked salmon which takes less than 1 hour to cook", 75 | "should_strip_link_urls": false 76 | } 77 | }, 78 | { 79 | "action": "done", 80 | "action_params": { 81 | "text": "task executed successfully", 82 | "success": true 83 | } 84 | } 85 | ] 86 | } -------------------------------------------------------------------------------- /data_samples/replay_list_samples/wap_exact_replay_list_LhTyE4ie0s5a1W6J.json: -------------------------------------------------------------------------------- 1 | { 2 | "ultimate_goal": "search for the best sold keyboard on Amazon", 3 | "task_id": "LhTyE4ie0s5a1W6J", 4 | "type": "exact_replay", 5 | "action_list": [ 6 | { 7 | "action": "open_tab", 8 | "action_params": { 9 | "url": "https://www.amazon.ca/" 10 | } 11 | }, 12 | { 13 | "action": "wait_for_element", 14 | "action_params": { 15 | "selector": "#searchDropdownBox", 16 | "timeout": 5000 17 | } 18 | }, 19 | { 20 | "action": "select_option_by_selector", 21 | "action_params": { 22 | "css_selector": "#searchDropdownBox", 23 | "value": "search-alias=aps" 24 | } 25 | }, 26 | { 27 | "action": "wait_for_element", 28 | "action_params": { 29 | "selector": "#twotabsearchtextbox", 30 | "timeout": 5000 31 | } 32 | }, 33 | { 34 | "action": "input_text_by_selector", 35 | "action_params": { 36 | "selector": "#twotabsearchtextbox", 37 | "text": "keyboard" 38 | } 39 | }, 40 | { 41 | "action": "send_keys", 42 | "action_params": { 43 | "keys": "Enter" 44 | } 45 | }, 46 | { 47 | "action": "wait_for_element", 48 | "action_params": { 49 | "selector": ".a-dropdown-prompt", 50 | "timeout": 5000 51 | 
class ElementTreeSerializer:
    """Converts a DOM element tree into nested, JSON-serialisable dicts."""

    @staticmethod
    def dom_element_node_to_json(element_tree: DOMElementNode) -> dict:
        """Return *element_tree* as a nested dict.

        Text nodes become ``{'type': 'text', 'text': ...}``; element nodes
        become a dict with their tag name, attributes, highlight index and
        recursively converted children. Any other node type yields ``{}``.
        """

        def serialize(node: DOMBaseNode) -> dict:
            if isinstance(node, DOMTextNode):
                return {'type': 'text', 'text': node.text}
            if not isinstance(node, DOMElementNode):
                # Unknown node kinds are represented by an empty dict.
                return {}
            return {
                'type': 'element',
                'tag_name': node.tag_name,
                'attributes': node.attributes,
                'highlight_index': node.highlight_index,
                'children': list(map(serialize, node.children)),
            }

        return serialize(element_tree)
@singleton
class ProductTelemetry:
    """
    Service for capturing anonymized telemetry data.

    Telemetry is currently switched off through the ``TELEMETRY_DISABLED``
    class flag; the ``ANONYMIZED_TELEMETRY`` environment variable is NOT
    consulted. While the flag is set, no PostHog client is created and
    ``capture`` does nothing.
    """

    USER_ID_PATH = str(Path.home() / '.cache' / 'browser_use' / 'telemetry_user_id')
    PROJECT_API_KEY = 'phc_F8JMNjW1i2KbGUTaW1unnDdLSPCoyc52SGRU0JecaUh'
    HOST = 'https://eu.i.posthog.com'
    UNKNOWN_USER_ID = 'UNKNOWN'

    # NOTE(review): hard-disabling looks deliberate — confirm. To make it
    # configurable again, compute the flag in __init__ from
    # os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false'.
    TELEMETRY_DISABLED = True

    _curr_user_id = None

    def __init__(self) -> None:
        """Create the PostHog client unless telemetry is disabled."""
        telemetry_disabled = self.TELEMETRY_DISABLED
        self.debug_logging = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() == 'debug'

        if telemetry_disabled:
            self._posthog_client = None
        else:
            logger.info(
                'Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.'
            )
            self._posthog_client = Posthog(
                project_api_key=self.PROJECT_API_KEY,
                host=self.HOST,
                disable_geoip=False,
            )

            # Silence posthog's logging
            if not self.debug_logging:
                posthog_logger = logging.getLogger('posthog')
                posthog_logger.disabled = True

        if self._posthog_client is None:
            logger.debug('Telemetry disabled')

    def capture(self, event: BaseTelemetryEvent) -> None:
        """Send *event* to PostHog; a no-op when telemetry is disabled."""
        if self._posthog_client is None:
            return

        if self.debug_logging:
            logger.debug(f'Telemetry event: {event.name} {event.properties}')
        self._direct_capture(event)

    def _direct_capture(self, event: BaseTelemetryEvent) -> None:
        """
        Should not be thread blocking because posthog magically handles it
        """
        if self._posthog_client is None:
            return

        try:
            self._posthog_client.capture(
                self.user_id,
                event.name,
                {**event.properties, **POSTHOG_EVENT_SETTINGS},
            )
        except Exception as e:
            # Telemetry must never break the caller; log and carry on.
            logger.error(f'Failed to send telemetry event {event.name}: {e}')

    @property
    def user_id(self) -> str:
        """Return the telemetry user id, creating and caching it on first use.

        The id is read from ``USER_ID_PATH``; if that file does not exist a new
        UUID4 is generated and written there. On any failure the id falls back
        to ``UNKNOWN_USER_ID``.
        """
        if self._curr_user_id:
            return self._curr_user_id

        # File access may fail due to permissions or other reasons. We don't want to
        # crash so we catch all exceptions.
        try:
            if not os.path.exists(self.USER_ID_PATH):
                os.makedirs(os.path.dirname(self.USER_ID_PATH), exist_ok=True)
                with open(self.USER_ID_PATH, 'w') as f:
                    new_user_id = str(uuid.uuid4())
                    f.write(new_user_id)
                self._curr_user_id = new_user_id
            else:
                with open(self.USER_ID_PATH, 'r') as f:
                    self._curr_user_id = f.read()
        except Exception:
            # Use the declared constant, not a look-alike string literal.
            self._curr_user_id = self.UNKNOWN_USER_ID
        return self._curr_user_id
def subgoal_llm_generation(folder, jsonl_name):
    """Run sub-goal generation over *folder* and save the output to *jsonl_name*.

    Delegates to ``generate_subgoals_from_dir`` with a fixed system prompt,
    model "gpt-4o" and temperature 0. The helper's return value is not used;
    *jsonl_name* is passed as its ``save_jsonl`` argument.

    Parameters
    ----------
    folder
        Directory handed to ``generate_subgoals_from_dir``.
    jsonl_name
        Value passed as ``save_jsonl`` (presumably the output JSONL path;
        verify against the helper).
    """
    generate_subgoals_from_dir(
        folder,
        # NOTE(review): "fot" looks like a typo for "for"; left unchanged
        # because this text is part of the prompt sent to the model.
        system_prompt="You are a concise sub-goal assistant fot analysis of actions in browser.",
        model="gpt-4o",
        temperature=0,
        save_jsonl=jsonl_name,
    )
(function () {
  /******************************************************************
   * specialEventHandler.js
   * --------------------------------------------------------------
   * Registers and runs domain-specific listeners that the generic
   * DOMListener cannot reliably cover.
   *
   * Usage
   * --------------------------------------------------------------
   * This is a plain (non-module) script that publishes
   * `window.SpecialEvents`. Once it is loaded, call init() once with
   * the helpers the handlers need:
   *
   *   window.SpecialEvents.init({
   *     trimTarget, getEventHash, getCurrentHTMLSanitized, taskId
   *   });
   ******************************************************************/

  /* ---------- simple registry ------------------------------------------------ */
  const _handlers = [];

  /** Attribute set on the element while it is being trimmed. */
  const TARGET_MARK = 'ota-use-interactive-target';

  /** Register a new handler.
   *  @param {RegExp}   hostPattern – tested against location.hostname
   *  @param {Function} initFn      – called with the init() deps if the pattern matches */
  function register(hostPattern, initFn) {
    _handlers.push({hostPattern, initFn});
  }

  /* ---------- H A N D L E R S --------------------------------------------- */
  register(/(^|\.)google\.[a-z.]+$/, ({
    trimTarget,
    getEventHash,
    getCurrentHTMLSanitized,
    taskId
  }) => {

    const BOX = 'textarea[name="q"][role="combobox"]';
    const BTN = 'button[aria-label="Search"][type="submit"]';

    /** Build a 'submit' summary event and send it to the background.
     *  @param {string}  value    – the user query text
     *  @param {Element} originEl – element that triggered the submit */
    function report(value, originEl) {
      const evHash = getEventHash();

      // Mark the element while trimming, and always remove the mark again,
      // even if trimTarget throws.
      let trimmedTarget;
      originEl.setAttribute(TARGET_MARK, '1');
      try {
        trimmedTarget = trimTarget(originEl);
      } finally {
        originEl.removeAttribute(TARGET_MARK);
      }

      const actionTarget = {
        type       : 'submit',
        target     : trimmedTarget,
        targetId   : originEl.id,
        targetClass: originEl.className,
        value      : value
      };

      const summaryEvent = {
        taskId         : taskId,
        eventHash      : evHash,
        type           : 'submit',
        actionTimestamp: Date.now(),
        eventTarget    : actionTarget,
        allEvents      : {},                  // nothing to diff for a submit
        pageHTMLContent: getCurrentHTMLSanitized()
      };

      /* ---- ship it to the background ------------------------------------ */
      chrome.runtime.sendMessage({
        type : 'submit',
        summaryEvent
      });
    }

    /* enter key (without Shift) inside the search box */
    document.addEventListener('keydown', e => {
      if (e.key === 'Enter' && !e.shiftKey && e.target.matches(BOX)) {
        report(e.target.value, e.target);
      }
    }, /*capture*/ true);

    /* Search button: report the current text of the search box */
    document.addEventListener('click', e => {
      const btn = e.target.closest(BTN);
      if (!btn) return;
      const box = document.querySelector(BOX);
      if (box) report(box.value, btn);
    }, /*capture*/ true);

    console.debug('[specialHandler] Google search attached');
  });

  /* -------------------------------------------------------------------------- */
  /** Call once from DOMListener. Attaches every handler whose pattern
   *  matches the current hostname; a failing handler is logged and does
   *  not prevent the others from being attached.
   *  @param {Object} deps – helpers forwarded to each matching handler */
  function init(deps) {
    const host = location.hostname;
    _handlers.forEach(({hostPattern, initFn}) => {
      if (!hostPattern.test(host)) return;
      try {
        initFn(deps);
      } catch (err) {
        console.error('[specialHandler] failed for', hostPattern, err);
      }
    });
  }

  window.SpecialEvents = { init };

})();
"#searchDropdownBox" 34 | value: str | None = None # preferred (unique) 35 | label: str | None = None # visible text fallback 36 | 37 | class ClickElementByTextAction(BaseModel): 38 | text: str 39 | element_type: Optional[str] 40 | nth: int = 0 41 | 42 | 43 | class InputTextAction(BaseModel): 44 | index: int 45 | text: str 46 | xpath: Optional[str] = None 47 | 48 | 49 | class DoneAction(BaseModel): 50 | text: str 51 | success: bool 52 | 53 | 54 | class SwitchTabAction(BaseModel): 55 | page_id: int 56 | 57 | 58 | class OpenTabAction(BaseModel): 59 | url: str 60 | 61 | 62 | class CloseTabAction(BaseModel): 63 | page_id: int 64 | 65 | 66 | class ScrollAction(BaseModel): 67 | amount: Optional[int] = None # The number of pixels to scroll. If None, scroll down/up one page 68 | 69 | 70 | class SendKeysAction(BaseModel): 71 | keys: str 72 | 73 | 74 | class GroupTabsAction(BaseModel): 75 | tab_ids: list[int] = Field(..., description='List of tab IDs to group') 76 | title: str = Field(..., description='Name for the tab group') 77 | color: Optional[str] = Field( 78 | 'blue', 79 | description='Color for the group (grey/blue/red/yellow/green/pink/purple/cyan)', 80 | ) 81 | 82 | 83 | class UngroupTabsAction(BaseModel): 84 | tab_ids: list[int] = Field(..., description='List of tab IDs to ungroup') 85 | 86 | 87 | class ExtractPageContentAction(BaseModel): 88 | value: str 89 | 90 | 91 | class NoParamsAction(BaseModel): 92 | """ 93 | Accepts absolutely anything in the incoming data 94 | and discards it, so the final parsed model is empty. 95 | """ 96 | 97 | model_config = ConfigDict(extra='allow') 98 | 99 | @model_validator(mode='before') 100 | def ignore_all_inputs(cls, values): 101 | # No matter what the user sends, discard it and return empty. 
102 | return {} 103 | 104 | 105 | class Position(BaseModel): 106 | x: int 107 | y: int 108 | 109 | 110 | class DragDropAction(BaseModel): 111 | # Element-based approach 112 | element_source: Optional[str] = Field(None, description='CSS selector or XPath of the element to drag from') 113 | element_target: Optional[str] = Field(None, description='CSS selector or XPath of the element to drop onto') 114 | element_source_offset: Optional[Position] = Field( 115 | None, description='Precise position within the source element to start drag (in pixels from top-left corner)' 116 | ) 117 | element_target_offset: Optional[Position] = Field( 118 | None, description='Precise position within the target element to drop (in pixels from top-left corner)' 119 | ) 120 | 121 | # Coordinate-based approach (used if selectors not provided) 122 | coord_source_x: Optional[int] = Field(None, description='Absolute X coordinate on page to start drag from (in pixels)') 123 | coord_source_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to start drag from (in pixels)') 124 | coord_target_x: Optional[int] = Field(None, description='Absolute X coordinate on page to drop at (in pixels)') 125 | coord_target_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to drop at (in pixels)') 126 | 127 | # Common options 128 | steps: Optional[int] = Field(10, description='Number of intermediate points for smoother movement (5-20 recommended)') 129 | delay_ms: Optional[int] = Field(5, description='Delay in milliseconds between steps (0 for fastest, 10-20 for more natural)') 130 | -------------------------------------------------------------------------------- /wap_replay/generate_exact_replay_list.py: -------------------------------------------------------------------------------- 1 | """ 2 | Batch-convert recorded event-JSON files into the canonical “exact-replay” 3 | action list by calling `record_metadata_to_actions` from browser-use. 
4 | 5 | Usage 6 | ----- 7 | python wap_replay/generate_exact_replay_list.py --data_dir_path \ 8 | [--output_dir_path data_processed/exact_replay] 9 | 10 | Example 11 | ----- 12 | python wap_replay/generate_exact_replay_list.py --data_dir_path data/20250423/Allrecipes--4 \ 13 | --output_dir_path data_processed/exact_replay 14 | """ 15 | from __future__ import annotations 16 | 17 | import argparse 18 | import json 19 | from pathlib import Path 20 | from typing import List, Dict, Any 21 | from browser_use.wap.exact_replay import record_metadata_to_actions 22 | from utils.action_processing import find_task_prompt, load_event_json 23 | 24 | # ---------------------------------------------------------------------------# 25 | # core function # 26 | # ---------------------------------------------------------------------------# 27 | def folder_to_actions(folder_path: str | Path) -> List[Dict[str, Any]]: 28 | """ 29 | Walk sub-directories recursively, load every *.json file, convert each 30 | to replay actions via `record_metadata_to_actions`, and return the 31 | concatenated list. 
32 | """ 33 | folder_path = Path(folder_path) 34 | 35 | if not folder_path.is_dir(): 36 | raise NotADirectoryError(folder_path) 37 | 38 | json_paths = list(folder_path.rglob("*.json")) # recursive search 39 | if not json_paths: 40 | print(f"[OTA Info] No JSON files found under {folder_path}") 41 | return [] 42 | 43 | print(f"[OTA Info] Found {len(json_paths)} event files.") 44 | 45 | all_actions: List[Dict[str, Any]] = [] 46 | 47 | for idx, event_path in enumerate(json_paths, 1): 48 | print(f"[{idx}/{len(json_paths)}] Loading {event_path}") 49 | try: 50 | event_json = load_event_json(event_path) 51 | actions = record_metadata_to_actions([event_json]) 52 | all_actions.extend(actions) 53 | except Exception as exc: 54 | print(f"[warn] could not process {event_path.name}: {exc}") 55 | 56 | print("[OTA Info] All done.") 57 | return all_actions 58 | 59 | 60 | def save_exact_replay_bundle( 61 | path: Path, 62 | *, 63 | ultimate_goal: str, 64 | task_id: str, 65 | actions: List[Dict[str, Any]], 66 | ) -> None: 67 | """ 68 | Write a JSON file shaped like 69 | """ 70 | bundle = { 71 | "ultimate_goal": ultimate_goal, 72 | "task_id": task_id, 73 | "type": "exact_replay", 74 | "action_list": actions, 75 | } 76 | path.write_text(json.dumps(bundle, ensure_ascii=False, indent=2), encoding="utf-8") 77 | print(f"[OTA info] wrote {len(actions)} actions → {path}") 78 | 79 | 80 | # ---------------------------------------------------------------------------# 81 | # command-line interface # 82 | # ---------------------------------------------------------------------------# 83 | def parse_args() -> argparse.Namespace: 84 | parser = argparse.ArgumentParser(description="Create exact-replay action list from a folder of event JSON files.") 85 | parser.add_argument("--data_dir_path", required=True, help="Folder containing recorded *.json files.") 86 | parser.add_argument("--output_dir_path", default="data_processed/exact_replay", help="Directory to store result file.") 87 | return 
parser.parse_args() 88 | 89 | def main() -> None: 90 | args = parse_args() 91 | 92 | input_folder = Path(args.data_dir_path) 93 | output_dir = Path(args.output_dir_path) 94 | output_dir.mkdir(parents=True, exist_ok=True) 95 | task_prompt, task_id = find_task_prompt(input_folder) 96 | output_path = output_dir / f"wap_exact_replay_list_{task_id}.json" 97 | 98 | actions = folder_to_actions(input_folder) 99 | save_exact_replay_bundle( 100 | output_path, 101 | ultimate_goal=task_prompt, 102 | task_id=task_id, 103 | actions=actions, 104 | ) 105 | 106 | if __name__ == "__main__": 107 | main() -------------------------------------------------------------------------------- /browser_use/dom/tests/debug_page_structure.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | from browser_use.browser.browser import Browser, BrowserConfig 8 | from browser_use.browser.context import BrowserContext 9 | 10 | 11 | async def analyze_page_structure(url: str): 12 | """Analyze and print the structure of a webpage with enhanced debugging""" 13 | browser = Browser( 14 | config=BrowserConfig( 15 | headless=False, # Set to True if you don't need to see the browser 16 | ) 17 | ) 18 | 19 | context = BrowserContext(browser=browser) 20 | 21 | try: 22 | async with context as ctx: 23 | # Navigate to the URL 24 | page = await ctx.get_current_page() 25 | await page.goto(url) 26 | await page.wait_for_load_state('networkidle') 27 | 28 | # Get viewport dimensions 29 | viewport_info = await page.evaluate("""() => { 30 | return { 31 | viewport: { 32 | width: window.innerWidth, 33 | height: window.innerHeight, 34 | scrollX: window.scrollX, 35 | scrollY: window.scrollY 36 | } 37 | } 38 | }""") 39 | 40 | print('\nViewport Information:') 41 | print(f'Width: {viewport_info["viewport"]["width"]}') 42 | print(f'Height: 
{viewport_info["viewport"]["height"]}') 43 | print(f'ScrollX: {viewport_info["viewport"]["scrollX"]}') 44 | print(f'ScrollY: {viewport_info["viewport"]["scrollY"]}') 45 | 46 | # Enhanced debug information for cookie consent and fixed position elements 47 | debug_info = await page.evaluate("""() => { 48 | function getElementInfo(element) { 49 | const rect = element.getBoundingClientRect(); 50 | const style = window.getComputedStyle(element); 51 | return { 52 | tag: element.tagName.toLowerCase(), 53 | id: element.id, 54 | className: element.className, 55 | position: style.position, 56 | rect: { 57 | top: rect.top, 58 | right: rect.right, 59 | bottom: rect.bottom, 60 | left: rect.left, 61 | width: rect.width, 62 | height: rect.height 63 | }, 64 | isFixed: style.position === 'fixed', 65 | isSticky: style.position === 'sticky', 66 | zIndex: style.zIndex, 67 | visibility: style.visibility, 68 | display: style.display, 69 | opacity: style.opacity 70 | }; 71 | } 72 | 73 | // Find cookie-related elements 74 | const cookieElements = Array.from(document.querySelectorAll('[id*="cookie"], [id*="consent"], [class*="cookie"], [class*="consent"]')); 75 | const fixedElements = Array.from(document.querySelectorAll('*')).filter(el => { 76 | const style = window.getComputedStyle(el); 77 | return style.position === 'fixed' || style.position === 'sticky'; 78 | }); 79 | 80 | return { 81 | cookieElements: cookieElements.map(getElementInfo), 82 | fixedElements: fixedElements.map(getElementInfo) 83 | }; 84 | }""") 85 | 86 | print('\nCookie-related Elements:') 87 | for elem in debug_info['cookieElements']: 88 | print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}') 89 | print(f'Position: {elem["position"]}') 90 | print(f'Rect: {elem["rect"]}') 91 | print(f'Z-Index: {elem["zIndex"]}') 92 | print(f'Visibility: {elem["visibility"]}') 93 | print(f'Display: {elem["display"]}') 94 | print(f'Opacity: {elem["opacity"]}') 95 | 96 | print('\nFixed/Sticky Position Elements:') 97 | for 
elem in debug_info['fixedElements']: 98 | print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}') 99 | print(f'Position: {elem["position"]}') 100 | print(f'Rect: {elem["rect"]}') 101 | print(f'Z-Index: {elem["zIndex"]}') 102 | 103 | print(f'\nPage Structure for {url}:\n') 104 | structure = await ctx.get_page_structure() 105 | print(structure) 106 | 107 | input('Press Enter to close the browser...') 108 | finally: 109 | await browser.close() 110 | 111 | 112 | if __name__ == '__main__': 113 | # You can modify this URL to analyze different pages 114 | 115 | urls = [ 116 | 'https://www.mlb.com/yankees/stats/', 117 | 'https://immobilienscout24.de', 118 | 'https://www.zeiss.com/career/en/job-search.html?page=1', 119 | 'https://www.zeiss.com/career/en/job-search.html?page=1', 120 | 'https://reddit.com', 121 | ] 122 | for url in urls: 123 | asyncio.run(analyze_page_structure(url)) 124 | -------------------------------------------------------------------------------- /browser_use/agent/system_prompt_wap_replay.md: -------------------------------------------------------------------------------- 1 | You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task by replaying the sub-goals for each step that we provided. 2 | 3 | # Input Format 4 | Task 5 | Previous steps 6 | Current URL 7 | Open Tabs 8 | Sub-goal List 9 | Interactive Elements 10 | [index]text 11 | - index: Numeric identifier for interaction 12 | - type: HTML element type (button, input, etc.) 13 | - text: Element description 14 | Example: 15 | [33] 16 | 17 | - Only elements with numeric indexes in [] are interactive 18 | - elements without [] provide only context 19 | 20 | # Response Rules 21 | 1. 
RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format: 22 | {{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not", 23 | "memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz", 24 | "subgoal_index": 1}}, 25 | "action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}} 26 | 27 | 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence. 28 | Common action sequences: 29 | - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}] 30 | - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}] 31 | - Actions are executed in the given order 32 | - If the page changes after an action, the sequence is interrupted and you get the new state. 33 | - only use multiple actions if it makes sense. 34 | 35 | 3. ELEMENT INTERACTION: 36 | - Only use indexes of the interactive elements 37 | - Elements marked with "[]Non-interactive text" are non-interactive 38 | 39 | 4. 
NAVIGATION & ERROR HANDLING: 40 | - If no suitable elements exist, use other functions to complete the task 41 | - Handle popups/cookies by accepting or closing them 42 | - Use scroll to find elements you are looking for 43 | - If you want to research something, open a new tab instead of using the current tab 44 | - If captcha pops up, try to solve it - else try a different approach 45 | - If the page is not fully loaded, use wait action 46 | 47 | 5. (MANDATORY) ACTIONS BASED ON SUB-GOAL LIST 48 | - You are provided with the previous and current sub-goals. 49 | - Check the action results in task history and the current page content to see whether the current status has already been satisfied: 50 | -> if YES, specify actions for the current sub-goal in this step 51 | -> if NO, try some more approaches to achieve the previous sub-goals, e.g.: scroll to find elements 52 | - In your response, fill the value of subgoal_index with the index of the sub-goal that you work on in this step 53 | - Keep track of the status and subresults in the memory. 54 | 55 | 6. TASK COMPLETION: 56 | - Use the done action as the last action as soon as the ultimate task is complete 57 | - Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps. 58 | - If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false! 59 | - If you have to do something repeatedly, for example the task says for "each", or "for all", or "x times", always count inside "memory" how many times you have done it and how many remain. Don't stop until you have completed what the task asked you. Only call done after the last step. 60 | - Don't hallucinate actions 61 | 62 | 7.
Form filling: 63 | - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. 64 | 65 | 8. Extraction: 66 | - If your task is to find information - call extract_content on the specific pages to get and store the information. 67 | Your responses must be always JSON with the specified format. 68 | -------------------------------------------------------------------------------- /browser_use/agent/memory/service.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | from typing import List, Optional 5 | 6 | from langchain_core.language_models.chat_models import BaseChatModel 7 | from langchain_core.messages import ( 8 | BaseMessage, 9 | HumanMessage, 10 | ) 11 | from langchain_core.messages.utils import convert_to_openai_messages 12 | from mem0 import Memory as Mem0Memory 13 | from pydantic import BaseModel 14 | 15 | from browser_use.agent.message_manager.service import MessageManager 16 | from browser_use.agent.message_manager.views import ManagedMessage, MessageMetadata 17 | from browser_use.utils import time_execution_sync 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class MemorySettings(BaseModel): 23 | """Settings for procedural memory.""" 24 | 25 | agent_id: str 26 | interval: int = 10 27 | config: Optional[dict] | None = None 28 | 29 | 30 | class Memory: 31 | """ 32 | Manages procedural memory for agents. 33 | 34 | This class implements a procedural memory management system using Mem0 that transforms agent interaction history 35 | into concise, structured representations at specified intervals. It serves to optimize context window 36 | utilization during extended task execution by converting verbose historical information into compact, 37 | yet comprehensive memory constructs that preserve essential operational knowledge. 
38 | """ 39 | 40 | def __init__( 41 | self, 42 | message_manager: MessageManager, 43 | llm: BaseChatModel, 44 | settings: MemorySettings, 45 | ): 46 | self.message_manager = message_manager 47 | self.llm = llm 48 | self.settings = settings 49 | self._memory_config = self.settings.config or {'vector_store': {'provider': 'faiss'}} 50 | self.mem0 = Mem0Memory.from_config(config_dict=self._memory_config) 51 | 52 | @time_execution_sync('--create_procedural_memory') 53 | def create_procedural_memory(self, current_step: int) -> None: 54 | """ 55 | Create a procedural memory if needed based on the current step. 56 | 57 | Args: 58 | current_step: The current step number of the agent 59 | """ 60 | logger.info(f'Creating procedural memory at step {current_step}') 61 | 62 | # Get all messages 63 | all_messages = self.message_manager.state.history.messages 64 | 65 | # Filter out messages that are marked as memory in metadata 66 | messages_to_process = [] 67 | new_messages = [] 68 | for msg in all_messages: 69 | # Exclude system message and initial messages 70 | if isinstance(msg, ManagedMessage) and msg.metadata.message_type in set(['init', 'memory']): 71 | new_messages.append(msg) 72 | else: 73 | messages_to_process.append(msg) 74 | 75 | if len(messages_to_process) <= 1: 76 | logger.info('Not enough non-memory messages to summarize') 77 | return 78 | 79 | # Create a summary 80 | summary = self._create([m.message for m in messages_to_process], current_step) 81 | 82 | if not summary: 83 | logger.warning('Failed to create summary') 84 | return 85 | 86 | # Replace the summarized messages with the summary 87 | summary_message = HumanMessage(content=summary) 88 | summary_tokens = self.message_manager._count_tokens(summary_message) 89 | summary_metadata = MessageMetadata(tokens=summary_tokens, message_type='memory') 90 | 91 | # Calculate the total tokens being removed 92 | removed_tokens = sum(m.metadata.tokens for m in messages_to_process) 93 | 94 | # Add the summary message 95 | 
class HistoryTreeProcessor:
    """
    Operations on the DOM elements

    Elements are compared through three SHA-256 hashes: the tag-name branch
    path, the attributes, and the xpath.

    @dev be careful - text nodes can change even if elements stay the same
    """

    @staticmethod
    def convert_dom_element_to_history_element(dom_element: "DOMElementNode") -> "DOMHistoryElement":
        """Build the `DOMHistoryElement` snapshot for *dom_element*."""
        # Imported here rather than at module level, as in the original code.
        from browser_use.browser.context import BrowserContext

        parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
        css_selector = BrowserContext._enhanced_css_selector_for_element(dom_element)
        return DOMHistoryElement(
            dom_element.tag_name,
            dom_element.xpath,
            dom_element.highlight_index,
            parent_branch_path,
            dom_element.attributes,
            dom_element.shadow_root,
            css_selector=css_selector,
            page_coordinates=dom_element.page_coordinates,
            viewport_coordinates=dom_element.viewport_coordinates,
            viewport_info=dom_element.viewport_info,
        )

    @staticmethod
    def find_history_element_in_tree(
        dom_history_element: "DOMHistoryElement", tree: "DOMElementNode"
    ) -> Optional["DOMElementNode"]:
        """
        Return the first highlighted node of *tree* whose hash equals that of
        *dom_history_element*, or ``None`` when there is no match.

        Nodes are visited in document (pre-) order. The walk uses an explicit
        stack so that a very deep tree cannot exceed the recursion limit.
        """
        target_hash = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)

        pending = [tree]
        while pending:
            node = pending.pop()
            # Only nodes that carry a highlight index are match candidates.
            if node.highlight_index is not None:
                if HistoryTreeProcessor._hash_dom_element(node) == target_hash:
                    return node
            # Push in reverse so the left-most child is popped first.
            element_children = [child for child in node.children if isinstance(child, DOMElementNode)]
            pending.extend(reversed(element_children))
        return None

    @staticmethod
    def compare_history_element_and_dom_element(
        dom_history_element: "DOMHistoryElement", dom_element: "DOMElementNode"
    ) -> bool:
        """Return True when both elements produce the same hash triple."""
        hashed_dom_history_element = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)
        hashed_dom_element = HistoryTreeProcessor._hash_dom_element(dom_element)

        return hashed_dom_history_element == hashed_dom_element

    @staticmethod
    def _hash_dom_history_element(dom_history_element: "DOMHistoryElement") -> "HashedDomElement":
        """Hash a stored history element (branch path, attributes, xpath)."""
        branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(dom_history_element.entire_parent_branch_path)
        attributes_hash = HistoryTreeProcessor._attributes_hash(dom_history_element.attributes)
        xpath_hash = HistoryTreeProcessor._xpath_hash(dom_history_element.xpath)

        return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)

    @staticmethod
    def _hash_dom_element(dom_element: "DOMElementNode") -> "HashedDomElement":
        """Hash a live DOM element the same way as a history element."""
        parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
        branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(parent_branch_path)
        attributes_hash = HistoryTreeProcessor._attributes_hash(dom_element.attributes)
        xpath_hash = HistoryTreeProcessor._xpath_hash(dom_element.xpath)
        # The element text is deliberately left out of the hash (see class note).

        return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)

    @staticmethod
    def _get_parent_branch_path(dom_element: "DOMElementNode") -> list[str]:
        """
        Return the tag names from just below the root down to *dom_element*.

        The element itself is included; the parentless root node is not.
        """
        branch = []
        current_element = dom_element
        while current_element.parent is not None:
            branch.append(current_element)
            current_element = current_element.parent

        branch.reverse()

        return [node.tag_name for node in branch]

    @staticmethod
    def _parent_branch_path_hash(parent_branch_path: list[str]) -> str:
        """SHA-256 hex digest of the branch path joined with '/'."""
        parent_branch_path_string = '/'.join(parent_branch_path)
        return hashlib.sha256(parent_branch_path_string.encode()).hexdigest()

    @staticmethod
    def _attributes_hash(attributes: dict[str, str]) -> str:
        """SHA-256 hex digest of the ``key=value`` pairs concatenated in dict order."""
        attributes_string = ''.join(f'{key}={value}' for key, value in attributes.items())
        return hashlib.sha256(attributes_string.encode()).hexdigest()

    @staticmethod
    def _xpath_hash(xpath: str) -> str:
        """SHA-256 hex digest of the xpath string."""
        return hashlib.sha256(xpath.encode()).hexdigest()

    @staticmethod
    def _text_hash(dom_element: "DOMElementNode") -> str:
        """SHA-256 hex digest of the element's text up to the next clickable element."""
        text_string = dom_element.get_all_text_till_next_clickable_element()
        return hashlib.sha256(text_string.encode()).hexdigest()
def addLoggingLevel(levelName, levelNum, methodName=None):
    """
    Comprehensively adds a new logging level to the `logging` module and the
    currently configured logging class.

    `levelName` becomes an attribute of the `logging` module with the value
    `levelNum`. `methodName` becomes a convenience method for both `logging`
    itself and the class returned by `logging.getLoggerClass()` (usually just
    `logging.Logger`). If `methodName` is not specified, `levelName.lower()` is
    used.

    To avoid accidental clobberings of existing attributes, this method will
    raise an `AttributeError` if the level name is already an attribute of the
    `logging` module or if the method name is already present

    Example
    -------
    >>> addLoggingLevel('TRACE', logging.DEBUG - 5)
    >>> logging.getLogger(__name__).setLevel('TRACE')
    >>> logging.getLogger(__name__).trace('that worked')
    >>> logging.trace('so did this')
    >>> logging.TRACE
    5

    """
    if not methodName:
        methodName = levelName.lower()

    if hasattr(logging, levelName):
        raise AttributeError(f'{levelName} already defined in logging module')
    if hasattr(logging, methodName):
        raise AttributeError(f'{methodName} already defined in logging module')
    if hasattr(logging.getLoggerClass(), methodName):
        raise AttributeError(f'{methodName} already defined in logger class')

    # This method was inspired by the answers to Stack Overflow post
    # http://stackoverflow.com/q/2183233/2988730, especially
    # http://stackoverflow.com/a/13638084/2988730
    def logForLevel(self, message, *args, **kwargs):
        if self.isEnabledFor(levelNum):
            # Logger._log takes the positional args as one tuple.
            self._log(levelNum, message, args, **kwargs)

    def logToRoot(message, *args, **kwargs):
        logging.log(levelNum, message, *args, **kwargs)

    logging.addLevelName(levelNum, levelName)
    setattr(logging, levelName, levelNum)
    setattr(logging.getLoggerClass(), methodName, logForLevel)
    setattr(logging, methodName, logToRoot)


class BrowserUseFormatter(logging.Formatter):
    """Formatter that shortens ``browser_use.*`` logger names for display.

    A logger name starting with ``browser_use.`` is rendered as its
    second-to-last dotted component (``browser_use.agent.service`` -> ``agent``).
    """

    def format(self, record):
        original_name = record.name
        if isinstance(original_name, str) and original_name.startswith('browser_use.'):
            record.name = original_name.split('.')[-2]
        try:
            return super().format(record)
        finally:
            # The record is shared with every other handler; put the real name
            # back so the shortening stays a display-only detail.
            record.name = original_name


def setup_logging():
    """
    Configure console logging for browser_use.

    The level is read from the ``BROWSER_USE_LOGGING_LEVEL`` environment
    variable (``result``, ``debug``; anything else means info). Nothing is
    configured when the root logger already has handlers.
    """
    # Try to add RESULT level, but ignore if it already exists
    try:
        addLoggingLevel('RESULT', 35)  # This allows ERROR, FATAL and CRITICAL
    except AttributeError:
        pass  # Level already exists, which is fine

    log_type = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower()

    # Check if handlers are already set up. Past this point the root logger has
    # no handlers, so there is nothing to clear.
    root = logging.getLogger()
    if root.hasHandlers():
        return

    # Setup single handler for all loggers
    console = logging.StreamHandler(sys.stdout)

    # additional setLevel here to filter logs
    if log_type == 'result':
        console.setLevel('RESULT')
        console.setFormatter(BrowserUseFormatter('%(message)s'))
    else:
        console.setFormatter(BrowserUseFormatter('%(levelname)-8s [%(name)s] %(message)s'))

    # Configure root logger only
    root.addHandler(console)

    # switch cases for log_type
    if log_type == 'result':
        root.setLevel('RESULT')  # string usage to avoid syntax error
    elif log_type == 'debug':
        root.setLevel(logging.DEBUG)
    else:
        root.setLevel(logging.INFO)

    # Configure browser_use logger
    browser_use_logger = logging.getLogger('browser_use')
    browser_use_logger.propagate = False  # Don't propagate to root logger
    browser_use_logger.addHandler(console)
    browser_use_logger.setLevel(root.level)  # Set same level as root logger

    browser_use_logger.info('BrowserUse logging setup complete with level %s', log_type)

    # Silence third-party loggers
    for logger_name in (
        'WDM',
        'httpx',
        'selenium',
        'playwright',
        'urllib3',
        'asyncio',
        'langchain',
        'openai',
        'httpcore',
        'charset_normalizer',
        'anthropic._base_client',
        'PIL.PngImagePlugin',
        'trafilatura.htmlprocessing',
        'trafilatura',
    ):
        third_party = logging.getLogger(logger_name)
        third_party.setLevel(logging.ERROR)
        third_party.propagate = False
class MessageHistory(BaseModel):
    """Ordered list of managed messages plus a running token total."""

    messages: list[ManagedMessage] = Field(default_factory=list)
    current_tokens: int = 0

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
        """Wrap *message* with *metadata* and store it.

        The entry is appended when *position* is None, otherwise inserted at
        that index. The token total grows by ``metadata.tokens`` either way.
        """
        entry = ManagedMessage(message=message, metadata=metadata)
        if position is not None:
            self.messages.insert(position, entry)
        else:
            self.messages.append(entry)
        self.current_tokens += metadata.tokens

    def add_model_output(self, output: 'AgentOutput') -> None:
        """Record a model output as an AI tool call followed by an empty tool reply."""
        call = {
            'name': 'AgentOutput',
            'args': output.model_dump(mode='json', exclude_unset=True),
            'id': '1',
            'type': 'tool_call',
        }
        # Token counts below are fixed estimates, not measurements.
        self.add_message(AIMessage(content='', tool_calls=[call]), MessageMetadata(tokens=100))
        # Empty tool response paired with the call above through the shared id '1'.
        self.add_message(ToolMessage(content='', tool_call_id='1'), MessageMetadata(tokens=10))

    def get_messages(self) -> list[BaseMessage]:
        """Return the bare messages, in order, without their metadata."""
        return [entry.message for entry in self.messages]

    def get_total_tokens(self) -> int:
        """Return the running token total."""
        return self.current_tokens

    def remove_oldest_message(self) -> None:
        """Drop the first message that is not a SystemMessage, if there is one."""
        oldest = next(
            (pos for pos, entry in enumerate(self.messages) if not isinstance(entry.message, SystemMessage)),
            None,
        )
        if oldest is None:
            return
        removed = self.messages.pop(oldest)
        self.current_tokens -= removed.metadata.tokens

    def remove_last_state_message(self) -> None:
        """Drop the final message when it is a HumanMessage and more than two messages exist."""
        if len(self.messages) <= 2:
            return
        if not isinstance(self.messages[-1].message, HumanMessage):
            return
        removed = self.messages.pop()
        self.current_tokens -= removed.metadata.tokens

(disconnected)

15 | 32 |
33 | 34 | 35 | 36 | 57 | 61 | 64 | 68 | 69 | 70 | 71 |
37 |
38 |
39 | 40 | 41 |
42 |
43 | 44 | 45 |
46 |
47 | 48 | 49 |
50 |
51 | 52 | 53 |
54 |
55 | Event (0) 56 |
58 | 59 | Target 60 | 62 | Details 63 | 65 | 66 | 67 |
72 |
def extract_json_from_model_output(content: str) -> dict:
    """Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON.

    When *content* contains a ``` fence, only the text between the first pair
    of fences is parsed. A leading language tag line (e.g. ``json``) is
    removed; a first line that already starts the JSON payload is kept.

    Args:
        content: Raw text returned by the model.

    Returns:
        The value decoded by ``json.loads``.

    Raises:
        ValueError: If the extracted text is not valid JSON. The original
            ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    try:
        if '```' in content:
            # The payload is the segment between the first and second fence.
            content = content.split('```')[1]
            # Only strip the first line when it is a language identifier;
            # dropping it unconditionally would discard JSON that begins
            # directly after the opening fence.
            first_line, newline, remainder = content.partition('\n')
            if newline and not first_line.lstrip().startswith(('{', '[')):
                content = remainder
        return json.loads(content)
    except json.JSONDecodeError as e:
        logger.warning('Failed to parse model output: %s %s', content, e)
        raise ValueError('Could not parse response.') from e
def _merge_successive_messages(messages: list[BaseMessage], class_to_merge: Type[BaseMessage]) -> list[BaseMessage]:
    """Merge runs of consecutive *class_to_merge* messages into one message.

    Some models like deepseek-reasoner dont allow multiple human messages in a
    row, so each run is collapsed into its first message with the text of the
    following ones appended.

    The input messages are never modified: the first message of a run is
    shallow-copied before anything is appended to it, so repeated calls on the
    same history do not keep growing the caller's messages.

    Args:
        messages: Messages in conversation order.
        class_to_merge: Message class whose consecutive instances are merged.

    Returns:
        A new list. Messages that are not part of a merged run are the same
        objects as in *messages*.
    """
    import copy  # local import: only this helper needs it

    merged_messages: list[BaseMessage] = []
    in_streak = False
    owns_last = False  # True once merged_messages[-1] is our private copy
    for message in messages:
        if not isinstance(message, class_to_merge):
            merged_messages.append(message)
            in_streak = False
            continue

        if not in_streak:
            merged_messages.append(message)
            in_streak = True
            owns_last = False
            continue

        # For list content only the text of the first part is carried over.
        if isinstance(message.content, list):
            addition = message.content[0]['text']  # type:ignore
        else:
            addition = message.content

        if not owns_last:
            # Copy-on-write: detach from the caller's object before editing.
            merged_messages[-1] = copy.copy(merged_messages[-1])
            owns_last = True

        target = merged_messages[-1]
        if isinstance(target.content, list):
            # Build a new list (the shallow copy still shares the old one) and
            # add the text as one part instead of extending char by char.
            target.content = [*target.content, {'type': 'text', 'text': addition}]
        else:
            target.content = target.content + addition
    return merged_messages
-------------------------------------------------------------------------------- /browser_use/agent/system_prompt.md: -------------------------------------------------------------------------------- 1 | You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules. 2 | 3 | # Input Format 4 | Task 5 | Previous steps 6 | Current URL 7 | Open Tabs 8 | Interactive Elements 9 | [index]text 10 | - index: Numeric identifier for interaction 11 | - type: HTML element type (button, input, etc.) 12 | - text: Element description 13 | Example: 14 | [33] 15 | 16 | - Only elements with numeric indexes in [] are interactive 17 | - elements without [] provide only context 18 | 19 | # Response Rules 20 | 1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format: 21 | {{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not", 22 | "memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz", 23 | "next_goal": "What needs to be done with the next immediate action"}}, 24 | "action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}} 25 | 26 | 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence. 
27 | Common action sequences: 28 | - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}] 29 | - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}] 30 | - Actions are executed in the given order 31 | - If the page changes after an action, the sequence is interrupted and you get the new state. 32 | - Only provide the action sequence until an action which changes the page state significantly. 33 | - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page 34 | - only use multiple actions if it makes sense. 35 | 36 | 3. ELEMENT INTERACTION: 37 | - Only use indexes of the interactive elements 38 | - Elements marked with "[]Non-interactive text" are non-interactive 39 | 40 | 4. NAVIGATION & ERROR HANDLING: 41 | - If no suitable elements exist, use other functions to complete the task 42 | - If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc. 43 | - Handle popups/cookies by accepting or closing them 44 | - Use scroll to find elements you are looking for 45 | - If you want to research something, open a new tab instead of using the current tab 46 | - If captcha pops up, try to solve it - else try a different approach 47 | - If the page is not fully loaded, use wait action 48 | 49 | 5. TASK COMPLETION: 50 | - Use the done action as the last action as soon as the ultimate task is complete 51 | - Dont use "done" before you are done with everything the user asked you, except you reach the last step of max_steps. 52 | - If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false! 
53 | - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step. 54 | - Don't hallucinate actions 55 | - Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task. 56 | 57 | 6. VISUAL CONTEXT: 58 | - When an image is provided, use it to understand the page layout 59 | - Bounding boxes with labels on their top right corner correspond to element indexes 60 | 61 | 7. Form filling: 62 | - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. 63 | 64 | 8. Long tasks: 65 | - Keep track of the status and subresults in the memory. 66 | - You are provided with procedural memory summaries that condense previous task history (every N steps). Use these summaries to maintain context about completed actions, current progress, and next steps. The summaries appear in chronological order and contain key information about navigation history, findings, errors encountered, and current state. Refer to these summaries to avoid repeating actions and to ensure consistent progress toward the task goal. 67 | 68 | 9. Extraction: 69 | - If your task is to find information - call extract_content on the specific pages to get and store the information. 70 | Your responses must be always JSON with the specified format. 
class RegisteredAction(BaseModel):
    """A single action known to the registry, with its callable and parameter schema."""

    name: str
    description: str
    function: Callable
    param_model: Type[BaseModel]

    # Optional availability filters: a list of domain glob patterns and/or a
    # predicate on the page deciding whether the action is offered there.
    domains: list[str] | None = None  # e.g. ['*.google.com', 'www.bing.com', 'yahoo.*']
    page_filter: Callable[[Page], bool] | None = None

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def prompt_description(self) -> str:
        """Render the action for the prompt.

        The result is the description line followed by
        ``{name: {param: schema, ...}}``, where each parameter schema comes
        from ``param_model.model_json_schema()`` with the ``title`` key removed.
        """
        hidden = ['title']
        properties = self.param_model.model_json_schema()['properties']
        rendered = {}
        for param_name, schema in properties.items():
            rendered[param_name] = {key: val for key, val in schema.items() if key not in hidden}
        return ''.join(
            [
                f'{self.description}: \n',
                '{',
                str(self.name),
                ': ',
                str(rendered),
                '}',
            ]
        )
class ActionRegistry(BaseModel):
    """Model representing the action registry"""

    actions: Dict[str, RegisteredAction] = {}

    @staticmethod
    def _match_domains(domains: list[str] | None, url: str) -> bool:
        """
        Match a list of domain glob patterns against a URL.

        The comparison uses the URL's parsed hostname, never the raw netloc.
        Splitting the netloc on ':' would treat the userinfo part of
        ``https://allowed.com:x@evil.com`` as the host and let a hostile URL
        pass an ``allowed.com`` filter; it would also mangle IPv6 literals.
        Matching is case-insensitive because host names are.

        Args:
            domains: A list of domain patterns that can include glob patterns (* wildcard),
                or None for "no domain restriction"
            url: The URL to match against

        Returns:
            True if no restriction applies (``domains`` is None or ``url`` is empty)
            or the URL's hostname matches one of the patterns, False otherwise
            (including when the URL has no hostname or cannot be parsed)
        """

        if domains is None or not url:
            return True

        import fnmatch
        from urllib.parse import urlparse

        try:
            # .hostname strips userinfo, port and IPv6 brackets and is lower-cased.
            hostname = urlparse(url).hostname
            if not hostname:
                return False
            return any(fnmatch.fnmatchcase(hostname, pattern.lower()) for pattern in domains)
        except (AttributeError, TypeError, ValueError):
            # Malformed URL or non-string input: treat as "not allowed".
            return False

    @staticmethod
    def _match_page_filter(page_filter: Callable[[Page], bool] | None, page: Page) -> bool:
        """Match a page filter against a page; a missing filter always matches"""
        if page_filter is None:
            return True
        return page_filter(page)

    def get_prompt_description(self, page: Page | None = None) -> str:
        """Get a description of all actions for the prompt

        Args:
            page: If provided, filter actions by page using page_filter and domains.

        Returns:
            A string description of available actions, one action per line.
            - If page is None: return only actions with no page_filter and no domains (for system prompt)
            - If page is provided: return only filtered actions that match the current page (excluding unfiltered actions)
        """
        if page is None:
            # For system prompt (no page provided), include only actions with no filters
            return '\n'.join(
                action.prompt_description()
                for action in self.actions.values()
                if action.page_filter is None and action.domains is None
            )

        # only include filtered actions for the current page
        filtered_actions = []
        for action in self.actions.values():
            if not (action.domains or action.page_filter):
                # skip actions with no filters, they are already included in the system prompt
                continue

            domain_is_allowed = self._match_domains(action.domains, page.url)
            page_is_allowed = self._match_page_filter(action.page_filter, page)

            if domain_is_allowed and page_is_allowed:
                filtered_actions.append(action)

        return '\n'.join(action.prompt_description() for action in filtered_actions)
6 | OTA-tool-kits 7 |
8 |
9 |
10 | Homepage 12 | Hugging Face 14 | Code License 16 |


17 |
18 | 19 | 20 | OTA WAP Browser Action Capturer 21 | ====================== 22 | 23 | OTA browser action capturer is a simple tool which helps you collect interactions with the browser, such as clicking and typing, and transforms them into well-structured data for generating LLM-powered "record and replay" instructions. The action data will be organized in a JSON format and sent to your local WAP data-collection server. 24 | 25 | To deploy the local WAP server, please refer to: https://github.com/OTA-Tech-AI/webagentprotocol 26 | WAP (Web Agent Protocol) is our standard protocol for AI Agent record-and-play inferencing. 27 | 28 | 29 | Installation 30 | ----- 31 | 32 | Install our public **Chrome extension** at [WAP Browser Action Capturer](https://chromewebstore.google.com/detail/wap-browser-action-captur/chikiefojkdpmijbhepipdkadcljlbmh). 33 | 34 | If you want to install this extension locally, please refer to: https://developer.chrome.com/docs/extensions/get-started/tutorial/hello-world 35 | 36 | Usage of Action Capturer 37 | ----- 38 | 39 | ### Prepare 40 | 41 | Open Chrome DevTools and navigate to the **"OTA Action Capturer"** panel and you will see: 42 | 43 | Panel UI 44 | 45 | Make sure the IP address and port in Settings are correct: 46 | 47 | Settings GIF 48 | 49 | ### Start a record 50 | 51 | Clearly describe the task you will be working on and click "START RECORD": 52 | 53 | Start Record 54 | 55 | The capturer will record actions such as clicking, typing, navigating etc. only on the current page. 56 | 57 | If the HTML content of the page changes, the event table will show the added/removed/changed nodes. Information about these changes will be collected and sent to your local WAP server. 
58 | 59 | Recording 60 | 61 | An example of the formatted data that you will receive on the WAP backend server: 62 | 63 | ```json 64 | { 65 | "taskId": "MkCAhQsHgXn7YgaK", 66 | "type": "click", 67 | "actionTimestamp": 1746325231479, 68 | "eventTarget": { 69 | "type": "click", 70 | "target": "\n
...", 71 | "targetId": "mntl-card-list-card--extendable_3-0", 72 | "targetClass": "comp mntl-card-list-card--extendable mntl-universal-card mntl-document-card mntl-card card card--no-image" 73 | }, 74 | "allEvents": {}, 75 | "pageHTMLContent": "
..." 76 | } 77 | ``` 78 | 79 | The extension uses MutationObserver to capture any node changes in the page; please refer to [MutationObserver](https://developer.mozilla.org/en/docs/Web/API/MutationObserver) for more details. 80 | 81 | ### Pause 82 | During recording, you can click on "PAUSE" to pause the capturer so that no actions will be recorded until you hit "RESUME": 83 | 84 | Pause 85 | 86 | Thanks to 87 | ------ 88 | 89 | OTA action capturer is built on top of [DOMListenerExtension](https://github.com/kdzwinel/DOMListenerExtension). 90 | 91 | License 92 | ------- 93 | 94 | This program is free software: you can redistribute it and/or modify 95 | it under the terms of the GNU General Public License as published by 96 | the Free Software Foundation, either version 3 of the License, or 97 | (at your option) any later version. 98 | 99 | This program is distributed in the hope that it will be useful, 100 | but WITHOUT ANY WARRANTY; without even the implied warranty of 101 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 102 | GNU General Public License for more details. 103 | 104 | You should have received a copy of the GNU General Public License 105 | along with this program. If not, see <https://www.gnu.org/licenses/>. 
def choose_template(action_type: str) -> Path:
    """Map an action type to its prompt template under TEMPLATE_DIR.

    Four action types have a dedicated template named after them; every other
    value falls back to ``common.md``.
    """
    dedicated = ("submit", "go-back-or-forward", "task-start", "task-finish")
    for known in dedicated:
        if action_type == known:
            return TEMPLATE_DIR / f"{known}.md"
    return TEMPLATE_DIR / "common.md"
51 | page_html = run_html_sanitizer(page_html, action["type"] or "") 52 | 53 | page_html = re.sub(r'[\n\r\t\\]+', '', page_html) 54 | return { 55 | "action_type": raw.get("type"), 56 | "action": action, 57 | "change_events": change_events, 58 | "page_content": page_html 59 | } 60 | 61 | 62 | def generate_subgoal_speculate_prompt(summary_event: Dict[str, Any], ultimate_goal: str, subtask_name: str, output_path: str) -> None: 63 | # 1) bundle relevant pieces 64 | grouped_items = extract_action_bundle(summary_event, True) 65 | # 2) prepare the data that the template expects 66 | context = { 67 | "ultimate_goal": ultimate_goal, 68 | "action": grouped_items["action"], 69 | "change_events": grouped_items["change_events"], 70 | "page_content": grouped_items["page_content"], 71 | } 72 | 73 | template = choose_template(grouped_items["action_type"]) 74 | template_text = template.read_text(encoding="utf-8") 75 | filled_markdown = Template(template_text).render(**context) 76 | 77 | # 4) save to subgoals/subgoal_.md 78 | output_dir = Path(output_path) 79 | output_dir.mkdir(parents=True, exist_ok=True) 80 | out_path = output_dir / f"subgoal_{subtask_name}_{grouped_items['action_type']}.md" 81 | out_path.write_text(filled_markdown, encoding="utf-8") 82 | 83 | return out_path 84 | 85 | # --------------------------------------------------------------------------- 86 | # helper: locate exactly one task-start file and return its taskDescription 87 | # --------------------------------------------------------------------------- 88 | def find_task_prompt(data_dir: str | Path) -> str: 89 | data_dir = Path(data_dir) 90 | 91 | # 0️⃣ Does the path exist? 92 | if not data_dir.exists(): 93 | sys.exit(f"[OTA error] given path does not exist: {data_dir}") 94 | 95 | # 0️⃣b Is it a directory? 
96 | if not data_dir.is_dir(): 97 | sys.exit(f"[OTA error] path is not a directory: {data_dir}") 98 | 99 | # 1️⃣ Gather every *.json recursively (sub-folders included) 100 | json_paths = sorted(data_dir.rglob("*.json")) 101 | if not json_paths: 102 | sys.exit(f"[OTA error] no *.json files found under {data_dir}") 103 | 104 | # 2️⃣ First file (by name) must be task-start 105 | first = json.loads(json_paths[0].read_text(encoding="utf-8")) 106 | if first.get("type") != "task-start": 107 | sys.exit("[OTA error] first JSON file is not a task-start record") 108 | 109 | # 3️⃣ Collect *all* task-start files 110 | task_start_files = [ 111 | p for p in json_paths 112 | if json.loads(p.read_text(encoding="utf-8")).get("type") == "task-start" 113 | ] 114 | if len(task_start_files) == 0: 115 | sys.exit("[OTA error] no task-start file found") 116 | if len(task_start_files) > 1: 117 | names = ", ".join(p.name for p in task_start_files) 118 | sys.exit(f"[OTA error] multiple task-start files detected: {names}") 119 | 120 | # 4️⃣ Extract taskDescription 121 | task_json = json.loads(task_start_files[0].read_text(encoding="utf-8")) 122 | task_id = task_json.get("taskId") 123 | task_desc = task_json.get("taskDescription") 124 | if not task_desc or not task_id: 125 | sys.exit(f"[OTA error] task-start file {task_start_files[0].name} " 126 | "has no taskDescription or taskId") 127 | 128 | return task_desc, task_id 129 | 130 | 131 | def load_event_json(path: str | Path) -> Dict[str, Any]: 132 | """Read the given JSON file and return it as a Python dict.""" 133 | path = Path(path) 134 | if not path.is_file(): 135 | raise FileNotFoundError(f"Cannot find JSON file: {path}") 136 | with path.open("r", encoding="utf-8") as f: 137 | return json.load(f) -------------------------------------------------------------------------------- /utils/subgoal_generator.py: -------------------------------------------------------------------------------- 1 | """Sub‑goal batch generator. 
def _load_prompts(dir_path: str | Path) -> List[tuple[Path, str]]:
    """Collect the non-empty ``*.md`` prompts found directly in *dir_path*.

    Files are visited in sorted path order. Each text is read as UTF-8 and
    stripped; files that are empty after stripping are left out.
    """
    return [
        (md_file, body)
        for md_file in sorted(Path(dir_path).glob("*.md"))
        if (body := md_file.read_text(encoding="utf-8").strip())
    ]
def wap_subgoal_list_generation(
    ultimate_goal: str,
    task_id: str,
    jsonl_path: str | Path,
    out_path: str | Path = "wap_subgoal.json",
) -> List[Dict[str, Any]]:
    """Build the smart-replay sub-goal list from a JSONL file of LLM replies.

    Each non-blank line of *jsonl_path* is parsed as JSON. Its ``reply`` field
    is cleaned with ``_clean_reply`` and parsed as JSON again, and the
    ``next_goal`` value becomes one sub-goal. Lines that cannot be used
    (malformed JSON, a record or reply that is not a JSON object, or no
    ``next_goal``) are reported on stdout and skipped. A fixed "task starts"
    entry is placed first and a "task done" entry last.

    The result is also written to *out_path* as a JSON document with the keys
    ``ultimate_goal``, ``task_id``, ``type`` and ``subgoal_list``.

    Args:
        ultimate_goal: Task description stored in the output document.
        task_id: Task identifier stored in the output document.
        jsonl_path: JSON-lines file to read.
        out_path: Destination of the JSON output.

    Returns:
        The list of ``{"index": int, "subgoal": str}`` entries.

    Raises:
        FileNotFoundError: If *jsonl_path* is not an existing file.
    """
    jsonl_path = Path(jsonl_path)
    if not jsonl_path.is_file():
        raise FileNotFoundError(jsonl_path)

    goals: List[Dict[str, Any]] = [{"index": 0, "subgoal": "task starts, go for the next sub-goal"}]
    with jsonl_path.open("r", encoding="utf-8") as fh:
        for line_no, line in enumerate(fh, 1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f"[extract] line {line_no}: malformed JSON – skipped")
                continue
            # Valid JSON need not be an object; .get() below requires a dict.
            if not isinstance(record, dict):
                print(f"[extract] line {line_no}: record is not a JSON object – skipped")
                continue

            raw_reply = str(record.get("reply", ""))
            cleaned = _clean_reply(raw_reply)
            try:
                reply_json = json.loads(cleaned)
            except json.JSONDecodeError:
                print(f"[extract] line {line_no}: reply not valid JSON – skipped")
                continue
            # The model may answer with a bare list or string; skip instead of crashing.
            if not isinstance(reply_json, dict):
                print(f"[extract] line {line_no}: reply is not a JSON object – skipped")
                continue

            goal_text = reply_json.get("next_goal")
            if goal_text:
                goals.append({"index": len(goals), "subgoal": goal_text})
            else:
                print(f"[extract] line {line_no}: no 'next_goal' key – skipped")

    goals.append({"index": len(goals), "subgoal": "task done"})

    final_output = {
        "ultimate_goal": ultimate_goal,
        "task_id": task_id,
        "type": "smart_replay",
        "subgoal_list": goals
    }
    Path(out_path).write_text(json.dumps(final_output, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[extract] wrote {len(goals)} sub‑goals → {out_path}")
    return goals
def _as_identifier(name: str) -> str:
    """Coerce *name* into a valid Python identifier; valid names pass through unchanged."""
    if name.isidentifier():
        return name
    cleaned = "".join(
        ch if (ch.isascii() and ch.isalnum()) or ch == "_" else "_" for ch in name.strip()
    ).strip("_")
    if not cleaned:
        return "replay"
    if cleaned[0].isdigit():
        cleaned = f"_{cleaned}"
    return cleaned


def _render_replay_tool(function_name: str, mode: str, ultimate_goal: str, task_id: str) -> str:
    """Return the source of one ``@mcp.tool()`` coroutine for *mode* ("smart" or "exact")."""
    docstring = f"{mode} replay: {ultimate_goal}"
    file_path = f"data_processed/{mode}_replay/wap_{mode}_replay_list_{task_id}.json"
    # ``!r`` emits properly escaped string literals, so quotes, backslashes
    # or newlines in the goal or task id cannot break the generated module.
    return f'''
@mcp.tool()
async def {function_name}_{mode}_replay() -> str:
    {docstring!r}
    async with httpx.AsyncClient(timeout=600.0) as client:
        response = await client.get(
            "http://localhost:3089/replay",
            params={{
                "concurrent": 1,
                "model": "openai",
                "file_path": {file_path!r}
            }}
        )
        return response.text
    return "FAILED"
'''


def create_mcp_server(ultimate_goal: str, function_name: str, task_id: str) -> str:
    """
    Creates the source code of an MCP server for the given task.

    A tool function is emitted for each replay list that exists on disk under
    ``./data_processed/{smart,exact}_replay/``; when neither file exists the
    returned module only creates the ``FastMCP`` instance and runs it.

    Args:
        ultimate_goal: The goal description; used as the server name and in
            each tool's docstring. It is embedded as an escaped string literal.
        function_name: The base name for the tool functions. Characters that
            are not valid in a Python identifier are replaced with ``_``.
        task_id: The task ID used in the replay file paths.

    Returns:
        The complete Python code as a string
    """
    safe_name = _as_identifier(function_name)

    code = f'''
from mcp.server.fastmcp import FastMCP
import httpx

mcp = FastMCP({ultimate_goal!r})
'''

    # Only include the tool function for the existing replay file(s)
    for mode in ("smart", "exact"):
        replay_path = os.path.join(
            ".", "data_processed", f"{mode}_replay", f"wap_{mode}_replay_list_{task_id}.json"
        )
        if os.path.exists(replay_path):
            code += _render_replay_tool(safe_name, mode, ultimate_goal, task_id)

    code += '''
if __name__ == "__main__":
    mcp.run(transport="stdio")
'''
    return code
def count_string_tokens(string: str, model: str) -> int:
    """Count the number of tokens in a string using a specified model.

    Args:
        string: Text to tokenize.
        model: Model name passed to ``ChatOpenAI``.

    Returns:
        The token count reported by the LangChain model wrapper.
    """
    llm = ChatOpenAI(model=model)
    # LangChain language models expose token counting as ``get_num_tokens``;
    # NOTE(review): ``ChatOpenAI`` is not known to offer ``count_tokens`` - confirm
    # against the installed langchain_openai version.
    return llm.get_num_tokens(string)
async def test_focus_vs_all_elements():
    """Interactively dump the clickable elements of a fixed list of websites.

    For each website the page is opened and, after a short pause, the
    clickable elements are extracted with highlighting enabled and a viewport
    expansion of 1000, then printed. Pressing Enter clears the highlights and
    repeats the extraction on the same page; entering ``q`` moves on to the
    next website.
    """
    config = BrowserContextConfig(
        # cookies_file='cookies3.json',
        disable_security=True,
        wait_for_network_idle_page_load_time=2,
    )

    browser = Browser(
        config=BrowserConfig(
            # browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        )
    )
    context = BrowserContext(browser=browser, config=config)  # noqa: F821

    websites = [
        'https://en.wikipedia.org/wiki/Humanist_Party_of_Ontario',
        'https://www.google.com/travel/flights?tfs=CBwQARoJagcIARIDTEpVGglyBwgBEgNMSlVAAUgBcAGCAQsI____________AZgBAQ&tfu=KgIIAw&hl=en-US&gl=US',
        # 'https://www.concur.com/?&cookie_preferences=cpra',
        'https://immobilienscout24.de',
        'https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit',
        'https://www.zeiss.com/career/en/job-search.html?page=1',
        'https://www.mlb.com/yankees/stats/',
        'https://www.amazon.com/s?k=laptop&s=review-rank&crid=1RZCEJ289EUSI&qid=1740202453&sprefix=laptop%2Caps%2C166&ref=sr_st_review-rank&ds=v1%3A4EnYKXVQA7DIE41qCvRZoNB4qN92Jlztd3BPsTFXmxU',
        'https://codepen.io/geheimschriftstift/pen/mPLvQz',
        'https://reddit.com',
        'https://www.google.com/search?q=google+hi&oq=google+hi&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQRRhA0gEIMjI2NmowajSoAgCwAgE&sourceid=chrome&ie=UTF-8',
        'https://kayak.com/flights',
        'https://google.com',
        'https://amazon.com',
        'https://github.com',
    ]

    async with context as context:
        page = await context.get_current_page()
        dom_service = DomService(page)

        for website in websites:
            await page.goto(website)
            # Give dynamic content time to settle. asyncio.sleep keeps the
            # event loop running, whereas time.sleep would block it.
            await asyncio.sleep(2)

            while True:
                try:
                    print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')

                    # First get all elements
                    print('\nGetting all elements:')
                    all_elements_state = await time_execution_sync('get_all_elements')(dom_service.get_clickable_elements)(
                        highlight_elements=True, viewport_expansion=1000
                    )

                    selector_map = all_elements_state.selector_map
                    total_elements = len(selector_map.keys())
                    print(f'Total number of elements: {total_elements}')

                    print(all_elements_state.element_tree.clickable_elements_to_string())

                    answer = input('Press Enter to clear highlights and continue...')
                    if answer == 'q':
                        break

                    await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')

                except Exception as e:
                    print(f'Error: {e}')
                    # Retrying immediately would spin forever on a persistent
                    # failure; report it and move on to the next website.
                    break
@pytest.fixture
def sample_history(action_registry):
    """Three-step agent history: click an element, extract content, then finish.

    The second step carries both extracted content and an error; the third
    step is the only one marked as done.
    """
    # Actions are created through the dynamic action model (nested params structure)
    first_step = AgentHistory(
        model_output=AgentOutput(
            current_state=AgentBrain(
                evaluation_previous_goal='None',
                memory='Started task',
                next_goal='Click button',
            ),
            action=[action_registry(click_element={'index': 1})],
        ),
        result=[ActionResult(is_done=False)],
        state=BrowserStateHistory(
            url='https://example.com',
            title='Page 1',
            tabs=[TabInfo(url='https://example.com', title='Page 1', page_id=1)],
            screenshot='screenshot1.png',
            interacted_element=[{'xpath': '//button[1]'}],
        ),
    )

    second_step = AgentHistory(
        model_output=AgentOutput(
            current_state=AgentBrain(
                evaluation_previous_goal='Clicked button',
                memory='Button clicked',
                next_goal='Extract content',
            ),
            action=[action_registry(extract_page_content={'value': 'text'})],
        ),
        result=[
            ActionResult(
                is_done=False,
                extracted_content='Extracted text',
                error='Failed to extract completely',
            )
        ],
        state=BrowserStateHistory(
            url='https://example.com/page2',
            title='Page 2',
            tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)],
            screenshot='screenshot2.png',
            interacted_element=[{'xpath': '//div[1]'}],
        ),
    )

    final_step = AgentHistory(
        model_output=AgentOutput(
            current_state=AgentBrain(
                evaluation_previous_goal='Extracted content',
                memory='Content extracted',
                next_goal='Finish task',
            ),
            action=[action_registry(done={'text': 'Task completed'})],
        ),
        result=[ActionResult(is_done=True, extracted_content='Task completed', error=None)],
        state=BrowserStateHistory(
            url='https://example.com/page2',
            title='Page 2',
            tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)],
            screenshot='screenshot3.png',
            interacted_element=[{'xpath': '//div[1]'}],
        ),
    )

    return AgentHistoryList(history=[first_step, second_step, final_step])
@dataclass
class DOMBaseNode:
    """Fields common to every node of the DOM tree (mutable dataclass)."""

    # Visibility flag of the node.
    is_visible: bool
    # Enclosing element. There is no default: pass None at construction and
    # assign the real parent afterwards, which avoids circular references
    # while the tree is being built.
    parent: Optional['DOMElementNode']
@dataclass(frozen=False)
class DOMElementNode(DOMBaseNode):
    """
    Element node of the DOM tree.

    xpath: the xpath of the element from the last root node (shadow root or iframe OR document if no shadow root or iframe).
    To properly reference the element we need to recursively switch the root node until we find the element (work your way up the tree with `.parent`)
    """

    tag_name: str
    xpath: str
    attributes: Dict[str, str]
    children: List[DOMBaseNode]
    is_interactive: bool = False
    is_top_element: bool = False
    is_in_viewport: bool = False
    is_ota_interactive_element: bool = False
    shadow_root: bool = False
    highlight_index: Optional[int] = None
    viewport_coordinates: Optional[CoordinateSet] = None
    page_coordinates: Optional[CoordinateSet] = None
    viewport_info: Optional[ViewportInfo] = None

    def __repr__(self) -> str:
        """Render the opening tag with its attributes, followed by any set flags in brackets."""
        tag_str = f'<{self.tag_name}'

        # Add attributes
        for key, value in self.attributes.items():
            tag_str += f' {key}="{value}"'
        tag_str += '>'

        # Add extra info
        extras = []
        if self.is_interactive:
            extras.append('interactive')
        if self.is_top_element:
            extras.append('top')
        if self.shadow_root:
            extras.append('shadow-root')
        if self.highlight_index is not None:
            extras.append(f'highlight:{self.highlight_index}')
        if self.is_in_viewport:
            extras.append('in-viewport')
        if self.is_ota_interactive_element:
            extras.append('ota-interactive-element')

        if extras:
            tag_str += f' [{", ".join(extras)}]'

        return tag_str

    @cached_property
    def hash(self) -> HashedDomElement:
        """Hash of this element as computed by ``HistoryTreeProcessor`` (cached after first access)."""
        # Imported lazily, as in the rest of this method's history, to keep
        # the processor module out of this module's import-time dependencies.
        from browser_use.dom.history_tree_processor.service import (
            HistoryTreeProcessor,
        )

        return HistoryTreeProcessor._hash_dom_element(self)

    def get_all_text_till_next_clickable_element(self, max_depth: int = -1) -> str:
        """Collect the text below this element, stopping at nested highlighted elements.

        Args:
            max_depth: Maximum depth to descend (this element is depth 0);
                ``-1`` means unlimited.

        Returns:
            The collected text node contents joined with newlines and stripped.
        """
        text_parts = []

        def collect_text(node: DOMBaseNode, current_depth: int) -> None:
            if max_depth != -1 and current_depth > max_depth:
                return

            # Skip this branch if we hit a highlighted element (except for the current node).
            # Identity is what is meant here: ``!=`` would run the dataclass-generated
            # field-by-field comparison, which follows ``parent`` and ``children``.
            if isinstance(node, DOMElementNode) and node is not self and node.highlight_index is not None:
                return

            if isinstance(node, DOMTextNode):
                text_parts.append(node.text)
            elif isinstance(node, DOMElementNode):
                for child in node.children:
                    collect_text(child, current_depth + 1)

        collect_text(self, 0)
        return '\n'.join(text_parts).strip()

    @time_execution_sync('--clickable_elements_to_string')
    def clickable_elements_to_string(self, include_attributes: list[str] | None = None) -> str:
        """Render the subtree as one line per highlighted element or free-standing visible text node.

        Args:
            include_attributes: Attribute names to include for highlighted
                elements; when empty or ``None`` no attributes are rendered.

        Returns:
            The rendered lines joined with newlines.
        """
        formatted_text = []

        def process_node(node: DOMBaseNode) -> None:
            if isinstance(node, DOMElementNode):
                # Add element with highlight_index
                if node.highlight_index is not None:
                    attributes_str = ''
                    text = node.get_all_text_till_next_clickable_element()
                    if include_attributes:
                        # dict.fromkeys de-duplicates while keeping the element's own
                        # attribute order, so the rendered line is stable across runs
                        # (a set would order the entries by randomized string hashes).
                        attributes = list(
                            dict.fromkeys(
                                f'{key!s}={value!s}'
                                for key, value in node.attributes.items()
                                if key in include_attributes and value != node.tag_name
                            )
                        )
                        if text in attributes:
                            attributes.remove(text)
                        attributes_str = ';'.join(attributes)
                    line = f'[{node.highlight_index}]<{node.tag_name} '
                    if attributes_str:
                        line += f'{attributes_str}'
                    if text:
                        if attributes_str:
                            line += f'>{text}'
                        else:
                            line += f'{text}'
                    line += '/>'
                    formatted_text.append(line)

                # Process children regardless
                for child in node.children:
                    process_node(child)

            elif isinstance(node, DOMTextNode):
                # Add text only if it doesn't have a highlighted parent
                if not node.has_parent_with_highlight_index() and node.is_visible:  # and node.is_parent_top_element()
                    formatted_text.append(f'{node.text}')

        process_node(self)
        return '\n'.join(formatted_text)

    def get_file_upload_element(self, check_siblings: bool = True) -> Optional['DOMElementNode']:
        """Find an ``<input type="file">`` at, below, or (optionally) next to this element.

        Args:
            check_siblings: Also search the subtrees of this element's
                siblings. Recursive calls always pass ``False``.

        Returns:
            The first matching element, or ``None`` when there is none.
        """
        # Check if current element is a file input
        if self.tag_name == 'input' and self.attributes.get('type') == 'file':
            return self

        # Check children
        for child in self.children:
            if isinstance(child, DOMElementNode):
                result = child.get_file_upload_element(check_siblings=False)
                if result:
                    return result

        # Check siblings only for the initial call
        if check_siblings and self.parent:
            for sibling in self.parent.children:
                if sibling is not self and isinstance(sibling, DOMElementNode):
                    result = sibling.get_file_upload_element(check_siblings=False)
                    if result:
                        return result

        return None
class DomService:
    """Builds a Python DOM tree for a Playwright page by evaluating ``buildDomTree.js`` in it."""

    def __init__(self, page: 'Page'):
        self.page = page
        self.xpath_cache = {}

        # Read explicitly as UTF-8 so loading does not depend on the platform's default encoding.
        self.js_code = resources.files('browser_use.dom').joinpath('buildDomTree.js').read_text(encoding='utf-8')

    # region - Clickable elements
    @time_execution_async('--get_clickable_elements')
    async def get_clickable_elements(
        self,
        highlight_elements: bool = True,
        focus_element: int = -1,
        viewport_expansion: int = 0,
    ) -> DOMState:
        """Build the DOM tree for the current page and return it with its selector map."""
        element_tree, selector_map = await self._build_dom_tree(highlight_elements, focus_element, viewport_expansion)
        return DOMState(element_tree=element_tree, selector_map=selector_map)

    @time_execution_async('--get_cross_origin_iframes')
    async def get_cross_origin_iframes(self) -> list[str]:
        """Return the URLs of visible cross-origin frames that are not known ad/tracker frames."""
        # invisible cross-origin iframes are used for ads and tracking, dont open those
        hidden_frame_urls = await self.page.locator('iframe').filter(visible=False).evaluate_all('e => e.map(e => e.src)')

        ad_domains = ('doubleclick.net', 'adroll.com', 'googletagmanager.com')
        page_netloc = urlparse(self.page.url).netloc

        frame_urls = []
        for frame in self.page.frames:
            netloc = urlparse(frame.url).netloc
            if not netloc:  # exclude data:urls and about:blank
                continue
            if netloc == page_netloc:  # exclude same-origin iframes
                continue
            if frame.url in hidden_frame_urls:  # exclude hidden frames
                continue
            if any(domain in netloc for domain in ad_domains):  # exclude most common ad network tracker frame URLs
                continue
            frame_urls.append(frame.url)
        return frame_urls

    @time_execution_async('--build_dom_tree')
    async def _build_dom_tree(
        self,
        highlight_elements: bool,
        focus_element: int,
        viewport_expansion: int,
    ) -> tuple[DOMElementNode, SelectorMap]:
        """Run the DOM extraction script in the page and convert its result to a node tree.

        Raises:
            ValueError: If the page cannot evaluate JavaScript, or the result
                has no usable root element.
        """
        if await self.page.evaluate('1+1') != 2:
            raise ValueError('The page cannot evaluate javascript code properly')

        if self.page.url == 'about:blank':
            # short-circuit if the page is a new empty tab for speed, no need to inject buildDomTree.js
            return (
                DOMElementNode(
                    tag_name='body',
                    xpath='',
                    attributes={},
                    children=[],
                    is_visible=False,
                    parent=None,
                ),
                {},
            )

        # NOTE: We execute JS code in the browser to extract important DOM information.
        #       The returned hash map contains information about the DOM tree and the
        #       relationship between the DOM elements.
        debug_mode = logger.getEffectiveLevel() == logging.DEBUG
        args = {
            'doHighlightElements': highlight_elements,
            'focusHighlightIndex': focus_element,
            'viewportExpansion': viewport_expansion,
            'debugMode': debug_mode,
        }

        try:
            eval_page: dict = await self.page.evaluate(self.js_code, args)
        except Exception as e:
            logger.error('Error evaluating JavaScript: %s', e)
            raise

        # Only log performance metrics in debug mode
        if debug_mode and 'perfMetrics' in eval_page:
            logger.debug(
                'DOM Tree Building Performance Metrics for: %s\n%s',
                self.page.url,
                json.dumps(eval_page['perfMetrics'], indent=2),
            )

        return await self._construct_dom_tree(eval_page)

    @time_execution_async('--construct_dom_tree')
    async def _construct_dom_tree(
        self,
        eval_page: dict,
    ) -> tuple[DOMElementNode, SelectorMap]:
        """Turn the script result (``map`` and ``rootId``) into linked nodes.

        Raises:
            ValueError: If the root id is missing from the map or does not
                refer to an element node.
        """
        js_node_map = eval_page['map']
        js_root_id = eval_page['rootId']

        selector_map = {}
        node_map = {}

        for node_id, node_data in js_node_map.items():
            node, children_ids = self._parse_node(node_data)
            if node is None:
                continue

            node_map[node_id] = node

            if isinstance(node, DOMElementNode) and node.highlight_index is not None:
                selector_map[node.highlight_index] = node

            # NOTE: We know that we are building the tree bottom up
            #       and all children are already processed.
            if isinstance(node, DOMElementNode):
                for child_id in children_ids:
                    if child_id not in node_map:
                        continue

                    child_node = node_map[child_id]

                    child_node.parent = node
                    node.children.append(child_node)

        # .get() so that a missing root reaches the ValueError below instead
        # of surfacing as a bare KeyError.
        html_to_dict = node_map.get(str(js_root_id))

        del node_map
        del js_node_map
        del js_root_id

        gc.collect()

        if html_to_dict is None or not isinstance(html_to_dict, DOMElementNode):
            raise ValueError('Failed to parse HTML to dictionary')

        return html_to_dict, selector_map

    def _parse_node(
        self,
        node_data: dict,
    ) -> tuple[Optional[DOMBaseNode], list[int]]:
        """Convert one raw node record into a node object plus the ids of its children.

        Returns ``(None, [])`` for an empty record and ``(text_node, [])`` for
        text nodes. NOTE(review): child ids are looked up in the same map
        whose root key is built with ``str(...)`` - confirm whether they are
        really ints as annotated.
        """
        if not node_data:
            return None, []

        # Process text nodes immediately
        if node_data.get('type') == 'TEXT_NODE':
            text_node = DOMTextNode(
                text=node_data['text'],
                is_visible=node_data['isVisible'],
                parent=None,
            )
            return text_node, []

        # Process coordinates if they exist for element nodes

        viewport_info = None

        if 'viewport' in node_data:
            viewport_info = ViewportInfo(
                width=node_data['viewport']['width'],
                height=node_data['viewport']['height'],
            )

        element_node = DOMElementNode(
            tag_name=node_data['tagName'],
            xpath=node_data['xpath'],
            attributes=node_data.get('attributes', {}),
            children=[],
            is_visible=node_data.get('isVisible', False),
            is_interactive=node_data.get('isInteractive', False),
            is_top_element=node_data.get('isTopElement', False),
            is_in_viewport=node_data.get('isInViewport', False),
            is_ota_interactive_element=node_data.get('isOTAInteractiveElement', False),
            highlight_index=node_data.get('highlightIndex'),
            shadow_root=node_data.get('shadowRoot', False),
            parent=None,
            viewport_info=viewport_info,
        )

        children_ids = node_data.get('children', [])

        return element_node, children_ids
48 | 49 | Returns: 50 | SystemMessage: Formatted system prompt 51 | """ 52 | return self.system_message 53 | 54 | 55 | # Functions: 56 | # {self.default_action_description} 57 | 58 | # Example: 59 | # {self.example_response()} 60 | # Your AVAILABLE ACTIONS: 61 | # {self.default_action_description} 62 | 63 | 64 | class AgentMessagePrompt: 65 | def __init__( 66 | self, 67 | state: 'BrowserState', 68 | result: Optional[List['ActionResult']] = None, 69 | include_attributes: list[str] = [], 70 | step_info: Optional['AgentStepInfo'] = None, 71 | subgoals: Optional[list] = None, 72 | ): 73 | self.state = state 74 | self.result = result 75 | self.include_attributes = include_attributes 76 | self.step_info = step_info 77 | self.subgoals = subgoals or [] 78 | 79 | def get_user_message(self, use_vision: bool = True, is_smart_replay: bool = False) -> HumanMessage: 80 | elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes) 81 | 82 | has_content_above = (self.state.pixels_above or 0) > 0 83 | has_content_below = (self.state.pixels_below or 0) > 0 84 | 85 | if elements_text != '': 86 | if has_content_above: 87 | elements_text = ( 88 | f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}' 89 | ) 90 | else: 91 | elements_text = f'[Start of page]\n{elements_text}' 92 | if has_content_below: 93 | elements_text = ( 94 | f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...' 
95 | ) 96 | else: 97 | elements_text = f'{elements_text}\n[End of page]' 98 | else: 99 | elements_text = 'empty page' 100 | 101 | if self.step_info: 102 | step_info_description = f'Current step: {self.step_info.step_number + 1}/{self.step_info.max_steps}' 103 | else: 104 | step_info_description = '' 105 | time_str = datetime.now().strftime('%Y-%m-%d %H:%M') 106 | step_info_description += f'Current date and time: {time_str}' 107 | 108 | 109 | state_description = "" 110 | 111 | if not is_smart_replay: 112 | state_description = f""" 113 | [Task history memory ends] 114 | [Current state starts here] 115 | The following is one-time information - if you need to remember it write it to memory: 116 | Current url: {self.state.url} 117 | Available tabs: 118 | {self.state.tabs} 119 | Interactive elements from top layer of the current page inside the viewport: 120 | {elements_text} 121 | {step_info_description} 122 | """ 123 | else: 124 | state_description = f""" 125 | [Task history memory ends] 126 | [sub-goals start here] 127 | Previous sub-goal: {self.subgoals[0]} 128 | Current sub-goal: {self.subgoals[1]} 129 | [sub-goals end] 130 | [Current state starts here] 131 | The following is one-time information - if you need to remember it write it to memory: 132 | Current url: {self.state.url} 133 | Available tabs: 134 | {self.state.tabs} 135 | Interactive elements from top layer of the current page inside the viewport: 136 | {elements_text} 137 | {step_info_description} 138 | """ 139 | 140 | if self.result: 141 | for i, result in enumerate(self.result): 142 | if result.extracted_content: 143 | state_description += f'\nAction result {i + 1}/{len(self.result)}: {result.extracted_content}' 144 | if result.error: 145 | # only use last line of error 146 | error = result.error.split('\n')[-1] 147 | state_description += f'\nAction error {i + 1}/{len(self.result)}: ...{error}' 148 | 149 | if self.state.screenshot and use_vision is True: 150 | # Format message for vision model 151 | 
return HumanMessage( 152 | content=[ 153 | {'type': 'text', 'text': state_description}, 154 | { 155 | 'type': 'image_url', 156 | 'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'}, # , 'detail': 'low' 157 | }, 158 | ] 159 | ) 160 | 161 | return HumanMessage(content=state_description) 162 | 163 | 164 | class PlannerPrompt(SystemPrompt): 165 | def get_system_message(self, is_planner_reasoning) -> Union[SystemMessage, HumanMessage]: 166 | planner_prompt_text = """You are a planning agent that helps break down tasks into smaller steps and reason about the current state. 167 | Your role is to: 168 | 1. Analyze the current state and history 169 | 2. Evaluate progress towards the ultimate goal 170 | 3. Identify potential challenges or roadblocks 171 | 4. Suggest the next high-level steps to take 172 | 173 | Inside your messages, there will be AI messages from different agents with different formats. 174 | 175 | Your output format should be always a JSON object with the following fields: 176 | { 177 | "state_analysis": "Brief analysis of the current state and what has been done so far", 178 | "progress_evaluation": "Evaluation of progress towards the ultimate goal (as percentage and description)", 179 | "challenges": "List any potential challenges or roadblocks", 180 | "next_steps": "List 2-3 concrete next steps to take", 181 | "reasoning": "Explain your reasoning for the suggested next steps" 182 | } 183 | 184 | Ignore the other AI messages output structures. 
185 | 186 | Keep your responses concise and focused on actionable insights.""" 187 | 188 | if is_planner_reasoning: 189 | return HumanMessage(content=planner_prompt_text) 190 | else: 191 | return SystemMessage(content=planner_prompt_text) 192 | -------------------------------------------------------------------------------- /chrome-extension/css/panel.css: -------------------------------------------------------------------------------- 1 | @font-face { 2 | font-family: 'Raleway'; 3 | font-style: normal; 4 | font-weight: 400; 5 | src: local('Raleway'), url(../other/Raleway.woff2) format('woff2'); 6 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 7 | } 8 | 9 | * { 10 | box-sizing: border-box; 11 | } 12 | 13 | body { 14 | padding: 10px; 15 | background: white; 16 | } 17 | 18 | .intro { 19 | position: absolute; 20 | left: 50%; 21 | top: 50%; 22 | transform: translate(-50%, -50%); 23 | 24 | display: flex; 25 | flex-direction: column; 26 | align-items: center; 27 | text-align: center; 28 | } 29 | 30 | nav { 31 | position: fixed; 32 | z-index: 1; 33 | bottom: 0; 34 | left: 0; 35 | right: 0; 36 | background: rgba(255, 255, 255, 0.7); 37 | padding: 10px; 38 | } 39 | 40 | nav > button { 41 | margin-bottom: 0; 42 | background: white; 43 | } 44 | 45 | .top { 46 | opacity: 1; 47 | transition: opacity 300ms; 48 | } 49 | 50 | .top.hidden { 51 | pointer-events: none; 52 | opacity: 0; 53 | } 54 | 55 | .status { 56 | position: fixed; 57 | top: 10px; 58 | right: 10px; 59 | color: rgba(139, 0, 0, 0.5); 60 | font-size: small; 61 | z-index: 1; 62 | cursor: help; 63 | } 64 | 65 | .status.connected { 66 | display: none; 67 | } 68 | 69 | .events { 70 | margin-bottom: 50px; 71 | table-layout: fixed; 72 | } 73 | 74 | .events thead th { 75 | vertical-align: bottom; 76 | } 77 | 78 | .events thead th input[type=text] { 79 | margin-bottom: 9px; 80 | height: 30px; 81 | font-weight: normal; 82 | width: 
100%; 83 | min-width: 55px; 84 | } 85 | 86 | .events thead th .checkbox { 87 | margin-bottom: 9px; 88 | } 89 | 90 | .events td { 91 | word-wrap: break-word; 92 | } 93 | 94 | .events .time { 95 | display: block; 96 | color: #b6b6b6; 97 | font-size: small; 98 | } 99 | 100 | .events td:first-child { 101 | border-left: solid transparent 5px; 102 | padding-left: 4px; 103 | } 104 | 105 | .events tr.nodes-added td:first-child { 106 | border-left-color: rgb(138,219,246); 107 | } 108 | 109 | .events tr.nodes-removed td:first-child { 110 | border-left-color: rgb(255,198,139); 111 | } 112 | 113 | .events tr.text-changed td:first-child { 114 | border-left-color: rgb(254,239,139); 115 | } 116 | 117 | .events tr.attribute-changed td:first-child { 118 | border-left-color: rgb(179,146,248); 119 | } 120 | 121 | .events th:nth-child(1), .events td:nth-child(1) { 122 | min-width: 155px; 123 | } 124 | 125 | .events th:nth-child(3), .events td:nth-child(3) { 126 | /* width: 60%; */ 127 | } 128 | 129 | .events td > div { 130 | max-height: 85px; 131 | overflow: auto; 132 | } 133 | 134 | .events td:nth-child(3) hr { 135 | margin-top: 2px; 136 | margin-bottom: 2px; 137 | } 138 | 139 | .events thead .counter { 140 | color: #b6b6b6; 141 | } 142 | 143 | .node:hover { 144 | text-decoration: underline; 145 | cursor: pointer; 146 | } 147 | 148 | /* 149 | Filter 150 | */ 151 | 152 | .events tbody tr { 153 | display: none; 154 | } 155 | 156 | .events tbody.nodes-added-visible tr.target-match.nodes-added, 157 | .events tbody.nodes-removed-visible tr.target-match.nodes-removed, 158 | .events tbody.text-changed-visible tr.target-match.text-changed, 159 | .events tbody.attribute-changed-visible tr.target-match.attribute-changed { 160 | display: table-row; 161 | } 162 | 163 | /* 164 | Checkbox 165 | */ 166 | 167 | .checkbox { 168 | width: 20px; 169 | position: relative; 170 | display: inline-block; 171 | } 172 | 173 | .checkbox label { 174 | cursor: pointer; 175 | position: absolute; 176 | width: 20px; 
177 | height: 20px; 178 | top: 0; 179 | border-radius: 4px; 180 | margin: 0; 181 | } 182 | 183 | .checkbox label:after { 184 | opacity: 0; 185 | content: ''; 186 | position: absolute; 187 | width: 9px; 188 | height: 5px; 189 | background: transparent; 190 | top: 4px; 191 | left: 4px; 192 | border: 3px solid #fcfff4; 193 | border-top: none; 194 | border-right: none; 195 | 196 | transform: rotate(-45deg); 197 | } 198 | 199 | .checkbox label:hover::after { 200 | opacity: 0.3; 201 | } 202 | 203 | .checkbox input { 204 | visibility: hidden; 205 | margin: 0; 206 | } 207 | 208 | .checkbox input[type=checkbox]:checked + label:after { 209 | opacity: 1; 210 | } 211 | 212 | .checkbox.nodes-added label { 213 | background: rgb(138,219,246); 214 | } 215 | 216 | .checkbox.nodes-removed label { 217 | background: rgb(255,198,139); 218 | } 219 | 220 | .checkbox.text-changed label { 221 | background: rgb(254,239,139); 222 | } 223 | 224 | .checkbox.attribute-changed label { 225 | background: rgb(179,146,248); 226 | } 227 | 228 | .task-description-input { 229 | width: 100%; 230 | padding: 8px; 231 | font-size: 14px; 232 | min-height: 20px; 233 | margin-bottom: 0; 234 | resize: none; 235 | overflow: hidden; 236 | box-sizing: border-box; 237 | line-height: 1.4; 238 | } 239 | 240 | .task-description-input.invalid { 241 | border: 1px solid #e74c3c; 242 | } 243 | 244 | .task-description-section { 245 | padding-bottom: 10px; 246 | } 247 | 248 | .task-description-label { 249 | display: none; /* Default hidden */ 250 | font-size: 16px; 251 | font-weight: bold; 252 | margin-top: 10px; 253 | margin-bottom: 100px; 254 | padding: 5px 10px; 255 | background-color: #f0f0f0; 256 | color: #333; 257 | border: 1px solid #ccc; 258 | border-radius: 5px; 259 | white-space: pre-wrap; /* Allow line breaks if task description is long */ 260 | } 261 | 262 | .task-description-task-id { 263 | display: none; /* Default hidden */ 264 | font-size: 16px; 265 | font-weight: bold; 266 | margin-top: 10px; 267 | 
margin-bottom: 100px; 268 | padding: 5px 10px; 269 | background-color: hsl(52, 100%, 57%); 270 | color: #333; 271 | border: 1px solid #ccc; 272 | border-radius: 5px; 273 | white-space: pre-wrap; /* Allow line breaks if task description is long */ 274 | } 275 | 276 | .hidden { display: none !important; } 277 | 278 | .settings-panel { 279 | padding: 10px; 280 | border-top: 1px solid #ddd; 281 | } 282 | 283 | .settings-panel label { 284 | display: block; 285 | margin: 6px 0; 286 | } 287 | 288 | /* Make the header a positioning context */ 289 | table.events thead { 290 | position: relative; 291 | } 292 | 293 | /* Pin the button */ 294 | .settings-btn { 295 | position: absolute; 296 | top: 4px; 297 | right: 8px; 298 | background-color: #e5e5e5; 299 | } 300 | 301 | .social-buttons { 302 | display: flex; 303 | align-items: center; 304 | gap: 8px; 305 | } 306 | 307 | .website-btn { 308 | font-family: inherit; 309 | font-size: 10px; 310 | font-weight: 600; 311 | line-height: 1; 312 | padding: 5px 10px; 313 | color: #fff; 314 | background: #000d1d; 315 | border-radius: 15px; 316 | text-decoration: none; 317 | } 318 | 319 | .website-btn:hover, 320 | .website-btn:focus { 321 | color: #fff; 322 | background: #00223d; 323 | outline: none; 324 | } 325 | 326 | .website-btn:active { 327 | color: #fff; 328 | background: #00294e; 329 | } -------------------------------------------------------------------------------- /chrome-extension/js/EventTable.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | "use strict"; 3 | 4 | function formatNode(node) { 5 | return '' + node.selector + ''; 6 | } 7 | 8 | function formatValue(value) { 9 | if (value === null) { 10 | return 'null'; 11 | } else if (value === undefined) { 12 | return 'undefined'; 13 | } else { 14 | return '"' + value + '"'; 15 | } 16 | } 17 | 18 | function momentJS(date, timeOnly) { 19 | let hours = date.getHours(); 20 | let minutes = date.getMinutes(); 21 | let seconds = 
date.getSeconds(); 22 | hours = hours % 24; 23 | minutes = minutes < 10 ? '0' + minutes : minutes; 24 | seconds = seconds < 10 ? '0' + seconds : seconds; 25 | const strTime = hours + ':' + minutes + ':' + seconds; 26 | 27 | if (timeOnly) { 28 | return strTime; 29 | } 30 | 31 | return date.getMonth()+1 + "/" + date.getDate() + "/" + date.getFullYear() + " " + strTime + ":" + date.getMilliseconds(); 32 | } 33 | 34 | function formatDate(timestamp) { 35 | const date = new Date(timestamp); 36 | 37 | return `${momentJS(date, true)}`; 38 | } 39 | 40 | function formatEventDetails(event) { 41 | var details = ""; 42 | switch (event.type) { 43 | case "nodes added": 44 | details = event.nodes.length + ' node(s) added: ' + 45 | '' + (event.nodes.map(formatNode)).join(', ') + ''; 46 | break; 47 | case "nodes removed": 48 | details = event.nodes.length + ' node(s) removed: ' + 49 | '' + (event.nodes.map(formatNode)).join(', ') + ''; 50 | break; 51 | case "attribute changed": 52 | details = '"' + event.attribute + '" '; 53 | 54 | if (event.oldValue === null && event.newValue === "") { 55 | details += ' was added'; 56 | } else if (event.newValue === null && event.oldValue === "") { 57 | details += ' was removed'; 58 | } else { 59 | details += 'changed from ' + formatValue(event.oldValue) + ' ' + 60 | 'to ' + formatValue(event.newValue) + ''; 61 | } 62 | 63 | break; 64 | case "text changed": 65 | details = 'text changed ' + 66 | 'from ' + formatValue(event.oldValue) + ' ' + 67 | 'to ' + formatValue(event.newValue) + ''; 68 | break; 69 | } 70 | 71 | return details; 72 | } 73 | 74 | function EventTable(table) { 75 | this._tableHead = table.tHead; 76 | this._tableBody = table.tBodies[0]; 77 | this._counter = this._tableHead.querySelector('.counter'); 78 | this._targetFilter = (this._tableHead).querySelector('.target-filter'); 79 | this._count = this._tableBody.children.length; 80 | 81 | //FILTERS 82 | var thead = this._tableHead; 83 | var tbody = this._tableBody; 84 | var targetFilter 
= this._targetFilter; 85 | var typeFilters = (this._tableHead).querySelectorAll('.type-filters input'); 86 | 87 | function updateTypeFilters() { 88 | var nodesAdded = thead.querySelector('.nodes-added input').checked; 89 | var nodesRemoved = thead.querySelector('.nodes-removed input').checked; 90 | var textChanged = thead.querySelector('.text-changed input').checked; 91 | var attributeChanged = thead.querySelector('.attribute-changed input').checked; 92 | 93 | if (nodesAdded) { 94 | tbody.classList.add('nodes-added-visible'); 95 | } else { 96 | tbody.classList.remove('nodes-added-visible'); 97 | } 98 | 99 | if (nodesRemoved) { 100 | tbody.classList.add('nodes-removed-visible'); 101 | } else { 102 | tbody.classList.remove('nodes-removed-visible'); 103 | } 104 | 105 | if (textChanged) { 106 | tbody.classList.add('text-changed-visible'); 107 | } else { 108 | tbody.classList.remove('text-changed-visible'); 109 | } 110 | 111 | if (attributeChanged) { 112 | tbody.classList.add('attribute-changed-visible'); 113 | } else { 114 | tbody.classList.remove('attribute-changed-visible'); 115 | } 116 | } 117 | 118 | updateTypeFilters(); 119 | 120 | function updateTargetFilter() { 121 | var query = (targetFilter.value).trim(); 122 | 123 | for (var i = 0, l = tbody.children.length; i < l; i++) { 124 | var tr = tbody.children[i]; 125 | var targetTd = tr.children[1]; 126 | 127 | if (!query || targetTd.innerText.indexOf(query) > -1) { 128 | tr.classList.add('target-match'); 129 | } else { 130 | tr.classList.remove('target-match'); 131 | } 132 | } 133 | } 134 | 135 | updateTargetFilter(); 136 | 137 | targetFilter.addEventListener('keyup', updateTargetFilter); 138 | 139 | for (var i = 0, l = typeFilters.length; i < l; i++) { 140 | typeFilters[i].addEventListener('change', updateTypeFilters); 141 | } 142 | } 143 | 144 | EventTable.prototype._updateEventCounter = function () { 145 | (this._counter).innerText = '(' + this._count + ')'; 146 | }; 147 | 148 | EventTable.prototype.clear = 
function () { 149 | (this._tableBody).innerHTML = ''; 150 | 151 | this._count = 0; 152 | this._updateEventCounter(); 153 | }; 154 | 155 | EventTable.prototype.addEvent = function (event) { 156 | var tr = (this._tableBody).firstChild; 157 | var tdAction, tdDetails, tdTarget; 158 | 159 | //check if events should be grouped together 160 | if (tr && parseInt(tr.dataset.targetNodeId, 10) === event.target.nodeId && tr.dataset.eventType === event.type) { 161 | tdAction = tr.querySelector('td:nth-child(1)'); 162 | tdDetails = tr.querySelector('td:nth-child(3)'); 163 | 164 | tr.dataset.count = parseInt(tr.dataset.count || "1", 10) + 1; 165 | 166 | tdAction.innerHTML = tr.dataset.count + ' x ' + event.type + formatDate(event.date); 167 | 168 | tdDetails.querySelector('div').innerHTML += '
' + formatEventDetails(event); 169 | 170 | this._count++; 171 | return; 172 | } 173 | 174 | tr = document.createElement('tr'); 175 | tdAction = document.createElement('td'); 176 | tdTarget = document.createElement('td'); 177 | tdDetails = document.createElement('td'); 178 | 179 | tr.dataset.targetNodeId = event.target.nodeId; 180 | tr.dataset.eventType = event.type; 181 | 182 | tr.classList.add(event.type.replace(' ', '-')); 183 | 184 | tdAction.innerHTML = event.type + formatDate(event.date); 185 | tdTarget.innerHTML = '
' + formatNode(event.target) + '
'; 186 | tdDetails.innerHTML = '
' + formatEventDetails(event) + '
'; 187 | 188 | tr.appendChild(tdAction); 189 | tr.appendChild(tdTarget); 190 | tr.appendChild(tdDetails); 191 | 192 | //check if it matches current query 193 | var query = ((this._targetFilter).value).trim(); 194 | if (!query || tdTarget.innerText.indexOf(query) > -1) { 195 | tr.classList.add('target-match'); 196 | } 197 | 198 | //insert at the top/beginning 199 | (this._tableBody).insertBefore(tr, this._tableBody.firstChild); 200 | 201 | tr.animate([ 202 | {opacity: 0}, 203 | {opacity: 1} 204 | ], 300); 205 | 206 | this._count++; 207 | this._updateEventCounter(); 208 | }; 209 | 210 | window.EventTable = EventTable; 211 | })(); 212 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | OTA-tool-kits 7 |
8 |
9 |
10 | Homepage 12 | Hugging Face 14 | Code License 16 |


17 |
18 | 19 | # Web Agent Protocol 20 | 21 | ## Overview 22 | 23 | The Web Agent Protocol (WAP) is a standardized framework designed to enable seamless interaction between users, web agents, and browsers by recording and replaying browser actions. It separates the concerns of action recording and execution, allowing for efficient automation and reusability. The Python SDK for WAP implements the full specification, making it easy to: 24 | 25 | 1. **Collect** user‑interaction data with the [OTA‑WAP Chrome extension](https://github.com/OTA-Tech-AI/webagentprotocol/tree/main/chrome-extension). 26 | 2. **Convert** the raw event stream into either **_exact‑replay_** or **_smart‑replay_** action lists. 27 | 3. **Convert** recorded actions into **_MCP_** servers for reuse by any agent or user 28 | 4. **Replay** those lists using the **_WAP-Replay_** protocol to ensure accurate browser operations. 29 | 30 | ### WAP FULL DEMO 31 | 32 | [![Watch the video](https://img.youtube.com/vi/joh9FXJfnwk/0.jpg)](https://www.youtube.com/watch?v=joh9FXJfnwk) 33 | 34 | ### Without WAP 35 | ![image](https://github.com/user-attachments/assets/843ea9da-45c0-48e9-8a25-44f5bfb31786) 36 | 37 | ### WAP Record 38 | ![image](https://github.com/user-attachments/assets/3d041f56-9e76-4b61-9b56-0686070723a3) 39 | 40 | ### WAP Replay 41 | ![image](https://github.com/user-attachments/assets/e13ca7c7-3cc1-4952-8a79-3bd1e9e98580) 42 | 43 | ## Example using WAP 44 | ![image](https://github.com/user-attachments/assets/ccb7387b-0677-498c-b4ad-a10590e37e27) 45 | 46 | ## Setup 47 | Install the dependencies with the following command: 48 | 49 | Create a conda env 50 | 51 | ```bash 52 | conda create -n WAP python=3.11 53 | ``` 54 | 55 | Activate the conda env 56 | 57 | ```bash 58 | conda activate WAP 59 | ``` 60 | 61 | Install the dependencies 62 | 63 | ```bash 64 | pip install -r requirements.txt 65 | ``` 66 | 67 | Setup your repo source path: 68 | ``` 69 | set PYTHONPATH=C:/path/to/webagentprotocol # for Windows 
70 | export PYTHONPATH=/path/to/webagentprotocol # for Linux 71 | ``` 72 | 73 | Create a **.env** file under the repo root directory with your own API keys: 74 | ``` 75 | OPENAI_API_KEY=sk-proj-... 76 | DEEPSEEK_API_KEY=sk-... 77 | ``` 78 | 79 | ## Record 80 | 81 | ### WAP record extension 82 | Please refer to [OTA‑WAP Chrome Extension](https://github.com/OTA-Tech-AI/webagentprotocol/tree/main/chrome-extension) to set up the action capturer in your Chrome browser. 83 | 84 | ### Start data‑collection server 85 | 86 | Run the following command to start the server to collect data from the extension: 87 | ```bash 88 | python action_collect_server.py 89 | ``` 90 | **Once the server is up, you can start to record from the page using the WAP Chrome extension.** 91 | 92 | The server listens on http://localhost:4934/action-data by default; please make sure the Host and Port in the extension settings match this server config. 93 | Each session will be saved to: 94 | 95 | ```bash 96 | data/YYYYMMDD/taskid/summary_event_.json 97 | ``` 98 | 99 | An example of the formatted data that you will receive in the WAP backend server looks like this: 100 | 101 | ```json 102 | { 103 | "taskId": "MkCAhQsHgXn7YgaK", 104 | "type": "click", 105 | "actionTimestamp": 1746325231479, 106 | "eventTarget": { 107 | "type": "click", 108 | "target": "\n
...", 109 | "targetId": "mntl-card-list-card--extendable_3-0", 110 | "targetClass": "comp mntl-card-list-card--extendable mntl-universal-card mntl-document-card mntl-card card card--no-image" 111 | }, 112 | "allEvents": {}, 113 | "pageHTMLContent": "
..." 114 | } 115 | ``` 116 | 117 | 118 | ## Generate replay lists 119 | 120 | | Mode | Command | 121 | | -------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 122 | | **Exact replay** – exactly reproduce every action | `python wap_replay/generate_exact_replay_list.py --data_dir_path data/<date>/<task_id> --output_dir_path data_processed/exact_replay` | 123 | | **Smart replay** – condensed goal‑oriented steps | `python wap_replay/generate_smart_replay_list.py --data_dir_path data/<date>/<task_id> --output_dir_path data_processed/smart_replay` | 124 | 125 | Replace **<task_id>** with the folder produced by the extension 126 | (e.g. em3h6UBDZykz0gnH). 127 | 128 | Output structure: 129 | ```bash 130 | data_processed/smart_replay/ 131 | ├─ subgoals_<task_id>/ # intermediate prompts & replies 132 | └─ wap_smart_replay_list_<task_id>.json # final smart replay list for the agent 133 | 134 | data_processed/exact_replay/ 135 | └─ wap_exact_replay_list_<task_id>.json # final exact replay list for the agent 136 | ``` 137 | 138 | ## Replay 139 | ```bash 140 | python run_replay.py --model-provider openai --wap_replay_list data_processed/exact_replay/wap_exact_replay_list_<task_id>.json --max-concurrent 1 141 | ``` 142 | For **smart-replay**, replace the path with a smart‑replay JSON to test this mode. 143 | 144 | ## Convert to MCP Server 145 | 146 | ```bash 147 | python wap_replay/generate_mcp_server.py --task_id <task_id> 148 | ``` 149 | 150 | Converted MCP servers will be located under the ``` mcp_servers ``` folder. 151 | 152 | ## Replay with MCP 153 | 154 | You need two terminals to replay with MCP. 
In the first terminal: 155 | ```bash 156 | python wap_service.py 157 | ``` 158 | 159 | In the second terminal: 160 | ```bash 161 | python mcp_client.py 162 | ``` 163 | 164 | Then enter your prompt in the second terminal: 165 | 166 | ```bash 167 | example: find a top rated keyboard on amazon.ca using smart replay 168 | ``` 169 | 170 | ## Replay with our Desktop App 171 | 172 | We provide an out-of-the-box desktop app for running replay lists. It is easy to install and you don't need any extra steps for setup or deployment. Visit [WAP Replay Tool releases](https://github.com/OTA-Tech-AI/web-agent-protocol/releases) for more details. 173 | 174 | WAP Replay Tool Demo GIF 175 | 176 | 177 | ## Troubleshooting 178 | 179 | **ModuleNotFoundError** – run commands from the project root or export PYTHONPATH=. (set PYTHONPATH=. for Windows). 180 | 181 | “no task‑start file” – ensure the extension recorded a full session; 182 | the generators require exactly one task-start and one task-finish record. 183 | 184 | ## Acknowledgement 185 | 186 | Browser-Use: https://github.com/browser-use/browser-use 187 | 188 | MCP: https://github.com/modelcontextprotocol/python-sdk 189 | 190 | DOM Extension: https://github.com/kdzwinel/DOMListenerExtension 191 | -------------------------------------------------------------------------------- /browser_use/agent/message_manager/tests.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from langchain_anthropic import ChatAnthropic 3 | from langchain_core.messages import AIMessage, HumanMessage, SystemMessage 4 | from langchain_openai import AzureChatOpenAI, ChatOpenAI 5 | 6 | from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings 7 | from browser_use.agent.views import ActionResult 8 | from browser_use.browser.views import BrowserState, TabInfo 9 | from browser_use.dom.views import DOMElementNode, DOMTextNode 10 | 11 | 12 | @pytest.fixture( 13 | params=[ 14 | 
ChatOpenAI(model='gpt-4o-mini'), 15 | AzureChatOpenAI(model='gpt-4o', api_version='2024-02-15-preview'), 16 | ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=100, temperature=0.0, stop=None), 17 | ], 18 | ids=['gpt-4o-mini', 'gpt-4o', 'claude-3-5-sonnet'], 19 | ) 20 | def message_manager(request: pytest.FixtureRequest): 21 | task = 'Test task' 22 | action_descriptions = 'Test actions' 23 | return MessageManager( 24 | task=task, 25 | system_message=SystemMessage(content=action_descriptions), 26 | settings=MessageManagerSettings( 27 | max_input_tokens=1000, 28 | estimated_characters_per_token=3, 29 | image_tokens=800, 30 | ), 31 | ) 32 | 33 | 34 | def test_initial_messages(message_manager: MessageManager): 35 | """Test that message manager initializes with system and task messages""" 36 | messages = message_manager.get_messages() 37 | assert len(messages) == 2 38 | assert isinstance(messages[0], SystemMessage) 39 | assert isinstance(messages[1], HumanMessage) 40 | assert 'Test task' in messages[1].content 41 | 42 | 43 | def test_add_state_message(message_manager: MessageManager): 44 | """Test adding browser state message""" 45 | state = BrowserState( 46 | url='https://test.com', 47 | title='Test Page', 48 | element_tree=DOMElementNode( 49 | tag_name='div', 50 | attributes={}, 51 | children=[], 52 | is_visible=True, 53 | parent=None, 54 | xpath='//div', 55 | ), 56 | selector_map={}, 57 | tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')], 58 | ) 59 | message_manager.add_state_message(state) 60 | 61 | messages = message_manager.get_messages() 62 | assert len(messages) == 3 63 | assert isinstance(messages[2], HumanMessage) 64 | assert 'https://test.com' in messages[2].content 65 | 66 | 67 | def test_add_state_with_memory_result(message_manager: MessageManager): 68 | """Test adding state with result that should be included in memory""" 69 | state = BrowserState( 70 | url='https://test.com', 71 | title='Test Page', 72 | 
element_tree=DOMElementNode( 73 | tag_name='div', 74 | attributes={}, 75 | children=[], 76 | is_visible=True, 77 | parent=None, 78 | xpath='//div', 79 | ), 80 | selector_map={}, 81 | tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')], 82 | ) 83 | result = ActionResult(extracted_content='Important content', include_in_memory=True) 84 | 85 | message_manager.add_state_message(state, [result]) 86 | messages = message_manager.get_messages() 87 | 88 | # Should have system, task, extracted content, and state messages 89 | assert len(messages) == 4 90 | assert 'Important content' in messages[2].content 91 | assert isinstance(messages[2], HumanMessage) 92 | assert isinstance(messages[3], HumanMessage) 93 | assert 'Important content' not in messages[3].content 94 | 95 | 96 | def test_add_state_with_non_memory_result(message_manager: MessageManager): 97 | """Test adding state with result that should not be included in memory""" 98 | state = BrowserState( 99 | url='https://test.com', 100 | title='Test Page', 101 | element_tree=DOMElementNode( 102 | tag_name='div', 103 | attributes={}, 104 | children=[], 105 | is_visible=True, 106 | parent=None, 107 | xpath='//div', 108 | ), 109 | selector_map={}, 110 | tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')], 111 | ) 112 | result = ActionResult(extracted_content='Temporary content', include_in_memory=False) 113 | 114 | message_manager.add_state_message(state, [result]) 115 | messages = message_manager.get_messages() 116 | 117 | # Should have system, task, and combined state+result message 118 | assert len(messages) == 3 119 | assert 'Temporary content' in messages[2].content 120 | assert isinstance(messages[2], HumanMessage) 121 | 122 | 123 | @pytest.mark.skip('not sure how to fix this') 124 | @pytest.mark.parametrize('max_tokens', [100000, 10000, 5000]) 125 | def test_token_overflow_handling_with_real_flow(message_manager: MessageManager, max_tokens): 126 | """Test handling of token overflow in a 
@pytest.mark.skip('not sure how to fix this')
@pytest.mark.parametrize('max_tokens', [100000, 10000, 5000])
def test_token_overflow_handling_with_real_flow(message_manager: MessageManager, max_tokens):
    """Test handling of token overflow in a realistic message flow.

    Drives the message manager through up to 200 agent steps (state message,
    then model output) under the parametrized ``max_tokens`` limit. After every
    step it checks that the token budget is respected, that the system prompt
    and task stay at the front of the history, and that the per-message token
    bookkeeping matches a fresh recount. The test ends early, without failing,
    once the manager reports that the history cannot be trimmed any further.
    """
    # Kept function-local as in the original, but imported once per test
    # instead of once per loop iteration.
    from browser_use.agent.views import AgentBrain, AgentOutput
    from browser_use.controller.registry.views import ActionModel

    # Set more realistic token limit
    message_manager.settings.max_input_tokens = max_tokens

    # Create a long sequence of interactions
    for i in range(200):  # Simulate 200 steps of interaction
        # Create state with varying content length
        state = BrowserState(
            url=f'https://test{i}.com',
            title=f'Test Page {i}',
            element_tree=DOMElementNode(
                tag_name='div',
                attributes={},
                children=[
                    DOMTextNode(
                        text=f'Content {j} ' * (10 + i),  # Increasing content length
                        is_visible=True,
                        parent=None,
                    )
                    for j in range(5)  # Multiple DOM items
                ],
                is_visible=True,
                parent=None,
                xpath='//div',
            ),
            selector_map={j: f'//div[{j}]' for j in range(5)},
            tabs=[TabInfo(page_id=1, url=f'https://test{i}.com', title=f'Test Page {i}')],
        )

        # Alternate between different types of results
        result = None
        if i % 2 == 0:  # Every other iteration
            result = ActionResult(
                extracted_content=f'Important content from step {i}' * 5,
                include_in_memory=i % 4 == 0,  # Include in memory every 4th step
            )

        # Add state message
        if result:
            message_manager.add_state_message(state, [result])
        else:
            message_manager.add_state_message(state)

        try:
            messages = message_manager.get_messages()
        except ValueError as e:
            if 'Max token limit reached - history is too long' in str(e):
                return  # History cannot shrink further: end the test
            raise

        # Verify token limit is respected (with a small tolerance)
        assert message_manager.state.history.current_tokens <= message_manager.settings.max_input_tokens + 100

        last_msg = messages[-1]
        assert isinstance(last_msg, HumanMessage)

        if i % 4 == 0:
            # The in-memory result was stored as its own message before the state
            assert isinstance(message_manager.state.history.messages[-2].message, HumanMessage)
        if i % 2 == 0 and i % 4 != 0:
            # Content may be a plain string or a list of content parts
            if isinstance(last_msg.content, list):
                assert 'Current url: https://test' in last_msg.content[0]['text']
            else:
                assert 'Current url: https://test' in last_msg.content

        # Add model output every time
        output = AgentOutput(
            current_state=AgentBrain(
                evaluation_previous_goal=f'Success in step {i}',
                memory=f'Memory from step {i}',
                next_goal=f'Goal for step {i + 1}',
            ),
            action=[ActionModel()],
        )
        message_manager._remove_last_state_message()
        message_manager.add_model_output(output)

        # Get messages and verify after each addition
        messages = [m.message for m in message_manager.state.history.messages]

        # Verify essential messages are preserved
        assert isinstance(messages[0], SystemMessage)  # System prompt always first
        assert isinstance(messages[1], HumanMessage)  # Task always second
        assert 'Test task' in messages[1].content

        # Verify structure of latest messages
        assert isinstance(messages[-1], AIMessage)  # Last message should be model output
        assert f'step {i}' in messages[-1].content  # Should contain current step info

        # Go through all messages and verify that the stored per-message token
        # counts and the running total agree with a fresh recount
        total_tokens = 0
        real_tokens = []
        stored_tokens = []
        for msg in message_manager.state.history.messages:
            total_tokens += msg.metadata.tokens
            stored_tokens.append(msg.metadata.tokens)
            real_tokens.append(message_manager._count_tokens(msg.message))
        assert total_tokens == sum(real_tokens)
        assert stored_tokens == real_tokens
        assert message_manager.state.history.current_tokens == total_tokens
class Registry(Generic[Context]):
    """Service for registering and managing actions.

    Callables are registered with the :meth:`action` decorator, validated
    through a Pydantic parameter model, and later run by name with
    :meth:`execute_action`, which injects runtime dependencies (browser,
    LLM, file paths, user context) that the callable declares by name.
    """

    # Parameters that ``execute_action`` injects by name. They are supplied by
    # the runtime, not by the model, so they must never become fields of an
    # auto-generated parameter model (otherwise the same keyword would be
    # passed twice when the action is called).
    _INJECTED_PARAM_NAMES = ('browser', 'page_extraction_llm', 'available_file_paths', 'context')

    def __init__(self, exclude_actions: list[str] | None = None):
        """Create an empty registry.

        Args:
            exclude_actions: Names of functions that the ``action`` decorator
                should leave unregistered. ``None`` means exclude nothing.
        """
        self.registry = ActionRegistry()
        self.telemetry = ProductTelemetry()
        self.exclude_actions = exclude_actions if exclude_actions is not None else []

    @time_execution_sync('--create_param_model')
    def _create_param_model(self, function: Callable) -> Type[BaseModel]:
        """Create a Pydantic model from a function signature.

        Every parameter except the runtime-injected ones becomes a field.
        Parameters without a default are required; unannotated parameters
        are typed as ``Any``.
        """
        params = {}
        for name, param in signature(function).parameters.items():
            if name in self._INJECTED_PARAM_NAMES:
                continue
            # ``empty`` is a sentinel: compare by identity so defaults with a
            # custom ``__eq__`` are never invoked.
            annotation = Any if param.annotation is param.empty else param.annotation
            default = ... if param.default is param.empty else param.default
            params[name] = (annotation, default)
        # TODO: make the types here work
        return create_model(
            f'{function.__name__}_parameters',
            __base__=ActionModel,
            **params,  # type: ignore
        )

    def action(
        self,
        description: str,
        param_model: Optional[Type[BaseModel]] = None,
        domains: Optional[list[str]] = None,
        page_filter: Optional[Callable[[Any], bool]] = None,
    ):
        """Decorator for registering actions.

        Args:
            description: Text stored with the action and used as the field
                description in the generated action model.
            param_model: Pydantic model validating the action parameters.
                Derived from the function signature when omitted.
            domains: Optional domain restriction stored with the action.
            page_filter: Optional predicate stored with the action.

        Returns:
            A decorator that registers the function and returns it unchanged.
        """

        def decorator(func: Callable):
            # Skip registration if action is in exclude_actions
            if func.__name__ in self.exclude_actions:
                return func

            # Create param model from function if not provided
            actual_param_model = param_model or self._create_param_model(func)

            if iscoroutinefunction(func):
                wrapped_func = func
            else:
                # Wrap sync functions so every registered action is awaitable;
                # the sync body runs in a worker thread.
                async def async_wrapper(*args, **kwargs):
                    return await asyncio.to_thread(func, *args, **kwargs)

                # Copy the signature and other metadata from the original
                # function: execute_action inspects the signature to decide
                # which dependencies to inject.
                async_wrapper.__signature__ = signature(func)
                async_wrapper.__name__ = func.__name__
                async_wrapper.__annotations__ = func.__annotations__
                async_wrapper.__doc__ = func.__doc__
                wrapped_func = async_wrapper

            self.registry.actions[func.__name__] = RegisteredAction(
                name=func.__name__,
                description=description,
                function=wrapped_func,
                param_model=actual_param_model,
                domains=domains,
                page_filter=page_filter,
            )
            return func

        return decorator

    @staticmethod
    def _first_param_is_model(parameters: list) -> bool:
        """Return True when the first parameter is annotated with a BaseModel subclass."""
        if not parameters:
            return False
        try:
            return issubclass(parameters[0].annotation, BaseModel)
        except TypeError:
            # Annotation is not a class (e.g. Optional[str], list[str], a string).
            return False

    @time_execution_async('--execute_action')
    async def execute_action(
        self,
        action_name: str,
        params: dict,
        browser: Optional[BrowserContext] = None,
        page_extraction_llm: Optional[BaseChatModel] = None,
        sensitive_data: Optional[Dict[str, str]] = None,
        available_file_paths: Optional[list[str]] = None,
        #
        context: Context | None = None,
    ) -> Any:
        """Execute a registered action.

        Args:
            action_name: Name the action was registered under.
            params: Raw parameters, validated against the action's param model.
            browser: Injected when the action declares a ``browser`` parameter.
            page_extraction_llm: Injected when the action declares a
                ``page_extraction_llm`` parameter.
            sensitive_data: Mapping of placeholder name to secret value;
                placeholders in the validated params are replaced before the call.
            available_file_paths: Injected when the action declares an
                ``available_file_paths`` parameter.
            context: Injected when the action declares a ``context`` parameter.

        Returns:
            Whatever the action function returns.

        Raises:
            ValueError: If no action named ``action_name`` is registered.
            RuntimeError: If validation, a missing dependency, or the action
                itself fails; the original exception is chained as the cause.
        """
        if action_name not in self.registry.actions:
            raise ValueError(f'Action {action_name} not found')

        action = self.registry.actions[action_name]
        try:
            # Create the validated Pydantic model
            validated_params = action.param_model(**params)

            # Check if the first parameter is a Pydantic model
            parameters = list(signature(action.function).parameters.values())
            parameter_names = [param.name for param in parameters]
            is_pydantic = self._first_param_is_model(parameters)

            if sensitive_data:
                validated_params = self._replace_sensitive_data(validated_params, sensitive_data)

            # Dependencies the action may request by parameter name, in the
            # order in which a missing one is reported.
            injectable = {
                'browser': browser,
                'page_extraction_llm': page_extraction_llm,
                'available_file_paths': available_file_paths,
                'context': context,
            }
            extra_args = {}
            for name, value in injectable.items():
                if name not in parameter_names:
                    continue
                # An empty path list is as unusable as a missing one; the other
                # dependencies count as missing only when they are None, so a
                # falsy-but-valid context is still accepted.
                missing = not value if name == 'available_file_paths' else value is None
                if missing:
                    raise ValueError(f'Action {action_name} requires {name} but none provided.')
                extra_args[name] = value

            if action_name == 'input_text' and sensitive_data:
                extra_args['has_sensitive_data'] = True

            if is_pydantic:
                return await action.function(validated_params, **extra_args)
            return await action.function(**validated_params.model_dump(), **extra_args)

        except Exception as e:
            raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e

    def _replace_sensitive_data(self, params: BaseModel, sensitive_data: Dict[str, str]) -> BaseModel:
        """Replace secret placeholders in the params with their real values.

        Any string (also inside nested dicts and lists) containing
        ``<secret>name</secret>`` has that placeholder replaced by
        ``sensitive_data[name]``. Placeholders whose name is not in
        ``sensitive_data`` are left untouched. ``params`` is modified in
        place and returned.
        """
        import re

        secret_pattern = re.compile(r'<secret>(.*?)</secret>')

        def substitute(match):
            # Unknown placeholder names keep their original text.
            return sensitive_data.get(match.group(1), match.group(0))

        def replace_secrets(value):
            if isinstance(value, str):
                return secret_pattern.sub(substitute, value)
            if isinstance(value, dict):
                return {k: replace_secrets(v) for k, v in value.items()}
            if isinstance(value, list):
                return [replace_secrets(v) for v in value]
            return value

        for key, value in params.model_dump().items():
            # Written straight into __dict__ so no assignment hooks run.
            params.__dict__[key] = replace_secrets(value)
        return params

    @time_execution_sync('--create_action_model')
    def create_action_model(self, include_actions: Optional[list[str]] = None, page=None) -> Type[ActionModel]:
        """Create a Pydantic model from registered actions.

        Used by LLM APIs that support tool calling & enforce a schema. Each
        selected action becomes an optional field typed with its param model.

        Args:
            include_actions: When given, only these action names are considered.
            page: When None, only actions without ``domains`` and without
                ``page_filter`` are included. Otherwise an action is included
                only if both its domain and page-filter checks pass.
        """
        available_actions = {}
        for name, action in self.registry.actions.items():
            if include_actions is not None and name not in include_actions:
                continue

            # If no page provided, only include actions with no filters
            if page is None:
                if action.page_filter is None and action.domains is None:
                    available_actions[name] = action
                continue

            domain_is_allowed = self.registry._match_domains(action.domains, page.url)
            page_is_allowed = self.registry._match_page_filter(action.page_filter, page)

            # Include action only if both checks pass
            if domain_is_allowed and page_is_allowed:
                available_actions[name] = action

        fields = {
            name: (
                Optional[action.param_model],
                Field(default=None, description=action.description),
            )
            for name, action in available_actions.items()
        }

        self.telemetry.capture(
            ControllerRegisteredFunctionsTelemetryEvent(
                registered_functions=[
                    RegisteredFunction(name=name, params=action.param_model.model_json_schema())
                    for name, action in available_actions.items()
                ]
            )
        )

        return create_model('ActionModel', __base__=ActionModel, **fields)  # type:ignore

    def get_prompt_description(self, page=None) -> str:
        """Get a description of all actions for the prompt.

        Delegates to the underlying action registry, forwarding ``page`` so
        the registry can limit the description to actions available for it.
        """
        return self.registry.get_prompt_description(page=page)
7 | */ 8 | 9 | html { 10 | font-family: sans-serif; /* 1 */ 11 | -ms-text-size-adjust: 100%; /* 2 */ 12 | -webkit-text-size-adjust: 100%; /* 2 */ 13 | } 14 | 15 | /** 16 | * Remove default margin. 17 | */ 18 | 19 | body { 20 | margin: 0; 21 | } 22 | 23 | /* HTML5 display definitions 24 | ========================================================================== */ 25 | 26 | /** 27 | * Correct `block` display not defined for any HTML5 element in IE 8/9. 28 | * Correct `block` display not defined for `details` or `summary` in IE 10/11 29 | * and Firefox. 30 | * Correct `block` display not defined for `main` in IE 11. 31 | */ 32 | 33 | article, 34 | aside, 35 | details, 36 | figcaption, 37 | figure, 38 | footer, 39 | header, 40 | hgroup, 41 | main, 42 | menu, 43 | nav, 44 | section, 45 | summary { 46 | display: block; 47 | } 48 | 49 | /** 50 | * 1. Correct `inline-block` display not defined in IE 8/9. 51 | * 2. Normalize vertical alignment of `progress` in Chrome, Firefox, and Opera. 52 | */ 53 | 54 | audio, 55 | canvas, 56 | progress, 57 | video { 58 | display: inline-block; /* 1 */ 59 | vertical-align: baseline; /* 2 */ 60 | } 61 | 62 | /** 63 | * Prevent modern browsers from displaying `audio` without controls. 64 | * Remove excess height in iOS 5 devices. 65 | */ 66 | 67 | audio:not([controls]) { 68 | display: none; 69 | height: 0; 70 | } 71 | 72 | /** 73 | * Address `[hidden]` styling not present in IE 8/9/10. 74 | * Hide the `template` element in IE 8/9/11, Safari, and Firefox < 22. 75 | */ 76 | 77 | [hidden], 78 | template { 79 | display: none; 80 | } 81 | 82 | /* Links 83 | ========================================================================== */ 84 | 85 | /** 86 | * Remove the gray background color from active links in IE 10. 87 | */ 88 | 89 | a { 90 | background-color: transparent; 91 | } 92 | 93 | /** 94 | * Improve readability when focused and also mouse hovered in all browsers. 
95 | */ 96 | 97 | a:active, 98 | a:hover { 99 | outline: 0; 100 | } 101 | 102 | /* Text-level semantics 103 | ========================================================================== */ 104 | 105 | /** 106 | * Address styling not present in IE 8/9/10/11, Safari, and Chrome. 107 | */ 108 | 109 | abbr[title] { 110 | border-bottom: 1px dotted; 111 | } 112 | 113 | /** 114 | * Address style set to `bolder` in Firefox 4+, Safari, and Chrome. 115 | */ 116 | 117 | b, 118 | strong { 119 | font-weight: bold; 120 | } 121 | 122 | /** 123 | * Address styling not present in Safari and Chrome. 124 | */ 125 | 126 | dfn { 127 | font-style: italic; 128 | } 129 | 130 | /** 131 | * Address variable `h1` font-size and margin within `section` and `article` 132 | * contexts in Firefox 4+, Safari, and Chrome. 133 | */ 134 | 135 | h1 { 136 | font-size: 2em; 137 | margin: 0.67em 0; 138 | } 139 | 140 | /** 141 | * Address styling not present in IE 8/9. 142 | */ 143 | 144 | mark { 145 | background: #ff0; 146 | color: #000; 147 | } 148 | 149 | /** 150 | * Address inconsistent and variable font size in all browsers. 151 | */ 152 | 153 | small { 154 | font-size: 80%; 155 | } 156 | 157 | /** 158 | * Prevent `sub` and `sup` affecting `line-height` in all browsers. 159 | */ 160 | 161 | sub, 162 | sup { 163 | font-size: 75%; 164 | line-height: 0; 165 | position: relative; 166 | vertical-align: baseline; 167 | } 168 | 169 | sup { 170 | top: -0.5em; 171 | } 172 | 173 | sub { 174 | bottom: -0.25em; 175 | } 176 | 177 | /* Embedded content 178 | ========================================================================== */ 179 | 180 | /** 181 | * Remove border when inside `a` element in IE 8/9/10. 182 | */ 183 | 184 | img { 185 | border: 0; 186 | } 187 | 188 | /** 189 | * Correct overflow not hidden in IE 9/10/11. 
190 | */ 191 | 192 | svg:not(:root) { 193 | overflow: hidden; 194 | } 195 | 196 | /* Grouping content 197 | ========================================================================== */ 198 | 199 | /** 200 | * Address margin not present in IE 8/9 and Safari. 201 | */ 202 | 203 | figure { 204 | margin: 1em 40px; 205 | } 206 | 207 | /** 208 | * Address differences between Firefox and other browsers. 209 | */ 210 | 211 | hr { 212 | -moz-box-sizing: content-box; 213 | box-sizing: content-box; 214 | height: 0; 215 | } 216 | 217 | /** 218 | * Contain overflow in all browsers. 219 | */ 220 | 221 | pre { 222 | overflow: auto; 223 | } 224 | 225 | /** 226 | * Address odd `em`-unit font size rendering in all browsers. 227 | */ 228 | 229 | code, 230 | kbd, 231 | pre, 232 | samp { 233 | font-family: monospace, monospace; 234 | font-size: 1em; 235 | } 236 | 237 | /* Forms 238 | ========================================================================== */ 239 | 240 | /** 241 | * Known limitation: by default, Chrome and Safari on OS X allow very limited 242 | * styling of `select`, unless a `border` property is set. 243 | */ 244 | 245 | /** 246 | * 1. Correct color not being inherited. 247 | * Known issue: affects color of disabled elements. 248 | * 2. Correct font properties not being inherited. 249 | * 3. Address margins set differently in Firefox 4+, Safari, and Chrome. 250 | */ 251 | 252 | button, 253 | input, 254 | optgroup, 255 | select, 256 | textarea { 257 | color: inherit; /* 1 */ 258 | font: inherit; /* 2 */ 259 | margin: 0; /* 3 */ 260 | } 261 | 262 | /** 263 | * Address `overflow` set to `hidden` in IE 8/9/10/11. 264 | */ 265 | 266 | button { 267 | overflow: visible; 268 | } 269 | 270 | /** 271 | * Address inconsistent `text-transform` inheritance for `button` and `select`. 272 | * All other form control elements do not inherit `text-transform` values. 273 | * Correct `button` style inheritance in Firefox, IE 8/9/10/11, and Opera. 
274 | * Correct `select` style inheritance in Firefox. 275 | */ 276 | 277 | button, 278 | select { 279 | text-transform: none; 280 | } 281 | 282 | /** 283 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 284 | * and `video` controls. 285 | * 2. Correct inability to style clickable `input` types in iOS. 286 | * 3. Improve usability and consistency of cursor style between image-type 287 | * `input` and others. 288 | */ 289 | 290 | button, 291 | html input[type="button"], /* 1 */ 292 | input[type="reset"], 293 | input[type="submit"] { 294 | -webkit-appearance: button; /* 2 */ 295 | cursor: pointer; /* 3 */ 296 | } 297 | 298 | /** 299 | * Re-set default cursor for disabled elements. 300 | */ 301 | 302 | button[disabled], 303 | html input[disabled] { 304 | cursor: default; 305 | } 306 | 307 | /** 308 | * Remove inner padding and border in Firefox 4+. 309 | */ 310 | 311 | button::-moz-focus-inner, 312 | input::-moz-focus-inner { 313 | border: 0; 314 | padding: 0; 315 | } 316 | 317 | /** 318 | * Address Firefox 4+ setting `line-height` on `input` using `!important` in 319 | * the UA stylesheet. 320 | */ 321 | 322 | input { 323 | line-height: normal; 324 | } 325 | 326 | /** 327 | * It's recommended that you don't attempt to style these elements. 328 | * Firefox's implementation doesn't respect box-sizing, padding, or width. 329 | * 330 | * 1. Address box sizing set to `content-box` in IE 8/9/10. 331 | * 2. Remove excess padding in IE 8/9/10. 332 | */ 333 | 334 | input[type="checkbox"], 335 | input[type="radio"] { 336 | box-sizing: border-box; /* 1 */ 337 | padding: 0; /* 2 */ 338 | } 339 | 340 | /** 341 | * Fix the cursor style for Chrome's increment/decrement buttons. For certain 342 | * `font-size` values of the `input`, it causes the cursor style of the 343 | * decrement button to change from `default` to `text`. 
344 | */ 345 | 346 | input[type="number"]::-webkit-inner-spin-button, 347 | input[type="number"]::-webkit-outer-spin-button { 348 | height: auto; 349 | } 350 | 351 | /** 352 | * 1. Address `appearance` set to `searchfield` in Safari and Chrome. 353 | * 2. Address `box-sizing` set to `border-box` in Safari and Chrome 354 | * (include `-moz` to future-proof). 355 | */ 356 | 357 | input[type="search"] { 358 | -webkit-appearance: textfield; /* 1 */ 359 | -moz-box-sizing: content-box; 360 | -webkit-box-sizing: content-box; /* 2 */ 361 | box-sizing: content-box; 362 | } 363 | 364 | /** 365 | * Remove inner padding and search cancel button in Safari and Chrome on OS X. 366 | * Safari (but not Chrome) clips the cancel button when the search input has 367 | * padding (and `textfield` appearance). 368 | */ 369 | 370 | input[type="search"]::-webkit-search-cancel-button, 371 | input[type="search"]::-webkit-search-decoration { 372 | -webkit-appearance: none; 373 | } 374 | 375 | /** 376 | * Define consistent border, margin, and padding. 377 | */ 378 | 379 | fieldset { 380 | border: 1px solid #c0c0c0; 381 | margin: 0 2px; 382 | padding: 0.35em 0.625em 0.75em; 383 | } 384 | 385 | /** 386 | * 1. Correct `color` not being inherited in IE 8/9/10/11. 387 | * 2. Remove padding so people aren't caught out if they zero out fieldsets. 388 | */ 389 | 390 | legend { 391 | border: 0; /* 1 */ 392 | padding: 0; /* 2 */ 393 | } 394 | 395 | /** 396 | * Remove default vertical scrollbar in IE 8/9/10/11. 397 | */ 398 | 399 | textarea { 400 | overflow: auto; 401 | } 402 | 403 | /** 404 | * Don't inherit the `font-weight` (applied by a rule above). 405 | * NOTE: the default cannot safely be changed in Chrome and Safari on OS X. 406 | */ 407 | 408 | optgroup { 409 | font-weight: bold; 410 | } 411 | 412 | /* Tables 413 | ========================================================================== */ 414 | 415 | /** 416 | * Remove most spacing between table cells. 
417 | */ 418 | 419 | table { 420 | border-collapse: collapse; 421 | border-spacing: 0; 422 | } 423 | 424 | td, 425 | th { 426 | padding: 0; 427 | } --------------------------------------------------------------------------------