├── .DS_Store
├── .gitignore
├── LICENSE
├── Mind2web-live_Leaderboard.md
├── README.md
├── agent
│   ├── Environment
│   │   ├── __init__.py
│   │   └── html_env
│   │       ├── __init__.py
│   │       ├── actions.py
│   │       ├── active_elements.py
│   │       ├── async_env.py
│   │       ├── build_tree.py
│   │       ├── google_search.js
│   │       └── utils.py
│   ├── LLM
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── claude.py
│   │   ├── gemini.py
│   │   ├── llm_instance.py
│   │   ├── openai.py
│   │   ├── togetherai.py
│   │   ├── token_calculator.py
│   │   └── token_utils.py
│   ├── Memory
│   │   ├── __init__.py
│   │   ├── base_trace.py
│   │   ├── long_memory
│   │   │   ├── __init__.py
│   │   │   ├── reference_trace.py
│   │   │   └── website_knowledge.py
│   │   ├── retriever.py
│   │   └── short_memory
│   │       ├── __init__.py
│   │       └── history.py
│   ├── Plan
│   │   ├── __init__.py
│   │   ├── action.py
│   │   └── planning.py
│   ├── Prompt
│   │   ├── __init__.py
│   │   ├── base_prompts.py
│   │   ├── dom_vision_disc_prompts.py
│   │   ├── dom_vision_prompts.py
│   │   ├── prompt_constructor.py
│   │   ├── vision_prompts.py
│   │   └── vision_to_dom_prompts.py
│   ├── Reward
│   │   ├── __init__.py
│   │   └── global_reward.py
│   ├── Tool
│   │   ├── __init__.py
│   │   └── base_tools.py
│   └── Utils
│       ├── __init__.py
│       └── utils.py
├── configs
│   └── setting.toml
├── data
│   ├── dataset_io.py
│   ├── example
│   │   ├── .DS_Store
│   │   ├── example_130.json
│   │   └── mind2web-live_test_20241024.json
│   └── raw_data_processor.py
├── evaluate.py
├── evaluate
│   ├── __init__.py
│   ├── evaluate_utils.py
│   ├── step_score.py
│   ├── step_score_js.py
│   └── task_score.py
├── experiment_results.py
├── logs.py
├── requirements.txt
├── scripts
│   └── run_evaluation.sh
└── src
    ├── .DS_Store
    └── main_figure.png
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.vscode/

**/__pycache__

**/*.json

result_logs/

results/

LOGS/

batch_tasks_results/*

data/*
!data/example/
!data/example/example_130.json
!data/example/mind2web-live_test_20241024.json
!data/human_labeled_reward_reference/*
!data/dataset_io.py
!data/raw_data_processor.py

test.py
test_dom_tree.py

agent/Environment/html_env/js_event/

node_modules/
package-lock.json
package.json
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 iMeanAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Mind2web-live_Leaderboard.md:
--------------------------------------------------------------------------------
# Leaderboard

| Agent           | Model               | Completion rate | Task success rate |
| --------------- | ------------------- | --------------- | ----------------- |
| **★SeeAct-V**   | **GPT-4o**          | **50.8%**       | **19.2%**         |
| **★SeeAct-V**   | **GPT-4**           | **50.7%**       | **23.1%**         |
| **★AGUVIS**     | **AGUVIS-72B**      | **/**           | **27.1%**         |
| **★WebDreamer** | **GPT-4o**          | **49.9%**       | **25.0%**         |
| **★WebCanvas**  | **GPT-4**           | **48.8%**       | **23.1%**         |
| **★WebCanvas**  | **Claude-3-Sonnet** | **47.9%**       | **22.1%**         |
| **★WebCanvas**  | **GPT-4o**          | **47.6%**       | **22.1%**         |
| **★WebCanvas**  | **GPT-4-turbo**     | **44.3%**       | **21.1%**         |

### Explanation

- **★** indicates that the model or code has been open-sourced and verified.
- **Completion rate**: the average proportion of annotated key nodes (intermediate steps) that the agent completes per task.
- **Task success rate**: the proportion of tasks the agent completes end to end, i.e., tasks in which every key node is reached.

### References

[1] [SeeAct-V](https://arxiv.org/abs/2410.05243) **★**

[2] [AGUVIS](https://arxiv.org/pdf/2412.04454) **★**

[3] [WebDreamer](https://arxiv.org/pdf/2411.06559) **★**

[4] [WebCanvas](https://arxiv.org/abs/2406.12373) **★**
--------------------------------------------------------------------------------
/agent/Environment/__init__.py:
--------------------------------------------------------------------------------
from .html_env import *
--------------------------------------------------------------------------------
/agent/Environment/html_env/__init__.py:
--------------------------------------------------------------------------------
from .utils import *
from .build_tree import *
from .active_elements import *
from .actions import *
from .async_env import *
--------------------------------------------------------------------------------
/agent/Environment/html_env/actions.py:
--------------------------------------------------------------------------------
from beartype import beartype
from typing import TypedDict
from enum import IntEnum


class Action(TypedDict):
    action_type: int
    element_id: int
    element_name: str
    url: str
    fill_text: str


class ActionTypes(IntEnum):
    NONE = 0
    CLICK = 1
    GOTO = 2
    GOOGLE_SEARCH = 3
    FILL_FORM = 4
    SWITCH_TAB = 5
    GO_BACK = 6
    FILL_SEARCH = 7
    SELECT_OPTION = 8
    HOVER = 9
    SCROLL_DOWN = 10
    SCROLL_UP = 11
    CACHE_DATA = 12
    GET_FINAL_ANSWER = 13


@beartype
def create_cache_data_action(elementid: int, fill_text: str) -> Action:
    return {
        "action_type": ActionTypes.CACHE_DATA,
        "element_id": elementid,
        "url": "",
        "fill_text": fill_text,
        "element_name": ""
    }


@beartype
def create_get_final_answer(elementid: int, fill_text: str) -> Action:
    return {
        "action_type": ActionTypes.GET_FINAL_ANSWER,
        "element_id": elementid,
        "url": "",
        "fill_text": fill_text,
        "element_name": ""
    }


@beartype
def create_click_action(elementid: int) -> Action:
    return {
        "action_type": ActionTypes.CLICK,
        "element_id": elementid,
        "url": "",
        "fill_text": "",
        "element_name": ""
    }


@beartype
def create_goto_action(elementid: int, url: str) -> Action:
    return {
        "action_type": ActionTypes.GOTO,
        "element_id": elementid,
        "url": url,
        "fill_text": "",
        "element_name": ""
    }


@beartype
def create_none_action(elementid: int) -> Action:
    return {
        "action_type": ActionTypes.NONE,
        "element_id": elementid,
        "url": "",
        "fill_text": "",
        "element_name": ""
    }


@beartype
def create_fill_action(elementid: int, fill_text: str) -> Action:
    return {
        "action_type": ActionTypes.FILL_FORM,
        "element_id": elementid,
        "url": "",
        "fill_text": fill_text,
        "element_name": ""
    }


@beartype
def create_fill_search_action(elementid: int, fill_text: str) -> Action:
    return {
        "action_type": ActionTypes.FILL_SEARCH,
        "element_id": elementid,
        "url": "",
        "fill_text": fill_text,
        "element_name": ""
    }


@beartype
def create_search_action(elementid: int, text: str) -> Action:
    return {
        "action_type": ActionTypes.GOOGLE_SEARCH,
        "element_id": elementid,
        "url": "https://www.google.com",
        "fill_text": text,
        "element_name": ""
    }


@beartype
def create_go_back_action(elementid: int) -> Action:
    return {
        "action_type": ActionTypes.GO_BACK,
        "element_id": elementid,
        "url": "",
        "fill_text": "",
        "element_name": ""
    }


@beartype
def create_select_option_action(elementid: int, target_value: str) -> Action:
    return {
        "action_type": ActionTypes.SELECT_OPTION,
        "element_id": elementid,
        "url": "",
        "fill_text": target_value,
        "element_name": ""
    }


@beartype
def create_hover_action(elementid: int) -> Action:
    return {
        "action_type": ActionTypes.HOVER,
        "element_id": elementid,
        "url": "",
        "fill_text": "",
        "element_name": ""
    }


@beartype
def create_scroll_down_action(elementid: int) -> Action:
    return {
        "action_type": ActionTypes.SCROLL_DOWN,
        "element_id": elementid,
        "url": "",
        "fill_text": "",
        "element_name": ""
    }


@beartype
def create_scroll_up_action(elementid: int) -> Action:
    return {
        "action_type": ActionTypes.SCROLL_UP,
        "element_id": elementid,
        "url": "",
        "fill_text": "",
        "element_name": ""
    }


@beartype
def create_action(elementid: int, action_type: str, action_input: str) -> Action:
    if action_type == "click":
        return create_click_action(elementid=elementid)
    elif action_type == "fill_form":
        return create_fill_action(elementid=elementid, fill_text=action_input)
    elif action_type == "fill_search":
        return create_fill_search_action(elementid=elementid, fill_text=action_input)
    elif action_type == "goto":
        return create_goto_action(elementid=elementid, url=action_input)
    elif action_type == "google_search":
        return create_search_action(elementid=elementid, text=action_input)
    elif action_type == "go_back":
        return create_go_back_action(elementid=elementid)
    elif action_type == "select_option":
        return create_select_option_action(elementid=elementid, target_value=action_input)
    elif action_type == "hover":
        return create_hover_action(elementid=elementid)
    elif action_type == "scroll_down":
        return create_scroll_down_action(elementid=elementid)
    elif action_type == "scroll_up":
        return create_scroll_up_action(elementid=elementid)
    elif action_type == "cache_storage":
        return create_cache_data_action(elementid=elementid, fill_text=action_input)
    elif action_type == "get_final_answer":
        return create_get_final_answer(elementid=elementid, fill_text=action_input)
    else:
        return create_none_action(elementid=elementid)


__all__ = [
    "Action",
    "ActionTypes",
    "create_click_action",
    "create_fill_action",
    "create_none_action",
    "create_goto_action",
    "create_search_action",
    "create_go_back_action",
    "create_fill_search_action",
    "create_select_option_action",
    "create_hover_action",
    "create_scroll_down_action",
    "create_scroll_up_action",
    "create_cache_data_action",
    "create_get_final_answer",
    "create_action"
]
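# Usage sketch (editorial addition, not part of the original module): every
# factory above returns the same Action TypedDict, so a planner can map an
# LLM's string action type straight onto an executable action. The element id
# and text below are made up.
#
#     action = create_action(elementid=3, action_type="fill_search", action_input="best laptops 2024")
#     assert action["action_type"] == ActionTypes.FILL_SEARCH
#     assert action["fill_text"] == "best laptops 2024"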
--------------------------------------------------------------------------------
/agent/Environment/html_env/active_elements.py:
--------------------------------------------------------------------------------
from .utils import ElementNode, ConditionTagNameList, TypeList
import re


class ActiveElements:
    @staticmethod
    def is_visiable(element: ElementNode, only_child_check=True):
        style = element["attributes"].get('style')
        if style and ('display: none' in style or 'opacity: 0' in style):
            return False
        aria_hidden = element["attributes"].get('aria-hidden')
        if aria_hidden == 'true':
            return False
        if only_child_check:
            visibility = element["attributes"].get('style')
            if visibility and ('visibility: hidden' in visibility or 'visibility: collapse' in visibility):
                return False
        rect = element["attributes"].get('rect')
        if rect and (rect['width'] == 0 or rect['height'] == 0):
            return False
        return True

    @staticmethod
    def is_interactive(element: ElementNode):
        if element is None:
            return False
        tag = ActiveElements.get_element_tagName(element)
        if tag == 'input' and element["attributes"].get('type') == 'hidden':
            return False
        if tag in ['select', 'option'] and element["attributes"].get('disabled'):
            return False
        if tag in ['input', 'textarea', 'button', 'a'] and element["attributes"].get('disabled'):
            return False
        return True

    @staticmethod
    def get_element_tagName(element: ElementNode) -> str:
        tag_name = element["tagName"].lower()
        if tag_name == 'input':
            input_type = element["attributes"].get('type')
            if input_type == 'checkbox':
                return 'checkbox'
            elif input_type == 'radio':
                return 'radio'
            elif input_type == 'button':
                return 'button'
            else:
                return 'input'
        elif tag_name == 'select':
            return 'select'
        elif tag_name == 'optgroup':
            return 'optgroup'
        elif tag_name == 'textarea':
            return 'textarea'
        elif tag_name == 'option':
            return 'option'
        elif tag_name == 'datalist':
            return 'datalist'
        # elif tag_name == 'label':
        #     return 'label'
        elif tag_name == 'button':
            return 'button'
        elif tag_name == 'a':
            return 'link'
        elif tag_name in ConditionTagNameList:
            role = element["attributes"].get('role')
            if not role:
                return 'unknown'
            elif role == 'button':
                return 'button'
            elif role == 'link':
                return 'link'
            elif role == 'menuitem':
                return 'link'
            elif role == 'textbox':
                return 'input'
            elif role == 'checkbox':
                return 'checkbox'
            elif role == 'radio':
                return 'radio'
            elif role == 'tab':
                return 'link'
            elif role == 'switch':
                return 'switch'
            elif role == 'option':
                return 'option'
            elif role == 'row':
                return 'row'
            elif role == 'search-box':
                return 'search-box'
            else:
                return 'unknown'
        else:
            return 'unknown'

    @staticmethod
    def is_valid_element(element: ElementNode) -> bool:
        return ActiveElements.is_interactive(element) and ActiveElements.is_visiable(element)

    @staticmethod
    def get_element_value(element: ElementNode) -> str:
        if element["text"] and element["text"] != "":
            return element["text"]
        title = element['attributes'].get('title')
        if title:
            return title
        placeholder = element['attributes'].get('placeholder')
        if placeholder:
            return placeholder
        aria_label = element['attributes'].get('aria-label')
        if aria_label:
            return aria_label
        aria_checked = element['attributes'].get('aria-checked')
        if aria_checked:
            return aria_checked
        element_type = element["attributes"].get('type')
        if element_type in TypeList:
            return element_type
        if element["tagName"] == "select":
            return "Select an option value"
        return ""
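# Usage sketch (editorial addition): classifying a minimal element dict. Only
# "tagName" and "attributes" are consulted here; the node below is made up.
#
#     node = {"tagName": "input", "attributes": {"type": "radio"}}
#     ActiveElements.get_element_tagName(node)  # -> 'radio'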
--------------------------------------------------------------------------------
/agent/Environment/html_env/build_tree.py:
--------------------------------------------------------------------------------
from collections import deque
from lxml.html import etree
from io import StringIO

from .utils import ElementNode, TagNameList, MapTagNameList, stringfy_selector
from .active_elements import ActiveElements


import copy


class HTMLTree:
    def __init__(self):
        self.elementNodes = [ElementNode] * 100000
        self.rawNode2id: dict = {}
        self.element2id: dict = {}
        self.id2rawNode: dict = {}
        self.valid: list[bool] = [False] * 100000
        self.nodeCounts: int
        self.nodeDict = {}
        self.element_value = {}

    def fetch_html_content(self, html_content) -> str:
        self.__init__()
        parser = etree.HTMLParser()
        self.tree = etree.parse(StringIO(html_content), parser)
        self.copy_tree = copy.deepcopy(self.tree)
        root = self.tree.getroot()
        self.init_html_tree(root)
        self.build_html_tree(root)
        return self.prune_tree()

    @staticmethod
    def build_node(node, idx: int) -> ElementNode:
        elementNode = ElementNode()
        elementNode["nodeId"] = idx
        elementNode["tagName"] = node.tag
        elementNode["text"] = node.text
        elementNode["attributes"] = node.attrib
        elementNode["childIds"] = []
        elementNode["parentId"] = ""
        elementNode["siblingId"] = ""
        elementNode["twinId"] = ""
        elementNode["depth"] = 1
        elementNode["htmlContents"] = etree.tostring(
            node, pretty_print=True).decode()
        return elementNode

    def build_mapping(self) -> None:
        self.element2id = {value["nodeId"]: index for index,
                           value in enumerate(self.elementNodes)}
        self.id2rawNode = {str(index): value for value,
                           index in self.rawNode2id.items()}

    def init_html_tree(self, root) -> None:
        node_queue = deque([root])
        node_id = 0
        while node_queue:
            node = node_queue.popleft()
            # build_node is a @staticmethod and needs no instance.
            self.elementNodes[node_id] = HTMLTree.build_node(node, node_id)
            self.rawNode2id[node] = node_id
            node_id += 1
            for child in node.getchildren():
                node_queue.append(child)
        self.build_mapping()
        self.nodeCounts = node_id
        self.valid = self.valid[:self.nodeCounts + 1]

    def build_html_tree(self, root) -> None:
        node_queue = deque([root])
        root_id = self.rawNode2id[root]
        self.elementNodes[root_id]["parentId"] = -1
        while node_queue:
            node = node_queue.popleft()
            parent_id = self.rawNode2id[node]
            tag_st = {}
            sibling_id = 1
            for child in node.getchildren():
                child_id = self.rawNode2id[child]
                tag_name = self.elementNodes[child_id].get("tagName")
                tag_st[tag_name] = tag_st.get(tag_name, 0) + 1
                twin_id = tag_st.get(tag_name)
                self.elementNodes[parent_id]["childIds"].append(child_id)
                self.elementNodes[child_id]["parentId"] = parent_id
                self.elementNodes[child_id]["twinId"] = twin_id
                self.elementNodes[child_id]["depth"] = self.elementNodes[parent_id]["depth"] + 1
                self.elementNodes[child_id]["siblingId"] = sibling_id
                node_queue.append(child)
                sibling_id += 1
        self.pruningTreeNode = copy.deepcopy(self.elementNodes)

    def get_xpath(self, idx: int) -> str:
        locator_str = ""
        current_node = self.elementNodes[idx]
        tag_name = current_node["tagName"]
        twinId = current_node["twinId"]
        locator_str = "/" + tag_name + "[" + str(twinId) + "]"
        while current_node["parentId"] != 0:
            parentid = current_node["parentId"]
            current_node = self.elementNodes[parentid]
            current_tag_name = current_node["tagName"]
            twinId = current_node["twinId"]
            locator_str = "/" + current_tag_name + \
                "[" + str(twinId) + "]" + locator_str
        parentid = current_node["parentId"]
        current_node = self.elementNodes[parentid]
        current_tag_name = current_node["tagName"]
        return "/" + current_tag_name + locator_str

    def get_selector(self, idx: int) -> str:
        selector_str = ""
        current_node = self.elementNodes[idx]
        while current_node["parentId"] != -1:
            tag_name = current_node["tagName"]
            siblingId = str(current_node["siblingId"])
            if current_node["attributes"].get('id'):
                current_selector = stringfy_selector(
                    current_node["attributes"].get('id'))
                return "#" + current_selector + selector_str
            if len(self.elementNodes[current_node["parentId"]]["childIds"]) > 1:
                uu_twin_node = True
                uu_id = True
                for childId in self.elementNodes[current_node["parentId"]]["childIds"]:
                    sib_node = self.elementNodes[childId]
                    if sib_node["nodeId"] != current_node["nodeId"] and current_node["attributes"].get('class') and sib_node["attributes"].get("class") == current_node["attributes"].get('class'):
                        uu_twin_node = False
                    if sib_node["nodeId"] != current_node["nodeId"] and current_node["tagName"] == sib_node["tagName"]:
                        uu_id = False
                if uu_id:
                    selector_str = " > " + tag_name + selector_str
                elif current_node["attributes"].get('class') and uu_twin_node is True:
                    # fix div.IbBox.Whs\(n\)
                    selector_str = " > " + tag_name + "." + \
                        stringfy_selector(
                            current_node["attributes"].get('class')) + selector_str
                else:
                    selector_str = " > " + tag_name + \
                        ":nth-child(" + siblingId + ")" + selector_str
            else:
                selector_str = " > " + tag_name + selector_str
            current_node = self.elementNodes[current_node["parentId"]]
        return current_node["tagName"] + selector_str

    def is_valid(self, idx: int) -> bool:
        node = self.pruningTreeNode[idx]
        if node["tagName"] in TagNameList:
            return ActiveElements.is_valid_element(node)
        # Tags outside TagNameList are never treated as valid.
        return False

    def prune_tree(self) -> str:
        """Traverse each element to determine if it is valid and prune"""
        result_list = []
        root = self.pruningTreeNode[0]
        if root is None:
            result_list = []
        stack = [root]
        while stack:
            node = stack.pop()
            nodeId = node["nodeId"]
            result_list.append(nodeId)
            children = []
            for childId in node["childIds"]:
                childNode = self.pruningTreeNode[childId]
                children.append(childNode)
            stack.extend(children)
        result = result_list[::-1]
        for nodeId in result:
            if self.is_valid(nodeId) or self.valid[nodeId] is True:
                rawNode = self.id2rawNode[str(nodeId)]
                html_contents = etree.tostring(
                    rawNode, pretty_print=True).decode()
                self.pruningTreeNode[nodeId]["htmlContents"] = html_contents
                self.valid[nodeId] = True
                current_id = nodeId
                while self.pruningTreeNode[current_id]["parentId"] != -1:
                    parent_id = self.pruningTreeNode[current_id]["parentId"]
                    self.valid[parent_id] = True
                    current_id = parent_id
            else:
                rawNode = self.id2rawNode[str(nodeId)]
                rawNode.getparent().remove(rawNode)
                current_node = self.pruningTreeNode[nodeId]
                current_node["htmlContents"] = ""
                parentid = current_node["parentId"]
                self.pruningTreeNode[parentid]["childIds"].remove(nodeId)
                self.valid[nodeId] = False
        return self.pruningTreeNode[0]["htmlContents"]

    def get_element_contents(self, idx: int) -> str:
        node = self.elementNodes[idx]
        html_content = node["htmlContents"]
        return html_content

    def get_tag_name(self, element: ElementNode) -> (str, int):  # type: ignore
        tag_name = ActiveElements.get_element_tagName(element)
        tag_idx = element["nodeId"]
        if tag_name == "unknown":
            tag_name = element["tagName"]
            tag_idx = element["nodeId"]
            # TODO Add more mappings
            if tag_name in MapTagNameList:
                parent_element = self.pruningTreeNode[element["parentId"]]
                return self.get_tag_name(parent_element)
            else:
                return ("statictext", tag_idx)
        return (tag_name, tag_idx)

    def build_dom_tree(self) -> str:
        root = self.pruningTreeNode[0]
        stack = [root]
        contents = ""
        num = 0
        while stack:
            node = stack.pop()
            if self.valid[node["nodeId"]] is True:
                # process_element_contents is a @staticmethod.
                content_text = HTMLTree.process_element_contents(node)
                if content_text != "":
                    tag_name, tag_idx = self.get_tag_name(
                        node)
                    if tag_name.lower() != "statictext":
                        num += 1
                        self.nodeDict[num] = tag_idx
                        contents += " " * (node["depth"]-1) + "[" + str(num) + "] " + tag_name + \
                            " " + f"\'{content_text}\'" + "\n"
                        self.element_value[str(tag_idx)] = content_text
            children = []
            for child_id in node["childIds"]:
                children.append(self.pruningTreeNode[child_id])
            stack.extend(reversed(children))
        return contents

    def get_selector_and_xpath(self, idx: int) -> (str, str):  # type: ignore
        try:
            selector = self.get_selector(idx)
            xpath = self.get_xpath(idx)
            return selector, xpath
        except:
            print("can't locate element")

    @staticmethod
    def process_element_contents(element: ElementNode) -> str:
        # TODO Add appropriate interactive element information, currently only processing interactive elements with text attributes
        html_text = ActiveElements.get_element_value(element)
        if html_text is None:
            return ""
        return html_text.replace("\n", "").replace("\t", "").strip()

    def get_element_value(self, element_id: int) -> str:
        return self.element_value[str(element_id)]


__all__ = [
    "HTMLTree"
]
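# Usage sketch (editorial addition): end-to-end DOM flattening on a tiny page.
# The numbered, indented output is what the planner consumes; the HTML below
# is made up.
#
#     tree = HTMLTree()
#     tree.fetch_html_content("<html><body><a href='/docs'>Docs</a></body></html>")
#     print(tree.build_dom_tree())   # e.g. "  [1] link 'Docs'"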
--------------------------------------------------------------------------------
/agent/Environment/html_env/google_search.js:
--------------------------------------------------------------------------------
const axios = require('axios');

async function searchGoogle(query) {
    const API_KEY = process.env.GOOGLE_API_KEY;
    const CX = process.env.GOOGLE_CX;

    if (!API_KEY || !CX) {
        throw new Error('Missing required environment variables: GOOGLE_API_KEY and/or GOOGLE_CX');
    }

    const url = `https://www.googleapis.com/customsearch/v1?q=${encodeURIComponent(query)}&key=${API_KEY}&cx=${CX}`;

    try {
        const response = await axios.get(url);
        return JSON.stringify(response.data);
    } catch (error) {
        return JSON.stringify({ error: error.message });
    }
}

// Handle input from Python
const query = process.argv[2];
if (query) {
    searchGoogle(query)
        .then(result => {
            console.log(result);
            process.exit(0);
        })
        .catch(error => {
            console.error(JSON.stringify({ error: error.message }));
            process.exit(1);
        });
}
--------------------------------------------------------------------------------
/agent/Environment/html_env/utils.py:
--------------------------------------------------------------------------------
from typing import TypedDict, List
from enum import IntEnum


class ElementNode(TypedDict):
    nodeId: int           # Element ID
    childIds: List[int]   # List of child element IDs
    siblingId: int        # Sibling element ranking
    twinId: int           # Same tag element ranking
    tagName: str          # Element tag
    attributes: dict      # Element attributes
    text: str             # Text attribute
    parentId: int         # Parent element
    htmlContents: str     # All information of the element
    depth: int            # Depth


TagNameList = [
    "button",
    "a",
    "input",
    "select",
    "textarea",
    "option",
    "datalist",
    "label",
    "div",
    "span",
    "p",
    "th",
    "tr",
    "td",
    "ul",
    "li",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "filter-chip",
    "sup",
    "select-label",
    "optgroup"
]

MapTagNameList = [
    "span",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "div",
    "li",
    "ul",
    "p"
]

DelTagNameList = [
    "script",    # del
    "noscript",  # del
    "style",     # del
    "link",      # del
    "meta",      # del
]


ConditionTagNameList = [
    'span',
    'td',
    'th',
    'tr',
    'li',
    'div',
    'label',
    'filter-chip'
]


TypeList = [
    "submit"
]


def stringfy_selector(string: str):
    special_chars = '#.>+~[]():*^$|=%@!\''
    string = string.replace("\t", " ").replace("\n", " ").lstrip().rstrip()
    string = ' '.join(string.split())
    for char in special_chars:
        string = string.replace(char, '\\' + char)
    string = '.'.join(string.split(' '))
    if string[0].isdigit():
        string = f"\\{'{:X}'.format(ord(string[0]))}" + " " + string[1:]
    return string


def stringfy_value(string):
    special_chars = '#.>+~[]():*^$|=@\''
    for char in special_chars:
        string = string.replace(char, '\\' + char)
    return rf"{string}"


__all__ = [
    "ElementNode",
    "TagNameList",
    "DelTagNameList",
    "ConditionTagNameList",
    "TypeList",
    "stringfy_selector",
    "stringfy_value"
]
--------------------------------------------------------------------------------
/agent/LLM/README.md:
--------------------------------------------------------------------------------
### Setup Your API Keys

#### OpenAI API Keys

For setting up OpenAI API keys, add your API key to your environment variables:

MacOS/Linux:

```bash
export OPENAI_API_KEY='your-api-key-here'
```

Windows:

```text
setx OPENAI_API_KEY "your-api-key-here"
```

Visit [Quickstart tutorial - OpenAI API](https://platform.openai.com/docs/quickstart?context=python) for more details.

#### Claude API Keys

For setting up Claude API keys, add your API key to your environment variables:

MacOS/Linux:

```bash
export ANTHROPIC_API_KEY='your-api-key-here'
```

Windows:

```text
setx ANTHROPIC_API_KEY "your-api-key-here"
```

#### Gemini API Keys

For setting up Gemini API keys, add your API key to your environment variables:

MacOS/Linux:

```bash
export GOOGLE_API_KEY='your-api-key-here'
```

Windows:

```text
setx GOOGLE_API_KEY "your-api-key-here"
```

#### Together AI API Keys

For setting up Together AI API keys, add your API key to your environment variables:

MacOS/Linux:

```bash
export TOGETHER_API_KEY='your-api-key-here'
```

Windows:

```text
setx TOGETHER_API_KEY "your-api-key-here"
```

Make sure to replace `your-api-key-here` with your actual API keys. This ensures that the necessary APIs are accessible for the features you intend to use in the repository.
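#### Quick sanity check

A minimal sketch (editorial addition, not part of the original README) to confirm the keys are actually visible to Python before running the agent:

```python
import os

for key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY", "TOGETHER_API_KEY"):
    print(key, "is set" if os.getenv(key) else "is missing")
```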
--------------------------------------------------------------------------------
/agent/LLM/__init__.py:
--------------------------------------------------------------------------------
from .openai import *
from .llm_instance import *
from .token_utils import *
from .claude import *
from .token_calculator import *
--------------------------------------------------------------------------------
/agent/LLM/claude.py:
--------------------------------------------------------------------------------
import os
from anthropic import AsyncAnthropic
from logs import logger


class ClaudeGenerator:

    def __init__(self, model=None):
        self.model = model

        self.client = AsyncAnthropic(
            api_key=os.environ.get('ANTHROPIC_API_KEY')
        )

    async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> tuple[str, str]:
        try:
            # chat() is a coroutine and can be awaited directly.
            response = await self.chat(messages, max_tokens, temperature)
            return response, ""
        except Exception as e:
            logger.error(f"Error in ClaudeGenerator.request: {e}")
            return "", str(e)

    async def chat(self, message, max_tokens=1024, temperature=0.7):
        messages = [{"role": "user", "content": "Please follow the instructions"},
                    {"role": "assistant", "content": message[0].get("content")},
                    {"role": "user", "content": message[1].get("content")}]
        data = {
            'model': self.model,
            'max_tokens': max_tokens,
            'temperature': temperature,
            'messages': messages,
        }
        response = await self.client.messages.create(**data)
        return response.content[0].text
--------------------------------------------------------------------------------
/agent/LLM/gemini.py:
--------------------------------------------------------------------------------
import os
import asyncio
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from sanic.log import logger
import google.generativeai as genai


class GeminiGenerator:
    def __init__(self, model=None):
        self.model = model
        self.pool = ThreadPoolExecutor(max_workers=os.cpu_count() * 2)

    async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> tuple[str, str]:
        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
        loop = asyncio.get_event_loop()
        try:
            response = await loop.run_in_executor(self.pool, partial(self.chat, messages, max_tokens, temperature))
            return response, ""
        except Exception as e:
            logger.error(f"Error in GeminiGenerator.request: {e}")
            return "", str(e)

    def chat(self, messages, max_tokens=500, temperature=0.7):
        chat_history = []
        # The last message is sent separately via send_message below, so it is
        # kept out of the history to avoid submitting it twice.
        for message in messages[:-1]:
            chat_history.append({"role": "user", "parts": [{"text": message.get("content")}]})
            # chat_history.append({"role": "model", "parts": [{"text": message.get("content")}]})
        running_model = genai.GenerativeModel(self.model)
        chat = running_model.start_chat(history=chat_history)
        latest_user_message = messages[-1].get("content")
        response = chat.send_message(latest_user_message, generation_config=genai.types.GenerationConfig(
            max_output_tokens=max_tokens,
            temperature=temperature))
        return response.text
--------------------------------------------------------------------------------
/agent/LLM/llm_instance.py:
--------------------------------------------------------------------------------
from .openai import GPTGenerator, GPTGeneratorWithJSON
from .claude import ClaudeGenerator
from .gemini import GeminiGenerator
from .togetherai import TogetherAIGenerator


def create_llm_instance(model, json_mode=False, all_json_models=None):
    if "gpt" in model or "o1" in model:
        if json_mode:
            if model in all_json_models:
                return GPTGeneratorWithJSON(model)
            else:
                raise ValueError("The text model does not support JSON mode.")
        else:
            return GPTGenerator(model)
    elif "claude" in model:
        if json_mode:
            raise ValueError("Claude does not support JSON mode.")
        else:
            return ClaudeGenerator(model)
    elif "gemini" in model:
        if json_mode:
            raise ValueError("Gemini does not support JSON mode.")
        else:
            return GeminiGenerator(model)
    else:
        if json_mode:
            raise ValueError("TogetherAI does not support JSON mode.")
        else:
            return TogetherAIGenerator(model)


async def semantic_match_llm_request(messages: list = None):
    GPT35 = GPTGenerator(model="gpt-3.5-turbo")
    return await GPT35.request(messages)
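# Usage sketch (editorial addition): the factory dispatches on the model name,
# so swapping providers is a one-line config change. "gpt-4o" is only an example.
#
#     llm = create_llm_instance("gpt-4o", json_mode=False)
#     response, error = await llm.request([{"role": "user", "content": "Hello"}])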
--------------------------------------------------------------------------------
/agent/LLM/openai.py:
--------------------------------------------------------------------------------
import os
import asyncio
import openai
from functools import partial
from sanic.log import logger
from agent.Utils import *
from .token_utils import truncate_messages_based_on_estimated_tokens


class GPTGenerator:
    def __init__(self, model=None):
        self.model = model
        self.client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> tuple[str, str]:
        try:
            if "gpt-3.5" in self.model:
                messages = truncate_messages_based_on_estimated_tokens(messages, max_tokens=16385)
            if "o1" in self.model:
                # o1 models reject the system role, so downgrade it to user.
                messages = [
                    {**msg, "role": "user"} if msg["role"] == "system" else msg
                    for msg in messages
                ]
            # chat() already offloads the blocking SDK call to an executor,
            # so it can be awaited directly here.
            if "o1" in self.model:
                future_answer_result = await self.chat(messages)
            else:
                future_answer_result = await self.chat(messages, max_tokens, temperature)
            choice = future_answer_result.choices[0]
            if choice.finish_reason == 'length':
                logger.warning("Response may be truncated due to length. Be cautious when parsing JSON.")
            openai_response = choice.message.content
            # output_token_count = future_answer_result.usage.completion_tokens
            # input_token_count = future_answer_result.usage.prompt_tokens
            return openai_response, ""
        except Exception as e:
            logger.error(f"Error in GPTGenerator.request: {e}")
            return "", str(e)

    async def chat(self, messages, max_tokens=500, temperature=0.7):
        loop = asyncio.get_event_loop()
        if "o1" in self.model:
            data = {
                'model': self.model,
                'messages': messages,
            }
        else:
            data = {
                'model': self.model,
                'max_tokens': max_tokens,
                'temperature': temperature,
                'messages': messages,
            }
        if hasattr(self, 'response_format'):
            data['response_format'] = self.response_format

        func = partial(self.client.chat.completions.create, **data)
        return await loop.run_in_executor(None, func)


class JSONModeMixin(GPTGenerator):
    """
    A mixin to add JSON mode support to GPTGenerator classes.
    """

    def __init__(self, model=None):
        super().__init__(model=model)  # Ensure initialization from base class
        self.response_format = {"type": "json_object"}  # Set response format to JSON object

    @staticmethod
    def prepare_messages_for_json_mode(messages):
        # Ensure there's a system message instructing the model to generate JSON
        if not any("json" in message.get('content', '').lower() for message in messages):
            messages.insert(0, {"role": "system", "content": "You are a helpful assistant designed to output json."})
        return messages

    async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7) -> tuple[str, str]:
        messages = self.prepare_messages_for_json_mode(messages)  # Prepare messages for JSON mode
        return await super().request(messages, max_tokens, temperature)


class GPTGeneratorWithJSON(JSONModeMixin):
    def __init__(self, model=None):
        super().__init__(model=model if model is not None else "gpt-4-turbo")
--------------------------------------------------------------------------------
/agent/LLM/togetherai.py:
--------------------------------------------------------------------------------
import os
import json
from openai import AsyncOpenAI
from sanic.log import logger
from agent.Utils import *


class TogetherAIGenerator:
    def __init__(self, model=None):
        self.model = model
        self.client = AsyncOpenAI(
            api_key=os.environ.get("TOGETHER_API_KEY"),
            base_url="https://api.together.xyz/v1"
        )

    async def request(self, messages: list = None, max_tokens: int = 500, temperature: float = 0.7
                      ) -> tuple[str, str]:
        try:
            openai_response = await self.chat(messages, max_tokens, temperature)
            return openai_response, ""
        except Exception as e:
            logger.error(f"Error in TogetherAIGenerator.request: {e}")
            return "", str(e)

    async def chat(self, messages, max_tokens=512, temperature=0.7):
        data = {
            'model': self.model,
            'max_tokens': max_tokens,
            'temperature': temperature,
            'messages': messages,
        }

        response = await self.client.chat.completions.create(**data)
        try:
            message_content = response.choices[0].message.content
            return message_content
        except (ValueError, KeyError, json.JSONDecodeError) as e:
            logger.error(f"Invalid response format: {e}")
            return ""
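# Usage sketch (editorial addition): every generator in this package exposes
# the same request(messages, ...) -> (response, error) contract. The model
# name below is only a placeholder, not an endorsement of a specific model.
#
#     import asyncio
#     gen = TogetherAIGenerator("meta-llama/Llama-3-70b-chat-hf")
#     text, err = asyncio.run(gen.request([{"role": "user", "content": "Hi"}]))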
--------------------------------------------------------------------------------
/agent/LLM/token_calculator.py:
--------------------------------------------------------------------------------
import json
from typing import Union, List, Dict, Optional

import tiktoken
from .token_utils import is_model_supported


def calculation_of_token(
    messages: Union[str, List[Dict]],
    model: str = 'gpt-3.5-turbo',
    max_tokens: int = 4096
) -> int:
    """
    Calculate the number of tokens in the messages.

    Args:
        messages: List of messages or string to calculate tokens for
        model: Model to use for tokenization
        max_tokens: Maximum number of tokens allowed

    Returns:
        int: Number of tokens in the messages
    """
    if not is_model_supported(model):
        print(f"Message: Model {model} not in pricing configuration. Skipping token calculation.")
        return 0

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: Model not found. Using default encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    current_tokens = 0

    if isinstance(messages, str):
        tokens = encoding.encode(messages)
        current_tokens += len(tokens)
        return current_tokens

    for message in messages:
        content = message.get('content')
        if content is None:
            # Skip this message and keep counting the rest.
            print("Warning: Message content is None. Skipping.")
            continue

        if isinstance(content, list):
            # Process list of prompt elements
            for element in content:
                if 'text' in element.get('type', ''):
                    tokens = encoding.encode(element['text'])
                    current_tokens += len(tokens)
        else:
            # Process direct text content
            tokens = encoding.encode(content)
            current_tokens += len(tokens)

    return current_tokens


def save_token_count_to_file(
    filename: str,
    step_tokens: Dict,
    task_name: str,
    global_reward_text_model: str,
    planning_text_model: str,
    token_pricing: Dict
) -> None:
    """
    Save token count to a file in JSON format.

    Args:
        filename: Name of the file to save the token count
        step_tokens: Dictionary of token counts accumulated over the task's steps
        task_name: Name of the task associated with the token count
        global_reward_text_model: Model used for reward modeling
        planning_text_model: Model used for planning
        token_pricing: Pricing information for models
    """
    if not is_model_supported(planning_text_model) or not is_model_supported(global_reward_text_model):
        print(
            f"Message: One or both models ({planning_text_model}, {global_reward_text_model}) "
            "not in pricing configuration. Skipping token saving."
        )
        return

    # Initialize or load existing data
    try:
        with open(filename, 'r') as file:
            data = json.load(file)
    except FileNotFoundError:
        data = {
            "calls": [],
            "total_planning_input_tokens": 0,
            "total_planning_output_tokens": 0,
            "total_reward_input_tokens": 0,
            "total_reward_output_tokens": 0,
            "total_input_tokens": 0,
            "total_output_tokens": 0,
            "total_tokens": 0,
        }

    # Update call records
    call_record = {
        "task_name": task_name,
        "step_tokens": step_tokens
    }
    data["calls"].append(call_record)

    # Update token counts
    data["total_planning_input_tokens"] += step_tokens["steps_planning_input_token_counts"]
    data["total_planning_output_tokens"] += step_tokens["steps_planning_output_token_counts"]
    data["total_reward_input_tokens"] += step_tokens["steps_reward_input_token_counts"]
    data["total_reward_output_tokens"] += step_tokens["steps_reward_output_token_counts"]
    data["total_input_tokens"] += step_tokens["steps_input_token_counts"]
    data["total_output_tokens"] += step_tokens["steps_output_token_counts"]
    data["total_tokens"] += step_tokens["steps_token_counts"]

    # Update planning model costs
    if planning_text_model in token_pricing["pricing_models"]:
        if "total_planning_input_token_cost" not in data:
            data["total_planning_input_token_cost"] = 0
        if "total_planning_output_token_cost" not in data:
            data["total_planning_output_token_cost"] = 0

        data["total_planning_input_token_cost"] += (
            step_tokens["steps_planning_input_token_counts"] *
            token_pricing[f"{planning_text_model}_input_price"]
        )
        data["total_planning_output_token_cost"] += (
            step_tokens["steps_planning_output_token_counts"] *
            token_pricing[f"{planning_text_model}_output_price"]
        )

    # Update reward model costs
    if global_reward_text_model in token_pricing["pricing_models"]:
        if "total_reward_input_token_cost" not in data:
            data["total_reward_input_token_cost"] = 0
        if "total_reward_output_token_cost" not in data:
            data["total_reward_output_token_cost"] = 0

        data["total_reward_input_token_cost"] += (
            step_tokens["steps_reward_input_token_counts"] *
            token_pricing[f"{global_reward_text_model}_input_price"]
        )
        data["total_reward_output_token_cost"] += (
            step_tokens["steps_reward_output_token_counts"] *
            token_pricing[f"{global_reward_text_model}_output_price"]
        )

    # Update total costs: recompute from the cumulative per-model costs so
    # that repeated calls do not double count them.
    if (planning_text_model in token_pricing["pricing_models"] and
            global_reward_text_model in token_pricing["pricing_models"]):

        data["total_input_token_cost"] = (
            data["total_planning_input_token_cost"] +
            data["total_reward_input_token_cost"]
        )
        data["total_output_token_cost"] = (
            data["total_planning_output_token_cost"] +
            data["total_reward_output_token_cost"]
        )
        data["total_token_cost"] = (
            data["total_input_token_cost"] +
            data["total_output_token_cost"]
        )

    # Save updated data
    with open(filename, 'w') as file:
        json.dump(data, file, indent=4)
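# Usage sketch (editorial addition): step_tokens must carry the keys read
# above; every value below is made up, and token_pricing is assumed to come
# from configs/setting.toml via read_config().
#
#     step_tokens = {
#         "steps_planning_input_token_counts": 1200,
#         "steps_planning_output_token_counts": 300,
#         "steps_reward_input_token_counts": 800,
#         "steps_reward_output_token_counts": 150,
#         "steps_input_token_counts": 2000,
#         "steps_output_token_counts": 450,
#         "steps_token_counts": 2450,
#     }
#     save_token_count_to_file("token_counts.json", step_tokens, "demo_task",
#                              "gpt-4o", "gpt-4o", read_config()["token_pricing"])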
--------------------------------------------------------------------------------
/agent/LLM/token_utils.py:
--------------------------------------------------------------------------------
from typing import Union, List, Dict, Tuple, Optional

import toml


def read_config(toml_path: Optional[str] = None) -> Dict:
    """Read configuration from TOML file.

    Args:
        toml_path: Path to the TOML config file. Defaults to 'configs/setting.toml'

    Returns:
        Dict containing configuration data
    """
    if toml_path is None:
        toml_path = 'configs/setting.toml'
    with open(toml_path, 'r') as f:
        config = toml.load(f)
    return config


def is_model_supported(model_name: str) -> bool:
    """Check if the model is supported in the configuration.

    Args:
        model_name: Name of the model to check

    Returns:
        bool indicating whether the model is supported
    """
    try:
        config = read_config()
        return model_name in config["token_pricing"]["pricing_models"]
    except:
        return False


def estimate_tokens(text: str) -> float:
    """Estimate the number of tokens for a given text.

    Args:
        text: Input text to estimate tokens for

    Returns:
        Estimated number of tokens
    """
    return len(text) / 4.8


def truncate_text(text: str, max_length: int) -> str:
    """Truncate text to fit within the maximum length.

    Args:
        text: Text to truncate
        max_length: Maximum length allowed

    Returns:
        Truncated text
    """
    return text[:max_length]


def process_content(
    content: Union[str, List[Dict]],
    remaining_tokens: float
) -> Tuple[Union[str, List[Dict]], float]:
    """Process and possibly truncate content based on remaining token allowance.

    Args:
        content: Content to process (either string or list of content items)
        remaining_tokens: Number of tokens remaining

    Returns:
        Tuple of (processed content, tokens used)
    """
    if isinstance(content, list):
        truncated_content = []
        used_tokens = 0

        for item in content:
            if item['type'] == 'text':
                item_text = item['text']
                item_tokens = estimate_tokens(item_text)

                if used_tokens + item_tokens > remaining_tokens:
                    max_length = int((remaining_tokens - used_tokens) * 4.8)
                    truncated_text = truncate_text(item_text, max_length)
                    truncated_content.append({'type': 'text', 'text': truncated_text})
                    used_tokens += estimate_tokens(truncated_text)
                    break

                truncated_content.append(item)
                used_tokens += item_tokens

        return truncated_content, used_tokens
    else:
        # Simple text content
        tokens = estimate_tokens(content)
        if tokens > remaining_tokens:
            truncated_content = truncate_text(content, int(remaining_tokens * 4.8))
            return truncated_content, estimate_tokens(truncated_content)
        return content, tokens


def truncate_messages_based_on_estimated_tokens(
    messages: List[Dict],
    max_tokens: int
) -> List[Dict]:
    """Truncate a list of messages based on an estimated token limit.

    Args:
        messages: List of message dictionaries to process
        max_tokens: Maximum number of tokens allowed

    Returns:
        List of truncated messages
    """
    current_tokens = 0
    truncated_messages = []

    for message in messages:
        content = message['content']
        processed_content, used_tokens = process_content(content, max_tokens - current_tokens)

        if used_tokens > 0:
            truncated_messages.append({
                'role': message['role'],
                'content': processed_content
            })
            current_tokens += used_tokens

        if current_tokens >= max_tokens:
            break

    return truncated_messages
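# Usage sketch (editorial addition): trimming an over-long conversation to a
# rough budget. The estimate is characters / 4.8, so the cut is approximate.
#
#     messages = [{"role": "user", "content": "word " * 4000}]
#     trimmed = truncate_messages_based_on_estimated_tokens(messages, max_tokens=1000)
#     len(trimmed[0]["content"])  # roughly 1000 * 4.8 characters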
--------------------------------------------------------------------------------
/agent/Memory/__init__.py:
--------------------------------------------------------------------------------
from .retriever import *
from .long_memory import *
from .short_memory import *
from .base_trace import *
--------------------------------------------------------------------------------
/agent/Memory/base_trace.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/agent/Memory/base_trace.py
--------------------------------------------------------------------------------
/agent/Memory/long_memory/__init__.py:
--------------------------------------------------------------------------------
from .reference_trace import *
from .website_knowledge import *
--------------------------------------------------------------------------------
/agent/Memory/long_memory/reference_trace.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/agent/Memory/long_memory/reference_trace.py
--------------------------------------------------------------------------------
/agent/Memory/long_memory/website_knowledge.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/agent/Memory/long_memory/website_knowledge.py
--------------------------------------------------------------------------------
/agent/Memory/retriever.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/agent/Memory/retriever.py
--------------------------------------------------------------------------------
/agent/Memory/short_memory/__init__.py:
--------------------------------------------------------------------------------
from .history import *
--------------------------------------------------------------------------------
/agent/Memory/short_memory/history.py:
--------------------------------------------------------------------------------
import json5


class HistoryMemory:
    def __init__(self, previous_trace: list = [], reflection: str = "") -> None:
        self.previous_trace = previous_trace
        self.reflection = reflection

    def stringfy_thought_and_action(self) -> str:
        try:
            input_list = json5.loads(self.previous_trace, encoding="utf-8")
        except:
            input_list = self.previous_trace
        str_output = ""
        if len(input_list) > 2:
            str_output = "["
            for idx in range(len(input_list) - 1):
                str_output += f'Step{idx + 1}:\"Thought: {input_list[idx]["thought"]}, Action: {input_list[idx]["action"]}, Reflection: {input_list[idx + 1]["reflection"]}\";\n'
            str_output += "]"
        # The description of the most recent step is appended in every case.
        current_trace = input_list[-1]
        str_output += f'Specifically in the last step, you gave the following Thought: {current_trace["thought"]}\n You performed the following Action: {current_trace["action"]}\n You had the following Reflection: {self.reflection}\";\n'
        return str_output

    def construct_previous_trace_prompt(self) -> str:
        stringfy_thought_and_action_output = self.stringfy_thought_and_action()
        previous_trace_prompt = f"The previous thoughts, actions and reflections are as follows: \
            {stringfy_thought_and_action_output}.\n\nYou have done the things above.\n\n"
        return previous_trace_prompt

    @staticmethod
    def construct_cache(cache_info: list):
        pass
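# Usage sketch (editorial addition): trace entries are dicts with "thought",
# "action" and "reflection" keys; the trace below is fabricated.
#
#     trace = [{"thought": "Need the search box", "action": "click [5]", "reflection": "Page loaded."}]
#     HistoryMemory(previous_trace=trace, reflection="On results page").construct_previous_trace_prompt()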
re.sub(r'\s+', ' ', action_input).strip() 57 | element_id = re.sub(r'\s+', ' ', element_id).strip() 58 | description = re.sub(r'\s+', ' ', description).strip() 59 | result_dict = { 60 | "thought": thought, 61 | "action": action, 62 | "action_input": action_input, 63 | "element_id": element_id, 64 | "description": description 65 | } 66 | return result_dict 67 | 68 | def parse_action(self, message): 69 | message_substring = extract_longest_substring(message) 70 | decoded_result = {} 71 | decoded_result = json5.loads(message_substring) 72 | return decoded_result 73 | 74 | def extract_status_and_description(self, message) -> dict: 75 | try: 76 | description = re.findall("```(.*?)```", message, re.S)[0] 77 | status_description = self.parse_action(description) 78 | except Exception: 79 | try: 80 | description = message 81 | status_description = self.parse_action(description) 82 | except Exception: 83 | description = message.split("description:")[-1].strip() 84 | status_description = self.parse_action(description) 85 | 86 | return status_description 87 | 88 | def extract_score_and_description(self, message) -> dict: 89 | result_score = "null" 90 | try: 91 | result_score = re.findall( 92 | "score:(.*?)description:", message, re.S)[0].strip() 93 | except Exception: 94 | try: 95 | result_score = message.split("description:")[0].strip() 96 | except Exception: 97 | result_score = "null" 98 | try: 99 | description = re.findall("```(.*?)```", message, re.S)[0] 100 | except Exception: 101 | description = message.split("description:")[-1].strip() 102 | score_description = self.parse_action(description) 103 | return score_description 104 | 105 | @staticmethod 106 | def get_element_id(input_str) -> str: 107 | # Extract the element_id with a regular expression 108 | 109 | # Return '-1' when no element_id can be found 110 | pattern = r'["\']element_id["\']:\s*["\']?(\d+)["\']?,\s*["\']' 111 | match = re.search(pattern, input_str) 112 | if match: 113 | return match.group(1) 114 | else: 115 | return '-1' 116 | -------------------------------------------------------------------------------- /agent/Plan/planning.py: -------------------------------------------------------------------------------- 1 | from ..Utils.utils import print_info, print_limited_json 2 | from agent.Prompt import * 3 | from agent.LLM import * 4 | from .action import * 5 | import time 6 | import json5 7 | from .action import ResponseError 8 | from logs import logger 9 | 10 | 11 | class InteractionMode: 12 | def __init__(self, text_model=None, visual_model=None): 13 | self.text_model = text_model 14 | self.visual_model = visual_model 15 | 16 | def execute(self, status_description, user_request, previous_trace, observation, feedback, observation_VforD): 17 | pass 18 | 19 | 20 | class DomMode(InteractionMode): 21 | def __init__(self, text_model=None, visual_model=None): 22 | super().__init__(text_model, visual_model) 23 | 24 | async def execute(self, status_description, user_request, previous_trace, observation, feedback, observation_VforD): 25 | planning_request = PlanningPromptConstructor().construct( 26 | user_request, previous_trace, observation, feedback, status_description) 27 | logger.info( 28 | f"\033[32mDOM_based_planning_request:\n{planning_request}\033[0m\n") 29 | logger.info(f"planning_text_model: {self.text_model.model}") 30 | planning_response, error_message = await self.text_model.request(planning_request) 31 | # if "gpt" in self.text_model.model: 32 | # output_token_count = future_answer_result.usage.completion_tokens 33 | # input_token_count = 
future_answer_result.usage.prompt_tokens 34 | input_token_count = calculation_of_token(planning_request, model=self.text_model.model) 35 | output_token_count = calculation_of_token(planning_response, model=self.text_model.model) 36 | planning_token_count = [input_token_count, output_token_count] 37 | 38 | return planning_response, error_message, None, None, planning_token_count 39 | 40 | 41 | class DomVDescMode(InteractionMode): 42 | def __init__(self, text_model=None, visual_model=None): 43 | super().__init__(text_model, visual_model) 44 | 45 | async def execute(self, status_description, user_request, previous_trace, observation, feedback, observation_VforD): 46 | if observation_VforD != "": 47 | vision_desc_request = VisionDisc2PromptConstructor().construct( 48 | user_request, observation_VforD) # vision description request with user_request 49 | # vision_desc_request = VisionDisc1PromptConstructor().construct(observation_VforD) 50 | vision_desc_response, error_message = await self.visual_model.request(vision_desc_request) 51 | else: 52 | vision_desc_response = "" 53 | print(f"\033[36mvision_disc_response:\n{vision_desc_response}") # blue 54 | planning_request = ObservationVisionDiscPromptConstructor().construct( 55 | user_request, previous_trace, observation, feedback, status_description, vision_desc_response) 56 | print( 57 | f"\033[35mplanning_request:\n{print_limited_json(planning_request, limit=10000)}") 58 | print("\033[0m") 59 | planning_response, error_message = await self.text_model.request(planning_request) 60 | return planning_response, error_message, None, None, None  # five values so Planning.plan can always unpack a token count (none is tracked in this mode) 61 | 62 | 63 | class VisionToDomMode(InteractionMode): 64 | def __init__(self, text_model=None, visual_model=None): 65 | super().__init__(text_model, visual_model) 66 | 67 | async def execute(self, status_description, user_request, previous_trace, observation, feedback, observation_VforD): 68 | vision_act_request = ObservationVisionActPromptConstructor().construct( 69 | user_request, previous_trace, observation_VforD, feedback, status_description) 70 | max_retries = 3 71 | for attempt in range(max_retries): 72 | vision_act_response, error_message = await self.visual_model.request(vision_act_request) 73 | # Blue output 74 | print(f"\033[36mvision_act_response:\n{vision_act_response}") 75 | print("\033[0m") # Reset color 76 | planning_response_thought, planning_response_get = ActionParser().extract_thought_and_action( 77 | vision_act_response) 78 | actions = { 79 | 'goto': "Found 'goto' in the vision_act_response.", 80 | 'google_search': "Found 'google_search' in the vision_act_response.", 81 | 'switch_tab': "Found 'switch_tab' in the vision_act_response.", 82 | 'scroll_down': "Found 'scroll_down' in the vision_act_response.", 83 | 'scroll_up': "Found 'scroll_up' in the vision_act_response.", 84 | 'go_back': "Found 'go_back' in the vision_act_response."
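# The six actions listed above are page-level operations that need no target element,
# so the vision model's response can be executed as-is; any other action must first be
# grounded to a concrete element_id via the vision-to-DOM request constructed below.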
85 | } 86 | # Check if the action is in the predefined action list 87 | actions_found = False 88 | for action, message in actions.items(): 89 | if action == planning_response_get.get('action'): 90 | print(message) 91 | actions_found = True 92 | # The action does not need to be changed 93 | # `target_element` should not exist, if it does, it's not used 94 | break 95 | 96 | if not actions_found: 97 | print("None of 'goto', 'google_search', 'switch_tab', 'scroll_down', 'scroll_up', or 'go_back' were found in the vision_act_response.") 98 | 99 | target_element = planning_response_get.get('target_element') 100 | description = planning_response_get.get('description') 101 | 102 | # If the target element is None or does not exist 103 | if not target_element: 104 | print("The 'target_element' is None or empty.") 105 | continue 106 | 107 | # Construct the request from vision to DOM 108 | planning_request = VisionToDomPromptConstructor().construct(target_element, description, 109 | observation) 110 | print(f"\033[35mplanning_request:{planning_request}") 111 | print("\033[0m") 112 | 113 | # Send the request and wait for the response 114 | planning_response_dom, error_message = await self.text_model.request(planning_request) 115 | print( 116 | f"\033[34mVisionToDomplanning_response:\n{planning_response_dom}") 117 | print("\033[0m") 118 | # Parse the element ID 119 | element_id = ActionParser().get_element_id(planning_response_dom) 120 | if element_id == "-1": 121 | print("The 'element_id' is not found in the planning_response.") 122 | continue # If the 'element_id' is not found, continue to the next iteration of the loop 123 | else: 124 | planning_response_get['element_id'] = element_id 125 | break # If the 'element_id' is found, break the loop 126 | 127 | else: 128 | # If a predefined action is found, there is no need to retry, exit the loop directly 129 | break 130 | 131 | planning_response_json_str = json5.dumps( 132 | planning_response_get, indent=2) 133 | planning_response = f'```\n{planning_response_json_str}\n```' 134 | # Check if the maximum number of retries has been reached 135 | if attempt == max_retries - 1: 136 | print("Max retries of vision_act reached. 
Unable to proceed.") 137 | 138 | return planning_response, error_message, planning_response_thought, planning_response_get 139 | 140 | 141 | class DVMode(InteractionMode): 142 | def __init__(self, text_model=None, visual_model=None): 143 | super().__init__(text_model, visual_model) 144 | 145 | async def execute(self, status_description, user_request, previous_trace, observation, feedback, observation_VforD): 146 | planning_request = D_VObservationPromptConstructor().construct( 147 | user_request, previous_trace, observation, observation_VforD, feedback, status_description) 148 | 149 | print( 150 | f"\033[32mplanning_request:\n{print_limited_json(planning_request, limit=1000)}") 151 | print("\033[0m") 152 | planning_response, error_message = await self.visual_model.request(planning_request) 153 | return planning_response, error_message, None, None 154 | 155 | 156 | class VisionMode(InteractionMode): 157 | def __init__(self, text_model=None, visual_model=None): 158 | super().__init__(text_model, visual_model) 159 | 160 | async def execute(self, status_description, user_request, previous_trace, observation, feedback, observation_VforD): 161 | planning_request = VisionObservationPromptConstructor( 162 | ).construct(user_request, previous_trace, observation) 163 | print(f"\033[32m{planning_request}") # Green color 164 | print("\033[0m") 165 | logger.info("\033[32m%s\033[0m", planning_request) 166 | planning_response, error_message = await self.visual_model.request(planning_request) 167 | return planning_response, error_message, None, None 168 | 169 | 170 | class Planning: 171 | 172 | @staticmethod 173 | async def plan( 174 | config, 175 | user_request, 176 | text_model_name, 177 | previous_trace, 178 | observation, 179 | feedback, 180 | mode, 181 | observation_VforD, 182 | status_description 183 | ): 184 | 185 | gpt35 = GPTGenerator(model="gpt-3.5-turbo") 186 | gpt4v = GPTGenerator(model="gpt-4-turbo") 187 | 188 | all_json_models = config["model"]["json_models"] 189 | is_json_response = config["model"]["json_model_response"] 190 | 191 | llm_planning_text = create_llm_instance( 192 | text_model_name, is_json_response, all_json_models) 193 | 194 | modes = { 195 | "dom": DomMode(text_model=llm_planning_text), 196 | "dom_v_desc": DomVDescMode(visual_model=gpt4v, text_model=llm_planning_text), 197 | "vision_to_dom": VisionToDomMode(visual_model=gpt4v, text_model=llm_planning_text), 198 | "d_v": DVMode(visual_model=gpt4v), 199 | "vision": VisionMode(visual_model=gpt4v) 200 | } 201 | 202 | # planning_response_thought, planning_response_action 203 | planning_response, error_message, planning_response_thought, planning_response_action, planning_token_count = await modes[mode].execute( 204 | status_description=status_description, 205 | user_request=user_request, 206 | previous_trace=previous_trace, 207 | observation=observation, 208 | feedback=feedback, 209 | observation_VforD=observation_VforD) 210 | 211 | logger.info(f"\033[34mPlanning_Response:\n{planning_response}\033[0m") 212 | if mode != "vision_to_dom": 213 | try: 214 | planning_response_thought, planning_response_action = ActionParser().extract_thought_and_action( 215 | planning_response) 216 | except ResponseError as e: 217 | logger.error(f"Response Error:{e.message}") 218 | raise 219 | 220 | if planning_response_action.get('action') == "fill_form": 221 | JudgeSearchbarRequest = JudgeSearchbarPromptConstructor().construct( 222 | input_element=observation, planning_response_action=planning_response_action) 223 | try: 224 | Judge_response, 
error_message = await gpt35.request(JudgeSearchbarRequest) 225 | if Judge_response.lower() == "yes": 226 | planning_response_action['action'] = "fill_search" 227 | except: 228 | planning_response_action['action'] = "fill_form" 229 | 230 | # The description should include both the thought (returned by LLM) and the action (parsed from the planning response) 231 | planning_response_action["description"] = { 232 | "thought": planning_response_thought, 233 | "action": ( 234 | f'{planning_response_action["action"]}: {planning_response_action["action_input"]}' if "description" not in planning_response_action.keys() else 235 | planning_response_action["description"]) 236 | if mode in ["dom", "d_v", "dom_v_desc", "vision_to_dom"] else ( 237 | planning_response_action["action"] if "description" not in planning_response_action.keys() else 238 | planning_response_action["description"]) 239 | } 240 | if mode in ["dom", "d_v", "dom_v_desc", "vision_to_dom"]: 241 | planning_response_action = {element: planning_response_action.get( 242 | element, "") for element in ["element_id", "action", "action_input", "description"]} 243 | elif mode == "vision": 244 | planning_response_action = {element: planning_response_action.get( 245 | element, "") for element in ["action", "description"]} 246 | logger.info("****************") 247 | # logger.info(planning_response_action) 248 | dict_to_write = {} 249 | if mode in ["dom", "d_v", "dom_v_desc", "vision_to_dom"]: 250 | dict_to_write['id'] = planning_response_action['element_id'] 251 | dict_to_write['action_type'] = planning_response_action['action'] 252 | dict_to_write['value'] = planning_response_action['action_input'] 253 | elif mode == "vision": 254 | dict_to_write['action'] = planning_response_action['action'] 255 | dict_to_write['description'] = planning_response_action['description'] 256 | dict_to_write['error_message'] = error_message 257 | dict_to_write['planning_token_count'] = planning_token_count 258 | 259 | return dict_to_write 260 | -------------------------------------------------------------------------------- /agent/Prompt/__init__.py: -------------------------------------------------------------------------------- 1 | from .prompt_constructor import * 2 | from .base_prompts import * 3 | from .dom_vision_prompts import * 4 | from .vision_prompts import * 5 | from .vision_to_dom_prompts import * -------------------------------------------------------------------------------- /agent/Prompt/base_prompts.py: -------------------------------------------------------------------------------- 1 | class BasePrompts: 2 | 3 | example_output = '\n```\n{\n "action": "click",\n "action_input": "button",\n "element_id": "236",\n "description": "Now I\'m on Google\'s main page. I\'m now clicking the button with element_id [236] to see more information."\n}\n```' 4 | score_output = '\n```\n{\n "score": "10"\n,"description": "According to the previous trajectory, the current thought and the action performed are an important part of completing the target task, so it is very important, so I give 10 points"}\n```' 5 | 6 | 7 | # - goto: useful for when you need visit a new link or a website, it will open a new tab. 8 | # - fill_form: useful for when you need to fill out a form or input something from accessibility tree. Input should be a string. 9 | # - google_search: useful for when you need to use google to search something. 10 | # - click: useful for when you need to click a button/link from accessibility tree. 
11 | # - select_option: useful for when you need to select a drop-down box value. When you get (select and option) tags from the accessibility tree, you need to select the serial number (element_id) corresponding to the select tag, not the option, and select the most likely content corresponding to the option as Input. 12 | # - go_back: useful when you find the current web page encounters some network error or you think the last step is not helpful. 13 | 14 | planning_prompt_system = '''You are an assistant who not only helps to browse and operate web pages to achieve certain goals, but also needs to explore the information on the page to answer the questions raised by the target task. Please answer the following questions as well as you can. 15 | There is key information you will get: 16 | **Key Information**: 17 | - Previous trace: all thoughts, actions and reflections you have made historically. 18 | - Accessibility tree: characteristic expression of the current web page. 19 | 20 | **Introduction to Accessibility Tree**: 21 | The accessibility tree is a tree-like data structure that describes the relationships between elements on a web page and provides accessibility information for each element (such as text, links, form elements, etc.). 22 | - **Accessibility Tree Example**: 23 | Here is an example of an accessibility tree: 24 | ``` 25 | current web tab name is 'Google' 26 | [40] link 'About' 27 | [41] link 'Store' 28 | [186] link 'Gmail' 29 | [187] link 'Images' 30 | [163] textarea 'Search' 31 | [236] button 'See more' 32 | ``` 33 | In this example, each row represents the characteristic representation of a web page element. It has three attributes: '[40]' is the element's element_id, 'link' is the element's type, and 'About' is the element's content. 34 | Note: The above element provided is purely for illustrative purposes and should NEVER be used directly in your output! 35 | 36 | You should always consider the previous steps, the subsequent steps, and what to do next. 37 | **Thought Space**: 38 | - What action do you think is needed now to complete the task? 39 | - What's the reason for taking that action? 40 | 41 | You have access to the following tools (helpful for interacting with the web page): 42 | **Execution Action Space**: 43 | - goto: useful for when you need to visit a new link or a website, it will open a new tab. 44 | - fill_form: useful for when you need to fill out a form or input something from the accessibility tree. Input should be a string. 45 | - google_search: useful for when you need to use google to search something. 46 | - click: useful for when you need to click a button/link from the accessibility tree. 47 | - select_option: useful for when you need to select a drop-down box value. When you get (select and option) tags from the accessibility tree, you need to select the serial number (element_id) corresponding to the select tag, not the option, and select the most likely content corresponding to the option as Input. 48 | - go_back: useful when you find the current web page encounters some network error or you think the last step is not helpful. 49 | - cache_data: useful when you need to extract information from the page that you think is extremely valuable for completing the target task. It is not a direct answer to the target task, but it is extremely relevant to the target task.
Subsequent actions may refer to this part of the information, and it is returned as input. 50 | - get_final_answer: useful for when you think it is the answer to the target task and no other operations are required, Input should be the answer content. 51 | 52 | You also need to provide an effective description of the current execution action. 53 | A proper description contains: 54 | - What website it is; 55 | - Which action you choose; 56 | - REMEMBER DO NOT LEAVE THE DESCRIPTION EMPTY! 57 | 58 | You have to follow the instructions or notes: 59 | **Important Notes**: 60 | - Under the following conditions, you are restricted to using the `google_search` or `goto` tools exclusively: 61 | 1. In the initial step of a process or when there's no preceding interaction history (i.e., the previous trace is empty). 62 | 2. In situations where the accessibility tree is absent or not provided. 63 | - Your action should not be the same as last step's action. 64 | - The `element_id` should be an integer accurately representing the element's ID in the accessibility tree. 65 | - AVOID using the provided example's element_id as your output. 66 | - The output JSON blob must be valid; otherwise, it cannot be recognized. 67 | 68 | **Special Circumstances Guidelines**: 69 | - When performing a search on a website, if you find the search results do not display sufficient content, consider simplifying or modifying your search query. Reducing the complexity of your search query or altering keywords may yield more comprehensive results. 70 | 71 | Please ensure the accuracy of your output, as we will execute subsequent steps based on the `action`, `action_input` and `element_id` you provide. 72 | 73 | **Output Requirements**: 74 | - Ensure your output strictly adheres to the JSON blob format outlined below: 75 | 76 | ``` 77 | { 78 | "thought": ACTUAL_THOUGHT, 79 | "action": ACTUAL_TOOLS, 80 | "action_input": ACTUAL_INPUT, 81 | "element_id": ACTUAL_ELEMENT_ID, 82 | "description": ACTUAL_DESCRIPTION 83 | } 84 | ``` 85 | 86 | - A VALID JSON BLOB EXAMPLE AS FOLLOWS: 87 | ``` 88 | { 89 | "thought": "In order to complete this task, I need to go to the Google home page", 90 | "action": "click", 91 | "action_input": "button", 92 | "element_id": "236", 93 | "description": "Now I\'m on Google\'s main page. I\'m now clicking the button with element_id [236] to see more information." 94 | } 95 | ``` 96 | ''' 97 | 98 | planning_prompt_user = "The question here is described as \"{{user_request}}\".\n\n" 99 | 100 | global_reward_prompt_system = ('''\ 101 | You are an assistant to help navigate and operate the web page to achieve a certain task. 102 | Your goal is to evaluate the previous series of traces (thoughts and actions) and think about what key steps are needed to complete the task in the future. 103 | There is key information you will get: 104 | **Key Information**: 105 | - Previous trace: all thoughts, actions and reflections you have made historically. 106 | - Accessibility tree: characteristic expression of the current web page. 107 | - Screenshot: visual information of the current web page (may be included). 108 | 109 | You also need to combine the previous trace to give the completion status of the current task. 110 | **Status of Task Completion** 111 | - doing: You have completed the intermediate steps of the target task but have not entirely finished the target task. 112 | - finished: You are entirely certain about completing the target task.
113 | - loop: You find that the last two steps of previous actions are the same, so it is determined that the process is stuck in a local optimum solution. 114 | 115 | You will judge and score the task completion and reasonableness of previous actions. The score ranges from 1-10, but the score you give can only be selected from [1, 3, 7, 9, 10]. 116 | **Judging and Scoring Criteria**: 117 | - score = 1: You find that the status of the task is stuck in a loop by analyzing the previous trace. 118 | - score = 3: You find that performing the previous trajectories (thoughts and actions) is not likely helpful in completing the target task and you need to adjust the direction of your planning and action or start over from the beginning. 119 | - score = 7: You find that performing the previous trajectories (thoughts and actions) is helpful in completing the target task. 120 | - score = 9: You find that performing the previous trajectories (thoughts and actions) is a very critical intermediate step to complete this task. 121 | - score = 10: You find that performing the previous trajectories (thoughts and actions) has completed the task perfectly. 122 | You need to provide effective evidence for your scoring of the series of the previous trace. 123 | - Why do you give this score? 124 | - What is the reason? 125 | 126 | You also need to provide an effective description or summary of the above requirements, based on the key information and characteristics of the current web page. 127 | **A proper description contains**: 128 | - What is the current completion status of the task? (IMPORTANT) 129 | - What is your overall plan for completing your goal and target task in the future? (IMPORTANT) 130 | - REMEMBER DO NOT LEAVE THE DESCRIPTION EMPTY! 131 | 132 | **Output Requirements**: 133 | - Ensure your output strictly follows this format: 134 | ```json 135 | { 136 | "status": "ACTUAL_STATUS", 137 | "score": "ACTUAL_SCORE", 138 | "reason": "ACTUAL_REASON", 139 | "description": "ACTUAL_DESCRIPTION" 140 | } 141 | ``` 142 | - A VALID JSON BLOB EXAMPLE AS FOLLOWS: 143 | ``` 144 | { 145 | "status": "doing", 146 | "score": "3", 147 | "reason": "You need to complete a search for camping tents that can accommodate 2 people and sort the results in rei by price from low to high. According to your previous trajectory, you navigated to the rei official website and clicked the 2-person button, which are correct actions. But when you complete the final step of sorting prices, you actually click on a link to a tent product. This is a completely unreasonable action. So I give it 3 points. Maybe you need to return to the previous interface to re-plan and select the 'sort by' button", 148 | "description": "According to the current web page information, you can know that this is the homepage of a tent product, which is not very consistent with the purpose of the target task. The next overall plan to complete this task is to return to the previous page and select the sort by button." 149 | } 150 | ``` 151 | ''') 152 | 153 | global_reward_with_GroundTruth_prompt_system = ('''\ 154 | You are an assistant to help navigate and operate the web page to achieve a certain task. 155 | Your goal is to evaluate the previous series of traces (thoughts and actions) and think about what key steps are needed to complete the task in the future. 156 | There is key information you will get: 157 | **Key Information**: 158 | - Previous trace: all thoughts, actions and reflections you have made historically.
159 | - Current Webpage Information: 160 | - Accessibility tree: characteristic expression of the current web page. 161 | - Screenshot: visual information of the current web page (may be included). 162 | - Reference Guide: detailed and step-by-step reference guide for completing the target task, serving as a benchmark for evaluating progress and strategizing the necessary actions. 163 | 164 | **Notes to Reference Guide**: 165 | - The Reference Guide plays a crucial role in aiding the evaluation of the current Status of Task Completion. The 'Completion Verification' section within the Reference Guide is instrumental in determining whether a task can be classified as 'finished.' 166 | - Furthermore, for a task to be considered fully completed, all **key conditions** must be met as specified. 167 | 168 | 169 | You also need to combine the previous trace to give the completion status of the current task. 170 | **Status of Task Completion** 171 | - doing: You have completed the intermediate steps of the target task but have not entirely finished the target task. 172 | - finished: You are entirely certain about completing the target task. 173 | - loop: You find that the last two steps of previous actions are the same, so it is determined that the process is stuck in a local optimum solution. 174 | 175 | You will judge and score the task completion and reasonableness of previous actions. The score ranges from 1-10, but the score you give can only be selected from [1, 3, 7, 9, 10]. 176 | **Judging and Scoring Criteria**: 177 | - score = 1: You find that the status of the task is stuck in a loop by analyzing the previous trace. 178 | - score = 3: You find that performing the previous trajectories (thoughts and actions) is not likely helpful in completing the target task and you need to adjust the direction of your planning and action or start over from the beginning. 179 | - score = 7: You find that performing the previous trajectories (thoughts and actions) is helpful in completing the target task. 180 | - score = 9: You find that performing the previous trajectories (thoughts and actions) is a very critical intermediate step to complete this task. 181 | - score = 10: You find that performing the previous trajectories (thoughts and actions) has completed the task perfectly. 182 | You need to provide effective evidence for your scoring of the series of the previous trace. 183 | - Why do you give this score? 184 | - What is the reason? 185 | 186 | You also need to provide an effective description or summary of the above requirements, based on the key information and characteristics of the current web page. 187 | **A proper description contains**: 188 | - What is the current completion status of the task? (IMPORTANT) 189 | - What is your overall plan for completing your goal and target task in the future? (IMPORTANT) 190 | - REMEMBER DO NOT LEAVE THE DESCRIPTION EMPTY! 191 | 192 | **Output Requirements**: 193 | - Ensure your output strictly follows this format: 194 | ```json 195 | { 196 | "status": "ACTUAL_STATUS", 197 | "score": "ACTUAL_SCORE", 198 | "reason": "ACTUAL_REASON", 199 | "description": "ACTUAL_DESCRIPTION" 200 | } 201 | ``` 202 | - A VALID JSON BLOB EXAMPLE AS FOLLOWS: 203 | ``` 204 | { 205 | "status": "doing", 206 | "score": "3", 207 | "reason": "You need to complete a search for camping tents that can accommodate 2 people and sort the results in rei by price from low to high.
According to your previous trajectory, you navigated to the rei official website and clicked the 2-person button, which are correct actions. But when you complete the final step of sorting prices, you actually click on a link to a tent product. This is a completely unreasonable action. So I give it 3 points. Maybe you need to return to the previous interface to re-plan and select the 'sort by' button", 208 | "description": "According to the current web page information, you can know that this is the homepage of a tent product, which is not very consistent with the purpose of the target task. The next overall plan to complete this task is to return to the previous page and select the sort by button." 209 | } 210 | ``` 211 | ''') 212 | 213 | global_reward_prompt_user = "The target task here is described as \"{{user_request}}\".\n\n"\ 214 | "The previous trajectories (thoughts, actions and reflections) are: {{stringfy_thought_and_action_output}}.\n\nYou have done the things above.\n\n" 215 | 216 | current_reward_prompt_system = '''You are an assistant to help navigate and operate the web page to achieve a certain task. 217 | Your goal is to make an assessment of the action you are currently performing. 218 | There is key information you will get: 219 | **Key Information**: 220 | - previous trace: all thoughts and actions to complete this task step by step 221 | - current trace: current thought and action performed 222 | - accessibility tree: characteristic expression of the current web page 223 | 224 | You will judge and score the currently performed action. The score ranges from 1-10, but the score you give can only be selected from [1, 3, 7, 9, 10]. 225 | **Judging and Scoring Criteria**: 226 | - score = 1: You may not have obtained the accessibility tree information (IMPORTANT). You may have encountered issues such as network connection errors, human-computer verification challenges, or a blank page. 227 | - score = 3: The action you performed (such as clicking on an element) does not help at all to complete the task when the accessibility tree is provided 228 | - score = 7: The action you performed (such as clicking on an element) is helpful in completing this task when the accessibility tree is provided 229 | - score = 9: This action you performed is a very critical intermediate step to complete this task when the accessibility tree is provided 230 | - score = 10: This action is the last step to complete the task when the accessibility tree is provided 231 | 232 | You also need to provide an effective description of the assessment you make. 233 | A proper description contains: 234 | - Why do you give this score? 235 | - What is the reason? 236 | - What would be better advice if given a low score? 237 | - REMEMBER DO NOT LEAVE THE DESCRIPTION EMPTY! 238 | 239 | **Output Requirements**: 240 | - Ensure your output strictly follows this format: 241 | ```json 242 | { 243 | "score": "ACTUAL_SCORE", 244 | "description": "ACTUAL_DESCRIPTION" 245 | } 246 | ``` 247 | - A VALID JSON BLOB EXAMPLE AS FOLLOWS: 248 | ``` 249 | { 250 | "score": "10", 251 | "description": "According to the previous trajectory, the current thought and the action performed are an important part of completing the target task, so it is very important. I give 10 points."
252 | } 253 | ``` 254 | ''' 255 | 256 | current_reward_prompt_user = "The target task here is described as \"{{user_request}}\".\n\n"\ 257 | "The previous thought and action are: {{stringfy_previous_trace_output}}.\n\n"\ 258 | "The current thought and action is: {{stringfy_current_trace_output}}.\n\nYou have done the current action\n\n"\ 259 | 260 | judge_searchbar_prompt_system = "You are an assistant to help navigate and operate the web page to achieve certain goals. Answer the following questions as best you can.\n"\ 261 | "Your target is to judge whether the input is the search bar.\n" 262 | judge_searchbar_prompt_user = "Now the webpage's accessibility tree (the key information of the current web page) is below: {{input_element}}\n"\ 263 | "In the last step you filled in the input (id={{element_id}}) with text: {{action_input}}"\ 264 | "Judge whether the input is the search bar. If the blank is a search bar, return yes, else return no. You should only return one word!" 265 | 266 | semantic_match_prompt_system = "Now you are an assistant to judge whether 2 elements are semantically the same. I'll provide a judge rule and an answer.\n"\ 267 | "If they are the same, you should return 1. If they are not related, you should return 0. "\ 268 | "If they are related but not identical, return a decimal (two decimal places) between 0 and 1 reflecting the degree of relevance you think they have.\n"\ 269 | "For example, the judge rule is: Decide whether the place is New York. The score of \"new york\" and \"纽约\" are both 1, \"Brooklyn\" should be 0.\n"\ 270 | "However, if the judge rule is: Decide whether the place is in New York. The score of \"new york\" and \"纽约\" and \"Brooklyn\" are all 1.\n"\ 271 | "Another example, the judge rule is: Decide whether I'm looking for clothes. The score of \"red Clothes\" and \"green jacket\" should also be 1.\n"\ 272 | "However, if the judge rule is: Decide whether I'm looking for red clothes. The score of \"bright red Clothing\" could be 0.85 (red includes bright red but they are not the same), the score of \"green Clothes\" should be 0.5 (red is not green).\n"\ 273 | "Remember, you should return a number with ``` and an explanation. Like output: ```1```, (your explanation)" # "Remember, you should only return a number without any punctuation or explanation!" 274 | 275 | semantic_match_prompt_user = "You should judge by the rule below: {{semantic_method}}.\n\nMy answer is: {{input_answer}}\n" 276 | -------------------------------------------------------------------------------- /agent/Prompt/dom_vision_disc_prompts.py: -------------------------------------------------------------------------------- 1 | class DomVisionDiscPrompts: 2 | dom_vision_disc_planning_prompt_user = "The question here is described as \"{{user_request}}\".\n\n" 3 | 4 | dom_vision_disc_prompt_system2 = """""" 5 | 6 | example_input = """ 7 | current web tab name is 'Google' 8 | [40] link 'About' 9 | [41] link 'Store' 10 | [186] link 'Gmail' 11 | [187] link 'Images' 12 | [163] textarea 'Search' 13 | [236] button 'See more' 14 | """ 15 | 16 | example_output = '\n```\n{\n "action": "click",\n "action_input": "button",\n "element_id": "236",\n "description": "Now I\'m on Google\'s main page.
I\'m now clicking the button with element_id [236] to see more information."\n}\n```' 17 | 18 | 19 | dom_vision_disc_planning_prompt_system = '''''' 20 | -------------------------------------------------------------------------------- /agent/Prompt/dom_vision_prompts.py: -------------------------------------------------------------------------------- 1 | class DomVisionPrompts: 2 | 3 | example_input = """ 4 | current web tab name is 'Google' 5 | [40] link 'About' 6 | [41] link 'Store' 7 | [186] link 'Gmail' 8 | [187] link 'Images' 9 | [163] textarea 'Search' 10 | [236] button 'See more' 11 | """ 12 | 13 | example_output = '\n```\n{\n "action": "click",\n "action_input": "button",\n "element_id": "236",\n "description": "Now I\'m on Google\'s main page. I\'m now clicking the button with element_id [236] to see more information."\n}\n```' 14 | score_output = '\n```\n{\n "score": "10"\n,"description": "According to the previous trajectory, the current thought and the action performed are an important part of completing the target task, so it is very important, so I give 10 points"}\n```' 15 | 16 | d_v_planning_prompt_system = '''''' 17 | 18 | d_v_planning_prompt_user = "The question here is described as \"{{user_request}}\".\n\n" 19 | 20 | current_d_vision_reward_prompt_system = "You are an assistant to help navigate and operate the web page to achieve certain task.\n"\ 21 | "Your goal is to make an assessment of the action you are currently performing.\n There are key information you will get:\n"\ 22 | "1. You will get all previous trace including thoughts and actions for achieving the task.\n"\ 23 | "2. You will get current thought and action.\n"\ 24 | "3. You will get key information from current web page,such as accessibility tree.\n"\ 25 | "4. You will also obtain a screenshot of the web page\n"\ 26 | "Please judge whether executing this action is helpful for finishing the target task,and give this action a rating, from 1 to 10, give your points.\n"\ 27 | "Also, you should give the reason or description for giving this score.\n"\ 28 | f"Example output:{str(score_output)}\n" 29 | 30 | current_d_vision_reward_prompt_user = "The target task here is described as \"{{user_request}}\".\n\n"\ 31 | "The previous thought and action are:{{stringfy_previous_trace_output}}."\ 32 | "The current thought and action is: {{stringfy_current_trace_output}}.\n\nYou have done the current action\n\n" 33 | -------------------------------------------------------------------------------- /agent/Prompt/prompt_constructor.py: -------------------------------------------------------------------------------- 1 | from ..Utils.utils import is_valid_base64 2 | import json5 3 | 4 | from .vision_to_dom_prompts import VisionToDomPrompts 5 | from .dom_vision_disc_prompts import DomVisionDiscPrompts 6 | from .base_prompts import BasePrompts 7 | from .dom_vision_prompts import DomVisionPrompts 8 | from .vision_prompts import VisionPrompts 9 | from jinja2 import Template 10 | 11 | 12 | from agent.Memory.short_memory.history import HistoryMemory 13 | 14 | 15 | class BasePromptConstructor: 16 | def __init__(self): 17 | pass 18 | 19 | 20 | # Build a prompt for planning based on the DOM tree 21 | class PlanningPromptConstructor(BasePromptConstructor): 22 | def __init__(self): 23 | self.prompt_system = BasePrompts.planning_prompt_system 24 | self.prompt_user = BasePrompts.planning_prompt_user 25 | 26 | def construct( 27 | self, 28 | user_request: str, 29 | previous_trace: list, 30 | observation: str, 31 | feedback: str = "", 32 | 
status_description: str = "" 33 | ) -> list: 34 | self.prompt_user = Template(self.prompt_user).render( 35 | user_request=user_request) 36 | if len(previous_trace) > 0: 37 | self.prompt_user += HistoryMemory( 38 | previous_trace=previous_trace, reflection=status_description).construct_previous_trace_prompt() 39 | if status_description != "": 40 | self.prompt_user += \ 41 | f"Task completion description is {status_description}" 42 | if feedback != "": 43 | self.prompt_user += f"Here are some other things you need to know:\n {feedback}\n" 44 | self.prompt_user += f"\nHere is the accessibility tree that you should refer to for this task:\n{observation}" 45 | messages = [{"role": "system", "content": self.prompt_system}, { 46 | "role": "user", "content": self.prompt_user}] 47 | return messages 48 | 49 | # Previous thought, action and reflection are converted to formatted strings 50 | def stringfy_thought_and_action(self, input_list: list) -> str: 51 | input_list = json5.loads(input_list, encoding="utf-8") 52 | str_output = "[" 53 | for idx, i in enumerate(input_list): 54 | str_output += f'Step{idx + 1}:\"Thought: {i["thought"]}, Action: {i["action"]}, Reflection:{i["reflection"]}\";\n' 55 | str_output += "]" 56 | return str_output 57 | 58 | 59 | class VisionDisc2PromptConstructor(BasePromptConstructor): 60 | def __init__(self): 61 | super().__init__() 62 | self.prompt_system = DomVisionDiscPrompts.dom_vision_disc_prompt_system2 63 | self.prompt_user = DomVisionDiscPrompts.dom_vision_disc_planning_prompt_user 64 | 65 | def construct( 66 | self, 67 | user_request: str, 68 | base64_image: str 69 | ) -> list: 70 | rendered_prompt = Template(self.prompt_user).render( 71 | user_request=user_request) 72 | prompt_elements = [{"type": "text", "text": rendered_prompt}, 73 | {"type": "text", "text": "current web page screenshot is:"}, 74 | {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}] 75 | 76 | # Construct the final message payload 77 | messages = [{"role": "system", "content": self.prompt_system}, 78 | {"role": "user", "content": prompt_elements}] 79 | return messages 80 | 81 | 82 | class VisionDisc1PromptConstructor(BasePromptConstructor): 83 | def __init__(self): 84 | super().__init__() 85 | self.prompt_system = DomVisionDiscPrompts.dom_vision_disc_prompt_system1 86 | 87 | def construct( 88 | self, 89 | base64_image: str 90 | ) -> list: 91 | prompt_elements = [{"type": "text", "text": "current web page screenshot is:"}, 92 | {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}] 93 | 94 | # Construct the final message payload 95 | messages = [{"role": "system", "content": self.prompt_system}, 96 | {"role": "user", "content": prompt_elements}] 97 | return messages 98 | 99 | 100 | class ObservationVisionDiscPromptConstructor(BasePromptConstructor): 101 | def __init__(self): 102 | super().__init__() 103 | self.prompt_system = DomVisionDiscPrompts.dom_vision_disc_planning_prompt_system 104 | self.prompt_user = DomVisionDiscPrompts.dom_vision_disc_planning_prompt_user 105 | 106 | def construct( 107 | self, 108 | user_request: str, 109 | previous_trace: str, 110 | observation: str, 111 | feedback: str = "", 112 | status_description: str = "", 113 | vision_disc_response: str = "" 114 | ) -> list: 115 | self.prompt_user = Template(self.prompt_user).render( 116 | user_request=user_request) 117 | if len(previous_trace) > 0: 118 | self.prompt_user += HistoryMemory( 119 | previous_trace=previous_trace, 
reflection=status_description).construct_previous_trace_prompt() 120 | # if status_description != "": 121 | # self.prompt_user += \ 122 | # f"Task completion description is {status_description}" 123 | if feedback != "": 124 | self.prompt_user += f"An invalid action description is below:\n {feedback}\n" 125 | self.prompt_user += f"\nHere is the accessibility tree that you should refer to for this task:\n{observation}" 126 | if vision_disc_response: 127 | self.prompt_user += "\n\nHere is a visual analysis of the webpage's screenshot:\n" + \ 128 | vision_disc_response 129 | messages = [{"role": "system", "content": self.prompt_system}, 130 | {"role": "user", "content": self.prompt_user}] 131 | return messages 132 | 133 | # Convert previous thought and action into formatted string 134 | def stringfy_thought_and_action(self, input_list: list) -> str: 135 | input_list = json5.loads(input_list, encoding="utf-8") 136 | str_output = "[" 137 | for idx, i in enumerate(input_list): 138 | str_output += f'Step{idx + 1}:\"Thought: {i["thought"]}, Action: {i["action"]}\";\n' 139 | str_output += "]" 140 | return str_output 141 | 142 | 143 | class ObservationVisionActPromptConstructor(BasePromptConstructor): 144 | def __init__(self): 145 | super().__init__() 146 | self.prompt_system = VisionToDomPrompts.vision_act_planning_prompt_system 147 | self.prompt_user = VisionToDomPrompts.vision_act_planning_prompt_user 148 | 149 | def construct( 150 | self, 151 | user_request: str, 152 | previous_trace: str, 153 | observation_vision: str, 154 | feedback: str = "", 155 | status_description: str = "" 156 | ) -> list: 157 | rendered_prompt = Template(self.prompt_user).render( 158 | user_request=user_request) 159 | prompt_elements = [{"type": "text", "text": rendered_prompt}] 160 | if len(previous_trace) > 0: 161 | # history_memory = HistoryMemory(previous_trace=previous_trace) 162 | # trace_prompt = history_memory.construct_previous_trace_prompt() 163 | trace_prompt = HistoryMemory( 164 | previous_trace=previous_trace, reflection=status_description).construct_previous_trace_prompt() 165 | prompt_elements.append({"type": "text", "text": trace_prompt}) 166 | # if status_description != "": 167 | # prompt_elements.append({"type": "text", "text": f"Task completion description is {status_description}"}) 168 | if feedback != "": 169 | prompt_elements.append( 170 | {"type": "text", "text": f"An invalid action description is below:\n {feedback}\n"}) 171 | # prompt_elements.append({"type": "text", "text": f"The current webpage's URL is {url}"}) 172 | if observation_vision: 173 | prompt_elements.append( 174 | {"type": "text", "text": "The current webpage's screenshot is:"}) 175 | prompt_elements.append( 176 | {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{observation_vision}"}}) 177 | messages = [{"role": "system", "content": self.prompt_system}, 178 | {"role": "user", "content": prompt_elements}] 179 | # print(prompt_elements) 180 | print("messages finished!\n") 181 | return messages 182 | 183 | 184 | class VisionToDomPromptConstructor(BasePromptConstructor): 185 | def __init__(self): 186 | super().__init__() 187 | self.prompt_system = VisionToDomPrompts.vision_to_dom_planning_prompt_system 188 | self.prompt_user = "" # VisionToDomPrompts.vision_act_planning_prompt_user 189 | 190 | def construct( 191 | self, 192 | # user_request: str, 193 | target_element: str, 194 | action_description: str, 195 | observation: str 196 | ) -> list: 197 | # self.prompt_user = 
Template(self.prompt_user).render(user_request=user_request) 198 | self.prompt_user += f"Target Element Description: {target_element}\n" 199 | if action_description: 200 | self.prompt_user += f"Action Description: {action_description}\n" 201 | self.prompt_user += "\nHere is the accessibility tree that you should refer to for this task:\n" + observation 202 | messages = [{"role": "system", "content": self.prompt_system}, 203 | {"role": "user", "content": self.prompt_user}] 204 | return messages 205 | 206 | 207 | class D_VObservationPromptConstructor(BasePromptConstructor): 208 | def __init__(self): 209 | super().__init__() 210 | self.prompt_system = DomVisionPrompts.d_v_planning_prompt_system 211 | self.prompt_user = DomVisionPrompts.d_v_planning_prompt_user 212 | 213 | def construct( 214 | self, 215 | user_request: str, 216 | previous_trace: str, 217 | observation: str, 218 | observation_VforD: str, 219 | feedback: str = "", 220 | status_description: str = "" 221 | ) -> list: 222 | is_valid, message = is_valid_base64( 223 | observation_VforD) 224 | print("prompt_constructor.py D_VObservationPromptConstructor:", message, "\n") 225 | rendered_prompt = Template(self.prompt_user).render( 226 | user_request=user_request) 227 | prompt_elements = [{"type": "text", "text": rendered_prompt}] 228 | if len(previous_trace) > 0: 229 | # history_memory = HistoryMemory(previous_trace=previous_trace) 230 | trace_prompt = HistoryMemory( 231 | previous_trace=previous_trace, reflection=status_description).construct_previous_trace_prompt() 232 | # trace_prompt = history_memory.construct_previous_trace_prompt() 233 | prompt_elements.append({"type": "text", "text": trace_prompt}) 234 | # if status_description != "": 235 | # prompt_elements.append({"type": "text", "text": f"Task completion description is {status_description}"}) 236 | if feedback != "": 237 | prompt_elements.append( 238 | {"type": "text", "text": f"An invalid action description is below:\n {feedback}\n"}) 239 | prompt_elements.append( 240 | {"type": "text", "text": f"\nHere is the accessibility tree that you should refer to for this task:\n{observation}"}) 241 | prompt_elements.append( 242 | {"type": "text", "text": "current screenshot is:"}) 243 | print("len of prompt_elements before observation_VforD:", 244 | len(prompt_elements)) 245 | prompt_elements_str = json5.dumps(prompt_elements) 246 | print("len of prompt_elements_str before observation_VforD:", len( 247 | prompt_elements_str)) # This will print the length of prompt_elements converted into JSON string 248 | print("approximate gpt token count of prompt_elements_str before observation_VforD:", len( 249 | prompt_elements_str) / 5.42, "\n") 250 | prompt_elements.append( 251 | {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{observation_VforD}"}}) 252 | # Construct the final message payload 253 | messages = [{"role": "system", "content": self.prompt_system}, 254 | {"role": "user", "content": prompt_elements}] 255 | # print(prompt_elements) 256 | print("messages finished!\n") 257 | return messages 258 | 259 | # Convert previous thought and action into formatted string 260 | def stringfy_thought_and_action(self, input_list: list) -> str: 261 | input_list = json5.loads(input_list, encoding="utf-8") 262 | str_output = "[" 263 | for idx, i in enumerate(input_list): 264 | str_output += f'Step{idx + 1}:\"Thought: {i["thought"]}, Action: {i["action"]}\";\n' 265 | str_output += "]" 266 | return str_output 267 | 268 | 269 | class 
VisionObservationPromptConstructor(BasePromptConstructor): 270 | def __init__(self): 271 | self.prompt_system = VisionPrompts.vision_planning_prompt_system 272 | self.prompt_user = VisionPrompts.vision_prompt_user 273 | 274 | def construct(self, user_request: str, previous_trace: str, base64_image: str) -> list: 275 | rendered_prompt = Template(self.prompt_user).render( 276 | user_request=user_request) 277 | prompt_elements = [{"type": "text", "text": rendered_prompt}] 278 | 279 | if len(previous_trace) > 0: 280 | history_memory = HistoryMemory(previous_trace=[previous_trace]) 281 | trace_prompt = history_memory.construct_previous_trace_prompt() 282 | prompt_elements.append({"type": "text", "text": trace_prompt}) 283 | 284 | prompt_elements.append( 285 | {"type": "text", "text": "The current observation is:"}) 286 | prompt_elements.append( 287 | {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}) 288 | 289 | messages = [{"role": "system", "content": self.prompt_system}, 290 | {"role": "user", "content": prompt_elements}] 291 | return messages 292 | 293 | def stringfy_thought_and_action(self, input_list: list) -> str: 294 | input_list = json5.loads(input_list, encoding="utf-8") 295 | str_output = "[" 296 | for idx, i in enumerate(input_list): 297 | str_output += f'Step{idx + 1}:\"Thought: {i["thought"]}, Action: {i["action"]}\";\n' 298 | str_output += "]" 299 | return str_output 300 | 301 | 302 | class RewardPromptConstructor(BasePromptConstructor): 303 | def __init__(self): 304 | super().__init__() 305 | self.prompt_system = BasePrompts.global_reward_prompt_system 306 | self.prompt_user = BasePrompts.global_reward_prompt_user 307 | 308 | def construct( 309 | self, 310 | ground_truth_mode: str, 311 | global_reward_mode: str, 312 | user_request: str, 313 | stringfy_thought_and_action_output: str, 314 | observation: str, 315 | current_info=None, 316 | instruction: str = "" 317 | ) -> list: 318 | if ground_truth_mode: 319 | self.prompt_system = BasePrompts.global_reward_with_GroundTruth_prompt_system 320 | rendered_prompt = Template(self.prompt_user).render( 321 | user_request=user_request, stringfy_thought_and_action_output=stringfy_thought_and_action_output) 322 | prompt_elements = [{"type": "text", "text": rendered_prompt}] 323 | if 'current_url' in current_info: 324 | current_url = current_info.get('current_url', 'not available') 325 | prompt_elements.append( 326 | {"type": "text", "text": f"The current url is {current_url}"}) 327 | prompt_elements.append( 328 | {"type": "text", "text": f"Here is the current accessibility tree that you should refer to:\n{observation}"}) 329 | if "vision" in global_reward_mode: 330 | if "vision_reward" in current_info and current_info['vision_reward']: 331 | prompt_elements.append( 332 | {"type": "text", "text": "The current screenshot is:"}) 333 | prompt_elements.append( 334 | {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{current_info['vision_reward']}"}}) 335 | else: 336 | prompt_elements.append( 337 | {"type": "text", "text": "The current screenshot is not available."}) 338 | print("The current screenshot for vision reward is not available.") 339 | if ground_truth_mode: 340 | prompt_elements.append( 341 | {"type": "text", "text": f"Here is the Reference Guide for the target task:\n\n{instruction}"}) 342 | messages = [{"role": "system", "content": self.prompt_system}, 343 | {"role": "user", "content": prompt_elements}] 344 | return messages 345 | 346 | 347 | # Construct prompt for textual reward 
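# Illustrative usage of the reward prompt constructors (a sketch; `trace_str`, `current_str`
# and `obs_tree` are placeholder variables, not part of this module):
#
#     messages = CurrentRewardPromptConstructor().construct(
#         user_request="Sort REI 2-person tents by price",
#         stringfy_previous_trace_output=trace_str,
#         stringfy_current_trace_output=current_str,
#         observation=obs_tree,
#     )
#
# The returned `messages` is an OpenAI-style chat payload, i.e.
# [{"role": "system", ...}, {"role": "user", ...}], ready to pass to an LLM client.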
348 | class CurrentRewardPromptConstructor(BasePromptConstructor): 349 | def __init__(self): 350 | self.prompt_system = BasePrompts.current_reward_prompt_system 351 | self.prompt_user = BasePrompts.current_reward_prompt_user 352 | 353 | def construct( 354 | self, 355 | user_request: str, 356 | stringfy_previous_trace_output: str, 357 | stringfy_current_trace_output: str, 358 | observation: str 359 | ) -> list: 360 | self.prompt_user = Template(self.prompt_user).render( 361 | user_request=user_request, stringfy_previous_trace_output=stringfy_previous_trace_output, 362 | stringfy_current_trace_output=stringfy_current_trace_output) 363 | self.prompt_user += f"\nHere is the accessibility tree that you should refer to:\n{observation}" 364 | messages = [{"role": "system", "content": self.prompt_system}, { 365 | "role": "user", "content": self.prompt_user}] 366 | return messages 367 | 368 | 369 | # Construct prompt for vision reward 370 | class VisionRewardPromptConstructor(BasePromptConstructor): 371 | def __init__(self): 372 | self.prompt_system = DomVisionPrompts.current_d_vision_reward_prompt_system 373 | self.prompt_user = DomVisionPrompts.current_d_vision_reward_prompt_user 374 | 375 | def construct( 376 | self, 377 | user_request: str, 378 | stringfy_previous_trace_output: str, 379 | stringfy_current_trace_output: str, 380 | observation: str, 381 | observation_VforD: str 382 | ) -> list: 383 | if not is_valid_base64(observation_VforD): 384 | print("The observation_VforD provided is not a valid Base64 encoding") 385 | 386 | self.prompt_user = Template(self.prompt_user).render( 387 | user_request=user_request, stringfy_previous_trace_output=stringfy_previous_trace_output, 388 | stringfy_current_trace_output=stringfy_current_trace_output) 389 | self.prompt_user += f"the key information of current web page is: {observation}" 390 | prompt_elements = [{"type": "text", "text": self.prompt_user}] 391 | 392 | prompt_elements.append( 393 | {"type": "text", "text": "the screenshot of current web page is :"}) 394 | prompt_elements.append( 395 | {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{observation_VforD}"}}) 396 | 397 | messages = [{"role": "system", "content": self.prompt_system}, 398 | {"role": "user", "content": prompt_elements}] 399 | return messages 400 | 401 | 402 | # Build a prompt to determine whether the element is a search box (if so, the front end needs to add an additional return operation) 403 | class JudgeSearchbarPromptConstructor(BasePromptConstructor): 404 | def __init__(self): 405 | self.prompt_system = BasePrompts.judge_searchbar_prompt_system 406 | self.prompt_user = BasePrompts.judge_searchbar_prompt_user 407 | 408 | # Build a prompt to determine whether it is a search box, and output a format that can be parsed by openai 409 | # TODO decoded_result 410 | def construct(self, input_element, planning_response_action) -> list: 411 | self.prompt_user = Template(self.prompt_user).render(input_element=str( 412 | input_element), element_id=planning_response_action['element_id'], 413 | action_input=planning_response_action['action_input']) 414 | messages = [{"role": "system", "content": self.prompt_system}, { 415 | "role": "user", "content": self.prompt_user}] 416 | return messages 417 | 418 | 419 | class SemanticMatchPromptConstructor(BasePromptConstructor): 420 | def __init__(self): 421 | self.prompt_system = BasePrompts.semantic_match_prompt_system 422 | self.prompt_user = BasePrompts.semantic_match_prompt_user 423 | 424 | def construct(self, input_answer, 
semantic_method) -> list: 425 | self.prompt_user = Template(self.prompt_user).render( 426 | semantic_method=semantic_method, input_answer=input_answer) 427 | messages = [{"role": "system", "content": self.prompt_system}, { 428 | "role": "user", "content": self.prompt_user}] 429 | return messages 430 | -------------------------------------------------------------------------------- /agent/Prompt/vision_prompts.py: -------------------------------------------------------------------------------- 1 | class VisionPrompts: 2 | vision_planning_prompt_system = """""" 3 | 4 | vision_prompt_user = "The question here is described as \"{{user_request}}\".\n\n" 5 | -------------------------------------------------------------------------------- /agent/Prompt/vision_to_dom_prompts.py: -------------------------------------------------------------------------------- 1 | class VisionToDomPrompts: 2 | example_input = """ 3 | current web tab name is 'Google' 4 | [40] link 'About' 5 | [41] link 'Store' 6 | [186] link 'Gmail' 7 | [187] link 'Images' 8 | [163] textarea 'Search' 9 | [236] button 'See more' 10 | """ 11 | example_output = '\n```\n{\n "action": "click",\n "action_input": "button",\n "element_id": "236",\n "description": "Now I\'m on Google\'s main page. I\'m now clicking the button with element_id [236] to see more information."\n}\n```' 12 | 13 | vision_act_planning_prompt_system = '''''' 14 | 15 | vision_act_planning_prompt_user = "The question here is described as \"{{user_request}}\".\n\n" 16 | 17 | vision_to_dom_planning_prompt_system = '''''' 18 | 19 | -------------------------------------------------------------------------------- /agent/Reward/__init__.py: -------------------------------------------------------------------------------- 1 | from .global_reward import * -------------------------------------------------------------------------------- /agent/Reward/global_reward.py: -------------------------------------------------------------------------------- 1 | from ..Utils.utils import print_info, print_limited_json 2 | from agent.Prompt import * 3 | from agent.LLM import * 4 | from agent.Plan.action import * 5 | import time 6 | import json5 7 | from logs import logger 8 | 9 | 10 | class InteractionMode: 11 | def __init__(self, text_model=None, visual_model=None): 12 | self.text_model = text_model 13 | self.visual_model = visual_model 14 | 15 | async def get_global_reward(self, user_request, previous_trace, observation, current_info, ground_truth_mode, 16 | global_reward_mode, ground_truth_data=None, task_name_id=None): 17 | reward_response = None 18 | reward_input_token_count = 0 19 | reward_output_token_count = 0 20 | reward_token_count = [reward_input_token_count, reward_output_token_count] 21 | if len(previous_trace) > 0: 22 | stringfy_thought_and_action_output = PlanningPromptConstructor().stringfy_thought_and_action( 23 | previous_trace) 24 | if not ground_truth_mode: 25 | reward_request = RewardPromptConstructor().construct( 26 | ground_truth_mode=ground_truth_mode, 27 | global_reward_mode=global_reward_mode, 28 | user_request=user_request, 29 | stringfy_thought_and_action_output=stringfy_thought_and_action_output, 30 | observation=observation, 31 | current_info=current_info) 32 | elif ground_truth_mode: 33 | for item in ground_truth_data: 34 | if item.get("index") == task_name_id: 35 | instruction = item["instruction"] 36 | reward_request = RewardPromptConstructor().construct( 37 | ground_truth_mode=ground_truth_mode, 38 | global_reward_mode=global_reward_mode, 39 | 
user_request=user_request, 40 | stringfy_thought_and_action_output=stringfy_thought_and_action_output, 41 | observation=observation, 42 | current_info=current_info, 43 | instruction=instruction) 44 | break 45 | else: 46 | logger.info("Running reward modeling without human-labeled reference.") 47 | reward_request = RewardPromptConstructor().construct( 48 | ground_truth_mode="false", 49 | global_reward_mode=global_reward_mode, 50 | user_request=user_request, 51 | stringfy_thought_and_action_output=stringfy_thought_and_action_output, 52 | observation=observation, 53 | current_info=current_info) 54 | print_info( 55 | f"Global_Reward_Request:\n{print_limited_json(reward_request, limit=1000)}", "\033[32m") # green 56 | response_str = "" 57 | for i in range(3): 58 | try: 59 | if "vision" in global_reward_mode: 60 | # TODO 61 | response_str, error_message = await self.visual_model.request(reward_request) 62 | else: 63 | print_info( 64 | f"using gpt_global_reward_text: {self.text_model.model}", "purple") 65 | response_str, error_message = await self.text_model.request(reward_request) 66 | reward_response = ActionParser().extract_status_and_description( 67 | response_str) 68 | input_token_count = calculation_of_token(reward_request, model=self.text_model.model) 69 | output_token_count = calculation_of_token(response_str, model=self.text_model.model) 70 | reward_input_token_count += input_token_count 71 | reward_output_token_count += output_token_count 72 | reward_token_count = [reward_input_token_count, reward_output_token_count] 73 | break 74 | except Exception as e: 75 | logger.error(traceback.format_exc()) 76 | # traceback.print_exc() 77 | logger.info( 78 | f"planning response_str or reward_response error for {i+1} times") 79 | continue 80 | 81 | logger.info( 82 | f"\033[34mGlobal_response_str:\n{response_str}\033[34m") 83 | else: 84 | response_str = "" 85 | return response_str, reward_response, reward_token_count 86 | 87 | 88 | class GlobalReward: 89 | 90 | @staticmethod 91 | async def evaluate( 92 | config, 93 | model_name, 94 | user_request, 95 | previous_trace, 96 | observation, 97 | current_info, 98 | task_name_id, 99 | global_reward_mode, 100 | ground_truth_mode, 101 | ground_truth_data, 102 | ): 103 | 104 | gpt4v = GPTGenerator(model="gpt-4-turbo") 105 | 106 | all_json_models = config["model"]["json_models"] 107 | is_json_response = config["model"]["json_model_response"] 108 | 109 | llm_global_reward_text = create_llm_instance( 110 | model_name, is_json_response, all_json_models) 111 | 112 | _, reward_response, reward_token_count = await InteractionMode(text_model=llm_global_reward_text, visual_model=gpt4v).get_global_reward( 113 | user_request=user_request, previous_trace=previous_trace, observation=observation, 114 | current_info=current_info, ground_truth_mode=ground_truth_mode, global_reward_mode=global_reward_mode, 115 | ground_truth_data=ground_truth_data, task_name_id=task_name_id) 116 | description = reward_response.get( 117 | "description") if reward_response and reward_response.get("description") else "" 118 | return reward_response, description, reward_token_count 119 | -------------------------------------------------------------------------------- /agent/Tool/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_tools import * -------------------------------------------------------------------------------- /agent/Tool/base_tools.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/agent/Tool/base_tools.py -------------------------------------------------------------------------------- /agent/Utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | -------------------------------------------------------------------------------- /agent/Utils/utils.py: -------------------------------------------------------------------------------- 1 | import json5 2 | import base64 3 | # used for save_screenshot 4 | import os 5 | from PIL import Image 6 | from io import BytesIO 7 | from datetime import datetime 8 | # used for download_data and upload_result 9 | import requests 10 | import json 11 | 12 | # class Utility: 13 | 14 | # data utils 15 | 16 | 17 | def download_data(url, dest_path): 18 | response = requests.get(url) 19 | with open(dest_path, 'wb') as file: 20 | file.write(response.content) 21 | 22 | 23 | def upload_result(url, data): 24 | headers = {'Content-Type': 'application/json'} 25 | response = requests.post(url, data=json.dumps(data), headers=headers) 26 | return response.status_code, response.json() 27 | 28 | 29 | def save_json(data, file_path): 30 | with open(file_path, 'w') as json_file: 31 | json.dump(data, json_file, indent=4) 32 | 33 | 34 | def read_json_file(file_path): 35 | """ 36 | Read and parse a JSON file. 37 | 38 | Args: 39 | - file_path: str, the path of the JSON file. 40 | 41 | Returns: 42 | - Returns the parsed data on success. 43 | - Returns an error message on failure. 44 | """ 45 | try: 46 | with open(file_path, 'r', encoding='utf-8') as file: 47 | data = json5.load(file) 48 | return data 49 | except FileNotFoundError: 50 | return f"File not found: {file_path}" 51 | 52 | 53 | def save_screenshot(mode: str, record_time: str, task_name: str, step_number: int, description: str, 54 | screenshot_base64: str, task_name_id: str = None): 55 | 56 | timestamp = datetime.now().strftime('%Y%m%d-%H%M%S') 57 | invalid_chars = '<>:"/\\|?*' 58 | for char in invalid_chars: 59 | task_name = task_name.replace(char, '_') 60 | 61 | if task_name_id is None: 62 | task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name}' 63 | else: 64 | task_folder = f'results/screenshots/screenshots_{mode}_{record_time}/{task_name_id}_{task_name}' 65 | if not os.path.exists(task_folder): 66 | os.makedirs(task_folder) 67 | 68 | image_data = base64.b64decode(screenshot_base64) 69 | image = Image.open(BytesIO(image_data)) 70 | 71 | screenshot_filename = f'{task_folder}/Step{step_number}_{timestamp}_{description}.png' 72 | 73 | image.save(screenshot_filename) 74 | 75 | 76 | def print_limited_json(obj, limit=500, indent=0): 77 | """ 78 | """ 79 | spaces = ' ' * indent 80 | if isinstance(obj, dict): 81 | items = [] 82 | for k, v in obj.items(): 83 | formatted_value = print_limited_json(v, limit, indent + 4) 84 | items.append(f'{spaces} "{k}": {formatted_value}') 85 | return f'{spaces}{{\n' + ',\n'.join(items) + '\n' + spaces + '}' 86 | elif isinstance(obj, list): 87 | elements = [print_limited_json( 88 | element, limit, indent + 4) for element in obj] 89 | return f'{spaces}[\n' + ',\n'.join(elements) + '\n' + spaces + ']' 90 | else: 91 | truncated_str = str(obj)[:limit] + \ 92 | "..." 
if len(str(obj)) > limit else str(obj) 93 | return json5.dumps(truncated_str) 94 | # Usage within the class or externally: 95 | # result = print_limited_json(your_object) 96 | 97 | 98 | def print_info(info, color): 99 | if color == 'yellow': 100 | print(f"\033[33m{info}\033[0m") 101 | elif color == 'red': 102 | print(f"\033[31m{info}\033[0m") 103 | elif color == 'green': 104 | print(f"\033[32m{info}\033[0m") 105 | elif color == 'cyan': 106 | print(f"\033[36m{info}\033[0m") 107 | elif color == 'blue': 108 | print(f"\033[34m{info}\033[0m") 109 | elif color == 'purple': 110 | print(f"\033[35m{info}\033[0m") 111 | elif color == 'white': 112 | print(f"\033[37m{info}\033[0m") 113 | elif color == 'black': 114 | print(f"\033[30m{info}\033[0m") 115 | elif color == 'bold': 116 | print(f"\033[1m{info}\033[0m") 117 | elif color == 'underline': 118 | print(f"\033[4m{info}\033[0m") 119 | else: 120 | print(f"{color}{info}\033[0m")  # treat color as a raw ANSI escape prefix 121 | 122 | 123 | def is_valid_base64(s): 124 | """ 125 | Validate if a given string is a valid Base64 encoded string. 126 | 127 | :param s: String to be checked. 128 | :return: A tuple (bool, str) where the first element is True if the string is a valid Base64 encoded string, 129 | and the second element is a message indicating the result or the type of error. 130 | 131 | Usage: is_valid, message = is_valid_base64(s) 132 | This function is only used to determine whether a screenshot string is Base64 encoded. 133 | """ 134 | if s is None: 135 | return False, "The string is None." 136 | 137 | if not isinstance(s, str): 138 | return False, "The input is not a string." 139 | 140 | if len(s) == 0: 141 | return False, "The string is empty." 142 | 143 | try: 144 | base64.b64decode(s, validate=True) 145 | return True, "The string is a valid Base64 encoded string." 146 | except ValueError: 147 | return False, "The string is NOT a valid Base64 encoded string." 148 | 149 | 150 | def extract_longest_substring(s): 151 | start = s.find('{')  # Find the first occurrence of '{' 152 | end = s.rfind('}')  # Find the last occurrence of '}' 153 | # Check if '{' and '}' were found and if they are in the right order 154 | if start != -1 and end != -1 and end > start: 155 | return s[start:end + 1]  # Return the substring spanning the outermost braces 156 | else: 157 | return None  # Return None if no valid substring was found 158 | -------------------------------------------------------------------------------- /configs/setting.toml: -------------------------------------------------------------------------------- 1 | [basic] 2 | task_mode = "batch_tasks" # single_task or batch_tasks 3 | max_time_step = 25 # For all tasks, set the maximum step length 4 | 5 | [model] 6 | json_model_response = false # Whether to require the model to strictly output JSON format; currently only OpenAI models are supported.
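# The list below names the OpenAI models that can honor strict JSON output; validate_config in evaluate.py rejects json_model_response = true unless both the planning model and the global-reward model appear in it.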
7 | json_models = ["gpt-4-turbo", 8 | "gpt-4-turbo-2024-04-09", 9 | "gpt-4-0125-preview", 10 | "gpt-4-1106-preview", 11 | "gpt-3.5-turbo", 12 | "gpt-3.5-turbo-0125", 13 | "gpt-4o-2024-05-13", 14 | "gpt-4o-mini-2024-07-18"] 15 | 16 | 17 | [steps] 18 | interaction_mode = true # Whether human control of task execution status is required 19 | single_task_action_step = 10 20 | batch_tasks_max_action_step = 10 21 | batch_tasks_condition_step_increase = 5 22 | 23 | [files] 24 | batch_tasks_file_path = "./data/example/mind2web-live_test_20241024.json" # The input data path 25 | ground_truth_file_path = "./data/human_labeled_reward_reference/GT_instructions_202404161811_for_all_data_0328.json" # the ground_truth data path 26 | out_file_path = "./batch_tasks_results/example" # YOUR OUT FILE PATH 27 | 28 | [conditions] 29 | URL = ["error"] 30 | 31 | [token_pricing] 32 | pricing_models = [ 33 | "gpt-4o", 34 | "gpt-4o-2024-05-13", 35 | "gpt-4o-mini", 36 | "gpt-4o-mini-2024-07-18", 37 | "gpt-4-turbo", 38 | "gpt-4-turbo-2024-04-09", 39 | "gpt-4", 40 | "gpt-4-32k", 41 | "gpt-4-0125-preview", 42 | "gpt-4-1106-preview", 43 | "gpt-4-vision-preview", 44 | "gpt-3.5-turbo-0125", 45 | "gpt-3.5-turbo-1106"] 46 | 47 | # The price of each model for input and output, the unit is $/token 48 | # The name of input token price: model_name + "_input_price", such as gpt-4o_input_price 49 | # The name of output token price: model_name + "_output_price", such as gpt-4o_output_price 50 | gpt-4o_input_price = 0.000005 51 | gpt-4o_output_price = 0.000015 52 | gpt-4o-2024-05-13_input_price = 0.000005 53 | gpt-4o-2024-05-13_output_price = 0.000015 54 | gpt-4o-mini_input_price = 0.00000015 55 | gpt-4o-mini_output_price = 0.0000006 56 | gpt-4o-mini-2024-07-18_input_price = 0.00000015 57 | gpt-4o-mini-2024-07-18_output_price = 0.0000006 58 | gpt-4-turbo_input_price = 0.00001 59 | gpt-4-turbo_output_price = 0.00003 60 | gpt-4-turbo-2024-04-09_input_price = 0.00001 61 | gpt-4-turbo-2024-04-09_output_price = 0.00003 62 | gpt-4_input_price = 0.00003 63 | gpt-4_output_price = 0.00006 64 | gpt-4-32k_input_price = 0.00006 65 | gpt-4-32k_output_price = 0.00012 66 | gpt-4-0125-preview_input_price = 0.00001 67 | gpt-4-0125-preview_output_price = 0.00003 68 | gpt-4-1106-preview_input_price = 0.00001 69 | gpt-4-1106-preview_output_price = 0.00003 70 | gpt-4-vision-preview_input_price = 0.00001 71 | gpt-4-vision-preview_output_price = 0.00003 72 | gpt-3.5-turbo-0125_input_price = 0.0000005 73 | gpt-3.5-turbo-0125_output_price = 0.0000015 74 | gpt-3.5-turbo-1106_input_price = 0.000001 75 | gpt-3.5-turbo-1106_output_price = 0.000002 76 | -------------------------------------------------------------------------------- /data/dataset_io.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import zipfile 4 | from requests_toolbelt.multipart.encoder import MultipartEncoder 5 | import json 6 | import requests 7 | 8 | class GraphQLClient: 9 | def __init__(self): 10 | self.auth_token = None 11 | self.url = os.getenv('GRAPHQL_URL', 'https://www.imean.ai/graphql') 12 | self.username = os.getenv('GRAPHQL_USERNAME') 13 | self.password = os.getenv('GRAPHQL_PASSWORD') 14 | self.headers = {} 15 | self._validate_credentials() 16 | 17 | def _validate_credentials(self): 18 | if not self.username or not self.password: 19 | raise ValueError("Username or password is not set.") 20 | if len(self.username.strip()) < 1 or len(self.password.strip()) < 6: 21 | raise ValueError("Username or password is 
not valid.") 22 | 23 | def login(self): 24 | headers = self.headers 25 | 26 | payload = { 27 | "query": "mutation PwdLogin($password: String!, $username: String!) {\n pwdLogin(password: $password, username: $username)\n}", 28 | "variables": { 29 | "password": self.password, 30 | "username": self.username 31 | }, 32 | "operationName": "PwdLogin" 33 | } 34 | 35 | response = requests.post(self.url, headers=headers, json=payload) 36 | 37 | if response.status_code == 200: 38 | response_json = response.json() 39 | print(response_json) 40 | if 'data' in response_json and 'pwdLogin' in response_json['data']: 41 | self.auth_token = response_json['data']['pwdLogin'] 42 | print(f"Auth token: {self.auth_token}") 43 | else: 44 | raise Exception(f"Unexpected response format: {response_json}") 45 | else: 46 | raise Exception(f"Failed to login and retrieve auth token: {response.status_code} {response.text}") 47 | 48 | def get_file_url(self, file_path): 49 | self._validate_file_path(file_path) 50 | 51 | headers = { 52 | 'authorization': self.auth_token, 53 | 'origin': 'https://studio.apollographql.com', 54 | 'referer': 'https://studio.apollographql.com/sandbox/explorer', 55 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36', 56 | } 57 | 58 | multipart_data = MultipartEncoder( 59 | fields={ 60 | 'operations': '{"query":"mutation Upload($file: Upload!) {\\n upload(file: $file)\\n}","operationName":"Upload","variables":{"file":null}}', 61 | 'map': '{"0":["variables.file"]}', 62 | '0': ('file', open(file_path, 'rb'), 'application/zip') 63 | } 64 | ) 65 | 66 | headers['Content-Type'] = multipart_data.content_type 67 | 68 | response = requests.post(self.url, headers=headers, data=multipart_data) 69 | 70 | if response.status_code == 200: 71 | file_url = response.json()['data']['upload'] 72 | print('file_url:', file_url) 73 | return file_url 74 | else: 75 | raise Exception(f"Failed to upload file: {response.status_code} {response.text}") 76 | 77 | def upload_file(self, name, base_model, file_path, challenge_id): 78 | if not self.auth_token: 79 | raise Exception("Authorization token not available. Please login first.") 80 | 81 | self._validate_file_path(file_path) 82 | self._validate_other_params(name, base_model, challenge_id) 83 | 84 | file_url = self.get_file_url(file_path) 85 | 86 | headers = { 87 | 'accept': '*/*', 88 | 'authorization': self.auth_token, 89 | 'content-type': 'application/json' 90 | } 91 | 92 | data = { 93 | "query": """ 94 | mutation CreateAgent($input: CreateAgentInput!) { 95 | createAgent(input: $input) { 96 | id 97 | name 98 | resultUrl 99 | baseModel 100 | } 101 | } 102 | """, 103 | "variables": { 104 | "input": { 105 | "name": name, 106 | "baseModel": base_model, 107 | "resultUrl": file_url, 108 | "challengeId": challenge_id 109 | } 110 | } 111 | } 112 | 113 | response = requests.post(self.url, headers=headers, json=data) 114 | 115 | if response.status_code == 200: 116 | print('Upload successful:', response.json()) 117 | else: 118 | print('Upload failed:', response.status_code, response.text) 119 | 120 | def export_atom_flows(self, challenge_id, save_path): 121 | self.headers['authorization'] = self.auth_token 122 | self._validate_other_params(None, None, challenge_id) 123 | self._validate_save_path(save_path) 124 | 125 | data = { 126 | "query": "mutation AdminExportAtomFlowsOfChallenge($challengeId: String!) 
{ " 127 | "adminExportAtomFlowsOfChallenge(challengeId: $challengeId) }", 128 | "variables": {"challengeId": challenge_id}, 129 | "operationName": "AdminExportAtomFlowsOfChallenge" 130 | } 131 | response = requests.post(self.url, headers=self.headers, json=data) 132 | 133 | if response.status_code == 200: 134 | response_data = response.json() 135 | print(response_data) 136 | if 'data' in response_data and 'adminExportAtomFlowsOfChallenge' in response_data['data']: 137 | zip_file_url = response_data['data']['adminExportAtomFlowsOfChallenge'] 138 | self.download_and_extract_zip_file(zip_file_url, save_path) 139 | else: 140 | raise Exception(f"Failed to export atom flows: {response.status_code} {response.text}") 141 | 142 | @staticmethod 143 | def download_and_extract_zip_file(url, save_path): 144 | response = requests.get(url) 145 | if response.status_code == 200: 146 | zip_path = save_path + ".zip" 147 | with open(zip_path, 'wb') as f: 148 | f.write(response.content) 149 | # Unzip file 150 | with zipfile.ZipFile(zip_path, 'r') as zip_ref: 151 | zip_ref.extractall(save_path) 152 | # Delete zip file 153 | os.remove(zip_path) 154 | print(f"File downloaded and extracted to {save_path}") 155 | else: 156 | raise Exception(f"Failed to download ZIP file: {response.status_code} {response.text}") 157 | 158 | @staticmethod 159 | def _validate_file_path(file_path): 160 | if not os.path.isfile(file_path): 161 | raise ValueError(f"The file path {file_path} does not exist or is not a file.") 162 | 163 | @staticmethod 164 | def _validate_save_path(save_path): 165 | if not os.path.isdir(save_path): 166 | raise ValueError(f"The save path {save_path} does not exist or is not a directory.") 167 | 168 | @staticmethod 169 | def _validate_other_params(name, base_model, challenge_id): 170 | if name and (len(name) < 3 or len(name) > 100): 171 | raise ValueError("Name must be between 3 and 100 characters long.") 172 | if base_model and (len(base_model) < 3 or len(base_model) > 100): 173 | raise ValueError("Base model must be between 3 and 100 characters long.") 174 | if not challenge_id or len(challenge_id) < 3: 175 | raise ValueError("Challenge ID must be at least 3 characters long.") 176 | 177 | if __name__ == "__main__": 178 | parser = argparse.ArgumentParser(description="GraphQL Client for iMean.ai") 179 | subparsers = parser.add_subparsers(dest="command", help="Sub-command help") 180 | 181 | # Subparser for the upload command 182 | parser_upload = subparsers.add_parser("upload", help="Upload a file") 183 | parser_upload.add_argument("--file-path", required=True, help="Path to the file to be uploaded") 184 | parser_upload.add_argument("--challenge-id", required=True, help="Challenge ID for the upload") 185 | parser_upload.add_argument("--name", required=True, help="Name for the upload") 186 | parser_upload.add_argument("--base-model", required=True, help="Base model information for the upload") 187 | 188 | # Subparser for the download command 189 | parser_download = subparsers.add_parser("download", help="Download atom flows") 190 | parser_download.add_argument("--challenge-id", required=True, help="Challenge ID for the download") 191 | parser_download.add_argument("--save-path", required=True, help="Path to save the downloaded file") 192 | 193 | args = parser.parse_args() 194 | 195 | client = GraphQLClient() 196 | client.login() 197 | 198 | if args.command == "upload": 199 | client.upload_file(args.name, args.base_model, args.file_path, args.challenge_id) 200 | elif args.command == "download": 201 | 
client.export_atom_flows(args.challenge_id, args.save_path) 202 | 203 | 204 | -------------------------------------------------------------------------------- /data/example/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/data/example/.DS_Store -------------------------------------------------------------------------------- /data/raw_data_processor.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import re 3 | from urllib.parse import unquote, parse_qs, urlparse 4 | import json5 5 | import ujson as json 6 | 7 | def is_url(string): 8 | parsed = urlparse(string) 9 | return bool(parsed.scheme) and bool(parsed.netloc) 10 | 11 | def process_file(input_file, output_file): 12 | with open(input_file, "r", encoding="utf-8") as f: 13 | json_file = json.load(f) 14 | 15 | print("JSON file loaded") 16 | 17 | output = [] 18 | 19 | for index, task in enumerate(json_file): 20 | task_name = task["title"] 21 | evaluation = [] 22 | print("task_name:", task_name) 23 | steps = task["steps"] 24 | reference_steps = len(steps) 25 | print("reference_steps:", reference_steps) 26 | for step in steps: 27 | if "rewardFunction" in step.keys() and len(step["rewardFunction"]) > 0: 28 | 29 | # hack: put url in description in href 30 | if "description" in step.keys() and is_url(step["description"]): 31 | step["href"] = step["description"] 32 | 33 | # hack: combine element value and element path 34 | flag_value = False 35 | flag_path = False 36 | for func in step["rewardFunction"]: 37 | if "element_value" in func["name"]: 38 | flag_value = True 39 | if "element_path" in func["name"]: 40 | flag_path = True 41 | if flag_value and flag_path: 42 | for idx, func in enumerate(step["rewardFunction"]): 43 | if "element_value" in func["name"]: 44 | func["name"] = f'{func["name"]}_path' 45 | if "element_path" in func["name"]: 46 | del_idx = idx 47 | del step["rewardFunction"][del_idx] 48 | for func in step["rewardFunction"]: 49 | if len(func) == 0: 50 | break 51 | temp = {} 52 | temp["match_function_name"] = func["name"] 53 | # element match 54 | if "element" in temp["match_function_name"]: 55 | url = urlparse(step["href"]) 56 | if url.netloc.startswith("www"): 57 | netloc = re.findall(".*?\.(.*?)\..*?", url.netloc)[0] 58 | else: 59 | netloc = re.findall("(.*?)\..*?", url.netloc)[0] 60 | 61 | # element path match 62 | if "element_path_exact" in temp["match_function_name"]: 63 | temp["method"] = "selector" 64 | temp["content"] = { 65 | "reference_answer": step["selector"], "netloc": netloc, "url": step["href"] 66 | } 67 | 68 | # element value match 69 | elif "element_value_exact" in temp["match_function_name"]: 70 | if "path" in temp["match_function_name"]: 71 | temp["match_function_name"] = temp["match_function_name"].replace("_path", "") 72 | temp["content"] = { 73 | "reference_answer": step["value"], "netloc": netloc, "path": step["selector"], "url": step["href"] 74 | } 75 | else: 76 | temp["content"] = { 77 | "reference_answer": step["value"], "netloc": netloc, "url": step["href"] 78 | } 79 | elif "element_value_include" in temp["match_function_name"]: 80 | if "path" in temp["match_function_name"]: 81 | temp["match_function_name"] = temp["match_function_name"].replace("_path", "") 82 | temp["content"] = { 83 | "reference_answer": func["required"], "netloc": netloc, "path": step["selector"], "url": step["href"] 84 | } 85 | else: 86 | 
temp["content"] = { 87 | "reference_answer": func["required"], "netloc": netloc, "url": step["href"] 88 | } 89 | elif "element_value_semantic" in temp["match_function_name"]: 90 | if "path" in temp["match_function_name"]: 91 | temp["match_function_name"] = temp["match_function_name"].replace("_path", "") 92 | temp["content"] = { 93 | "reference_answer": func["optional"], "netloc": netloc, "path": step["selector"], "url": step["href"] 94 | } 95 | else: 96 | temp["content"] = { 97 | "reference_answer": func["optional"], "netloc": netloc, "url": step["href"] 98 | } 99 | 100 | # url match 101 | elif "url_include" in temp["match_function_name"]: 102 | key = func["key"] if "key" in func.keys() else "" 103 | temp["content"] = { 104 | "key": unquote(key), 105 | "reference_answer": unquote(func["required"]), 106 | "url": step["href"] 107 | } 108 | elif "url_exact" in temp["match_function_name"]: 109 | key = func["key"] if "key" in func.keys() else "" 110 | if "optional" in func.keys(): 111 | reference_answer = func["optional"] 112 | elif len(key) > 0: 113 | try: 114 | parsed_url = urlparse(step["href"]) 115 | url_params = parse_qs(parsed_url.query) 116 | reference_answer = url_params[unquote(key)][0] 117 | except: 118 | print("\nError in parsing URL!") 119 | print("key to be parsed: ", key) 120 | print("recorded url: ", step["href"]) 121 | input("\nPress Enter to ignore and continue processing.") 122 | else: 123 | reference_answer = step["href"] 124 | key = unquote(key) 125 | reference_answer = unquote(reference_answer) 126 | 127 | temp["content"] = { 128 | "key": key, 129 | "reference_answer": reference_answer, 130 | "url": step["href"] 131 | } 132 | elif "url_semantic" in temp["match_function_name"]: 133 | key = func["key"] if "key" in func.keys() else "" 134 | temp["content"] = { 135 | "key": key, 136 | "reference_answer": func["optional"], 137 | "url": step["href"] 138 | } 139 | key = unquote(key) 140 | elif "cache_data_exact" in temp["match_function_name"]: 141 | temp["content"] = { 142 | "reference_answer": step["value"], 143 | "url": step["href"] 144 | } 145 | elif "cache_data_include" in temp["match_function_name"]: 146 | temp["content"] = { 147 | "reference_answer": unquote(func["required"]), 148 | "url": step["href"] 149 | } 150 | elif "cache_data_semantic" in temp["match_function_name"]: 151 | temp["content"] = { 152 | "reference_answer": unquote(func["optional"]), 153 | "url": step["href"] 154 | } 155 | elif "final_answer_exact" in temp["match_function_name"]: 156 | temp["content"] = { 157 | "reference_answer": step["value"], 158 | "url": step["href"] 159 | } 160 | elif "final_answer_semantic" in temp["match_function_name"]: 161 | temp["content"] = { 162 | "reference_answer": unquote(func["optional"]), 163 | "url": step["href"] 164 | } 165 | elif "final_answer_include" in temp["match_function_name"]: 166 | temp["content"] = { 167 | "reference_answer": unquote(func["required"]), 168 | "url": step["href"] 169 | } 170 | else: 171 | print("*" * 50, "\n", "other match function, coming soon!") 172 | evaluation.append(temp) 173 | output.append({ 174 | "index": index, 175 | "task": task_name, 176 | "reference_task_length": reference_steps, 177 | "evaluation": evaluation 178 | }) 179 | 180 | with open(output_file, "w", encoding="utf-8") as f_out: 181 | json5.dump(output, fp=f_out, ensure_ascii=False, indent=4, quote_keys=True, trailing_commas=False) 182 | 183 | def main(): 184 | parser = argparse.ArgumentParser(description="Process JSON file and generate output.") 185 | 
parser.add_argument("--input-file", required=True, help="Input JSON file") 186 | parser.add_argument("--output-file", required=True, help="Output JSON file") 187 | args = parser.parse_args() 188 | 189 | process_file(args.input_file, args.output_file) 190 | 191 | if __name__ == "__main__": 192 | main() 193 | 194 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from agent.Environment.html_env.async_env import AsyncHTMLEnvironment 2 | from evaluate import * 3 | from agent.Plan import * 4 | from dataclasses import dataclass 5 | 6 | import re 7 | import asyncio 8 | import argparse 9 | import logging 10 | 11 | # universal tools 12 | from agent.Utils.utils import * 13 | # evaluate tools 14 | from evaluate.evaluate_utils import run_task, read_config, read_file 15 | from agent.Utils.utils import read_json_file 16 | from experiment_results import get_evaluate_result 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | from agent.LLM.token_utils import is_model_supported 21 | 22 | 23 | @dataclass 24 | class ExperimentConfig: 25 | mode: str 26 | global_reward_mode: str 27 | planning_text_model: str 28 | global_reward_text_model: str 29 | ground_truth_mode: bool 30 | single_task_name: str 31 | config: dict 32 | ground_truth_data: dict 33 | write_result_file_path: str 34 | record_time: str 35 | file: list 36 | 37 | 38 | def validate_config(config, observation_mode, global_reward_mode, observation_model, global_reward_model): 39 | task_mode = config['basic']['task_mode'] 40 | batch_tasks_file_path = config['files']['batch_tasks_file_path'] 41 | json_model_response = config['model']['json_model_response'] 42 | all_json_models = config['model']['json_models'] 43 | interaction_mode = config['steps']['interaction_mode'] 44 | 45 | if observation_mode not in ["dom"]: 46 | logger.error( 47 | "observation mode is not correctly defined! Currently we only support DOM observation.") 48 | exit() 49 | 50 | if interaction_mode not in [True, False]: 51 | logger.error( 52 | "interaction_mode is not defined! 
Try to define whether you want to evaluate the agent in an interactive manner.") 53 | exit() 54 | 55 | if json_model_response and (observation_model not in all_json_models or ( 56 | global_reward_mode != 'no_global_reward' and global_reward_model not in all_json_models)): 57 | logger.error("Model does not support JSON mode!") 58 | exit() 59 | 60 | if task_mode == 'batch_tasks' and not os.path.exists(batch_tasks_file_path): 61 | logger.error("batch_tasks_file_path not exist!") 62 | exit() 63 | 64 | 65 | def get_task_range(task_mode, file, raw_data_index): 66 | if task_mode == "batch_tasks": 67 | if raw_data_index != -1: 68 | re_result = re.split(r'\s|,', raw_data_index) 69 | raw_data_start_index = int(re_result[0]) 70 | raw_data_end_index = int(re_result[-1]) + 1 71 | else: 72 | raw_data_start_index = 0 73 | raw_data_end_index = len(file) 74 | return range(raw_data_start_index, raw_data_end_index) 75 | elif task_mode == "single_task": 76 | return range(0, 1) 77 | else: 78 | logger.error("task_mode error!") 79 | exit() 80 | 81 | 82 | def log_task_info(task_index, task_name, reference_task_length, reference_evaluate_steps): 83 | logger.info("*" * 100) 84 | logger.info(f"task index: {task_index}") 85 | logger.info(f"task name: {task_name}") 86 | logger.info(f"task reference length: {reference_task_length}") 87 | logger.info(f"raw data annotation: {reference_evaluate_steps}") 88 | 89 | 90 | def generate_result_file_path(config): 91 | return os.path.join(config["files"]["out_file_path"], "json_result") 92 | 93 | 94 | def load_ground_truth_data(config, ground_truth_mode): 95 | if ground_truth_mode: 96 | ground_truth_file_path = config['files']['ground_truth_file_path'] 97 | if not os.path.exists(ground_truth_file_path): 98 | logger.error("ground_truth_file_path not exist!") 99 | exit() 100 | return read_json_file(ground_truth_file_path) 101 | return None 102 | 103 | 104 | def create_html_environment(mode): 105 | return AsyncHTMLEnvironment( 106 | mode=mode, 107 | max_page_length=8192, 108 | headless=False, 109 | slow_mo=1000, 110 | current_viewport_only=False, 111 | viewport_size={"width": 1080, "height": 720}, 112 | save_trace_enabled=False, 113 | sleep_after_execution=0.0, 114 | locale="en-US", 115 | use_vimium_effect=True 116 | ) 117 | 118 | 119 | async def run_experiment(task_range, experiment_config): 120 | for task_index in task_range: 121 | task_uuid = None 122 | if experiment_config.config['basic']['task_mode'] == "batch_tasks": 123 | task = experiment_config.file[task_index] 124 | task_name, task_uuid, reference_task_length, reference_evaluate_steps = task 125 | evaluate_steps = reference_evaluate_steps 126 | log_task_info(task_index, task_name, 127 | reference_task_length, reference_evaluate_steps) 128 | elif experiment_config.config['basic']['task_mode'] == "single_task": 129 | task_name = experiment_config.single_task_name 130 | reference_task_length = experiment_config.config['steps']['single_task_action_step'] 131 | # TODO 132 | evaluate_steps = experiment_config.config['steps']['single_task_action_step'] 133 | reference_evaluate_steps = None 134 | logger.info(f"task_name: {task_name}") 135 | 136 | env = create_html_environment(experiment_config.mode) 137 | if is_model_supported(experiment_config.planning_text_model) and is_model_supported( 138 | experiment_config.global_reward_text_model): 139 | if not os.path.exists("token_results"): 140 | os.makedirs("token_results") 141 | token_counts_filename = 
f"token_results/token_counts_{experiment_config.record_time}_{experiment_config.planning_text_model}_{experiment_config.global_reward_text_model}.json" 142 | 143 | await run_task(mode=experiment_config.mode, 144 | task_mode=experiment_config.config['basic']['task_mode'], 145 | task_name=task_name, 146 | task_uuid=task_uuid, 147 | config=experiment_config.config, 148 | write_result_file_path=experiment_config.write_result_file_path, 149 | reference_task_length=reference_task_length, 150 | evaluate_steps=evaluate_steps, 151 | reference_evaluate_steps=reference_evaluate_steps, 152 | env=env, 153 | global_reward_mode=experiment_config.global_reward_mode, 154 | global_reward_text_model=experiment_config.global_reward_text_model, 155 | planning_text_model=experiment_config.planning_text_model, 156 | ground_truth_mode=experiment_config.ground_truth_mode, 157 | ground_truth_data=experiment_config.ground_truth_data, 158 | interaction_mode=experiment_config.config['steps']['interaction_mode'], 159 | task_index=task_index, 160 | record_time=experiment_config.record_time, 161 | token_pricing=experiment_config.config['token_pricing']) 162 | 163 | await env.close() 164 | del env 165 | if is_model_supported(experiment_config.planning_text_model) and is_model_supported(experiment_config.global_reward_text_model): 166 | with open(token_counts_filename, 'r') as file: 167 | data = json.load(file) 168 | total_token_cost = data.get("total_token_cost", 0) 169 | 170 | get_evaluate_result(experiment_config.config["files"]["out_file_path"], total_token_cost) 171 | logger.info('\033[31mAll tasks finished!\033[0m') 172 | logger.info('\033[31mPress Enter to exit...\033[0m') 173 | 174 | 175 | async def main(global_reward_mode="no_global_reward", 176 | planning_text_model="gpt-4-turbo", 177 | global_reward_text_model="gpt-4-turbo", 178 | single_task_name="", 179 | raw_data_index=-1, 180 | observation_mode="dom", 181 | ground_truth_mode=False, 182 | toml_path=None 183 | ): 184 | config = read_config(toml_path) 185 | validate_config(config, observation_mode, global_reward_mode, planning_text_model, global_reward_text_model) 186 | 187 | file = None 188 | if config['basic']['task_mode'] == "batch_tasks": 189 | file = read_file(file_path=config['files']['batch_tasks_file_path']) 190 | task_range = get_task_range( 191 | config['basic']['task_mode'], file, raw_data_index) 192 | elif config['basic']['task_mode'] == "single_task": 193 | task_range = get_task_range(config['basic']['task_mode'], None, -1) 194 | 195 | record_time = time.strftime("%Y%m%d-%H%M%S", time.localtime()) 196 | write_result_file_path = generate_result_file_path(config) 197 | ground_truth_data = load_ground_truth_data(config, ground_truth_mode) 198 | 199 | experiment_config = ExperimentConfig( 200 | mode=observation_mode, 201 | global_reward_mode=global_reward_mode, 202 | planning_text_model=planning_text_model, 203 | global_reward_text_model=global_reward_text_model, 204 | ground_truth_mode=ground_truth_mode, 205 | single_task_name=single_task_name, 206 | config=config, 207 | ground_truth_data=ground_truth_data, 208 | write_result_file_path=write_result_file_path, 209 | record_time=record_time, 210 | file=file 211 | ) 212 | 213 | await run_experiment(task_range, experiment_config) 214 | 215 | 216 | if __name__ == "__main__": 217 | parser = argparse.ArgumentParser( 218 | description="Run the web agent in different modes.") 219 | parser.add_argument("--global_reward_mode", 220 | choices=["dom_vision_reward", "dom_reward", 221 | "vision_reward", 
"no_global_reward"], 222 | default="no_global_reward", help="Choose the mode of global reward.") 223 | parser.add_argument("--index", type=str, default=-1) 224 | parser.add_argument("--single_task_name", type=str, 225 | default="Find Dota 2 game and add all DLC to cart in steam.") 226 | parser.add_argument("--planning_text_model", type=str, default="gpt-4o-mini") 227 | parser.add_argument("--global_reward_text_model", type=str, default="gpt-4o-mini") 228 | 229 | args = parser.parse_args() 230 | 231 | asyncio.run(main(global_reward_mode=args.global_reward_mode, 232 | planning_text_model=args.planning_text_model, 233 | global_reward_text_model=args.global_reward_text_model, 234 | single_task_name=args.single_task_name, 235 | raw_data_index=args.index 236 | ) 237 | ) 238 | -------------------------------------------------------------------------------- /evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .step_score import * 2 | from .task_score import * 3 | from .evaluate_utils import * 4 | from .step_score_js import * -------------------------------------------------------------------------------- /evaluate/step_score.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import parse_qs, urlparse, unquote 3 | from bs4 import BeautifulSoup 4 | 5 | import requests 6 | from lxml import html 7 | from agent.LLM import * 8 | from agent.Prompt import * 9 | from agent.Environment.html_env.utils import MapTagNameList 10 | 11 | 12 | class StepEvaluator(): 13 | def __init__(self): 14 | pass 15 | 16 | 17 | class URLEvaluator(StepEvaluator): 18 | '''URL Evaluation Scoring''' 19 | @staticmethod 20 | def url_exact_match(input_url, reference_answer, key=False): 21 | if key: 22 | try: 23 | parsed_url = urlparse(input_url) 24 | url_params = parse_qs(parsed_url.query) 25 | input_answer = url_params[key][0] 26 | except: 27 | return 0 28 | else: 29 | input_answer = input_url 30 | input_answer = unquote(input_answer) 31 | result_score = MatchFunction.exact_match( 32 | input_answer, reference_answer) 33 | # if result_score == 1: 34 | # print("url_exactly_match:", input_answer) 35 | return result_score 36 | 37 | @staticmethod 38 | def url_include_match(input_url, reference_answer, key=None): 39 | # print(input_url, reference_answer) 40 | if key: 41 | try: 42 | parsed_url = urlparse(input_url) 43 | url_params = parse_qs(parsed_url.query) 44 | input_answer = url_params[key][0] 45 | except: 46 | return 0 47 | else: 48 | try: 49 | parsed_url = urlparse(input_url) 50 | input_answer = parsed_url.netloc + parsed_url.path 51 | if parsed_url.fragment is not None and (parsed_url.fragment): 52 | input_answer += "#" + parsed_url.fragment 53 | except: 54 | input_answer = input_url 55 | input_answer = unquote(input_answer) 56 | result_score = MatchFunction.include_match( 57 | input_answer, reference_answer) 58 | # print("score:", result_score, input_answer) 59 | return result_score 60 | 61 | @staticmethod 62 | async def url_semantic_match(input_url, semantic_method, key=False): 63 | if key: 64 | try: 65 | parsed_url = urlparse(input_url) 66 | url_params = parse_qs(parsed_url.query) 67 | input_answer = url_params[key][0] 68 | except: 69 | return 0 70 | else: 71 | input_answer = input_url 72 | input_answer = unquote(input_answer) 73 | result_score = await MatchFunction.semantic_match(input_answer, semantic_method) 74 | return result_score 75 | 76 | 77 | class ElementEvaluator(StepEvaluator): 78 | 
'''Element evaluation and scoring''' 79 | @staticmethod 80 | def path_exact_match(input_answer, reference_answer, method, html_content, input_netloc, reference_netloc): 81 | score = 0 82 | if method == "xpath": 83 | if reference_netloc != input_netloc: 84 | # print("reference_netloc:", reference_netloc, 85 | # "input_netloc:", input_netloc) 86 | return 0 87 | try: 88 | tree = html.fromstring(html_content) 89 | input_elements = tree.xpath(input_answer) 90 | reference_elements = tree.xpath(reference_answer) 91 | except: 92 | return 0  # bail out: a failed parse would leave input_elements/reference_elements unbound below 93 | if input_elements and reference_elements:  # xpath() returns lists, so this also guards against empty results 94 | score = input_elements[0] is reference_elements[0] 95 | try: 96 | if reference_elements[0].tag in MapTagNameList: 97 | trace_up_count = 0 98 | current_element = reference_elements[0] 99 | while trace_up_count < 3 and score == 0: 100 | trace_up_count += 1 101 | current_element = current_element.getparent() 102 | score_parent = input_elements[0] is current_element 103 | score = max(score, score_parent) 104 | except: 105 | pass 106 | else: 107 | score = 0 108 | elif method == "selector": 109 | if reference_netloc != input_netloc: 110 | # print("reference_netloc:", reference_netloc, 111 | # "input_netloc:", input_netloc) 112 | return 0 113 | try: 114 | soup = BeautifulSoup(html_content, 'html.parser') 115 | input_element = soup.select_one(input_answer) 116 | reference_element = soup.select_one(reference_answer) 117 | if (input_element is not None) and (reference_element is not None): 118 | score = input_element is reference_element 119 | 120 | try: 121 | if reference_element.name in MapTagNameList: 122 | # parent_elements = reference_element.parent 123 | # score_parent = input_element is parent_elements 124 | # score = max(score, score_parent) 125 | trace_up_count = 0 126 | current_element = reference_element 127 | while trace_up_count < 3 and score == 0: 128 | trace_up_count += 1 129 | current_element = current_element.parent 130 | score_parent = input_element is current_element 131 | score = max(score, score_parent) 132 | except: 133 | pass 134 | except: 135 | score = 0 136 | # result_score = MatchFunction.include_match( 137 | # input_answer, reference_answer) 138 | return score 139 | 140 | @staticmethod 141 | def path_included_match(input_answer, reference_answer, method, html_content): 142 | # TODO Add path inclusion matching method 143 | result_score = MatchFunction.include_match( 144 | input_answer, reference_answer) 145 | return result_score 146 | 147 | @staticmethod 148 | def element_value_exact_match(input_answer, reference_answer, input_netloc, reference_netloc): 149 | if reference_netloc != input_netloc: 150 | # print("reference_netloc:", reference_netloc, 151 | # "input_netloc:", input_netloc) 152 | return 0 153 | result_score = MatchFunction.exact_match( 154 | input_answer, reference_answer) 155 | return result_score 156 | 157 | @staticmethod 158 | def element_value_include_match(input_answer, reference_answer, input_netloc, reference_netloc): 159 | if reference_netloc != input_netloc: 160 | # print("reference_netloc:", reference_netloc, 161 | # "input_netloc:", input_netloc) 162 | return 0 163 | result_score = MatchFunction.include_match( 164 | input_answer, reference_answer) 165 | return result_score 166 | 167 | @staticmethod 168 | async def element_value_semantic_match(input_answer, semantic_method, input_netloc, reference_netloc=0): 169 | if reference_netloc != input_netloc: 170 | # print("reference_netloc:", reference_netloc, 171 | # "input_netloc:", input_netloc) 172
| return 0 173 | if len(input_answer) == 0: 174 | return 0 175 | result_score = await MatchFunction.semantic_match(input_answer, semantic_method) 176 | return result_score 177 | 178 | 179 | class TextEvaluator(StepEvaluator): 180 | '''Text evaluation and scoring''' 181 | @staticmethod 182 | def text_exact_match(input_answer, reference_answer): 183 | input_answer = input_answer.lower() 184 | reference_answer = reference_answer.lower() 185 | result_score = MatchFunction.exact_match( 186 | input_answer, reference_answer) 187 | return result_score 188 | 189 | @staticmethod 190 | def text_included_match(input_answer, reference_answer): 191 | input_answer = input_answer.lower() 192 | reference_answer = reference_answer.lower() 193 | result_score = MatchFunction.include_match( 194 | input_answer, reference_answer) 195 | return result_score 196 | 197 | @staticmethod 198 | async def text_semantic_match(input_answer, semantic_method):  # async: MatchFunction.semantic_match awaits an LLM call 199 | result_score = await MatchFunction.semantic_match( 200 | input_answer, semantic_method) 201 | return result_score 202 | 203 | 204 | class MatchFunction(): 205 | def __init__(self): 206 | pass 207 | 208 | @staticmethod 209 | def exact_match(input_answer, reference_answer) -> int: 210 | return 1 if input_answer == reference_answer else 0 211 | 212 | @staticmethod 213 | def include_match(input_answer, reference_answer) -> int: 214 | return 1 if reference_answer in input_answer else 0 215 | 216 | @staticmethod 217 | async def semantic_match(input_answer, semantic_method) -> float: 218 | # GPT35 = GPTGenerator(model="gpt-3.5-turbo") 219 | semantic_request = SemanticMatchPromptConstructor( 220 | ).construct(input_answer, semantic_method) 221 | score = None 222 | for i in range(3): 223 | try: 224 | # response, _ = await GPT35.request(semantic_request) 225 | response, _ = await semantic_match_llm_request(semantic_request) 226 | score = re.findall("```(.*?)```", response, re.S)[0] 227 | score = eval(score) 228 | # Limit the score between 0 and 1 229 | score = max(0, min(1, score)) 230 | if score is not None: 231 | break 232 | except: 233 | score = None 234 | if score is None: 235 | score = 0 236 | if score != 0 and score != 1: 237 | return round(score, 2) 238 | else: 239 | return score 240 | -------------------------------------------------------------------------------- /evaluate/step_score_js.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import parse_qs, urlparse, unquote 3 | from bs4 import BeautifulSoup 4 | 5 | import requests 6 | from lxml import html 7 | from agent.LLM import * 8 | from agent.Prompt import * 9 | from agent.Environment.html_env.utils import MapTagNameList 10 | 11 | 12 | class StepEvaluator(): 13 | def __init__(self): 14 | pass 15 | 16 | 17 | class URLEvaluator(StepEvaluator): 18 | """URL Evaluation Scoring""" 19 | 20 | @staticmethod 21 | def url_exact_match(input_url, reference_answer, key=False): 22 | if key: 23 | try: 24 | parsed_url = urlparse(input_url) 25 | url_params = parse_qs(parsed_url.query) 26 | input_answer = url_params[key][0] 27 | except: 28 | return 0 29 | else: 30 | input_answer = input_url 31 | input_answer = unquote(input_answer) 32 | result_score = MatchFunction.exact_match(input_answer, reference_answer) 33 | return result_score 34 | 35 | @staticmethod 36 | def url_include_match(input_url, reference_answer, key=None): 37 | # print(input_url, reference_answer) 38 | if key: 39 | try: 40 | parsed_url = urlparse(input_url) 41 | url_params = parse_qs(parsed_url.query) 42
| input_answer = url_params[key][0] 43 | except: 44 | return 0 45 | else: 46 | try: 47 | parsed_url = urlparse(input_url) 48 | input_answer = parsed_url.netloc + parsed_url.path 49 | if parsed_url.fragment is not None and (parsed_url.fragment): 50 | input_answer += "#" + parsed_url.fragment 51 | except: 52 | input_answer = input_url 53 | input_answer = unquote(input_answer) 54 | result_score = MatchFunction.include_match(input_answer, reference_answer) 55 | # print("score:", result_score, input_answer) 56 | return result_score 57 | 58 | @staticmethod 59 | async def url_semantic_match(input_url, semantic_method, key=False):  # async: MatchFunction.semantic_match is a coroutine 60 | if key: 61 | try: 62 | parsed_url = urlparse(input_url) 63 | url_params = parse_qs(parsed_url.query) 64 | input_answer = url_params[key][0] 65 | except: 66 | return 0 67 | else: 68 | input_answer = input_url 69 | input_answer = unquote(input_answer) 70 | result_score = await MatchFunction.semantic_match(input_answer, semantic_method) 71 | return result_score 72 | 73 | class ElementEvaluator(StepEvaluator): 74 | """Element evaluation and scoring""" 75 | 76 | @staticmethod 77 | def is_same_element(page, input_element_handle, reference_element_handle): 78 | is_same_element = page.evaluate( 79 | "(elements) => elements[0] === elements[1]", 80 | [input_element_handle, reference_element_handle], 81 | ) 82 | return int(is_same_element) 83 | 84 | @staticmethod 85 | def path_exact_match(input_answer, reference_answer, method, page): 86 | score = 0 87 | if method == "xpath": 88 | try: 89 | html_content = page.content() 90 | tree = html.fromstring(html_content) 91 | input_elements = tree.xpath(input_answer) 92 | reference_elements = tree.xpath(reference_answer) 93 | except: 94 | return 0  # bail out: a failed parse would leave input_elements/reference_elements unbound below 95 | if input_elements and reference_elements:  # xpath() returns lists, so this also guards against empty results 96 | score = input_elements[0] is reference_elements[0] 97 | try: 98 | if reference_elements[0].tag in MapTagNameList: 99 | trace_up_count = 0 100 | current_element = reference_elements[0] 101 | while trace_up_count < 3 and score == 0: 102 | trace_up_count += 1 103 | current_element = current_element.getparent() 104 | parent_score = input_elements[0] is current_element 105 | score = max(score, parent_score) 106 | except: 107 | pass 108 | else: 109 | score = 0 110 | elif method == "selector": 111 | try: 112 | input_element = input_answer 113 | reference_element = page.locator(reference_answer) 114 | input_element_handle = input_element.element_handle() 115 | reference_element_handle = reference_element.element_handle() 116 | if (input_element is not None) and (reference_element is not None): 117 | score = ElementEvaluator.is_same_element( 118 | page, 119 | input_element_handle=input_element_handle, 120 | reference_element_handle=reference_element_handle, 121 | ) 122 | try: 123 | reference_tag = page.evaluate( 124 | "(element) => element.tagName.toLowerCase()", reference_element_handle 125 | ) 126 | if reference_tag in MapTagNameList: 127 | trace_up_count = 0 128 | current_element = reference_element 129 | while trace_up_count < 3 and score == 0: 130 | trace_up_count += 1 131 | parent_element = current_element.locator("xpath=..") 132 | parent_element_handle = parent_element.element_handle() 133 | current_element = parent_element 134 | if parent_element: 135 | parent_score = ElementEvaluator.is_same_element( 136 | page, 137 | input_element_handle=input_element_handle, 138 | reference_element_handle=parent_element_handle, 139 | ) 140 | score = max(score, parent_score) 141 | except Exception as e: 142 | print(e) 143 | pass 144 | except:
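# Any Playwright or locator failure while resolving the two elements is treated as a mismatch: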
145 | score = 0 146 | return score 147 | 148 | @staticmethod 149 | def path_included_match(input_answer, reference_answer, method, html_content): 150 | # TODO Add path inclusion matching method 151 | result_score = MatchFunction.include_match(input_answer, reference_answer) 152 | return result_score 153 | 154 | @staticmethod 155 | def element_value_exact_match(input_answer, reference_answer): 156 | # TODO fuzzy check if the input_answer is on the same page as the reference_answer 157 | result_score = MatchFunction.exact_match(input_answer, reference_answer) 158 | return result_score 159 | 160 | @staticmethod 161 | def element_value_include_match(input_answer, reference_answer): 162 | # TODO fuzzy check if the input_answer is on the same page as the reference_answer 163 | result_score = MatchFunction.include_match(input_answer, reference_answer) 164 | return result_score 165 | 166 | @staticmethod 167 | async def element_value_semantic_match(input_answer, semantic_method):  # async: semantic_match awaits an LLM call 168 | # TODO fuzzy check if the input_answer is on the same page as the reference_answer 169 | if len(input_answer) == 0: 170 | return 0 171 | result_score = await MatchFunction.semantic_match(input_answer, semantic_method) 172 | return result_score 173 | 174 | class TextEvaluator(StepEvaluator): 175 | """Text evaluation and scoring""" 176 | 177 | @staticmethod 178 | def text_exact_match(input_answer, reference_answer): 179 | result_score = MatchFunction.exact_match(input_answer, reference_answer) 180 | return result_score 181 | 182 | @staticmethod 183 | def text_included_match(input_answer, reference_answer): 184 | result_score = MatchFunction.include_match(input_answer, reference_answer) 185 | return result_score 186 | 187 | @staticmethod 188 | async def text_semantic_match(input_answer, semantic_method): 189 | result_score = await MatchFunction.semantic_match(input_answer, semantic_method)  # the duplicated third argument here raised a TypeError 190 | return result_score 191 | 192 | 193 | class MatchFunction: 194 | def __init__(self): 195 | pass 196 | 197 | @staticmethod 198 | def exact_match(input_answer, reference_answer) -> int: 199 | return 1 if input_answer == reference_answer else 0 200 | 201 | @staticmethod 202 | def include_match(input_answer, reference_answer) -> int: 203 | return 1 if reference_answer in input_answer else 0 204 | 205 | @staticmethod 206 | async def semantic_match(input_answer, semantic_method) -> float: 207 | # GPT35 = GPTGenerator(model="gpt-3.5-turbo") 208 | semantic_request = SemanticMatchPromptConstructor( 209 | ).construct(input_answer, semantic_method) 210 | score = None 211 | for i in range(3): 212 | try: 213 | # response, _ = await GPT35.request(semantic_request) 214 | response, _ = await semantic_match_llm_request(semantic_request) 215 | score = re.findall("```(.*?)```", response, re.S)[0] 216 | score = eval(score) 217 | # Limit the score between 0 and 1 218 | score = max(0, min(1, score)) 219 | if score is not None: 220 | break 221 | except: 222 | score = None 223 | if score is None: 224 | score = 0 225 | if score != 0 and score != 1: 226 | return round(score, 2) 227 | else: 228 | return score 229 | -------------------------------------------------------------------------------- /evaluate/task_score.py: -------------------------------------------------------------------------------- 1 | class TaskEvaluator(): 2 | def __init__(self): 3 | pass 4 | 5 | 6 | class TaskLengthEvaluator(TaskEvaluator): 7 | def __init__(self, alpha=1.2): 8 | # Give a multiplier to the reference step number.
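# (e.g. with the default alpha = 1.2 and a reference length of 10, a run of up to 12 steps scores 1.0, while a 15-step run scores 12 / 15 = 0.8.)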
9 |         # Runs within alpha * reference_length get full score; longer runs are scored by the ratio.
10 |         self.alpha = alpha
11 | 
12 |     def task_length_score(self, reference_length, current_task_length):
13 |         '''
14 |         Judge whether the actual number of steps is within alpha times the reference number.
15 |         If so, give full score; otherwise score by reference length / actual length.
16 |         '''
17 |         reference_length *= self.alpha
18 |         if current_task_length < reference_length:
19 |             return 1  # TODO Specific value to be determined
20 |         else:
21 |             return reference_length / current_task_length
22 | 
23 | 
24 | class FinishTaskEvaluator(TaskEvaluator):
25 | 
26 |     @staticmethod
27 |     def finish_task_score(reference_step_score, step_score):
28 |         '''Judge whether the task is completed. If so, give full score.'''
29 |         if reference_step_score == step_score:
30 |             return 1  # TODO Specific value to be determined
31 |         else:
32 |             return 0
33 | 
--------------------------------------------------------------------------------
/experiment_results.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pandas import json_normalize
3 | import json5
4 | import json
5 | import re
6 | import os
7 | from logs import logger
8 | 
9 | 
10 | def parse_thought_action(dict_str):
11 |     thought_action = {}
12 |     thought_match = re.search(r"'thought':\s*(.+?)\s*,\s*'action'", dict_str)
13 |     action_match = re.search(r"'action':\s*(.+?)\s*}", dict_str)
14 |     thought = thought_match.group(1) if thought_match else None
15 |     thought = thought.replace("\\", "").replace("\"", "").replace("\'", "") if thought else None
16 |     action = action_match.group(1) if action_match else None
17 |     action = action.replace("\\", "").replace("\"", "").replace("\'", "") if action else None
18 |     thought_action = {"thought": thought, "action": action}
19 |     return thought_action
20 | 
21 | 
22 | def enum_to_action_str():
23 |     action_types = [
24 |         ("NONE", 0),
25 |         ("CLICK", 1),
26 |         ("GOTO", 2),
27 |         ("GOOGLE_SEARCH", 3),
28 |         ("FILL_FORM", 4),
29 |         ("SWITCH_TAB", 5),
30 |         ("GO_BACK", 6),
31 |         ("FILL_SEARCH", 7),
32 |         ("SELECT_OPTION", 8),
33 |         ("HOVER", 9),
34 |         ("SCROLL_DOWN", 10),
35 |         ("SCROLL_UP", 11),
36 |         ("CACHE_DATA", 12),
37 |         ("GET_FINAL_ANSWER", 13)
38 |     ]
39 |     action_dict = {str(value): name for name,
40 |                    value in action_types}
41 |     return action_dict
42 | 
43 | 
44 | def to_dict(input_string):
45 |     pattern = r"('action_type'|'element_id'|'url'|'fill_text'):\s*(<[^>]+>|\d+|'[^']+'|\"[^\"]+\")"
46 |     matches = re.findall(pattern, input_string)
47 |     extracted_fields = {}
48 |     for match in matches:
49 |         field_name, field_value = match
50 |         if field_value.startswith('<') and field_value.endswith('>'):
51 |             enum_name = field_value.split('.')[-1].strip('<> ').split(':')[0]
52 |             extracted_fields[field_name.strip("'")] = enum_name
53 |         else:
54 |             extracted_fields[field_name.strip("'")] = field_value.strip("'")
55 |     action_dict = enum_to_action_str()
56 |     extracted_fields["action_type"] = action_dict.get(str(
57 |         extracted_fields["action_type"]), str(extracted_fields["action_type"])).lower()
58 |     extracted_fields["fill_text"] = extracted_fields["fill_text"] if extracted_fields.get(
59 |         "fill_text") else ""
60 |     action = ""
61 |     if "google_search" in extracted_fields["action_type"].lower():
62 |         action = "google_search" + "[" + extracted_fields["fill_text"] + "]"
63 |     elif "fill_search" in extracted_fields["action_type"].lower():
64 |         action = "fill_search" + \
65 |             "[" + str(extracted_fields["element_id"]) + "," + \
66 |             extracted_fields["fill_text"] + "]"
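    # e.g. "{'action_type': 7, 'element_id': 13, 'fill_text': 'laptop'}" is rendered as "fill_search[13,laptop]".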
"fill_form" in extracted_fields["action_type"].lower(): 68 | action = "fill_search" + \ 69 | "[" + str(extracted_fields["element_id"]) + "," + \ 70 | extracted_fields["fill_text"] + "]" 71 | elif "select_option" in extracted_fields["action_type"].lower(): 72 | action = "select_option" + \ 73 | "[" + str(extracted_fields["element_id"]) + "," + \ 74 | extracted_fields["fill_text"] + "]" 75 | elif "goto" in extracted_fields["action_type"].lower() and extracted_fields.get('url'): 76 | action = "goto" + "[" + extracted_fields["url"] + "]" 77 | elif "click" in extracted_fields["action_type"].lower(): 78 | action = "click" + "[" + str(extracted_fields["element_id"]) + "]" 79 | elif "go_back" in extracted_fields["action_type"].lower(): 80 | action = "go_back" + "[" + str(extracted_fields["element_id"]) + "]" 81 | elif "none" in extracted_fields["action_type"].lower(): 82 | action = "None" 83 | elif "cache_data" in extracted_fields["action_type"].lower(): 84 | action = "cache_data" + "[" + extracted_fields["fill_text"] + "]" 85 | elif "final_answer" in extracted_fields["action_type"].lower(): 86 | action = "get_final_answer" + "[" + extracted_fields["fill_text"] + "]" 87 | return action 88 | 89 | 90 | def score_rate(score): 91 | first, second = score.split("/") 92 | return float(first) / float(second) 93 | 94 | 95 | def parse_step_reward(dict_str): 96 | score_description = {} 97 | score_match = re.search(r"'score':\s*(.+?)\s*,\s*'description'", dict_str) 98 | description_match = re.search(r"'description':\s*(.+?)\s*}", dict_str) 99 | score = score_match.group(1) if score_match else None 100 | score = score.replace("\\", "").replace("\"", "").replace("\'", "") 101 | description = description_match.group(1) if description_match else None 102 | description = description.replace( 103 | "\\", "").replace("\"", "").replace("\'", "") 104 | score_description = {"score": score, "description": description} 105 | return score_description 106 | 107 | 108 | def process_step_reward(dict_str): 109 | if dict_str.lower() == "{}": 110 | dict_str = {} 111 | elif dict_str.lower() == "finished": 112 | dict_str = {"score:": 10, "description": "finished"} 113 | else: 114 | dict_str = parse_step_reward(dict_str) 115 | return dict_str 116 | 117 | 118 | def write_task_result_to_df(each_task_json_file_path): 119 | with open(each_task_json_file_path) as f: 120 | data = json.load(f) 121 | step_list = data["step_list"] 122 | task_name = data["task_name"] 123 | task_status = data["status"] 124 | reference_task_length = data["reference_task_length"] 125 | evaluate_steps = data["evaluate_steps"] 126 | for idx, item in enumerate(step_list): 127 | for key in item: 128 | step_list[idx][key] = str(step_list[idx][key]) 129 | data_df = json_normalize(step_list, errors='ignore') 130 | return task_name, task_status, reference_task_length, evaluate_steps, data_df 131 | 132 | 133 | def write_to_json(df): 134 | df["step_index"] = df["step_index"].apply(lambda x: int(x)) 135 | df["trace_to_dict"] = df["current_trace"].apply( 136 | lambda x: parse_thought_action(x)) 137 | df["action_to_str"] = df["execute_action"].apply(lambda x: to_dict(x)) 138 | df["score_rate"] = df["score"].apply(lambda x: score_rate(x)) 139 | df["step_reward"] = df["step_reward"].apply( 140 | lambda x: process_step_reward(x)) 141 | df["selector"] = df["selector"].fillna("None") 142 | df["match_result"] = df["match_func_result"] 143 | df["element_value"] = df["element_value"].fillna("None") 144 | df["error"] = df["error_message"].fillna("None") 145 | df["step_url"] = 
df["step_url"].fillna("None") 146 | df_copy = df[ 147 | [ 148 | "step_index", 149 | "trace_to_dict", 150 | "selector", 151 | "action_to_str", 152 | "score", 153 | "score_rate", 154 | "step_reward", 155 | "step_url", 156 | "match_result", 157 | "element_value", 158 | "error" 159 | ] 160 | ] 161 | 162 | def summary(x): 163 | dic = { 164 | "step_index": x["step_index"], 165 | "trace_description": x["trace_to_dict"] if x["trace_to_dict"] else {}, 166 | "selector": x["selector"] if x["selector"] != "None" else "", 167 | "element_value": x["element_value"] if x["element_value"] != "None" else "", 168 | "action": x["action_to_str"] if x["action_to_str"] else "", 169 | "task_score": x["score"], 170 | "task_score_rate": x["score_rate"], 171 | "current_reward_score_description": x["step_reward"], 172 | "url": x["step_url"], 173 | "match_result": x["match_result"], 174 | "error": x["error"] if x["error"] != "None" else "" 175 | } 176 | # print(dic["match_result"]) 177 | return dic 178 | 179 | step_list = [] 180 | df_copy.apply(lambda x: step_list.append(summary(x)), axis=1) 181 | return step_list 182 | 183 | 184 | def get_result(input_json_path): 185 | json_result_path = input_json_path + "/json_result" 186 | out_file_path = input_json_path + "/result" 187 | task_list = [] 188 | for _, filename in enumerate(os.listdir(json_result_path)): 189 | file_path = os.path.join(json_result_path, filename) 190 | out_json = {} 191 | task_name, task_status, reference_task_length, evaluate_steps, data_df = write_task_result_to_df( 192 | file_path) 193 | out_json["task_id"] = int(filename.split("_")[0]) 194 | out_json["task_name"] = task_name 195 | out_json["task_status"] = task_status 196 | if os.path.isfile(file_path): 197 | task_step_list = write_to_json(data_df) 198 | out_json["step_list"] = task_step_list 199 | out_json["evaluation"] = evaluate_steps 200 | task_list.append(out_json) 201 | 202 | task_list = sorted(task_list, key=lambda x: x['task_id']) 203 | 204 | if not os.path.exists(out_file_path): 205 | os.makedirs(out_file_path) 206 | out_json_file_path = out_file_path + '/out.json' 207 | with open(out_json_file_path, 'w') as json_file: 208 | json.dump(task_list, json_file) 209 | return out_file_path 210 | 211 | 212 | def read_json_result(file_path): 213 | with open(file_path) as f: 214 | data = json.load(f) 215 | last_action_result_list = [] 216 | for items in data: 217 | data_dic = {} 218 | data_dic["task_id"] = items["task_id"] 219 | data_dic["task_name"] = items["task_name"] 220 | data_dic["status"] = items["task_status"] 221 | data_dic["steps"] = items["step_list"][-1]["step_index"] + 1 222 | data_dic["task_score"] = items["step_list"][-1]["task_score"] 223 | data_dic["task_score_rate"] = items["step_list"][-1]["task_score_rate"] 224 | data_dic["reward_count"] = len(items["evaluation"]) 225 | last_action_result_list.append(data_dic) 226 | return last_action_result_list 227 | 228 | 229 | def calculate_total_score(scores): 230 | molecular_sum = sum(float(x.split('/')[0]) for x in scores) 231 | denominator_sum = sum(float(x.split('/')[1]) for x in scores) 232 | final_score = molecular_sum / denominator_sum 233 | return final_score 234 | 235 | 236 | def evaluate(file_path, total_token_cost): 237 | input_file_path = file_path + "/out.json" 238 | result_file_path = file_path + "/result.json" 239 | all_data = read_json_result(input_file_path) 240 | df = pd.DataFrame(all_data) 241 | df["step_score"] = df["task_score"].apply(lambda x: float(x.split("/")[0])) 242 | df["efficiency_score"] = [s / sc if sc != 0 
243 |     # A task is "near success" when the agent is only one key node away from completing it.
244 |     df["task_near_success"] = df["task_score"].apply(lambda x: float(
245 |         x.split("/")[1]) - float(x.split("/")[0]) == 1.0)
246 | 
247 |     df_evaluate = df[["task_name", "status", "steps", "task_score",
248 |                       "task_score_rate", "step_score", "efficiency_score", "task_near_success"]]
249 | 
250 |     key_node_completion_rate = calculate_total_score(df_evaluate['task_score'])
251 |     key_node_completion_sum = df_evaluate['step_score'].sum()
252 |     task_success_rate = df_evaluate[df_evaluate["status"]
253 |                                     == "finished"].shape[0] / df_evaluate.shape[0]
254 |     task_near_success_rate = df_evaluate[
255 |         df_evaluate["task_near_success"]].shape[0] / df_evaluate.shape[0]
256 | 
257 |     average_step_score_rate = df_evaluate["task_score_rate"].mean()
258 |     average_efficiency_score = df_evaluate["efficiency_score"].mean()
259 |     if total_token_cost != 0 and key_node_completion_sum != 0:
260 |         usd_efficiency_score = total_token_cost / key_node_completion_sum
261 | 
262 |     result_dict = {}
263 |     result_dict["task_counts"] = df_evaluate.shape[0]
264 |     result_dict["average_step_score_rate"] = average_step_score_rate
265 |     result_dict["average_efficiency_score"] = average_efficiency_score
266 |     if total_token_cost != 0 and key_node_completion_sum != 0:
267 |         result_dict["usd_efficiency_score"] = usd_efficiency_score
268 |     result_dict["key_node_completion_rate"] = key_node_completion_rate
269 |     result_dict["task_success_rate"] = task_success_rate
270 |     result_dict["task_near_success_rate"] = task_near_success_rate
271 | 
272 |     with open(result_file_path, 'w') as json_file:
273 |         json.dump(result_dict, json_file)
274 | 
275 |     logger.info(f'\033[31mAll results written to {result_file_path}!\033[0m')
276 | 
277 | 
278 | def get_evaluate_result(input_result_path, total_token_cost):
279 |     out_file_path = get_result(input_result_path)
280 |     evaluate(file_path=out_file_path, total_token_cost=total_token_cost)
281 | 
--------------------------------------------------------------------------------
/logs.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 | import time
5 | import colorlog
6 | import re
7 | 
8 | 
9 | log_folder = "LOGS"
10 | if not os.path.exists(log_folder):
11 |     os.makedirs(log_folder)
12 | log_file_name = os.path.join(
13 |     log_folder, time.strftime("%Y-%m-%d_%H-%M-%S") + ".log")
14 | logger = logging.getLogger()
15 | logger.setLevel(logging.INFO)
16 | 
17 | stream_formatter = colorlog.ColoredFormatter(
18 |     "%(asctime)s**[%(log_color)s%(levelname)s%(reset)s]**|| %(message)s",
19 |     datefmt=None,
20 |     reset=True,
21 |     log_colors={
22 |         'DEBUG': 'cyan',
23 |         'WARNING': 'yellow',
24 |         'ERROR': 'red',
25 |         'INFO': 'green',
26 |         'CRITICAL': 'red,bg_white',
27 |     },
28 |     secondary_log_colors={},
29 |     style='%'
30 | )
31 | 
32 | 
33 | class Formatter(colorlog.ColoredFormatter):
34 |     def __init__(self, *args, **kwargs):
35 |         super().__init__(*args, **kwargs)
36 |         self.color_pattern = re.compile(r'\x1b\[[0-9;]*m')
37 | 
38 |     def format(self, record):
39 |         formatted_record = super().format(record)
40 |         clean_record = self.color_pattern.sub('', formatted_record)
41 |         return clean_record
42 | 
43 | 
44 | file_formatter = Formatter(
45 |     "%(asctime)s**[%(levelname)s]**|| %(message)s",
46 |     datefmt=None,
47 |     reset=True,
48 |     log_colors={
49 |         'DEBUG': 'cyan',
50 |         'WARNING': 'yellow',
51 |         'ERROR': 'red',
52 |         'INFO': 'green',
53 |         'CRITICAL': 'red,bg_white',
54 |     },
55 |     secondary_log_colors={},
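    # Same layout as the stream format minus the color placeholders; Formatter.format() strips any remaining ANSI codes so the log file stays plain text.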
56 |     style='%'
57 | )
58 | 
59 | 
60 | file_handler = logging.FileHandler(log_file_name, encoding='utf-8')
61 | file_handler.setFormatter(file_formatter)
62 | stream_handler = logging.StreamHandler(sys.stdout)
63 | stream_handler.setFormatter(stream_formatter)
64 | 
65 | logger.addHandler(file_handler)
66 | logger.addHandler(stream_handler)
67 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Jinja2==3.1.2
2 | json5==0.9.14
3 | retry==0.9.2
4 | sanic==23.6.0
5 | gymnasium
6 | playwright==1.32.1
7 | Pillow
8 | evaluate
9 | openai
10 | types-tqdm
11 | tiktoken
12 | aiolimiter
13 | beartype==0.12.0
14 | flask
15 | nltk
16 | text-generation
17 | transformers==4.33.2
18 | bs4
19 | lxml
20 | colorlog
21 | toml
22 | requests_toolbelt
23 | anthropic
24 | google-generativeai
25 | tomli
--------------------------------------------------------------------------------
/scripts/run_evaluation.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/scripts/run_evaluation.sh
--------------------------------------------------------------------------------
/src/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/src/.DS_Store
--------------------------------------------------------------------------------
/src/main_figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iMeanAI/WebCanvas/b9f289128614cd99b97abd0bb9bfc3a45f0847e0/src/main_figure.png
--------------------------------------------------------------------------------