├── LICENSE ├── README.md ├── assets └── framework.png ├── autowebbench ├── en │ ├── ind │ │ └── test.json │ └── ood │ │ └── test.json └── zh │ ├── ind │ └── test.json │ └── ood │ └── test.json ├── eval.py ├── mind2web ├── domain │ └── test.json ├── task │ └── test.json └── website │ └── test.json ├── miniwob++ ├── .gitignore ├── README.md ├── html_tools │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── config.py │ │ └── html_prompt.py │ ├── html_parser.py │ ├── identifier.py │ ├── prompt.py │ └── utils.py ├── install_dependency.sh ├── llms │ ├── __init__.py │ ├── call.py │ └── providers │ │ ├── __init__.py │ │ └── gpt.py ├── main.py ├── miniwob_tools │ ├── __init__.py │ ├── action.py │ ├── configs │ │ ├── __init__.py │ │ ├── config.py │ │ └── prompt.py │ └── utils.py ├── monitor.py ├── requirements.txt └── setup.sh └── webarena ├── .github └── workflows │ ├── pre-commit.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── README.md ├── agent ├── __init__.py ├── agent.py └── prompts │ ├── README.md │ ├── __init__.py │ ├── prompt_constructor.py │ ├── raw │ ├── new_action_prompt.py │ ├── p_cot_id_actree_2s.py │ ├── p_cot_id_actree_2s_no_na.py │ ├── p_direct_id_actree_2s.py │ ├── p_direct_id_actree_2s_no_na.py │ ├── p_direct_id_actree_3s_llama.py │ └── test_prompt.py │ └── to_json.py ├── browser_env ├── __init__.py ├── actions.py ├── async_envs.py ├── auto_login.py ├── constants.py ├── env_config.py ├── envs.py ├── helper_functions.py ├── html_tools │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── config.py │ │ └── html_prompt.py │ ├── html_parser.py │ ├── identifier.py │ ├── prompt.py │ └── utils.py ├── processors.py ├── py.typed ├── scripts │ ├── __init__.py │ ├── canva_handler.js │ ├── get_data.js │ ├── get_text.js │ ├── label_marker.js │ ├── local_marker.js │ └── mix_marker.js ├── trajectory.py └── utils.py ├── check_errors.sh ├── config_files ├── examples │ ├── 1.json │ ├── 2.json │ ├── 3.json │ └── 
4.json └── test.raw.json ├── environment_docker ├── README.md └── webarena-homepage │ ├── app.py │ ├── static │ └── figures │ │ ├── calculator.png │ │ ├── cms.png │ │ ├── gitlab.png │ │ ├── manual1.png │ │ ├── manual2.png │ │ ├── map.png │ │ ├── onestopshop.png │ │ ├── password.png │ │ ├── reddit.png │ │ ├── scratchpad.png │ │ └── wikipedia.png │ └── templates │ ├── calculator.html │ ├── index.html │ └── scratchpad.html ├── evaluation_harness ├── __init__.py ├── evaluators.py └── helper_functions.py ├── llms ├── __init__.py ├── lm_config.py ├── providers │ ├── hf_utils.py │ ├── openai_utils.py │ └── ours.py ├── tokenizers.py └── utils.py ├── media ├── example_trace_viewer.png ├── homepage_demo.png ├── logo.png ├── overview.png ├── v1_result.png └── v2_result.png ├── minimal_example.py ├── parallel_run.sh ├── prepare.sh ├── requirements.txt ├── resources └── README.md ├── run.py ├── scripts ├── check_error_runs.py ├── collect_obs.py ├── generate_test_data.py ├── html2json.py └── webarena-zeno.ipynb ├── setup.cfg ├── setup.py ├── solver ├── __init__.py ├── shopping_admin.py └── utils.py └── tests ├── conftest.py ├── test_browser_env ├── test_action_functionalities.py ├── test_actions.py ├── test_auth_cookie.py ├── test_playwright_actions.py └── test_script_browser_env.py └── test_evaluation_harness ├── configs ├── func_eval_fail.json ├── func_eval_success.json ├── func_url_func_1.json ├── func_url_func_2.json ├── html_content_element_exact_match.json ├── html_content_exact_match.json ├── html_content_url_comb.json ├── string_match.json └── url_exact_match.json ├── test_evaluators.py └── test_helper_functions.py /README.md: -------------------------------------------------------------------------------- 1 |

AutoWebGLM: A Large Language Model-based Web Navigating Agent

2 | 3 | This is the official implementation of AutoWebGLM. If you find our open-sourced efforts useful, please 🌟 the repo to encourage our continued development! 4 | 5 | # Overview 6 | 7 | ![paper](./assets/framework.png) 8 | 9 | AutoWebGLM is a project aimed at building a more efficient language model-driven automated web navigation agent. This project is built on top of the ChatGLM3-6B model, extending its capabilities to navigate the web more effectively and tackle real-world browsing challenges better. 10 | 11 | ## Features 12 | 13 | - **HTML Simplification Algorithm**: Inspired by human browsing patterns, we've designed an algorithm to simplify HTML, making webpages more digestible for LLM agents while preserving crucial information. 14 | - **Hybrid Human-AI Training**: We combine human and AI knowledge to build web browsing data for curriculum training, enhancing the model's practical navigation skills. 15 | - **Reinforcement Learning and Rejection Sampling**: We enhance the model's webpage comprehension, browser operations, and efficient task decomposition abilities by bootstrapping it with reinforcement learning and rejection sampling. 16 | - **Bilingual Web Navigation Benchmark**: We introduce AutoWebBench—a bilingual (Chinese and English) benchmark for real-world web browsing tasks. This benchmark provides a robust tool for testing and refining the capabilities of AI web navigation agents. 17 | 18 | # Evaluation 19 | 20 | We have publicly released our evaluation code, data, and environment. You can reproduce the experiments by following the instructions below. 21 | 22 | ## AutoWebBench & Mind2Web 23 | 24 | You can find our evaluation datasets at AutoWebBench and Mind2Web. 25 | For the code to perform model inference, please refer to ChatGLM3-6B. 26 | After obtaining the output file, the score can be obtained through ```python eval.py [result_path]```. 
# Action names grouped by how their positional arguments map onto the
# ``label`` / ``param`` slots of an extracted answer.
_LABEL_ONLY_ACTIONS = ('click', 'hover')
_LABEL_AND_PARAM_ACTIONS = ('select', 'type_string')
_PARAM_ONLY_ACTIONS = (
    'scroll_page', 'go', 'jump_to', 'switch_tab', 'user_input', 'finish',
)


def get_rouge_score(hypothesis, reference):
    """Return the ROUGE-1 F-score of two texts, or ``None`` if either is ``None``.

    Both inputs are segmented with ``jieba.cut`` and joined with single spaces
    before being handed to ``Rouge.get_scores``.
    """
    if hypothesis is None or reference is None:
        return None

    # Parsed call arguments can be non-string literals (e.g. ``switch_tab(1)``).
    # NOTE(review): jieba presumably only accepts text, hence the coercion.
    hypothesis = ' '.join(jieba.cut(str(hypothesis)))
    reference = ' '.join(jieba.cut(str(reference)))

    rouge = Rouge()
    scores = rouge.get_scores(hypothesis, reference)

    return scores[0]["rouge-1"]['f']


def parse_function_call(function_call):
    """Split a call-like string such as ``click('A')`` into name and arguments.

    Args:
        function_call: Text expected to start with ``name(arg, ...)``.

    Returns:
        ``(function_name, args_tuple)`` when the text starts with a call,
        otherwise ``None``.

    Raises:
        SyntaxError: If the argument list is not valid Python syntax.
        ValueError: If an argument is not a plain literal, or keyword
            arguments are used.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    import ast

    match = re.match(r"(\w+)\((.*)\)", function_call)
    if not match:
        return None

    function_name = match.group(1)

    # The text comes from model output, so it must never be executed. Parse it
    # as a call expression and accept literal arguments only (this replaces a
    # former ``eval`` of the argument list).
    call = ast.parse(f'_call({match.group(2)})', mode='eval').body
    if (
        not isinstance(call, ast.Call)
        or not isinstance(call.func, ast.Name)
        or call.func.id != '_call'
        or call.keywords
    ):
        raise ValueError(f'Unsupported function call: {function_call!r}')

    function_args = tuple(ast.literal_eval(arg) for arg in call.args)

    return function_name, function_args


def extract(text):
    """Turn an action string into a ``{'type', 'label', 'param'}`` dict.

    Slots that do not apply to the action (or all of them, when ``text`` is
    not a call) are left as ``None``.

    Raises:
        IndexError: If a known action is given fewer arguments than it needs.
    """
    ans = {
        'type': None,
        'label': None,
        'param': None
    }

    parsed = parse_function_call(text)
    if not parsed:
        return ans

    ans['type'], args = parsed

    if ans['type'] in _LABEL_ONLY_ACTIONS:
        ans['label'] = args[0]
    elif ans['type'] in _LABEL_AND_PARAM_ACTIONS:
        ans['label'] = args[0]
        ans['param'] = args[1]
    elif ans['type'] in _PARAM_ONLY_ACTIONS:
        ans['param'] = args[0]

    return ans


def _load_labels(raw):
    """Return the candidate label strings stored in a record's ``labels`` field.

    The field is either a JSON-encoded list of candidates or a single bare
    action string; anything that does not decode to a list is treated as one
    candidate.
    """
    if isinstance(raw, list):
        return raw
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):
        return [raw]
    return decoded if isinstance(decoded, list) else [raw]


def _score_record(ix, record):
    """Score one prediction record against its candidate labels.

    Returns a dict with any of the keys ``type``, ``label``, ``param`` and
    ``all``. Later candidates overwrite earlier ones until one matches on both
    type and label.
    """
    pred = record['predict'].split('A: ')[-1].strip()
    res = {}

    for label in _load_labels(record['labels']):
        try:
            label_ans = extract(label)
            pred_ans = extract(pred)
        except Exception:
            # Best effort: an action string that cannot be parsed simply
            # disqualifies this candidate.
            continue

        print(f'{ix}. label:', label_ans)
        print(f'{ix}. pred:', pred_ans)

        type_ok = label_ans['type'] == pred_ans['type']
        label_ok = label_ans['label'] == pred_ans['label']

        if label_ans['type'] is not None:
            res['type'] = 1 if type_ok else 0

        if label_ans['label'] is not None:
            res['label'] = 1 if label_ok else 0

        if label_ans['param'] is not None:
            rouge = get_rouge_score(label_ans['param'], pred_ans['param'])
            # NOTE(review): a score of exactly 0.0 is falsy and therefore left
            # out of the mean, just like a missing prediction. Kept as-is so
            # results stay comparable; confirm whether this is intended.
            if rouge:
                res['param'] = rouge

        if label_ans['type'] is not None and label_ans['label'] is not None:
            if type_ok and label_ok:
                res['all'] = 1
                break
            res['all'] = 0

    return res


def evaluate(result_path):
    """Score a JSON-lines result file and return the mean of every metric.

    Each non-blank line must be a JSON object with ``labels`` and ``predict``
    fields. Metrics without any sample are reported as ``0.0``.
    """
    res_list = {
        'type': [],
        'label': [],
        'param': [],
        'all': []
    }

    with open(result_path, encoding='utf-8') as result_file:
        for ix, r_str in enumerate(result_file):
            if not r_str.strip():
                continue
            for k, v in _score_record(ix, json.loads(r_str)).items():
                res_list[k].append(v)

    return {k: float(np.mean(v)) if v else 0.0 for k, v in res_list.items()}


if __name__ == '__main__':
    print(evaluate(sys.argv[1]))
the paper uses 100, but generally, 10 groups are more reasonable for efficiency | 16 | | model-path | model_path/ | Yes | Path to the model to be tested, if set to 'manual' then manual execution can be performed | 17 | | result-path | result/ | Yes | Location for the model's output (Tasks that have been completed in the same path **will not** be executed again) | 18 | 19 | ## Results 20 | 21 | After running the above command, you should see a `log_files` folder appear in the current directory. The `**.log` files inside are the run results. When a task is completed, you should see the following output, where the result represents the test case score, which can be 0 or 1: 22 | 23 | ```sh 24 | 2023-11-30 06:28:13,283 - INFO - {"task": "click-button", "case_id": 10, "result": 1.0} 25 | ``` 26 | 27 | When all test cases for a group of tasks have been run, the following record will be output in the log: 28 | 29 | ```sh 30 | 2023-11-30 07:10:13,593 - INFO - {"task": "grid-coordinate", "avg_score": 0.3} 31 | ``` 32 | 33 | When all tasks in a process are completed, the log will record the following information: 34 | 35 | ```sh 36 | 2023-11-30 07:10:13,836 - INFO - ------ 37 | 2023-11-30 07:10:13,836 - INFO - click-button-sequence 1.00 38 | 2023-11-30 07:10:13,836 - INFO - click-checkboxes 0.62 39 | 2023-11-30 07:10:13,837 - INFO - click-checkboxes-large 0.07 40 | 2023-11-30 07:10:13,837 - INFO - click-color 0.24 41 | ... 
(50 lines omitted) 42 | 2023-11-30 07:10:13,839 - INFO - enter-date 1.00 43 | 2023-11-30 07:10:13,839 - INFO - grid-coordinate 0.30 44 | 2023-11-30 07:10:13,839 - INFO - all 0.442 45 | ``` 46 | -------------------------------------------------------------------------------- /miniwob++/html_tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .identifier import IdentifierTool 2 | from .prompt import HtmlPrompt 3 | from .html_parser import HtmlParser 4 | 5 | from .utils import print_html_object 6 | from .configs import basic_attrs, mind2web_keep_attrs -------------------------------------------------------------------------------- /miniwob++/html_tools/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .html_prompt import prompts 2 | from .config import basic_attrs, mind2web_keep_attrs, miniwob_attrs 3 | from .config import config_meta -------------------------------------------------------------------------------- /miniwob++/html_tools/configs/config.py: -------------------------------------------------------------------------------- 1 | basic_attrs = [ 2 | 'title', 3 | 'value', 4 | 'placeholder', 5 | ] 6 | 7 | mind2web_keep_attrs = [ 8 | 'alt', 9 | 'aria_description', 10 | 'aria_label', 11 | 'aria_role', 12 | 'input_checked', 13 | 'input_value', 14 | 'label', 15 | 'name', 16 | 'option_selected', 17 | 'placeholder', 18 | 'role', 19 | 'text_value', 20 | 'title', 21 | 'type', 22 | 'value', 23 | ] 24 | 25 | miniwob_attrs = [ 26 | 'id', 27 | 'type', 28 | 'value', 29 | ] 30 | 31 | config_meta = """ 32 | ======= Configs ======= 33 | Columns: 34 | - id: {id_attr} 35 | - label: {label_attr} 36 | Position: {use_position} 37 | - window: {window_size} 38 | - rect_dict: {rect} 39 | Keep: 40 | - parents: {parent_chain} 41 | - attrs: {keep_attrs} 42 | - elems: {keep_elem} 43 | - obs_elem: {obs_elem} 44 | Generator: 45 | - prompt: {prompt_name} 46 | - label: 
class IdentifierTool:
    """Hand out short upper-case labels that do not repeat.

    Two strategies are available, selected by ``method``:

    * ``'order'``  - ``A`` .. ``Z``, then ``AA`` .. ``ZZ``, then three letters.
    * ``'random'`` - random two-letter labels (three letters once more than
      280 labels are in use).

    Labels already present in ``exists`` are never returned, and every label
    that is handed out is recorded there.
    """

    def __init__(self, method: str='order', existing_labels: dict=None) -> None:
        """Create a generator.

        Args:
            method: ``'order'`` or ``'random'``; ``None`` means ``'order'``.
            existing_labels: Labels that are already taken. The dict is used
                (and extended) in place; ``None`` starts from an empty set.

        Raises:
            ValueError: If ``method`` is not a known strategy.
        """
        self.methods = {
            'order': self.get_identifier_in_order,
            'random': self.get_random_identifier,
        }

        if method is None:
            method = 'order'

        self.func = self.methods.get(method, None)
        self.name = method
        if self.func is None:
            raise ValueError(f'Invalid method for identifier: {method}')

        self.reset(existing_labels)

    def reset(self, exists: dict=None) -> None:
        """Restart numbering, optionally seeding the set of taken labels."""
        self.identifier = -1
        # A fresh dict per reset: a shared ``{}`` default would be mutated by
        # every instance and leak labels from one page to the next.
        self.exists = {} if exists is None else exists

    def get_identifier_in_order(self) -> str:
        """Return the next unused label in alphabetical order."""
        def id2str(index: int) -> str:
            if index < 26:
                return chr(index + 65)
            index -= 26
            c0 = index // 676
            c1 = (index // 26) % 26
            c2 = index % 26
            label = f'{chr(c1 + 65)}{chr(c2 + 65)}'
            return label if c0 == 0 else f'{chr(c0 + 64)}{label}'

        self.identifier += 1
        label = id2str(self.identifier)

        # Skip over labels that were supplied as already taken.
        while label in self.exists:
            self.identifier += 1
            label = id2str(self.identifier)

        self.exists[label] = True
        return label

    def get_random_identifier(self) -> str:
        """Return a random unused label of two or three capital letters."""
        secret_generator = secrets.SystemRandom()

        def get_random_label(n: int=2) -> str:
            return ''.join(chr(secret_generator.randint(65, 90)) for _ in range(n))

        wc = 3 if len(self.exists) > 280 else 2

        label = get_random_label(wc)
        while label in self.exists:
            label = get_random_label(wc)

        self.exists[label] = True
        return label

    def generate(self):
        """Return a new label using the strategy chosen at construction."""
        return self.func()
class_dict = self.extract(class_dict, {}) 35 | 36 | label_str = '' 37 | if len(label) > 0: 38 | label_str = self.prompt['label'].format(label=label) 39 | 40 | classes = [] 41 | values = set() 42 | for key, val in class_dict.items(): 43 | if val in values: 44 | continue 45 | values.add(val) 46 | classes.append(self.prompt['attr'].format(key=key, attr=val)) 47 | classes_str = self.prompt['attr_splitter'].join(classes) 48 | 49 | content_splitter = ' ' if len(classes_str) == 0 else self.prompt['attr_splitter'] 50 | classes_str = add_prefix(classes_str, ' ') 51 | content_str = add_prefix(content, content_splitter) 52 | subtree_str = add_prefix(subtree_str, ' ') 53 | 54 | return self.prompt['dom'].format(tag=tag, label=label_str, attr=classes_str, content=content_str, subtree=subtree_str) 55 | 56 | def new_data_prompt_constructor(self, tag: str='', label: str='', content: str='', subtree_str: str='', class_dict: dict[str]={}) -> str: 57 | def add_prefix(data, prefix): 58 | return prefix + data if len(data) > 0 else '' 59 | 60 | tag = self.extract(tag) 61 | label = self.extract(label) 62 | content = self.extract(content) 63 | subtree_str = self.extract(subtree_str, '') 64 | class_dict = self.extract(class_dict, {}) 65 | 66 | label_str = '' 67 | if len(label) > 0: 68 | label_str = self.prompt['label'].format(label=label) 69 | 70 | classes = [] 71 | values = set() 72 | 73 | message = [] 74 | for key, val in class_dict.items(): 75 | if val == '': 76 | message.append(key) 77 | continue 78 | if val in values: 79 | continue 80 | values.add(val) 81 | classes.append(self.prompt['attr'].format(key=key, attr=val)) 82 | 83 | if len(message) > 0: 84 | message_str = ' '.join(message) 85 | classes.append(self.prompt['attr'].format(key='message', attr=message_str)) 86 | 87 | classes_str = self.prompt['attr_splitter'].join(classes) 88 | 89 | content_splitter = ' ' if len(classes_str) == 0 else self.prompt['attr_splitter'] 90 | classes_str = add_prefix(classes_str, ' ') 91 | content_str = 
def get_xpath_top_down(element: 'html.HtmlElement', id_column: str='temp_id', path: str='', order: int=0,
                       in_svg: bool=False, temp_id: int=0, i2xpath: dict=None) -> tuple:
    """Number every node of a tree depth-first and record an XPath for each.

    Each visited element gets a ``temp_id`` attribute holding its running
    number. ``i2xpath`` maps that number to ``'xpath/<path>'`` and maps the
    ``'/<path>'``, ``'xpath/<path>'`` and ``'xpath=/<path>'`` spellings back to
    the number.

    Args:
        element: Root of the (sub)tree to visit.
        id_column: Attribute whose non-empty values are reported as used labels.
        path: XPath of the parent element.
        order: 1-based position among same-tag siblings, or 0 when the tag is
            unique under its parent (no index suffix is emitted).
        in_svg: Whether an ancestor is an ``svg`` element.
        temp_id: First number to assign.
        i2xpath: Mapping to fill in place; a new one is created when omitted.

    Returns:
        ``(next_temp_id, i2xpath, used_labels)``.
    """
    # A fresh mapping per top-level call; a shared ``{}`` default would carry
    # entries of previously processed pages into this one.
    if i2xpath is None:
        i2xpath = {}

    used_labels = {}
    # path
    tag = element.tag.lower()
    in_svg = in_svg or (tag == 'svg')

    if not in_svg and 'id' in element.attrib:
        node_id = element.attrib['id']
        path = f"//*[@id='{node_id}']"
    else:
        suffix = f'[{order}]' if order > 0 else ''
        prefix = f"*[name()='{tag}']" if in_svg else tag
        path = path + '/' + prefix + suffix

    # add temp id
    element.attrib['temp_id'] = str(temp_id)
    ori_label = element.attrib.get(id_column, '')
    if ori_label != '':
        used_labels[ori_label] = True

    bid = str(temp_id)
    i2xpath[bid] = f'xpath/{path}'
    i2xpath[f'/{path}'] = bid
    i2xpath[f'xpath/{path}'] = bid
    i2xpath[f'xpath=/{path}'] = bid

    temp_id += 1

    # traverse node: first count same-tag siblings so that an index suffix is
    # only emitted where a tag occurs more than once under this parent
    children = element.getchildren()
    tag_dict = {}
    id_list = []
    for child in children:
        ctag = child.tag.lower()
        tag_dict[ctag] = tag_dict.get(ctag, 0) + 1
        id_list.append(tag_dict[ctag])

    for cid, child in zip(id_list, children):
        ctag = child.tag.lower()
        cod = cid if tag_dict[ctag] > 1 else 0
        # The recursion fills the same ``i2xpath`` mapping in place.
        temp_id, _, ulabels = get_xpath_top_down(child, id_column, path, cod, in_svg, temp_id, i2xpath)
        used_labels.update(ulabels)

    return temp_id, i2xpath, used_labels


def print_html_object(obj: str='') -> str:
    """Return ``obj`` re-flowed so that every ``<`` starts a new, indented line.

    Each ``<`` increases the indentation depth by one and each ``>`` decreases
    it again; text between brackets is stripped and placed on its own line.
    Despite the name, nothing is printed.
    """
    tab_cnt = 0
    result, content, sep = '', '', ''
    last_is_left, last_is_right = False, False
    for ch in obj:
        if ch == '<':
            result += '\n'
            # Flush text collected since the previous bracket.
            if len(content.strip()) > 0:
                result += sep + content.strip() + '\n'
            result += sep + '<'

            tab_cnt += 1
            sep = ' ' * tab_cnt

            content = ''
            last_is_right = False
            last_is_left = True
        elif ch == '>':
            if last_is_left:
                # ``<...>`` without nested brackets stays on one line.
                result += content
            else:
                if last_is_right:
                    result += '\n'
                if len(content.strip()) > 0:
                    result += sep + content.strip() + '\n'

            tab_cnt -= 1
            sep = ' ' * tab_cnt

            if not last_is_left:
                result += sep

            result += '>'
            content = ''

            last_is_right = True
            last_is_left = False
        else:
            content += ch

    return result
class CallLLM():
    """Uniform front end for querying either a registered provider or a local model.

    If ``model_path`` is a key of ``call_method`` the matching callable is
    used; otherwise ``model_path`` is loaded through ``AutoModel`` /
    ``AutoTokenizer`` onto ``cuda:<cuda>``.
    """

    def __init__(self, model_path='chatgpt', cuda='0'):
        if model_path in call_method:
            self.func = call_method[model_path]
            return

        loaded = AutoModel.from_pretrained(model_path, trust_remote_code=True, device=f'cuda:{cuda}')
        self.cuda = cuda
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = loaded.eval()
        self.func = self.call_pretrain_model

    def call_pretrain_model(self, query: str, sample_times: int=1):
        """Sample one answer to ``query`` from the local model.

        The query is wrapped as ``Q: ...`` / ``A: `` and the text after the last
        ``A: `` of the first sample is printed and returned. An empty string is
        returned when the encoded prompt is longer than 7500 tokens.
        """
        def chatglm3_base_template(query, history=None, system=None):
            return f'Q: {query}\n\nA: '

        def model_chat(prompt: str):
            # Alternative path through the model's chat API; not used below.
            reply, _history = self.model.chat(self.tokenizer, prompt, history=None)
            return reply

        def generation(prompt: str, sample_times: int=1):
            encoded = self.tokenizer.encode(
                text=prompt,
                return_tensors='pt',
                max_length=8192,
                truncation=False
            )
            input_ids = encoded.to(f'cuda:{self.cuda}')

            # Over-long prompts are not sent to the model at all.
            if len(input_ids[0]) > 7500:
                return ''

            output_ids = self.model.generate(
                input_ids=input_ids,
                max_new_tokens=1024,
                do_sample=True,
                top_p=0.7,
                temperature=0.95,
                num_return_sequences=sample_times
            )

            # The decoded sequence still contains the prompt; keep only what
            # follows the last answer marker.
            answers = [
                self.tokenizer.decode(output_ids[i], skip_special_tokens=True).split('A: ')[-1]
                for i in range(sample_times)
            ]
            return answers[0]

        answer = generation(chatglm3_base_template(query))
        print('[Model]', answer)
        return answer

    def model_call(self, prompt):
        """Send ``prompt`` to the configured backend and return its result."""
        return self.func(prompt)
class ActionParser:
    """Turn a model reply into a ``(thinking, operation, params)`` triple.

    Three reply formats are supported, selected by the ``prompt`` name given
    to the constructor: ``'basic'`` (``#Click# A`` style), ``'tp'`` (the same,
    preceded by a ``#Thinking Process:`` section) and ``'new_action_space'``
    (``click('A')`` style calls).

    For ``Type`` operations ``params`` is always ``(label, text, flag)`` with a
    boolean ``flag``; it is ``False`` for the two hash-style formats and taken
    from the third call argument in the new action space.
    """

    operation_pattern = {
        'Click': r'#Click#\s*([A-Z]{1,3})',
        'Hover': r'#Hover#\s*([A-Z]{1,3})',
        'Scroll_up': r'#Scroll_up#',
        'Scroll_down': r'#Scroll_down#',
        # NOTE(review): ``(.+)`` is greedy, so a closing quote ends up inside
        # the captured text; confirm whether callers strip it.
        'Type': r'#Type#\s*([A-Z]{1,3})\s*"{0,1}(.+)"{0,1}',
    }

    new_action_space_pattern = {
        'Click': r'click\([\'\"]([A-Z]{1,3})[\'\"]\)',
        'Hover': r'hover\([\'\"]([A-Z]{1,3})[\'\"]\)',
        'Scroll_up': r'scroll_page\([\'\"]up[\'\"]\)',
        'Scroll_down': r'scroll_page\([\'\"]down[\'\"]\)',
        'Type': r'type_string\([\'\"]([A-Z]{1,3})[\'\"]\s*,\s*[\'\"](.+)[\'\"]\s*,\s*(True|False)\)',
    }

    prompts = {
        'basic': miniwob_prompt,
        'tp': miniwob_prompt_with_tp,
        'new_action_space': miniwob_prompt_new_action_space,
    }

    def __init__(self, prompt: str='basic') -> None:
        """Select the prompt text and the matching reply parser.

        Raises:
            ValueError: If ``prompt`` is not one of the keys of ``prompts``.
        """
        if prompt not in self.prompts:
            raise ValueError('Invalid prompt type.')

        funcs = {
            'basic': self.extract_operation,
            'tp': self.extract_operation_with_tp,
            'new_action_space': self.extract_operation_new_action_space,
        }

        self.prompt = self.prompts[prompt]
        self.func = funcs[prompt]

    def get_prompt(self) -> str:
        """Return the prompt text selected at construction."""
        return self.prompt

    def extract(self, result: str='') -> 'tuple | None':
        """Parse ``result`` with the parser selected at construction."""
        return self.func(result)

    @staticmethod
    def _first_match(patterns: dict, text: str) -> 'tuple | None':
        """Return ``(op, groups)`` for the first pattern found in ``text``."""
        import re

        for op, pattern in patterns.items():
            match = re.search(pattern, text)
            if match:
                return op, match.groups()

        return None

    @staticmethod
    def extract_operation(result: str='') -> 'tuple | None':
        """Parse a hash-style reply; returns ``('', op, params)`` or ``None``."""
        found = ActionParser._first_match(ActionParser.operation_pattern, result)
        if found is None:
            return None

        op, param = found
        if op == 'Type':
            # ``groups()`` is an immutable tuple, so the flag is added by
            # building a new one.
            param = param + (False,)
        return '', op, param

    @staticmethod
    def extract_operation_with_tp(result: str='') -> 'tuple | None':
        """Parse a reply with a thinking section; returns ``(thinking, op, params)`` or ``None``."""
        import re

        match = re.search(r'#Thinking Process:\s*(.+)\s*#Operation:\s*(.+)', result)
        if not match:
            return None
        tpstr = match.group(1)
        opstr = match.group(2)

        found = ActionParser._first_match(ActionParser.operation_pattern, opstr)
        if found is None:
            return None

        op, param = found
        if op == 'Type':
            param = param + (False,)
        return tpstr, op, param

    @staticmethod
    def extract_operation_new_action_space(result: str='') -> 'tuple | None':
        """Parse a call-style reply; returns ``('', op, params)`` or ``None``."""
        found = ActionParser._first_match(ActionParser.new_action_space_pattern, result)
        if found is None:
            return None

        op, param = found
        if op == 'Type':
            # The pattern only admits the literals ``True`` / ``False`` for
            # the third argument; expose it as a real boolean.
            label, text, flag = param
            param = (label, text, flag == 'True')
        return '', op, param
41 | 'focus-text', 42 | 'focus-text-2', 43 | 'grid-coordinate', 44 | 'guess-number', 45 | 'identify-shape', 46 | 'login-user', 47 | 'login-user-popup', 48 | 'multi-layouts', 49 | 'multi-orderings', 50 | 'navigate-tree', 51 | 'search-engine', 52 | 'social-media', 53 | 'social-media-all', 54 | 'social-media-some', 55 | 'tic-tac-toe', 56 | 'use-autocomplete', 57 | 'use-spinner', 58 | ] 59 | 60 | mwpp_attrs = { 61 | 'basic': [ 62 | 'id', 63 | 'classes', 64 | 'value', 65 | ], 66 | 'position': [ 67 | 'left', 68 | 'top', 69 | 'width', 70 | 'height', 71 | ], 72 | 'color': { 73 | 'bgColor': 'background-color', 74 | 'fgColor': 'color', 75 | } 76 | } 77 | 78 | not_clickable_tag = [ 79 | 'body', 80 | 'div', 81 | 'form', 82 | 'h1', 83 | 'h2', 84 | 'h3', 85 | 'h4', 86 | 'h5', 87 | 'h6', 88 | 't', 89 | 'tr', 90 | 'td', 91 | 'th', 92 | 'p', 93 | 'li', 94 | ] 95 | 96 | miniwob_attrs = [ 97 | 'id', 98 | 'type', 99 | 'classes', 100 | 'value', 101 | 'rgba', 102 | 'size', 103 | ] 104 | 105 | special_classes = [ 106 | 'alink', 107 | 'color', 108 | 'share', 109 | 'copy', 110 | 'embed', 111 | 'menu-user', 112 | 'block-user', 113 | 'report', 114 | 'email-forward', 115 | 'forward-sender', 116 | 'email-reply', 117 | 'email-sender' 118 | ] -------------------------------------------------------------------------------- /miniwob++/miniwob_tools/configs/prompt.py: -------------------------------------------------------------------------------- 1 | miniwob_prompt = """ %s 2 | 3 | You are a helpful assistant that can assist with web navigation tasks. 4 | You are given a simplified html webpage and a task description. 5 | Your goal is to complete the task. You can perform the specified operations below to interact with the webpage. 6 | 7 | #Valid operations: - #Click# id: Click on the element with the specified id. 8 | - #Hover# id: Hover on the element with the specified id. 9 | - #Scroll_up#: Scroll up 1 page. 10 | - #Scroll_down#: Scroll down 1 page. 
11 | - #Type# id "text": Type in the text at the element with the specified id. 12 | 13 | #Current viewport position: %s 14 | 15 | #Previous Operation: %s 16 | 17 | #Task: %s 18 | 19 | Your output SHOULD be in the following format: 20 | #Operation: {Next operation to perform} 21 | """ 22 | 23 | miniwob_prompt_with_tp = """ %s 24 | 25 | You are a helpful assistant that can assist with web navigation tasks. 26 | You are given a simplified html webpage and a task description. 27 | Your goal is to complete the task. You can perform the specified operations below to interact with the webpage. 28 | 29 | #Valid operations: - #Click# id: Click on the element with the specified id. 30 | - #Hover# id: Hover on the element with the specified id. 31 | - #Scroll_up#: Scroll up 1 page. 32 | - #Scroll_down#: Scroll down 1 page. 33 | - #Type# id "text": Type in the text at the element with the specified id. 34 | 35 | #Current viewport position: %s 36 | 37 | #Previous Operation: %s 38 | 39 | #Task: %s 40 | 41 | Your output SHOULD be in the following format: 42 | #Thinking Process: {Your thinking process to complete the task, including detailed analysis. For example, I have completed xxx and need to do xxx, so I need to perform xxx operation on the element } 43 | #Operation: {Next operation to perform} 44 | """ 45 | 46 | miniwob_prompt_new_action_space = """ %s 47 | 48 | You are a helpful assistant that can assist with web navigation tasks. 49 | You are given a simplified html webpage and a task description. 50 | Your goal is to complete the task. You can use the provided functions below to interact with the current webpage. 51 | 52 | #Provided functions: 53 | def click(element_id: str) -> None: 54 | \"\"\" 55 | Click on the element with the specified id. 56 | 57 | Args: 58 | element_id: The id of the element. 59 | \"\"\" 60 | 61 | def hover(element_id: str) -> None: 62 | \"\"\" 63 | Hover on the element with the specified id. 64 | 65 | Args: 66 | element_id: The id of the element. 
67 | \"\"\" 68 | 69 | def select(element_id: str, option: str) -> None: 70 | \"\"\" 71 | Select an option from a dropdown. 72 | 73 | Args: 74 | element_id: The id of the element. 75 | option: Value of the option to select. 76 | \"\"\" 77 | 78 | def type_string(element_id: str, content: str, press_enter: bool) -> None: 79 | \"\"\" 80 | Type a string into the element with the specified id. 81 | 82 | Args: 83 | element_id: The id of the element. 84 | content: The string to type. 85 | press_enter: Whether to press enter after typing the string. 86 | \"\"\" 87 | 88 | def scroll_page(direction: Literal['up', 'down']) -> None: 89 | \"\"\" 90 | Scroll down/up one page. 91 | 92 | Args: 93 | direction: The direction to scroll. 94 | \"\"\" 95 | 96 | def go(direction: Literal['forward', 'backward']) -> None: 97 | \"\"\" 98 | Go forward/backward 99 | 100 | Args: 101 | direction: The direction to go to. 102 | \"\"\" 103 | 104 | def jump_to(url: str, new_tab: bool) -> None: 105 | \"\"\" 106 | Jump to the specified url. 107 | 108 | Args: 109 | url: The url to jump to. 110 | new_tab: Whether to open the url in a new tab. 111 | \"\"\" 112 | 113 | def switch_tab(tab_index: int) -> None: 114 | \"\"\" 115 | Switch to the specified tab. 116 | 117 | Args: 118 | tab_index: The index of the tab to switch to. 119 | \"\"\" 120 | 121 | def user_input(message: str) -> str: 122 | \"\"\" 123 | Wait for user input. 124 | 125 | Args: 126 | message: The message to display to the user. 127 | 128 | Returns: The user input. 129 | \"\"\" 130 | 131 | def finish(answer: Optional[str]) -> None: 132 | \"\"\" 133 | Finish the task (optionally with an answer). 134 | 135 | Args: 136 | answer: The answer to the task. 137 | \"\"\" 138 | 139 | #Previous commands: %s 140 | 141 | #Window tabs: 1. Default <-- current tab 142 | 143 | #Current viewport (pages): %s 144 | 145 | #Task: %s 146 | 147 | You should output one command to interact to the currrent webpage. 
148 | """ 149 | -------------------------------------------------------------------------------- /miniwob++/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | gymnasium==0.29.0 3 | lxml==4.9.3 4 | miniwob==1.0 5 | numpy==1.22.2 6 | openai==1.3.7 7 | Pillow==9.2.0 8 | Pillow==10.1.0 9 | Requests==2.31.0 10 | transformers==4.35.2 11 | -------------------------------------------------------------------------------- /miniwob++/setup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/miniwob++/setup.sh -------------------------------------------------------------------------------- /webarena/.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.10 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: 3.10.9 17 | - uses: pre-commit/action@v3.0.0 18 | -------------------------------------------------------------------------------- /webarena/.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Python Package Pytest 2 | on: [push] 3 | 4 | jobs: 5 | test-all: 6 | runs-on: ubuntu-latest 7 | env: 8 | SHOPPING: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7770" 9 | SHOPPING_ADMIN: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7780/admin" 10 | REDDIT: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:9999" 11 | GITLAB: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8023" 12 | MAP: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:3000" 13 | WIKIPEDIA: 
"http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" 14 | HOMEPAGE: "PASS" 15 | strategy: 16 | max-parallel: 5 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python 3.10 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: 3.10.9 23 | - name: Install dependencies 24 | run: | 25 | pip install -r requirements.txt 26 | playwright install 27 | python -m nltk.downloader punkt stopwords 28 | pip install -e .[dev] 29 | - name: Type-checking package with mypy 30 | run: | 31 | # Manually install mypy in the standard way. 32 | pip --quiet install -U mypy 33 | # Log this mypy version for debuggability. 34 | mypy --version 35 | # Run this mypy instance against our main package. 36 | mypy --install-types --non-interactive . 37 | mypy --strict . --exclude scripts 38 | - name: Enviroment prepare 39 | run: | 40 | bash prepare.sh 41 | - name: Test with pytest 42 | run: | 43 | pytest 44 | -------------------------------------------------------------------------------- /webarena/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # mac OS 132 | *.DS_Store 133 | 134 | .vscode 135 | *tmp* 136 | 137 | .auth/* 138 | 139 | # local debug 140 | run.sh 141 | 142 | # trajectory visualization 143 | render_cache/* 144 | cache/* 145 | 146 | # TMP IGNORE 147 | agent/prompts/jsons/* 148 | log_files/ 149 | config_files*/*0.json 150 | config_files*/*1.json 151 | config_files*/*2.json 152 | config_files*/*3.json 153 | config_files*/*4.json 154 | config_files*/*5.json 155 | config_files*/*6.json 156 | config_files*/*7.json 157 | config_files*/*8.json 158 | config_files*/*9.json 159 | config_files*/test.json 160 | 161 | # Our Trash 162 | output/ 163 | result/ 164 | result-*/ 165 | config_files/check.py 166 | config_files_backup/ 167 | get*.py 168 | parallel_run_*.sh 169 | 170 | check_correct_id.py 171 | rm_data.py -------------------------------------------------------------------------------- /webarena/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | args: ['--maxkb=10240'] 10 | - repo: https://github.com/psf/black 11 | rev: 22.12.0 12 | hooks: 13 | - id: black 14 | exclude: '^(agent/prompts/raw)' 15 | args: [--line-length=79] 16 | - repo: 
https://github.com/pycqa/isort 17 | rev: 5.12.0 18 | hooks: 19 | - id: isort 20 | args: ["--profile", "black", --line-length=72] 21 | - repo: https://github.com/kynan/nbstripout 22 | rev: 0.6.0 23 | hooks: 24 | - id: nbstripout 25 | -------------------------------------------------------------------------------- /webarena/CITATION.cff: -------------------------------------------------------------------------------- 1 | @article{zhou2023webarena, 2 | title={WebArena: A Realistic Web Environment for Building Autonomous Agents}, 3 | author={Zhou, Shuyan and Xu, Frank F and Zhu, Hao and Zhou, Xuhui and Lo, Robert and Sridhar, Abishek and Cheng, Xianyi and Bisk, Yonatan and Fried, Daniel and Alon, Uri and others}, 4 | journal={arXiv preprint arXiv:2307.13854}, 5 | year={2023} 6 | } 7 | -------------------------------------------------------------------------------- /webarena/README.md: -------------------------------------------------------------------------------- 1 | # Modified WebArena evaluation 2 | 3 | We modified the configuration in WebArena to add our unique simplification method to improve the speed of the evaluation. 4 | 5 | The following content is inherited from the WebArena repository, and we've only modified some of the test commands and prompt formats. 6 | 7 | ## Install 8 | 9 | ```bash 10 | # Python 3.10+ 11 | conda create -n webarena python=3.10; conda activate webarena 12 | pip install -r requirements.txt 13 | playwright install 14 | pip install -e . 15 | 16 | # optional, dev only 17 | pip install -e ".[dev]" 18 | mypy --install-types --non-interactive browser_env agents evaluation_harness 19 | pip install pre-commit 20 | pre-commit install 21 | ``` 22 | ## End-to-end Evaluation 23 | 24 | 1. Set up the standalone environment. 25 | Please check out [this page](environment_docker/README.md) for details. 26 | 27 | 2. Configure the URLs for each website.
28 | ```bash 29 | export SHOPPING="<your_shopping_site_domain>:7770" 30 | export SHOPPING_ADMIN="<your_shopping_admin_domain>:7780/admin" 31 | export REDDIT="<your_reddit_domain>:9999" 32 | export GITLAB="<your_gitlab_domain>:8023" 33 | export MAP="<your_map_domain>:3000" 34 | export WIKIPEDIA="<your_wikipedia_domain>:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" 35 | export HOMEPAGE="<your_homepage_domain>:4399" # this is a placeholder 36 | ``` 37 | 38 | > You are encouraged to update the environment variables in [github workflow](.github/workflows/tests.yml#L7) to ensure the correctness of unit tests 39 | 40 | 3. Generate config file for each test example 41 | ```bash 42 | python scripts/generate_test_data.py 43 | ``` 44 | You will see `*.json` files generated in [config_files](./config_files) folder. Each file contains the configuration for one test example. 45 | 46 | 4. Obtain the auto-login cookies for all websites 47 | ``` 48 | mkdir -p ./.auth 49 | python browser_env/auto_login.py 50 | ``` 51 | 5. export `OPENAI_API_KEY=your_key`, a valid OpenAI API key starts with `sk-` 52 | 53 | 6. Launch the evaluation 54 | ```bash 55 | python run.py \ 56 | --instruction_path agent/prompts/jsons/new_action_prompt.json \ # this is the reasoning agent prompt we used in the paper 57 | --model gpt-3.5-turbo \ 58 | --mode completion \ 59 | --observation_type html \ 60 | --action_set_tag id_html_nasc_tree \ 61 | --result_dir <your_result_dir> \ 62 | --test_start_idx 0 \ 63 | --test_end_idx 1 \ 64 | ``` 65 | This script will run the first example with GPT-3.5 reasoning agent. The trajectory will be saved in `<your_result_dir>/0.html` 66 | 67 | ## Develop Your Prompt-based Agent 68 | 1. Define the prompts. We provide two baseline agents whose corresponding prompts are listed [here](./agent/prompts/raw). Each prompt is a dictionary with the following keys: 69 | ```python 70 | prompt = { 71 | "intro": , 72 | "examples": [ 73 | ( 74 | example_1_observation, 75 | example_1_response 76 | ), 77 | ( 78 | example_2_observation, 79 | example_2_response 80 | ), 81 | ...
82 | ], 83 | "template": , 84 | "meta_data": { 85 | "observation": , 86 | "action_type": , 87 | "keywords": , 88 | "prompt_constructor": , 89 | "action_splitter": 90 | } 91 | } 92 | ``` 93 | 94 | 2. Implement the prompt constructor. An example prompt constructor using Chain-of-thought/ReAct style reasoning is [here](./agent/prompts/prompt_constructor.py#L184). The prompt constructor is a class with the following methods: 95 | * `construct`: construct the input feed to an LLM 96 | * `_extract_action`: given the generation from an LLM, how to extract the phrase that corresponds to the action -------------------------------------------------------------------------------- /webarena/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import ( 2 | Agent, 3 | PromptAgent, 4 | TeacherForcingAgent, 5 | construct_agent, 6 | ) 7 | 8 | __all__ = ["Agent", "TeacherForcingAgent", "PromptAgent", "construct_agent"] 9 | -------------------------------------------------------------------------------- /webarena/agent/prompts/README.md: -------------------------------------------------------------------------------- 1 | ## Naming of the prompt files 2 | `description.action_space.observation_space.json` 3 | -------------------------------------------------------------------------------- /webarena/agent/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | from .prompt_constructor import * 2 | -------------------------------------------------------------------------------- /webarena/agent/prompts/raw/new_action_prompt.py: -------------------------------------------------------------------------------- 1 | prompt = { 2 | "intro": "", 3 | "examples": [], 4 | "template": """ {html} 5 | 6 | You are a helpful assistant that can assist with web navigation tasks. 7 | You are given a simplified html webpage and a task description. 8 | Your goal is to complete the task. 
You can use the provided functions below to interact with the current webpage. 9 | 10 | #Provided functions: 11 | def click(element_id: str) -> None: 12 | \"\"\" 13 | Click on the element with the specified id. 14 | 15 | Args: 16 | element_id: The id of the element. 17 | \"\"\" 18 | 19 | def hover(element_id: str) -> None: 20 | \"\"\" 21 | Hover on the element with the specified id. 22 | 23 | Args: 24 | element_id: The id of the element. 25 | \"\"\" 26 | 27 | def select(element_id: str, option: str) -> None: 28 | \"\"\" 29 | Select an option from a dropdown. 30 | 31 | Args: 32 | element_id: The id of the element. 33 | option: Value of the option to select. 34 | \"\"\" 35 | 36 | def type_string(element_id: str, content: str, press_enter: bool) -> None: 37 | \"\"\" 38 | Type a string into the element with the specified id. 39 | 40 | Args: 41 | element_id: The id of the element. 42 | content: The string to type. 43 | press_enter: Whether to press enter after typing the string. 44 | \"\"\" 45 | 46 | def scroll_page(direction: Literal['up', 'down']) -> None: 47 | \"\"\" 48 | Scroll down/up one page. 49 | 50 | Args: 51 | direction: The direction to scroll. 52 | \"\"\" 53 | 54 | def go(direction: Literal['forward', 'backward']) -> None: 55 | \"\"\" 56 | Go forward/backward 57 | 58 | Args: 59 | direction: The direction to go to. 60 | \"\"\" 61 | 62 | def jump_to(url: str, new_tab: bool) -> None: 63 | \"\"\" 64 | Jump to the specified url. 65 | 66 | Args: 67 | url: The url to jump to. 68 | new_tab: Whether to open the url in a new tab. 69 | \"\"\" 70 | 71 | def switch_tab(tab_index: int) -> None: 72 | \"\"\" 73 | Switch to the specified tab. 74 | 75 | Args: 76 | tab_index: The index of the tab to switch to. 77 | \"\"\" 78 | 79 | def user_input(message: str) -> str: 80 | \"\"\" 81 | Wait for user input. 82 | 83 | Args: 84 | message: The message to display to the user. 85 | 86 | Returns: The user input. 
87 | \"\"\" 88 | 89 | def finish(answer: Optional[str]) -> None: 90 | \"\"\" 91 | Finish the task (optionally with an answer). 92 | 93 | Args: 94 | answer: The answer to the task. 95 | \"\"\" 96 | 97 | #Previous commands: {previous_action} 98 | 99 | #Window tabs: {tabs} 100 | 101 | #Current viewport (pages): {position} 102 | 103 | #Task: {objective} 104 | 105 | You should output one command to interact to the currrent webpage. 106 | You should add a brief comment to your command to explain your reasoning and thinking process. 107 | """, 108 | "finale": "", 109 | "meta_data": { 110 | "observation": "html", 111 | "action_type": "id_html_nasc_tree", 112 | "keywords": ["url", "html", "objective", "position", "previous_action", "tabs"], 113 | "prompt_constructor": "NewASPromptConstructor", 114 | "answer_phrase": "", 115 | "action_splitter": "#" 116 | }, 117 | } 118 | 119 | -------------------------------------------------------------------------------- /webarena/agent/prompts/raw/p_cot_id_actree_2s.py: -------------------------------------------------------------------------------- 1 | prompt = { 2 | "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. 3 | 4 | Here's the information you'll have: 5 | The user's objective: This is the task you're trying to complete. 6 | The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. 7 | The current web page's URL: This is the page you're currently navigating. 8 | The open tabs: These are the tabs you have open. 9 | The previous action: This is the action you just performed. It may be helpful to track your progress. 10 | 11 | The actions you can perform fall into several categories: 12 | 13 | Page Operation Actions: 14 | `click [id]`: This action clicks on an element with a specific id on the webpage. 
15 | `type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. 16 | `hover [id]`: Hover over an element with id. 17 | `press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 18 | `scroll [direction=down|up]`: Scroll the page up or down. 19 | 20 | Tab Management Actions: 21 | `new_tab`: Open a new, empty browser tab. 22 | `tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. 23 | `close_tab`: Close the currently active tab. 24 | 25 | URL Navigation Actions: 26 | `goto [url]`: Navigate to a specific URL. 27 | `go_back`: Navigate to the previously viewed page. 28 | `go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). 29 | 30 | Completion Action: 31 | `stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 32 | 33 | Homepage: 34 | If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. 35 | http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. 36 | 37 | To be successful, it is very important to follow the following rules: 38 | 1. You should only issue an action that is valid given the current observation 39 | 2. You should only issue one action at a time. 40 | 3. You should follow the examples to reason step by step and then issue the next action. 41 | 4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```". 42 | 5. 
Issue stop action when you think you have achieved the objective. Don't generate anything after stop.""", 43 | "examples": [ 44 | ( 45 | """OBSERVATION: 46 | [1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' 47 | [1749] StaticText '$279.49' 48 | [1757] button 'Add to Cart' 49 | [1760] button 'Add to Wish List' 50 | [1761] button 'Add to Compare' 51 | URL: http://onestopmarket.com/office-products/office-electronics.html 52 | OBJECTIVE: What is the price of HP Inkjet Fax Machine 53 | PREVIOUS ACTION: None""", 54 | "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```", 55 | ), 56 | ( 57 | """OBSERVATION: 58 | [164] textbox 'Search' focused: True required: False 59 | [171] button 'Go' 60 | [174] link 'Find directions between two points' 61 | [212] heading 'Search Results' 62 | [216] button 'Close' 63 | URL: http://openstreetmap.org 64 | OBJECTIVE: Show me the restaurants near CMU 65 | PREVIOUS ACTION: None""", 66 | "Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. 
In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```", 67 | ), 68 | ], 69 | "template": """OBSERVATION: 70 | {observation} 71 | URL: {url} 72 | OBJECTIVE: {objective} 73 | PREVIOUS ACTION: {previous_action}""", 74 | "meta_data": { 75 | "observation": "accessibility_tree", 76 | "action_type": "id_accessibility_tree", 77 | "keywords": ["url", "objective", "observation", "previous_action"], 78 | "prompt_constructor": "CoTPromptConstructor", 79 | "answer_phrase": "In summary, the next action I will perform is", 80 | "action_splitter": "```" 81 | }, 82 | } 83 | -------------------------------------------------------------------------------- /webarena/agent/prompts/raw/p_cot_id_actree_2s_no_na.py: -------------------------------------------------------------------------------- 1 | prompt = { 2 | "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. 3 | 4 | Here's the information you'll have: 5 | The user's objective: This is the task you're trying to complete. 6 | The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. 7 | The current web page's URL: This is the page you're currently navigating. 8 | The open tabs: These are the tabs you have open. 9 | The previous action: This is the action you just performed. It may be helpful to track your progress. 10 | 11 | The actions you can perform fall into several categories: 12 | 13 | Page Operation Actions: 14 | `click [id]`: This action clicks on an element with a specific id on the webpage. 15 | `type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. 16 | `hover [id]`: Hover over an element with id. 
17 | `press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 18 | `scroll [direction=down|up]`: Scroll the page up or down. 19 | 20 | Tab Management Actions: 21 | `new_tab`: Open a new, empty browser tab. 22 | `tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. 23 | `close_tab`: Close the currently active tab. 24 | 25 | URL Navigation Actions: 26 | `goto [url]`: Navigate to a specific URL. 27 | `go_back`: Navigate to the previously viewed page. 28 | `go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). 29 | 30 | Completion Action: 31 | `stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 32 | 33 | Homepage: 34 | If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. 35 | http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. 36 | 37 | To be successful, it is very important to follow the following rules: 38 | 1. You should only issue an action that is valid given the current observation 39 | 2. You should only issue one action at a time. 40 | 3. You should follow the examples to reason step by step and then issue the next action. 41 | 4. Generate the action in the correct format. Start with a "In summary, the next action I will perform is" phrase, followed by action inside ``````. For example, "In summary, the next action I will perform is ```click [1234]```". 42 | 5. Issue stop action when you think you have achieved the objective. 
Don't generate anything after stop.""", 43 | "examples": [ 44 | ( 45 | """OBSERVATION: 46 | [1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' 47 | [1749] StaticText '$279.49' 48 | [1757] button 'Add to Cart' 49 | [1760] button 'Add to Wish List' 50 | [1761] button 'Add to Compare' 51 | URL: http://onestopmarket.com/office-products/office-electronics.html 52 | OBJECTIVE: What is the price of HP Inkjet Fax Machine 53 | PREVIOUS ACTION: None""", 54 | "Let's think step-by-step. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I think I have achieved the objective. I will issue the stop action with the answer. In summary, the next action I will perform is ```stop [$279.49]```", 55 | ), 56 | ( 57 | """OBSERVATION: 58 | [164] textbox 'Search' focused: True required: False 59 | [171] button 'Go' 60 | [174] link 'Find directions between two points' 61 | [212] heading 'Search Results' 62 | [216] button 'Close' 63 | URL: http://openstreetmap.org 64 | OBJECTIVE: Show me the restaurants near CMU 65 | PREVIOUS ACTION: None""", 66 | "Let's think step-by-step. This page has a search box whose ID is [164]. According to the nominatim rule of openstreetmap, I can search for the restaurants near a location by \"restaurants near\". I can submit my typing by pressing the Enter afterwards. 
In summary, the next action I will perform is ```type [164] [restaurants near CMU] [1]```", 67 | ), 68 | ], 69 | "template": """OBSERVATION: 70 | {observation} 71 | URL: {url} 72 | OBJECTIVE: {objective} 73 | PREVIOUS ACTION: {previous_action}""", 74 | "meta_data": { 75 | "observation": "accessibility_tree", 76 | "action_type": "id_accessibility_tree", 77 | "keywords": ["url", "objective", "observation", "previous_action"], 78 | "prompt_constructor": "CoTPromptConstructor", 79 | "answer_phrase": "In summary, the next action I will perform is", 80 | "action_splitter": "```" 81 | }, 82 | } 83 | -------------------------------------------------------------------------------- /webarena/agent/prompts/raw/p_direct_id_actree_2s.py: -------------------------------------------------------------------------------- 1 | prompt = { 2 | "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. 3 | 4 | Here's the information you'll have: 5 | The user's objective: This is the task you're trying to complete. 6 | The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. 7 | The current web page's URL: This is the page you're currently navigating. 8 | The open tabs: These are the tabs you have open. 9 | The previous action: This is the action you just performed. It may be helpful to track your progress. 10 | 11 | The actions you can perform fall into several categories: 12 | 13 | Page Operation Actions: 14 | `click [id]`: This action clicks on an element with a specific id on the webpage. 15 | `type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. 16 | `hover [id]`: Hover over an element with id. 
17 | `press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 18 | `scroll [direction=down|up]`: Scroll the page up or down. 19 | 20 | Tab Management Actions: 21 | `new_tab`: Open a new, empty browser tab. 22 | `tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. 23 | `close_tab`: Close the currently active tab. 24 | 25 | URL Navigation Actions: 26 | `goto [url]`: Navigate to a specific URL. 27 | `go_back`: Navigate to the previously viewed page. 28 | `go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). 29 | 30 | Completion Action: 31 | `stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket. 32 | 33 | Homepage: 34 | If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. 35 | http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. 36 | 37 | To be successful, it is very important to follow the following rules: 38 | 1. You should only issue an action that is valid given the current observation 39 | 2. You should only issue one action at a time. 40 | 3. Generate the action in the correct format. Always put the action inside a pair of ```. For example, ```click [1234]```. 41 | 5. Issue stop action when you think you have achieved the objective. 
Don't generate anything after stop.""", 42 | "examples": [ 43 | ( 44 | """OBSERVATION: 45 | [1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' 46 | [1749] StaticText '$279.49' 47 | [1757] button 'Add to Cart' 48 | [1760] button 'Add to Wish List' 49 | [1761] button 'Add to Compare' 50 | URL: http://onestopmarket.com/office-products/office-electronics.html 51 | OBJECTIVE: What is the price of HP Inkjet Fax Machine 52 | PREVIOUS ACTION: None""", 53 | "```stop [$279.49]```", 54 | ), 55 | ( 56 | """OBSERVATION: 57 | [164] textbox 'Search' focused: True required: False 58 | [171] button 'Go' 59 | [174] link 'Find directions between two points' 60 | [212] heading 'Search Results' 61 | [216] button 'Close' 62 | URL: http://openstreetmap.org 63 | OBJECTIVE: Show me the restaurants near CMU 64 | PREVIOUS ACTION: None""", 65 | "```type [164] [restaurants near CMU] [1]```", 66 | ), 67 | ], 68 | "template": """OBSERVATION: 69 | {observation} 70 | URL: {url} 71 | OBJECTIVE: {objective} 72 | PREVIOUS ACTION: {previous_action}""", 73 | "meta_data": { 74 | "observation": "accessibility_tree", 75 | "action_type": "id_accessibility_tree", 76 | "keywords": ["url", "objective", "observation", "previous_action"], 77 | "prompt_constructor": "DirectPromptConstructor", 78 | "action_splitter": "```" 79 | }, 80 | } 81 | -------------------------------------------------------------------------------- /webarena/agent/prompts/raw/p_direct_id_actree_2s_no_na.py: -------------------------------------------------------------------------------- 1 | prompt = { 2 | "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue. 3 | 4 | Here's the information you'll have: 5 | The user's objective: This is the task you're trying to complete. 
6 | The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information. 7 | The current web page's URL: This is the page you're currently navigating. 8 | The open tabs: These are the tabs you have open. 9 | The previous action: This is the action you just performed. It may be helpful to track your progress. 10 | 11 | The actions you can perform fall into several categories: 12 | 13 | Page Operation Actions: 14 | `click [id]`: This action clicks on an element with a specific id on the webpage. 15 | `type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. 16 | `hover [id]`: Hover over an element with id. 17 | `press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 18 | `scroll [direction=down|up]`: Scroll the page up or down. 19 | 20 | Tab Management Actions: 21 | `new_tab`: Open a new, empty browser tab. 22 | `tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. 23 | `close_tab`: Close the currently active tab. 24 | 25 | URL Navigation Actions: 26 | `goto [url]`: Navigate to a specific URL. 27 | `go_back`: Navigate to the previously viewed page. 28 | `go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). 29 | 30 | Completion Action: 31 | `stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 32 | 33 | Homepage: 34 | If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. 35 | http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites. 36 | 37 | To be successful, it is very important to follow the following rules: 38 | 1. 
You should only issue an action that is valid given the current observation 39 | 2. You should only issue one action at a time. 40 | 4. Generate the action in the correct format, wrap the action inside ``````. For example, ```click [1234]```". 41 | 5. Issue stop action when you think you have achieved the objective.""", 42 | "examples": [ 43 | ( 44 | """OBSERVATION: 45 | [1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' 46 | [1749] StaticText '$279.49' 47 | [1757] button 'Add to Cart' 48 | [1760] button 'Add to Wish List' 49 | [1761] button 'Add to Compare' 50 | URL: http://onestopmarket.com/office-products/office-electronics.html 51 | OBJECTIVE: What is the price of HP Inkjet Fax Machine 52 | PREVIOUS ACTION: None""", 53 | "```stop [$279.49]```", 54 | ), 55 | ( 56 | """OBSERVATION: 57 | [164] textbox 'Search' focused: True required: False 58 | [171] button 'Go' 59 | [174] link 'Find directions between two points' 60 | [212] heading 'Search Results' 61 | [216] button 'Close' 62 | URL: http://openstreetmap.org 63 | OBJECTIVE: Show me the restaurants near CMU 64 | PREVIOUS ACTION: None""", 65 | "```type [164] [restaurants near CMU] [1]```", 66 | ), 67 | ], 68 | "template": """OBSERVATION: 69 | {observation} 70 | URL: {url} 71 | OBJECTIVE: {objective} 72 | PREVIOUS ACTION: {previous_action}""", 73 | "meta_data": { 74 | "observation": "accessibility_tree", 75 | "action_type": "id_accessibility_tree", 76 | "keywords": ["url", "objective", "observation", "previous_action"], 77 | "prompt_constructor": "CoTPromptConstructor", 78 | "answer_phrase": "In summary, the next action I will perform is", 79 | "action_splitter": "```" 80 | }, 81 | } 82 | -------------------------------------------------------------------------------- /webarena/agent/prompts/raw/p_direct_id_actree_3s_llama.py: -------------------------------------------------------------------------------- 1 | prompt = { 2 | "intro": """You are an autonomous intelligent agent tasked with navigating a web 
browser. The actions you can perform fall into several categories: 3 | 4 | Page Operation Actions: 5 | `click [id]`: This action clicks on an element with a specific id on the webpage. 6 | `type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0. 7 | `hover [id]`: Hover over an element with id. 8 | `press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v). 9 | `scroll [direction=down|up]`: Scroll the page up or down. 10 | 11 | Tab Management Actions: 12 | `new_tab`: Open a new, empty browser tab. 13 | `tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index. 14 | `close_tab`: Close the currently active tab. 15 | 16 | URL Navigation Actions: 17 | `goto [url]`: Navigate to a specific URL. 18 | `go_back`: Navigate to the previously viewed page. 19 | `go_forward`: Navigate to the next page (if a previous 'go_back' action was performed). 20 | 21 | Completion Action: 22 | `stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 23 | 24 | Homepage: 25 | If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit. 
26 | 27 | You can only issue one action at a time""", 28 | 29 | "examples": [ 30 | ( 31 | """Observation: 32 | [1744] link 'HP CB782A#ABA 640 Inkjet Fax Machine (Renewed)' 33 | [1749] StaticText '$279.49' 34 | [1757] button 'Add to Cart' 35 | [1760] button 'Add to Wish List' 36 | [1761] button 'Add to Compare' 37 | URL: http://onestopmarket.com/office-products/office-electronics.html 38 | Objective: What is the price of HP Inkjet Fax Machine 39 | Previous action: None""", 40 | "```stop [$279.49]```", 41 | ), 42 | ( 43 | """Observation: 44 | [164] textbox 'Search' focused: True required: False 45 | [171] button 'Go' 46 | [174] link 'Find directions between two points' 47 | [212] heading 'Search Results' 48 | [216] button 'Close' 49 | URL: http://openstreetmap.org 50 | Objective: Show me the restaurants near CMU 51 | Previous action: None""", 52 | "```type [164] [restaurants near CMU] [1]```", 53 | ), 54 | ( 55 | """Observation: 56 | [2036] button 'Sort by: New' hasPopup: menu expanded: False 57 | [587] link 'US Marine’s adoption of Afghan war orphan voided' 58 | [989] time 'March 30, 2023 at 15:03:48 AM UTC' 59 | [602] link 'York student uses AI chatbot to get parking fine revoked' 60 | [1025] time 'March 15, 2023 at 7:48:34 AM UTC' 61 | [617] link 'Loveland parents furious after teachers leave, communication lagged during school threat investigation' 62 | [1025] time 'March 2, 2023 at 3:46:01 AM UTC' 63 | URL: http://reddit.com/f/news/new 64 | Objective: Open the most recent post that was published prior to March 1st. 
65 | Previous action: None""", 66 | "```scroll [down]```", 67 | ) 68 | ], 69 | "template": """Observation: 70 | {observation} 71 | URL: {url} 72 | Objective: {objective} 73 | Previous action: {previous_action}""", 74 | "meta_data": { 75 | "observation": "accessibility_tree", 76 | "action_type": "id_accessibility_tree", 77 | "keywords": ["url", "objective", "observation", "previous_action"], 78 | "prompt_constructor": "DirectPromptConstructor", 79 | "answer_phrase": "In summary, the next action I will perform is", 80 | "action_splitter": "```", 81 | "force_prefix": "```" 82 | }, 83 | } 84 | -------------------------------------------------------------------------------- /webarena/agent/prompts/raw/test_prompt.py: -------------------------------------------------------------------------------- 1 | prompt = { 2 | "intro": "", 3 | "examples": [], 4 | "template": """ {html} 5 | 6 | You are a helpful assistant that can assist with web navigation tasks. 7 | You are given a simplified html webpage and a task description. 8 | Your goal is to complete the task. You can perform the specified operations below to interact with the webpage. 9 | 10 | #Valid operations: - #Click# id: Click on the element with the specified id 11 | - #Scroll_up#: Scroll up 1 page. 12 | - #Scroll_down#: Scroll down 1 page. 13 | - #Go_backward#: Go back to the previous page. 14 | - #Go_forward#: Go forward to the next page. 15 | - #Hover# id: Hover over the element with the specified id. 16 | - #Type# id "text": Type in the text at the element with the specified id. 17 | - #Select# id "option": Select the option at the element with the specified id. 18 | - #Record# "content": Mark content that is useful in answering the question. 19 | - #Answer# "text": output the text as the answer to the user. 20 | - #Exit#: Complete the task and exit the program. 
# use the current directory as the root
def run() -> None:
    """Dump every prompt module under agent/prompts/raw to a JSON file.

    Each ``agent/prompts/raw/<name>.py`` is imported as
    ``agent.prompts.raw.<name>`` and its module-level ``prompt`` object is
    serialised to ``agent/prompts/jsons/<name>.json``. All paths are
    relative, so the script must be started from the directory that
    contains ``agent/``.

    Python files are easier to edit than JSON, hence this conversion step.
    """
    raw_pattern = "agent/prompts/raw/*.py"
    json_dir = "agent/prompts/jsons"

    for source_path in glob.glob(raw_pattern):
        # The module name is the file name with ".py" removed.
        file_name = os.path.basename(source_path)
        base_name = file_name.replace(".py", "")
        prompt = importlib.import_module(
            f"agent.prompts.raw.{base_name}"
        ).prompt

        # The output folder is only created once a prompt was imported.
        os.makedirs(json_dir, exist_ok=True)
        target_path = f"{json_dir}/{base_name}.json"
        with open(target_path, "w+") as out_file:
            json.dump(prompt, out_file, indent=2)

    print("Done convert python files to json")


if __name__ == "__main__":
    run()
create_focus_and_click_action, 12 | create_focus_and_type_action, 13 | create_go_back_action, 14 | create_go_forward_action, 15 | create_goto_url_action, 16 | create_hover_action, 17 | create_id_based_action, 18 | create_key_press_action, 19 | create_keyboard_type_action, 20 | create_mouse_click_action, 21 | create_mouse_hover_action, 22 | create_new_tab_action, 23 | create_none_action, 24 | create_page_close_action, 25 | create_page_focus_action, 26 | create_playwright_action, 27 | create_random_action, 28 | create_scroll_action, 29 | create_select_option_action, 30 | create_stop_action, 31 | create_type_action, 32 | is_equivalent, 33 | ) 34 | from .async_envs import AsyncScriptBrowserEnv 35 | from .envs import ScriptBrowserEnv 36 | from .processors import ObservationMetadata 37 | from .trajectory import Trajectory 38 | from .utils import DetachedPage, StateInfo 39 | 40 | __all__ = [ 41 | "ScriptBrowserEnv", 42 | "AsyncScriptBrowserEnv", 43 | "DetachedPage", 44 | "StateInfo", 45 | "ObservationMetadata", 46 | "Action", 47 | "ActionTypes", 48 | "action2str", 49 | "create_random_action", 50 | "create_focus_and_click_action", 51 | "create_focus_and_type_action", 52 | "is_equivalent", 53 | "create_mouse_click_action", 54 | "create_mouse_hover_action", 55 | "create_none_action", 56 | "create_keyboard_type_action", 57 | "create_page_focus_action", 58 | "create_new_tab_action", 59 | "create_go_back_action", 60 | "create_go_forward_action", 61 | "create_goto_url_action", 62 | "create_page_close_action", 63 | "action2create_function", 64 | "create_playwright_action", 65 | "create_id_based_action", 66 | "create_scroll_action", 67 | "create_key_press_action", 68 | "create_check_action", 69 | "create_click_action", 70 | "create_type_action", 71 | "create_hover_action", 72 | "create_select_option_action", 73 | "create_stop_action", 74 | "ActionParsingError", 75 | "Trajectory", 76 | ] 77 | -------------------------------------------------------------------------------- 
class AsyncScriptBrowserEnv(Env[npt.NDArray[np.uint8], Action]):
    """Prototype asynchronous browser environment driven by Playwright.

    The long-term goal is a fully configurable browser environment with a
    wide range of structured and unstructured action/observation spaces.
    This prototype uses the action space returned by ``get_action_space()``
    and returns an RGBA screenshot of the page as the observation; the page
    URL and HTML content are reported through the ``info`` dict as a
    ``DetachedPage``.

    NOTE(review): the synchronous wrappers (``reset``/``step``/``close``)
    each call ``asyncio.run``, which starts a fresh event loop per call.
    Playwright's async objects are presumably bound to the loop that
    created them -- confirm before relying on the sync wrappers; awaiting
    the ``a*`` coroutines inside a single loop avoids the question.
    """

    def __init__(
        self,
        max_page_length: int = 2048,
        headless: bool = True,
        slow_mo: int = 0,
        timeout: int = 30000,
        viewport_size: ViewportSize | None = None,
    ):
        """Store the launch settings; no browser is started here.

        :param max_page_length: accepted for interface compatibility; it is
            not read anywhere in this class.
        :param headless: forwarded to ``chromium.launch``.
        :param slow_mo: forwarded to ``chromium.launch``.
        :param timeout: stored as ``self.timeout``; not otherwise used in
            this class.
        :param viewport_size: ``{"width": ..., "height": ...}`` of the
            browser viewport; defaults to 1280x720.
        """
        # Build the default per instance: a dict literal in the signature
        # would be one object shared (and mutable) across all instances.
        if viewport_size is None:
            viewport_size = {"width": 1280, "height": 720}

        self.observation_space = Box(
            0,
            255,
            (viewport_size["height"], viewport_size["width"], 4),
            np.uint8,
        )
        # TODO: make Space[Action] = ActionSpace
        self.action_space = get_action_space()  # type: ignore[assignment]
        self.headless = headless
        self.slow_mo = slow_mo
        self.reset_finished = False
        self.timeout = timeout
        self.viewport_size = viewport_size

    async def setup(self, config_file: Path | None = None) -> None:
        """Start Playwright, launch Chromium and open the first page.

        :param config_file: optional JSON file; its ``storage_state``,
            ``start_url`` and ``geolocation`` entries (all optional) are
            applied to the new browser context / page.
        """
        self.context_manager = async_playwright()
        self.playwright = await self.context_manager.__aenter__()
        self.browser = await self.playwright.chromium.launch(
            headless=self.headless, slow_mo=self.slow_mo
        )
        if config_file:
            with open(config_file, "r") as f:
                instance_config = json.load(f)
        else:
            instance_config = {}

        storage_state = instance_config.get("storage_state", None)
        start_url = instance_config.get("start_url", None)
        geolocation = instance_config.get("geolocation", None)

        self.context = await self.browser.new_context(
            viewport=self.viewport_size,
            storage_state=storage_state,
            geolocation=geolocation,
            device_scale_factor=1,
        )
        self.page = await self.context.new_page()
        if start_url:
            await self.page.goto(start_url)

    async def _shutdown(self) -> None:
        """Exit the Playwright context manager and mark the env as not ready."""
        await self.context_manager.__aexit__(None, None, None)
        # Cleared so that a later close/reset does not exit the same
        # context manager twice and step() refuses to use a dead page.
        self.reset_finished = False

    async def _observe(self) -> tuple[npt.NDArray[np.uint8], str]:
        """Return ``(screenshot, html_content)`` of the current page."""
        content = await self.page.content()
        screenshot = png_bytes_to_numpy(await self.page.screenshot())
        return screenshot, content

    async def areset(
        self,
        *,
        seed: int | None = None,
        options: dict[str, str] | None = None,
    ) -> tuple[npt.NDArray[np.uint8], dict[str, object]]:
        """
        Reset the environment.
        :param options: options for the environment. The options are:
            - config_file: path to a JSON instance config passed to
              ``setup``
        :raises ValueError: if ``config_file`` is given but does not exist.
        :return: ``(screenshot, {"page": DetachedPage(url, content)})``
        """
        super().reset(seed=seed, options=options)
        if self.reset_finished:
            await self._shutdown()
        if options is not None and "config_file" in options:
            config_file = Path(options["config_file"])
            if config_file.exists():
                await self.setup(config_file=config_file)
            else:
                raise ValueError(f"Config state {config_file} does not exist.")
        else:
            await self.setup()
        self.reset_finished = True
        screenshot, content = await self._observe()
        return (
            screenshot,
            {"page": DetachedPage(self.page.url, content)},
        )

    def reset(
        self,
        *,
        seed: int | None = None,
        options: dict[str, str] | None = None,
    ) -> tuple[npt.NDArray[np.uint8], dict[str, object]]:
        """Blocking wrapper around :meth:`areset`."""
        return asyncio.run(self.areset(seed=seed, options=options))

    async def aclose(self) -> None:
        """Shut Playwright down if a reset has started it; else do nothing."""
        if self.reset_finished:
            await self._shutdown()

    def close(self) -> None:
        """Blocking wrapper around :meth:`aclose`."""
        asyncio.run(self.aclose())

    async def astep(
        self, action: Action
    ) -> tuple[npt.NDArray[np.uint8], float, bool, bool, dict[str, object]]:
        """Execute ``action`` and return the resulting observation.

        A failing action does not raise: the reward is ``0.0`` and the
        error text is reported as ``info["fail_error"]``. ``terminated``
        and ``truncated`` are always ``False``.

        :raises RuntimeError: if called before a successful reset.
        """
        if not self.reset_finished:
            raise RuntimeError("Call reset first before calling step.")
        success = False
        fail_error = ""
        try:
            self.page = await aexecute_action(action, self.page, self.context)
            success = True
        except Exception as e:
            fail_error = str(e)

        try:
            screenshot, content = await self._observe()
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that task
            # cancellation and KeyboardInterrupt still propagate. The first
            # attempt can fail while the page is loading; wait and retry.
            await self.page.wait_for_load_state("load")
            screenshot, content = await self._observe()

        return (
            screenshot,
            float(success),
            False,
            False,
            {
                "page": DetachedPage(self.page.url, content),
                "fail_error": fail_error,
            },
        )

    def step(
        self, action: Action
    ) -> tuple[npt.NDArray[np.uint8], float, bool, bool, dict[str, object]]:
        """Blocking wrapper around :meth:`astep`."""
        return asyncio.run(self.astep(action))
def is_expired(
    storage_state: Path, url: str, keyword: str, url_exact: bool = True
) -> bool:
    """Test whether the cookie is expired.

    Opens ``url`` in a browser context restored from ``storage_state`` and
    decides from the landing page whether the login is still valid.

    :param storage_state: saved storage-state file; a missing file counts
        as expired.
    :param url: page to open.
    :param keyword: when non-empty, the login is valid iff this text occurs
        in the page content; the URL is then not inspected.
    :param url_exact: only used when ``keyword`` is empty. ``True`` requires
        the final URL to equal ``url``; ``False`` only requires ``url`` to
        be contained in it.
    :return: ``True`` when the stored login no longer works.
    """
    if not storage_state.exists():
        return True

    # ``with`` guarantees Playwright is stopped even if navigation fails;
    # the manual __enter__/__exit__ pair leaked the browser on errors.
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True, slow_mo=SLOW_MO)
        context = browser.new_context(storage_state=storage_state)
        page = context.new_page()
        page.goto(url)
        time.sleep(1)
        d_url = page.url
        content = page.content()

    if keyword:
        return keyword not in content
    if url_exact:
        return d_url != url
    return url not in d_url


def renew_comb(comb: list[str], auth_folder: str = "./.auth") -> None:
    """Log in to every site in ``comb`` and save the shared storage state.

    All logins happen in one browser context, so the saved state carries
    the sessions of every listed site. The state is written to
    ``{auth_folder}/{'.'.join(comb)}_state.json``.

    :param comb: site names; recognised values are ``"shopping"``,
        ``"reddit"``, ``"shopping_admin"`` and ``"gitlab"``. Other names
        are not logged in but still become part of the file name.
    :param auth_folder: folder that receives the storage-state file.
    """
    # ``with`` guarantees Playwright is stopped even if a login step fails.
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=HEADLESS)
        context = browser.new_context()
        page = context.new_page()

        if "shopping" in comb:
            username = ACCOUNTS["shopping"]["username"]
            password = ACCOUNTS["shopping"]["password"]
            page.goto(f"{SHOPPING}/customer/account/login/")
            page.get_by_label("Email", exact=True).fill(username)
            page.get_by_label("Password", exact=True).fill(password)
            page.get_by_role("button", name="Sign In").click()

        if "reddit" in comb:
            username = ACCOUNTS["reddit"]["username"]
            password = ACCOUNTS["reddit"]["password"]
            page.goto(f"{REDDIT}/login")
            page.get_by_label("Username").fill(username)
            page.get_by_label("Password").fill(password)
            page.get_by_role("button", name="Log in").click()

        if "shopping_admin" in comb:
            username = ACCOUNTS["shopping_admin"]["username"]
            password = ACCOUNTS["shopping_admin"]["password"]
            page.goto(f"{SHOPPING_ADMIN}")
            page.get_by_placeholder("user name").fill(username)
            page.get_by_placeholder("password").fill(password)
            page.get_by_role("button", name="Sign in").click()

        if "gitlab" in comb:
            username = ACCOUNTS["gitlab"]["username"]
            password = ACCOUNTS["gitlab"]["password"]
            page.goto(f"{GITLAB}/users/sign_in")
            page.get_by_test_id("username-field").click()
            page.get_by_test_id("username-field").fill(username)
            page.get_by_test_id("username-field").press("Tab")
            page.get_by_test_id("password-field").fill(password)
            page.get_by_test_id("sign-in-button").click()

        context.storage_state(
            path=f"{auth_folder}/{'.'.join(comb)}_state.json"
        )
def get_site_comb_from_filepath(file_path: str) -> list[str]:
    """Recover the site combination encoded in a storage-state file name.

    The directory part is dropped, everything after the last ``_`` (the
    ``state.json`` suffix written by ``renew_comb``) is cut off, and the
    remainder is split on ``.``.

    >>> get_site_comb_from_filepath("./.auth/gitlab.shopping_state.json")
    ['gitlab', 'shopping']
    """
    comb = os.path.basename(file_path).rsplit("_", 1)[0].split(".")
    return comb


def main(auth_folder: str = "./.auth") -> None:
    """Renew the login state of every site (pair) and verify the results.

    First logs in to each supported pair of ``SITES`` and to each single
    site in parallel, then re-opens every ``*.json`` state file found in
    ``auth_folder`` and checks each site it covers with ``is_expired``.

    :param auth_folder: folder holding the storage-state files.
    :raises AssertionError: if any stored login turns out to be expired.
    """
    pairs = list(combinations(SITES, 2))

    max_workers = 8
    renewals = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for pair in pairs:
            # TODO[shuyanzh] auth don't work on these two sites
            if "reddit" in pair and (
                "shopping" in pair or "shopping_admin" in pair
            ):
                continue
            comb = sorted(pair)
            renewals.append(
                (
                    comb,
                    executor.submit(
                        renew_comb, comb, auth_folder=auth_folder
                    ),
                )
            )

        for site in SITES:
            renewals.append(
                (
                    [site],
                    executor.submit(
                        renew_comb, [site], auth_folder=auth_folder
                    ),
                )
            )

    # An exception inside a worker is stored on its future; without this
    # loop a failed login would disappear without any trace.
    for comb, renewal in renewals:
        error = renewal.exception()
        if error is not None:
            print(f"Failed to renew {'.'.join(comb)}: {error!r}")

    checks = []
    cookie_files = list(glob.glob(f"{auth_folder}/*.json"))
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for c_file in cookie_files:
            for cur_site in get_site_comb_from_filepath(c_file):
                site_idx = SITES.index(cur_site)
                future = executor.submit(
                    is_expired,
                    Path(c_file),
                    URLS[site_idx],
                    KEYWORDS[site_idx],
                    EXACT_MATCH[site_idx],
                )
                # Keep the file next to its future: a file covering two
                # sites yields two futures, so the position in this list
                # does not match the position in ``cookie_files``.
                checks.append((c_file, future))

    for c_file, future in checks:
        # Explicit raise (same exception type as the former ``assert``) so
        # the check is not stripped when Python runs with -O.
        if future.result():
            raise AssertionError(f"Cookie {c_file} expired.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--site_list", nargs="+", default=[])
    parser.add_argument("--auth_folder", type=str, default="./.auth")
    args = parser.parse_args()
    # No site list, or "all", renews everything; --auth_folder is honoured
    # in both cases.
    if not args.site_list or "all" in args.site_list:
        main(auth_folder=args.auth_folder)
    else:
        renew_comb(args.site_list, auth_folder=args.auth_folder)
from typing import Literal, get_args

# Locator kinds that are accepted next to the ARIA roles below.
SPECIAL_LOCATORS = (
    "alt_text",
    "label",
    "placeholder",
)

# Every role name, followed by the three SPECIAL_LOCATORS entries.
RolesType = Literal[
    "alert",
    "alertdialog",
    "application",
    "article",
    "banner",
    "blockquote",
    "button",
    "caption",
    "cell",
    "checkbox",
    "code",
    "columnheader",
    "combobox",
    "complementary",
    "contentinfo",
    "definition",
    "deletion",
    "dialog",
    "directory",
    "document",
    "emphasis",
    "feed",
    "figure",
    "form",
    "generic",
    "grid",
    "gridcell",
    "group",
    "heading",
    "img",
    "insertion",
    "link",
    "list",
    "listbox",
    "listitem",
    "log",
    "main",
    "marquee",
    "math",
    "meter",
    "menu",
    "menubar",
    "menuitem",
    "menuitemcheckbox",
    "menuitemradio",
    "navigation",
    "none",
    "note",
    "option",
    "paragraph",
    "presentation",
    "progressbar",
    "radio",
    "radiogroup",
    "region",
    "row",
    "rowgroup",
    "rowheader",
    "scrollbar",
    "search",
    "searchbox",
    "separator",
    "slider",
    "spinbutton",
    "status",
    "strong",
    "subscript",
    "superscript",
    "switch",
    "tab",
    "table",
    "tablist",
    "tabpanel",
    "term",
    "textbox",
    "time",
    "timer",
    "toolbar",
    "tooltip",
    "tree",
    "treegrid",
    "treeitem",
    "alt_text",
    "label",
    "placeholder",
]

# The role names alone, in declaration order. Derived from RolesType so
# the tuple and the Literal cannot drift apart when a role is added.
ROLES: tuple[str, ...] = tuple(
    role for role in get_args(RolesType) if role not in SPECIAL_LOCATORS
)

# Code points 32..127: printable ASCII plus DEL.
ASCII_CHARSET = "".join(chr(x) for x in range(32, 128))
# Code points 129..109999.
FREQ_UNICODE_CHARSET = "".join(chr(x) for x in range(129, 110000))
UTTERANCE_MAX_LENGTH = 8192
ATTRIBUTE_MAX_LENGTH = 256
TEXT_MAX_LENGTH = 256
TYPING_MAX_LENGTH = 64
URL_MAX_LENGTH = 256
MAX_ELEMENT_INDEX_IN_VIEWPORT = 10
MAX_ELEMENT_ID = 1000
MAX_ANSWER_LENGTH = 512

MIN_REF = -1000000
MAX_REF = 1000000

WINDOW_WIDTH = 500
WINDOW_HEIGHT = 240
TASK_WIDTH = 160
TASK_HEIGHT = 210

FLIGHT_WINDOW_WIDTH = 600
FLIGHT_WINDOW_HEIGHT = 700
FLIGHT_TASK_WIDTH = 375
FLIGHT_TASK_HEIGHT = 667
MAX_PAGE_NUMBER = 10

SPECIAL_KEYS = (
    "Enter",
    "Tab",
    "Control",
    "Shift",
    "Meta",
    "Backspace",
    "Delete",
    "Escape",
    "ArrowUp",
    "ArrowDown",
    "ArrowLeft",
    "ArrowRight",
    "PageDown",
    "PageUp",
    "Meta+a",
)

# Lower-case key name -> capitalised key name.
SPECIAL_KEY_MAPPINGS = {
    "backquote": "Backquote",
    "minus": "Minus",
    "equal": "Equal",
    "backslash": "Backslash",
    "backspace": "Backspace",
    "meta": "Meta",
    "tab": "Tab",
    "delete": "Delete",
    "escape": "Escape",
    "arrowdown": "ArrowDown",
    "end": "End",
    "enter": "Enter",
    "home": "Home",
    "insert": "Insert",
    "pagedown": "PageDown",
    "pageup": "PageUp",
    # "arrowleft" was the only arrow key without an entry.
    "arrowleft": "ArrowLeft",
    "arrowright": "ArrowRight",
    "arrowup": "ArrowUp",
    "f1": "F1",
    "f2": "F2",
    "f3": "F3",
    "f4": "F4",
    "f5": "F5",
    "f6": "F6",
    "f7": "F7",
    "f8": "F8",
    "f9": "F9",
    "f10": "F10",
    "f11": "F11",
    "f12": "F12",
}

MAX_VANILLA_STR_LENGTH = 1000

PLAYWRIGHT_LOCATORS = (
    "get_by_role",
    "get_by_text",
    "get_by_label",
    "get_by_placeholder",
    "get_by_alt_text",
    "get_by_title",
    "get_by_test_id",
    "filter",
    "frame_locator",
    "locator",
)

# NOTE(review): Playwright's double-click method is spelled "dblclick";
# "dclick" is kept as found -- confirm against the code that consumes it.
PLAYWRIGHT_ACTIONS = (
    "fill",
    "check",
    "select_option",
    "click",
    "hover",
    "dclick",
    "type",
    "focus",
    "goto",
    "press",
    "scroll",
)

IGNORED_ACTREE_PROPERTIES = (
    "focusable",
    "editable",
    "readonly",
    "level",
    "settable",
    "multiline",
    "invalid",
)
"level", 292 | "settable", 293 | "multiline", 294 | "invalid", 295 | ) 296 | -------------------------------------------------------------------------------- /webarena/browser_env/env_config.py: -------------------------------------------------------------------------------- 1 | # websites domain 2 | import os 3 | 4 | REDDIT = os.environ.get("REDDIT", "") 5 | SHOPPING = os.environ.get("SHOPPING", "") 6 | SHOPPING_ADMIN = os.environ.get("SHOPPING_ADMIN", "") 7 | GITLAB = os.environ.get("GITLAB", "") 8 | WIKIPEDIA = os.environ.get("WIKIPEDIA", "") 9 | MAP = os.environ.get("MAP", "") 10 | HOMEPAGE = os.environ.get("HOMEPAGE", "") 11 | 12 | assert ( 13 | REDDIT 14 | and SHOPPING 15 | and SHOPPING_ADMIN 16 | and GITLAB 17 | and WIKIPEDIA 18 | and MAP 19 | and HOMEPAGE 20 | ), ( 21 | f"Please setup the URLs to each site. Current: \n" 22 | + f"Reddit: {REDDIT}\n" 23 | + f"Shopping: {SHOPPING}\n" 24 | + f"Shopping Admin: {SHOPPING_ADMIN}\n" 25 | + f"Gitlab: {GITLAB}\n" 26 | + f"Wikipedia: {WIKIPEDIA}\n" 27 | + f"Map: {MAP}\n" 28 | + f"Homepage: {HOMEPAGE}\n" 29 | ) 30 | 31 | 32 | ACCOUNTS = { 33 | "reddit": {"username": "MarvelsGrantMan136", "password": "test1234"}, 34 | "gitlab": {"username": "byteblaze", "password": "hello1234"}, 35 | "shopping": { 36 | "username": "emma.lopez@gmail.com", 37 | "password": "Password.123", 38 | }, 39 | "shopping_admin": {"username": "admin", "password": "admin1234"}, 40 | "shopping_site_admin": {"username": "admin", "password": "admin1234"}, 41 | } 42 | 43 | URL_MAPPINGS = { 44 | REDDIT: "http://reddit.com", 45 | SHOPPING: "http://onestopmarket.com", 46 | SHOPPING_ADMIN: "http://luma.com/admin", 47 | GITLAB: "http://gitlab.com", 48 | WIKIPEDIA: "http://wikipedia.org", 49 | MAP: "http://openstreetmap.org", 50 | HOMEPAGE: "http://homepage.com", 51 | } 52 | -------------------------------------------------------------------------------- /webarena/browser_env/html_tools/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .identifier import IdentifierTool 2 | from .prompt import HtmlPrompt 3 | from .html_parser import HtmlParser 4 | 5 | from .utils import print_html_object 6 | from .configs import basic_attrs, mind2web_keep_attrs -------------------------------------------------------------------------------- /webarena/browser_env/html_tools/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .html_prompt import prompts 2 | from .config import basic_attrs, mind2web_keep_attrs, miniwob_attrs 3 | from .config import config_meta -------------------------------------------------------------------------------- /webarena/browser_env/html_tools/configs/config.py: -------------------------------------------------------------------------------- 1 | basic_attrs = [ 2 | 'title', 3 | 'value', 4 | 'placeholder', 5 | 'selected', 6 | ] 7 | 8 | mind2web_keep_attrs = [ 9 | 'alt', 10 | 'aria_description', 11 | 'aria_label', 12 | 'aria_role', 13 | 'input_checked', 14 | 'input_value', 15 | 'label', 16 | 'name', 17 | 'option_selected', 18 | 'placeholder', 19 | 'role', 20 | 'text_value', 21 | 'title', 22 | 'type', 23 | 'value', 24 | ] 25 | 26 | miniwob_attrs = [ 27 | 'id', 28 | 'type', 29 | 'value', 30 | ] 31 | 32 | config_meta = """ 33 | ======= Configs ======= 34 | Columns: 35 | - id: {id_attr} 36 | - label: {label_attr} 37 | Position: {use_position} 38 | - window: {window_size} 39 | - rect_dict: {rect} 40 | Keep: 41 | - parents: {parent_chain} 42 | - attrs: {keep_attrs} 43 | - elems: {keep_elem} 44 | - obs_elem: {obs_elem} 45 | Generator: 46 | - prompt: {prompt_name} 47 | - label: {identifier_name} 48 | ======================== 49 | """ -------------------------------------------------------------------------------- /webarena/browser_env/html_tools/configs/html_prompt.py: -------------------------------------------------------------------------------- 1 | 
refine_prompt = { 2 | 'dom': '<{tag}{label}|{attr}{content}{subtree} >', 3 | 'label': '[{label}]', 4 | 'attr': '{attr}', 5 | 'attr_splitter': '; ', 6 | 'subtree_splitter': ' ', 7 | } 8 | 9 | xml_prompt = { 10 | 'dom': '<{tag}{label}{attr}>{content}{subtree} ', 11 | 'label': ' id="{label}"', 12 | 'attr': '{key}="{attr}"', 13 | 'attr_splitter': ' ', 14 | 'subtree_splitter': ' ', 15 | } 16 | 17 | prompts = { 18 | 'refine': refine_prompt, 19 | 'xml': xml_prompt, 20 | 'new_data': refine_prompt, 21 | } 22 | -------------------------------------------------------------------------------- /webarena/browser_env/html_tools/identifier.py: -------------------------------------------------------------------------------- 1 | import secrets 2 | 3 | class IdentifierTool: 4 | def __init__(self, method: str='order', existing_labels: dict[str]={}) -> None: 5 | self.methods = { 6 | 'order': self.get_identifier_in_order, 7 | 'random': self.get_random_identifier, 8 | } 9 | 10 | if method is None: 11 | method = 'order' 12 | 13 | self.func = self.methods.get(method, None) 14 | self.name = method 15 | if self.func is None: 16 | raise ValueError(f'Invalid method for identifier: {method}') 17 | 18 | self.reset(existing_labels) 19 | 20 | def reset(self, exists: dict[str]={}) -> None: 21 | self.identifier = -1 22 | self.exists = {} if exists is None else exists 23 | 24 | def get_identifier_in_order(self) -> str: 25 | def id2str(id: int) -> str: 26 | if id < 26: 27 | return chr(id + 65) 28 | id -= 26 29 | c0 = id // 676 30 | c1 = (id // 26) % 26 31 | c2 = id % 26 32 | label = f'{chr(c1 + 65)}{chr(c2 + 65)}' 33 | return label if c0 == 0 else f'{chr(c0 + 64)}{label}' 34 | 35 | self.identifier += 1 36 | label = id2str(self.identifier) 37 | 38 | while label in self.exists: 39 | self.identifier += 1 40 | label = id2str(self.identifier) 41 | 42 | self.exists[label] = True 43 | return label 44 | 45 | def get_random_identifier(self) -> str: 46 | secret_generator = secrets.SystemRandom() 47 | 48 | def 
get_random_label(n: int=2) -> str: 49 | tmp = '' 50 | for _ in range(n): 51 | tmp += chr(secret_generator.randint(65, 90)) 52 | return tmp 53 | 54 | wc = 3 if len(self.exists) > 280 else 2 55 | 56 | label = get_random_label(wc) 57 | while label in self.exists: 58 | label = get_random_label(wc) 59 | 60 | self.exists[label] = True 61 | return label 62 | 63 | def generate(self): 64 | return self.func() -------------------------------------------------------------------------------- /webarena/browser_env/html_tools/prompt.py: -------------------------------------------------------------------------------- 1 | from .configs import prompts 2 | 3 | class HtmlPrompt: 4 | def __init__(self, prompt: str='') -> None: 5 | prompt = self.extract(prompt, 'xml') 6 | if prompt not in prompts: 7 | raise Exception('Unknown prompt: ' + prompt) 8 | 9 | constructors = { 10 | 'refine': self.normal_prompt_constructor, 11 | 'xml': self.normal_prompt_constructor, 12 | 'new_data': self.new_data_prompt_constructor, 13 | } 14 | 15 | self.name = prompt 16 | self.prompt = prompts[prompt] 17 | self.constructor = constructors[prompt] 18 | 19 | @staticmethod 20 | def extract(data, default=''): 21 | return data if data is not None else default 22 | 23 | def subtree_constructor(self, subtree: list[str]=[]) -> str: 24 | return self.prompt['subtree_splitter'].join(subtree) 25 | 26 | def normal_prompt_constructor(self, tag: str='', label: str='', content: str='', subtree_str: str='', class_dict: dict[str]={}) -> str: 27 | def add_prefix(data, prefix): 28 | return prefix + data if len(data) > 0 else '' 29 | 30 | tag = self.extract(tag) 31 | label = self.extract(label) 32 | content = self.extract(content) 33 | subtree_str = self.extract(subtree_str, '') 34 | class_dict = self.extract(class_dict, {}) 35 | 36 | label_str = '' 37 | if len(label) > 0: 38 | label_str = self.prompt['label'].format(label=label) 39 | 40 | classes = [] 41 | values = set() 42 | for key, val in class_dict.items(): 43 | if val in 
values: 44 | continue 45 | values.add(val) 46 | classes.append(self.prompt['attr'].format(key=key, attr=val)) 47 | classes_str = self.prompt['attr_splitter'].join(classes) 48 | 49 | content_splitter = ' ' if len(classes_str) == 0 else self.prompt['attr_splitter'] 50 | classes_str = add_prefix(classes_str, ' ') 51 | content_str = add_prefix(content, content_splitter) 52 | subtree_str = add_prefix(subtree_str, ' ') 53 | 54 | return self.prompt['dom'].format(tag=tag, label=label_str, attr=classes_str, content=content_str, subtree=subtree_str) 55 | 56 | def new_data_prompt_constructor(self, tag: str='', label: str='', content: str='', subtree_str: str='', class_dict: dict[str]={}) -> str: 57 | def add_prefix(data, prefix): 58 | return prefix + data if len(data) > 0 else '' 59 | 60 | tag = self.extract(tag) 61 | label = self.extract(label) 62 | content = self.extract(content) 63 | subtree_str = self.extract(subtree_str, '') 64 | class_dict = self.extract(class_dict, {}) 65 | 66 | label_str = '' 67 | if len(label) > 0: 68 | label_str = self.prompt['label'].format(label=label) 69 | 70 | classes = [] 71 | values = set() 72 | 73 | message = [] 74 | for key, val in class_dict.items(): 75 | if val == '': 76 | message.append(key) 77 | continue 78 | if val in values: 79 | continue 80 | values.add(val) 81 | classes.append(self.prompt['attr'].format(key=key, attr=val)) 82 | 83 | if len(message) > 0: 84 | message_str = ' '.join(message) 85 | classes.append(self.prompt['attr'].format(key='message', attr=message_str)) 86 | 87 | classes_str = self.prompt['attr_splitter'].join(classes) 88 | 89 | content_splitter = ' ' if len(classes_str) == 0 else self.prompt['attr_splitter'] 90 | classes_str = add_prefix(classes_str, ' ') 91 | content_str = add_prefix(content, content_splitter) 92 | subtree_str = add_prefix(subtree_str, ' ') 93 | 94 | return self.prompt['dom'].format(tag=tag, label=label_str, attr=classes_str, content=content_str, subtree=subtree_str) 95 | 96 | def 
prompt_constructor(self, tag: str='', label: str='', content: str='', subtree_str: str='', class_dict: dict[str]={}) -> str: 97 | return self.constructor(tag, label, content, subtree_str, class_dict) -------------------------------------------------------------------------------- /webarena/browser_env/html_tools/utils.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | def get_xpath_top_down(element: html.HtmlElement, id_column: str='temp_id', label_column: str='temp_clickable_label', path: str='', order: int=0, 3 | in_svg: bool=False, temp_id: int=0) -> tuple[int, dict[str, str], dict[str]]: 4 | used_labels, i2xpath = {}, {} 5 | # path 6 | tag = element.tag.lower() 7 | in_svg = in_svg or (tag == 'svg') 8 | 9 | if not in_svg and 'id' in element.attrib: 10 | node_id = element.attrib['id'] 11 | path = f'//*[@id="{node_id}"]' 12 | else: 13 | suffix = f'[{order}]' if order > 0 else '' 14 | prefix = f'*[name()="{tag}"]' if in_svg else tag 15 | path = path + '/' + prefix + suffix 16 | 17 | # add temp id 18 | element.attrib[id_column] = str(temp_id) 19 | ori_label = element.attrib.get(label_column, '') 20 | if ori_label != '': 21 | used_labels[ori_label] = True 22 | 23 | bid = str(temp_id) 24 | i2xpath[bid] = path 25 | i2xpath[path] = bid 26 | i2xpath[f'xpath/{path}'] = bid 27 | i2xpath[f'xpath=/{path}'] = bid 28 | 29 | temp_id += 1 30 | 31 | # traverse node 32 | children = element.getchildren() 33 | tag_dict = {} 34 | id_list = [] 35 | for child in children: 36 | ctag = child.tag.lower() 37 | if ctag not in tag_dict: 38 | tag_dict[ctag] = 0 39 | tag_dict[ctag] += 1 40 | id_list.append(tag_dict[ctag]) 41 | 42 | for cid, child in zip(id_list, children): 43 | ctag = child.tag.lower() 44 | cod = cid if tag_dict[ctag] > 1 else 0 45 | temp_id, i2x, ulabels = get_xpath_top_down(child, id_column, label_column, path, cod, in_svg, temp_id) 46 | i2xpath.update(i2x) 47 | used_labels.update(ulabels) 48 | 49 | return 
temp_id, i2xpath, used_labels 50 | 51 | def print_html_object(obj: str='') -> str: 52 | tab_cnt = 0 53 | result, content, sep = '', '', '' 54 | last_is_left, last_is_right = False, False 55 | for ch in obj: 56 | if ch == '<': 57 | result += '\n' 58 | if len(content.strip()) > 0: 59 | result += sep + content.strip() + '\n' 60 | result += sep + '<' 61 | 62 | tab_cnt += 1 63 | sep = ' ' * tab_cnt 64 | 65 | content = '' 66 | last_is_right = False 67 | last_is_left = True 68 | elif ch == '>': 69 | if last_is_left: 70 | result += content 71 | else: 72 | if last_is_right: 73 | result += '\n' 74 | if len(content.strip()) > 0: 75 | result += sep + content.strip() + '\n' 76 | 77 | tab_cnt -= 1 78 | sep = ' ' * tab_cnt 79 | 80 | if not last_is_left: 81 | result += sep 82 | 83 | result += '>' 84 | content = '' 85 | 86 | last_is_right = True 87 | last_is_left = False 88 | else: 89 | content += ch 90 | 91 | return result 92 | 93 | def rect2tuple(rect: str) -> tuple[int, int, int, int]: 94 | if rect is None or type(rect) != type('str'): 95 | return None 96 | rect = rect.strip() 97 | if rect.count(',') != 3: 98 | return None 99 | rect = rect.split(',') 100 | rect = [float(r) for r in rect] 101 | return tuple(rect) -------------------------------------------------------------------------------- /webarena/browser_env/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/browser_env/py.typed -------------------------------------------------------------------------------- /webarena/browser_env/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | rootdir = Path(__file__).parent 4 | 5 | # marker, gpt-4v-act style 6 | with open(os.path.join(rootdir, 'local_marker.js'), 'r') as f: 7 | local_marker_script = f.read() 8 | 9 | with open(os.path.join(rootdir, 
'mix_marker.js'), 'r') as f: 10 | mix_marker_script = f.read() 11 | 12 | with open(os.path.join(rootdir, 'get_data.js'), 'r') as f: 13 | get_rect_script = f.read() 14 | 15 | # canva handler 16 | with open(os.path.join(rootdir, 'canva_handler.js'), 'r') as f: 17 | canva_handler_script = f.read() 18 | 19 | # draw label on page 20 | with open(os.path.join(rootdir, 'label_marker.js'), 'r') as f: 21 | label_marker_script = f.read() 22 | 23 | # get text from page 24 | with open(os.path.join(rootdir, 'get_text.js'), 'r') as f: 25 | get_text_script = f.read() 26 | 27 | # remove label draw on page 28 | remove_label_mark_script = """ 29 | () => { 30 | document.querySelectorAll(".our-dom-marker").forEach(item => { 31 | document.body.removeChild(item); 32 | }); 33 | } 34 | """ 35 | 36 | remove_id_script = """ 37 | () => { 38 | Array.from(document.getElementsByClassName('possible-clickable-element')).forEach((element) => { 39 | element.classList.remove('possible-clickable-element'); 40 | element.removeAttribute('data-testid'); 41 | }); 42 | } 43 | """ 44 | -------------------------------------------------------------------------------- /webarena/browser_env/scripts/canva_handler.js: -------------------------------------------------------------------------------- 1 | () => { 2 | var items = Array.prototype.slice.call( 3 | document.querySelectorAll("canvas") 4 | ); 5 | 6 | var vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0); 7 | var vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0); 8 | 9 | items = items.map(element => { 10 | // var img = element.toDataURL("image/png"); 11 | var bb = element.getClientRects(); 12 | var rect = { 13 | left: -1, 14 | top: -1, 15 | right: -1, 16 | bottom: -1, 17 | width: 0, 18 | height: 0, 19 | }; 20 | if (bb.length > 0) { 21 | bb = bb[0]; 22 | rect = { 23 | left: Math.max(0, bb.left), 24 | top: Math.max(0, bb.top), 25 | right: Math.min(vw, bb.right), 26 | bottom: Math.min(vh, 
// Collects the innermost elements matching `packet.selector`, clips their
// first client rect to the viewport and, when `packet.startIndex` is >= 0,
// tags each visible one with a letter label in its `data-testid` attribute.
// Returns [items, index], where `index` is the next unused label index.
(packet) => {
    // 0 -> "A", 25 -> "Z", 26 -> "AA", 27 -> "AB", ...
    function int2str(index) {
        var str = "";
        while (index >= 0) {
            str = String.fromCharCode(65 + index % 26) + str;
            index = Math.floor(index / 26) - 1;
        }
        return str;
    };

    // NOTE(review): `selector` and `index` are assigned without var/let/const,
    // so in non-strict code they become properties of the global object.
    // Other scripts evaluated in the same page may observe them - verify
    // before turning these into local declarations.
    selector = packet.selector
    index = packet.startIndex
    var items = Array.prototype.slice.call(
        document.querySelectorAll(selector)
    );

    // Viewport size used to clip rectangles.
    var vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0);
    var vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);

    // Drop every match that contains another match, keeping only the innermost ones.
    items = items.filter(
        x => !items.some(y => x.contains(y) && !(x == y))
    ).map(element => {
        var bb = element.getClientRects();
        var rect = {
            left: 0,
            top: 0,
            right: 0,
            bottom: 0,
            width: 0,
            height: 0
        };
        var keep = false;
        var text = "", id = -1;
        if (bb.length > 0) {
            // Only the first client rect is considered.
            bb = bb[0];
            rect = {
                left: Math.max(0, bb.left),
                top: Math.max(0, bb.top),
                right: Math.min(vw, bb.right),
                bottom: Math.min(vh, bb.bottom)
            };
            rect = {
                ...rect,
                width: rect.right - rect.left,
                height: rect.bottom - rect.top
            };
            // Either clipped dimension being positive is enough here.
            if (rect.width > 0 || rect.height > 0) {
                keep = true;
                if (index >= 0) {
                    id = int2str(index++);
                    element.setAttribute("data-testid", id);
                }
                var childNodes = element.childNodes;

                // Text of direct text-node children only, not of descendants.
                for (var i = 0; i < childNodes.length; i++) {
                    if (childNodes[i].nodeType == Node.TEXT_NODE) {
                        text += childNodes[i].textContent;
                    }
                }
            }
        }

        return {
            // NOTE(review): hard-coded `true`, so the local `keep` computed
            // above is never used and the trailing filter lets every item
            // through (including ones with id -1) - confirm this is intended.
            keep: true,
            id,
            rects: rect,
            tag: element.tagName.toLowerCase?.() || "",
            text,//: element.innerText?.trim().replace(/\s{2,}/g, " ") || ""
        };
    }).filter(x => x.keep);

    return [items, index];
}
// Draws a labelled double outline over the page for every item whose id is
// not "". Each item needs `rects` ({left, top, width, height} in viewport
// pixels) and `id` (the text shown in the label). Every element added
// carries the "our-dom-marker" class. Returns `items` unchanged.
(items) => {
    const borderColor = "#FFFF00";

    // Build one fixed-position, click-through outline box around `bbox`,
    // grown by `pad` pixels on every side.
    function createMarkerBox(bbox, pad, outline) {
        const box = document.createElement("div");
        box.classList.add("our-dom-marker");
        box.style.outline = outline;
        box.style.position = "fixed";
        box.style.left = bbox.left - pad + "px";
        box.style.top = bbox.top - pad + "px";
        box.style.width = bbox.width + 2 * pad + "px";
        box.style.height = bbox.height + 2 * pad + "px";
        box.style.pointerEvents = "none";
        box.style.boxSizing = "border-box";
        box.style.zIndex = 2147483647;
        return box;
    }

    items.filter(
        item => item.id != ""
    ).forEach((item) => {
        const bbox = item.rects;

        // Everything is declared locally. The previous version read a bare
        // `index` before assigning it (a ReferenceError unless some other
        // script had already leaked a global of that name) and created
        // `index`, `outerElement` and `innerElement` as implicit globals.
        const outerElement = createMarkerBox(bbox, 2, `2px dashed ${borderColor}`);
        const innerElement = createMarkerBox(bbox, 0, "2px dashed #222288");

        // Add floating label at the corner
        const label = document.createElement("span");
        // Sit 25px above the box, or less when the box is closer than that
        // to the top of the viewport.
        let topPosition = 25;
        if (bbox.top < 25) topPosition = bbox.top;
        label.textContent = item.id;
        label.style.position = "absolute";
        label.style.top = `-${topPosition}px`;
        label.style.left = "0px";
        label.style.background = borderColor;
        label.style.color = "black";
        label.style.padding = "2px 4px";
        label.style.fontSize = "16px";
        label.style.borderRadius = "2px";
        label.style.fontWeight = "bold";
        outerElement.appendChild(label);

        document.body.appendChild(outerElement);
        document.body.appendChild(innerElement);
    });
    return items;
}
bb.height / 2; 11 | var elAtCenter = document.elementFromPoint(center_x, center_y); 12 | 13 | return elAtCenter === element || element.contains(elAtCenter) 14 | }).map(bb => { 15 | const rect = { 16 | left: Math.max(0, bb.left), 17 | top: Math.max(0, bb.top), 18 | right: Math.min(vw, bb.right), 19 | bottom: Math.min(vh, bb.bottom) 20 | }; 21 | return { 22 | ...rect, 23 | width: rect.right - rect.left, 24 | height: rect.bottom - rect.top 25 | } 26 | }); 27 | 28 | var area = rects.reduce((acc, rect) => acc + rect.width * rect.height, 0); 29 | 30 | return { 31 | element: element, 32 | include: window.getComputedStyle(element).cursor == "pointer", 33 | area, 34 | rects, 35 | text: element.textContent.trim().replace(/\s{2,}/g, ' '), 36 | }; 37 | }).filter(item => 38 | item.include && (item.area >= 20) 39 | ) 40 | 41 | items = items.filter(x => !items.some(y => x.element.contains(y.element) && !(x == y))) 42 | 43 | items.forEach(item => { 44 | item.element.classList.add('possible-clickable-element'); 45 | }); 46 | } -------------------------------------------------------------------------------- /webarena/browser_env/trajectory.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from .actions import Action 4 | from .utils import StateInfo 5 | 6 | Trajectory = list[Union[StateInfo, Action]] 7 | -------------------------------------------------------------------------------- /webarena/browser_env/utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from io import BytesIO 3 | from typing import Any, Dict, TypedDict, Union 4 | 5 | import numpy as np 6 | import numpy.typing as npt 7 | from PIL import Image 8 | 9 | 10 | @dataclass 11 | class DetachedPage: 12 | url: str 13 | content: str # html 14 | 15 | 16 | def png_bytes_to_numpy(png: bytes) -> npt.NDArray[np.uint8]: 17 | """Convert png bytes to numpy array 18 | 19 | Example: 20 
#!/bin/zsh
# Check the cached result pages of one run for auto-login errors.
# Usage: check_errors.sh <result_folder>   (pages are read from cache/<result_folder>/*.html)

result_folder=$1
# Stop if the folder does not exist; otherwise the greps below would scan
# whatever directory the script was started from.
cd "cache/$result_folder" || exit 1

# report_errors <title> <marker text>
# Prints "<title>: <count>" followed by the comma-separated numbers taken from
# the names of the .html files that contain the marker text.
report_errors() {
    local title=$1 pattern=$2
    local errors c

    errors=$(grep -l -- "$pattern" *.html | sort -u | grep -o '[0-9]\+')
    # Piping an empty string through `wc -l` still counts one line, which
    # used to report "1" error when nothing matched.
    if [[ -z "$errors" ]]; then
        c=0
    else
        c=$(print -r -- "$errors" | wc -l | tr -d ' ')
    fi

    echo "$title: $c"
    echo $errors | tr '\n' ','
}

# check whether there is any auto-login errors
report_errors "Shopping total errors" "Creating an account has many benefits: check out faster"
echo '\n\n'

report_errors "Admin total errors" "Welcome, please sign in"
echo '\n\n'

report_errors "Gitlab errors" "Username or email"
echo '\n\n'

report_errors "Reddit errors" "Keep me logged in"
-------------------------------------------------------------------------------- 1 | { 2 | "sites": ["misc"], 3 | "task_id": 2, 4 | "require_login": false, 5 | "storage_state": null, 6 | "start_url": "https://russmaxdesign.github.io/exercise", 7 | "geolocation": null, 8 | "intent_template": "", 9 | "instantiation_dict": {}, 10 | "intent": "Check out the classification section", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["url_match"], 14 | "reference_answers": null, 15 | "reference_url": "https://russmaxdesign.github.io/exercise/#link-two", 16 | "program_html": [ 17 | { 18 | "url": "", 19 | "required_contents": [] 20 | } 21 | ] 22 | }, 23 | "reference_action_sequence": { 24 | "action_set_tag": "playwright", 25 | "action_sequence": [ 26 | "page.get_by_role(\"navigation\").get_by_role(\"link\", name=\"Classification\").click()", 27 | "page.stop(\"Wilson and Reade\")" 28 | ] 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /webarena/config_files/examples/3.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": ["misc"], 3 | "task_id": 3, 4 | "require_login": false, 5 | "storage_state": null, 6 | "start_url": "https://russmaxdesign.github.io/exercise", 7 | "geolocation": null, 8 | "intent_template": "", 9 | "instantiation_dict": {}, 10 | "intent": "Tell me who provide a collection of concise, detailed information for mammal classification in 2005", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["string_match"], 14 | "reference_answers": ["Wilson and Reader"], 15 | "reference_url": "", 16 | "program_html": [ 17 | { 18 | "url": "", 19 | "required_contents": [] 20 | } 21 | ] 22 | }, 23 | "reference_action_sequence": { 24 | "action_set_tag": "id_accessibility_tree", 25 | "action_sequence": [ 26 | "type [13] [xyz@gmail.com] [0]", 27 | "click [65]", 28 | "stop [Wilson and Reader]" 29 | ] 30 | } 31 | } 32 | 
-------------------------------------------------------------------------------- /webarena/config_files/examples/4.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": ["reddit"], 3 | "task_id": 4, 4 | "require_login": true, 5 | "storage_state": "./.auth/reddit_state.json", 6 | "start_url": "http://metis.lti.cs.cmu.edu:9999/", 7 | "geolocation": null, 8 | "intent_template": "list all subreddits in alphabetical order", 9 | "instantiation_dict": {}, 10 | "intent": "list all subreddits in alphabetical order", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["url_match"], 14 | "reference_answers": null, 15 | "reference_url": "http://metis.lti.cs.cmu.edu:9999/forums/all", 16 | "program_html": [ 17 | { 18 | "url": "", 19 | "required_contents": [] 20 | } 21 | ] 22 | }, 23 | "reference_action_sequence": { 24 | "action_set_tag": "playwright", 25 | "action_sequence": [ 26 | "page.get_by_role(\"link\", name=\"Forums\").click()", 27 | "page.get_by_role(\"link\", name=\"Alphabetical\").click()", 28 | "page.stop()" 29 | ] 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template 2 | 3 | app = Flask(__name__) 4 | 5 | 6 | @app.route("/") 7 | def index() -> str: 8 | return render_template("index.html") 9 | 10 | 11 | @app.route("/scratchpad.html") 12 | def scratchpad() -> str: 13 | return render_template("scratchpad.html") 14 | 15 | 16 | @app.route("/calculator.html") 17 | def calculator() -> str: 18 | return render_template("calculator.html") 19 | 20 | 21 | @app.route("/password.html") 22 | def password() -> str: 23 | return render_template("password.html") 24 | 25 | 26 | if __name__ == "__main__": 27 | app.run(host="0.0.0.0", port=4399) 28 | 
-------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/calculator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/calculator.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/cms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/cms.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/gitlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/gitlab.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/manual1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/manual1.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/manual2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/manual2.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/map.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/onestopshop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/onestopshop.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/password.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/password.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/reddit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/reddit.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/scratchpad.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/scratchpad.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/static/figures/wikipedia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/environment_docker/webarena-homepage/static/figures/wikipedia.png -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/templates/calculator.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Calculator 5 | 72 | 73 | 74 |
75 |

Calculator

76 |

Enter the expression and get the results

77 | 78 | 79 | 80 |
Result:
81 |
82 | 83 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Homepage 5 | 6 | 67 | 68 | 69 |
70 | 73 | 74 |
75 | Logo for OneStopShop 76 | 77 |

OneStopShop

78 |
79 |

An online shopping site

80 |
81 | 82 |
83 | Logo for CMS 84 | 85 |

Merchant Admin Portal

86 |
87 |

An admin portal to manage E-commerce business (u: admin, p: admin1234)

88 |
89 | 90 |
91 | Logo for Reddit 92 | 93 |

Reddit

94 |
95 |

A social news aggregation and discussion website

96 |
97 | 98 |
99 | Logo for Gitlab 100 | 101 |

Gitlab

102 |
103 |

a DevOps software

104 |
105 | 106 |
107 | Logo for Map 108 | 109 |

OpenStreetMap

110 |
111 |

North east US map

112 |
113 | 114 |
115 | Logo for Calculator 116 | 117 |

Calculator

118 |
119 |

A calculator

120 |
121 | 122 |
123 | Logo for Scratchpad 124 | 125 |

Scratchpad

126 |
127 |

A scratchpad for taking notes

128 |
129 | 130 |
131 | Logo for Wikipedia 132 | 133 |

Wikipedia

134 |
135 |

An online encyclopedia

136 |
137 | 138 |
139 | Logo for Gitlab Manual 140 | 141 |

Gitlab Documentation

142 |
143 |

Documentation for GitLab

144 |
145 | 146 |
147 | Logo for Admin Manual 148 | 149 |

Admin Portal Manual

150 |
151 |

Manual on using the admin portal

152 |
153 | 154 | 155 |
156 | 157 | 158 | -------------------------------------------------------------------------------- /webarena/environment_docker/webarena-homepage/templates/scratchpad.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Note Taking App 6 | 81 | 82 | 83 |
84 |

My Notes

85 |
86 | 87 |
88 |
89 |
90 | 91 | 92 |
93 |
94 | 95 |

History

96 | 97 |
98 | 99 |
100 |
101 | 102 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /webarena/evaluation_harness/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluators import * 2 | from .helper_functions import ( 3 | shopping_get_latest_order_url, 4 | shopping_get_sku_latest_review_author, 5 | shopping_get_sku_latest_review_rating, 6 | ) 7 | -------------------------------------------------------------------------------- /webarena/llms/__init__.py: -------------------------------------------------------------------------------- 1 | """This module is adapt from https://github.com/zeno-ml/zeno-build""" 2 | from .providers.hf_utils import generate_from_huggingface_completion 3 | from .providers.ours import call_pretrain_model 4 | from .providers.openai_utils import ( 5 | generate_from_openai_chat_completion, 6 | generate_from_openai_completion, 7 | ) 8 | from .utils import call_llm 9 | 10 | __all__ = [ 11 | "generate_from_openai_completion", 12 | "generate_from_openai_chat_completion", 13 | "generate_from_huggingface_completion", 14 | "call_llm", 15 | "call_pretrain_model" 16 | ] 17 | -------------------------------------------------------------------------------- /webarena/llms/lm_config.py: -------------------------------------------------------------------------------- 1 | """Config for language models.""" 2 | 3 | from __future__ import annotations 4 | 5 | import argparse 6 | import dataclasses 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | 11 | @dataclass(frozen=True) 12 | class LMConfig: 13 | """A config for a language model. 14 | 15 | Attributes: 16 | provider: The name of the API provider. 17 | model: The name of the model. 18 | model_cls: The Python class corresponding to the model, mostly for 19 | Hugging Face transformers. 20 | tokenizer_cls: The Python class corresponding to the tokenizer, mostly 21 | for Hugging Face transformers. 
22 | mode: The mode of the API calls, e.g., "chat" or "generation". 23 | """ 24 | 25 | provider: str 26 | model: str 27 | model_cls: type | None = None 28 | tokenizer_cls: type | None = None 29 | mode: str | None = None 30 | gen_config: dict[str, Any] = dataclasses.field(default_factory=dict) 31 | cuda: str = '0' 32 | 33 | 34 | def construct_llm_config(args: argparse.Namespace) -> LMConfig: 35 | llm_config = LMConfig( 36 | provider=args.provider, model=args.model, mode=args.mode, cuda=args.cuda 37 | ) 38 | if args.provider == "openai": 39 | llm_config.gen_config["temperature"] = args.temperature 40 | llm_config.gen_config["top_p"] = args.top_p 41 | llm_config.gen_config["context_length"] = args.context_length 42 | llm_config.gen_config["max_tokens"] = args.max_tokens 43 | llm_config.gen_config["stop_token"] = args.stop_token 44 | llm_config.gen_config["max_obs_length"] = args.max_obs_length 45 | llm_config.gen_config["max_retry"] = args.max_retry 46 | elif args.provider == "huggingface": 47 | llm_config.gen_config["temperature"] = args.temperature 48 | llm_config.gen_config["top_p"] = args.top_p 49 | llm_config.gen_config["max_new_tokens"] = args.max_tokens 50 | llm_config.gen_config["stop_sequences"] = ( 51 | [args.stop_token] if args.stop_token else None 52 | ) 53 | llm_config.gen_config["max_obs_length"] = args.max_obs_length 54 | llm_config.gen_config["model_endpoint"] = args.model_endpoint 55 | llm_config.gen_config["max_retry"] = args.max_retry 56 | elif args.provider == "ours": 57 | llm_config.gen_config["temperature"] = args.temperature 58 | llm_config.gen_config["top_p"] = args.top_p 59 | llm_config.gen_config["context_length"] = args.context_length 60 | llm_config.gen_config["max_tokens"] = args.max_tokens 61 | llm_config.gen_config["stop_token"] = args.stop_token 62 | llm_config.gen_config["max_obs_length"] = args.max_obs_length 63 | llm_config.gen_config["max_retry"] = args.max_retry 64 | llm_config.gen_config["cuda"] = args.cuda 65 | else: 66 | raise 
NotImplementedError(f"provider {args.provider} not implemented") 67 | return llm_config 68 | -------------------------------------------------------------------------------- /webarena/llms/providers/hf_utils.py: -------------------------------------------------------------------------------- 1 | from text_generation import Client # type: ignore 2 | 3 | 4 | def generate_from_huggingface_completion( 5 | prompt: str, 6 | model_endpoint: str, 7 | temperature: float, 8 | top_p: float, 9 | max_new_tokens: int, 10 | stop_sequences: list[str] | None = None, 11 | ) -> str: 12 | client = Client(model_endpoint, timeout=60) 13 | generation: str = client.generate( 14 | prompt=prompt, 15 | temperature=temperature, 16 | top_p=top_p, 17 | max_new_tokens=max_new_tokens, 18 | stop_sequences=stop_sequences, 19 | ).generated_text 20 | 21 | return generation 22 | -------------------------------------------------------------------------------- /webarena/llms/providers/ours.py: -------------------------------------------------------------------------------- 1 | def call_pretrain_model(query: str, model, tokenizer, cuda, sample_times: int=1): 2 | def chatglm3_base_template(query, history=None, system=None): 3 | prompt = f'Q: {query}\n\nA: ' 4 | return prompt 5 | 6 | def model_chat(prompt: str): 7 | output, updated_history = model.chat(tokenizer, prompt, history=None) 8 | return output 9 | 10 | def generation(prompt: str, sample_times: int=1): 11 | input_ids = tokenizer.encode( 12 | text=prompt, 13 | return_tensors='pt', 14 | max_length=8192, 15 | truncation=False 16 | ).to(f'cuda:{cuda}') 17 | 18 | if len(input_ids[0]) > 7500: 19 | return '' 20 | 21 | output_ids = model.generate( 22 | input_ids=input_ids, 23 | max_new_tokens=1024, 24 | do_sample=True, 25 | top_p=0.7, 26 | temperature=0.95, 27 | num_return_sequences=sample_times 28 | ) 29 | 30 | output_text_list = [] 31 | for i in range(sample_times): 32 | output_text = tokenizer.decode(output_ids[i], skip_special_tokens=True) 33 | 
class Tokenizer(object):
    """Provider-agnostic wrapper exposing `encode`, `decode` and call syntax.

    The concrete tokenizer is stored on `self.tokenizer`:
      * "openai": the tiktoken encoding for `model_name`.
      * "huggingface": a `LlamaTokenizer` loaded from `model_name`, with
        automatic special/BOS/EOS token insertion switched off.
      * "ours": the tiktoken encoding for "gpt-4"; `model_name` is ignored.
        NOTE(review): presumably only used for length budgeting, since the
        model's own tokenizer is loaded elsewhere -- confirm.
    """

    def __init__(self, provider: str, model_name: str) -> None:
        """Select the backing tokenizer for `provider`.

        Raises:
            NotImplementedError: If `provider` is not supported.
        """
        if provider == "openai":
            self.tokenizer = tiktoken.encoding_for_model(model_name)
        elif provider == "huggingface":
            self.tokenizer = LlamaTokenizer.from_pretrained(model_name)
            # turn off adding special tokens automatically
            self.tokenizer.add_special_tokens = False  # type: ignore[attr-defined]
            self.tokenizer.add_bos_token = False  # type: ignore[attr-defined]
            self.tokenizer.add_eos_token = False  # type: ignore[attr-defined]
        elif provider == "ours":
            self.tokenizer = tiktoken.encoding_for_model("gpt-4")
        else:
            # Name the offending provider, matching the message used when
            # building the LM config.
            raise NotImplementedError(f"provider {provider} not implemented")

    def encode(self, text: str) -> list[int]:
        """Return the token ids for `text`."""
        return self.tokenizer.encode(text)

    def decode(self, ids: list[int]) -> str:
        """Return the text for the token ids `ids`."""
        return self.tokenizer.decode(ids)

    def __call__(self, text: str) -> list[int]:
        """Alias for `encode`."""
        return self.encode(text)
def call_llm(
    lm_config: lm_config.LMConfig,
    prompt: APIInput,
) -> str:
    """Send `prompt` to the backend selected by `lm_config` and return its reply.

    Args:
        lm_config: Provider, model, mode and generation settings.
        prompt: A message list for OpenAI chat mode; a string for OpenAI
            completion mode and for the Hugging Face provider.

    Returns:
        The generated text (or, for the "ours" provider with model
        'manual', the line typed by the operator).

    Raises:
        ValueError: If the OpenAI provider is used with an unsupported mode.
        NotImplementedError: If the provider is not supported.
    """
    global model
    global tokenizer

    response: str

    if lm_config.provider == "openai":
        if lm_config.mode == "chat":
            assert isinstance(prompt, list)
            response = generate_from_openai_chat_completion(
                messages=prompt,
                model=lm_config.model,
                temperature=lm_config.gen_config["temperature"],
                top_p=lm_config.gen_config["top_p"],
                context_length=lm_config.gen_config["context_length"],
                max_tokens=lm_config.gen_config["max_tokens"],
                # NOTE(review): the configured stop_token is not forwarded in
                # chat mode -- confirm this is intentional.
                stop_token=None,
            )
        elif lm_config.mode == "completion":
            assert isinstance(prompt, str)
            response = generate_from_openai_completion(
                prompt=prompt,
                engine=lm_config.model,
                temperature=lm_config.gen_config["temperature"],
                max_tokens=lm_config.gen_config["max_tokens"],
                top_p=lm_config.gen_config["top_p"],
                stop_token=lm_config.gen_config["stop_token"],
            )
        else:
            raise ValueError(
                f"OpenAI models do not support mode {lm_config.mode}"
            )
    elif lm_config.provider == "huggingface":
        assert isinstance(prompt, str)
        response = generate_from_huggingface_completion(
            prompt=prompt,
            model_endpoint=lm_config.gen_config["model_endpoint"],
            temperature=lm_config.gen_config["temperature"],
            top_p=lm_config.gen_config["top_p"],
            stop_sequences=lm_config.gen_config["stop_sequences"],
            max_new_tokens=lm_config.gen_config["max_new_tokens"],
        )
    elif lm_config.provider == "ours":
        if lm_config.model == 'manual':
            # Manual mode: a human types the action instead of a model.
            response = input("Command > ")
        else:
            # The model and tokenizer are loaded once and cached in module
            # globals. Publish them together only after both loads succeeded,
            # so a failed tokenizer load cannot leave a cached model paired
            # with a missing tokenizer on the next call.
            if model is None or tokenizer is None:
                loaded_model = AutoModel.from_pretrained(
                    lm_config.model,
                    trust_remote_code=True,
                    device=f'cuda:{lm_config.cuda}',
                )
                loaded_tokenizer = AutoTokenizer.from_pretrained(
                    lm_config.model, trust_remote_code=True
                )
                loaded_model.eval()
                model, tokenizer = loaded_model, loaded_tokenizer
            response = call_pretrain_model(
                prompt, model, tokenizer, lm_config.cuda
            )
    else:
        raise NotImplementedError(
            f"Provider {lm_config.provider} not implemented"
        )

    return response
/webarena/media/v2_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AutoWebGLM/eb8524e55cc3fac44625496e895e4044cea81fba/webarena/media/v2_result.png -------------------------------------------------------------------------------- /webarena/minimal_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # type: ignore 3 | 4 | import json 5 | import os 6 | import re 7 | import subprocess 8 | import time 9 | 10 | SLEEP = 1.5 11 | # set the URLs of each website, we use the demo sites as an example 12 | os.environ[ 13 | "SHOPPING" 14 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7770" 15 | os.environ[ 16 | "SHOPPING_ADMIN" 17 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7780/admin" 18 | os.environ[ 19 | "REDDIT" 20 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:9999" 21 | os.environ[ 22 | "GITLAB" 23 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8023" 24 | os.environ[ 25 | "MAP" 26 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:3000" 27 | os.environ[ 28 | "WIKIPEDIA" 29 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" 30 | os.environ[ 31 | "HOMEPAGE" 32 | ] = "PASS" # The home page is not currently hosted in the demo site 33 | print("Done setting up URLs") 34 | 35 | # First, run `python scripts/generate_test_data.py` to generate the config files 36 | p = subprocess.run( 37 | ["python", "scripts/generate_test_data.py"], capture_output=True 38 | ) 39 | 40 | # It will generate individual config file for each test example in config_files 41 | assert os.path.exists("config_files/0.json") 42 | 43 | # Make sure the URLs in the config files are replaced properly 44 | with open("config_files/0.json", "r") as f: 45 | config = json.load(f) 46 | assert 
os.environ["SHOPPING_ADMIN"] in config["start_url"], ( 47 | os.environ["SHOPPING_ADMIN"], 48 | config["start_url"], 49 | ) 50 | 51 | print("Done generating config files with the correct URLs") 52 | 53 | # run bash prepare.sh to save all account cookies, this only needs to be done once 54 | subprocess.run(["bash", "prepare.sh"]) 55 | print("Done saving account cookies") 56 | 57 | # Init an environment 58 | from browser_env import ( 59 | Action, 60 | ActionTypes, 61 | ObservationMetadata, 62 | ScriptBrowserEnv, 63 | StateInfo, 64 | Trajectory, 65 | action2str, 66 | create_id_based_action, 67 | create_stop_action, 68 | ) 69 | from evaluation_harness.evaluators import evaluator_router 70 | 71 | # Init the environment 72 | env = ScriptBrowserEnv( 73 | headless=False, 74 | slow_mo=100, 75 | observation_type="accessibility_tree", 76 | current_viewport_only=True, 77 | viewport_size={"width": 1280, "height": 720}, 78 | ) 79 | 80 | # example 156 as an example 81 | config_file = "config_files/156.json" 82 | # maintain a trajectory 83 | trajectory: Trajectory = [] 84 | 85 | # set the environment for the current example 86 | obs, info = env.reset(options={"config_file": config_file}) 87 | actree_obs = obs["text"] 88 | print(actree_obs) 89 | 90 | # You should see some output like this: 91 | """ 92 | [4] RootWebArea 'Projects · Dashboard · GitLab' focused: True 93 | [12] link 'Skip to content' 94 | [28] link 'Dashboard' 95 | [2266] button '' hasPopup: menu expanded: False 96 | [63] textbox 'Search GitLab' required: False 97 | [61] generic 'Use the shortcut key / to start a search' 98 | [79] link 'Create new...' 
99 | [95] link 'Issues' 100 | [97] generic '13 assigned issues' 101 | [101] link 'Merge requests' 102 | [104] generic '8 merge requests'""" 103 | 104 | # save the state info to the trajectory 105 | state_info: StateInfo = {"observation": obs, "info": info} 106 | trajectory.append(state_info) 107 | 108 | # Now let's try to perform the action of clicking the "Merge request" link 109 | # As the element ID is dynamic each time, we use regex to match the element as the demo 110 | match = re.search(r"\[(\d+)\] link 'Merge requests'", actree_obs).group(1) 111 | # Create the action click [ELEMENT_ID] 112 | click_action = create_id_based_action(f"click [{match}]") 113 | # Add the action to the trajectory 114 | trajectory.append(click_action) 115 | 116 | # Step and get the new observation 117 | obs, _, terminated, _, info = env.step(click_action) 118 | # New observation 119 | actree_obs = obs["text"] 120 | print(actree_obs) 121 | time.sleep(SLEEP) 122 | 123 | state_info = {"observation": obs, "info": info} 124 | trajectory.append(state_info) 125 | 126 | # Next click "assign to you" 127 | match = re.search(r"\[(\d+)\] link 'Assigned to you", actree_obs).group(1) 128 | click_action = create_id_based_action(f"click [{match}]") 129 | trajectory.append(click_action) 130 | 131 | obs, _, terminated, _, info = env.step(click_action) 132 | actree_obs = obs["text"] 133 | print(actree_obs) 134 | time.sleep(SLEEP) 135 | state_info = {"observation": obs, "info": info} 136 | trajectory.append(state_info) 137 | 138 | # add a stop action to mark the end of the trajectory 139 | trajectory.append(create_stop_action("")) 140 | 141 | 142 | # Demo evaluation 143 | evaluator = evaluator_router(config_file) 144 | score = evaluator( 145 | trajectory=trajectory, 146 | config_file=config_file, 147 | page=env.page, 148 | client=env.get_page_client(env.page), 149 | ) 150 | 151 | # as we manually perform the task, the task should be judged as correct 152 | assert score == 1.0 153 | 
#!/bin/bash
# Run the WebArena evaluation in parallel: the test set is cut into slices,
# each slice runs in its own tmux pane on its own GPU, and failed examples
# are re-run until the checker script passes.
# Expects to be started inside a tmux session: it splits the current window
# and sends commands to panes 1-4.

# [TODO] change this
model=""
result_dir=""
cuda_list=(0 1 2 3)
SERVER=""
OPENAI_API_KEY=""

OPENAI_ORGANIZATION=""
CONDA_ENV_NAME="webarena"
instruction_path="agent/prompts/jsons/new_action_prompt.json"

# NOTE(review): MAP points at a fixed public host rather than ${SERVER} --
# confirm this is intentional.
ENV_VARIABLES="export SHOPPING='http://${SERVER}:7770';export SHOPPING_ADMIN='http://${SERVER}:7780/admin';export REDDIT='http://${SERVER}:9999';export GITLAB='http://${SERVER}:8023';export MAP='http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:3000';export WIKIPEDIA='http://${SERVER}:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing';export HOMEPAGE='http://${SERVER}:4399';export OPENAI_API_KEY=${OPENAI_API_KEY};export OPENAI_ORGANIZATION=${OPENAI_ORGANIZATION}"

# get the number of tmux panes
num_panes=$(tmux list-panes | wc -l)

# calculate how many panes need to be created
panes_to_create=$((5 - num_panes))

# array of tmux commands to create each pane
tmux_commands=(
    'tmux split-window -h'
    'tmux split-window -v'
    'tmux select-pane -t 0; tmux split-window -v'
    'tmux split-window -v'
    'tmux select-pane -t 3; tmux split-window -v'
)

# create panes up to 5
for ((i = 0; i < panes_to_create; i++)); do
    # eval is required: some entries chain two tmux commands with ';'
    eval "${tmux_commands[$i]}"
done

# Run one job in a tmux pane.
#   $1  tmux pane index
#   $2  first test index (--test_start_idx)
#   $3  end test index (--test_end_idx)
#   $4  CUDA device id for this job
run_job() {
    local pane=$1 start_idx=$2 end_idx=$3 cuda=$4
    tmux select-pane -t "${pane}"
    # The GPU comes from cuda_list ($4), not from the pane index: panes are
    # numbered 1-4 while the configured devices are 0-3.
    # The job is retried in a loop until run.py exits successfully.
    tmux send-keys "conda activate ${CONDA_ENV_NAME}; ${ENV_VARIABLES}; until python run.py --test_start_idx ${start_idx} --test_end_idx ${end_idx} --provider ours --mode completion --observation_type html --action_set_tag id_html_nasc_tree --model ${model} --instruction_path ${instruction_path} --result_dir ${result_dir} --cuda ${cuda} --sample 1; do echo 'crashed' >&2; sleep 1; done" C-m
    sleep 3
}

# Start one job per consecutive pair of slice boundaries given as arguments
# (e.g. "0 203 406" starts jobs for [0, 203) and [203, 406) in panes 1 and 2).
start_jobs() {
    local bounds=("$@")
    local i
    for ((i = 1; i < ${#bounds[@]}; i++)); do
        run_job "${i}" "${bounds[i-1]}" "${bounds[i]}" "${cuda_list[i-1]}"
    done
}

# Block until no pane of the current window is running python any more.
wait_for_jobs() {
    while tmux list-panes -F "#{pane_pid} #{pane_current_command}" | grep -q python; do
        sleep 100 # poll every 100 seconds
    done
}

TOLERANCE=2
# Run all slices, then keep re-running them until the checker reports success.
run_batch() {
    start_jobs "$@"
    wait_for_jobs

    # Run checker
    while ! python scripts/check_error_runs.py "${result_dir}" --delete_errors --tolerance "${TOLERANCE}"; do
        echo "Check failed, rerunning jobs..."
        start_jobs "$@"
        wait_for_jobs
    done
}

run_batch 0 203 406 609 812
python get_result.py "${result_dir}"
-------------------------------------------------------------------------------- 1 | # WebArena Resources 2 | ## [12/21/2023] Human Trajectories 3 | We collected human trajectories on 179 tasks and the recording files are [here](https://drive.google.com/drive/folders/1NrN_sawtYK2V_uHnmmS8ugmGIKUAsPgt?usp=sharing). 4 | 5 | We sampled one task from each template, or from each group of templates that share similar task semantics. Each file is named `{task_id}.zip`, and the corresponding template id can be found in the [task config file](../config_files/test.raw.json). The trajectories are presented as playwright trace files. You can view the concrete HTML, network traffic, etc. with `playwright show-trace {task_id}.zip`. 6 | 7 | Human task success rate: 78.24% 8 | 9 | 10 | ## [11/3/2023] Execution Traces from Our Experiments (v2) 11 | ![v2 results](../media/v2_result.png) 12 | The results on the release v2 can be found in this [folder](https://drive.google.com/drive/folders/1H4wkzDkY2ufiC63DISMXllri0j-ipWcs?usp=sharing). It contains: 13 | * text-bison-001 + CoT + UA Hint 14 | * GPT3.5-turbo-0613-16k + Direct + UA Hint 15 | * GPT3.5-turbo-0613-16k + Direct 16 | * GPT3.5-turbo-0613-16k + CoT + UA Hint 17 | * GPT3.5-turbo-0613-16k + CoT 18 | * GPT4-0613 + CoT 19 | 20 | ## [8/7/2023] Execution Traces from Our Experiments (v1) 21 | ![v1 results](../media/v1_result.png) 22 | The results on the release v1 can be found in this [folder](https://drive.google.com/drive/folders/18Oww0fAgwhuSjSzxUNgzBUlC6M9IZZB2?usp=sharing). It contains: 23 | * GPT4-0613 + CoT 24 | * GPT3.5-turbo-0613 + CoT 25 | * GPT3.5-turbo-0613 + Direct 26 | 27 | 28 | Once you unzip the file with `unzip {file_name}.zip`, you will see a list of `render_*.html` files, a log file `merge_log.txt` recording whether each example failed or passed, and a `trace` folder containing the `playwright` recordings of the executions. 
29 | 30 | ### render_*.html 31 | Each file renders the execution trace of the corresponding example with (1) the accessibility tree observations, (2) the raw prediction from the agent and (3) the parsed action. We also provide the corresponding screenshot of each observation. 32 | 33 | To extract specific information from the HTML, you could use the following code snippet: 34 | ```python 35 | from bs4 import BeautifulSoup 36 | with open("render_.html", 'r') as f: 37 | content = f.read() 38 | soup = BeautifulSoup(content, 'html.parser') 39 | # get the observations 40 | observations = soup.find_all("div", {"class": "state_obv"}) 41 | # urls 42 | urls = soup.find_all("h3", {"class": "url"}) 43 | # get the raw predictions (e.g., let's think step-by-step ....) 44 | raw_predictions = soup.find_all("div", {"class": "raw_parsed_prediction"}) 45 | # get the action object 46 | actions = soup.find_all("div", {"class": "action_object"}) 47 | ``` 48 | ### trace/*.zip 49 | The zip files are generated automatically with [playwright](https://playwright.dev/python/docs/trace-viewer). You can view the concrete HTML, network traffic, etc. with `playwright show-trace {task_id}.zip`. You will see something like this: 50 | ![example_trace_viewer](../media/example_trace_viewer.png) 51 | -------------------------------------------------------------------------------- /webarena/scripts/check_error_runs.py: -------------------------------------------------------------------------------- 1 | """Some executions may failed. 2 | This script checks the recordings, print the task ids. 
def merge_logs(
    result_folder: str, args: argparse.Namespace, num_tasks: int = 812
) -> str:
    """Merge the run logs of ``result_folder`` into one file ordered by task id.

    ``{result_folder}/log_files.txt`` lists one log path per line. Every log
    is split into per-task sections at each line containing
    ``[Config file]``; the task id is the file stem of the path on that line.
    A section is kept only when it has at least three lines and
    ``render_{id}.html`` exists in ``result_folder``. A later section for the
    same id replaces an earlier one.

    Render files that exist but have no kept section ("unlogged" examples)
    are printed, then deleted when ``args.delete_errors`` is set or the user
    answers ``y`` at the prompt. Task ids without a kept section are printed
    as unfinished.

    Args:
        result_folder: Folder holding ``log_files.txt`` and ``render_*.html``.
        args: Parsed command-line arguments; only ``delete_errors`` is read.
        num_tasks: Number of task ids (``0 .. num_tasks - 1``) to scan.

    Returns:
        Path of the merged log, ``{result_folder}/tmp_merged_log.txt``.

    Raises:
        SystemExit: With status 1 when ``log_files.txt`` does not exist.
    """
    log_list_path = f"{result_folder}/log_files.txt"
    if not os.path.exists(log_list_path):
        # Say why we are exiting instead of failing silently.
        print(f"Cannot find {log_list_path}", file=sys.stderr)
        sys.exit(1)

    with open(log_list_path, "r") as f:
        # Skip blank lines (e.g. a trailing newline) so we never open("").
        log_files = [name.strip() for name in f if name.strip()]

    merged_results: dict[str, list[str]] = {}

    def keep_if_complete(index: str, section: list[str]) -> None:
        """Store ``section`` when it is long enough and has a render file."""
        if (
            index
            and len(section) >= 3
            and os.path.exists(f"{result_folder}/render_{index}.html")
        ):
            merged_results[index] = section

    for log_file in log_files:
        with open(log_file, "r") as f:
            lines = f.readlines()

        cur_log: list[str] = []
        index = ""
        for line in lines:
            if "[Config file]" in line:
                # A new task starts: flush the previous one first.
                keep_if_complete(index, cur_log)
                index = line.split("/")[-1].split(".")[0]
                cur_log = [line]
            else:
                cur_log.append(line)
        # Flush the last section of this log.
        keep_if_complete(index, cur_log)

    # Sort numerically by task id.
    merged_results = dict(
        sorted(merged_results.items(), key=lambda item: int(item[0]))
    )

    merged_log_path = f"{result_folder}/tmp_merged_log.txt"
    with open(merged_log_path, "w") as f:
        for section in merged_results.values():
            f.writelines(section)
    print(f"Number of examples: {len(merged_results)}")

    unlog_examples = [
        i
        for i in range(num_tasks)
        if os.path.exists(f"{result_folder}/render_{i}.html")
        and str(i) not in merged_results
    ]
    print(f"Number of unlogged examples: {len(unlog_examples)}")
    print(unlog_examples)
    # Only ask when there is something to delete.
    if unlog_examples and (
        args.delete_errors
        or input("Do you want to delete these examples? (y/n)") == "y"
    ):
        for idx in unlog_examples:
            # Delete from the same folder the existence check used.
            os.remove(f"{result_folder}/render_{idx}.html")

    unfinished_examples = [
        i for i in range(num_tasks) if str(i) not in merged_results
    ]
    print(f"Number of unfinished examples: {len(unfinished_examples)}")
    print(unfinished_examples)

    return merged_log_path
def check_unexpected_logout(args: argparse.Namespace) -> int:
    """Find render files that show a login page and optionally delete them.

    Every ``render_*.html`` in ``args.result_folder`` is searched for a few
    phrases that appear on sign-in pages. The ids of matching files are
    printed in ascending order. The files are deleted when
    ``args.delete_errors`` is set or the user answers ``y`` at the prompt;
    no prompt is shown when nothing matched.

    Args:
        args: Parsed command-line arguments; ``result_folder`` and
            ``delete_errors`` are read.

    Returns:
        The number of render files that matched.
    """
    login_page_markers = (
        "Creating an account has many benefits: check out faster",
        "Welcome, please sign in",
        "Username or email",
        "Keep me logged in",
    )

    error_examples = []
    for render_file in glob.glob(f"{args.result_folder}/render_*.html"):
        with open(render_file, "r") as f:
            contents = f.read()
        if any(marker in contents for marker in login_page_markers):
            # basename() handles whichever path separator glob returned.
            stem = os.path.basename(render_file).split(".")[0]
            error_examples.append(int(stem.split("_")[-1]))
    # glob order is arbitrary; sort so the report is reproducible.
    error_examples.sort()

    print(f"Number of unexpected logout: {len(error_examples)}")
    print(error_examples)
    num_errors = len(error_examples)
    # Only ask when there is something to delete.
    if error_examples and (
        args.delete_errors
        or input("Do you want to delete these examples? (y/n)") == "y"
    ):
        for idx in error_examples:
            render_path = f"{args.result_folder}/render_{idx}.html"
            if os.path.exists(render_path):
                os.remove(render_path)

    return num_errors
def main() -> None:
    """Fill in the site placeholders of the raw task file and split it up.

    Reads ``config_files/test.raw.json``, substitutes each ``__SITE__``
    placeholder with the matching value from ``browser_env.env_config``,
    writes the result to ``config_files/test.json`` and then writes every
    task to its own ``config_files/{idx}.json``.
    """
    # Applied in this order, one after another.
    substitutions = (
        ("__GITLAB__", GITLAB),
        ("__REDDIT__", REDDIT),
        ("__SHOPPING__", SHOPPING),
        ("__SHOPPING_ADMIN__", SHOPPING_ADMIN),
        ("__WIKIPEDIA__", WIKIPEDIA),
        ("__MAP__", MAP),
    )

    with open("config_files/test.raw.json", "r") as src:
        raw = src.read()
    for placeholder, site_url in substitutions:
        raw = raw.replace(placeholder, site_url)

    with open("config_files/test.json", "w") as dst:
        dst.write(raw)

    # One file per task, named after the task's position in the list.
    for idx, item in enumerate(json.loads(raw)):
        with open(f"config_files/{idx}.json", "w") as task_file:
            json.dump(item, task_file, indent=2)
def main(result_folder: str, config_json: str) -> None:
    """Convert the rendered trajectories of one run into a JSON dataset.

    The task configs in ``config_json`` are trimmed to the fields kept in the
    dataset, ``{result_folder}/merged_log.txt`` supplies pass/fail per task,
    and every ``render_*.html`` in ``result_folder`` is parsed into a list of
    user/assistant messages. Screenshots embedded in the HTML are decoded and
    written to ``images/<run name>/``. The result is written to
    ``{result_folder}/json_dump.json``.

    A render file that fails to parse is reported and skipped; the remaining
    files are still converted.

    Args:
        result_folder: Folder holding ``merged_log.txt`` and ``render_*.html``.
        config_json: Path of the JSON file with the list of task configs.
    """
    all_data = {}
    # Each unseen intent template gets the next consecutive integer id.
    template_to_id: dict[str, Any] = defaultdict(lambda: len(template_to_id))

    with open(config_json, "r") as config_file:
        raw_configs = json.load(config_file)
    data_configs = {int(item["task_id"]): item for item in raw_configs}
    for v in data_configs.values():
        for key in (
            "require_login",
            "storage_state",
            "start_url",
            "geolocation",
            "require_reset",
            "intent_template_id",
        ):
            v.pop(key)
        v["intent_template_id"] = template_to_id[v["intent_template"]]
        v["eval_types"] = v["eval"].pop("eval_types")
        if v["eval"]["reference_answers"]:
            v["reference_answers"] = v["eval"].pop("reference_answers")
        if v["eval"]["reference_url"]:
            v["reference_url"] = v["eval"].pop("reference_url")
        v.pop("eval")
        v["achievable"] = (
            v.get("reference_answers", {}).get("exact_match", "") != "N/A"
        )

    results = {}
    with open(f"{result_folder}/merged_log.txt", "r") as log_file:
        for line in log_file:
            if "[Result]" in line:
                # The line ends with ".../<task_id>.json".
                logged_id = line.strip().split(".")[-2].split("/")[-1]
                results[int(logged_id)] = "(PASS)" in line

    files = [
        path
        for path in glob.glob(f"{result_folder}/render_*.html")
        if os.path.exists(path)
    ]
    print(f"Total number of files: {len(files)}")

    # normpath drops a trailing slash, which would otherwise make basename()
    # return "" and put the images of every run into the same folder.
    run_name = os.path.basename(os.path.normpath(result_folder))
    image_folder = f"images/{run_name}"

    for render_file in files:
        task_id = int(render_file.split("_")[-1].split(".")[0])
        with open(render_file, "r") as html_file:
            # Best effort: a broken render file must not abort the export.
            try:
                content = html_file.read()
                soup = BeautifulSoup(content, "html.parser")
                observations = [
                    obv.find("pre").text
                    for obv in soup.find_all("div", {"class": "state_obv"})
                ]
                base64_images = [
                    img["src"].split(",")[1] for img in soup.find_all("img")
                ]
                # Save each image to disk and keep its path instead.
                image_observations = []
                os.makedirs(image_folder, exist_ok=True)
                for i, image in enumerate(base64_images):
                    filename = f"{image_folder}/image_{task_id}_{i}.png"
                    with open(filename, "wb") as image_file:
                        image_file.write(base64.b64decode(image))
                    image_observations.append(filename)
                urls = [
                    url.get_text()
                    for url in soup.find_all("h3", {"class": "url"})
                ]
                actions = [
                    action.get_text()
                    for action in soup.find_all(
                        "div", {"class": "raw_parsed_prediction"}
                    )
                ]
                parsed_actions = [
                    action.get_text()
                    for action in soup.find_all(
                        "div", {"class": "parsed_action"}
                    )
                ]
                # Fall back to the parsed action when the raw one is empty.
                for i, raw_action in enumerate(actions):
                    if raw_action == "":
                        actions[i] = parsed_actions[i]

                messages = []
                for o, u, a, image in zip(
                    observations, urls, actions, image_observations
                ):
                    messages.append(
                        {
                            "user": f"{u}\n\nobservation:\n{o}",
                            "image": image,
                        }
                    )
                    messages.append({"assistant": a})

                all_data[f"example_{task_id}"] = {
                    **data_configs[task_id],
                    "messages": messages,
                    "success": results.get(task_id, False),
                }

            except Exception as e:
                print(e)
                print(f"Error in {render_file}")

    with open(f"{result_folder}/json_dump.json", "w+") as dump_file:
        json.dump(all_data, dump_file, indent=4)
-------------------------------------------------------------------------------- /webarena/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = webarena 3 | 4 | [tool.pytest.ini_options] 5 | testpaths = ["tests"] 6 | python_files = "test_*.py" 7 | 8 | [options.extras_require] 9 | dev = 10 | pre-commit==3.0.1 11 | pytest==7.1.2 12 | mypy==0.991 13 | nbmake 14 | pytest-asyncio 15 | types-requests 16 | 17 | [options] 18 | python_requires = >=3.7, <4 19 | packages = 20 | browser_env 21 | agent 22 | evaluation_harness 23 | llms 24 | [mypy] 25 | strict = true 26 | -------------------------------------------------------------------------------- /webarena/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | if __name__ == "__main__": 4 | setup() 5 | -------------------------------------------------------------------------------- /webarena/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import step_once, show_screenshot 2 | 3 | from browser_env import ( 4 | StateInfo, 5 | Trajectory, 6 | ) 7 | 8 | from .shopping_admin import * 9 | 10 | def manual_solver(env, config_file, render_helper, agent, args, intent, field_dict): 11 | # return solver_364(env, config_file, render_helper, agent, args, intent, field_dict) 12 | agent.reset(config_file) 13 | trajectory: Trajectory = [] 14 | obs, info = env.reset(options={"config_file": config_file}) 15 | state_info: StateInfo = {"observation": obs, "info": info} 16 | trajectory.append(state_info) 17 | 18 | meta_data = {"action_history": ["None"]} 19 | traces = [] 20 | 21 | obs_info = info["observation_metadata"]["text"] 22 | dom_tree, nodes_info = obs_info["dom_info"]["dom_tree"], obs_info["obs_nodes_info"] 23 | 24 | print(field_dict) 25 | while True: 26 | obs_info = state_info["info"]["observation_metadata"]["text"] 27 | 
def step_once(env, trajectory, render_helper, traces, state_info, agent, args, intent, meta_data, target_action):
    """Execute one manually supplied action and record it.

    The action text is turned into an action by ``agent.check_action``,
    appended to ``trajectory``, rendered through ``render_helper`` and then
    executed with ``env.step``. When the action is not ``ActionTypes.NONE``,
    or the text contains ``#Record#``, a training sample is appended to
    ``traces`` and the action description to ``meta_data["action_history"]``.

    Args:
        env: Environment; ``env.step(action)`` must return a 5-tuple.
        trajectory: List that receives the action and the next state.
        render_helper: Object whose ``render`` method logs the step.
        traces: List that receives the recorded samples.
        state_info: Current state with ``"observation"`` and ``"info"`` keys.
        agent: Agent providing ``check_action``.
        args: Options; ``action_set_tag`` and ``render_screenshot`` are read.
        intent: Task intent passed on to the agent.
        meta_data: Dict holding the ``"action_history"`` list.
        target_action: Action text entered by the user.

    Returns:
        ``None`` after a stop action or when the environment reports
        termination; otherwise ``(state_info, dom_tree, nodes_info)`` for
        the new state.
    """
    obs_info = state_info["info"]["observation_metadata"]["text"]
    images = state_info["info"]["images"]
    dom_info, nodes_info = obs_info["dom_info"], obs_info["obs_nodes_info"]
    dom_tree = dom_info["dom_tree"]
    raw_html = dom_info["raw_html"]

    # A typed backslash-n at the end of a #Type# action means a real newline.
    if target_action.count('#Type#') > 0 and target_action.endswith('\\n'):
        target_action = target_action[:-2] + '\n'

    prompt, action = agent.check_action(
        trajectory, intent, meta_data, target_action
    )

    print('[prompt] ', prompt)
    print('[action] ', action)

    # Copy of the action with "coords" converted via .tolist() for storage.
    myaction = copy.deepcopy(action)
    myaction["coords"] = myaction["coords"].tolist()

    need_to_keep = action['action_type'] != ActionTypes.NONE or target_action.count("#Record#") > 0

    trajectory.append(action)

    action_str = get_action_description(
        action,
        state_info["info"]["observation_metadata"],
        action_set_tag=args.action_set_tag,
        prompt_constructor=agent.prompt_constructor
        if isinstance(agent, PromptAgent)
        else None,
    )

    if need_to_keep:
        # The target is the description without the trailing HTML segment.
        # The original referenced an undefined name here (`real_action`) and
        # left `user_action` unused, raising NameError on every record.
        user_action = action_str.split(' #HTML Segment')[0]
        traces.append({
            'source': prompt,
            'target': f'{user_action}',
            'extra_data': {
                'element_id': action.get('element_id', ''),
                'dom_tree': dom_tree,
                'raw_html': raw_html,
                'nodes_info': nodes_info,
                'raw_action': myaction,
                'images': images,
            },
        })

    render_helper.render(
        action, state_info, meta_data, args.render_screenshot
    )

    if need_to_keep:
        meta_data["action_history"].append(action_str)

    if action["action_type"] == ActionTypes.STOP:
        return None

    obs, _, terminated, _, info = env.step(action)
    state_info = {"observation": obs, "info": info}
    show_screenshot(state_info)

    trajectory.append(state_info)

    if terminated:
        # `create_stop_action` was used without being imported in this file.
        # NOTE(review): assumes browser_env exports it, like the other
        # create_* helpers imported from there elsewhere in the repo.
        from browser_env import create_stop_action

        # Add an action placeholder so the trajectory ends with an action.
        trajectory.append(create_stop_action(""))
        return None

    obs_info = state_info["info"]["observation_metadata"]["text"]
    dom_tree, nodes_info = obs_info["dom_info"]["dom_tree"], obs_info["obs_nodes_info"]

    return state_info, dom_tree, nodes_info
observation_type="accessibility_tree", 60 | current_viewport_only=True, 61 | ) 62 | yield env 63 | env.close() 64 | 65 | 66 | @pytest_asyncio.fixture(scope="function", autouse=True) 67 | async def async_script_browser_env() -> AsyncGenerator[ 68 | AsyncScriptBrowserEnv, None 69 | ]: 70 | env = AsyncScriptBrowserEnv(headless=HEADLESS, slow_mo=SLOW_MO) 71 | yield env 72 | await env.aclose() 73 | -------------------------------------------------------------------------------- /webarena/tests/test_browser_env/test_actions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from browser_env import * 4 | 5 | 6 | def test_is_equivalent() -> None: 7 | for action_type in ActionTypes.__members__.values(): 8 | action_a = create_random_action() 9 | action_b = create_random_action() 10 | if action_a["action_type"] != action_b["action_type"]: 11 | assert not is_equivalent(action_a, action_b) 12 | action_a["action_type"] = action_type 13 | action_b["action_type"] = action_type 14 | match action_type: 15 | case ActionTypes.MOUSE_CLICK | ActionTypes.MOUSE_HOVER: 16 | if not np.allclose(action_a["coords"], action_b["coords"]): 17 | assert not is_equivalent(action_a, action_b) 18 | action_a["coords"] = action_b["coords"] 19 | assert is_equivalent(action_a, action_b) 20 | case ActionTypes.KEYBOARD_TYPE: 21 | if action_a["text"] != action_b["text"]: 22 | assert not is_equivalent(action_a, action_b) 23 | action_a["text"] = action_b["text"] 24 | assert is_equivalent(action_a, action_b) 25 | case ActionTypes.CLICK | ActionTypes.HOVER | ActionTypes.TYPE: 26 | if action_a["element_id"] and action_b["element_id"]: 27 | if action_a["element_id"] != action_b["element_id"]: 28 | assert not is_equivalent(action_a, action_b) 29 | action_a["element_id"] = action_b["element_id"] 30 | assert is_equivalent(action_a, action_b) 31 | elif action_a["element_id"] and action_b["element_id"]: 32 | if action_a["element_role"] != 
action_b["element_role"]: 33 | assert not is_equivalent(action_a, action_b) 34 | action_a["element_role"] = action_b["element_role"] 35 | if action_a["element_name"] != action_b["element_name"]: 36 | assert not is_equivalent(action_a, action_b) 37 | action_a["element_name"] = action_b["element_name"] 38 | assert is_equivalent(action_a, action_b) 39 | elif action_a["pw_code"] and action_b["pw_code"]: 40 | if action_a["pw_code"] != action_b["pw_code"]: 41 | assert not is_equivalent(action_a, action_b) 42 | action_a["pw_code"] = action_b["pw_code"] 43 | assert is_equivalent(action_a, action_b) 44 | else: 45 | action_a["element_id"] = action_b["element_id"] 46 | assert is_equivalent(action_a, action_b) 47 | case ActionTypes.GOTO_URL: 48 | if action_a["url"] != action_b["url"]: 49 | assert not is_equivalent(action_a, action_b) 50 | action_a["url"] = action_b["url"] 51 | assert is_equivalent(action_a, action_b) 52 | case ActionTypes.PAGE_FOCUS: 53 | if action_a["page_number"] != action_b["page_number"]: 54 | assert not is_equivalent(action_a, action_b) 55 | action_a["page_number"] = action_b["page_number"] 56 | assert is_equivalent(action_a, action_b) 57 | case ActionTypes.SCROLL: 58 | da = "up" if "up" in action_a["direction"] else "down" 59 | db = "up" if "up" in action_b["direction"] else "down" 60 | if da != db: 61 | assert not is_equivalent(action_a, action_b) 62 | action_a["direction"] = action_b["direction"] 63 | assert is_equivalent(action_a, action_b) 64 | case ActionTypes.KEY_PRESS: 65 | if action_a["key_comb"] != action_b["key_comb"]: 66 | assert not is_equivalent(action_a, action_b) 67 | action_a["key_comb"] = action_b["key_comb"] 68 | assert is_equivalent(action_a, action_b) 69 | case ActionTypes.CHECK | ActionTypes.SELECT_OPTION: 70 | if action_a["pw_code"] != action_b["pw_code"]: 71 | assert not is_equivalent(action_a, action_b) 72 | action_a["pw_code"] = action_b["pw_code"] 73 | assert is_equivalent(action_a, action_b) 74 | case ActionTypes.STOP: 75 | if 
action_a["answer"] != action_b["answer"]: 76 | assert not is_equivalent(action_a, action_b) 77 | action_a["answer"] = action_b["answer"] 78 | assert is_equivalent(action_a, action_b) 79 | case _: 80 | assert is_equivalent(action_a, action_b) 81 | 82 | 83 | def test_action2create_function() -> None: 84 | for _ in range(1000): 85 | action = create_random_action() 86 | create_function = action2create_function(action) 87 | assert is_equivalent(action, eval(create_function)) 88 | -------------------------------------------------------------------------------- /webarena/tests/test_browser_env/test_auth_cookie.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | 4 | from browser_env import * 5 | 6 | auth_json = { 7 | "cookies": [ 8 | { 9 | "name": "session-username", 10 | "value": "standard_user", 11 | "domain": "www.saucedemo.com", 12 | "path": "/", 13 | "httpOnly": False, 14 | "secure": False, 15 | "sameSite": "Lax", 16 | } 17 | ], 18 | "origins": [], 19 | } 20 | 21 | 22 | def test_auth_cookie() -> None: 23 | env = ScriptBrowserEnv() 24 | env.reset() 25 | _, reward, _, _, info = env.step( 26 | create_goto_url_action("https://www.saucedemo.com/inventory.html"), 27 | ) 28 | assert reward == 1 29 | assert "page" in info and isinstance(info["page"], DetachedPage) 30 | assert info["page"].url == "https://www.saucedemo.com/" 31 | json.dump(auth_json, open("/tmp/auth.json", "w")) 32 | instance_config = {"storage_state": "/tmp/auth.json"} 33 | json.dump(instance_config, open("/tmp/config.json", "w")) 34 | env.reset(options={"config_file": "/tmp/config.json"}) 35 | _, reward, _, _, info = env.step( 36 | create_goto_url_action("https://www.saucedemo.com/inventory.html"), 37 | ) 38 | assert reward == 1 39 | assert "page" in info and isinstance(info["page"], DetachedPage) 40 | assert info["page"].url == "https://www.saucedemo.com/inventory.html" 41 | env.close() 42 | 43 | 44 | def test_async_auth_cookie() -> 
def test_basic(script_browser_env: ScriptBrowserEnv) -> None:
    """Replay goto, click, fill, press and check actions on the TodoMVC demo."""
    todo_input = 'page.get_by_placeholder("What needs to be done?")'

    steps = [
        'page.goto("https://demo.playwright.dev/todomvc/")',
        f"{todo_input}.click()",
    ]
    # Add four todo items, confirming each one with Enter.
    for todo in ("hello", "world", "yes", "no"):
        steps.append(f'{todo_input}.fill("{todo}")')
        steps.append(f'{todo_input}.press("Enter")')
    steps.append(
        'page.get_by_role("listitem").filter(has_text="world")'
        '.get_by_role("checkbox", name="Toggle Todo").check()'
    )
    steps.append('page.get_by_role("button", name="Clear completed").click()')

    script_browser_env.reset()
    for step in steps:
        outcome = script_browser_env.step(create_playwright_action(step))
        # The second element of the step result is the success flag.
        assert outcome[1]
def test_xpath(script_browser_env: ScriptBrowserEnv) -> None:
    """Replay a sequence that mixes role/placeholder locators with xpath ones."""
    todo_input = 'page.get_by_placeholder("What needs to be done?")'
    steps = (
        'page.goto("https://demo.playwright.dev/todomvc/")',
        'page.goto("https://demo.playwright.dev/todomvc/#/")',
        f"{todo_input}.click()",
        f'{todo_input}.fill("hello")',
        f'{todo_input}.press("Enter")',
        'page.get_by_role("link", name="Completed").click()',
        'page.locator("xpath=/html/body/section/div/header/input").fill("no")',
        f'{todo_input}.press("Enter")',
        'page.goto("https://bic-berkeley.github.io/psych-214-fall-2016/string_literals.html")',
        "page.locator(\"xpath=//*[@id='searchbox']/div/form/input[1]\").fill(\"type\")",
    )

    script_browser_env.reset()
    for step in steps:
        outcome = script_browser_env.step(create_playwright_action(step))
        # The second element of the step result is the success flag.
        assert outcome[1]
"locator": "func:shopping_get_sku_latest_review_rating('B09BCM56J7')" 21 | }, 22 | { 23 | "url": "last", 24 | "required_contents": {"must_include": ["cupcakecupcake"]}, 25 | "locator": "func:shopping_get_sku_latest_review_author('B09BCM56J7')" 26 | } 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /webarena/tests/test_evaluation_harness/configs/func_eval_success.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": ["shopping"], 3 | "task_id": 0, 4 | "require_login": true, 5 | "storage_state": null, 6 | "start_url": null, 7 | "geolocation": null, 8 | "intent_template": "", 9 | "instantiation_dict": {}, 10 | "intent": "", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["program_html"], 14 | "reference_answers": [], 15 | "reference_url": "", 16 | "program_html": [ 17 | { 18 | "url": "last", 19 | "required_contents": {"must_include": ["100"]}, 20 | "locator": "func:shopping_get_sku_latest_review_rating('B09BCM56J7')" 21 | }, 22 | { 23 | "url": "last", 24 | "required_contents": {"must_include": ["cupcakecupcake"]}, 25 | "locator": "func:shopping_get_sku_latest_review_author('B09BCM56J7')" 26 | } 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /webarena/tests/test_evaluation_harness/configs/func_url_func_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": ["shopping"], 3 | "task_id": 0, 4 | "require_login": true, 5 | "storage_state": null, 6 | "start_url": null, 7 | "geolocation": null, 8 | "intent_template": "", 9 | "instantiation_dict": {}, 10 | "intent": "", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["program_html"], 14 | "reference_answers": [], 15 | "reference_url": "", 16 | "program_html": [ 17 | { 18 | "url": "func:reddit_get_post_url('__last_url__')", 19 | "locator": 
"document.querySelector('.submission__inner').outerText", 20 | "required_contents": {"must_include": ["How will SPY close on Monday 11/28"]} 21 | } 22 | ] 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /webarena/tests/test_evaluation_harness/configs/func_url_func_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": [ 3 | "shopping" 4 | ], 5 | "task_id": 0, 6 | "require_login": true, 7 | "storage_state": "./.auth/gitlab_state.json", 8 | "start_url": null, 9 | "geolocation": null, 10 | "intent_template": "", 11 | "instantiation_dict": {}, 12 | "intent": "", 13 | "require_reset": false, 14 | "eval": { 15 | "eval_types": [ 16 | "program_html" 17 | ], 18 | "reference_answers": [], 19 | "reference_url": "", 20 | "program_html": [ 21 | { 22 | "url": "__GITLAB__/primer/design/-/project_members", 23 | "locator": "func:gitlab_get_project_memeber_role(__page__, 'byteblaze')", 24 | "required_contents": {"must_include": ["Developer"]} 25 | }, 26 | { 27 | "url": "__GITLAB__/primer/design/-/project_members", 28 | "locator": "func:gitlab_get_project_memeber_role(__page__, 'primer')", 29 | "required_contents": {"must_include": ["Owner"]} 30 | } 31 | ] 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /webarena/tests/test_evaluation_harness/configs/html_content_element_exact_match.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": ["gitlab"], 3 | "task_id": 0, 4 | "require_login": true, 5 | "storage_state": "./.auth/gitlab_state.json", 6 | "start_url": null, 7 | "geolocation": null, 8 | "intent_template": "", 9 | "instantiation_dict": {}, 10 | "intent": "", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["program_html"], 14 | "reference_answers": [], 15 | "reference_url": "", 16 | "program_html": [ 17 | { 18 | "url": "last", 19 | "required_contents": 
{"must_include": ["Hello World"]}, 20 | "locator": "document.querySelector('[id=\"form-name\"').value" 21 | }, 22 | { 23 | "url": "last", 24 | "required_contents": {"must_include": ["alexisxy@hotmail.com"]}, 25 | "locator": "document.querySelector('[id=\"form-email\"').value" 26 | } 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /webarena/tests/test_evaluation_harness/configs/html_content_exact_match.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": ["gitlab"], 3 | "task_id": 0, 4 | "require_login": true, 5 | "storage_state": "./.auth/gitlab_state.json", 6 | "start_url": null, 7 | "geolocation": null, 8 | "intent_template": "", 9 | "instantiation_dict": {}, 10 | "intent": "", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["program_html"], 14 | "reference_answers": [], 15 | "reference_url": "", 16 | "program_html": [ 17 | { 18 | "url": "last", 19 | "required_contents": {"must_include": ["What are mammals?"]}, 20 | "locator": "" 21 | }, 22 | { 23 | "url": "https://www.google.com/", 24 | "required_contents": {"must_include": ["Google Search"]}, 25 | "locator": "" 26 | } 27 | ] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /webarena/tests/test_evaluation_harness/configs/html_content_url_comb.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": ["gitlab"], 3 | "task_id": 0, 4 | "require_login": true, 5 | "storage_state": null, 6 | "start_url": null, 7 | "geolocation": null, 8 | "intent_template": "", 9 | "instantiation_dict": {}, 10 | "intent": "", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["program_html", "url_match"], 14 | "reference_answers": [], 15 | "reference_url": "https://russmaxdesign.github.io/", 16 | "url_note": "GOLD in PRED", 17 | "program_html": [ 18 | { 19 | "url": "last", 20 | "required_contents": 
{"must_include": ["Hello World"]}, 21 | "locator": "document.querySelector('[id=\"form-name\"').value" 22 | }, 23 | { 24 | "url": "last", 25 | "required_contents": {"must_include": ["alexisxy@hotmail.com"]}, 26 | "locator": "document.querySelector('[id=\"form-email\"').value" 27 | } 28 | ] 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /webarena/tests/test_evaluation_harness/configs/string_match.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": ["reddit"], 3 | "task_id": 0, 4 | "require_login": true, 5 | "storage_state": "./.auth/reddit_state.json", 6 | "start_url": null, 7 | "geolocation": null, 8 | "intent_template": "", 9 | "instantiation_dict": {}, 10 | "intent": "", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["string_match"], 14 | "reference_answers": { 15 | "must_include": ["1985/04/18"] 16 | }, 17 | "reference_url": "", 18 | "program_html": null 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /webarena/tests/test_evaluation_harness/configs/url_exact_match.json: -------------------------------------------------------------------------------- 1 | { 2 | "sites": ["reddit"], 3 | "task_id": 0, 4 | "require_login": true, 5 | "storage_state": null, 6 | "start_url": null, 7 | "geolocation": null, 8 | "intent_template": "", 9 | "instantiation_dict": {}, 10 | "intent": "", 11 | "require_reset": false, 12 | "eval": { 13 | "eval_types": ["url_match"], 14 | "reference_answers": [], 15 | "reference_url": "https://www.google.com/", 16 | "program_html": [ 17 | { 18 | "url": "", 19 | "required_contents": [] 20 | } 21 | ] 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /webarena/tests/test_evaluation_harness/test_helper_functions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 
from pathlib import Path

from browser_env import ScriptBrowserEnv
from browser_env.env_config import *
from evaluation_harness.helper_functions import (
    gitlab_get_project_memeber_role,
)

HEADLESS = True
config_file_folder = "tests/test_evaluation_harness/configs"


def test_gitlab_get_project_memeber_role(
    script_browser_env: ScriptBrowserEnv,
) -> None:
    """Check the roles returned for two members of the primer/design project.

    The test writes a throwaway config file, resets the browser environment
    with it, opens the project members page and asserts that
    ``gitlab_get_project_memeber_role`` returns ``"Developer"`` for
    ``byteblaze`` and ``"Owner"`` for ``primer``.

    Args:
        script_browser_env: Browser environment supplied by the pytest
            fixture of the same name.
    """
    env = script_browser_env
    config_file = f"{config_file_folder}/tmp_config.json"

    # The config only carries the storage state path; presumably this makes
    # the session start authenticated against GitLab -- confirm in env.reset.
    with open(config_file, "w") as f:
        json.dump({"storage_state": ".auth/gitlab_state.json"}, f)

    try:
        env.reset(options={"config_file": config_file})
        # GITLAB is not defined in this module; presumably it is provided by
        # the wildcard import from browser_env.env_config.
        env.page.goto(f"{GITLAB}/primer/design/-/project_members")
        role1 = gitlab_get_project_memeber_role(env.page, "byteblaze")
        assert role1 == "Developer"
        role2 = gitlab_get_project_memeber_role(env.page, "primer")
        assert role2 == "Owner"
    finally:
        # Remove the tmp config file even when a browser step or an assertion
        # fails, so a failed run does not leave it in the fixtures folder.
        os.remove(config_file)