├── .gitignore ├── LICENSE ├── README.md ├── configs ├── miniwob │ └── eval_openai_agent.yml └── webarena │ └── eval_openai_agent.yml ├── notebooks ├── others │ ├── eval_airlinecrm_metrics.ipynb │ ├── eval_compression.ipynb │ ├── eval_miniwob_llama_metrics.ipynb │ └── eval_miniwob_metrics.ipynb └── webarena │ ├── context_len_hist_webarena_step_vs_flat.ipynb │ ├── final_webarena_step_vs_all.ipynb │ └── plots_webarena_step_vs_all.ipynb ├── requirements.txt ├── scripts ├── evaluate │ ├── eval_miniwob.py │ └── eval_webarena.py └── setup │ └── auto_login_webarena.py ├── setup.py └── src └── webagents_step ├── agents ├── agent.py ├── finetuned_agent.py ├── keyboard_agent.py ├── playback_agent.py ├── prompt_agent.py └── step_agent.py ├── environment ├── env.py ├── liveweb.py ├── miniwob.py └── webarena.py ├── parser ├── miniwob_parser.py └── playwright_parser_webarena.py ├── prompts ├── miniwob │ ├── flat_fewshot_template.py │ └── step_fewshot_template.py └── webarena │ ├── flat_fewshot_template.py │ └── step_fewshot_template.py └── utils ├── data_prep.py ├── llm.py └── stack.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # OS generated files 132 | .DS_Store 133 | .DS_Store? 
134 | 135 | # Custom dirs 136 | local/ 137 | data/ 138 | outputs/ 139 | api_key.py 140 | 141 | # Data files 142 | *.csv 143 | # *.txt 144 | 145 | *chrome* 146 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 ASAPP Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SteP: Stacked LLM Policies for Web Actions 2 | 3 | Paper link: [https://arxiv.org/abs/2310.03720](https://arxiv.org/abs/2310.03720) 4 | 5 | ## Installation 6 | 7 | To set up the project, clone the repository and create a virtual environment: 8 | 9 | ```bash 10 | cd webagents-step 11 | pyenv virtualenv webagents-step 12 | pyenv activate webagents-step 13 | ``` 14 | 15 | Install the required packages: 16 | 17 | ```bash 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | ## WebArena Evaluation 22 | 23 | ### WebArena Results 24 | 25 | We break down the success rates across different websites and provide links to the trajectory logs below, containing the observations, model predictions, and evaluator outputs for each task. 26 | 27 | The latest runs with `gpt-4-turbo-2024-04-09` model and WebArena code (last commit May 29, 2024) are linked below 28 | 29 | | Website | Number of tasks | Success Rate | Trajectory Logs | 30 | |---------|--------------------|--------------|------------------| 31 | | Gitlab | 180 | 31.7% | [logs](https://drive.google.com/drive/folders/1znkg8aQoEVLTvSyQ8iebb_bsOJL2DrKl?usp=share_link) | 32 | | Reddit | 106 | 59.4% | [logs](https://drive.google.com/drive/folders/1Ek9cMz344tKXbEchakPyPXoTU14FYSlm?usp=share_link) | 33 | | Shopping | 187 | 36.9% | [logs](https://drive.google.com/drive/folders/1ztCP7JH18XS_mGlPCIrP7cKc2eF6Yf8S?usp=share_link) | 34 | | Shopping admin (CMS) | 182 | 24.2% | [logs](https://drive.google.com/drive/folders/1quti9851rBO49alYYL9C1NZNcpRI_Cg-?usp=share_link) | 35 | | Map | 109 | 30.3% | [logs](https://drive.google.com/drive/folders/1V7c122QKNAIVdbskLFNwTJcwILGIf_kS?usp=share_link) | 36 | | Multisite | 48 | 12.5% | [logs](https://drive.google.com/drive/folders/1JmvrY1Ys_bHHY8eQmJocnyZGiPeG7BpV?usp=share_link) | 37 | | All | 812 | 33.5% | 
[logs](https://drive.google.com/drive/folders/1AKXlClGbFU4RQtfWN9f6jva7MbbGCbur?usp=share_link) | 38 | 39 | ### Installing WebArena 40 | Install WebArena from the [WebArena GitHub repository](https://github.com/web-arena-x/webarena). This code uses the last commit 4c741b4b20a3e183836e58f383f9be1785248160 on May 29, 2024. 41 | 42 | Generate test data configs: 43 | ```bash 44 | python scripts/generate_test_data.py 45 | ``` 46 | You will see `*.json` files generated in the `config_files/` folder. Copy these over to a `tasks/webarena` directory in the `webagents-step/` root directory. 47 | 48 | You will also need to set up authentication for all websites as per instructions in the WebArena README (See instructions for *Obtain the auto-login cookies for all websites*). This will generate a `.auth` folder. Copy this over to `webagents-step/` root directory. 49 | 50 | ### Running Evaluation 51 | 52 | To run WebArena evaluation: 53 | ```bash 54 | python scripts/evaluate/eval_webarena.py --config configs/webarena/eval_openai_agent.yml 55 | ``` 56 | 57 | Important: 58 | * Set up each website as a Docker container as listed in [WebArena instructions](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) 59 | * Reset the website state before running an evaluation. This matters since the initial state of the website affects the success of the task. 60 | * For Reddit tasks, there is a rate limit on making more than 3 posts in an hour. You need to add a sleep of 21 minutes before every new task. This can be done by adding `time.sleep(1260)` inside the for loop in `eval_webarena.py` 61 | 62 | ## MiniWoB++ Evaluation 63 | 64 | ### Installing MiniWoB++ 65 | Install MiniWoB++ from [this repository](https://github.com/Farama-Foundation/miniwob-plusplus). Use commit 43bd1fe. 
66 | 67 | ### Running Evaluation 68 | 69 | To run MiniWoB++ evaluation: 70 | ```bash 71 | python scripts/evaluate/eval_miniwob.py --config configs/miniwob/eval_openai_agent.yml 72 | ``` 73 | 74 | ## Contact 75 | This project is still in active development. For any questions or issues, please contact us at [psodhi@asapp.com](mailto:psodhi@asapp.com). 76 | -------------------------------------------------------------------------------- /configs/miniwob/eval_openai_agent.yml: -------------------------------------------------------------------------------- 1 | dataset: "miniwob" 2 | logging: True 3 | verbose: 1 4 | debug: False 5 | logdir: "data/miniwob/eval" 6 | agent: 7 | type: "step" #"flat_fewshot" 8 | root_action: 'miniwob_agent' 9 | low_level_action_list: ['click', 'type', 'stop'] 10 | model_name: "gpt-4-turbo-preview" 11 | model_host: "openai" 12 | prompt_mode: "chat" 13 | max_target_len: 100 14 | env: 15 | max_env_steps: 30 16 | max_browser_rows: 150 17 | headless: False 18 | start_seed: 50 19 | end_seed: 51 20 | num_samples_per_task: 0 21 | tasks: ['book-flight'] 22 | # tasks: ['click-link', 'click-option', 'focus-text', 'click-button', 'click-button-sequence', 'click-dialog', 'click-dialog-2', 'click-tab', 'click-test', 'click-test-2', 'enter-text', 'focus-text-2', 'enter-text-dynamic', 'enter-password', 'login-user', 'click-pie', 'enter-date', 'grid-coordinate', 'click-widget', 'multi-orderings', 'choose-date', 'click-collapsible-2', 'simple-arithmetic', 'click-tab-2', 'click-tab-2-hard', 'multi-layouts', 'copy-paste', 'click-collapsible', 'choose-date-easy', 'copy-paste-2', 'simple-algebra', 'click-checkboxes', 'click-checkboxes-transfer', 'login-user-popup', 'click-checkboxes-soft', 'enter-text-2', 'email-inbox-forward-nl', 'search-engine', 'find-word', 'choose-date-medium', 'click-checkboxes-large', 'book-flight', 'email-inbox-nl-turk', 'email-inbox-forward-nl-turk', 'email-inbox'] 
-------------------------------------------------------------------------------- /configs/webarena/eval_openai_agent.yml: -------------------------------------------------------------------------------- 1 | dataset: "webarena" 2 | logging: True 3 | verbose: 1 4 | debug: False 5 | logdir: "data/webarena/eval" 6 | agent: 7 | type: "step" # "flat_fewshot8k", "flat_fewshot4k" 8 | root_action: 'shopping_agent' 9 | low_level_action_list: ['click', 'type', 'scroll', 'stop', 'goto', 'hover', 'note', 'go_back'] 10 | model_name: "gpt-4-turbo-2024-04-09" 11 | # model_name: "gpt-3.5-turbo-0125" 12 | model_host: "openai" 13 | prompt_mode: "chat" 14 | max_target_len: 100 15 | env: 16 | max_env_steps: 20 17 | max_browser_rows: 500 18 | headless: False 19 | task_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 
269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 
669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811] 20 | -------------------------------------------------------------------------------- /notebooks/others/eval_compression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "import seaborn as sns\n", 13 | "\n", 14 | "from tabulate import tabulate\n", 15 | "\n", 16 | "plt.rcParams['font.size'] = 18" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 8, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | " method not_compressed compressed\n", 29 | "0 Miniwob 182 182\n", 30 | "1 CRM 1367 539\n", 31 | "2 Liveweb 11485 1933\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "\n", 38 | "# Define the data\n", 39 | "data = {\n", 40 | " \"method\": [\"Miniwob\", \"CRM\", \"Liveweb\"],\n", 41 | " \"not_compressed\": [182, 1367, 11485],\n", 42 | " \"compressed\": [182, 539, 1933]\n", 43 | "}\n", 44 | "\n", 45 | "# Create the DataFrame\n", 46 | "df = pd.DataFrame(data)\n", 47 | "\n", 48 | "# Display the 
DataFrame\n", 49 | "print(df)\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 10, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "
" 61 | ] 62 | }, 63 | "metadata": {}, 64 | "output_type": "display_data" 65 | }, 66 | { 67 | "data": { 68 | "image/png": "", 69 | "text/plain": [ 70 | "
" 71 | ] 72 | }, 73 | "metadata": {}, 74 | "output_type": "display_data" 75 | } 76 | ], 77 | "source": [ 78 | "# Create the DataFrame\n", 79 | "df = pd.DataFrame(data)\n", 80 | "\n", 81 | "# Set the colors\n", 82 | "colors = ['skyblue', 'lightcoral']\n", 83 | "\n", 84 | "plt.figure(figsize=(10, 6))\n", 85 | "\n", 86 | "# Plot the data\n", 87 | "ax = df.plot(x=\"method\", kind=\"bar\", color=colors, width=0.7)\n", 88 | "# plt.title(\"Comparison of Compressed v/s Not Compressed\")\n", 89 | "plt.ylabel(\"Tokens\")\n", 90 | "plt.xlabel(\"Tasks\")\n", 91 | "plt.xticks(rotation=0)\n", 92 | "plt.tight_layout()\n", 93 | "plt.legend(loc='upper left')\n", 94 | "\n", 95 | "# Display the plot\n", 96 | "plt.show()" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "webactions-lm", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.9.13" 117 | }, 118 | "orig_nbformat": 4 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 2 122 | } 123 | -------------------------------------------------------------------------------- /notebooks/webarena/final_webarena_step_vs_all.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import ast" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 11, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Download model outputs from https://drive.google.com/drive/folders/1P8HxK9tF623bJ9c6xIxn10UnryVdVzsc\n", 20 | "# and save them to data/results_step/webarena/2405_all_tasks/" 21 | ] 22 | 
}, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 12, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " 
\n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | "
tasktask_idsitesmodellogfiledonerewardgpt4_step_successgpt4_step_num_actionstypecategory
0tasks/webarena/0.json0[shopping_admin]gpt-4-turbo-preview20240505-204634/0.jsonTrue0.00.01stepshopping_admin
1tasks/webarena/1.json1[shopping_admin]gpt-4-turbo-preview20240505-204634/1.jsonTrue0.00.06stepshopping_admin
2tasks/webarena/2.json2[shopping_admin]gpt-4-turbo-preview20240505-204634/2.jsonTrue0.00.01stepshopping_admin
3tasks/webarena/3.json3[shopping_admin]gpt-4-turbo-preview20240505-204634/3.jsonTrue0.00.01stepshopping_admin
4tasks/webarena/4.json4[shopping_admin]gpt-4-turbo-preview20240505-204634/4.jsonTrue0.00.01stepshopping_admin
....................................
799tasks/webarena/807.json807[gitlab]gpt-4-turbo-preview20240508-201838/807.jsonTrue0.00.011stepgitlab
800tasks/webarena/808.json808[gitlab]gpt-4-turbo-preview20240508-201838/808.jsonTrue0.00.08stepgitlab
801tasks/webarena/809.json809[gitlab]gpt-4-turbo-preview20240324-091310/809.jsonTrue1.01.011stepgitlab
802tasks/webarena/810.json810[gitlab]gpt-4-turbo-preview20240508-201838/810.jsonTrue0.00.07stepgitlab
803tasks/webarena/811.json811[gitlab]gpt-4-turbo-preview20240508-201838/811.jsonTrue0.00.05stepgitlab
\n", 219 | "

804 rows × 11 columns

\n", 220 | "
" 221 | ], 222 | "text/plain": [ 223 | " task task_id sites model \\\n", 224 | "0 tasks/webarena/0.json 0 [shopping_admin] gpt-4-turbo-preview \n", 225 | "1 tasks/webarena/1.json 1 [shopping_admin] gpt-4-turbo-preview \n", 226 | "2 tasks/webarena/2.json 2 [shopping_admin] gpt-4-turbo-preview \n", 227 | "3 tasks/webarena/3.json 3 [shopping_admin] gpt-4-turbo-preview \n", 228 | "4 tasks/webarena/4.json 4 [shopping_admin] gpt-4-turbo-preview \n", 229 | ".. ... ... ... ... \n", 230 | "799 tasks/webarena/807.json 807 [gitlab] gpt-4-turbo-preview \n", 231 | "800 tasks/webarena/808.json 808 [gitlab] gpt-4-turbo-preview \n", 232 | "801 tasks/webarena/809.json 809 [gitlab] gpt-4-turbo-preview \n", 233 | "802 tasks/webarena/810.json 810 [gitlab] gpt-4-turbo-preview \n", 234 | "803 tasks/webarena/811.json 811 [gitlab] gpt-4-turbo-preview \n", 235 | "\n", 236 | " logfile done reward gpt4_step_success \\\n", 237 | "0 20240505-204634/0.json True 0.0 0.0 \n", 238 | "1 20240505-204634/1.json True 0.0 0.0 \n", 239 | "2 20240505-204634/2.json True 0.0 0.0 \n", 240 | "3 20240505-204634/3.json True 0.0 0.0 \n", 241 | "4 20240505-204634/4.json True 0.0 0.0 \n", 242 | ".. ... ... ... ... \n", 243 | "799 20240508-201838/807.json True 0.0 0.0 \n", 244 | "800 20240508-201838/808.json True 0.0 0.0 \n", 245 | "801 20240324-091310/809.json True 1.0 1.0 \n", 246 | "802 20240508-201838/810.json True 0.0 0.0 \n", 247 | "803 20240508-201838/811.json True 0.0 0.0 \n", 248 | "\n", 249 | " gpt4_step_num_actions type category \n", 250 | "0 1 step shopping_admin \n", 251 | "1 6 step shopping_admin \n", 252 | "2 1 step shopping_admin \n", 253 | "3 1 step shopping_admin \n", 254 | "4 1 step shopping_admin \n", 255 | ".. ... ... ... 
\n", 256 | "799 11 step gitlab \n", 257 | "800 8 step gitlab \n", 258 | "801 11 step gitlab \n", 259 | "802 7 step gitlab \n", 260 | "803 5 step gitlab \n", 261 | "\n", 262 | "[804 rows x 11 columns]" 263 | ] 264 | }, 265 | "execution_count": 12, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "# Load results summarygit\n", 272 | "srcdir = \"../../data/results_step/webarena/2405_all_tasks\"\n", 273 | "df = pd.read_csv(f\"{srcdir}/summary.csv\")\n", 274 | "df['sites'] = df['sites'].apply(ast.literal_eval)\n", 275 | "df['category'] = df['sites'].apply(lambda x: 'multisite' if len(x) > 1 else x[0])\n", 276 | "df" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 16, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | " Site Mean SteP success (%)\n", 289 | "0 gitlab 32.77\n", 290 | "1 map 31.19\n", 291 | "2 multisite 10.42\n", 292 | "3 reddit 53.77\n", 293 | "4 shopping 52.20\n", 294 | "5 shopping_admin 23.63\n", 295 | "Overall mean: 36.32%\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "df_category = df.groupby('category')['gpt4_step_success'].mean().reset_index()\n", 301 | "df_category['gpt4_step_success'] = (df_category['gpt4_step_success'] * 100).round(2) # convert to %\n", 302 | "\n", 303 | "df_category.columns = ['Site', 'Mean SteP success (%)']\n", 304 | "print(df_category)\n", 305 | "\n", 306 | "overall_mean = (df['gpt4_step_success'].mean() * 100).round(2)\n", 307 | "print(f\"Overall mean: {overall_mean}%\")" 308 | ] 309 | } 310 | ], 311 | "metadata": { 312 | "kernelspec": { 313 | "display_name": "autoui", 314 | "language": "python", 315 | "name": "python3" 316 | }, 317 | "language_info": { 318 | "codemirror_mode": { 319 | "name": "ipython", 320 | "version": 3 321 | }, 322 | "file_extension": ".py", 323 | "mimetype": "text/x-python", 324 | "name": "python", 325 | "nbconvert_exporter": "python", 326 | 
"pygments_lexer": "ipython3", 327 | "version": "3.10.12" 328 | } 329 | }, 330 | "nbformat": 4, 331 | "nbformat_minor": 2 332 | } 333 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.2 2 | pandas==2.1.3 3 | torch==2.1.1 4 | openai 5 | transformers==4.30 6 | datasets==2.10.1 7 | bert-score==0.3.13 8 | bitsandbytes==0.37.2 9 | playwright==1.32.1 10 | beautifulsoup4 11 | colorama 12 | matplotlib==3.8.2 13 | tensorboard==2.15.1 14 | requests 15 | tqdm>=4.31.1 16 | gradio==4.7.1 17 | huggingface-hub==0.17.3 18 | setuptools==69.0.2 19 | peft @ git+https://github.com/huggingface/peft.git@b4faffea8ae031e5bd69a76b55418b3650c04c80 20 | accelerate==0.24.0 21 | ctranslate2 22 | PyYAML==6.0.1 23 | sentencepiece==0.1.97 24 | tiktoken -------------------------------------------------------------------------------- /scripts/evaluate/eval_miniwob.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import time 4 | import re 5 | import argparse 6 | import itertools 7 | from tqdm import tqdm 8 | 9 | import os 10 | import sys 11 | import argparse 12 | from typing import List 13 | import shutil 14 | 15 | import openai 16 | 17 | from webagents_step.utils.data_prep import * 18 | from webagents_step.agents.prompt_agent import PromptAgent 19 | from webagents_step.agents.step_agent import StepAgent 20 | from webagents_step.prompts.miniwob import flat_fewshot_template, step_fewshot_template 21 | from webagents_step.environment.miniwob import MiniWoBEnvironmentWrapper 22 | 23 | openai.api_key = os.environ.get("OPENAI_API_KEY") 24 | 25 | def run(): 26 | parser = argparse.ArgumentParser( 27 | description="Only the config file argument should be passed" 28 | ) 29 | parser.add_argument( 30 | "--config", type=str, required=True, help="yaml config file location" 31 | ) 32 | args 
= parser.parse_args() 33 | with open(args.config, "r") as file: 34 | config = DotDict(yaml.safe_load(file)) 35 | 36 | dstdir = f"{config.logdir}/{time.strftime('%Y%m%d-%H%M%S')}" 37 | os.makedirs(dstdir, exist_ok=True) 38 | shutil.copyfile(args.config, os.path.join(dstdir, args.config.split("/")[-1])) 39 | random.seed(42) 40 | 41 | tasks = config.env.tasks 42 | seeds = range(config.env.start_seed, config.env.end_seed) 43 | sampled_seeds = ( 44 | random.sample(seeds, config.env.num_samples_per_task) 45 | if (config.env.num_samples_per_task > 0) 46 | else seeds 47 | ) 48 | 49 | ##### 50 | # Initialize agent 51 | ##### 52 | if config.agent.type == "step": 53 | action_to_prompt_dict = {k: v for k, v in step_fewshot_template.__dict__.items() if isinstance(v, dict)} 54 | low_level_action_list = config.agent.low_level_action_list 55 | agent_init = lambda: StepAgent( 56 | root_action = config.agent.root_action, 57 | action_to_prompt_dict = action_to_prompt_dict, 58 | low_level_action_list = low_level_action_list, 59 | max_actions=config.env.max_env_steps, 60 | verbose=config.verbose, 61 | logging=config.logging, 62 | debug=config.debug, 63 | model=config.agent.model_name, 64 | prompt_mode=config.agent.prompt_mode, 65 | ) 66 | elif config.agent.type == "flat_fewshot": 67 | agent_init = lambda: PromptAgent( 68 | prompt_template=flat_fewshot_template.flat_fewshot_agent, 69 | model=config.agent.model_name, 70 | prompt_mode=config.agent.prompt_mode, 71 | max_actions=config.env.max_env_steps, 72 | verbose=config.verbose, 73 | logging=config.logging, 74 | debug=config.debug, 75 | ) 76 | else: 77 | raise NotImplementedError(f"{config.agent.type} not implemented") 78 | 79 | ##### 80 | # Evaluate 81 | ##### 82 | 83 | for task_seed_pair in tqdm(itertools.product(tasks, sampled_seeds)): 84 | task, seed = task_seed_pair 85 | env = MiniWoBEnvironmentWrapper( 86 | task=task, 87 | seed=seed, 88 | max_browser_rows=config.env.max_browser_rows, 89 | max_steps=config.env.max_env_steps, 90 | 
def run():
    """Evaluate the configured agent on every WebArena task in the config.

    Reads the YAML file passed via ``--config``, creates a timestamped
    directory under ``config.logdir``, copies the config file into it and
    runs one episode per id in ``config.env.task_ids``.  When
    ``config.logging`` is set, each episode's trajectory and a summary row
    are written with ``log_run``.

    Raises:
        NotImplementedError: If ``config.agent.type`` is not ``step``,
            ``flat_fewshot8k`` or ``flat_fewshot4k``.
    """
    parser = argparse.ArgumentParser(
        description="Only the config file argument should be passed"
    )
    parser.add_argument(
        "--config", type=str, required=True, help="yaml config file location"
    )
    args = parser.parse_args()
    # NOTE(review): DotDict, yaml, random, json and log_run are not imported
    # explicitly in this file; presumably they arrive through the star import
    # from webagents_step.utils.data_prep -- confirm.
    with open(args.config, "r") as file:
        config = DotDict(yaml.safe_load(file))

    run_name = time.strftime('%Y%m%d-%H%M%S')
    dstdir = f"{config.logdir}/{run_name}"
    os.makedirs(dstdir, exist_ok=True)
    # basename() copes with any path separator, unlike splitting on "/".
    shutil.copyfile(args.config, os.path.join(dstdir, os.path.basename(args.config)))
    random.seed(42)

    # ids covered in gitlab
    config_file_list = [
        f"tasks/webarena/{task_id}.json" for task_id in config.env.task_ids
    ]

    # Every module-level dict in the step template module is a prompt.
    action_to_prompt_dict = {
        k: v for k, v in vars(step_fewshot_template).items() if isinstance(v, dict)
    }
    low_level_action_list = config.agent.low_level_action_list

    # Flat agents differ only in which prompt template they load.
    flat_template_names = {
        "flat_fewshot8k": "flat_fewshot_agent8k",
        "flat_fewshot4k": "flat_fewshot_agent4k",
    }
    agent_type = config.agent.type

    if agent_type == "step":
        def agent_init():
            return StepAgent(
                root_action=config.agent.root_action,
                action_to_prompt_dict=action_to_prompt_dict,
                low_level_action_list=low_level_action_list,
                max_actions=config.env.max_env_steps,
                verbose=config.verbose,
                logging=config.logging,
                debug=config.debug,
                model=config.agent.model_name,
                prompt_mode=config.agent.prompt_mode,
            )
    elif agent_type in flat_template_names:
        def agent_init():
            return PromptAgent(
                prompt_template=getattr(
                    flat_fewshot_template, flat_template_names[agent_type]
                ),
                model=config.agent.model_name,
                prompt_mode=config.agent.prompt_mode,
                max_actions=config.env.max_env_steps,
                verbose=config.verbose,
                logging=config.logging,
                debug=config.debug,
            )
    else:
        raise NotImplementedError(f"{config.agent.type} not implemented")

    #####
    # Evaluate
    #####

    for config_file in config_file_list:
        env = WebArenaEnvironmentWrapper(
            config_file=config_file,
            max_browser_rows=config.env.max_browser_rows,
            max_steps=config.env.max_env_steps,
            slow_mo=1,
            observation_type="accessibility_tree",
            current_viewport_only=True,
            viewport_size={"width": 1920, "height": 1080},
            headless=config.env.headless,
        )
        try:
            agent = agent_init()
            objective = env.get_objective()
            status = agent.act(objective=objective, env=env)
        finally:
            # Always release the environment, even when the episode raises.
            env.close()

        if config.logging:
            with open(config_file, "r") as f:
                task_config = json.load(f)
            log_file = os.path.join(dstdir, f"{task_config['task_id']}.json")
            log_data = {
                "task": config_file,
                "id": task_config['task_id'],
                "model": config.agent.model_name,
                "type": config.agent.type,
                "trajectory": agent.get_trajectory(),
            }
            summary_file = os.path.join(dstdir, "summary.csv")
            summary_data = {
                "task": config_file,
                "task_id": task_config['task_id'],
                "model": config.agent.model_name,
                "type": config.agent.type,
                # "<run dir>/<task>.json", built without a regex so that a
                # non-"/" path separator cannot produce a None match.
                "logfile": f"{run_name}/{os.path.basename(log_file)}",
            }
            # A missing status must not abort the remaining tasks.
            summary_data.update(status or {})
            log_run(
                log_file=log_file,
                log_data=log_data,
                summary_file=summary_file,
                summary_data=summary_data,
            )

        # For reddit: Sleep for 21 minutes (1260 seconds)
        # time.sleep(1260)
def is_expired(
    storage_state: Path, url: str, keyword: str, url_exact: bool = True
) -> bool:
    """Test whether the cookie is expired.

    Opens ``url`` in a headless Chromium context restored from
    ``storage_state`` and inspects the page that was actually reached.

    Args:
        storage_state: Path of the saved browser storage-state file.
        url: Page that is only reachable while logged in.
        keyword: If non-empty, text that must appear in the page content.
        url_exact: When ``keyword`` is empty, require the final URL to equal
            ``url`` (True) or merely contain it (False).

    Returns:
        True if the state file is missing or the login check fails.
    """
    if not storage_state.exists():
        return True

    # The ``with`` block stops the Playwright driver even if navigation
    # raises; the manual __enter__/__exit__ pair left it running on error.
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True, slow_mo=SLOW_MO)
        context = browser.new_context(storage_state=storage_state)
        page = context.new_page()
        page.goto(url)
        time.sleep(1)  # give redirects a moment to settle
        d_url = page.url
        content = page.content()

    if keyword:
        return keyword not in content
    if url_exact:
        return d_url != url
    return url not in d_url
def get_site_comb_from_filepath(file_path: str) -> list[str]:
    """Return the site names encoded in a cookie file name.

    The text after the last underscore of the file name is dropped and the
    remainder is split on dots.
    """
    file_name = os.path.split(file_path)[1]
    # Cut at the LAST underscore only, so "shopping_admin" stays intact.
    site_part = file_name.rsplit("_", 1)[0]
    return site_part.split(".")
class Agent:
    """Base class for agents that interact with a web environment.

    Keeps the history of actions, reasons and responses, runs the
    observe/predict/step loop and optionally records a per-step trajectory.
    Subclasses override :meth:`predict_action`.
    """

    def __init__(
        self,
        max_actions,
        verbose=0,
        logging=False,
        previous_actions: List = None,
        previous_reasons: List = None,
        previous_responses: List = None,
    ):
        """Store the settings; history lists default to fresh empty lists."""
        self.previous_actions = [] if previous_actions is None else previous_actions
        self.previous_reasons = [] if previous_reasons is None else previous_reasons
        self.previous_responses = [] if previous_responses is None else previous_responses
        self.max_actions = max_actions
        self.verbose = verbose
        self.logging = logging
        self.trajectory = []
        self.data_to_log = {}

    def reset(self):
        """Forget all history and any recorded trajectory."""
        self.previous_actions = []
        self.previous_reasons = []
        self.previous_responses = []
        self.trajectory = []
        self.data_to_log = {}

    def get_trajectory(self):
        """Return the list of per-step log records."""
        return self.trajectory

    def update_history(self, action, reason):
        """Append ``action`` and ``reason`` to the history when truthy."""
        if action:
            self.previous_actions += [action]
        if reason:
            self.previous_reasons += [reason]

    def predict_action(self, objective, observation, url=None):
        """Return ``(action, reason)``; the base implementation does nothing."""
        pass

    def receive_response(self, response):
        """Record a response in the response history."""
        self.previous_responses += [response]

    def _after_step(self, objective, env, observation, action, reason, status):
        """Log the finished step and report whether the action budget is spent."""
        if self.logging:
            self.log_step(
                objective=objective,
                url=env.get_url(),
                observation=observation,
                action=action,
                reason=reason,
                status=status,
            )

        if len(self.previous_actions) >= self.max_actions:
            print(f"Agent exceeded max actions: {self.max_actions}")
            return True
        return False

    def act(self, objective, env):
        """Run the agent on ``env`` until it is done or the budget is spent.

        Returns:
            The status returned by the last ``env.step`` call, or an empty
            dict if the environment was already done and no step was taken.
        """
        # Pre-bind so an already-finished env does not raise UnboundLocalError.
        status = {}
        while not env.done():
            observation = env.observation()
            action, reason = self.predict_action(
                objective=objective, observation=observation, url=env.get_url()
            )
            status = env.step(action)

            if self._after_step(objective, env, observation, action, reason, status):
                break

        return status

    async def async_act(self, objective, env):
        """Coroutine variant of :meth:`act` for envs with awaitable methods.

        ``env.observation`` and ``env.step`` are awaited; the return value
        follows the same rules as :meth:`act`.
        """
        status = {}
        while not env.done():
            observation = await env.observation()
            action, reason = self.predict_action(
                objective=objective, observation=observation, url=env.get_url()
            )
            status = await env.step(action)

            if self._after_step(objective, env, observation, action, reason, status):
                break

        return status

    def log_step(self, objective, url, observation, action, reason, status):
        """Append one step record to the trajectory.

        The record starts from whatever is already in ``data_to_log``, then
        gets the step fields and every key of ``status``.  The history
        fields exclude the last entry of each list.
        """
        self.data_to_log['objective'] = objective
        self.data_to_log['url'] = url
        self.data_to_log['observation'] = observation
        self.data_to_log['previous_actions'] = self.previous_actions[:-1]
        self.data_to_log['previous_responses'] = self.previous_responses[:-1]
        self.data_to_log['previous_reasons'] = self.previous_reasons[:-1]
        self.data_to_log['action'] = action
        self.data_to_log['reason'] = reason
        # An environment may hand back no status; log the step regardless.
        if status:
            for (k, v) in status.items():
                self.data_to_log[k] = v
        self.trajectory.append(self.data_to_log)
        self.data_to_log = {}
previous_history.append(f"{response} = {action}") 26 | else: 27 | previous_history.append(action) 28 | previous_history="\n".join(previous_history) 29 | else: 30 | previous_history="\n".join(self.previous_actions) 31 | 32 | return previous_history 33 | 34 | def predict_action(self, objective, observation, url=None): 35 | prompt = fill_prompt_template(prompt_template=self.prompt_template, objective=objective, 36 | observation=observation, url=url, 37 | previous_history=self.previous_history()) 38 | messages = construct_llm_message_hf(prompt=prompt, prompt_mode=self.prompt_mode, model_type=self.model_type) 39 | 40 | inputs = messages[0]["content"] # todo: generalize to 'chat' mode 41 | model_response = generate_prediction(inputs=inputs, model=self.model, tokenizer=self.tokenizer, **self.model_kwargs) 42 | action, reason = parse_action_reason(model_response) 43 | 44 | if self.logging: 45 | self.data_to_log['input'] = inputs 46 | self.data_to_log['model_response'] = model_response 47 | 48 | if self.verbose > 0: 49 | if self.verbose > 1: 50 | print(f"\n OBSERVATION: {observation}") 51 | print(f"\n RESPONSE: {model_response}") 52 | print(f"\n OBJECTIVE: {objective}") 53 | print(f"\n PREVIOUS HISTORY: {self.previous_history()}") 54 | print(f"\n REASON: {reason}") 55 | print(f"\n ACTION: {action}") 56 | 57 | if self.debug: 58 | human_input = input() 59 | if human_input != "c": 60 | action = human_input 61 | reason = "None" 62 | 63 | self.update_history(action=action, reason=reason) 64 | return action, reason -------------------------------------------------------------------------------- /src/webagents_step/agents/keyboard_agent.py: -------------------------------------------------------------------------------- 1 | from webagents_step.agents.agent import Agent 2 | 3 | class KeyboardAgent(Agent): 4 | def __init__(self, max_actions: int = 10, verbose: bool = False, logging: bool = False): 5 | super().__init__(max_actions=max_actions, verbose=verbose, logging=logging) 6 | 7 
class PlaybackAgent(Agent):
    """Agent that replays the actions of a previously recorded trajectory."""

    def __init__(self, max_actions: int = 1_000_000, verbose: bool = False, logging: bool = False,
                 debug: bool = False, playback_trajectory=None, previous_actions: List = None):
        """Set up the replay.

        Args:
            max_actions: Accepted for interface compatibility; it is replaced
                by the length of ``playback_trajectory``.
            verbose: Verbosity level; values above 0 print each step.
            logging: Forwarded to :class:`Agent`.
            debug: When true, wait for keyboard input before each action.
            playback_trajectory: List of dicts with at least the ``action``
                and ``reason`` keys.  ``None`` is treated as empty.
            previous_actions: Actions already taken; replay resumes at
                index ``len(previous_actions)``.
        """
        super().__init__(max_actions=max_actions, verbose=verbose, logging=logging, previous_actions=previous_actions)
        self.debug = debug
        # A missing trajectory means "nothing to replay", not a crash in len().
        self.playback_trajectory = [] if playback_trajectory is None else playback_trajectory
        self.max_actions = len(self.playback_trajectory)

    def predict_action(self, objective, observation, url=None):
        """Return the next recorded ``(action, reason)`` pair.

        Returns ``(None, None)`` once the trajectory is exhausted.  In debug
        mode any input other than ``"c"`` replaces the recorded action.
        """
        index = len(self.previous_actions)
        if index >= len(self.playback_trajectory):
            return None, None

        step = self.playback_trajectory[index]
        action = step['action']
        reason = step['reason']

        if self.verbose > 0:
            if self.verbose > 1:
                print(f"\n OBSERVATION: {observation}")
            print(f"\n OBJECTIVE: {objective}")
            print(f'\n PREVIOUS ACTIONS: {self.previous_actions}')
            print(f"\n REASON: {reason}")
            print(f"\n ACTION: {action}")

        if self.debug:
            human_input = input()
            if human_input != "c":
                action = human_input
                reason = "None"

        self.update_history(action=action, reason=reason)
        return action, reason
def previous_history(self):
    """Render the action history as newline-separated text.

    When there is one response per action, an action with a non-empty
    response is shown as ``"<response> = <action>"`` and any other action is
    shown on its own.  Otherwise only the non-None actions are listed.
    """
    actions = self.previous_actions
    responses = self.previous_responses

    if len(actions) != len(responses):
        # Histories are out of step: fall back to the bare actions.
        return "\n".join(entry for entry in actions if entry is not None)

    lines = [
        f"{response} = {action}" if response else action
        for action, response in zip(actions, responses)
    ]
    return "\n".join(lines)
class StepAgent(Agent):
    """Agent that composes prompt policies through a stack.

    A root :class:`PromptAgent` is created for the task objective.  When the
    agent on top of the stack emits an action whose type is a key of
    ``action_to_prompt_dict``, a new agent using that prompt is pushed; a
    stop action pops the current agent and hands the text inside its first
    square brackets to the agent below.
    """

    def __init__(self, max_actions: int = 10, verbose: bool = False, logging: bool = False,
                 debug: bool = False,
                 root_action: str = None,
                 action_to_prompt_dict: Dict = None,
                 low_level_action_list: List = None,
                 model: str = "gpt-3.5-turbo",
                 prompt_mode: str = "chat", previous_actions: List = None):
        """Store the configuration and start with an empty agent stack."""
        super().__init__(max_actions=max_actions, verbose=verbose, logging=logging, previous_actions=previous_actions)
        self.debug = debug
        self.root_action = root_action
        self.action_to_prompt_dict = {} if action_to_prompt_dict is None else action_to_prompt_dict
        self.low_level_action_list = [] if low_level_action_list is None else low_level_action_list
        self.model = model
        self.prompt_mode = prompt_mode
        self.stack = Stack()

    @staticmethod
    def _action_type(action):
        """Return the first whitespace-separated token of ``action`` or ``""``."""
        tokens = action.split() if action else []
        return tokens[0] if tokens else ""

    def _make_prompt_agent(self, prompt_template):
        """Build a sub-agent with a fresh history for ``prompt_template``."""
        return PromptAgent(
            prompt_template=prompt_template,
            model=self.model,
            prompt_mode=self.prompt_mode,
            max_actions=self.max_actions,
            verbose=self.verbose,
            logging=self.logging,
            debug=self.debug,
            previous_actions=[],
            previous_reasons=[],
            previous_responses=[]
        )

    def is_done(self, action):
        """Return True if ``action`` is a stop action.

        An action whose type is a known non-stop action is never treated as
        done, so ``type [3] [bus stop]`` no longer terminates the policy just
        because its argument contains "stop".  Actions of unknown shape keep
        the original lenient substring check.
        """
        if not action:
            return False
        action_type = self._action_type(action)
        if action_type.lower().startswith("stop"):
            return True
        if action_type in self.low_level_action_list or action_type in self.action_to_prompt_dict:
            return False
        return "stop" in action

    def is_low_level_action(self, action):
        """Return True if the action type is in ``low_level_action_list``."""
        return self._action_type(action) in self.low_level_action_list

    def is_high_level_action(self, action):
        """Return True if the action type is a key of ``action_to_prompt_dict``."""
        return self._action_type(action) in self.action_to_prompt_dict

    def init_root_agent(self, objective):
        """Create the stack element for the root policy."""
        agent = self._make_prompt_agent(self.action_to_prompt_dict[self.root_action])
        return {'agent': agent, 'objective': objective}

    def init_agent(self, action):
        """Create the stack element for the policy named by ``action``.

        The whole action string becomes the sub-agent's objective.
        """
        match = re.search(r'(\w+)\s+\[(.*?)\]', action)
        # Fall back to the first token so a bracket-less action that passed
        # is_high_level_action() cannot raise IndexError here.
        action_type = match.group(1) if match else self._action_type(action)
        agent = self._make_prompt_agent(self.action_to_prompt_dict[action_type])
        return {'agent': agent, 'objective': action}

    def predict_action(self, objective, observation, url=None):
        """Return the next low-level ``(action, reason)`` for the environment.

        Queries the agent on top of the stack until it emits a low-level
        action, pushing and popping sub-agents as needed.  If the root agent
        stops, its stop action is returned.
        """
        if self.stack.is_empty():
            self.stack.push(self.init_root_agent(objective=objective))

        action, reason = None, None
        while not self.stack.is_empty():
            element = self.stack.peek()
            action, reason = element['agent'].predict_action(objective=element['objective'], observation=observation, url=url)

            if self.is_done(action):
                self.stack.pop()
                if not self.stack.is_empty():
                    # Pass the bracketed answer to the caller; a stop action
                    # without brackets yields an empty response.
                    answer = re.search(r"\[(.*?)\]", action)
                    self.stack.peek()['agent'].receive_response(answer.group(1) if answer else "")
                if self.logging:
                    self.log_step(objective=element['objective'], url=url, observation=observation, action=action, reason=reason, status={})
                continue

            if self.is_low_level_action(action):
                element['agent'].receive_response("")
                return action, reason

            if self.is_high_level_action(action):
                self.stack.push(self.init_agent(action))
                if self.logging:
                    self.log_step(objective=element['objective'], url=url, observation=observation, action=action, reason=reason, status={})
                continue

            # NOTE(review): an unrecognised action falls through and the same
            # agent is queried again with no retry limit -- confirm intended.
        return action, reason
async def observation(self, tab_id=None, format=None):
    """Return the current page content from the parser.

    Args:
        tab_id: Forwarded to the parser when ``parser_type`` is "heihei".
        format: Observation format; defaults to ``text_observation_type``.

    Returns:
        The parser output unchanged for the "htree", "html" and "json"
        formats; otherwise its first ``max_browser_rows`` items converted to
        strings and joined with newlines.
    """
    import asyncio

    format = self.text_observation_type if format is None else format
    if self.parser_type == "heihei":
        try:
            browser_content = await self.parser.parse_page(
                format=format, tab_id=tab_id
            )
        except Exception:
            # Retry once after a pause.  ``except Exception`` lets task
            # cancellation propagate, and asyncio.sleep keeps the event loop
            # running where time.sleep would block it.
            await asyncio.sleep(self.parse_timeout)
            browser_content = await self.parser.parse_page(
                format=format, tab_id=tab_id
            )
    else:
        browser_content = await self.parser.parse_page()

    if format not in ["htree", "html", "json"]:
        browser_content = [str(w) for w in browser_content]
        browser_content = browser_content[: self.max_browser_rows]
        browser_content = "\n".join(browser_content)

    return browser_content
async def step(self, action, delay=None):
    """Execute ``action`` and return the environment status.

    Args:
        action: Action string passed to ``execute_action``.
        delay: Seconds to wait after the action; defaults to ``step_delay``.

    Returns:
        ``{"done": ..., "response": ...}``.  Once ``steps`` exceeds
        ``max_steps`` the environment is marked done and the same status
        shape is returned without executing the action.
    """
    import asyncio

    delay = self.step_delay if delay is None else delay

    if self.steps > self.max_steps:
        print(f"Steps {self.steps} exceeded maximum {self.max_steps}")
        self.is_done = True
        # Return a status dict rather than None so callers that read or
        # iterate the status keep working.
        return {"done": self.is_done, "response": self.response}

    print(f"[Step {self.steps+1}] {action}")
    try:
        await self.execute_action(action)
    except Exception as e:
        # Best effort: a failing action is reported, the episode continues.
        print(f"Error while executing action '{action}'. Details: {e}")

    # Non-blocking pause; time.sleep would stall the whole event loop.
    await asyncio.sleep(delay)
    self.steps = self.steps + 1

    return {"done": self.is_done, "response": self.response}
0.0 27 | 28 | def reset(self, seed=None): 29 | obs, _ = self.miniwob_env.reset(seed) 30 | self.obs = obs 31 | 32 | def close(self): 33 | self.miniwob_env.close() 34 | 35 | def observation(self): # browser content 36 | dom_elements = self.obs['dom_elements'] 37 | browser_content = miniwob_parser.parse_dom_browser_content( 38 | dom_elements, process_dates=True) 39 | browser_content = browser_content[:self.max_browser_rows] 40 | browser_content = "\n".join(browser_content) 41 | return browser_content 42 | 43 | def get_url(self): 44 | return self.url 45 | 46 | def done(self): 47 | if self.is_done: 48 | return True 49 | return False 50 | 51 | def parse_action(self, action): 52 | """ 53 | Parse a given action based on the action type, 54 | - click [id]: Clicks an element based on the provided id. 55 | - type [id] [content]: Types the provided content into the element with the specified id. 56 | - stop [response]: Stops execution and optionally provides a response. 57 | """ 58 | if not action: 59 | print("Action text is None") 60 | return None 61 | 62 | click_match = re.match(r"click \[(\S+)\]", action, re.IGNORECASE) 63 | type_match = re.match(r"type \[(\S+)\] \[(.+)\]", action, re.IGNORECASE) 64 | stop_match = re.match(r"stop \[([^\]]*)\]", action, re.IGNORECASE) 65 | 66 | action_cmd = None 67 | if click_match: 68 | id = click_match.group(1) 69 | action_cmd = create_element_click_action(int(id)) if id.isdigit() else None 70 | elif type_match: 71 | id = type_match.group(1) 72 | content = type_match.group(2) 73 | action_cmd = create_focus_and_type_action(int(id), content) if id.isdigit() else None 74 | elif stop_match: 75 | self.response = stop_match.group(1) 76 | self.is_done = True 77 | action_cmd = None 78 | else: 79 | print(f"[MiniWoBEnvironmentWrapper::parse_action] Error {action} not defined") 80 | 81 | return action_cmd 82 | 83 | def status(self): 84 | return {'done': self.is_done, 'reward': self.reward, 'success': float(self.reward > 0), 'num_actions': self.steps} 
85 | 86 | def step(self, action): 87 | self.steps = self.steps + 1 88 | print(f"[Step {self.steps}] {action}") 89 | 90 | if self.steps > self.max_steps: 91 | print(f"Steps {self.steps} exceeded maximum {self.max_steps}") 92 | self.is_done = True 93 | self.reward = -1 94 | return self.status() 95 | 96 | action_cmd = self.parse_action(action) 97 | if action_cmd: 98 | try: 99 | obs, reward, done, truncated, info = self.miniwob_env.step(action_cmd) 100 | self.obs = obs 101 | if "raw_reward" in info: 102 | self.reward = info['raw_reward'] 103 | else: 104 | self.reward = reward 105 | self.is_done = done 106 | except Exception as e: 107 | print(f"Error occurred while taking step: {e}") 108 | 109 | return self.status() 110 | 111 | def get_objective(self): 112 | return self.objective -------------------------------------------------------------------------------- /src/webagents_step/environment/webarena.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ[ 3 | "SHOPPING" 4 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7770" 5 | os.environ[ 6 | "SHOPPING_ADMIN" 7 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7780/admin" 8 | os.environ[ 9 | "REDDIT" 10 | ] = "https://webarena-env-reddit.awsdev.asapp.com" 11 | os.environ[ 12 | "GITLAB" 13 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8023/" 14 | os.environ[ 15 | "MAP" 16 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:3000" 17 | os.environ[ 18 | "WIKIPEDIA" 19 | ] = "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" 20 | os.environ[ 21 | "HOMEPAGE" 22 | ] = "PASS" # The home page is not currently hosted in the demo site 23 | 24 | 25 | from webagents_step.environment.env import WebEnvironment 26 | import json 27 | import re 28 | # Init an environment 29 | from browser_env import ( 30 | create_id_based_action, 31 | StateInfo, 32 
class WebArenaEnvironmentWrapper(WebEnvironment):
    """Expose a WebArena task through the ``WebEnvironment`` interface.

    The wrapper drives a ``ScriptBrowserEnv``, records the trajectory of
    actions and states, and scores it with the evaluator selected by
    ``evaluator_router`` once the episode is done.
    """

    def __init__(self, config_file, max_browser_rows=300, max_steps=50, slow_mo=1, observation_type="accessibility_tree", current_viewport_only=False, viewport_size=None, headless=False):
        """Create the browser environment and start the first episode.

        Args:
            config_file: Path to a JSON task config; its ``intent`` and
                ``start_url`` entries are read here.
            max_browser_rows: Maximum number of observation text lines
                returned by ``observation``.
            max_steps: Number of ``step`` calls allowed before a stop action
                is forced.
            slow_mo, observation_type, current_viewport_only, headless:
                Forwarded unchanged to ``ScriptBrowserEnv``.
            viewport_size: Forwarded to ``ScriptBrowserEnv``; defaults to
                ``{"width": 1280, "height": 720}``.
        """
        if viewport_size is None:
            # Built per call so instances never share one mutable default dict.
            viewport_size = {"width": 1280, "height": 720}

        self.webarena_env = ScriptBrowserEnv(
            headless=headless,
            slow_mo=slow_mo,
            observation_type=observation_type,
            current_viewport_only=current_viewport_only,
            viewport_size=viewport_size
        )
        self.config_file = config_file
        with open(self.config_file, "r") as f:
            self.config = json.load(f)

        self.objective = self.config["intent"]
        self.max_browser_rows = max_browser_rows
        self.max_steps = max_steps
        self._begin_episode()

    def _begin_episode(self):
        """Reset the browser environment and all per-episode bookkeeping."""
        self.obs, self.info = self.webarena_env.reset(options={"config_file": self.config_file})
        self.terminated = False
        self.url = self.config["start_url"]
        self.steps = 0
        self.is_done = False
        self.reward = 0.0
        self.action_limit_exceeded = False

        self.trajectory: Trajectory = []
        # Records the initial state as the first trajectory entry.
        self.update_webarena_metrics()

    def reset(self):
        """Start a fresh episode.

        Besides resetting the browser, this clears the trajectory, step
        counter, done flag and reward so that no state leaks over from the
        previous episode.
        """
        self._begin_episode()

    def close(self):
        """Close the wrapped browser environment."""
        self.webarena_env.close()

    def get_url(self):
        """Return the most recently recorded page URL."""
        return self.url

    def get_objective(self):
        """Return the task intent read from the config file."""
        return self.objective

    def observation(self):
        """Refresh the observation and URL and return the page text.

        The text observation is truncated to ``max_browser_rows`` lines.
        """
        self.obs = self.webarena_env._get_obs()
        self.url = self.webarena_env.page.url
        lines = self.obs["text"].split("\n")[:self.max_browser_rows]
        return "\n".join(lines)

    def done(self):
        """Return True once the episode has finished."""
        return bool(self.is_done)

    def status(self):
        """Return a summary dict of the episode state."""
        return {'done': self.is_done, 'reward': self.reward, 'success': float(self.reward > 0), 'num_actions': self.steps, 'action_limit_exceeded': self.action_limit_exceeded}

    def step(self, action):
        """Execute one text action and return ``status()``.

        Empty actions and ``note [...]`` actions are counted but not sent to
        the browser. Once more than ``max_steps`` calls have been made, a
        ``stop [N/A]`` action is recorded and the episode is evaluated.
        Errors raised while stepping the browser are printed, not re-raised.
        """
        self.steps = self.steps + 1
        print(f"[Step {self.steps}] {action}")

        if self.steps > self.max_steps:
            print(f"Steps {self.steps} exceeded maximum {self.max_steps}")
            self.action_limit_exceeded = True
            self.is_done = True
            action_cmd = create_id_based_action("stop [N/A]")
            self.update_webarena_metrics(action_cmd)
            return self.status()

        # Check for a missing action first: substring tests on None raise
        # TypeError, and `is ""` compares identity rather than equality.
        if not action or "note [" in action:
            action_cmd = None
        else:
            if "stop [" in action:
                # Strip backslashes from the stop answer before parsing.
                action = action.replace('\\', '')
            action_cmd = create_id_based_action(action)

        if action_cmd:
            try:
                self.obs, _, self.terminated, _, self.info = self.webarena_env.step(action_cmd)
                self.update_webarena_metrics(action_cmd)
            except Exception as e:
                print(f"Error occurred while taking step: {e}")

        return self.status()

    def update_webarena_metrics(self, action_cmd=None):
        """Extend the trajectory and, when the episode is done, compute the reward.

        Args:
            action_cmd: The action just executed, if any. A STOP action marks
                the episode as done.
        """
        # Append action (if any) and resulting state
        if action_cmd:
            self.trajectory.append(action_cmd)
            if action_cmd["action_type"] == ActionTypes.STOP:
                self.is_done = True

        if not self.is_done:  # If we are done, no need to append state
            state_info: StateInfo = {"observation": self.obs, "info": self.info}
            self.trajectory.append(state_info)

        if self.is_done:
            try:
                evaluator = evaluator_router(self.config_file)
                self.reward = evaluator(trajectory=self.trajectory, config_file=self.config_file, page=self.webarena_env.page, client=self.webarena_env.get_page_client(self.webarena_env.page))
            except Exception as e:
                # Evaluation failures count as an unsuccessful episode.
                print(f"Got excepetion: {e}")
                self.reward = 0
def parse_dom_browser_content(dom, ignore_tags=("tr", "td"), format='html', process_dates=False):
    """
    Parse dom observations into a browser action set

    Args:
        dom: Iterable of node dicts with 'tag', 'ref', 'text' and 'id' keys
            and an optional 'value' key.
        ignore_tags: Tags whose nodes are skipped.
        format: Output format; only "html" is implemented.
        process_dates: When true, the rows are passed through
            ``parse_dates_table``.

    Returns:
        A list with one ``<tag id=... val=... />`` string per kept node.

    Raises:
        NotImplementedError: If ``format`` is not "html".
    """
    if format != "html":
        raise NotImplementedError(f"Unsupported format: {format!r}")

    return parse_dom_browser_content_html(
        dom, ignore_tags=ignore_tags, process_dates=process_dates)


def parse_dom_browser_content_html(dom, ignore_tags, process_dates):
    """Render each DOM node as a ``<tag id=REF val=VALUE />`` row.

    VALUE is the node's text when non-empty, otherwise its 'value' entry when
    present and non-empty, otherwise its 'id'.
    """
    action_set = []

    for node in dom:
        if node['tag'] in ignore_tags:
            continue

        if node['text']:
            shown = node['text']
        elif node.get('value'):
            shown = node['value']
        else:
            shown = node['id']
        action_set.append(f"<{node['tag']} id={node['ref']} val={shown} />")

    if process_dates:
        action_set = parse_dates_table(action_set)

    return action_set


def parse_dates_table(action_set):
    """Post-process rows that contain a ``ui-datepicker-div`` element.

    Rows without a datepicker, or whose text contains no month name or no
    year between 2000 and 2050, are returned unchanged.
    """
    action_set_text = "\n".join(action_set)
    if "val=ui-datepicker-div" not in action_set_text:
        return action_set

    month_match = re.search(
        r"\b(January|February|March|April|May|June|July|August|September|October|November|December)\b",
        action_set_text)
    year_match = re.search(r"20[0-4][0-9]|2050", action_set_text)
    if month_match is None or year_match is None:
        # Nothing to anchor the dates to; leave the rows as they are rather
        # than failing on a missing match.
        return action_set

    month = datetime.datetime.strptime(month_match.group(0), '%B').month  # month number
    year = year_match.group(0)

    # NOTE(review): the pattern and replacement below are empty in this copy
    # of the file, so the substitution leaves the text unchanged and `month`
    # / `year` go unused. Presumably both originally held an element template
    # that was lost -- confirm against the upstream source.
    pattern = r''
    action_set_text = re.sub(
        pattern, lambda m: f'', action_set_text)

    return action_set_text.split("\n")
class PlaywrightParserWebArena:
    """Drive a Chromium page with Playwright and expose WebArena observations.

    The page is observed through a ``TextObervationProcessor`` or an
    ``ImageObservationProcessor`` depending on ``observation_type``.
    """

    def __init__(
        self,
        headless=True,
        observation_type="text",
        text_observation_type="accessibility_tree",
        viewport_size=None,
        current_viewport_only=True,
    ):
        """Launch the browser, open a page and build the observation processors.

        Args:
            headless: Passed to ``chromium.launch``.
            observation_type: "text" or "image"; selects the processor
                returned by ``observation_processor``.
            text_observation_type: Observation type given to the text
                processor.
            viewport_size: Dict with "width" and "height"; defaults to
                ``{"width": 1280, "height": 1080}``.
            current_viewport_only: Forwarded to the text processor.
        """
        if viewport_size is None:
            # Built per call so instances never share one mutable default dict.
            viewport_size = {"width": 1280, "height": 1080}

        self.headless = headless
        self.viewport_size = viewport_size
        self.current_viewport_only = current_viewport_only
        self.observation_type = observation_type
        self.text_observation_type = text_observation_type

        self.playwright = sync_playwright().start()
        self.browser = self.playwright.chromium.launch(headless=self.headless)
        self.context = self.browser.new_context(
            viewport=self.viewport_size,
            device_scale_factor=1,
        )

        self.page = self.context.new_page()
        client = self.page.context.new_cdp_session(self.page)
        if (self.observation_type == "text") and (
            self.text_observation_type == "accessibility_tree"
        ):
            client.send("Accessibility.enable")
        # The CDP session is stored on the page so parse_page can reuse it.
        self.page.client = client

        self.text_processor = TextObervationProcessor(
            observation_type=self.text_observation_type,
            current_viewport_only=self.current_viewport_only,
            viewport_size=self.viewport_size,
        )
        self.image_processor = ImageObservationProcessor(observation_type="image")

    @staticmethod
    def clear_page_presets():
        """Placeholder; currently does nothing.

        Declared static so it can be called on the class or on an instance.
        """
        pass

    def observation_processor(self):
        """Return the processor matching ``observation_type``.

        Raises:
            ValueError: If ``observation_type`` is neither "text" nor "image".
        """
        if self.observation_type == "text":
            return self.text_processor
        if self.observation_type == "image":
            return self.image_processor
        raise ValueError("Invalid observation type")

    def get_url(self):
        """Return the URL of the current page."""
        return self.page.url

    def go_to_page(self, url: str):
        """Navigate to ``url``, prefixing ``http://`` when no scheme is given."""
        self.page.goto(url if "://" in url else "http://" + url)

    def close(self):
        """Close the browser and stop the Playwright driver started in ``__init__``."""
        self.browser.close()
        self.playwright.stop()

    def click_xy(self, x: float, y: float) -> None:
        """Click at ``(x, y)``, given as fractions of the viewport width and height."""
        viewport_size = self.page.viewport_size
        self.page.mouse.click(x * viewport_size["width"], y * viewport_size["height"])

    def click(self, id: int) -> None:
        """Click the centre of the element with the given id."""
        center = self.observation_processor().get_element_center(id)
        self.click_xy(center[0], center[1])

    def type(self, id: int, text: str, clear: bool = True):
        """Focus the element by clicking it and type ``text``.

        When ``clear`` is true the element's existing content is removed first.
        """
        if clear:
            self.clear(id)
        self.click(id)
        self.page.keyboard.type(text)

    def clear(self, id: int) -> None:
        """Select everything in the element and delete it."""
        self.click(id)
        # Select-all uses the Meta key on macOS and Control elsewhere.
        select_key = "Meta" if platform.startswith("darwin") else "Control"
        self.page.keyboard.down(select_key)
        self.page.keyboard.press("a")
        self.page.keyboard.up(select_key)
        self.page.keyboard.press("Backspace")

    def parse_page(self):
        """Return the current page observation from the active processor."""
        return self.observation_processor().process(
            page=self.page, client=self.page.client
        )
You can issue any one of the actions below: 5 | click [id]: Clicks an element corresponding to the provided id. 6 | type [id] [content]: Types the provided content into the element corresponding to the provided id. 7 | stop [answer]: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. Otherwise, leave it empty. 8 | 9 | Examples of actions are click [7], type [11] [New York]. Please issue only one action at a time. 10 | 11 | You will be provided with the following, 12 | OBJECTIVE: 13 | The goal you need to achieve. 14 | OBSERVATION: 15 | A simplified text description of the current browser content, without formatting elements. 16 | URL: 17 | The current webpage URL 18 | PREVIOUS ACTIONS: 19 | A list of your past actions 20 | 21 | You need to generate response containing, 22 | REASON: 23 | A rationale for selecting the action below 24 | ACTION: 25 | A single action 26 | 27 | Please generate in the following format: 28 | 29 | REASON: 30 | Your reason here 31 | ACTION: 32 | Your action here 33 | """, 34 | 35 | "input": """ 36 | OBJECTIVE: 37 | {objective} 38 | OBSERVATION: 39 | {observation} 40 | URL: 41 | {url} 42 | PREVIOUS ACTIONS: 43 | {previous_actions} 44 | """, 45 | 46 | "response": "", 47 | 48 | "examples": [ 49 | { 50 | "input": """ 51 | OBJECTIVE: 52 | Book the shortest one-way flight from: LEB to: RDG on 12/26/2016. 53 | OBSERVATION: 54 | 55 |
56 |
57 |
58 |

59 |
60 | 61 |
62 | 63 |
64 | URL: 65 | 66 | PREVIOUS ACTIONS: 67 | 68 | """, 69 | "response": """ 70 | REASON: 71 | I have no previous actions. 72 | I have to first type "LEB" in the field flight-from corresponding to id 7 73 | ACTION: 74 | type [7] [LEB] 75 | """}, 76 | { 77 | "input": """ 78 | OBJECTIVE: 79 | Book the shortest one-way flight from: LEB to: RDG on 12/26/2016. 80 | CURRENT BROWSER CONTENT: 81 | 82 |
83 |
84 |
85 |

86 |
87 | 88 |
89 |
90 |