├── .github └── workflows │ └── lint.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── WAA_setup.pdf ├── evaluation_sets ├── test_all.json └── test_small_new.json ├── gui_agents ├── __init__.py ├── s1 │ ├── README.md │ ├── WindowsAgentArena.md │ ├── aci │ │ ├── ACI.py │ │ ├── LinuxOSACI.py │ │ ├── MacOSACI.py │ │ ├── WindowsOSACI.py │ │ ├── __init__.py │ │ └── windowsagentarena │ │ │ └── GroundingAgent.py │ ├── cli_app.py │ ├── core │ │ ├── AgentS.py │ │ ├── BaseModule.py │ │ ├── Knowledge.py │ │ ├── Manager.py │ │ ├── ProceduralMemory.py │ │ ├── Worker.py │ │ └── __init__.py │ ├── mllm │ │ ├── MultimodalAgent.py │ │ ├── MultimodalEngine.py │ │ └── __init__.py │ └── utils │ │ ├── __init__.py │ │ ├── common_utils.py │ │ ├── ocr_server.py │ │ └── query_perplexica.py ├── s2 │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── agent_s.py │ │ ├── grounding.py │ │ ├── manager.py │ │ └── worker.py │ ├── cli_app.py │ ├── core │ │ ├── __init__.py │ │ ├── engine.py │ │ ├── knowledge.py │ │ ├── mllm.py │ │ └── module.py │ ├── memory │ │ ├── __init__.py │ │ └── procedural_memory.py │ └── utils │ │ ├── __init__.py │ │ ├── common_utils.py │ │ └── query_perplexica.py └── utils.py ├── images ├── .DS_Store ├── agent_s.png ├── agent_s2_architecture.png ├── agent_s2_osworld_result.png ├── agent_s2_teaser.png ├── agent_s_architecture.pdf ├── osworld_result.png ├── results.pdf ├── results.png ├── teaser.png └── windows_result.png ├── models.md ├── osworld_setup ├── s1 │ ├── OSWorld.md │ ├── lib_run_single.py │ └── run.py └── s2 │ ├── OSWorld.md │ ├── lib_run_single.py │ └── run.py ├── requirements.txt ├── server.py ├── setup.py └── tests ├── test_aci.py ├── test_app_switching.py ├── test_uielement_base.py ├── test_uielement_linux.py ├── test_uielement_macos.py └── test_uielement_osworld.py /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | pull_request: 4 | types: [opened, reopened, synchronize] 5 | paths: 6 | - "gui_agents/**" 7 | - "tests/**" 8 | - ".github/workflows/lint.yml" 9 | push: 10 | branches: 11 | - main 12 | paths: 13 | - "gui_agents/**" 14 | - "tests/**" 15 | - ".github/workflows/lint.yml" 16 | 17 | env: 18 | SUPPORTED_PYTHON_VERSIONS: "3.11" 19 | 20 | jobs: 21 | build: 22 | runs-on: ubuntu-latest 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | python-version: ["3.10", "3.11"] 27 | steps: 28 | - uses: actions/checkout@v3 29 | 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v4 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | pip install -e .[dev] 39 | 40 | - name: Run Linter 41 | run: | 42 | black --check gui_agents tests 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so 
as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | logs/ 164 | .DS_Store -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Perplexica"] 2 | path = Perplexica 3 | url = https://github.com/ItzCrazyKns/Perplexica 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /WAA_setup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/WAA_setup.pdf -------------------------------------------------------------------------------- /evaluation_sets/test_small_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "os": [ 3 | "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", 4 | "5812b315-e7bd-4265-b51f-863c02174c28", 5 | "c288e301-e626-4b98-a1ab-159dcb162af5", 6 | "4783cc41-c03c-4e1b-89b4-50658f642bd5", 7 | "5c1075ca-bb34-46a3-a7a0-029bd7463e79", 8 | "5ced85fc-fa1a-4217-95fd-0fb530545ce2" 9 | ], 10 | "gimp": [ 11 | "a746add2-cab0-4740-ac36-c3769d9bfb46", 12 | "7a4deb26-d57d-4ea9-9a73-630f66a7b568", 13 | "d52d6308-ec58-42b7-a2c9-de80e4837b2b", 14 | "2a729ded-3296-423d-aec4-7dd55ed5fbb3", 15 | "d16c99dc-2a1e-46f2-b350-d97c86c85c15" 16 | ], 17 | "chrome": [ 18 | "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", 19 | "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", 20 | "35253b65-1c19-4304-8aa4-6884b8218fc0", 21 | "a96b564e-dbe9-42c3-9ccf-b4498073438a", 22 | "e1e75309-3ddb-4d09-92ec-de869c928143", 23 | "82bc8d6a-36eb-4d2d-8801-ef714fb1e55a" 24 | ], 25 | "thunderbird": [ 26 | "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", 27 | "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", 28 | "2ad9387a-65d8-4e33-ad5b-7580065a27ca", 29 | "480bcfea-d68f-4aaa-a0a9-2589ef319381", 30 | "030eeff7-b492-4218-b312-701ec99ee0cc" 31 | ], 32 | "vs_code": [ 33 | "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", 34 | "dcbe20e8-647f-4f1d-8696-f1c5bbb570e3", 35 | "9439a27b-18ae-42d8-9778-5f68f891805e", 36 | "7c4cc09e-7a92-40dd-8338-b2286535c4ed", 37 | "9d425400-e9b2-4424-9a4b-d4c7abac4140" 38 | ], 39 | "vlc": [ 40 | "59f21cfb-0120-4326-b255-a5b827b38967", 41 | "8f080098-ddb1-424c-b438-4e96e5e4786e", 42 | "5ac2891a-eacd-4954-b339-98abba077adb", 43 | "f3977615-2b45-4ac5-8bba-80c17dbe2a37", 44 | "215dfd39-f493-4bc3-a027-8a97d72c61bf" 45 | ], 46 | "libreoffice_calc": [ 47 | "357ef137-7eeb-4c80-a3bb-0951f26a8aff", 48 | "42e0a640-4f19-4b28-973d-729602b5a4a7", 49 | "abed40dc-063f-4598-8ba5-9fe749c0615d", 50 | "035f41ba-6653-43ab-aa63-c86d449d62e5", 51 | "7efeb4b1-3d19-4762-b163-63328d66303b" 52 | ], 53 | "libreoffice_impress": [ 54 | "5d901039-a89c-4bfb-967b-bf66f4df075e", 55 | "550ce7e7-747b-495f-b122-acdc4d0b8e54", 56 | "ac9bb6cb-1888-43ab-81e4-a98a547918cd", 57 | "2cd43775-7085-45d8-89fa-9e35c0a915cf", 58 | "358aa0a7-6677-453f-ae35-e440f004c31e", 59 | "a669ef01-ded5-4099-9ea9-25e99b569840" 60 | ], 61 | "libreoffice_writer": [ 62 | "0810415c-bde4-4443-9047-d5f70165a697", 63 | "e246f6d8-78d7-44ac-b668-fcf47946cb50", 64 | "d53ff5ee-3b1a-431e-b2be-30ed2673079b", 65 | "b21acd93-60fd-4127-8a43-2f5178f4a830", 66 | "0a0faba3-5580-44df-965d-f562a99b291c", 67 | "adf5e2c3-64c7-4644-b7b6-d2f0167927e7" 68 | ], 69 | "multi_apps": [ 70 | "a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a", 71 | "5990457f-2adb-467b-a4af-5c857c92d762", 72 | "2b9493d7-49b8-493a-a71b-56cd1f4d6908", 73 | "acb0f96b-e27c-44d8-b55f-7cb76609dfcd", 74 | "c867c42d-a52d-4a24-8ae3-f75d256b5618", 75 | "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", 76 | "b5062e3e-641c-4e3a-907b-ac864d2e7652", 77 | "48d05431-6cd5-4e76-82eb-12b60d823f7d", 78 | "eb303e01-261e-4972-8c07-c9b4e7a4922a", 79 | "d1acdb87-bb67-4f30-84aa-990e56a09c92", 80 | "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", 81 | "8e116af7-7db7-4e35-a68b-b0939c066c78", 82 | "716a6079-22da-47f1-ba73-c9d58f986a38", 83 | 
"46407397-a7d5-4c6b-92c6-dbe038b1457b", 84 | "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", 85 | "897e3b53-5d4d-444b-85cb-2cdc8a97d903" 86 | ] 87 | } 88 | -------------------------------------------------------------------------------- /gui_agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/README.md: -------------------------------------------------------------------------------- 1 |

2 | Agent S:
3 | Using Computers Like a Human
4 | 
5 | 
6 | 
7 | 🌐 [Website]
8 | 📄 [Paper]
9 | 🎥 [Video]
10 | 🗨️ [Discord]
11 | 

12 | 13 | ## 🥳 Updates 14 | - [x] **2025/01/22**: The [Agent S paper](https://arxiv.org/abs/2410.08164) is accepted to ICLR 2025! 15 | - [x] **2025/01/21**: Released v0.1.2 of [gui-agents](https://github.com/simular-ai/Agent-S) library, with support for Linux and Windows! 16 | - [x] **2024/12/05**: Released v0.1.0 of [gui-agents](https://github.com/simular-ai/Agent-S) library, allowing you to use Agent-S for Mac, OSWorld, and WindowsAgentArena with ease! 17 | - [x] **2024/10/10**: Released [Agent S paper](https://arxiv.org/abs/2410.08164) and codebase! 18 | 19 | ## Table of Contents 20 | 21 | 1. [💡 Introduction](#-introduction) 22 | 2. [🎯 Current Results](#-current-results) 23 | 3. [🛠️ Installation](#%EF%B8%8F-installation) 24 | 4. [🚀 Usage](#-usage) 25 | 5. [🙌 Contributors](#-contributors) 26 | 6. [💬 Citation](#-citation) 27 | 28 | ## 💡 Introduction 29 | 30 |

31 | 
32 | 

33 | 
34 | Welcome to **Agent S**, an open-source framework designed to enable autonomous interaction with computers through an Agent-Computer Interface. Our mission is to build intelligent GUI agents that can learn from past experiences and perform complex tasks autonomously on your computer.
35 | 
36 | Whether you're interested in AI, automation, or contributing to cutting-edge agent-based systems, we're excited to have you here!
37 | 
38 | ## 🎯 Current Results
39 | 
40 | 

41 | 
42 | 
43 | Success Rate (%) on the OSWorld full test set of all 369 test examples, using Image + Accessibility Tree input.
44 | 

45 | 46 | 47 | ## 🛠️ Installation & Setup 48 | 49 | > ❗**Warning**❗: If you are on a Linux machine, creating a `conda` environment will interfere with `pyatspi`. As of now, there's no clean solution for this issue. Proceed through the installation without using `conda` or any virtual environment. 50 | 51 | Clone the repository: 52 | ``` 53 | git clone https://github.com/simular-ai/Agent-S.git 54 | ``` 55 | 56 | Install the gui-agents package: 57 | ``` 58 | pip install gui-agents 59 | ``` 60 | 61 | Set your LLM API Keys and other environment variables. You can do this by adding the following line to your .bashrc (Linux), or .zshrc (MacOS) file. 62 | 63 | ``` 64 | export OPENAI_API_KEY= 65 | ``` 66 | 67 | Alternatively, you can set the environment variable in your Python script: 68 | 69 | ``` 70 | import os 71 | os.environ["OPENAI_API_KEY"] = "" 72 | ``` 73 | 74 | We also support Azure OpenAI, Anthropic, and vLLM inference. For more information refer to [../../models.md](models.md). 75 | 76 | ### Setup Retrieval from Web using Perplexica 77 | Agent S works best with web-knowledge retrieval. To enable this feature, you need to setup Perplexica: 78 | 79 | 1. Ensure Docker Desktop is installed and running on your system. 80 | 81 | 2. Navigate to the directory containing the project files. 82 | 83 | ```bash 84 | cd Perplexica 85 | git submodule update --init 86 | ``` 87 | 88 | 3. Rename the `sample.config.toml` file to `config.toml`. For Docker setups, you need only fill in the following fields: 89 | 90 | - `OPENAI`: Your OpenAI API key. **You only need to fill this if you wish to use OpenAI's models**. 91 | - `OLLAMA`: Your Ollama API URL. You should enter it as `http://host.docker.internal:PORT_NUMBER`. If you installed Ollama on port 11434, use `http://host.docker.internal:11434`. For other ports, adjust accordingly. **You need to fill this if you wish to use Ollama's models instead of OpenAI's**. 92 | - `GROQ`: Your Groq API key. **You only need to fill this if you wish to use Groq's hosted models**. 93 | - `ANTHROPIC`: Your Anthropic API key. **You only need to fill this if you wish to use Anthropic models**. 94 | 95 | **Note**: You can change these after starting Perplexica from the settings dialog. 96 | 97 | - `SIMILARITY_MEASURE`: The similarity measure to use (This is filled by default; you can leave it as is if you are unsure about it.) 98 | 99 | 4. Ensure you are in the directory containing the `docker-compose.yaml` file and execute: 100 | 101 | ```bash 102 | docker compose up -d 103 | ``` 104 | 105 | 5. Next, export your Perplexica URL. This URL is used to interact with the Perplexica API backend. The port is given by the `config.toml` in your Perplexica directory. 106 | 107 | ```bash 108 | export PERPLEXICA_URL=http://localhost:{port}/api/search 109 | ``` 110 | 111 | 6. Our implementation of Agent S incorporates the Perplexica API to integrate a search engine capability, which allows for a more convenient and responsive user experience. If you want to tailor the API to your settings and specific requirements, you may modify the URL and the message of request parameters in `agent_s/query_perplexica.py`. For a comprehensive guide on configuring the Perplexica API, please refer to [Perplexica Search API Documentation](https://github.com/ItzCrazyKns/Perplexica/blob/master/docs/API/SEARCH.md) 112 | 113 | For a more detailed setup and usage guide, please refer to the [Perplexica Repository](https://github.com/ItzCrazyKns/Perplexica.git). 
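For reference, the repository's `query_perplexica.py` helper essentially sends a JSON POST to the `PERPLEXICA_URL` endpoint exported above. The sketch below is illustrative only: the `focusMode` request field and the `message` field in the response are assumptions based on the Perplexica Search API documentation linked above, and may differ between Perplexica versions, so verify them against your installation.

```python
# Minimal sketch of a Perplexica search call (illustrative; the "focusMode"
# request field and the "message" response field are assumed from the
# Perplexica Search API docs and may vary between versions).
import os
import requests

def search_perplexica(query: str) -> str:
    # Uses the same endpoint exported earlier, e.g. http://localhost:{port}/api/search
    url = os.environ["PERPLEXICA_URL"]
    payload = {
        "focusMode": "webSearch",  # assumed default focus mode
        "query": query,
    }
    resp = requests.post(url, json=payload, timeout=60)
    resp.raise_for_status()
    # The response is expected to be JSON with a "message" field holding the answer.
    return resp.json().get("message", "")

if __name__ == "__main__":
    print(search_perplexica("How do I change the default font in LibreOffice Writer?"))
```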
114 | 115 | ### Setup Paddle-OCR Server 116 | 117 | Switch to a new terminal where you will run Agent S. Set the OCR_SERVER_ADDRESS environment variable as shown below. For a better experience, add the following line directly to your .bashrc (Linux), or .zshrc (MacOS) file. 118 | 119 | ``` 120 | export OCR_SERVER_ADDRESS=http://localhost:8000/ocr/ 121 | ``` 122 | 123 | Run the ocr_server.py file code to use OCR-based bounding boxes. 124 | 125 | ``` 126 | cd Agent-S 127 | python gui_agents/utils/ocr_server.py 128 | ``` 129 | 130 | You can change the server address by editing the address in [gui_agents/s1/utils/ocr_server.py](utils/ocr_server.py) file. 131 | 132 | 133 | > ❗**Warning**❗: The agent will directly run python code to control your computer. Please use with care. 134 | 135 | ## 🚀 Usage 136 | 137 | ### CLI 138 | 139 | Run agent_s on your computer using: 140 | ``` 141 | agent_s1 --model gpt-4o 142 | ``` 143 | This will show a user query prompt where you can enter your query and interact with Agent S. You can use any model from the list of supported models in [models.md](../../models.md). 144 | 145 | ### `gui_agents` SDK 146 | 147 | To deploy Agent S on MacOS or Windows: 148 | 149 | ``` 150 | import pyautogui 151 | import io 152 | from gui_agents.core.AgentS import GraphSearchAgent 153 | import platform 154 | 155 | if platform.system() == "Darwin": 156 | from gui_agents.aci.MacOSACI import MacOSACI, UIElement 157 | grounding_agent = MacOSACI() 158 | elif platform.system() == "Windows": 159 | from gui_agents.aci.WindowsOSACI import WindowsACI, UIElement 160 | grounding_agent = WindowsACI() 161 | elif platform.system() == "Linux": 162 | from gui_agents.aci.LinuxOSACI import LinuxACI, UIElement 163 | grounding_agent = LinuxACI() 164 | else: 165 | raise ValueError("Unsupported platform") 166 | 167 | engine_params = { 168 | "engine_type": "openai", 169 | "model": "gpt-4o", 170 | } 171 | 172 | agent = GraphSearchAgent( 173 | engine_params, 174 | grounding_agent, 175 | platform="ubuntu", # "macos", "windows" 176 | action_space="pyautogui", 177 | observation_type="mixed", 178 | search_engine="Perplexica" 179 | ) 180 | 181 | # Get screenshot. 182 | screenshot = pyautogui.screenshot() 183 | buffered = io.BytesIO() 184 | screenshot.save(buffered, format="PNG") 185 | screenshot_bytes = buffered.getvalue() 186 | 187 | # Get accessibility tree. 188 | acc_tree = UIElement.systemWideElement() 189 | 190 | obs = { 191 | "screenshot": screenshot_bytes, 192 | "accessibility_tree": acc_tree, 193 | } 194 | 195 | instruction = "Close VS Code" 196 | info, action = agent.predict(instruction=instruction, observation=obs) 197 | 198 | exec(action[0]) 199 | ``` 200 | 201 | Refer to `cli_app.py` for more details on how the inference loop works. 202 | 203 | #### Downloading the Knowledege Base 204 | 205 | Agent S2 uses a knowledge base that continually updates with new knowledge during inference. The knowledge base is initially downloaded when initializing `GraphSearchAgent`. The knowledge base is stored as assets under our [GitHub Releases](https://github.com/simular-ai/Agent-S/releases). The `GraphSearchAgent` initialization will only download the knowledge base for your specified platform and agent version (e.g s1, s2). 
If you'd like to download the knowledge base programmatically, you can use the following code: 206 | 207 | ``` 208 | download_kb_data( 209 | version="s2", 210 | release_tag="v0.2.2", 211 | download_dir="kb_data", 212 | platform="linux" # "darwin", "windows" 213 | ) 214 | ``` 215 | 216 | This will download Agent S2's knowledge base for Linux from release tag `v0.2.2` to the `kb_data` directory. Refer to our [GitHub Releases](https://github.com/simular-ai/Agent-S/releases) or release tags that include the knowledge bases. 217 | 218 | ### OSWorld 219 | 220 | To deploy Agent S in OSWorld, follow the [OSWorld Deployment instructions](OSWorld.md). 221 | 222 | ### WindowsAgentArena 223 | 224 | To deploy Agent S in WindowsAgentArena, follow the [WindowsAgentArena Deployment instructions](WindowsAgentArena.md). 225 | 226 | ## 🙌 Contributors 227 | 228 | We’re grateful to all the [amazing people](https://github.com/simular-ai/Agent-S/graphs/contributors) who have contributed to this project. Thank you! 🙏 229 | 230 | ## 💬 Citation 231 | ``` 232 | @misc{agashe2024agentsopenagentic, 233 | title={Agent S: An Open Agentic Framework that Uses Computers Like a Human}, 234 | author={Saaket Agashe and Jiuzhou Han and Shuyu Gan and Jiachen Yang and Ang Li and Xin Eric Wang}, 235 | year={2024}, 236 | eprint={2410.08164}, 237 | archivePrefix={arXiv}, 238 | primaryClass={cs.AI}, 239 | url={https://arxiv.org/abs/2410.08164}, 240 | } 241 | ``` 242 | 243 | -------------------------------------------------------------------------------- /gui_agents/s1/WindowsAgentArena.md: -------------------------------------------------------------------------------- 1 | ## Deploying Agent-S in WindowsAgentArena 2 | > ⚠️ **Warning**: The refactored code has not be fully tested on WindowsAgentArena. To reproduce the results on WindowsAgentArena, please use commit 496a9fa of this repository. 3 | 4 | 1. To use the Agent S with WindowsAgentArena, follows the setup instructions at: https://github.com/microsoft/WindowsAgentArena.git. **Please use the development mode while preparing the image and running the client as instructed in https://github.com/microsoft/WindowsAgentArena/blob/main/docs/Development-Tips.md.** 5 | 6 | 2. To deploy our agent in the WindowsAgentArena, copy the agent_s folder in this repository to `WindowsAgentArena/src/win-arena-container/client/mm_agents`. 7 | 8 | 3. Change the name of the GraphSearchAgent.py file to agent.py to conform to the WindowsAgentArena Setup. 9 | 10 | 4. Copy the ocr_server.py file to client/folder `WindowsAgentArena/src/win-arena-container/client` folder 11 | 12 | ``` 13 | cd WindowsAgentArena/src/win-arena-container/client 14 | cp mm_agents/agent_s/ocr_server.py . 15 | ``` 16 | 17 | 5. Update the `start_client.sh` file in `WindowsAgentArena/src/win-arena-container` by adding the following line before Running the agent on line 75. 18 | 19 | ``` 20 | python ocr_server.py & 21 | ``` 22 | 23 | 6. In the `src/win-arena-container/client/run.py` file import Agent S 24 | ``` 25 | from mm_agents.agent_s.agent import GraphSearchAgent 26 | ``` 27 | 28 | 7. 
In the `src/win-arena-container/client/run.py` file, instantiate Agent S by adding the following lines after line 187 where the if condition for NAVI agent ends 29 | 30 | ```python 31 | elif cfg_args["agent_name"] == "agent_s": 32 | if cfg_args["som_origin"] in ["a11y"]: 33 | som_config = None 34 | elif cfg_args["som_origin"] in ["oss", "mixed-oss"]: 35 | som_config = { 36 | "pipeline": ["webparse", "groundingdino", "ocr"], 37 | "groundingdino": { 38 | "prompts": ["icon", "image"] 39 | }, 40 | "ocr": { 41 | "class_name": "TesseractOCR" 42 | }, 43 | "webparse": { 44 | "cdp_url": f"http://{args.emulator_ip}:9222" 45 | } 46 | } 47 | if args.model.startswith("claude"): 48 | engine_type = "anthropic" 49 | elif args.model.startswith("gpt"): 50 | engine_type = "openai" 51 | else: 52 | engine_type = "vllm" 53 | 54 | engine_params = { 55 | "engine_type": engine_type, 56 | "model": args.model, 57 | } 58 | agent = GraphSearchAgent( 59 | engine_params=engine_params, 60 | experiment_type='windowsAgentArena', 61 | temperature=args.temperature 62 | ) 63 | ``` 64 | 65 | 8. Run Agent S on WindowsAgentArena by changing the following parameters in the `scripts/run-local.sh` file 66 | 67 | ``` 68 | agent="agent_s" 69 | model="gpt-4o" 70 | ``` -------------------------------------------------------------------------------- /gui_agents/s1/aci/ACI.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Dict, List 3 | 4 | logger = logging.getLogger("desktopenv.agent") 5 | 6 | 7 | def agent_action(func): 8 | func.is_agent_action = True 9 | return func 10 | 11 | 12 | class ACI: 13 | def __init__(self, top_app_only: bool = True, ocr: bool = False): 14 | self.top_app_only = top_app_only 15 | self.ocr = ocr 16 | self.index_out_of_range_flag = False 17 | self.notes: List[str] = [] 18 | self.clipboard = "" 19 | self.nodes: List[Any] = [] 20 | 21 | def get_active_apps(self, obs: Dict) -> List[str]: 22 | pass 23 | 24 | def get_top_app(self): 25 | pass 26 | 27 | def preserve_nodes(self, tree: Any, exclude_roles: set = None) -> List[Dict]: 28 | pass 29 | 30 | def linearize_and_annotate_tree( 31 | self, obs: Dict, show_all_elements: bool = False 32 | ) -> str: 33 | pass 34 | 35 | def find_element(self, element_id: int) -> Dict: 36 | pass 37 | -------------------------------------------------------------------------------- /gui_agents/s1/aci/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s1/aci/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/cli_app.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import io 4 | import logging 5 | import os 6 | import platform 7 | import sys 8 | import time 9 | 10 | import pyautogui 11 | 12 | from gui_agents.s1.core.AgentS import GraphSearchAgent, UIAgent 13 | 14 | current_platform = platform.system().lower() 15 | 16 | if current_platform == "darwin": 17 | from gui_agents.s1.aci.MacOSACI import MacOSACI, UIElement 18 | elif current_platform == "linux": 19 | from gui_agents.s1.aci.LinuxOSACI import LinuxACI, UIElement 20 | elif current_platform == "windows": 21 | from gui_agents.s1.aci.WindowsOSACI import WindowsACI, UIElement 22 | else: 23 | raise ValueError(f"Unsupported platform: {current_platform}") 24 | 25 | logger = 
logging.getLogger() 26 | logger.setLevel(logging.DEBUG) 27 | 28 | datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 29 | 30 | log_dir = "logs" 31 | os.makedirs(log_dir, exist_ok=True) 32 | 33 | file_handler = logging.FileHandler( 34 | os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" 35 | ) 36 | debug_handler = logging.FileHandler( 37 | os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" 38 | ) 39 | stdout_handler = logging.StreamHandler(sys.stdout) 40 | sdebug_handler = logging.FileHandler( 41 | os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8" 42 | ) 43 | 44 | file_handler.setLevel(logging.INFO) 45 | debug_handler.setLevel(logging.DEBUG) 46 | stdout_handler.setLevel(logging.INFO) 47 | sdebug_handler.setLevel(logging.DEBUG) 48 | 49 | formatter = logging.Formatter( 50 | fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" 51 | ) 52 | file_handler.setFormatter(formatter) 53 | debug_handler.setFormatter(formatter) 54 | stdout_handler.setFormatter(formatter) 55 | sdebug_handler.setFormatter(formatter) 56 | 57 | stdout_handler.addFilter(logging.Filter("desktopenv")) 58 | sdebug_handler.addFilter(logging.Filter("desktopenv")) 59 | 60 | logger.addHandler(file_handler) 61 | logger.addHandler(debug_handler) 62 | logger.addHandler(stdout_handler) 63 | logger.addHandler(sdebug_handler) 64 | 65 | platform_os = platform.system() 66 | 67 | 68 | def show_permission_dialog(code: str, action_description: str): 69 | """Show a platform-specific permission dialog and return True if approved.""" 70 | if platform.system() == "Darwin": 71 | result = os.system( 72 | f'osascript -e \'display dialog "Do you want to execute this action?\n\n{code} which will try to {action_description}" with title "Action Permission" buttons {{"Cancel", "OK"}} default button "OK" cancel button "Cancel"\'' 73 | ) 74 | return result == 0 75 | elif platform.system() == "Linux": 76 | result = os.system( 77 | f'zenity --question --title="Action Permission" --text="Do you want to execute this action?\n\n{code}" --width=400 --height=200' 78 | ) 79 | return result == 0 80 | return False 81 | 82 | 83 | def run_agent(agent: UIAgent, instruction: str): 84 | obs = {} 85 | traj = "Task:\n" + instruction 86 | subtask_traj = "" 87 | for _ in range(15): 88 | obs["accessibility_tree"] = UIElement.systemWideElement() 89 | 90 | # Get screen shot using pyautogui. 91 | # Take a screenshot 92 | screenshot = pyautogui.screenshot() 93 | 94 | # Save the screenshot to a BytesIO object 95 | buffered = io.BytesIO() 96 | screenshot.save(buffered, format="PNG") 97 | 98 | # Get the byte value of the screenshot 99 | screenshot_bytes = buffered.getvalue() 100 | # Convert to base64 string. 
101 | obs["screenshot"] = screenshot_bytes 102 | 103 | # Get next action code from the agent 104 | info, code = agent.predict(instruction=instruction, observation=obs) 105 | 106 | if "done" in code[0].lower() or "fail" in code[0].lower(): 107 | if platform.system() == "Darwin": 108 | os.system( 109 | f'osascript -e \'display dialog "Task Completed" with title "OpenACI Agent" buttons "OK" default button "OK"\'' 110 | ) 111 | elif platform.system() == "Linux": 112 | os.system( 113 | f'zenity --info --title="OpenACI Agent" --text="Task Completed" --width=200 --height=100' 114 | ) 115 | 116 | agent.update_narrative_memory(traj) 117 | break 118 | 119 | if "next" in code[0].lower(): 120 | continue 121 | 122 | if "wait" in code[0].lower(): 123 | time.sleep(5) 124 | continue 125 | 126 | else: 127 | time.sleep(1.0) 128 | print("EXECUTING CODE:", code[0]) 129 | 130 | # Ask for permission before executing 131 | exec(code[0]) 132 | time.sleep(1.0) 133 | 134 | # Update task and subtask trajectories and optionally the episodic memory 135 | traj += ( 136 | "\n\nReflection:\n" 137 | + str(info["reflection"]) 138 | + "\n\n----------------------\n\nPlan:\n" 139 | + info["executor_plan"] 140 | ) 141 | subtask_traj = agent.update_episodic_memory(info, subtask_traj) 142 | 143 | 144 | def main(): 145 | parser = argparse.ArgumentParser( 146 | description="Run GraphSearchAgent with specified model." 147 | ) 148 | parser.add_argument( 149 | "--model", 150 | type=str, 151 | default="gpt-4o-mini", 152 | help="Specify the model to use (e.g., gpt-4o)", 153 | ) 154 | args = parser.parse_args() 155 | 156 | if current_platform == "Darwin": 157 | grounding_agent = MacOSACI() 158 | elif current_platform == "Windows": 159 | grounding_agent = WindowsACI() 160 | elif current_platform == "Linux": 161 | grounding_agent = LinuxACI() 162 | else: 163 | raise ValueError("Unsupported platform") 164 | 165 | while True: 166 | query = input("Query: ") 167 | if "gpt" in args.model: 168 | engine_type = "openai" 169 | elif "claude" in args.model: 170 | engine_type = "anthropic" 171 | engine_params = { 172 | "engine_type": engine_type, 173 | "model": args.model, 174 | } 175 | 176 | agent = GraphSearchAgent( 177 | engine_params, 178 | grounding_agent, 179 | platform=current_platform, 180 | action_space="pyautogui", 181 | observation_type="mixed", 182 | ) 183 | 184 | agent.reset() 185 | 186 | # Run the agent on your own device 187 | run_agent(agent, query) 188 | 189 | response = input("Would you like to provide another query? 
(y/n): ") 190 | if response.lower() != "y": 191 | break 192 | 193 | 194 | if __name__ == "__main__": 195 | main() 196 | -------------------------------------------------------------------------------- /gui_agents/s1/core/BaseModule.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from gui_agents.s1.mllm.MultimodalAgent import LMMAgent 4 | 5 | 6 | class BaseModule: 7 | def __init__(self, engine_params: Dict, platform: str): 8 | self.engine_params = engine_params 9 | self.platform = platform 10 | 11 | def _create_agent( 12 | self, system_prompt: str = None, engine_params: Optional[Dict] = None 13 | ) -> LMMAgent: 14 | """Create a new LMMAgent instance""" 15 | agent = LMMAgent(engine_params or self.engine_params) 16 | if system_prompt: 17 | agent.add_system_prompt(system_prompt) 18 | return agent 19 | -------------------------------------------------------------------------------- /gui_agents/s1/core/Knowledge.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Dict, Tuple 4 | 5 | import numpy as np 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | from gui_agents.s1.core.BaseModule import BaseModule 9 | from gui_agents.s1.core.ProceduralMemory import PROCEDURAL_MEMORY 10 | from gui_agents.s1.mllm.MultimodalEngine import OpenAIEmbeddingEngine 11 | from gui_agents.s1.utils.common_utils import ( 12 | load_embeddings, 13 | load_knowledge_base, 14 | save_embeddings, 15 | ) 16 | from gui_agents.s1.utils.query_perplexica import query_to_perplexica 17 | 18 | 19 | class KnowledgeBase(BaseModule): 20 | def __init__( 21 | self, 22 | local_kb_path: str, 23 | platform: str, 24 | engine_params: Dict, 25 | use_image_for_search: bool = False, 26 | ): 27 | super().__init__(engine_params, platform) 28 | 29 | self.local_kb_path = local_kb_path 30 | 31 | # initialize embedding engine 32 | # TODO: Support other embedding engines 33 | self.embedding_engine = OpenAIEmbeddingEngine( 34 | api_key=( 35 | engine_params["api_key"] 36 | if "api_key" in engine_params 37 | else os.getenv("OPENAI_API_KEY") 38 | ) 39 | ) 40 | 41 | # Initialize paths for different memory types 42 | self.episodic_memory_path = os.path.join( 43 | self.local_kb_path, self.platform, "episodic_memory.json" 44 | ) 45 | self.narrative_memory_path = os.path.join( 46 | self.local_kb_path, self.platform, "narrative_memory.json" 47 | ) 48 | self.embeddings_path = os.path.join( 49 | self.local_kb_path, self.platform, "embeddings.pkl" 50 | ) 51 | 52 | self.rag_module_system_prompt = PROCEDURAL_MEMORY.RAG_AGENT.replace( 53 | "CURRENT_OS", self.platform 54 | ) 55 | 56 | # All three agent share a generic RAG prompt that ask agent to provide information for UI automation in CURRENT_OS 57 | self.query_formulator = self._create_agent(self.rag_module_system_prompt) 58 | self.llm_search_agent = self._create_agent(self.rag_module_system_prompt) 59 | self.knowledge_fusion_agent = self._create_agent(self.rag_module_system_prompt) 60 | 61 | self.use_image_for_search = use_image_for_search 62 | 63 | def retrieve_knowledge( 64 | self, instruction: str, search_query: str, search_engine: str = "llm" 65 | ) -> Tuple[str, str]: 66 | """Retrieve knowledge using search engine 67 | Args: 68 | instruction (str): task instruction 69 | observation (Dict): current observation 70 | search_engine (str): search engine to use""" 71 | 72 | # Use search engine to retrieve knowledge based on the 
formulated query 73 | search_results = self._search(instruction, search_query, search_engine) 74 | 75 | return search_query, search_results 76 | 77 | def formulate_query(self, instruction: str, observation: Dict) -> str: 78 | """Formulate search query based on instruction and current state""" 79 | query_path = os.path.join( 80 | self.local_kb_path, self.platform, "formulate_query.json" 81 | ) 82 | try: 83 | with open(query_path, "r") as f: 84 | formulate_query = json.load(f) 85 | except: 86 | formulate_query = {} 87 | 88 | if instruction in formulate_query: 89 | return formulate_query[instruction] 90 | 91 | self.query_formulator.add_message( 92 | f"The task is: {instruction}\n" 93 | f"Accessibility tree of the current desktop UI state: {observation['linearized_accessibility_tree']}\n" 94 | "To use google search to get some useful information, first carefully analyze " 95 | "the accessibility tree of the current desktop UI state, then given the task " 96 | "instruction, formulate a question that can be used to search on the Internet " 97 | "for information in helping with the task execution.\n" 98 | "The question should not be too general or too specific. Please ONLY provide " 99 | "the question.\nQuestion:", 100 | image_content=( 101 | observation["screenshot"] 102 | if self.use_image_for_search and "screenshot" in observation 103 | else None 104 | ), 105 | ) 106 | 107 | search_query = self.query_formulator.get_response().strip().replace('"', "") 108 | print("search query: ", search_query) 109 | formulate_query[instruction] = search_query 110 | with open(query_path, "w") as f: 111 | json.dump(formulate_query, f, indent=2) 112 | 113 | return search_query 114 | 115 | def _search(self, instruction: str, search_query: str, search_engine: str) -> str: 116 | """Execute search using specified engine""" 117 | 118 | # Default to perplexica rag knowledge to see if the query exists 119 | file = os.path.join( 120 | self.local_kb_path, self.platform, f"{search_engine}_rag_knowledge.json" 121 | ) 122 | 123 | try: 124 | with open(file, "r") as f: 125 | exist_search_results = json.load(f) 126 | except: 127 | exist_search_results = {} 128 | 129 | if instruction in exist_search_results: 130 | return exist_search_results[instruction] 131 | if search_engine.lower() == "llm": 132 | # Use LLM's internal knowledge like a search engine 133 | self.llm_search_agent.add_message(search_query) 134 | search_results = self.llm_search_agent.get_response() 135 | elif search_engine.lower() == "perplexica": 136 | # Use perplexica to search for the query 137 | search_results = query_to_perplexica(search_query) 138 | else: 139 | raise ValueError(f"Unsupported search engine: {search_engine}") 140 | 141 | exist_search_results[instruction] = search_results.strip() 142 | with open( 143 | os.path.join( 144 | self.local_kb_path, 145 | self.platform, 146 | f"{search_engine}_rag_knowledge.json", 147 | ), 148 | "w", 149 | ) as f: 150 | json.dump(exist_search_results, f, indent=2) 151 | 152 | return search_results 153 | 154 | def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str]: 155 | """Retrieve narrative experience using embeddings""" 156 | knowledge_base = load_knowledge_base(self.narrative_memory_path) 157 | if not knowledge_base: 158 | return "None", "None" 159 | 160 | embeddings = load_embeddings(self.embeddings_path) 161 | 162 | # Get or create instruction embedding 163 | instruction_embedding = embeddings.get(instruction) 164 | 165 | if instruction_embedding is None: 166 | instruction_embedding = 
self.embedding_engine.get_embeddings(instruction) 167 | embeddings[instruction] = instruction_embedding 168 | 169 | # Get or create embeddings for knowledge base entries 170 | candidate_embeddings = [] 171 | for key in knowledge_base: 172 | candidate_embedding = embeddings.get(key) 173 | if candidate_embedding is None: 174 | candidate_embedding = self.embedding_engine.get_embeddings(key) 175 | embeddings[key] = candidate_embedding 176 | 177 | candidate_embeddings.append(candidate_embedding) 178 | 179 | save_embeddings(self.embeddings_path, embeddings) 180 | 181 | similarities = cosine_similarity( 182 | instruction_embedding, np.vstack(candidate_embeddings) 183 | )[0] 184 | sorted_indices = np.argsort(similarities)[::-1] 185 | 186 | keys = list(knowledge_base.keys()) 187 | idx = 1 if keys[sorted_indices[0]] == instruction else 0 188 | return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]] 189 | 190 | def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str]: 191 | """Retrieve similar task experience using embeddings""" 192 | knowledge_base = load_knowledge_base(self.episodic_memory_path) 193 | if not knowledge_base: 194 | return "None", "None" 195 | 196 | embeddings = load_embeddings(self.embeddings_path) 197 | 198 | # Get or create instruction embedding 199 | instruction_embedding = embeddings.get(instruction) 200 | 201 | if instruction_embedding is None: 202 | instruction_embedding = self.embedding_engine.get_embeddings(instruction) 203 | embeddings[instruction] = instruction_embedding 204 | 205 | # Get or create embeddings for knowledge base entries 206 | candidate_embeddings = [] 207 | for key in knowledge_base: 208 | candidate_embedding = embeddings.get(key) 209 | if candidate_embedding is None: 210 | candidate_embedding = self.embedding_engine.get_embeddings(key) 211 | embeddings[key] = candidate_embedding 212 | 213 | candidate_embeddings.append(candidate_embedding) 214 | 215 | save_embeddings(self.embeddings_path, embeddings) 216 | 217 | similarities = cosine_similarity( 218 | instruction_embedding, np.vstack(candidate_embeddings) 219 | )[0] 220 | sorted_indices = np.argsort(similarities)[::-1] 221 | 222 | keys = list(knowledge_base.keys()) 223 | idx = 1 if keys[sorted_indices[0]] == instruction else 0 224 | return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]] 225 | 226 | def knowledge_fusion( 227 | self, 228 | observation: Dict, 229 | instruction: str, 230 | web_knowledge: str, 231 | similar_task: str, 232 | experience: str, 233 | ) -> str: 234 | """Combine web knowledge with similar task experience""" 235 | self.knowledge_fusion_agent.add_message( 236 | f"Task: {instruction}\n" 237 | f"Accessibility tree of the current desktop UI state: {observation['linearized_accessibility_tree']}\n" 238 | f"**Web search result**:\n{web_knowledge}\n\n" 239 | f"**Retrieved similar task experience**:\n" 240 | f"Similar task:{similar_task}\n{experience}\n\n" 241 | f"Based on the web search result and the retrieved similar task experience, " 242 | f"if you think the similar task experience is indeed useful to the main task, " 243 | f"integrate it with the web search result. 
Provide the final knowledge in a numbered list.", 244 | image_content=( 245 | observation["screenshot"] 246 | if self.use_image_for_search and "screenshot" in observation 247 | else None 248 | ), 249 | ) 250 | return self.knowledge_fusion_agent.get_response() 251 | -------------------------------------------------------------------------------- /gui_agents/s1/core/Manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import defaultdict 3 | from typing import Dict, List, Optional, Tuple 4 | import platform 5 | 6 | from gui_agents.s1.aci.ACI import ACI 7 | from gui_agents.s1.core.BaseModule import BaseModule 8 | from gui_agents.s1.core.Knowledge import KnowledgeBase 9 | from gui_agents.s1.core.ProceduralMemory import PROCEDURAL_MEMORY 10 | from gui_agents.s1.utils.common_utils import ( 11 | Dag, 12 | Node, 13 | calculate_tokens, 14 | call_llm_safe, 15 | parse_dag, 16 | ) 17 | 18 | logger = logging.getLogger("desktopenv.agent") 19 | 20 | NUM_IMAGE_TOKEN = 1105 # Value set of screen of size 1920x1080 for openai vision 21 | 22 | 23 | class Manager(BaseModule): 24 | def __init__( 25 | self, 26 | engine_params: Dict, 27 | grounding_agent: ACI, 28 | local_kb_path: str, 29 | search_engine: Optional[str] = None, 30 | multi_round: bool = False, 31 | platform: str = platform.system().lower(), 32 | ): 33 | # TODO: move the prompt to Procedural Memory 34 | super().__init__(engine_params, platform) 35 | 36 | # Initialize the ACI 37 | self.grounding_agent = grounding_agent 38 | 39 | # Initialize the submodules of the Manager 40 | self.generator_agent = self._create_agent(PROCEDURAL_MEMORY.MANAGER_PROMPT) 41 | self.dag_translator_agent = self._create_agent( 42 | PROCEDURAL_MEMORY.DAG_TRANSLATOR_PROMPT 43 | ) 44 | self.narrative_summarization_agent = self._create_agent( 45 | PROCEDURAL_MEMORY.TASK_SUMMARIZATION_PROMPT 46 | ) 47 | self.episode_summarization_agent = self._create_agent( 48 | PROCEDURAL_MEMORY.SUBTASK_SUMMARIZATION_PROMPT 49 | ) 50 | 51 | self.local_kb_path = local_kb_path 52 | 53 | self.knowledge_base = KnowledgeBase(self.local_kb_path, platform, engine_params) 54 | 55 | self.planner_history = [] 56 | 57 | self.turn_count = 0 58 | self.search_engine = search_engine 59 | self.multi_round = multi_round 60 | self.platform = platform 61 | 62 | def summarize_episode(self, trajectory): 63 | """Summarize the episode experience for lifelong learning reflection 64 | Args: 65 | trajectory: str: The episode experience to be summarized 66 | """ 67 | 68 | # Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars 69 | self.episode_summarization_agent.add_message(trajectory) 70 | subtask_summarization = call_llm_safe(self.episode_summarization_agent) 71 | self.episode_summarization_agent.add_message(subtask_summarization) 72 | 73 | return subtask_summarization 74 | 75 | def summarize_narrative(self, trajectory): 76 | """Summarize the narrative experience for lifelong learning reflection 77 | Args: 78 | trajectory: str: The narrative experience to be summarized 79 | """ 80 | # Create Reflection on whole trajectories for next round trial 81 | self.narrative_summarization_agent.add_message(trajectory) 82 | lifelong_learning_reflection = call_llm_safe(self.narrative_summarization_agent) 83 | 84 | return lifelong_learning_reflection 85 | 86 | def _generate_step_by_step_plan( 87 | self, observation: Dict, instruction: str, failure_feedback: str = "" 88 | ) -> Tuple[Dict, str]: 89 | agent = 
self.grounding_agent 90 | 91 | self.active_apps = agent.get_active_apps(observation) 92 | 93 | tree_input = agent.linearize_and_annotate_tree(observation) 94 | observation["linearized_accessibility_tree"] = tree_input 95 | 96 | # Perform Retrieval only at the first planning step 97 | if self.turn_count == 0: 98 | 99 | self.search_query = self.knowledge_base.formulate_query( 100 | instruction, observation 101 | ) 102 | 103 | retrieved_experience = "" 104 | integrated_knowledge = "" 105 | # Retrieve most similar narrative (task) experience 106 | most_similar_task, retrieved_experience = ( 107 | self.knowledge_base.retrieve_narrative_experience(instruction) 108 | ) 109 | logger.info( 110 | "SIMILAR TASK EXPERIENCE: %s", 111 | most_similar_task + "\n" + retrieved_experience.strip(), 112 | ) 113 | 114 | # Retrieve knowledge from the web if search_engine is provided 115 | if self.search_engine is not None: 116 | retrieved_knowledge = self.knowledge_base.retrieve_knowledge( 117 | instruction=instruction, 118 | search_query=self.search_query, 119 | search_engine=self.search_engine, 120 | ) 121 | logger.info("RETRIEVED KNOWLEDGE: %s", retrieved_knowledge) 122 | 123 | if retrieved_knowledge is not None: 124 | # Fuse the retrieved knowledge and experience 125 | integrated_knowledge = self.knowledge_base.knowledge_fusion( 126 | observation=observation, 127 | instruction=instruction, 128 | web_knowledge=retrieved_knowledge, 129 | similar_task=most_similar_task, 130 | experience=retrieved_experience, 131 | ) 132 | logger.info("INTEGRATED KNOWLEDGE: %s", integrated_knowledge) 133 | 134 | integrated_knowledge = integrated_knowledge or retrieved_experience 135 | 136 | # Add the integrated knowledge to the task instruction in the system prompt 137 | if integrated_knowledge: 138 | instruction += f"\nYou may refer to some retrieved knowledge if you think they are useful.{integrated_knowledge}" 139 | 140 | self.generator_agent.add_system_prompt( 141 | self.generator_agent.system_prompt.replace( 142 | "TASK_DESCRIPTION", instruction 143 | ) 144 | ) 145 | 146 | generator_message = ( 147 | f"Accessibility Tree: {tree_input}\n" 148 | f"The clipboard contains: {agent.clipboard}." 
149 | f"The current open applications are {agent.get_active_apps(observation)}" 150 | + ( 151 | f" Previous plan failed at step: {failure_feedback}" 152 | if failure_feedback 153 | else "" 154 | ) 155 | ) 156 | 157 | self.generator_agent.add_message( 158 | generator_message, image_content=observation.get("screenshot", None) 159 | ) 160 | 161 | logger.info("GENERATING HIGH LEVEL PLAN") 162 | 163 | plan = call_llm_safe(self.generator_agent) 164 | 165 | if plan == "": 166 | raise Exception("Plan Generation Failed - Fix the Prompt") 167 | 168 | logger.info("HIGH LEVEL STEP BY STEP PLAN: %s", plan) 169 | 170 | self.generator_agent.add_message(plan) 171 | 172 | self.planner_history.append(plan) 173 | 174 | self.turn_count += 1 175 | 176 | input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages) 177 | 178 | # Set Cost based on GPT-4o 179 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 180 | 181 | planner_info = { 182 | "search_query": self.search_query, 183 | "goal_plan": plan, 184 | "num_input_tokens_plan": input_tokens, 185 | "num_output_tokens_plan": output_tokens, 186 | "goal_plan_cost": cost, 187 | } 188 | 189 | assert type(plan) == str 190 | 191 | return planner_info, plan 192 | 193 | def _generate_dag(self, instruction: str, plan: str) -> Tuple[Dict, Dag]: 194 | # Add initial instruction and plan to the agent's message history 195 | self.dag_translator_agent.add_message( 196 | f"Instruction: {instruction}\nPlan: {plan}" 197 | ) 198 | 199 | logger.info("GENERATING DAG") 200 | 201 | # Generate DAG 202 | dag_raw = call_llm_safe(self.dag_translator_agent) 203 | 204 | dag = parse_dag(dag_raw) 205 | 206 | logger.info("Generated DAG: %s", dag_raw) 207 | 208 | self.dag_translator_agent.add_message(dag_raw) 209 | 210 | input_tokens, output_tokens = calculate_tokens( 211 | self.dag_translator_agent.messages 212 | ) 213 | 214 | # Set Cost based on GPT-4o 215 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 216 | 217 | dag_info = { 218 | "dag": dag_raw, 219 | "num_input_tokens_dag": input_tokens, 220 | "num_output_tokens_dag": output_tokens, 221 | "dag_cost": cost, 222 | } 223 | 224 | assert type(dag) == Dag 225 | 226 | return dag_info, dag 227 | 228 | def _topological_sort(self, dag: Dag) -> List[Node]: 229 | """Topological sort of the DAG using DFS 230 | dag: Dag: Object representation of the DAG with nodes and edges 231 | """ 232 | 233 | def dfs(node_name, visited, stack): 234 | visited[node_name] = True 235 | for neighbor in adj_list[node_name]: 236 | if not visited[neighbor]: 237 | dfs(neighbor, visited, stack) 238 | stack.append(node_name) 239 | 240 | # Convert edges to adjacency list 241 | adj_list = defaultdict(list) 242 | for u, v in dag.edges: 243 | adj_list[u.name].append(v.name) 244 | 245 | visited = {node.name: False for node in dag.nodes} 246 | stack = [] 247 | 248 | for node in dag.nodes: 249 | if not visited[node.name]: 250 | dfs(node.name, visited, stack) 251 | 252 | # Return the nodes in topologically sorted order 253 | sorted_nodes = [ 254 | next(n for n in dag.nodes if n.name == name) for name in stack[::-1] 255 | ] 256 | return sorted_nodes 257 | 258 | def get_action_queue( 259 | self, 260 | instruction: str, 261 | observation: Dict, 262 | failure_feedback: str = None, 263 | ): 264 | """Generate the action list based on the instruction 265 | instruction:str: Instruction for the task 266 | """ 267 | # Generate the high level plan 268 | planner_info, plan = self._generate_step_by_step_plan( 269 | observation, 
instruction, failure_feedback 270 | ) 271 | 272 | # Generate the DAG 273 | dag_info, dag = self._generate_dag(instruction, plan) 274 | 275 | # Topological sort of the DAG 276 | action_queue = self._topological_sort(dag) 277 | 278 | planner_info.update(dag_info) 279 | 280 | return planner_info, action_queue 281 | -------------------------------------------------------------------------------- /gui_agents/s1/core/Worker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | from typing import Dict, List, Tuple 5 | import platform 6 | 7 | from gui_agents.s1.aci.ACI import ACI 8 | from gui_agents.s1.core.BaseModule import BaseModule 9 | from gui_agents.s1.core.Knowledge import KnowledgeBase 10 | from gui_agents.s1.core.ProceduralMemory import PROCEDURAL_MEMORY 11 | from gui_agents.s1.utils import common_utils 12 | from gui_agents.s1.utils.common_utils import Node, calculate_tokens, call_llm_safe 13 | 14 | logger = logging.getLogger("desktopenv.agent") 15 | 16 | 17 | class Worker(BaseModule): 18 | def __init__( 19 | self, 20 | engine_params: Dict, 21 | grounding_agent: ACI, 22 | local_kb_path: str, 23 | platform: str = platform.system().lower(), 24 | search_engine: str = "perplexica", 25 | enable_reflection: bool = True, 26 | use_subtask_experience: bool = True, 27 | ): 28 | """ 29 | Worker receives a subtask list and active subtask and generates the next action for the to execute. 30 | Args: 31 | engine_params: Dict 32 | Parameters for the multimodal engine 33 | grounding_agent: Agent 34 | The grounding agent to use 35 | local_kb_path: str 36 | Path to knowledge base 37 | search_engine: str 38 | The search engine to use 39 | enable_reflection: bool 40 | Whether to enable reflection 41 | use_subtask_experience: bool 42 | Whether to use subtask experience 43 | """ 44 | super().__init__(engine_params, platform) 45 | 46 | self.grounding_agent = grounding_agent 47 | self.local_kb_path = local_kb_path 48 | self.enable_reflection = enable_reflection 49 | self.search_engine = search_engine 50 | self.use_subtask_experience = use_subtask_experience 51 | self.reset() 52 | 53 | def flush_messages(self, n): 54 | # After every max_trajectory_length trajectories, remove messages from the start except the system prompt 55 | for agent in [self.generator_agent]: 56 | if len(agent.messages) > 2 * n + 1: 57 | # Remove the user message and assistant message, both are 1 because the elements will move back after 1 pop 58 | agent.remove_message_at(1) 59 | agent.remove_message_at(1) 60 | 61 | def reset(self): 62 | self.generator_agent = self._create_agent( 63 | PROCEDURAL_MEMORY.construct_worker_procedural_memory( 64 | type(self.grounding_agent) 65 | ).replace("CURRENT_OS", self.platform) 66 | ) 67 | self.reflection_agent = self._create_agent( 68 | PROCEDURAL_MEMORY.REFLECTION_ON_TRAJECTORY 69 | ) 70 | 71 | self.knowledge_base = KnowledgeBase( 72 | local_kb_path=self.local_kb_path, 73 | platform=self.platform, 74 | engine_params=self.engine_params, 75 | ) 76 | 77 | self.turn_count = 0 78 | self.planner_history = [] 79 | self.reflections = [] 80 | self.cost_this_turn = 0 81 | self.tree_inputs = [] 82 | self.screenshot_inputs = [] 83 | 84 | # TODO: Experimental 85 | def remove_ids_from_history(self): 86 | for message in self.generator_agent.messages: 87 | if message["role"] == "user": 88 | for content in message["content"]: 89 | if content["type"] == "text": 90 | # Regex pattern to match lines that start with a number followed by spaces and 
remove the number 91 | pattern = r"^\d+\s+" 92 | 93 | # Apply the regex substitution on each line 94 | processed_lines = [ 95 | re.sub(pattern, "", line) 96 | for line in content["text"].splitlines() 97 | ] 98 | 99 | # Join the processed lines back into a single string 100 | result = "\n".join(processed_lines) 101 | 102 | result = result.replace("id\t", "") 103 | 104 | # replace message content 105 | content["text"] = result 106 | 107 | def generate_next_action( 108 | self, 109 | instruction: str, 110 | search_query: str, 111 | subtask: str, 112 | subtask_info: str, 113 | future_tasks: List[Node], 114 | done_task: List[Node], 115 | obs: Dict, 116 | ) -> Tuple[Dict, List]: 117 | """ 118 | Predict the next action(s) based on the current observation. 119 | """ 120 | # Provide the top_app to the Grounding Agent to remove all other applications from the tree. At t=0, top_app is None 121 | agent = self.grounding_agent 122 | 123 | self.active_apps = agent.get_active_apps(obs) 124 | 125 | # Get RAG knowledge, only update system message at t=0 126 | if self.turn_count == 0: 127 | # TODO: uncomment and fix for subtask level RAG 128 | if self.use_subtask_experience: 129 | subtask_query_key = ( 130 | "Task:\n" 131 | + search_query 132 | + "\n\nSubtask: " 133 | + subtask 134 | + "\nSubtask Instruction: " 135 | + subtask_info 136 | ) 137 | retrieved_similar_subtask, retrieved_subtask_experience = ( 138 | self.knowledge_base.retrieve_episodic_experience(subtask_query_key) 139 | ) 140 | logger.info( 141 | "SIMILAR SUBTASK EXPERIENCE: %s", 142 | retrieved_similar_subtask 143 | + "\n" 144 | + retrieved_subtask_experience.strip(), 145 | ) 146 | instruction += "\nYou may refer to some similar subtask experience if you think they are useful. {}".format( 147 | retrieved_similar_subtask + "\n" + retrieved_subtask_experience 148 | ) 149 | 150 | self.generator_agent.add_system_prompt( 151 | self.generator_agent.system_prompt.replace( 152 | "SUBTASK_DESCRIPTION", subtask 153 | ) 154 | .replace("TASK_DESCRIPTION", instruction) 155 | .replace("FUTURE_TASKS", ", ".join([f.name for f in future_tasks])) 156 | .replace("DONE_TASKS", ",".join(d.name for d in done_task)) 157 | ) 158 | 159 | # Clear older messages - we keep full context. if you want to keep only the last n messages, you can use the flush_messages function 160 | # self.flush_messages(3) # flushes generator messages 161 | 162 | # Reflection generation 163 | reflection = None 164 | if self.enable_reflection and self.turn_count > 0: 165 | # TODO: reuse planner history 166 | self.reflection_agent.add_message( 167 | "Task Description: " 168 | + subtask 169 | + " Instruction: " 170 | + subtask_info 171 | + "\n" 172 | + "Current Trajectory: " 173 | + "\n\n".join(self.planner_history) 174 | + "\n" 175 | ) 176 | reflection = call_llm_safe(self.reflection_agent) 177 | self.reflections.append(reflection) 178 | self.reflection_agent.add_message(reflection) 179 | 180 | logger.info("REFLECTION: %s", reflection) 181 | 182 | # Plan Generation 183 | tree_input = agent.linearize_and_annotate_tree(obs) 184 | 185 | self.remove_ids_from_history() 186 | 187 | # Bash terminal message. 188 | generator_message = ( 189 | ( 190 | f"\nYou may use the reflection on the previous trajectory: {reflection}\n" 191 | if reflection 192 | else "" 193 | ) 194 | + f"Accessibility Tree: {tree_input}\n" 195 | f"Text Buffer = [{','.join(agent.notes)}]. 
" 196 | f"The current open applications are {agent.get_active_apps(obs)} and the active app is {agent.get_top_app(obs)}.\n" 197 | ) 198 | 199 | print("ACTIVE APP IS: ", agent.get_top_app(obs)) 200 | # Only provide subinfo in the very first message to avoid over influence and redundancy 201 | if self.turn_count == 0: 202 | generator_message += f"Remeber only complete the subtask: {subtask}\n" 203 | generator_message += f"You can use this extra information for completing the current subtask: {subtask_info}.\n" 204 | 205 | logger.info("GENERATOR MESSAGE: %s", generator_message) 206 | 207 | self.generator_agent.add_message( 208 | generator_message, image_content=obs["screenshot"] 209 | ) 210 | 211 | plan = call_llm_safe(self.generator_agent) 212 | self.planner_history.append(plan) 213 | logger.info("PLAN: %s", plan) 214 | 215 | self.generator_agent.add_message(plan) 216 | 217 | # Calculate input and output tokens 218 | input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages) 219 | 220 | # Set Cost based on GPT-4o 221 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 222 | self.cost_this_turn += cost 223 | logger.info("EXECTUOR COST: %s", self.cost_this_turn) 224 | 225 | # Extract code block from the plan 226 | plan_code = common_utils.parse_single_code_from_string( 227 | plan.split("Grounded Action")[-1] 228 | ) 229 | plan_code = common_utils.sanitize_code(plan_code) 230 | plan_code = common_utils.extract_first_agent_function(plan_code) 231 | exec_code = eval(plan_code) 232 | 233 | # If agent selects an element that was out of range, it should not be executed just send a WAIT command. 234 | # TODO: should provide this as code feedback to the agent? 235 | if agent.index_out_of_range_flag: 236 | plan_code = "agent.wait(1.0)" 237 | exec_code = eval(plan_code) 238 | agent.index_out_of_range_flag = False 239 | 240 | executor_info = { 241 | "current_subtask": subtask, 242 | "current_subtask_info": subtask_info, 243 | "executor_plan": plan, 244 | "linearized_accessibility_tree": tree_input, 245 | "plan_code": plan_code, 246 | "reflection": reflection, 247 | "num_input_tokens_executor": input_tokens, 248 | "num_output_tokens_executor": output_tokens, 249 | "executor_cost": cost, 250 | } 251 | self.turn_count += 1 252 | 253 | self.tree_inputs.append(tree_input) 254 | self.screenshot_inputs.append(obs["screenshot"]) 255 | 256 | return executor_info, [exec_code] 257 | -------------------------------------------------------------------------------- /gui_agents/s1/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s1/core/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/mllm/MultimodalAgent.py: -------------------------------------------------------------------------------- 1 | # Author: Saaket Agashe 2 | # Date: 2021-09-15 3 | # License: MIT 4 | 5 | import base64 6 | import re 7 | 8 | from gui_agents.s1.mllm.MultimodalEngine import ( 9 | LMMEngineAnthropic, 10 | LMMEngineAzureOpenAI, 11 | LMMEngineOpenAI, 12 | LMMEnginevLLM, 13 | ) 14 | 15 | data_type_map = { 16 | "openai": {"image_url": "image_url"}, 17 | "anthropic": {"image_url": "image"}, 18 | } 19 | 20 | 21 | class LMMAgent: 22 | def __init__(self, engine_params=None, system_prompt=None, engine=None): 23 | if engine is None: 24 | if engine_params is not None: 25 | engine_type = 
engine_params.get("engine_type") 26 | if engine_type == "openai": 27 | self.engine = LMMEngineOpenAI(**engine_params) 28 | elif engine_type == "anthropic": 29 | self.engine = LMMEngineAnthropic(**engine_params) 30 | elif engine_type == "azure": 31 | self.engine = LMMEngineAzureOpenAI(**engine_params) 32 | elif engine_type == "vllm": 33 | self.engine = LMMEnginevLLM(**engine_params) 34 | else: 35 | raise ValueError("engine_type must be either 'openai' or 'azure'") 36 | else: 37 | raise ValueError("engine_params must be provided") 38 | else: 39 | self.engine = engine 40 | 41 | self.messages = [] # Empty messages 42 | 43 | if system_prompt: 44 | self.add_system_prompt(system_prompt) 45 | else: 46 | self.add_system_prompt("You are a helpful assistant.") 47 | 48 | def encode_image(self, image_content): 49 | # if image_content is a path to an image file, check type of the image_content to verify 50 | if isinstance(image_content, str): 51 | with open(image_content, "rb") as image_file: 52 | return base64.b64encode(image_file.read()).decode("utf-8") 53 | else: 54 | return base64.b64encode(image_content).decode("utf-8") 55 | 56 | def reset( 57 | self, 58 | ): 59 | 60 | self.messages = [ 61 | { 62 | "role": "system", 63 | "content": [{"type": "text", "text": self.system_prompt}], 64 | } 65 | ] 66 | 67 | def add_system_prompt(self, system_prompt): 68 | self.system_prompt = system_prompt 69 | if len(self.messages) > 0: 70 | self.messages[0] = { 71 | "role": "system", 72 | "content": [{"type": "text", "text": self.system_prompt}], 73 | } 74 | else: 75 | self.messages.append( 76 | { 77 | "role": "system", 78 | "content": [{"type": "text", "text": self.system_prompt}], 79 | } 80 | ) 81 | 82 | def remove_message_at(self, index): 83 | """Remove a message at a given index""" 84 | if index < len(self.messages): 85 | self.messages.pop(index) 86 | 87 | def replace_message_at( 88 | self, index, text_content, image_content=None, image_detail="high" 89 | ): 90 | """Replace a message at a given index""" 91 | if index < len(self.messages): 92 | self.messages[index] = { 93 | "role": self.messages[index]["role"], 94 | "content": [{"type": "text", "text": text_content}], 95 | } 96 | if image_content: 97 | base64_image = self.encode_image(image_content) 98 | self.messages[index]["content"].append( 99 | { 100 | "type": "image_url", 101 | "image_url": { 102 | "url": f"data:image/png;base64,{base64_image}", 103 | "detail": image_detail, 104 | }, 105 | } 106 | ) 107 | 108 | def add_message( 109 | self, text_content, image_content=None, role=None, image_detail="high" 110 | ): 111 | """Add a new message to the list of messages""" 112 | 113 | # API-style inference from OpenAI and AzureOpenAI 114 | if isinstance(self.engine, (LMMEngineOpenAI, LMMEngineAzureOpenAI)): 115 | # infer role from previous message 116 | if role != "user": 117 | if self.messages[-1]["role"] == "system": 118 | role = "user" 119 | elif self.messages[-1]["role"] == "user": 120 | role = "assistant" 121 | elif self.messages[-1]["role"] == "assistant": 122 | role = "user" 123 | 124 | message = { 125 | "role": role, 126 | "content": [{"type": "text", "text": text_content}], 127 | } 128 | 129 | if image_content: 130 | # Check if image_content is a list or a single image 131 | if isinstance(image_content, list): 132 | # If image_content is a list of images, loop through each image 133 | for image in image_content: 134 | base64_image = self.encode_image(image) 135 | message["content"].append( 136 | { 137 | "type": "image_url", 138 | "image_url": { 139 | "url": 
f"data:image/png;base64,{base64_image}", 140 | "detail": image_detail, 141 | }, 142 | } 143 | ) 144 | else: 145 | # If image_content is a single image, handle it directly 146 | base64_image = self.encode_image(image_content) 147 | message["content"].append( 148 | { 149 | "type": "image_url", 150 | "image_url": { 151 | "url": f"data:image/png;base64,{base64_image}", 152 | "detail": image_detail, 153 | }, 154 | } 155 | ) 156 | self.messages.append(message) 157 | 158 | # For API-style inference from Anthropic 159 | elif isinstance(self.engine, LMMEngineAnthropic): 160 | # infer role from previous message 161 | if role != "user": 162 | if self.messages[-1]["role"] == "system": 163 | role = "user" 164 | elif self.messages[-1]["role"] == "user": 165 | role = "assistant" 166 | elif self.messages[-1]["role"] == "assistant": 167 | role = "user" 168 | 169 | message = { 170 | "role": role, 171 | "content": [{"type": "text", "text": text_content}], 172 | } 173 | 174 | if image_content: 175 | # Check if image_content is a list or a single image 176 | if isinstance(image_content, list): 177 | # If image_content is a list of images, loop through each image 178 | for image in image_content: 179 | base64_image = self.encode_image(image) 180 | message["content"].append( 181 | { 182 | "type": "image", 183 | "source": { 184 | "type": "base64", 185 | "media_type": "image/png", 186 | "data": base64_image, 187 | }, 188 | } 189 | ) 190 | else: 191 | # If image_content is a single image, handle it directly 192 | base64_image = self.encode_image(image_content) 193 | message["content"].append( 194 | { 195 | "type": "image", 196 | "source": { 197 | "type": "base64", 198 | "media_type": "image/png", 199 | "data": base64_image, 200 | }, 201 | } 202 | ) 203 | self.messages.append(message) 204 | 205 | # Locally hosted vLLM model inference 206 | elif isinstance(self.engine, LMMEnginevLLM): 207 | # infer role from previous message 208 | if role != "user": 209 | if self.messages[-1]["role"] == "system": 210 | role = "user" 211 | elif self.messages[-1]["role"] == "user": 212 | role = "assistant" 213 | elif self.messages[-1]["role"] == "assistant": 214 | role = "user" 215 | 216 | message = { 217 | "role": role, 218 | "content": [{"type": "text", "text": text_content}], 219 | } 220 | 221 | if image_content: 222 | # Check if image_content is a list or a single image 223 | if isinstance(image_content, list): 224 | # If image_content is a list of images, loop through each image 225 | for image in image_content: 226 | base64_image = self.encode_image(image) 227 | message["content"].append( 228 | { 229 | "type": "image", 230 | "image": f"data:image;base64,{base64_image}", 231 | } 232 | ) 233 | else: 234 | # If image_content is a single image, handle it directly 235 | base64_image = self.encode_image(image_content) 236 | message["content"].append( 237 | {"type": "image", "image": f"data:image;base64,{base64_image}"} 238 | ) 239 | self.messages.append(message) 240 | 241 | def get_response( 242 | self, 243 | user_message=None, 244 | image=None, 245 | messages=None, 246 | temperature=0.0, 247 | max_new_tokens=None, 248 | **kwargs, 249 | ): 250 | """Generate the next response based on previous messages""" 251 | if messages is None: 252 | messages = self.messages 253 | if user_message: 254 | messages.append( 255 | {"role": "user", "content": [{"type": "text", "text": user_message}]} 256 | ) 257 | 258 | return self.engine.generate( 259 | messages, 260 | temperature=temperature, 261 | max_new_tokens=max_new_tokens, 262 | **kwargs, 263 | ) 
264 | -------------------------------------------------------------------------------- /gui_agents/s1/mllm/MultimodalEngine.py: -------------------------------------------------------------------------------- 1 | # Author: Saaket Agashe 2 | # Date: 2021-09-15 3 | # License: MIT 4 | 5 | import os 6 | import re 7 | from io import BytesIO 8 | 9 | import backoff 10 | import numpy as np 11 | import openai 12 | import requests 13 | from anthropic import Anthropic 14 | from openai import APIConnectionError, APIError, AzureOpenAI, OpenAI, RateLimitError 15 | from PIL import Image 16 | 17 | # TODO: Import only if module exists, else ignore 18 | # from llava.model.builder import load_pretrained_model 19 | # from llava.mm_utils import ( 20 | # process_images, 21 | # tokenizer_image_token, 22 | # get_model_name_from_path, 23 | # KeywordsStoppingCriteria, 24 | # ) 25 | # from llava.constants import ( 26 | # IMAGE_TOKEN_INDEX, 27 | # DEFAULT_IMAGE_TOKEN, 28 | # DEFAULT_IM_START_TOKEN, 29 | # DEFAULT_IM_END_TOKEN, 30 | # IMAGE_PLACEHOLDER, 31 | # ) 32 | # from llava.conversation import conv_templates, SeparatorStyle 33 | 34 | 35 | # from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig 36 | 37 | 38 | def image_parser(args): 39 | out = args.image_file.split(args.sep) 40 | return out 41 | 42 | 43 | def load_image(image_file): 44 | if image_file.startswith("http") or image_file.startswith("https"): 45 | response = requests.get(image_file) 46 | image = Image.open(BytesIO(response.content)).convert("RGB") 47 | else: 48 | image = Image.open(image_file).convert("RGB") 49 | return image 50 | 51 | 52 | def load_images(image_files): 53 | out = [] 54 | for image_file in image_files: 55 | image = load_image(image_file) 56 | out.append(image) 57 | return out 58 | 59 | 60 | class LMMEngine: 61 | pass 62 | 63 | 64 | class LMMEngineOpenAI(LMMEngine): 65 | def __init__(self, api_key=None, model=None, rate_limit=-1, **kwargs): 66 | assert model is not None, "model must be provided" 67 | self.model = model 68 | 69 | api_key = api_key or os.getenv("OPENAI_API_KEY") 70 | if api_key is None: 71 | raise ValueError( 72 | "An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY" 73 | ) 74 | 75 | self.api_key = api_key 76 | self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit 77 | 78 | self.llm_client = OpenAI(api_key=self.api_key) 79 | 80 | @backoff.on_exception( 81 | backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60 82 | ) 83 | def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs): 84 | """Generate the next message based on previous messages""" 85 | return ( 86 | self.llm_client.chat.completions.create( 87 | model=self.model, 88 | messages=messages, 89 | max_tokens=max_new_tokens if max_new_tokens else 4096, 90 | temperature=temperature, 91 | **kwargs, 92 | ) 93 | .choices[0] 94 | .message.content 95 | ) 96 | 97 | 98 | class LMMEngineAnthropic(LMMEngine): 99 | def __init__(self, api_key=None, model=None, **kwargs): 100 | assert model is not None, "model must be provided" 101 | self.model = model 102 | 103 | api_key = api_key or os.getenv("ANTHROPIC_API_KEY") 104 | if api_key is None: 105 | raise ValueError( 106 | "An API Key needs to be provided in either the api_key parameter or as an environment variable named ANTHROPIC_API_KEY" 107 | ) 108 | 109 | self.api_key = api_key 110 | 111 | self.llm_client = Anthropic(api_key=self.api_key) 112 | 113 | @backoff.on_exception( 114 | 
backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60 115 | ) 116 | def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs): 117 | """Generate the next message based on previous messages""" 118 | return ( 119 | self.llm_client.messages.create( 120 | system=messages[0]["content"][0]["text"], 121 | model=self.model, 122 | messages=messages[1:], 123 | max_tokens=max_new_tokens if max_new_tokens else 4096, 124 | temperature=temperature, 125 | **kwargs, 126 | ) 127 | .content[0] 128 | .text 129 | ) 130 | 131 | 132 | class OpenAIEmbeddingEngine(LMMEngine): 133 | def __init__( 134 | self, 135 | api_key=None, 136 | rate_limit: int = -1, 137 | display_cost: bool = True, 138 | ): 139 | """Init an OpenAI Embedding engine 140 | 141 | Args: 142 | api_key (_type_, optional): Auth key from OpenAI. Defaults to None. 143 | rate_limit (int, optional): Max number of requests per minute. Defaults to -1. 144 | display_cost (bool, optional): Display cost of API call. Defaults to True. 145 | """ 146 | self.model = "text-embedding-3-small" 147 | self.cost_per_thousand_tokens = 0.00002 148 | 149 | api_key = api_key or os.getenv("OPENAI_API_KEY") 150 | if api_key is None: 151 | raise ValueError( 152 | "An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY" 153 | ) 154 | self.api_key = api_key 155 | self.display_cost = display_cost 156 | self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit 157 | 158 | @backoff.on_exception( 159 | backoff.expo, 160 | ( 161 | APIError, 162 | RateLimitError, 163 | APIConnectionError, 164 | ), 165 | ) 166 | def get_embeddings(self, text: str) -> np.ndarray: 167 | client = OpenAI(api_key=self.api_key) 168 | response = client.embeddings.create(model=self.model, input=text) 169 | if self.display_cost: 170 | total_tokens = response.usage.total_tokens 171 | cost = self.cost_per_thousand_tokens * total_tokens / 1000 172 | # print(f"Total cost for this embedding API call: {cost}") 173 | return np.array([data.embedding for data in response.data]) 174 | 175 | 176 | class LMMEngineAzureOpenAI(LMMEngine): 177 | def __init__( 178 | self, 179 | api_key=None, 180 | azure_endpoint=None, 181 | model=None, 182 | api_version=None, 183 | rate_limit=-1, 184 | **kwargs 185 | ): 186 | assert model is not None, "model must be provided" 187 | self.model = model 188 | 189 | assert api_version is not None, "api_version must be provided" 190 | self.api_version = api_version 191 | 192 | api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY") 193 | if api_key is None: 194 | raise ValueError( 195 | "An API Key needs to be provided in either the api_key parameter or as an environment variable named AZURE_OPENAI_API_KEY" 196 | ) 197 | 198 | self.api_key = api_key 199 | 200 | azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_API_BASE") 201 | if azure_endpoint is None: 202 | raise ValueError( 203 | "An Azure API endpoint needs to be provided in either the azure_endpoint parameter or as an environment variable named AZURE_OPENAI_API_BASE" 204 | ) 205 | 206 | self.azure_endpoint = azure_endpoint 207 | self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit 208 | 209 | self.llm_client = AzureOpenAI( 210 | azure_endpoint=self.azure_endpoint, 211 | api_key=self.api_key, 212 | api_version=self.api_version, 213 | ) 214 | self.cost = 0.0 215 | 216 | # @backoff.on_exception(backoff.expo, (APIConnectionError, APIError, RateLimitError), max_tries=10) 217 | def generate(self, messages, 
temperature=0.0, max_new_tokens=None, **kwargs): 218 | """Generate the next message based on previous messages""" 219 | completion = self.llm_client.chat.completions.create( 220 | model=self.model, 221 | messages=messages, 222 | max_tokens=max_new_tokens if max_new_tokens else 4096, 223 | temperature=temperature, 224 | **kwargs, 225 | ) 226 | total_tokens = completion.usage.total_tokens 227 | self.cost += 0.02 * ((total_tokens + 500) / 1000) 228 | return completion.choices[0].message.content 229 | 230 | 231 | class LMMEnginevLLM(LMMEngine): 232 | def __init__( 233 | self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs 234 | ): 235 | assert model is not None, "model must be provided" 236 | self.model = model 237 | self.api_key = api_key 238 | 239 | self.base_url = base_url or os.getenv("vLLM_ENDPOINT_URL") 240 | if self.base_url is None: 241 | raise ValueError( 242 | "An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named vLLM_ENDPOINT_URL" 243 | ) 244 | 245 | self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit 246 | 247 | self.llm_client = OpenAI(base_url=self.base_url, api_key=self.api_key) 248 | 249 | # @backoff.on_exception(backoff.expo, (APIConnectionError, APIError, RateLimitError), max_tries=10) 250 | # TODO: Default params chosen for the Qwen model 251 | def generate( 252 | self, 253 | messages, 254 | temperature=0.0, 255 | top_p=0.8, 256 | repetition_penalty=1.05, 257 | max_new_tokens=512, 258 | **kwargs 259 | ): 260 | """Generate the next message based on previous messages""" 261 | completion = self.llm_client.chat.completions.create( 262 | model=self.model, 263 | messages=messages, 264 | max_tokens=max_new_tokens if max_new_tokens else 4096, 265 | temperature=temperature, 266 | top_p=top_p, 267 | extra_body={"repetition_penalty": repetition_penalty}, 268 | ) 269 | return completion.choices[0].message.content 270 | -------------------------------------------------------------------------------- /gui_agents/s1/mllm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s1/mllm/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s1/utils/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/utils/ocr_server.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import gc 3 | import io 4 | 5 | import numpy as np 6 | from fastapi import FastAPI 7 | from paddleocr import PaddleOCR 8 | from PIL import Image 9 | from pydantic import BaseModel 10 | 11 | app = FastAPI() 12 | ocr_module = PaddleOCR(use_angle_cls=True, lang="en") 13 | 14 | 15 | class ImageData(BaseModel): 16 | img_bytes: bytes 17 | 18 | 19 | def text_cvt_orc_format_paddle(paddle_result): 20 | texts = [] 21 | print("paddle_result: ", paddle_result) 22 | for i, line in enumerate(paddle_result[0]): 23 | points = np.array(line[0]) 24 | print("points: ", points) 25 | location = { 26 | "left": int(min(points[:, 0])), 27 | "top": int(min(points[:, 1])), 28 | "right": int(max(points[:, 0])), 29 | "bottom": int(max(points[:, 1])), 30 | } 31 | 
print("location: ", location) 32 | content = line[1][0] 33 | texts.append((i, content, location)) 34 | return texts 35 | 36 | 37 | def ocr_results(screenshot): 38 | screenshot_img = Image.open(io.BytesIO(screenshot)) 39 | result = ocr_module.ocr(np.array(screenshot_img), cls=True) 40 | return text_cvt_orc_format_paddle(result) 41 | 42 | 43 | @app.post("/ocr/") 44 | async def read_image(image_data: ImageData): 45 | image_bytes = base64.b64decode(image_data.img_bytes) 46 | results = ocr_results(image_bytes) 47 | 48 | # Explicitly delete unused variables and run garbage collector 49 | del image_bytes 50 | gc.collect() 51 | 52 | return {"results": results} 53 | 54 | 55 | if __name__ == "__main__": 56 | import uvicorn 57 | 58 | uvicorn.run(app, host="127.0.0.1", port=8000) 59 | -------------------------------------------------------------------------------- /gui_agents/s1/utils/query_perplexica.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import toml 3 | import os 4 | 5 | 6 | def query_to_perplexica(query): 7 | # Retrieve the URL from an environment variable 8 | url = os.getenv("PERPLEXICA_URL") 9 | if not url: 10 | raise ValueError( 11 | "PERPLEXICA_URL environment variable not set. It may take the form: 'http://localhost:{port}/api/search'. The port number is set in the config.toml in the Perplexica directory." 12 | ) 13 | 14 | # Request Message 15 | message = {"focusMode": "webSearch", "query": query, "history": [["human", query]]} 16 | 17 | response = requests.post(url, json=message) 18 | 19 | if response.status_code == 200: 20 | return response.json()["message"] 21 | elif response.status_code == 400: 22 | raise ValueError( 23 | "The request is malformed or missing required fields, such as FocusModel or query" 24 | ) 25 | else: 26 | raise ValueError("Internal Server Error") 27 | 28 | 29 | # Test Code 30 | if __name__ == "__main__": 31 | query = "What is Agent S?" 
32 | response = query_to_perplexica(query) 33 | print(response) 34 | -------------------------------------------------------------------------------- /gui_agents/s2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/agents/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/agents/manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from collections import defaultdict 4 | from typing import Dict, List, Optional, Tuple 5 | import platform 6 | 7 | from gui_agents.s2.agents.grounding import ACI 8 | from gui_agents.s2.core.module import BaseModule 9 | from gui_agents.s2.core.knowledge import KnowledgeBase 10 | from gui_agents.s2.memory.procedural_memory import PROCEDURAL_MEMORY 11 | from gui_agents.s2.core.engine import OpenAIEmbeddingEngine 12 | from gui_agents.s2.utils.common_utils import ( 13 | Dag, 14 | Node, 15 | calculate_tokens, 16 | call_llm_safe, 17 | parse_dag, 18 | ) 19 | 20 | logger = logging.getLogger("desktopenv.agent") 21 | 22 | NUM_IMAGE_TOKEN = 1105 # Value set of screen of size 1920x1080 for openai vision 23 | 24 | 25 | class Manager(BaseModule): 26 | def __init__( 27 | self, 28 | engine_params: Dict, 29 | grounding_agent: ACI, 30 | local_kb_path: str, 31 | embedding_engine=OpenAIEmbeddingEngine(), 32 | search_engine: Optional[str] = None, 33 | multi_round: bool = False, 34 | platform: str = platform.system().lower(), 35 | ): 36 | # TODO: move the prompt to Procedural Memory 37 | super().__init__(engine_params, platform) 38 | 39 | # Initialize the ACI 40 | self.grounding_agent = grounding_agent 41 | 42 | # Initialize the planner 43 | sys_prompt = PROCEDURAL_MEMORY.COMBINED_MANAGER_PROMPT 44 | 45 | self.generator_agent = self._create_agent(sys_prompt) 46 | 47 | # Initialize the remaining modules 48 | self.dag_translator_agent = self._create_agent( 49 | PROCEDURAL_MEMORY.DAG_TRANSLATOR_PROMPT 50 | ) 51 | self.narrative_summarization_agent = self._create_agent( 52 | PROCEDURAL_MEMORY.TASK_SUMMARIZATION_PROMPT 53 | ) 54 | self.episode_summarization_agent = self._create_agent( 55 | PROCEDURAL_MEMORY.SUBTASK_SUMMARIZATION_PROMPT 56 | ) 57 | 58 | self.local_kb_path = local_kb_path 59 | 60 | self.embedding_engine = embedding_engine 61 | self.knowledge_base = KnowledgeBase( 62 | embedding_engine=self.embedding_engine, 63 | local_kb_path=self.local_kb_path, 64 | platform=platform, 65 | engine_params=engine_params, 66 | ) 67 | 68 | self.planner_history = [] 69 | 70 | self.turn_count = 0 71 | self.search_engine = search_engine 72 | self.multi_round = multi_round 73 | 74 | def summarize_episode(self, trajectory): 75 | """Summarize the episode experience for lifelong learning reflection 76 | Args: 77 | trajectory: str: The episode experience to be summarized 78 | """ 79 | 80 | # Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars 81 | self.episode_summarization_agent.add_message(trajectory, role="user") 82 | subtask_summarization = 
call_llm_safe(self.episode_summarization_agent) 83 | self.episode_summarization_agent.add_message( 84 | subtask_summarization, role="assistant" 85 | ) 86 | 87 | return subtask_summarization 88 | 89 | def summarize_narrative(self, trajectory): 90 | """Summarize the narrative experience for lifelong learning reflection 91 | Args: 92 | trajectory: str: The narrative experience to be summarized 93 | """ 94 | # Create Reflection on whole trajectories for next round trial 95 | self.narrative_summarization_agent.add_message(trajectory, role="user") 96 | lifelong_learning_reflection = call_llm_safe(self.narrative_summarization_agent) 97 | 98 | return lifelong_learning_reflection 99 | 100 | def _generate_step_by_step_plan( 101 | self, 102 | observation: Dict, 103 | instruction: str, 104 | failed_subtask: Optional[Node] = None, 105 | completed_subtasks_list: List[Node] = [], 106 | remaining_subtasks_list: List[Node] = [], 107 | ) -> Tuple[Dict, str]: 108 | agent = self.grounding_agent 109 | 110 | # Converts a list of DAG Nodes into a natural langauge list 111 | def format_subtask_list(subtasks: List[Node]) -> str: 112 | res = "" 113 | for idx, node in enumerate(subtasks): 114 | res += f"{idx+1}. **{node.name}**:\n" 115 | bullets = re.split(r"(?<=[.!?;]) +", node.info) 116 | for bullet in bullets: 117 | res += f" - {bullet}\n" 118 | res += "\n" 119 | return res 120 | 121 | # Perform Retrieval only at the first planning step 122 | if self.turn_count == 0: 123 | 124 | self.search_query = self.knowledge_base.formulate_query( 125 | instruction, observation 126 | ) 127 | 128 | most_similar_task = "" 129 | retrieved_experience = "" 130 | integrated_knowledge = "" 131 | # Retrieve most similar narrative (task) experience 132 | most_similar_task, retrieved_experience = ( 133 | self.knowledge_base.retrieve_narrative_experience(instruction) 134 | ) 135 | logger.info( 136 | "SIMILAR TASK EXPERIENCE: %s", 137 | most_similar_task + "\n" + retrieved_experience.strip(), 138 | ) 139 | 140 | # Retrieve knowledge from the web if search_engine is provided 141 | if self.search_engine is not None: 142 | retrieved_knowledge = self.knowledge_base.retrieve_knowledge( 143 | instruction=instruction, 144 | search_query=self.search_query, 145 | search_engine=self.search_engine, 146 | ) 147 | logger.info("RETRIEVED KNOWLEDGE: %s", retrieved_knowledge) 148 | 149 | if retrieved_knowledge is not None: 150 | # Fuse the retrieved knowledge and experience 151 | integrated_knowledge = self.knowledge_base.knowledge_fusion( 152 | observation=observation, 153 | instruction=instruction, 154 | web_knowledge=retrieved_knowledge, 155 | similar_task=most_similar_task, 156 | experience=retrieved_experience, 157 | ) 158 | logger.info("INTEGRATED KNOWLEDGE: %s", integrated_knowledge) 159 | 160 | integrated_knowledge = integrated_knowledge or retrieved_experience 161 | 162 | # Add the integrated knowledge to the task instruction in the system prompt 163 | if integrated_knowledge: 164 | instruction += f"\nYou may refer to some retrieved knowledge if you think they are useful.{integrated_knowledge}" 165 | 166 | self.generator_agent.add_system_prompt( 167 | self.generator_agent.system_prompt.replace( 168 | "TASK_DESCRIPTION", instruction 169 | ) 170 | ) 171 | 172 | # Re-plan on failure case 173 | if failed_subtask: 174 | generator_message = ( 175 | f"The subtask {failed_subtask} cannot be completed. 
Please generate a new plan for the remainder of the trajectory.\n\n" 176 | f"Successfully Completed Subtasks:\n{format_subtask_list(completed_subtasks_list)}\n" 177 | ) 178 | # Re-plan on subtask completion case 179 | elif len(completed_subtasks_list) + len(remaining_subtasks_list) > 0: 180 | generator_message = ( 181 | "The current trajectory and desktop state is provided. Please revise the plan for the following trajectory.\n\n" 182 | f"Successfully Completed Subtasks:\n{format_subtask_list(completed_subtasks_list)}\n" 183 | f"Future Remaining Subtasks:\n{format_subtask_list(remaining_subtasks_list)}\n" 184 | ) 185 | # Initial plan case 186 | else: 187 | generator_message = "Please generate the initial plan for the task.\n" 188 | 189 | logger.info("GENERATOR MESSAGE: %s", generator_message) 190 | 191 | self.generator_agent.add_message( 192 | generator_message, 193 | image_content=observation.get("screenshot", None), 194 | role="user", 195 | ) 196 | 197 | logger.info("GENERATING HIGH LEVEL PLAN") 198 | 199 | plan = call_llm_safe(self.generator_agent) 200 | if plan == "": 201 | raise Exception("Plan Generation Failed - Fix the Prompt") 202 | 203 | logger.info("HIGH LEVEL STEP BY STEP PLAN: %s", plan) 204 | 205 | self.generator_agent.add_message(plan, role="assistant") 206 | self.planner_history.append(plan) 207 | self.turn_count += 1 208 | 209 | # Set Cost based on GPT-4o 210 | input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages) 211 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 212 | 213 | planner_info = { 214 | "search_query": self.search_query, 215 | "goal_plan": plan, 216 | "num_input_tokens_plan": input_tokens, 217 | "num_output_tokens_plan": output_tokens, 218 | "goal_plan_cost": cost, 219 | } 220 | 221 | assert type(plan) == str 222 | 223 | return planner_info, plan 224 | 225 | def _generate_dag(self, instruction: str, plan: str) -> Tuple[Dict, Dag]: 226 | # For the re-planning case, remove the prior input since this should only translate the new plan 227 | self.dag_translator_agent.reset() 228 | 229 | # Add initial instruction and plan to the agent's message history 230 | self.dag_translator_agent.add_message( 231 | f"Instruction: {instruction}\nPlan: {plan}", role="user" 232 | ) 233 | 234 | logger.info("GENERATING DAG") 235 | 236 | # Generate DAG 237 | dag_raw = call_llm_safe(self.dag_translator_agent) 238 | 239 | dag = parse_dag(dag_raw) 240 | 241 | logger.info("Generated DAG: %s", dag_raw) 242 | 243 | self.dag_translator_agent.add_message(dag_raw, role="assistant") 244 | 245 | input_tokens, output_tokens = calculate_tokens( 246 | self.dag_translator_agent.messages 247 | ) 248 | 249 | # Set Cost based on GPT-4o 250 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 251 | 252 | dag_info = { 253 | "dag": dag_raw, 254 | "num_input_tokens_dag": input_tokens, 255 | "num_output_tokens_dag": output_tokens, 256 | "dag_cost": cost, 257 | } 258 | 259 | assert type(dag) == Dag 260 | 261 | return dag_info, dag 262 | 263 | def _topological_sort(self, dag: Dag) -> List[Node]: 264 | """Topological sort of the DAG using DFS 265 | dag: Dag: Object representation of the DAG with nodes and edges 266 | """ 267 | 268 | def dfs(node_name, visited, stack): 269 | visited[node_name] = True 270 | for neighbor in adj_list[node_name]: 271 | if not visited[neighbor]: 272 | dfs(neighbor, visited, stack) 273 | stack.append(node_name) 274 | 275 | # Convert edges to adjacency list 276 | adj_list = defaultdict(list) 277 | for u, v in 
dag.edges: 278 | adj_list[u.name].append(v.name) 279 | 280 | visited = {node.name: False for node in dag.nodes} 281 | stack = [] 282 | 283 | for node in dag.nodes: 284 | if not visited[node.name]: 285 | dfs(node.name, visited, stack) 286 | 287 | # Return the nodes in topologically sorted order 288 | sorted_nodes = [ 289 | next(n for n in dag.nodes if n.name == name) for name in stack[::-1] 290 | ] 291 | return sorted_nodes 292 | 293 | def get_action_queue( 294 | self, 295 | instruction: str, 296 | observation: Dict, 297 | failed_subtask: Optional[Node] = None, 298 | completed_subtasks_list: List[Node] = [], 299 | remaining_subtasks_list: List[Node] = [], 300 | ): 301 | """Generate the action list based on the instruction 302 | instruction:str: Instruction for the task 303 | """ 304 | 305 | planner_info, plan = self._generate_step_by_step_plan( 306 | observation, 307 | instruction, 308 | failed_subtask, 309 | completed_subtasks_list, 310 | remaining_subtasks_list, 311 | ) 312 | 313 | # Generate the DAG 314 | dag_info, dag = self._generate_dag(instruction, plan) 315 | 316 | # Topological sort of the DAG 317 | action_queue = self._topological_sort(dag) 318 | 319 | planner_info.update(dag_info) 320 | 321 | return planner_info, action_queue 322 | -------------------------------------------------------------------------------- /gui_agents/s2/agents/worker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import textwrap 4 | from typing import Dict, List, Tuple 5 | import platform 6 | 7 | from gui_agents.s2.agents.grounding import ACI 8 | from gui_agents.s2.core.module import BaseModule 9 | from gui_agents.s2.core.knowledge import KnowledgeBase 10 | from gui_agents.s2.memory.procedural_memory import PROCEDURAL_MEMORY 11 | from gui_agents.s2.core.engine import OpenAIEmbeddingEngine 12 | from gui_agents.s2.utils.common_utils import ( 13 | Node, 14 | calculate_tokens, 15 | call_llm_safe, 16 | parse_single_code_from_string, 17 | sanitize_code, 18 | extract_first_agent_function, 19 | ) 20 | 21 | logger = logging.getLogger("desktopenv.agent") 22 | 23 | 24 | class Worker(BaseModule): 25 | def __init__( 26 | self, 27 | engine_params: Dict, 28 | grounding_agent: ACI, 29 | local_kb_path: str, 30 | embedding_engine=OpenAIEmbeddingEngine(), 31 | platform: str = platform.system().lower(), 32 | enable_reflection: bool = True, 33 | use_subtask_experience: bool = True, 34 | ): 35 | """ 36 | Worker receives a subtask list and active subtask and generates the next action for the to execute. 
37 | Args: 38 | engine_params: Dict 39 | Parameters for the multimodal engine 40 | grounding_agent: Agent 41 | The grounding agent to use 42 | local_kb_path: str 43 | Path to knowledge base 44 | platform: str 45 | OS platform the agent runs on (darwin, linux, windows) 46 | enable_reflection: bool 47 | Whether to enable reflection 48 | use_subtask_experience: bool 49 | Whether to use subtask experience 50 | """ 51 | super().__init__(engine_params, platform) 52 | 53 | self.grounding_agent = grounding_agent 54 | self.local_kb_path = local_kb_path 55 | self.embedding_engine = embedding_engine 56 | self.enable_reflection = enable_reflection 57 | self.use_subtask_experience = use_subtask_experience 58 | self.reset() 59 | 60 | def reset(self): 61 | if self.platform != "linux": 62 | skipped_actions = ["set_cell_values"] 63 | else: 64 | skipped_actions = [] 65 | 66 | sys_prompt = PROCEDURAL_MEMORY.construct_worker_procedural_memory( 67 | type(self.grounding_agent), skipped_actions=skipped_actions 68 | ).replace("CURRENT_OS", self.platform) 69 | 70 | self.generator_agent = self._create_agent(sys_prompt) 71 | self.reflection_agent = self._create_agent( 72 | PROCEDURAL_MEMORY.REFLECTION_ON_TRAJECTORY 73 | ) 74 | 75 | self.knowledge_base = KnowledgeBase( 76 | embedding_engine=self.embedding_engine, 77 | local_kb_path=self.local_kb_path, 78 | platform=self.platform, 79 | engine_params=self.engine_params, 80 | ) 81 | 82 | self.turn_count = 0 83 | self.worker_history = [] 84 | self.reflections = [] 85 | self.cost_this_turn = 0 86 | self.screenshot_inputs = [] 87 | self.planner_history = [] 88 | self.max_trajector_length = 8 89 | 90 | def flush_messages(self): 91 | # generator msgs are alternating [user, assistant], so 2 per round 92 | if len(self.generator_agent.messages) > 2 * self.max_trajector_length + 1: 93 | self.generator_agent.remove_message_at(1) 94 | self.generator_agent.remove_message_at(1) 95 | # reflector msgs are all [(user text, user image)], so 1 per round 96 | if len(self.reflection_agent.messages) > self.max_trajector_length + 1: 97 | self.reflection_agent.remove_message_at(1) 98 | 99 | def generate_next_action( 100 | self, 101 | instruction: str, 102 | search_query: str, 103 | subtask: str, 104 | subtask_info: Dict, 105 | future_tasks: List[Node], 106 | done_task: List[Node], 107 | obs: Dict, 108 | ) -> Tuple[Dict, List]: 109 | """ 110 | Predict the next action(s) based on the current observation. 111 | """ 112 | # Provide the top_app to the Grounding Agent to remove all other applications from the tree. 
At t=0, top_app is None 113 | agent = self.grounding_agent 114 | 115 | # Get RAG knowledge, only update system message at t=0 116 | if self.turn_count == 0: 117 | if self.use_subtask_experience: 118 | subtask_query_key = ( 119 | "Task:\n" 120 | + search_query 121 | + "\n\nSubtask: " 122 | + subtask 123 | + "\nSubtask Instruction: " 124 | + subtask_info 125 | ) 126 | retrieved_similar_subtask, retrieved_subtask_experience = ( 127 | self.knowledge_base.retrieve_episodic_experience(subtask_query_key) 128 | ) 129 | 130 | # Dirty fix to replace id with element description during subtask retrieval 131 | pattern = r"\(\d+" 132 | retrieved_subtask_experience = re.sub( 133 | pattern, "(element_description", retrieved_subtask_experience 134 | ) 135 | retrieved_subtask_experience = retrieved_subtask_experience.replace( 136 | "_id", "_description" 137 | ) 138 | 139 | logger.info( 140 | "SIMILAR SUBTASK EXPERIENCE: %s", 141 | retrieved_similar_subtask 142 | + "\n" 143 | + retrieved_subtask_experience.strip(), 144 | ) 145 | instruction += "\nYou may refer to some similar subtask experience if you think they are useful. {}".format( 146 | retrieved_similar_subtask + "\n" + retrieved_subtask_experience 147 | ) 148 | 149 | self.generator_agent.add_system_prompt( 150 | self.generator_agent.system_prompt.replace( 151 | "SUBTASK_DESCRIPTION", subtask 152 | ) 153 | .replace("TASK_DESCRIPTION", instruction) 154 | .replace("FUTURE_TASKS", ", ".join([f.name for f in future_tasks])) 155 | .replace("DONE_TASKS", ",".join(d.name for d in done_task)) 156 | ) 157 | 158 | # Reflection generation does not add its own response, it only gets the trajectory 159 | reflection = None 160 | if self.enable_reflection: 161 | # Load the initial subtask info 162 | if self.turn_count == 0: 163 | text_content = textwrap.dedent( 164 | f""" 165 | Subtask Description: {subtask} 166 | Subtask Information: {subtask_info} 167 | Current Trajectory below: 168 | """ 169 | ) 170 | updated_sys_prompt = ( 171 | self.reflection_agent.system_prompt + "\n" + text_content 172 | ) 173 | self.reflection_agent.add_system_prompt(updated_sys_prompt) 174 | self.reflection_agent.add_message( 175 | text_content="The initial screen is provided. No action has been taken yet.", 176 | image_content=obs["screenshot"], 177 | role="user", 178 | ) 179 | # Load the latest action 180 | else: 181 | text_content = self.clean_worker_generation_for_reflection( 182 | self.planner_history[-1] 183 | ) 184 | self.reflection_agent.add_message( 185 | text_content=text_content, 186 | image_content=obs["screenshot"], 187 | role="user", 188 | ) 189 | reflection = call_llm_safe(self.reflection_agent) 190 | self.reflections.append(reflection) 191 | logger.info("REFLECTION: %s", reflection) 192 | 193 | generator_message = ( 194 | f"\nYou may use this reflection on the previous action and overall trajectory: {reflection}\n" 195 | if reflection and self.turn_count > 0 196 | else "" 197 | ) + f"Text Buffer = [{','.join(agent.notes)}]." 
198 | 199 | # Only provide subinfo in the very first message to avoid over influence and redundancy 200 | if self.turn_count == 0: 201 | generator_message += f"Remember only complete the subtask: {subtask}\n" 202 | generator_message += f"You can use this extra information for completing the current subtask: {subtask_info}.\n" 203 | 204 | # logger.info("GENERATOR MESSAGE: %s", generator_message) 205 | 206 | self.generator_agent.add_message( 207 | generator_message, image_content=obs["screenshot"], role="user" 208 | ) 209 | 210 | plan = call_llm_safe(self.generator_agent) 211 | self.planner_history.append(plan) 212 | logger.info("PLAN: %s", plan) 213 | self.generator_agent.add_message(plan, role="assistant") 214 | 215 | # Calculate input/output tokens and gpt-4o cost 216 | input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages) 217 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 218 | self.cost_this_turn += cost 219 | logger.info("EXECUTOR COST: %s", self.cost_this_turn) 220 | 221 | # Use the DescriptionBasedACI to convert agent_action("desc") into agent_action([x, y]) 222 | try: 223 | agent.assign_coordinates(plan, obs) 224 | plan_code = parse_single_code_from_string(plan.split("Grounded Action")[-1]) 225 | plan_code = sanitize_code(plan_code) 226 | plan_code = extract_first_agent_function(plan_code) 227 | exec_code = eval(plan_code) 228 | except Exception as e: 229 | logger.error("Error in parsing plan code: %s", e) 230 | plan_code = "agent.wait(1.0)" 231 | exec_code = eval(plan_code) 232 | 233 | executor_info = { 234 | "current_subtask": subtask, 235 | "current_subtask_info": subtask_info, 236 | "executor_plan": plan, 237 | "plan_code": plan_code, 238 | "reflection": reflection, 239 | "num_input_tokens_executor": input_tokens, 240 | "num_output_tokens_executor": output_tokens, 241 | } 242 | self.turn_count += 1 243 | 244 | self.screenshot_inputs.append(obs["screenshot"]) 245 | self.flush_messages() 246 | 247 | return executor_info, [exec_code] 248 | 249 | # Removes the previous action verification, and removes any extraneous grounded actions 250 | def clean_worker_generation_for_reflection(self, worker_generation: str) -> str: 251 | # Remove the previous action verification 252 | res = worker_generation[worker_generation.find("(Screenshot Analysis)") :] 253 | action = extract_first_agent_function(worker_generation) 254 | # Cut off extra grounded actions 255 | res = res[: res.find("(Grounded Action)")] 256 | res += f"(Grounded Action)\n```python\n{action}\n```\n" 257 | return res 258 | -------------------------------------------------------------------------------- /gui_agents/s2/cli_app.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import io 4 | import logging 5 | import os 6 | import platform 7 | import pyautogui 8 | import sys 9 | import time 10 | 11 | from PIL import Image 12 | 13 | from gui_agents.s2.agents.grounding import OSWorldACI 14 | from gui_agents.s2.agents.agent_s import AgentS2 15 | 16 | current_platform = platform.system().lower() 17 | 18 | logger = logging.getLogger() 19 | logger.setLevel(logging.DEBUG) 20 | 21 | datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 22 | 23 | log_dir = "logs" 24 | os.makedirs(log_dir, exist_ok=True) 25 | 26 | file_handler = logging.FileHandler( 27 | os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" 28 | ) 29 | debug_handler = logging.FileHandler( 30 |
os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" 31 | ) 32 | stdout_handler = logging.StreamHandler(sys.stdout) 33 | sdebug_handler = logging.FileHandler( 34 | os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8" 35 | ) 36 | 37 | file_handler.setLevel(logging.INFO) 38 | debug_handler.setLevel(logging.DEBUG) 39 | stdout_handler.setLevel(logging.INFO) 40 | sdebug_handler.setLevel(logging.DEBUG) 41 | 42 | formatter = logging.Formatter( 43 | fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" 44 | ) 45 | file_handler.setFormatter(formatter) 46 | debug_handler.setFormatter(formatter) 47 | stdout_handler.setFormatter(formatter) 48 | sdebug_handler.setFormatter(formatter) 49 | 50 | stdout_handler.addFilter(logging.Filter("desktopenv")) 51 | sdebug_handler.addFilter(logging.Filter("desktopenv")) 52 | 53 | logger.addHandler(file_handler) 54 | logger.addHandler(debug_handler) 55 | logger.addHandler(stdout_handler) 56 | logger.addHandler(sdebug_handler) 57 | 58 | platform_os = platform.system() 59 | 60 | 61 | def show_permission_dialog(code: str, action_description: str): 62 | """Show a platform-specific permission dialog and return True if approved.""" 63 | if platform.system() == "Darwin": 64 | result = os.system( 65 | f'osascript -e \'display dialog "Do you want to execute this action?\n\n{code} which will try to {action_description}" with title "Action Permission" buttons {{"Cancel", "OK"}} default button "OK" cancel button "Cancel"\'' 66 | ) 67 | return result == 0 68 | elif platform.system() == "Linux": 69 | result = os.system( 70 | f'zenity --question --title="Action Permission" --text="Do you want to execute this action?\n\n{code}" --width=400 --height=200' 71 | ) 72 | return result == 0 73 | return False 74 | 75 | 76 | def scale_screen_dimensions(width: int, height: int, max_dim_size: int): 77 | scale_factor = min(max_dim_size / width, max_dim_size / height, 1) 78 | safe_width = int(width * scale_factor) 79 | safe_height = int(height * scale_factor) 80 | return safe_width, safe_height 81 | 82 | 83 | def run_agent(agent, instruction: str, scaled_width: int, scaled_height: int): 84 | obs = {} 85 | traj = "Task:\n" + instruction 86 | subtask_traj = "" 87 | for _ in range(15): 88 | # Get screen shot using pyautogui 89 | screenshot = pyautogui.screenshot() 90 | screenshot = screenshot.resize((scaled_width, scaled_height), Image.LANCZOS) 91 | 92 | # Save the screenshot to a BytesIO object 93 | buffered = io.BytesIO() 94 | screenshot.save(buffered, format="PNG") 95 | 96 | # Get the byte value of the screenshot 97 | screenshot_bytes = buffered.getvalue() 98 | # Convert to base64 string. 
99 | obs["screenshot"] = screenshot_bytes 100 | 101 | # Get next action code from the agent 102 | info, code = agent.predict(instruction=instruction, observation=obs) 103 | 104 | if "done" in code[0].lower() or "fail" in code[0].lower(): 105 | if platform.system() == "Darwin": 106 | os.system( 107 | f'osascript -e \'display dialog "Task Completed" with title "OpenACI Agent" buttons "OK" default button "OK"\'' 108 | ) 109 | elif platform.system() == "Linux": 110 | os.system( 111 | f'zenity --info --title="OpenACI Agent" --text="Task Completed" --width=200 --height=100' 112 | ) 113 | 114 | agent.update_narrative_memory(traj) 115 | break 116 | 117 | if "next" in code[0].lower(): 118 | continue 119 | 120 | if "wait" in code[0].lower(): 121 | time.sleep(5) 122 | continue 123 | 124 | else: 125 | time.sleep(1.0) 126 | print("EXECUTING CODE:", code[0]) 127 | 128 | # Ask for permission before executing 129 | exec(code[0]) 130 | time.sleep(1.0) 131 | 132 | # Update task and subtask trajectories and optionally the episodic memory 133 | traj += ( 134 | "\n\nReflection:\n" 135 | + str(info["reflection"]) 136 | + "\n\n----------------------\n\nPlan:\n" 137 | + info["executor_plan"] 138 | ) 139 | subtask_traj = agent.update_episodic_memory(info, subtask_traj) 140 | 141 | 142 | def main(): 143 | parser = argparse.ArgumentParser(description="Run AgentS2 with specified model.") 144 | parser.add_argument( 145 | "--provider", 146 | type=str, 147 | default="anthropic", 148 | help="Specify the provider to use (e.g., openai, anthropic, etc.)", 149 | ) 150 | parser.add_argument( 151 | "--model", 152 | type=str, 153 | default="claude-3-7-sonnet-20250219", 154 | help="Specify the model to use (e.g., gpt-4o)", 155 | ) 156 | parser.add_argument( 157 | "--model_url", 158 | type=str, 159 | default="", 160 | help="The URL of the main generation model API.", 161 | ) 162 | parser.add_argument( 163 | "--model_api_key", 164 | type=str, 165 | default="", 166 | help="The API key of the main generation model.", 167 | ) 168 | 169 | # Grounding model config option 1: API based 170 | parser.add_argument( 171 | "--grounding_model_provider", 172 | type=str, 173 | default="anthropic", 174 | help="Specify the provider to use for the grounding model (e.g., openai, anthropic, etc.)", 175 | ) 176 | parser.add_argument( 177 | "--grounding_model", 178 | type=str, 179 | default="claude-3-7-sonnet-20250219", 180 | help="Specify the grounding model to use (e.g., claude-3-5-sonnet-20241022)", 181 | ) 182 | parser.add_argument( 183 | "--grounding_model_resize_width", 184 | type=int, 185 | default=1366, 186 | help="Width of screenshot image after processor rescaling", 187 | ) 188 | parser.add_argument( 189 | "--grounding_model_resize_height", 190 | type=int, 191 | default=None, 192 | help="Height of screenshot image after processor rescaling", 193 | ) 194 | 195 | # Grounding model config option 2: Self-hosted endpoint based 196 | parser.add_argument( 197 | "--endpoint_provider", 198 | type=str, 199 | default="", 200 | help="Specify the endpoint provider for your grounding model, only HuggingFace TGI support for now", 201 | ) 202 | parser.add_argument( 203 | "--endpoint_url", 204 | type=str, 205 | default="", 206 | help="Specify the endpoint URL for your grounding model", 207 | ) 208 | parser.add_argument( 209 | "--endpoint_api_key", 210 | type=str, 211 | default="", 212 | help="The API key of the grounding model.", 213 | ) 214 | 215 | parser.add_argument( 216 | "--embedding_engine_type", 217 | type=str, 218 | default="openai", 219 | 
help="Specify the embedding engine type (supports openai, gemini)", 220 | ) 221 | 222 | args = parser.parse_args() 223 | assert ( 224 | args.grounding_model_provider and args.grounding_model 225 | ) or args.endpoint_url, "Error: No grounding model was provided. Either provide an API based model, or a self-hosted HuggingFace endpoint" 226 | 227 | # Re-scales screenshot size to ensure it fits in UI-TARS context limit 228 | screen_width, screen_height = pyautogui.size() 229 | scaled_width, scaled_height = scale_screen_dimensions( 230 | screen_width, screen_height, max_dim_size=2400 231 | ) 232 | 233 | # Load the general engine params 234 | engine_params = { 235 | "engine_type": args.provider, 236 | "model": args.model, 237 | "base_url": args.model_url, 238 | "api_key": args.model_api_key, 239 | } 240 | 241 | # Load the grounding engine from a HuggingFace TGI endpoint 242 | if args.endpoint_url: 243 | engine_params_for_grounding = { 244 | "engine_type": args.endpoint_provider, 245 | "base_url": args.endpoint_url, 246 | "api_key": args.endpoint_api_key, 247 | } 248 | else: 249 | grounding_height = args.grounding_model_resize_height 250 | # If not provided, use the aspect ratio of the screen to compute the height 251 | if grounding_height is None: 252 | grounding_height = ( 253 | screen_height * args.grounding_model_resize_width / screen_width 254 | ) 255 | 256 | engine_params_for_grounding = { 257 | "engine_type": args.grounding_model_provider, 258 | "model": args.grounding_model, 259 | "grounding_width": args.grounding_model_resize_width, 260 | "grounding_height": grounding_height, 261 | } 262 | 263 | grounding_agent = OSWorldACI( 264 | platform=current_platform, 265 | engine_params_for_generation=engine_params, 266 | engine_params_for_grounding=engine_params_for_grounding, 267 | width=screen_width, 268 | height=screen_height, 269 | ) 270 | 271 | agent = AgentS2( 272 | engine_params, 273 | grounding_agent, 274 | platform=current_platform, 275 | action_space="pyautogui", 276 | observation_type="mixed", 277 | search_engine=None, 278 | embedding_engine_type=args.embedding_engine_type, 279 | ) 280 | 281 | while True: 282 | query = input("Query: ") 283 | 284 | agent.reset() 285 | 286 | # Run the agent on your own device 287 | run_agent(agent, query, scaled_width, scaled_height) 288 | 289 | response = input("Would you like to provide another query? 
(y/n): ") 290 | if response.lower() != "y": 291 | break 292 | 293 | 294 | if __name__ == "__main__": 295 | main() 296 | -------------------------------------------------------------------------------- /gui_agents/s2/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/core/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/core/mllm.py: -------------------------------------------------------------------------------- 1 | import base64 2 | 3 | import numpy as np 4 | 5 | from gui_agents.s2.core.engine import ( 6 | LMMEngineAnthropic, 7 | LMMEngineAzureOpenAI, 8 | LMMEngineHuggingFace, 9 | LMMEngineOpenAI, 10 | LMMEngineOpenRouter, 11 | LMMEnginevLLM, 12 | LMMEngineGemini, 13 | ) 14 | 15 | 16 | class LMMAgent: 17 | def __init__(self, engine_params=None, system_prompt=None, engine=None): 18 | if engine is None: 19 | if engine_params is not None: 20 | engine_type = engine_params.get("engine_type") 21 | if engine_type == "openai": 22 | self.engine = LMMEngineOpenAI(**engine_params) 23 | elif engine_type == "anthropic": 24 | self.engine = LMMEngineAnthropic(**engine_params) 25 | elif engine_type == "azure": 26 | self.engine = LMMEngineAzureOpenAI(**engine_params) 27 | elif engine_type == "vllm": 28 | self.engine = LMMEnginevLLM(**engine_params) 29 | elif engine_type == "huggingface": 30 | self.engine = LMMEngineHuggingFace(**engine_params) 31 | elif engine_type == "gemini": 32 | self.engine = LMMEngineGemini(**engine_params) 33 | elif engine_type == "open_router": 34 | self.engine = LMMEngineOpenRouter(**engine_params) 35 | else: 36 | raise ValueError("engine_type is not supported") 37 | else: 38 | raise ValueError("engine_params must be provided") 39 | else: 40 | self.engine = engine 41 | 42 | self.messages = [] # Empty messages 43 | 44 | if system_prompt: 45 | self.add_system_prompt(system_prompt) 46 | else: 47 | self.add_system_prompt("You are a helpful assistant.") 48 | 49 | def encode_image(self, image_content): 50 | # if image_content is a path to an image file, check type of the image_content to verify 51 | if isinstance(image_content, str): 52 | with open(image_content, "rb") as image_file: 53 | return base64.b64encode(image_file.read()).decode("utf-8") 54 | else: 55 | return base64.b64encode(image_content).decode("utf-8") 56 | 57 | def reset( 58 | self, 59 | ): 60 | 61 | self.messages = [ 62 | { 63 | "role": "system", 64 | "content": [{"type": "text", "text": self.system_prompt}], 65 | } 66 | ] 67 | 68 | def add_system_prompt(self, system_prompt): 69 | self.system_prompt = system_prompt 70 | if len(self.messages) > 0: 71 | self.messages[0] = { 72 | "role": "system", 73 | "content": [{"type": "text", "text": self.system_prompt}], 74 | } 75 | else: 76 | self.messages.append( 77 | { 78 | "role": "system", 79 | "content": [{"type": "text", "text": self.system_prompt}], 80 | } 81 | ) 82 | 83 | def remove_message_at(self, index): 84 | """Remove a message at a given index""" 85 | if index < len(self.messages): 86 | self.messages.pop(index) 87 | 88 | def replace_message_at( 89 | self, index, text_content, image_content=None, image_detail="high" 90 | ): 91 | """Replace a message at a given index""" 92 | if index < len(self.messages): 93 | self.messages[index] = { 94 | "role": self.messages[index]["role"], 95 | "content": [{"type": "text", "text": text_content}], 96 | } 97 | if 
image_content: 98 | base64_image = self.encode_image(image_content) 99 | self.messages[index]["content"].append( 100 | { 101 | "type": "image_url", 102 | "image_url": { 103 | "url": f"data:image/png;base64,{base64_image}", 104 | "detail": image_detail, 105 | }, 106 | } 107 | ) 108 | 109 | def add_message( 110 | self, 111 | text_content, 112 | image_content=None, 113 | role=None, 114 | image_detail="high", 115 | put_text_last=False, 116 | ): 117 | """Add a new message to the list of messages""" 118 | 119 | # API-style inference from OpenAI and AzureOpenAI 120 | if isinstance( 121 | self.engine, 122 | ( 123 | LMMEngineOpenAI, 124 | LMMEngineAzureOpenAI, 125 | LMMEngineHuggingFace, 126 | LMMEngineGemini, 127 | LMMEngineOpenRouter, 128 | ), 129 | ): 130 | # infer role from previous message 131 | if role != "user": 132 | if self.messages[-1]["role"] == "system": 133 | role = "user" 134 | elif self.messages[-1]["role"] == "user": 135 | role = "assistant" 136 | elif self.messages[-1]["role"] == "assistant": 137 | role = "user" 138 | 139 | message = { 140 | "role": role, 141 | "content": [{"type": "text", "text": text_content}], 142 | } 143 | 144 | if isinstance(image_content, np.ndarray) or image_content: 145 | # Check if image_content is a list or a single image 146 | if isinstance(image_content, list): 147 | # If image_content is a list of images, loop through each image 148 | for image in image_content: 149 | base64_image = self.encode_image(image) 150 | message["content"].append( 151 | { 152 | "type": "image_url", 153 | "image_url": { 154 | "url": f"data:image/png;base64,{base64_image}", 155 | "detail": image_detail, 156 | }, 157 | } 158 | ) 159 | else: 160 | # If image_content is a single image, handle it directly 161 | base64_image = self.encode_image(image_content) 162 | message["content"].append( 163 | { 164 | "type": "image_url", 165 | "image_url": { 166 | "url": f"data:image/png;base64,{base64_image}", 167 | "detail": image_detail, 168 | }, 169 | } 170 | ) 171 | 172 | # Rotate text to be the last message if desired 173 | if put_text_last: 174 | text_content = message["content"].pop(0) 175 | message["content"].append(text_content) 176 | 177 | self.messages.append(message) 178 | 179 | # For API-style inference from Anthropic 180 | elif isinstance(self.engine, LMMEngineAnthropic): 181 | # infer role from previous message 182 | if role != "user": 183 | if self.messages[-1]["role"] == "system": 184 | role = "user" 185 | elif self.messages[-1]["role"] == "user": 186 | role = "assistant" 187 | elif self.messages[-1]["role"] == "assistant": 188 | role = "user" 189 | 190 | message = { 191 | "role": role, 192 | "content": [{"type": "text", "text": text_content}], 193 | } 194 | 195 | if image_content: 196 | # Check if image_content is a list or a single image 197 | if isinstance(image_content, list): 198 | # If image_content is a list of images, loop through each image 199 | for image in image_content: 200 | base64_image = self.encode_image(image) 201 | message["content"].append( 202 | { 203 | "type": "image", 204 | "source": { 205 | "type": "base64", 206 | "media_type": "image/png", 207 | "data": base64_image, 208 | }, 209 | } 210 | ) 211 | else: 212 | # If image_content is a single image, handle it directly 213 | base64_image = self.encode_image(image_content) 214 | message["content"].append( 215 | { 216 | "type": "image", 217 | "source": { 218 | "type": "base64", 219 | "media_type": "image/png", 220 | "data": base64_image, 221 | }, 222 | } 223 | ) 224 | self.messages.append(message) 225 | 226 | 
# Locally hosted vLLM model inference 227 | elif isinstance(self.engine, LMMEnginevLLM): 228 | # infer role from previous message 229 | if role != "user": 230 | if self.messages[-1]["role"] == "system": 231 | role = "user" 232 | elif self.messages[-1]["role"] == "user": 233 | role = "assistant" 234 | elif self.messages[-1]["role"] == "assistant": 235 | role = "user" 236 | 237 | message = { 238 | "role": role, 239 | "content": [{"type": "text", "text": text_content}], 240 | } 241 | 242 | if image_content: 243 | # Check if image_content is a list or a single image 244 | if isinstance(image_content, list): 245 | # If image_content is a list of images, loop through each image 246 | for image in image_content: 247 | base64_image = self.encode_image(image) 248 | message["content"].append( 249 | { 250 | "type": "image_url", 251 | "image_url": { 252 | "url": f"data:image;base64,{base64_image}" 253 | }, 254 | } 255 | ) 256 | else: 257 | # If image_content is a single image, handle it directly 258 | base64_image = self.encode_image(image_content) 259 | message["content"].append( 260 | { 261 | "type": "image_url", 262 | "image_url": {"url": f"data:image;base64,{base64_image}"}, 263 | } 264 | ) 265 | 266 | self.messages.append(message) 267 | else: 268 | raise ValueError("engine_type is not supported") 269 | 270 | def get_response( 271 | self, 272 | user_message=None, 273 | messages=None, 274 | temperature=0.0, 275 | max_new_tokens=None, 276 | **kwargs, 277 | ): 278 | """Generate the next response based on previous messages""" 279 | if messages is None: 280 | messages = self.messages 281 | if user_message: 282 | messages.append( 283 | {"role": "user", "content": [{"type": "text", "text": user_message}]} 284 | ) 285 | 286 | return self.engine.generate( 287 | messages, 288 | temperature=temperature, 289 | max_new_tokens=max_new_tokens, 290 | **kwargs, 291 | ) 292 | -------------------------------------------------------------------------------- /gui_agents/s2/core/module.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | from gui_agents.s2.core.mllm import LMMAgent 3 | 4 | 5 | class BaseModule: 6 | def __init__(self, engine_params: Dict, platform: str): 7 | self.engine_params = engine_params 8 | self.platform = platform 9 | 10 | def _create_agent( 11 | self, system_prompt: str = None, engine_params: Optional[Dict] = None 12 | ) -> LMMAgent: 13 | """Create a new LMMAgent instance""" 14 | agent = LMMAgent(engine_params or self.engine_params) 15 | if system_prompt: 16 | agent.add_system_prompt(system_prompt) 17 | return agent 18 | -------------------------------------------------------------------------------- /gui_agents/s2/memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/memory/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/utils/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/utils/common_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from typing import List 4 | import time 
5 | import tiktoken 6 | 7 | from typing import Tuple, List, Union, Dict 8 | 9 | from pydantic import BaseModel, ValidationError 10 | 11 | import pickle 12 | 13 | 14 | class Node(BaseModel): 15 | name: str 16 | info: str 17 | 18 | 19 | class Dag(BaseModel): 20 | nodes: List[Node] 21 | edges: List[List[Node]] 22 | 23 | 24 | NUM_IMAGE_TOKEN = 1105 # Value set of screen of size 1920x1080 for openai vision 25 | 26 | 27 | def call_llm_safe(agent) -> Union[str, Dag]: 28 | # Retry if fails 29 | max_retries = 3 # Set the maximum number of retries 30 | attempt = 0 31 | response = "" 32 | while attempt < max_retries: 33 | try: 34 | response = agent.get_response() 35 | break # If successful, break out of the loop 36 | except Exception as e: 37 | attempt += 1 38 | print(f"Attempt {attempt} failed: {e}") 39 | if attempt == max_retries: 40 | print("Max retries reached. Handling failure.") 41 | time.sleep(1.0) 42 | return response 43 | 44 | 45 | def calculate_tokens(messages, num_image_token=NUM_IMAGE_TOKEN) -> Tuple[int, int]: 46 | 47 | num_input_images = 0 48 | output_message = messages[-1] 49 | 50 | input_message = messages[:-1] 51 | 52 | input_string = """""" 53 | for message in input_message: 54 | input_string += message["content"][0]["text"] + "\n" 55 | if len(message["content"]) > 1: 56 | num_input_images += 1 57 | 58 | input_text_tokens = get_input_token_length(input_string) 59 | 60 | input_image_tokens = num_image_token * num_input_images 61 | 62 | output_tokens = get_input_token_length(output_message["content"][0]["text"]) 63 | 64 | return (input_text_tokens + input_image_tokens), output_tokens 65 | 66 | 67 | # Code based on https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/agent.py 68 | 69 | 70 | def parse_dag(text): 71 | pattern = r"(.*?)" 72 | match = re.search(pattern, text, re.DOTALL) 73 | if match: 74 | json_str = match.group(1) 75 | try: 76 | json_data = json.loads(json_str) 77 | return Dag(**json_data["dag"]) 78 | except json.JSONDecodeError: 79 | print("Error: Invalid JSON") 80 | return None 81 | except KeyError: 82 | print("Error: 'dag' key not found in JSON") 83 | return None 84 | except ValidationError as e: 85 | print(f"Error: Invalid data structure - {e}") 86 | return None 87 | else: 88 | print("Error: JSON not found") 89 | return None 90 | 91 | 92 | def parse_dag(text): 93 | """ 94 | Try extracting JSON from tags first; 95 | if not found, try ```json … ``` Markdown fences. 
96 | """ 97 | 98 | def _extract(pattern): 99 | m = re.search(pattern, text, re.DOTALL) 100 | return m.group(1).strip() if m else None 101 | 102 | # 1) look for 103 | json_str = _extract(r"(.*?)") 104 | # 2) fallback to ```json … ``` 105 | if json_str is None: 106 | json_str = _extract(r"```json\s*(.*?)\s*```") 107 | 108 | if json_str is None: 109 | print("Error: JSON not found in either tags or ```json``` fence") 110 | return None 111 | 112 | try: 113 | payload = json.loads(json_str) 114 | except json.JSONDecodeError as e: 115 | print(f"Error: Invalid JSON ({e})") 116 | return None 117 | 118 | if "dag" not in payload: 119 | print("Error: 'dag' key not found in JSON") 120 | return None 121 | 122 | try: 123 | return Dag(**payload["dag"]) 124 | except ValidationError as e: 125 | print(f"Error: Invalid data structure - {e}") 126 | return None 127 | 128 | 129 | def parse_single_code_from_string(input_string): 130 | input_string = input_string.strip() 131 | if input_string.strip() in ["WAIT", "DONE", "FAIL"]: 132 | return input_string.strip() 133 | 134 | # This regular expression will match both ```code``` and ```python code``` 135 | # and capture the `code` part. It uses a non-greedy match for the content inside. 136 | pattern = r"```(?:\w+\s+)?(.*?)```" 137 | # Find all non-overlapping matches in the string 138 | matches = re.findall(pattern, input_string, re.DOTALL) 139 | 140 | # The regex above captures the content inside the triple backticks. 141 | # The `re.DOTALL` flag allows the dot `.` to match newline characters as well, 142 | # so the code inside backticks can span multiple lines. 143 | 144 | # matches now contains all the captured code snippets 145 | 146 | codes = [] 147 | 148 | for match in matches: 149 | match = match.strip() 150 | commands = [ 151 | "WAIT", 152 | "DONE", 153 | "FAIL", 154 | ] # fixme: updates this part when we have more commands 155 | 156 | if match in commands: 157 | codes.append(match.strip()) 158 | elif match.split("\n")[-1] in commands: 159 | if len(match.split("\n")) > 1: 160 | codes.append("\n".join(match.split("\n")[:-1])) 161 | codes.append(match.split("\n")[-1]) 162 | else: 163 | codes.append(match) 164 | 165 | if len(codes) <= 0: 166 | return "fail" 167 | return codes[0] 168 | 169 | 170 | def get_input_token_length(input_string): 171 | enc = tiktoken.encoding_for_model("gpt-4") 172 | tokens = enc.encode(input_string) 173 | return len(tokens) 174 | 175 | 176 | def sanitize_code(code): 177 | # This pattern captures the outermost double-quoted text 178 | if "\n" in code: 179 | pattern = r'(".*?")' 180 | # Find all matches in the text 181 | matches = re.findall(pattern, code, flags=re.DOTALL) 182 | if matches: 183 | # Replace the first occurrence only 184 | first_match = matches[0] 185 | code = code.replace(first_match, f'"""{first_match[1:-1]}"""', 1) 186 | return code 187 | 188 | 189 | def extract_first_agent_function(code_string): 190 | # Regular expression pattern to match 'agent' functions with any arguments, including nested parentheses 191 | pattern = r'agent\.[a-zA-Z_]+\((?:[^()\'"]|\'[^\']*\'|"[^"]*")*\)' 192 | 193 | # Find all matches in the string 194 | matches = re.findall(pattern, code_string) 195 | 196 | # Return the first match if found, otherwise return None 197 | return matches[0] if matches else None 198 | 199 | 200 | def load_knowledge_base(kb_path: str) -> Dict: 201 | try: 202 | with open(kb_path, "r") as f: 203 | return json.load(f) 204 | except Exception as e: 205 | print(f"Error loading knowledge base: {e}") 206 | return {} 207 | 208 | 
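# Illustrative usage of the parsing helpers above (example values only, not taken
# from a real agent run):
#
#   parse_single_code_from_string('```python\nagent.click("OK button", 1)\n```')
#       -> 'agent.click("OK button", 1)'
#   extract_first_agent_function('exec_code = agent.click("OK button", 1); agent.wait(1.0)')
#       -> 'agent.click("OK button", 1)'
#   sanitize_code('agent.type("line one\nline two")')
#       -> 'agent.type("""line one\nline two""")'
#
# load_knowledge_base above and load_embeddings below fail soft: on any read error
# they print the problem and return an empty dict; save_embeddings likewise only
# prints a message if the write fails.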
209 | def load_embeddings(embeddings_path: str) -> Dict: 210 | try: 211 | with open(embeddings_path, "rb") as f: 212 | return pickle.load(f) 213 | except Exception as e: 214 | print(f"Error loading embeddings: {e}") 215 | return {} 216 | 217 | 218 | def save_embeddings(embeddings_path: str, embeddings: Dict): 219 | try: 220 | with open(embeddings_path, "wb") as f: 221 | pickle.dump(embeddings, f) 222 | except Exception as e: 223 | print(f"Error saving embeddings: {e}") 224 | -------------------------------------------------------------------------------- /gui_agents/s2/utils/query_perplexica.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | 4 | 5 | def query_to_perplexica(query): 6 | # Retrieve the URL from an environment variable 7 | url = os.getenv("PERPLEXICA_URL") 8 | if not url: 9 | raise ValueError( 10 | "PERPLEXICA_URL environment variable not set. It may take the form: 'http://localhost:{port}/api/search'. The port number is set in the config.toml in the Perplexica directory." 11 | ) 12 | 13 | # Request Message 14 | message = {"focusMode": "webSearch", "query": query, "history": [["human", query]]} 15 | 16 | response = requests.post(url, json=message) 17 | 18 | if response.status_code == 200: 19 | return response.json()["message"] 20 | elif response.status_code == 400: 21 | raise ValueError( 22 | "The request is malformed or missing required fields, such as FocusModel or query" 23 | ) 24 | else: 25 | raise ValueError("Internal Server Error") 26 | 27 | 28 | # Test Code 29 | if __name__ == "__main__": 30 | query = "What is Agent S?" 31 | response = query_to_perplexica(query) 32 | print(response) 33 | -------------------------------------------------------------------------------- /gui_agents/utils.py: -------------------------------------------------------------------------------- 1 | """General utility.""" 2 | 3 | import platform 4 | import requests 5 | import zipfile 6 | import io 7 | import os 8 | 9 | 10 | def download_kb_data( 11 | version="s2", 12 | release_tag="v0.2.2", 13 | download_dir="kb_data", 14 | platform=platform.system().lower(), 15 | ): 16 | """Download and extract the appropriate KB ZIP file for the current OS. 17 | 18 | Args: 19 | version (str): Prefix in the asset name (e.g., "s1" or "s2") 20 | release_tag (str): Tag of the release that has the assets (e.g., "v0.2.2") 21 | download_dir (str): Where to extract the downloaded files 22 | platform (str): OS (e.g., "windows", "darwin", "linux") 23 | """ 24 | # Detect OS 25 | if platform not in ["windows", "darwin", "linux"]: 26 | raise RuntimeError(f"Unsupported OS: {platform}") 27 | 28 | # Build asset filename, e.g. "s1_windows.zip" or "s1_darwin.zip" 29 | asset_name = f"{version}_{platform}.zip" 30 | 31 | download_url = f"https://github.com/simular-ai/Agent-S/releases/download/{release_tag}/{asset_name}" 32 | 33 | # Make sure our output directory exists 34 | os.makedirs(download_dir, exist_ok=True) 35 | 36 | print(f"Downloading {asset_name} from {download_url} ...") 37 | response = requests.get(download_url) 38 | if response.status_code != 200: 39 | raise RuntimeError( 40 | f"Failed to download {asset_name}. 
" 41 | f"HTTP status: {response.status_code} - {response.reason}" 42 | ) 43 | 44 | # Extract the ZIP in-memory 45 | zip_data = io.BytesIO(response.content) 46 | with zipfile.ZipFile(zip_data, "r") as zip_ref: 47 | zip_ref.extractall(download_dir) 48 | 49 | print(f"Extracted {asset_name} to ./{download_dir}") 50 | -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/.DS_Store -------------------------------------------------------------------------------- /images/agent_s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s.png -------------------------------------------------------------------------------- /images/agent_s2_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s2_architecture.png -------------------------------------------------------------------------------- /images/agent_s2_osworld_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s2_osworld_result.png -------------------------------------------------------------------------------- /images/agent_s2_teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s2_teaser.png -------------------------------------------------------------------------------- /images/agent_s_architecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s_architecture.pdf -------------------------------------------------------------------------------- /images/osworld_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/osworld_result.png -------------------------------------------------------------------------------- /images/results.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/results.pdf -------------------------------------------------------------------------------- /images/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/results.png -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/teaser.png -------------------------------------------------------------------------------- /images/windows_result.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/windows_result.png -------------------------------------------------------------------------------- /models.md: -------------------------------------------------------------------------------- 1 | We support the following APIs for MLLM inference: OpenAI, Anthropic, Gemini, Azure OpenAI, vLLM for local models, and Open Router. To use these APIs, you need to set the corresponding environment variables: 2 | 3 | 1. OpenAI 4 | 5 | ``` 6 | export OPENAI_API_KEY= 7 | ``` 8 | 9 | 2. Anthropic 10 | 11 | ``` 12 | export ANTHROPIC_API_KEY= 13 | ``` 14 | 15 | 3. Gemini 16 | 17 | ``` 18 | export GEMINI_API_KEY= 19 | export GEMINI_ENDPOINT_URL="https://generativelanguage.googleapis.com/v1beta/openai/" 20 | ``` 21 | 22 | 4. OpenAI on Azure 23 | 24 | ``` 25 | export AZURE_OPENAI_API_BASE= 26 | export AZURE_OPENAI_API_KEY= 27 | ``` 28 | 29 | 5. vLLM for Local Models 30 | 31 | ``` 32 | export vLLM_ENDPOINT_URL= 33 | ``` 34 | 35 | Alternatively you can directly pass the API keys into the engine_params argument while instantating the agent. 36 | 37 | 6. Open Router 38 | 39 | ``` 40 | export OPENROUTER_API_KEY= 41 | export OPEN_ROUTER_ENDPOINT_URL="https://openrouter.ai/api/v1" 42 | ``` 43 | 44 | ```python 45 | from gui_agents.s2.agents.agent_s import AgentS2 46 | 47 | engine_params = { 48 | "engine_type": 'anthropic', # Allowed Values: 'openai', 'anthropic', 'gemini', 'azure_openai', 'vllm', 'open_router' 49 | "model": 'claude-3-5-sonnet-20240620', # Allowed Values: Any Vision and Language Model from the supported APIs 50 | } 51 | agent = AgentS2( 52 | engine_params, 53 | grounding_agent, 54 | platform=current_platform, 55 | action_space="pyautogui", 56 | observation_type="mixed", 57 | search_engine="LLM" 58 | ) 59 | ``` 60 | 61 | To use the underlying Multimodal Agent (LMMAgent) which wraps LLMs with message handling functionality, you can use the following code snippet: 62 | 63 | ```python 64 | from gui_agents.core.mllm import LMMAgent 65 | 66 | engine_params = { 67 | "engine_type": 'anthropic', # Allowed Values: 'openai', 'anthropic', 'gemini', 'azure_openai', 'vllm', 'open_router' 68 | "model": 'claude-3-5-sonnet-20240620', # Allowed Values: Any Vision and Language Model from the supported APIs 69 | } 70 | agent = LMMAgent( 71 | engine_params=engine_params, 72 | ) 73 | ``` 74 | 75 | The `AgentS2` also utilizes this `LMMAgent` internally. -------------------------------------------------------------------------------- /osworld_setup/s1/OSWorld.md: -------------------------------------------------------------------------------- 1 | # Deplying Agent-S in OSWorld 2 | 3 | # Step 1: Set up Agent S 4 | 5 | Follow the [README.md](https://github.com/simular-ai/Agent-S/blob/main/gui_agents/s1/README.md) to set up Agent S. 6 | 7 | # Step 2: Copying Over Run Files 8 | 9 | If you haven't already, please follow the [OSWorld environment setup](https://github.com/xlang-ai/OSWorld/blob/main/README.md). We've provided the relevant OSWorld run files for evaluation in this `osworld_setup` folder. Please copy this over to your OSWorld folder. 10 | 11 | We have set the latest Agent S to use the latest Ubuntu VM image from OSWorld. However, our experiments are based on the older version of the VM. To reproduce the results, set the vm_version argument to 'old' while instantiating the agent. 
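For reference, here is a minimal sketch of that override (illustrative only; the surrounding arguments mirror `osworld_setup/s1/run.py`, and the exact place where `vm_version` is accepted may differ between releases, so check the constructor signatures in your checkout):

```
from gui_agents.s1.core.AgentS import GraphSearchAgent
from gui_agents.s1.aci.LinuxOSACI import LinuxACI

engine_params = {"engine_type": "openai", "model": "gpt-4o"}
grounding_agent = LinuxACI()

agent = GraphSearchAgent(
    engine_params,
    grounding_agent,
    platform="linux",
    action_space="pyautogui",
    observation_type="mixed",
    vm_version="old",  # reproduce the reported results on the older OSWorld VM image
)
```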
12 | 13 | 14 | # Step 3: Best Practices 15 | 16 | At this point, you will have set up the Agent-S and OSWorld environments and the VMWare Workstation Pro application. Below, we'll list some best practices, and common problems and their fixes. 17 | 18 | --- 19 | 20 | ``` 21 | from desktop_env.desktop_env import DesktopEnv 22 | 23 | example = { 24 | "id": "94d95f96-9699-4208-98ba-3c3119edf9c2", 25 | "instruction": "I want to install Spotify on my current system. Could you please help me?", 26 | "config": [ 27 | { 28 | "type": "execute", 29 | "parameters": { 30 | "command": [ 31 | "python", 32 | "-c", 33 | "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" 34 | ] 35 | } 36 | } 37 | ], 38 | "evaluator": { 39 | "func": "check_include_exclude", 40 | "result": { 41 | "type": "vm_command_line", 42 | "command": "which spotify" 43 | }, 44 | "expected": { 45 | "type": "rule", 46 | "rules": { 47 | "include": ["spotify"], 48 | "exclude": ["not found"] 49 | } 50 | } 51 | } 52 | } 53 | 54 | env = DesktopEnv(action_space="pyautogui") 55 | 56 | obs = env.reset(task_config=example) 57 | obs, reward, done, info = env.step("pyautogui.rightClick()") 58 | ``` 59 | 60 | The code above will boot up a VM and restart it. If, for whatever reason, running the starter code below leads to an infinitely long run time, cancel out of the VM. 61 | You should then see: 62 | 63 | ``` 64 | parent/ 65 | Agent-S/ 66 | OSWorld/ 67 | vmware_vm_data/ 68 | Ubuntu0/ 69 | *.lck 70 | *.vmem 71 | ... 72 | ... 73 | UbuntuX/ 74 | ``` 75 | 76 | If you happen to have any `*.lck` folder in your VM's folder, be sure to delete them. Every time you are powering on the VM from creating a new `DesktopEnv` instance, you need to 77 | delete the `*.lck` folders first. If your VM is already powered on, and your session (in a Jupyter Notebook, for example) crashes, you can keep the `*.lck` files and just re-instantiate the `DesktopEnv` instance. I'd also suggest using just a single VM (as a VM takes up a lot of space!). 78 | 79 | --- 80 | 81 | If even after rerunning the code and deleting the `*.lck` files don't work, then you should try passing in the `path_to_vm` explicitly to the `DesktopEnv` class. 82 | 83 | ``` 84 | env = DesktopEnv(action_space="pyautogui", headless=False, require_terminal=True, path_to_vm=) 85 | ``` 86 | 87 | Pass the absolute path to your VM's (Ubuntu0) `.vmx` file. This file is located here: 88 | 89 | 90 | ``` 91 | parent/ 92 | Agent-S/ 93 | OSWorld/ 94 | vmware_vm_data/ 95 | Ubuntu0/ 96 | *.lck 97 | *.vmem 98 | ... 99 | *.vmx 100 | ... 101 | UbuntuX/ 102 | ``` 103 | 104 | 📌 **Note**: If you are testing on the `os` domain, there is an [issue](https://github.com/asweigart/pyautogui/issues/198#issuecomment-1465268536) with `pyautogui`. A *hacky* way to solve this is to, inside the VM, locate where the `pyautogui` module is installed and open the `__init__.py` located under the `pyautogui` folder and remove the "<" in the `set(...)` within the following function: 105 | 106 | ``` 107 | def isShiftCharacter(character): 108 | """ 109 | Returns True if the ``character`` is a keyboard key that would require the shift key to be held down, such as 110 | uppercase letters or the symbols on the keyboard's number row. 111 | """ 112 | # NOTE TODO - This will be different for non-qwerty keyboards. 
113 | return character.isupper() or character in set('~!@#$%^&*()_+{}|:"<>?') 114 | ``` 115 | 116 | 📌 **Note**: If in case, your VM encounters an issue with "The root file system on requires a manual fsck", reset the VM to the previous snapshot. 117 | 118 | With these changes, you should be able to get up and running with VMWare, DesktopEnv, and OSWorld! 😊 -------------------------------------------------------------------------------- /osworld_setup/s1/lib_run_single.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import logging 4 | import os 5 | import time 6 | from wrapt_timeout_decorator import * 7 | 8 | logger = logging.getLogger("desktopenv.experiment") 9 | 10 | 11 | def run_single_example( 12 | agent, env, example, max_steps, instruction, args, example_result_dir, scores 13 | ): 14 | runtime_logger = setup_logger(example, example_result_dir) 15 | agent.reset() 16 | env.reset(task_config=example) 17 | time.sleep(60) # Wait for the environment to be ready 18 | obs = env._get_obs() # Get the initial observation 19 | done = False 20 | step_idx = 0 21 | env.controller.start_recording() 22 | while not done and step_idx < max_steps: 23 | response, actions = agent.predict(instruction, obs) 24 | for action in actions: 25 | # Capture the timestamp before executing the action 26 | action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 27 | logger.info("Step %d: %s", step_idx + 1, action) 28 | obs, reward, done, info = env.step(action, args.sleep_after_execution) 29 | 30 | logger.info("Reward: %.2f", reward) 31 | logger.info("Done: %s", done) 32 | # Save screenshot and trajectory information 33 | with open( 34 | os.path.join( 35 | example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png" 36 | ), 37 | "wb", 38 | ) as _f: 39 | _f.write(obs["screenshot"]) 40 | with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: 41 | f.write( 42 | json.dumps( 43 | { 44 | "step_num": step_idx + 1, 45 | "action_timestamp": action_timestamp, 46 | "action": action, 47 | "reward": reward, 48 | "done": done, 49 | "info": info, 50 | "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png", 51 | } 52 | ) 53 | ) 54 | f.write("\n") 55 | if done: 56 | logger.info("The episode is done.") 57 | break 58 | step_idx += 1 59 | result = env.evaluate() 60 | logger.info("Result: %.2f", result) 61 | scores.append(result) 62 | with open( 63 | os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8" 64 | ) as f: 65 | f.write(f"{result}\n") 66 | env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) 67 | 68 | 69 | def setup_logger(example, example_result_dir): 70 | runtime_logger = logging.getLogger(f"desktopenv.example.{example['id']}") 71 | runtime_logger.setLevel(logging.DEBUG) 72 | runtime_logger.addHandler( 73 | logging.FileHandler(os.path.join(example_result_dir, "runtime.log")) 74 | ) 75 | return runtime_logger 76 | -------------------------------------------------------------------------------- /osworld_setup/s1/run.py: -------------------------------------------------------------------------------- 1 | """OSWorld's run.py with AgentS.""" 2 | 3 | """Script to run end-to-end evaluation on the benchmark. 4 | Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py. 
5 | """ 6 | 7 | import argparse 8 | import datetime 9 | import json 10 | import logging 11 | import os 12 | import sys 13 | 14 | from gui_agents.s1.core.AgentS import GraphSearchAgent 15 | from gui_agents.s1.aci.LinuxOSACI import LinuxACI 16 | from tqdm import tqdm 17 | 18 | import lib_run_single 19 | from desktop_env.desktop_env import DesktopEnv 20 | 21 | # import wandb 22 | 23 | 24 | # Logger Configs {{{ # 25 | logger = logging.getLogger() 26 | logger.setLevel(logging.DEBUG) 27 | 28 | datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 29 | 30 | file_handler = logging.FileHandler( 31 | os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" 32 | ) 33 | debug_handler = logging.FileHandler( 34 | os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" 35 | ) 36 | stdout_handler = logging.StreamHandler(sys.stdout) 37 | sdebug_handler = logging.FileHandler( 38 | os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8" 39 | ) 40 | 41 | file_handler.setLevel(logging.INFO) 42 | debug_handler.setLevel(logging.DEBUG) 43 | stdout_handler.setLevel(logging.INFO) 44 | sdebug_handler.setLevel(logging.DEBUG) 45 | 46 | formatter = logging.Formatter( 47 | fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" 48 | ) 49 | file_handler.setFormatter(formatter) 50 | debug_handler.setFormatter(formatter) 51 | stdout_handler.setFormatter(formatter) 52 | sdebug_handler.setFormatter(formatter) 53 | 54 | stdout_handler.addFilter(logging.Filter("desktopenv")) 55 | sdebug_handler.addFilter(logging.Filter("desktopenv")) 56 | 57 | logger.addHandler(file_handler) 58 | logger.addHandler(debug_handler) 59 | logger.addHandler(stdout_handler) 60 | logger.addHandler(sdebug_handler) 61 | # }}} Logger Configs # 62 | 63 | logger = logging.getLogger("desktopenv.experiment") 64 | 65 | 66 | def config() -> argparse.Namespace: 67 | parser = argparse.ArgumentParser( 68 | description="Run end-to-end evaluation on the benchmark" 69 | ) 70 | 71 | # environment config 72 | parser.add_argument("--path_to_vm", type=str, default=None) 73 | parser.add_argument( 74 | "--headless", action="store_true", help="Run in headless machine" 75 | ) 76 | parser.add_argument( 77 | "--action_space", type=str, default="pyautogui", help="Action type" 78 | ) 79 | parser.add_argument( 80 | "--observation_type", 81 | choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"], 82 | default="a11y_tree", 83 | help="Observation type", 84 | ) 85 | parser.add_argument("--screen_width", type=int, default=1920) 86 | parser.add_argument("--screen_height", type=int, default=1080) 87 | parser.add_argument("--sleep_after_execution", type=float, default=0.0) 88 | parser.add_argument("--max_steps", type=int, default=15) 89 | 90 | # agent config 91 | parser.add_argument("--max_trajectory_length", type=int, default=3) 92 | parser.add_argument( 93 | "--test_config_base_dir", type=str, default="evaluation_examples" 94 | ) 95 | 96 | # lm config 97 | parser.add_argument("--model", type=str, default="gpt-4o") 98 | parser.add_argument("--temperature", type=float, default=1.0) 99 | parser.add_argument("--top_p", type=float, default=0.9) 100 | parser.add_argument("--max_tokens", type=int, default=1500) 101 | parser.add_argument("--stop_token", type=str, default=None) 102 | 103 | # example config 104 | parser.add_argument("--domain", type=str, default="all") 105 | parser.add_argument( 106 | "--test_all_meta_path", 
type=str, default="evaluation_examples/test_all.json" 107 | ) 108 | 109 | # logging related 110 | parser.add_argument("--result_dir", type=str, default="./results") 111 | 112 | # NEW! 113 | parser.add_argument("--huggingface_endpoint_url", type=str, required=True) 114 | parser.add_argument("--kb_name", default="kb_s2", type=str) 115 | 116 | args = parser.parse_args() 117 | 118 | return args 119 | 120 | 121 | def test(args: argparse.Namespace, test_all_meta: dict) -> None: 122 | scores = [] 123 | max_steps = args.max_steps 124 | 125 | # log args 126 | logger.info("Args: %s", args) 127 | # set wandb project 128 | cfg_args = { 129 | "path_to_vm": args.path_to_vm, 130 | "headless": args.headless, 131 | "action_space": args.action_space, 132 | "observation_type": args.observation_type, 133 | "screen_width": args.screen_width, 134 | "screen_height": args.screen_height, 135 | "sleep_after_execution": args.sleep_after_execution, 136 | "max_steps": args.max_steps, 137 | "max_trajectory_length": args.max_trajectory_length, 138 | "model": args.model, 139 | "temperature": args.temperature, 140 | "top_p": args.top_p, 141 | "max_tokens": args.max_tokens, 142 | "stop_token": args.stop_token, 143 | "result_dir": args.result_dir, 144 | } 145 | 146 | # NEW! 147 | if args.model.startswith("claude"): 148 | engine_type = "anthropic" 149 | elif args.model.startswith("gpt"): 150 | engine_type = "openai" 151 | else: 152 | engine_type = "vllm" 153 | 154 | engine_params = {"engine_type": engine_type, "model": args.model} 155 | 156 | # NEW! 157 | grounding_agent = LinuxACI() 158 | 159 | # NEW! 160 | agent = GraphSearchAgent( 161 | engine_params, 162 | grounding_agent, 163 | platform="linux", 164 | action_space="pyautogui", 165 | observation_type="mixed", 166 | search_engine="Perplexica", 167 | memory_root_path=os.getcwd(), 168 | memory_folder_name=args.kb_name, 169 | kb_release_tag="v0.2.2", 170 | ) 171 | 172 | env = DesktopEnv( 173 | path_to_vm=args.path_to_vm, 174 | action_space=agent.action_space, 175 | screen_size=(args.screen_width, args.screen_height), 176 | headless=args.headless, 177 | os_type="Ubuntu", 178 | require_a11y_tree=args.observation_type 179 | in ["a11y_tree", "screenshot_a11y_tree", "som"], 180 | ) 181 | 182 | for domain in tqdm(test_all_meta, desc="Domain"): 183 | for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False): 184 | config_file = os.path.join( 185 | args.test_config_base_dir, f"examples/{domain}/{example_id}.json" 186 | ) 187 | with open(config_file, "r", encoding="utf-8") as f: 188 | example = json.load(f) 189 | 190 | logger.info(f"[Domain]: {domain}") 191 | logger.info(f"[Example ID]: {example_id}") 192 | 193 | instruction = example["instruction"] 194 | 195 | logger.info(f"[Instruction]: {instruction}") 196 | # wandb each example config settings 197 | cfg_args["instruction"] = instruction 198 | cfg_args["start_time"] = datetime.datetime.now().strftime( 199 | "%Y:%m:%d-%H:%M:%S" 200 | ) 201 | # run.config.update(cfg_args) 202 | 203 | example_result_dir = os.path.join( 204 | args.result_dir, 205 | args.action_space, 206 | args.observation_type, 207 | args.model, 208 | domain, 209 | example_id, 210 | ) 211 | os.makedirs(example_result_dir, exist_ok=True) 212 | # example start running 213 | try: 214 | lib_run_single.run_single_example( 215 | agent, 216 | env, 217 | example, 218 | max_steps, 219 | instruction, 220 | args, 221 | example_result_dir, 222 | scores, 223 | ) 224 | except Exception as e: 225 | logger.error(f"Exception in {domain}/{example_id}: {e}") 226 | 
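# Even when an example crashes we still stop the screen recording and append an
# error entry to traj.jsonl below (note: the entry always reads "Time limit
# exceeded", regardless of the actual exception that was raised).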
env.controller.end_recording( 227 | os.path.join(example_result_dir, "recording.mp4") 228 | ) 229 | with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: 230 | f.write( 231 | json.dumps( 232 | {"Error": f"Time limit exceeded in {domain}/{example_id}"} 233 | ) 234 | ) 235 | f.write("\n") 236 | 237 | env.close() 238 | logger.info(f"Average score: {sum(scores) / len(scores)}") 239 | 240 | 241 | def get_unfinished( 242 | action_space, use_model, observation_type, result_dir, total_file_json 243 | ): 244 | target_dir = os.path.join(result_dir, action_space, observation_type, use_model) 245 | 246 | if not os.path.exists(target_dir): 247 | return total_file_json 248 | 249 | finished = {} 250 | for domain in os.listdir(target_dir): 251 | finished[domain] = [] 252 | domain_path = os.path.join(target_dir, domain) 253 | if os.path.isdir(domain_path): 254 | for example_id in os.listdir(domain_path): 255 | if example_id == "onboard": 256 | continue 257 | example_path = os.path.join(domain_path, example_id) 258 | if os.path.isdir(example_path): 259 | if "result.txt" not in os.listdir(example_path): 260 | # empty all files under example_id 261 | for file in os.listdir(example_path): 262 | os.remove(os.path.join(example_path, file)) 263 | else: 264 | finished[domain].append(example_id) 265 | 266 | if not finished: 267 | return total_file_json 268 | 269 | for domain, examples in finished.items(): 270 | if domain in total_file_json: 271 | total_file_json[domain] = [ 272 | x for x in total_file_json[domain] if x not in examples 273 | ] 274 | 275 | return total_file_json 276 | 277 | 278 | def get_result(action_space, use_model, observation_type, result_dir, total_file_json): 279 | target_dir = os.path.join(result_dir, action_space, observation_type, use_model) 280 | if not os.path.exists(target_dir): 281 | print("New experiment, no result yet.") 282 | return None 283 | 284 | all_result = [] 285 | 286 | for domain in os.listdir(target_dir): 287 | domain_path = os.path.join(target_dir, domain) 288 | if os.path.isdir(domain_path): 289 | for example_id in os.listdir(domain_path): 290 | example_path = os.path.join(domain_path, example_id) 291 | if os.path.isdir(example_path): 292 | if "result.txt" in os.listdir(example_path): 293 | # empty all files under example_id 294 | try: 295 | all_result.append( 296 | float( 297 | open( 298 | os.path.join(example_path, "result.txt"), "r" 299 | ).read() 300 | ) 301 | ) 302 | except: 303 | all_result.append(0.0) 304 | 305 | if not all_result: 306 | print("New experiment, no result yet.") 307 | return None 308 | else: 309 | print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%") 310 | return all_result 311 | 312 | 313 | if __name__ == "__main__": 314 | ####### The complete version of the list of examples ####### 315 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 316 | args = config() 317 | 318 | with open(args.test_all_meta_path, "r", encoding="utf-8") as f: 319 | test_all_meta = json.load(f) 320 | 321 | if args.domain != "all": 322 | test_all_meta = {args.domain: test_all_meta[args.domain]} 323 | 324 | test_file_list = get_unfinished( 325 | args.action_space, 326 | args.model, 327 | args.observation_type, 328 | args.result_dir, 329 | test_all_meta, 330 | ) 331 | left_info = "" 332 | for domain in test_file_list: 333 | left_info += f"{domain}: {len(test_file_list[domain])}\n" 334 | logger.info(f"Left tasks:\n{left_info}") 335 | 336 | get_result( 337 | args.action_space, 338 | args.model, 339 | args.observation_type, 340 | args.result_dir, 
341 | test_all_meta, 342 | ) 343 | test(args, test_file_list) 344 | -------------------------------------------------------------------------------- /osworld_setup/s2/OSWorld.md: -------------------------------------------------------------------------------- 1 | # Deplying Agent S2 in OSWorld 2 | 3 | # Step 1: Set up Agent S2 4 | 5 | Follow the [README.md](https://github.com/simular-ai/Agent-S/blob/main/README.md) to set up Agent S2. 6 | 7 | # Step 2: Copying Over Run Files 8 | 9 | If you haven't already, please follow the [OSWorld environment setup](https://github.com/xlang-ai/OSWorld/blob/main/README.md). We've provided the relevant OSWorld run files for evaluation in this `osworld_setup` folder. Please copy this over to your OSWorld folder. 10 | 11 | # Best Practices 12 | 13 | At this point, you will have set up the Agent S2, the OSWorld environment, and the VMWare Workstation Pro application set up. Below, we'll list some best practices, and common problems and their fixes. 14 | 15 | --- 16 | 17 | ``` 18 | from desktop_env.desktop_env import DesktopEnv 19 | 20 | example = { 21 | "id": "94d95f96-9699-4208-98ba-3c3119edf9c2", 22 | "instruction": "I want to install Spotify on my current system. Could you please help me?", 23 | "config": [ 24 | { 25 | "type": "execute", 26 | "parameters": { 27 | "command": [ 28 | "python", 29 | "-c", 30 | "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" 31 | ] 32 | } 33 | } 34 | ], 35 | "evaluator": { 36 | "func": "check_include_exclude", 37 | "result": { 38 | "type": "vm_command_line", 39 | "command": "which spotify" 40 | }, 41 | "expected": { 42 | "type": "rule", 43 | "rules": { 44 | "include": ["spotify"], 45 | "exclude": ["not found"] 46 | } 47 | } 48 | } 49 | } 50 | 51 | env = DesktopEnv(action_space="pyautogui") 52 | 53 | obs = env.reset(task_config=example) 54 | obs, reward, done, info = env.step("pyautogui.rightClick()") 55 | ``` 56 | 57 | Note, this code is just for demonstrating how the OSWorld `DesktopEnv` is instantiated. If you're running OSWorld, this process is already part of their code base. The code above will boot up a VM and restart it. If, for whatever reason, running the starter code (or running OSWorld experiments) leads to an infinitely long run time, cancel out of the VM. 58 | You should then see: 59 | 60 | ``` 61 | parent/ 62 | OSWorld/ 63 | vmware_vm_data/ 64 | Ubuntu0/ 65 | *.lck 66 | *.vmem 67 | ... 68 | ... 69 | UbuntuX/ 70 | ``` 71 | 72 | If you happen to have any `*.lck` folder in your VM's folder, be sure to delete them. Every time you are powering on the VM from creating a new `DesktopEnv` instance, you need to 73 | delete the `*.lck` folders first. If your VM is already powered on, and your session (in a Jupyter Notebook, for example) crashes, you can keep the `*.lck` files and just re-instantiate the `DesktopEnv` instance. I'd also suggest using just a single VM (as a VM takes up a lot of space!). Also, be sure to shut down the VM when you've finished using it. Deleting the `*.lck` files should be done after every time you power off the VM (though it seems to not be an issue from testing). 74 | 75 | --- 76 | 77 | If even after rerunning the code and deleting the `*.lck` files don't work, then you should try passing in the `path_to_vm` explicitly to the `DesktopEnv` class. 78 | 79 | ``` 80 | env = DesktopEnv(action_space="pyautogui", headless=False, require_terminal=True, path_to_vm=) 81 | ``` 82 | 83 | Pass the absolute path to your VM's (Ubuntu0) `.vmx` file. 
This file is located here: 84 | 85 | 86 | ``` 87 | parent/ 88 | OSWorld/ 89 | vmware_vm_data/ 90 | Ubuntu0/ 91 | *.lck 92 | *.vmem 93 | ... 94 | *.vmx 95 | ... 96 | UbuntuX/ 97 | ``` 98 | 99 | 📌 **Note**: If you are testing on the `os` domain, there is an [issue](https://github.com/asweigart/pyautogui/issues/198#issuecomment-1465268536) with `pyautogui`. A *hacky* way to solve this is to, inside the VM, locate where the `pyautogui` module is installed and open the `__init__.py` located under the `pyautogui` folder and remove the "<" in the `set(...)` within the following function: 100 | 101 | ``` 102 | def isShiftCharacter(character): 103 | """ 104 | Returns True if the ``character`` is a keyboard key that would require the shift key to be held down, such as 105 | uppercase letters or the symbols on the keyboard's number row. 106 | """ 107 | # NOTE TODO - This will be different for non-qwerty keyboards. 108 | return character.isupper() or character in set('~!@#$%^&*()_+{}|:"<>?') 109 | ``` 110 | 111 | 📌 **Note**: If in case, your VM encounters an issue with "The root file system on requires a manual fsck", reset the VM to the previous snapshot. 112 | 113 | 📌 **Note**: OSWorld scripts will create the `DesktopEnv` instance which will create a VM for you with a specific snapshot (`snapshot_name` parameter in `DesktopEnv`). If you wish to create a new snapshot of the VM and use that for your experiments, be sure to specify the name of this snapshot where `DesktopEnv` is instantiated. 114 | 115 | With these changes, you should be able to get up and running with VMWare, DesktopEnv, and OSWorld! 😊 -------------------------------------------------------------------------------- /osworld_setup/s2/lib_run_single.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import logging 4 | import os 5 | import time 6 | from wrapt_timeout_decorator import * 7 | 8 | logger = logging.getLogger("desktopenv.experiment") 9 | 10 | 11 | def run_single_example( 12 | agent, env, example, max_steps, instruction, args, example_result_dir, scores 13 | ): 14 | runtime_logger = setup_logger(example, example_result_dir) 15 | agent.reset() 16 | env.reset(task_config=example) 17 | time.sleep(60) # Wait for the environment to be ready 18 | obs = env._get_obs() # Get the initial observation 19 | done = False 20 | step_idx = 0 21 | env.controller.start_recording() 22 | while not done and step_idx < max_steps: 23 | response, actions = agent.predict(instruction, obs) 24 | for action in actions: 25 | # Capture the timestamp before executing the action 26 | action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 27 | logger.info("Step %d: %s", step_idx + 1, action) 28 | obs, reward, done, info = env.step(action, args.sleep_after_execution) 29 | 30 | logger.info("Reward: %.2f", reward) 31 | logger.info("Done: %s", done) 32 | # Save screenshot and trajectory information 33 | with open( 34 | os.path.join( 35 | example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png" 36 | ), 37 | "wb", 38 | ) as _f: 39 | _f.write(obs["screenshot"]) 40 | with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: 41 | f.write( 42 | json.dumps( 43 | { 44 | "step_num": step_idx + 1, 45 | "action_timestamp": action_timestamp, 46 | "action": action, 47 | "reward": reward, 48 | "done": done, 49 | "info": info, 50 | "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png", 51 | } 52 | ) 53 | ) 54 | f.write("\n") 55 | if done: 56 | logger.info("The 
episode is done.") 57 | break 58 | step_idx += 1 59 | result = env.evaluate() 60 | logger.info("Result: %.2f", result) 61 | scores.append(result) 62 | with open( 63 | os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8" 64 | ) as f: 65 | f.write(f"{result}\n") 66 | env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) 67 | 68 | 69 | def setup_logger(example, example_result_dir): 70 | runtime_logger = logging.getLogger(f"desktopenv.example.{example['id']}") 71 | runtime_logger.setLevel(logging.DEBUG) 72 | runtime_logger.addHandler( 73 | logging.FileHandler(os.path.join(example_result_dir, "runtime.log")) 74 | ) 75 | return runtime_logger 76 | -------------------------------------------------------------------------------- /osworld_setup/s2/run.py: -------------------------------------------------------------------------------- 1 | """OSWorld's run.py with AgentS2.""" 2 | 3 | """Script to run end-to-end evaluation on the benchmark. 4 | Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py. 5 | """ 6 | 7 | import argparse 8 | import datetime 9 | import json 10 | import logging 11 | import os 12 | import sys 13 | 14 | from gui_agents.s2.agents.agent_s import AgentS2 15 | from gui_agents.s2.agents.grounding import OSWorldACI 16 | from tqdm import tqdm 17 | 18 | import lib_run_single 19 | from desktop_env.desktop_env import DesktopEnv 20 | 21 | 22 | # Logger Configs {{{ # 23 | logger = logging.getLogger() 24 | logger.setLevel(logging.DEBUG) 25 | 26 | datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 27 | 28 | file_handler = logging.FileHandler( 29 | os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" 30 | ) 31 | debug_handler = logging.FileHandler( 32 | os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" 33 | ) 34 | stdout_handler = logging.StreamHandler(sys.stdout) 35 | sdebug_handler = logging.FileHandler( 36 | os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8" 37 | ) 38 | 39 | file_handler.setLevel(logging.INFO) 40 | debug_handler.setLevel(logging.DEBUG) 41 | stdout_handler.setLevel(logging.INFO) 42 | sdebug_handler.setLevel(logging.DEBUG) 43 | 44 | formatter = logging.Formatter( 45 | fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" 46 | ) 47 | file_handler.setFormatter(formatter) 48 | debug_handler.setFormatter(formatter) 49 | stdout_handler.setFormatter(formatter) 50 | sdebug_handler.setFormatter(formatter) 51 | 52 | stdout_handler.addFilter(logging.Filter("desktopenv")) 53 | sdebug_handler.addFilter(logging.Filter("desktopenv")) 54 | 55 | logger.addHandler(file_handler) 56 | logger.addHandler(debug_handler) 57 | logger.addHandler(stdout_handler) 58 | logger.addHandler(sdebug_handler) 59 | # }}} Logger Configs # 60 | 61 | logger = logging.getLogger("desktopenv.experiment") 62 | 63 | 64 | def config() -> argparse.Namespace: 65 | parser = argparse.ArgumentParser( 66 | description="Run end-to-end evaluation on the benchmark" 67 | ) 68 | 69 | # environment config 70 | parser.add_argument("--path_to_vm", type=str, default=None) 71 | parser.add_argument( 72 | "--headless", action="store_true", help="Run in headless machine" 73 | ) 74 | parser.add_argument( 75 | "--action_space", type=str, default="pyautogui", help="Action type" 76 | ) 77 | parser.add_argument( 78 | "--observation_type", 79 | choices=["screenshot", "a11y_tree", 
"screenshot_a11y_tree", "som"], 80 | default="screenshot", 81 | help="Observation type", 82 | ) 83 | parser.add_argument("--screen_width", type=int, default=1920) 84 | parser.add_argument("--screen_height", type=int, default=1080) 85 | parser.add_argument("--sleep_after_execution", type=float, default=0.0) 86 | parser.add_argument("--max_steps", type=int, default=15) 87 | 88 | # agent config 89 | parser.add_argument("--max_trajectory_length", type=int, default=3) 90 | parser.add_argument( 91 | "--test_config_base_dir", type=str, default="evaluation_examples" 92 | ) 93 | 94 | # lm config 95 | parser.add_argument("--model_provider", type=str, default="openai") 96 | parser.add_argument("--model", type=str, default="gpt-4o") 97 | parser.add_argument( 98 | "--model_url", 99 | type=str, 100 | default="", 101 | help="The URL of the main generation model API.", 102 | ) 103 | parser.add_argument( 104 | "--model_api_key", 105 | type=str, 106 | default="", 107 | help="The API key of the main generation model.", 108 | ) 109 | parser.add_argument("--temperature", type=float, default=1.0) 110 | parser.add_argument("--top_p", type=float, default=0.9) 111 | parser.add_argument("--max_tokens", type=int, default=1500) 112 | parser.add_argument("--stop_token", type=str, default=None) 113 | 114 | # example config 115 | parser.add_argument("--domain", type=str, default="all") 116 | parser.add_argument( 117 | "--test_all_meta_path", type=str, default="evaluation_examples/test_all.json" 118 | ) 119 | 120 | # logging related 121 | parser.add_argument("--result_dir", type=str, default="./results") 122 | 123 | # NEW! 124 | 125 | # Configuration 1 126 | parser.add_argument("--grounding_model_provider", type=str, default="anthropic") 127 | parser.add_argument( 128 | "--grounding_model", type=str, default="claude-3-7-sonnet-20250219" 129 | ) 130 | parser.add_argument( 131 | "--grounding_model_resize_width", 132 | type=int, 133 | default=1366, 134 | help="Width of screenshot image after processor rescaling", 135 | ) 136 | parser.add_argument( 137 | "--grounding_model_resize_height", 138 | type=int, 139 | default=None, 140 | help="Height of screenshot image after processor rescaling", 141 | ) 142 | 143 | # Configuration 2 144 | parser.add_argument("--endpoint_provider", type=str, default="") 145 | parser.add_argument("--endpoint_url", type=str, default="") 146 | parser.add_argument( 147 | "--endpoint_api_key", 148 | type=str, 149 | default="", 150 | help="The API key of the grounding model.", 151 | ) 152 | 153 | parser.add_argument("--kb_name", default="kb_s2", type=str) 154 | 155 | args = parser.parse_args() 156 | 157 | return args 158 | 159 | 160 | def test(args: argparse.Namespace, test_all_meta: dict) -> None: 161 | scores = [] 162 | max_steps = args.max_steps 163 | 164 | # log args 165 | logger.info("Args: %s", args) 166 | cfg_args = { 167 | "path_to_vm": args.path_to_vm, 168 | "headless": args.headless, 169 | "action_space": args.action_space, 170 | "observation_type": args.observation_type, 171 | "screen_width": args.screen_width, 172 | "screen_height": args.screen_height, 173 | "sleep_after_execution": args.sleep_after_execution, 174 | "max_steps": args.max_steps, 175 | "max_trajectory_length": args.max_trajectory_length, 176 | "model": args.model, 177 | "temperature": args.temperature, 178 | "top_p": args.top_p, 179 | "max_tokens": args.max_tokens, 180 | "stop_token": args.stop_token, 181 | "result_dir": args.result_dir, 182 | } 183 | 184 | # NEW! 
185 | engine_params = { 186 | "engine_type": args.model_provider, 187 | "model": args.model, 188 | "base_url": args.model_url, 189 | "api_key": args.model_api_key, 190 | } 191 | 192 | if args.endpoint_url: 193 | engine_params_for_grounding = { 194 | "engine_type": args.endpoint_provider, 195 | "base_url": args.endpoint_url, 196 | "api_key": args.endpoint_api_key, 197 | } 198 | else: 199 | grounding_height = args.grounding_model_resize_height 200 | # If not provided, use the aspect ratio of the screen to compute the height 201 | if grounding_height is None: 202 | grounding_height = ( 203 | args.screen_height 204 | * args.grounding_model_resize_width 205 | / args.screen_width 206 | ) 207 | 208 | engine_params_for_grounding = { 209 | "engine_type": args.grounding_model_provider, 210 | "model": args.grounding_model, 211 | "grounding_width": args.grounding_model_resize_width, 212 | "grounding_height": grounding_height, 213 | } 214 | 215 | # NEW! 216 | grounding_agent = OSWorldACI( 217 | platform="linux", 218 | engine_params_for_generation=engine_params, 219 | engine_params_for_grounding=engine_params_for_grounding, 220 | width=args.screen_width, 221 | height=args.screen_height, 222 | ) 223 | 224 | # NEW! 225 | agent = AgentS2( 226 | engine_params, 227 | grounding_agent, 228 | platform="linux", 229 | action_space="pyautogui", 230 | observation_type="mixed", 231 | search_engine="Perplexica", 232 | memory_root_path=os.getcwd(), 233 | memory_folder_name=args.kb_name, 234 | kb_release_tag="v0.2.2", 235 | embedding_engine_type="openai", 236 | ) 237 | 238 | env = DesktopEnv( 239 | path_to_vm=args.path_to_vm, 240 | action_space=agent.action_space, 241 | screen_size=(args.screen_width, args.screen_height), 242 | headless=args.headless, 243 | require_a11y_tree=args.observation_type 244 | in ["a11y_tree", "screenshot_a11y_tree", "som"], 245 | ) 246 | 247 | for domain in tqdm(test_all_meta, desc="Domain"): 248 | for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False): 249 | config_file = os.path.join( 250 | args.test_config_base_dir, f"examples/{domain}/{example_id}.json" 251 | ) 252 | with open(config_file, "r", encoding="utf-8") as f: 253 | example = json.load(f) 254 | 255 | logger.info(f"[Domain]: {domain}") 256 | logger.info(f"[Example ID]: {example_id}") 257 | 258 | instruction = example["instruction"] 259 | 260 | logger.info(f"[Instruction]: {instruction}") 261 | # wandb each example config settings 262 | cfg_args["instruction"] = instruction 263 | cfg_args["start_time"] = datetime.datetime.now().strftime( 264 | "%Y:%m:%d-%H:%M:%S" 265 | ) 266 | 267 | example_result_dir = os.path.join( 268 | args.result_dir, 269 | args.action_space, 270 | args.observation_type, 271 | args.model, 272 | domain, 273 | example_id, 274 | ) 275 | os.makedirs(example_result_dir, exist_ok=True) 276 | # example start running 277 | try: 278 | lib_run_single.run_single_example( 279 | agent, 280 | env, 281 | example, 282 | max_steps, 283 | instruction, 284 | args, 285 | example_result_dir, 286 | scores, 287 | ) 288 | except Exception as e: 289 | logger.error(f"Exception in {domain}/{example_id}: {e}") 290 | env.controller.end_recording( 291 | os.path.join(example_result_dir, "recording.mp4") 292 | ) 293 | with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: 294 | f.write( 295 | json.dumps( 296 | {"Error": f"Time limit exceeded in {domain}/{example_id}"} 297 | ) 298 | ) 299 | f.write("\n") 300 | 301 | env.close() 302 | logger.info(f"Average score: {sum(scores) / len(scores)}") 303 | 304 | 305 | 
def get_unfinished( 306 | action_space, use_model, observation_type, result_dir, total_file_json 307 | ): 308 | target_dir = os.path.join(result_dir, action_space, observation_type, use_model) 309 | 310 | if not os.path.exists(target_dir): 311 | return total_file_json 312 | 313 | finished = {} 314 | for domain in os.listdir(target_dir): 315 | finished[domain] = [] 316 | domain_path = os.path.join(target_dir, domain) 317 | if os.path.isdir(domain_path): 318 | for example_id in os.listdir(domain_path): 319 | if example_id == "onboard": 320 | continue 321 | example_path = os.path.join(domain_path, example_id) 322 | if os.path.isdir(example_path): 323 | if "result.txt" not in os.listdir(example_path): 324 | # empty all files under example_id 325 | for file in os.listdir(example_path): 326 | os.remove(os.path.join(example_path, file)) 327 | else: 328 | finished[domain].append(example_id) 329 | 330 | if not finished: 331 | return total_file_json 332 | 333 | for domain, examples in finished.items(): 334 | if domain in total_file_json: 335 | total_file_json[domain] = [ 336 | x for x in total_file_json[domain] if x not in examples 337 | ] 338 | 339 | return total_file_json 340 | 341 | 342 | def get_result(action_space, use_model, observation_type, result_dir, total_file_json): 343 | target_dir = os.path.join(result_dir, action_space, observation_type, use_model) 344 | if not os.path.exists(target_dir): 345 | print("New experiment, no result yet.") 346 | return None 347 | 348 | all_result = [] 349 | 350 | for domain in os.listdir(target_dir): 351 | domain_path = os.path.join(target_dir, domain) 352 | if os.path.isdir(domain_path): 353 | for example_id in os.listdir(domain_path): 354 | example_path = os.path.join(domain_path, example_id) 355 | if os.path.isdir(example_path): 356 | if "result.txt" in os.listdir(example_path): 357 | # empty all files under example_id 358 | try: 359 | all_result.append( 360 | float( 361 | open( 362 | os.path.join(example_path, "result.txt"), "r" 363 | ).read() 364 | ) 365 | ) 366 | except: 367 | all_result.append(0.0) 368 | 369 | if not all_result: 370 | print("New experiment, no result yet.") 371 | return None 372 | else: 373 | print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%") 374 | return all_result 375 | 376 | 377 | if __name__ == "__main__": 378 | ####### The complete version of the list of examples ####### 379 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 380 | args = config() 381 | 382 | with open(args.test_all_meta_path, "r", encoding="utf-8") as f: 383 | test_all_meta = json.load(f) 384 | 385 | if args.domain != "all": 386 | test_all_meta = {args.domain: test_all_meta[args.domain]} 387 | 388 | test_file_list = get_unfinished( 389 | args.action_space, 390 | args.model, 391 | args.observation_type, 392 | args.result_dir, 393 | test_all_meta, 394 | ) 395 | left_info = "" 396 | for domain in test_file_list: 397 | left_info += f"{domain}: {len(test_file_list[domain])}\n" 398 | logger.info(f"Left tasks:\n{left_info}") 399 | 400 | get_result( 401 | args.action_space, 402 | args.model, 403 | args.observation_type, 404 | args.result_dir, 405 | test_all_meta, 406 | ) 407 | test(args, test_file_list) 408 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | backoff 3 | pandas 4 | openai 5 | anthropic 6 | fastapi 7 | uvicorn 8 | paddleocr 9 | paddlepaddle 10 | together 11 | scikit-learn 12 | websockets 
13 | tiktoken 14 | pyautogui 15 | toml 16 | black 17 | pytesseract 18 | 19 | # Platform-specific dependencies 20 | pyobjc; platform_system == "Darwin" 21 | pywinauto; platform_system == "Windows" 22 | pywin32; platform_system == "Windows" -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import platform 4 | from fastapi import FastAPI, HTTPException 5 | from fastapi.responses import StreamingResponse 6 | from pydantic import BaseModel 7 | from gui_agents.s1.core.AgentS import GraphSearchAgent 8 | import io 9 | import pyautogui 10 | import time 11 | from threading import Event, Lock 12 | 13 | # Determine the operating system and select appropriate ACI 14 | current_platform = platform.system().lower() 15 | if current_platform == "linux": 16 | from gui_agents.s1.aci.LinuxOSACI import LinuxACI, UIElement 17 | 18 | grounding_agent = LinuxACI() 19 | elif current_platform == "darwin": 20 | from gui_agents.s1.aci.MacOSACI import MacOSACI, UIElement 21 | 22 | grounding_agent = MacOSACI() 23 | elif current_platform == "windows": 24 | from gui_agents.s1.aci.WindowsOSACI import WindowsACI, UIElement 25 | 26 | grounding_agent = WindowsACI() 27 | else: 28 | raise ValueError(f"Unsupported operating system: {current_platform}") 29 | 30 | app = FastAPI() 31 | 32 | # Add global lock and status tracking 33 | agent_lock = Lock() 34 | agent_status = {"is_running": False, "current_instruction": None, "start_time": None} 35 | 36 | # Add a stop event 37 | stop_event = Event() 38 | 39 | 40 | class InstructionData(BaseModel): 41 | screenshot: str 42 | accessibility_tree: str 43 | 44 | 45 | class CommandRequest(BaseModel): 46 | obs: InstructionData 47 | instruction: str 48 | 49 | 50 | class RunRequest(BaseModel): 51 | model: str 52 | instruction: str 53 | api_key: str | None = None 54 | 55 | 56 | async def stream_code(code: str): 57 | for line in code.splitlines(keepends=True): 58 | yield line 59 | await asyncio.sleep(0.1) 60 | 61 | 62 | def run_agent(agent: GraphSearchAgent, instruction: str): 63 | global stop_event 64 | stop_event.clear() # Reset the stop event 65 | obs = {} 66 | traj = "Task:\n" + instruction 67 | subtask_traj = "" 68 | for _ in range(15): 69 | # Check if stop was requested 70 | if stop_event.is_set(): 71 | print("Agent execution stopped by user") 72 | return 73 | 74 | print("iteration", _) 75 | 76 | obs["accessibility_tree"] = UIElement.systemWideElement() 77 | 78 | # Get screen shot using pyautogui. 79 | # Take a screenshot 80 | screenshot = pyautogui.screenshot() 81 | 82 | # Save the screenshot to a BytesIO object 83 | buffered = io.BytesIO() 84 | screenshot.save(buffered, format="PNG") 85 | 86 | # Get the byte value of the screenshot 87 | screenshot_bytes = buffered.getvalue() 88 | # Convert to base64 string. 
89 | obs["screenshot"] = screenshot_bytes 90 | 91 | # Get next action code from the agent 92 | info, code = agent.predict(instruction=instruction, observation=obs) 93 | 94 | if "done" in code[0].lower() or "fail" in code[0].lower(): 95 | if platform.system() == "Darwin": 96 | os.system( 97 | f'osascript -e \'display dialog "Task Completed" with title "OpenACI Agent" buttons "OK" default button "OK"\'' 98 | ) 99 | elif platform.system() == "Linux": 100 | os.system( 101 | f'zenity --info --title="OpenACI Agent" --text="Task Completed" --width=200 --height=100' 102 | ) 103 | 104 | agent.update_narrative_memory(traj) 105 | break 106 | 107 | if "next" in code[0].lower(): 108 | continue 109 | 110 | if "wait" in code[0].lower(): 111 | time.sleep(5) 112 | continue 113 | 114 | else: 115 | time.sleep(1.0) 116 | print("EXECUTING CODE:", code[0]) 117 | 118 | # Ask for permission before executing 119 | exec(code[0]) 120 | time.sleep(1.0) 121 | 122 | # Update task and subtask trajectories and optionally the episodic memory 123 | traj += ( 124 | "\n\nReflection:\n" 125 | + str(info["reflection"]) 126 | + "\n\n----------------------\n\nPlan:\n" 127 | + info["executor_plan"] 128 | ) 129 | subtask_traj = agent.update_episodic_memory(info, subtask_traj) 130 | 131 | 132 | @app.post("/run") 133 | async def run(request: RunRequest): 134 | global agent_status 135 | 136 | # Check if agent is already running 137 | if not agent_lock.acquire(blocking=False): 138 | raise HTTPException( 139 | status_code=409, 140 | detail="An agent is already running. Use /status to check current run or /stop to stop it.", 141 | ) 142 | 143 | try: 144 | agent_status = { 145 | "is_running": True, 146 | "current_instruction": request.instruction, 147 | "start_time": time.time(), 148 | "model": request.model, 149 | } 150 | 151 | if "gpt" in request.model: 152 | engine_type = "openai" 153 | elif "claude" in request.model: 154 | engine_type = "anthropic" 155 | 156 | engine_params = { 157 | "engine_type": engine_type, 158 | "model": request.model, 159 | "api_key": request.api_key, 160 | } 161 | 162 | print("engine_params", engine_params) 163 | 164 | agent = GraphSearchAgent( 165 | engine_params, 166 | grounding_agent, 167 | platform=current_platform, 168 | action_space="pyautogui", 169 | observation_type="mixed", 170 | ) 171 | 172 | agent.reset() 173 | print("start the agent") 174 | run_agent(agent, request.instruction) 175 | 176 | return {"status": "completed"} 177 | 178 | finally: 179 | agent_status = { 180 | "is_running": False, 181 | "current_instruction": None, 182 | "start_time": None, 183 | } 184 | agent_lock.release() 185 | 186 | 187 | @app.get("/status") 188 | async def get_status(): 189 | if agent_status["is_running"]: 190 | duration = time.time() - agent_status["start_time"] 191 | return { 192 | "status": "running", 193 | "instruction": agent_status["current_instruction"], 194 | "model": agent_status["model"], 195 | "running_for_seconds": round(duration, 2), 196 | } 197 | return {"status": "idle"} 198 | 199 | 200 | @app.post("/execute") 201 | async def execute_command_stream(cmd: CommandRequest): 202 | engine_params = { 203 | "engine_type": "openai", 204 | "model": "gpt-4o", 205 | } 206 | 207 | agent = GraphSearchAgent( 208 | engine_params, 209 | grounding_agent, 210 | platform=current_platform, 211 | action_space="pyautogui", 212 | observation_type="mixed", 213 | ) 214 | 215 | obs = { 216 | "screenshot": cmd.obs.screenshot, 217 | "accessibility_tree": cmd.obs.accessibility_tree, 218 | } 219 | instruction = cmd.instruction 220 | 
info, code = agent.predict(instruction=instruction, observation=obs) 221 | 222 | return StreamingResponse(stream_code(code), media_type="text/plain") 223 | 224 | 225 | @app.post("/stop") 226 | async def stop_agent(): 227 | if not agent_status["is_running"]: 228 | raise HTTPException(status_code=404, detail="No agent is currently running") 229 | 230 | global stop_event 231 | stop_event.set() 232 | return {"status": "stop signal sent"} 233 | 234 | 235 | import uvicorn 236 | 237 | if __name__ == "__main__": 238 | uvicorn.run( 239 | "server:app", 240 | host="0.0.0.0", # Allows external access 241 | port=8000, # Default port for FastAPI 242 | reload=True, # Auto-reload on code changes 243 | ) 244 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="gui-agents", 5 | version="0.2.5", 6 | description="A library for creating general purpose GUI agents using multimodal LLMs.", 7 | long_description=open("README.md", encoding="utf-8").read(), 8 | long_description_content_type="text/markdown", 9 | author="Simular AI", 10 | author_email="eric@simular.ai", 11 | packages=find_packages(), 12 | install_requires=[ 13 | "numpy", 14 | "backoff", 15 | "pandas", 16 | "openai", 17 | "anthropic", 18 | "fastapi", 19 | "uvicorn", 20 | "paddleocr", 21 | "paddlepaddle", 22 | "together", 23 | "scikit-learn", 24 | "websockets", 25 | "tiktoken", 26 | "selenium", 27 | 'pyobjc; platform_system == "Darwin"', 28 | "pyautogui", 29 | "toml", 30 | "pytesseract", 31 | "google-genai", 32 | 'pywinauto; platform_system == "Windows"', # Only for Windows 33 | 'pywin32; platform_system == "Windows"', # Only for Windows 34 | ], 35 | extras_require={"dev": ["black"]}, # Code formatter for linting 36 | entry_points={ 37 | "console_scripts": [ 38 | "agent_s1=gui_agents.s1.cli_app:main", 39 | "agent_s2=gui_agents.s2.cli_app:main", 40 | ], 41 | }, 42 | classifiers=[ 43 | "Programming Language :: Python :: 3", 44 | "Programming Language :: Python :: 3.9", 45 | "License :: OSI Approved :: Apache Software License", 46 | "Operating System :: Microsoft :: Windows", 47 | "Operating System :: POSIX :: Linux", 48 | "Operating System :: MacOS :: MacOS X", 49 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 50 | ], 51 | keywords="ai, llm, gui, agent, multimodal", 52 | project_urls={ 53 | "Source": "https://github.com/simular-ai/Agent-S", 54 | "Bug Reports": "https://github.com/simular-ai/Agent-S/issues", 55 | }, 56 | python_requires=">=3.9, <=3.12", 57 | ) 58 | -------------------------------------------------------------------------------- /tests/test_aci.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | import pytest 4 | 5 | from gui_agents.s1.aci.ACI import ACI, _normalize_key 6 | 7 | 8 | @pytest.fixture 9 | def aci(): 10 | return ACI(top_app_only=True, ocr=False) 11 | 12 | 13 | def test_normalize_key(): 14 | """Test key normalization""" 15 | assert _normalize_key("cmd") == "command" 16 | assert _normalize_key("ctrl") == "ctrl" 17 | assert _normalize_key("shift") == "shift" 18 | 19 | 20 | def test_hotkey_cmd_normalization(aci): 21 | """Test cmd normalization in hotkey command""" 22 | command = aci.hotkey(["cmd", "c"]) 23 | assert "command" in command 24 | assert "cmd" not in command 25 | 26 | 27 | def test_click_with_cmd_key(aci): 28 | """Test cmd 
normalization in click command""" 29 | aci.nodes = [{"position": (100, 200), "size": (50, 50)}] 30 | command = aci.click(0, hold_keys=["cmd"]) 31 | assert "command" in command 32 | assert "cmd" not in command 33 | 34 | 35 | def test_type_with_overwrite(aci): 36 | """Test type command with overwrite""" 37 | aci.nodes = [{"position": (100, 200), "size": (50, 50)}] 38 | command = aci.type(0, "test", overwrite=True) 39 | assert "command" in command or "ctrl" in command 40 | assert "backspace" in command 41 | -------------------------------------------------------------------------------- /tests/test_app_switching.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import pyautogui 4 | from AppKit import NSWorkspace 5 | 6 | from gui_agents.s1.aci.MacOSACI import MacOSACI 7 | 8 | agent = MacOSACI() 9 | 10 | 11 | def test_app_switching(): 12 | app_or_file_name = "Safari" 13 | 14 | exec(agent.switch_applications(app_or_file_name)) 15 | 16 | # Checking the frontmost application 17 | frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication().localizedName() 18 | print(frontmost_app) 19 | 20 | # Assert to confirm Safari is the frontmost application 21 | assert frontmost_app == "Safari", f"Expected Safari, but got {frontmost_app}" 22 | 23 | 24 | # Run the test 25 | test_app_switching() 26 | -------------------------------------------------------------------------------- /tests/test_uielement_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gui_agents.aci.UIElementBase import UIElementBase 4 | 5 | 6 | def test_uielement_base_is_abstract(): 7 | """Test that UIElementBase cannot be instantiated directly""" 8 | with pytest.raises(TypeError): 9 | UIElementBase() 10 | -------------------------------------------------------------------------------- /tests/test_uielement_linux.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | import pyatspi 4 | import pytest 5 | 6 | from gui_agents.aci.UIElementLinux import UIElement 7 | 8 | 9 | @pytest.fixture 10 | def mock_accessible(): 11 | mock = Mock() 12 | mock.name = "Test Window" 13 | mock.getRole.return_value = pyatspi.ROLE_WINDOW 14 | mock.getState.return_value.contains.return_value = True 15 | return mock 16 | 17 | 18 | @pytest.fixture 19 | def ui_element(mock_accessible): 20 | return UIElement(mock_accessible) 21 | 22 | 23 | def test_role(ui_element, mock_accessible): 24 | """Test role retrieval""" 25 | mock_accessible.getRoleName.return_value = "window" 26 | assert ui_element.role() == "window" 27 | 28 | 29 | def test_position(ui_element, mock_accessible): 30 | """Test position retrieval""" 31 | mock_accessible.getPosition.return_value = (100, 200) 32 | assert ui_element.position() == (100, 200) 33 | 34 | 35 | def test_size(ui_element, mock_accessible): 36 | """Test size retrieval""" 37 | mock_accessible.getSize.return_value = (300, 400) 38 | assert ui_element.size() == (300, 400) 39 | -------------------------------------------------------------------------------- /tests/test_uielement_macos.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from unittest.mock import Mock, patch 3 | 4 | import pytest 5 | 6 | from gui_agents.s1.aci.MacOSACI import UIElement 7 | 8 | 9 | @pytest.fixture 10 | def mock_ax_element(): 11 | mock_element = Mock() 12 | mock_element.__repr__ = lambda x: 
"x:100 y:200" 13 | return mock_element 14 | 15 | 16 | @pytest.fixture 17 | def mock_size_element(): 18 | mock_element = Mock() 19 | mock_element.__repr__ = lambda x: "w:300 h:400" 20 | return mock_element 21 | 22 | 23 | @pytest.fixture 24 | def ui_element(mock_ax_element): 25 | element = UIElement(mock_ax_element) 26 | return element 27 | 28 | 29 | def test_position_parsing(ui_element, mock_ax_element): 30 | """Test position parsing from AX element""" 31 | with patch.object(ui_element, "attribute", return_value=mock_ax_element): 32 | pos = ui_element.position() 33 | assert pos == (100.0, 200.0) 34 | 35 | 36 | def test_size_parsing(ui_element, mock_size_element): 37 | """Test size parsing from AX element""" 38 | with patch.object(ui_element, "attribute", return_value=mock_size_element): 39 | size = ui_element.size() 40 | assert size == (300.0, 400.0) 41 | 42 | 43 | def test_get_current_applications(obs: Dict): 44 | """Test getting list of current applications""" 45 | with patch("AppKit.NSWorkspace") as mock_workspace: 46 | mock_app = Mock() 47 | mock_app.activationPolicy.return_value = 0 48 | mock_app.localizedName.return_value = "TestApp" 49 | mock_workspace.sharedWorkspace.return_value.runningApplications.return_value = [ 50 | mock_app 51 | ] 52 | 53 | apps = UIElement.get_current_applications(obs) 54 | assert apps == ["TestApp"] 55 | -------------------------------------------------------------------------------- /tests/test_uielement_osworld.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | import pytest 4 | 5 | from gui_agents.aci.UIElementOSWorld import UIElement 6 | 7 | 8 | @pytest.fixture 9 | def sample_xml(): 10 | return """ 11 | 12 | 13 | 14 | 18 | 19 | 20 | 21 | """ 22 | 23 | 24 | @pytest.fixture 25 | def ui_element(sample_xml): 26 | tree = ET.ElementTree(ET.fromstring(sample_xml)) 27 | return UIElement(tree.getroot()) 28 | 29 | 30 | def test_nodeFromTree(sample_xml): 31 | """Test creating UIElement from XML string""" 32 | element = UIElement.nodeFromTree(sample_xml) 33 | assert element is not None 34 | assert isinstance(element, UIElement) 35 | 36 | 37 | def test_position(ui_element): 38 | """Test position extraction from XML""" 39 | button = ui_element.children()[0].children()[0] 40 | assert button.position() == (100, 200) 41 | 42 | 43 | def test_size(ui_element): 44 | """Test size extraction from XML""" 45 | button = ui_element.children()[0].children()[0] 46 | assert button.size() == (300, 400) 47 | --------------------------------------------------------------------------------