├── .github └── workflows │ └── lint.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── WAA_setup.pdf ├── evaluation_sets ├── test_all.json └── test_small_new.json ├── gui_agents ├── __init__.py ├── s1 │ ├── README.md │ ├── WindowsAgentArena.md │ ├── aci │ │ ├── ACI.py │ │ ├── LinuxOSACI.py │ │ ├── MacOSACI.py │ │ ├── WindowsOSACI.py │ │ ├── __init__.py │ │ └── windowsagentarena │ │ │ └── GroundingAgent.py │ ├── cli_app.py │ ├── core │ │ ├── AgentS.py │ │ ├── BaseModule.py │ │ ├── Knowledge.py │ │ ├── Manager.py │ │ ├── ProceduralMemory.py │ │ ├── Worker.py │ │ └── __init__.py │ ├── mllm │ │ ├── MultimodalAgent.py │ │ ├── MultimodalEngine.py │ │ └── __init__.py │ └── utils │ │ ├── __init__.py │ │ ├── common_utils.py │ │ ├── ocr_server.py │ │ └── query_perplexica.py ├── s2 │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── agent_s.py │ │ ├── grounding.py │ │ ├── manager.py │ │ └── worker.py │ ├── cli_app.py │ ├── core │ │ ├── __init__.py │ │ ├── engine.py │ │ ├── knowledge.py │ │ ├── mllm.py │ │ └── module.py │ ├── memory │ │ ├── __init__.py │ │ └── procedural_memory.py │ └── utils │ │ ├── __init__.py │ │ ├── common_utils.py │ │ └── query_perplexica.py └── utils.py ├── images ├── .DS_Store ├── agent_s.png ├── agent_s2_architecture.png ├── agent_s2_osworld_result.png ├── agent_s2_teaser.png ├── agent_s_architecture.pdf ├── osworld_result.png ├── results.pdf ├── results.png ├── teaser.png └── windows_result.png ├── models.md ├── osworld_setup ├── s1 │ ├── OSWorld.md │ ├── lib_run_single.py │ └── run.py └── s2 │ ├── OSWorld.md │ ├── lib_run_single.py │ └── run.py ├── requirements.txt ├── server.py ├── setup.py └── tests ├── test_aci.py ├── test_app_switching.py ├── test_uielement_base.py ├── test_uielement_linux.py ├── test_uielement_macos.py └── test_uielement_osworld.py /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | pull_request: 4 | types: [opened, reopened, synchronize] 5 | paths: 6 | - "gui_agents/**" 7 | - "tests/**" 8 | - ".github/workflows/lint.yml" 9 | push: 10 | branches: 11 | - main 12 | paths: 13 | - "gui_agents/**" 14 | - "tests/**" 15 | - ".github/workflows/lint.yml" 16 | 17 | env: 18 | SUPPORTED_PYTHON_VERSIONS: "3.11" 19 | 20 | jobs: 21 | build: 22 | runs-on: ubuntu-latest 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | python-version: ["3.10", "3.11"] 27 | steps: 28 | - uses: actions/checkout@v3 29 | 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v4 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | pip install -e .[dev] 39 | 40 | - name: Run Linter 41 | run: | 42 | black --check gui_agents tests 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so 
as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | logs/ 164 | .DS_Store -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Perplexica"] 2 | path = Perplexica 3 | url = https://github.com/ItzCrazyKns/Perplexica 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /WAA_setup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/WAA_setup.pdf -------------------------------------------------------------------------------- /evaluation_sets/test_small_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "os": [ 3 | "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", 4 | "5812b315-e7bd-4265-b51f-863c02174c28", 5 | "c288e301-e626-4b98-a1ab-159dcb162af5", 6 | "4783cc41-c03c-4e1b-89b4-50658f642bd5", 7 | "5c1075ca-bb34-46a3-a7a0-029bd7463e79", 8 | "5ced85fc-fa1a-4217-95fd-0fb530545ce2" 9 | ], 10 | "gimp": [ 11 | "a746add2-cab0-4740-ac36-c3769d9bfb46", 12 | "7a4deb26-d57d-4ea9-9a73-630f66a7b568", 13 | "d52d6308-ec58-42b7-a2c9-de80e4837b2b", 14 | "2a729ded-3296-423d-aec4-7dd55ed5fbb3", 15 | "d16c99dc-2a1e-46f2-b350-d97c86c85c15" 16 | ], 17 | "chrome": [ 18 | "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", 19 | "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", 20 | "35253b65-1c19-4304-8aa4-6884b8218fc0", 21 | "a96b564e-dbe9-42c3-9ccf-b4498073438a", 22 | "e1e75309-3ddb-4d09-92ec-de869c928143", 23 | "82bc8d6a-36eb-4d2d-8801-ef714fb1e55a" 24 | ], 25 | "thunderbird": [ 26 | "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", 27 | "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", 28 | "2ad9387a-65d8-4e33-ad5b-7580065a27ca", 29 | "480bcfea-d68f-4aaa-a0a9-2589ef319381", 30 | "030eeff7-b492-4218-b312-701ec99ee0cc" 31 | ], 32 | "vs_code": [ 33 | "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", 34 | "dcbe20e8-647f-4f1d-8696-f1c5bbb570e3", 35 | "9439a27b-18ae-42d8-9778-5f68f891805e", 36 | "7c4cc09e-7a92-40dd-8338-b2286535c4ed", 37 | "9d425400-e9b2-4424-9a4b-d4c7abac4140" 38 | ], 39 | "vlc": [ 40 | "59f21cfb-0120-4326-b255-a5b827b38967", 41 | "8f080098-ddb1-424c-b438-4e96e5e4786e", 42 | "5ac2891a-eacd-4954-b339-98abba077adb", 43 | "f3977615-2b45-4ac5-8bba-80c17dbe2a37", 44 | "215dfd39-f493-4bc3-a027-8a97d72c61bf" 45 | ], 46 | "libreoffice_calc": [ 47 | "357ef137-7eeb-4c80-a3bb-0951f26a8aff", 48 | "42e0a640-4f19-4b28-973d-729602b5a4a7", 49 | "abed40dc-063f-4598-8ba5-9fe749c0615d", 50 | "035f41ba-6653-43ab-aa63-c86d449d62e5", 51 | "7efeb4b1-3d19-4762-b163-63328d66303b" 52 | ], 53 | "libreoffice_impress": [ 54 | "5d901039-a89c-4bfb-967b-bf66f4df075e", 55 | "550ce7e7-747b-495f-b122-acdc4d0b8e54", 56 | "ac9bb6cb-1888-43ab-81e4-a98a547918cd", 57 | "2cd43775-7085-45d8-89fa-9e35c0a915cf", 58 | "358aa0a7-6677-453f-ae35-e440f004c31e", 59 | "a669ef01-ded5-4099-9ea9-25e99b569840" 60 | ], 61 | "libreoffice_writer": [ 62 | "0810415c-bde4-4443-9047-d5f70165a697", 63 | "e246f6d8-78d7-44ac-b668-fcf47946cb50", 64 | "d53ff5ee-3b1a-431e-b2be-30ed2673079b", 65 | "b21acd93-60fd-4127-8a43-2f5178f4a830", 66 | "0a0faba3-5580-44df-965d-f562a99b291c", 67 | "adf5e2c3-64c7-4644-b7b6-d2f0167927e7" 68 | ], 69 | "multi_apps": [ 70 | "a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a", 71 | "5990457f-2adb-467b-a4af-5c857c92d762", 72 | "2b9493d7-49b8-493a-a71b-56cd1f4d6908", 73 | "acb0f96b-e27c-44d8-b55f-7cb76609dfcd", 74 | "c867c42d-a52d-4a24-8ae3-f75d256b5618", 75 | "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", 76 | "b5062e3e-641c-4e3a-907b-ac864d2e7652", 77 | "48d05431-6cd5-4e76-82eb-12b60d823f7d", 78 | "eb303e01-261e-4972-8c07-c9b4e7a4922a", 79 | "d1acdb87-bb67-4f30-84aa-990e56a09c92", 80 | "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", 81 | "8e116af7-7db7-4e35-a68b-b0939c066c78", 82 | "716a6079-22da-47f1-ba73-c9d58f986a38", 83 | 
"46407397-a7d5-4c6b-92c6-dbe038b1457b", 84 | "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", 85 | "897e3b53-5d4d-444b-85cb-2cdc8a97d903" 86 | ] 87 | } 88 | -------------------------------------------------------------------------------- /gui_agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/README.md: -------------------------------------------------------------------------------- 1 |

2 | Agent S:
3 | Using Computers Like a Human
4 | 
5 | 
6 | 
7 | 🌐 [Website]
8 | 📄 [Paper]
9 | 🎥 [Video]
10 | 🗨️ [Discord]
11 | 

12 | 13 | ## 🥳 Updates 14 | - [x] **2025/01/22**: The [Agent S paper](https://arxiv.org/abs/2410.08164) is accepted to ICLR 2025! 15 | - [x] **2025/01/21**: Released v0.1.2 of [gui-agents](https://github.com/simular-ai/Agent-S) library, with support for Linux and Windows! 16 | - [x] **2024/12/05**: Released v0.1.0 of [gui-agents](https://github.com/simular-ai/Agent-S) library, allowing you to use Agent-S for Mac, OSWorld, and WindowsAgentArena with ease! 17 | - [x] **2024/10/10**: Released [Agent S paper](https://arxiv.org/abs/2410.08164) and codebase! 18 | 19 | ## Table of Contents 20 | 21 | 1. [💡 Introduction](#-introduction) 22 | 2. [🎯 Current Results](#-current-results) 23 | 3. [🛠️ Installation](#%EF%B8%8F-installation) 24 | 4. [🚀 Usage](#-usage) 25 | 5. [🙌 Contributors](#-contributors) 26 | 6. [💬 Citation](#-citation) 27 | 28 | ## 💡 Introduction 29 | 30 |

31 | 
32 | 

33 | 
34 | Welcome to **Agent S**, an open-source framework designed to enable autonomous interaction with computers through an Agent-Computer Interface. Our mission is to build intelligent GUI agents that can learn from past experiences and perform complex tasks autonomously on your computer.
35 | 
36 | Whether you're interested in AI, automation, or contributing to cutting-edge agent-based systems, we're excited to have you here!
37 | 
38 | ## 🎯 Current Results
39 | 
40 | 

41 | 
42 | 
43 | Success Rate (%) on the OSWorld full test set of all 369 test examples, using Image + Accessibility Tree input.
44 | 

45 | 46 | 47 | ## 🛠️ Installation & Setup 48 | 49 | > ❗**Warning**❗: If you are on a Linux machine, creating a `conda` environment will interfere with `pyatspi`. As of now, there's no clean solution for this issue. Proceed through the installation without using `conda` or any virtual environment. 50 | 51 | Clone the repository: 52 | ``` 53 | git clone https://github.com/simular-ai/Agent-S.git 54 | ``` 55 | 56 | Install the gui-agents package: 57 | ``` 58 | pip install gui-agents 59 | ``` 60 | 61 | Set your LLM API Keys and other environment variables. You can do this by adding the following line to your .bashrc (Linux), or .zshrc (MacOS) file. 62 | 63 | ``` 64 | export OPENAI_API_KEY= 65 | ``` 66 | 67 | Alternatively, you can set the environment variable in your Python script: 68 | 69 | ``` 70 | import os 71 | os.environ["OPENAI_API_KEY"] = "" 72 | ``` 73 | 74 | We also support Azure OpenAI, Anthropic, and vLLM inference. For more information refer to [../../models.md](models.md). 75 | 76 | ### Setup Retrieval from Web using Perplexica 77 | Agent S works best with web-knowledge retrieval. To enable this feature, you need to setup Perplexica: 78 | 79 | 1. Ensure Docker Desktop is installed and running on your system. 80 | 81 | 2. Navigate to the directory containing the project files. 82 | 83 | ```bash 84 | cd Perplexica 85 | git submodule update --init 86 | ``` 87 | 88 | 3. Rename the `sample.config.toml` file to `config.toml`. For Docker setups, you need only fill in the following fields: 89 | 90 | - `OPENAI`: Your OpenAI API key. **You only need to fill this if you wish to use OpenAI's models**. 91 | - `OLLAMA`: Your Ollama API URL. You should enter it as `http://host.docker.internal:PORT_NUMBER`. If you installed Ollama on port 11434, use `http://host.docker.internal:11434`. For other ports, adjust accordingly. **You need to fill this if you wish to use Ollama's models instead of OpenAI's**. 92 | - `GROQ`: Your Groq API key. **You only need to fill this if you wish to use Groq's hosted models**. 93 | - `ANTHROPIC`: Your Anthropic API key. **You only need to fill this if you wish to use Anthropic models**. 94 | 95 | **Note**: You can change these after starting Perplexica from the settings dialog. 96 | 97 | - `SIMILARITY_MEASURE`: The similarity measure to use (This is filled by default; you can leave it as is if you are unsure about it.) 98 | 99 | 4. Ensure you are in the directory containing the `docker-compose.yaml` file and execute: 100 | 101 | ```bash 102 | docker compose up -d 103 | ``` 104 | 105 | 5. Next, export your Perplexica URL. This URL is used to interact with the Perplexica API backend. The port is given by the `config.toml` in your Perplexica directory. 106 | 107 | ```bash 108 | export PERPLEXICA_URL=http://localhost:{port}/api/search 109 | ``` 110 | 111 | 6. Our implementation of Agent S incorporates the Perplexica API to integrate a search engine capability, which allows for a more convenient and responsive user experience. If you want to tailor the API to your settings and specific requirements, you may modify the URL and the message of request parameters in `agent_s/query_perplexica.py`. For a comprehensive guide on configuring the Perplexica API, please refer to [Perplexica Search API Documentation](https://github.com/ItzCrazyKns/Perplexica/blob/master/docs/API/SEARCH.md) 112 | 113 | For a more detailed setup and usage guide, please refer to the [Perplexica Repository](https://github.com/ItzCrazyKns/Perplexica.git). 
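For reference, the repository's `query_perplexica.py` helper essentially sends a JSON POST to the `PERPLEXICA_URL` endpoint exported above. The sketch below is illustrative only: the `focusMode` request field and the `message` field in the response are assumptions based on the Perplexica Search API documentation linked above, and may differ between Perplexica versions, so verify them against your installation.

```python
# Minimal sketch of a Perplexica search call (illustrative; the "focusMode"
# request field and the "message" response field are assumed from the
# Perplexica Search API docs and may vary between versions).
import os
import requests

def search_perplexica(query: str) -> str:
    # Uses the same endpoint exported earlier, e.g. http://localhost:{port}/api/search
    url = os.environ["PERPLEXICA_URL"]
    payload = {
        "focusMode": "webSearch",  # assumed default focus mode
        "query": query,
    }
    resp = requests.post(url, json=payload, timeout=60)
    resp.raise_for_status()
    # The response is expected to be JSON with a "message" field holding the answer.
    return resp.json().get("message", "")

if __name__ == "__main__":
    print(search_perplexica("How do I change the default font in LibreOffice Writer?"))
```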
114 | 115 | ### Setup Paddle-OCR Server 116 | 117 | Switch to a new terminal where you will run Agent S. Set the OCR_SERVER_ADDRESS environment variable as shown below. For a better experience, add the following line directly to your .bashrc (Linux), or .zshrc (MacOS) file. 118 | 119 | ``` 120 | export OCR_SERVER_ADDRESS=http://localhost:8000/ocr/ 121 | ``` 122 | 123 | Run the ocr_server.py file code to use OCR-based bounding boxes. 124 | 125 | ``` 126 | cd Agent-S 127 | python gui_agents/utils/ocr_server.py 128 | ``` 129 | 130 | You can change the server address by editing the address in [gui_agents/s1/utils/ocr_server.py](utils/ocr_server.py) file. 131 | 132 | 133 | > ❗**Warning**❗: The agent will directly run python code to control your computer. Please use with care. 134 | 135 | ## 🚀 Usage 136 | 137 | ### CLI 138 | 139 | Run agent_s on your computer using: 140 | ``` 141 | agent_s1 --model gpt-4o 142 | ``` 143 | This will show a user query prompt where you can enter your query and interact with Agent S. You can use any model from the list of supported models in [models.md](../../models.md). 144 | 145 | ### `gui_agents` SDK 146 | 147 | To deploy Agent S on MacOS or Windows: 148 | 149 | ``` 150 | import pyautogui 151 | import io 152 | from gui_agents.core.AgentS import GraphSearchAgent 153 | import platform 154 | 155 | if platform.system() == "Darwin": 156 | from gui_agents.aci.MacOSACI import MacOSACI, UIElement 157 | grounding_agent = MacOSACI() 158 | elif platform.system() == "Windows": 159 | from gui_agents.aci.WindowsOSACI import WindowsACI, UIElement 160 | grounding_agent = WindowsACI() 161 | elif platform.system() == "Linux": 162 | from gui_agents.aci.LinuxOSACI import LinuxACI, UIElement 163 | grounding_agent = LinuxACI() 164 | else: 165 | raise ValueError("Unsupported platform") 166 | 167 | engine_params = { 168 | "engine_type": "openai", 169 | "model": "gpt-4o", 170 | } 171 | 172 | agent = GraphSearchAgent( 173 | engine_params, 174 | grounding_agent, 175 | platform="ubuntu", # "macos", "windows" 176 | action_space="pyautogui", 177 | observation_type="mixed", 178 | search_engine="Perplexica" 179 | ) 180 | 181 | # Get screenshot. 182 | screenshot = pyautogui.screenshot() 183 | buffered = io.BytesIO() 184 | screenshot.save(buffered, format="PNG") 185 | screenshot_bytes = buffered.getvalue() 186 | 187 | # Get accessibility tree. 188 | acc_tree = UIElement.systemWideElement() 189 | 190 | obs = { 191 | "screenshot": screenshot_bytes, 192 | "accessibility_tree": acc_tree, 193 | } 194 | 195 | instruction = "Close VS Code" 196 | info, action = agent.predict(instruction=instruction, observation=obs) 197 | 198 | exec(action[0]) 199 | ``` 200 | 201 | Refer to `cli_app.py` for more details on how the inference loop works. 202 | 203 | #### Downloading the Knowledege Base 204 | 205 | Agent S2 uses a knowledge base that continually updates with new knowledge during inference. The knowledge base is initially downloaded when initializing `GraphSearchAgent`. The knowledge base is stored as assets under our [GitHub Releases](https://github.com/simular-ai/Agent-S/releases). The `GraphSearchAgent` initialization will only download the knowledge base for your specified platform and agent version (e.g s1, s2). 
If you'd like to download the knowledge base programmatically, you can use the following code: 206 | 207 | ``` 208 | download_kb_data( 209 | version="s2", 210 | release_tag="v0.2.2", 211 | download_dir="kb_data", 212 | platform="linux" # "darwin", "windows" 213 | ) 214 | ``` 215 | 216 | This will download Agent S2's knowledge base for Linux from release tag `v0.2.2` to the `kb_data` directory. Refer to our [GitHub Releases](https://github.com/simular-ai/Agent-S/releases) or release tags that include the knowledge bases. 217 | 218 | ### OSWorld 219 | 220 | To deploy Agent S in OSWorld, follow the [OSWorld Deployment instructions](OSWorld.md). 221 | 222 | ### WindowsAgentArena 223 | 224 | To deploy Agent S in WindowsAgentArena, follow the [WindowsAgentArena Deployment instructions](WindowsAgentArena.md). 225 | 226 | ## 🙌 Contributors 227 | 228 | We’re grateful to all the [amazing people](https://github.com/simular-ai/Agent-S/graphs/contributors) who have contributed to this project. Thank you! 🙏 229 | 230 | ## 💬 Citation 231 | ``` 232 | @misc{agashe2024agentsopenagentic, 233 | title={Agent S: An Open Agentic Framework that Uses Computers Like a Human}, 234 | author={Saaket Agashe and Jiuzhou Han and Shuyu Gan and Jiachen Yang and Ang Li and Xin Eric Wang}, 235 | year={2024}, 236 | eprint={2410.08164}, 237 | archivePrefix={arXiv}, 238 | primaryClass={cs.AI}, 239 | url={https://arxiv.org/abs/2410.08164}, 240 | } 241 | ``` 242 | 243 | -------------------------------------------------------------------------------- /gui_agents/s1/WindowsAgentArena.md: -------------------------------------------------------------------------------- 1 | ## Deploying Agent-S in WindowsAgentArena 2 | > ⚠️ **Warning**: The refactored code has not be fully tested on WindowsAgentArena. To reproduce the results on WindowsAgentArena, please use commit 496a9fa of this repository. 3 | 4 | 1. To use the Agent S with WindowsAgentArena, follows the setup instructions at: https://github.com/microsoft/WindowsAgentArena.git. **Please use the development mode while preparing the image and running the client as instructed in https://github.com/microsoft/WindowsAgentArena/blob/main/docs/Development-Tips.md.** 5 | 6 | 2. To deploy our agent in the WindowsAgentArena, copy the agent_s folder in this repository to `WindowsAgentArena/src/win-arena-container/client/mm_agents`. 7 | 8 | 3. Change the name of the GraphSearchAgent.py file to agent.py to conform to the WindowsAgentArena Setup. 9 | 10 | 4. Copy the ocr_server.py file to client/folder `WindowsAgentArena/src/win-arena-container/client` folder 11 | 12 | ``` 13 | cd WindowsAgentArena/src/win-arena-container/client 14 | cp mm_agents/agent_s/ocr_server.py . 15 | ``` 16 | 17 | 5. Update the `start_client.sh` file in `WindowsAgentArena/src/win-arena-container` by adding the following line before Running the agent on line 75. 18 | 19 | ``` 20 | python ocr_server.py & 21 | ``` 22 | 23 | 6. In the `src/win-arena-container/client/run.py` file import Agent S 24 | ``` 25 | from mm_agents.agent_s.agent import GraphSearchAgent 26 | ``` 27 | 28 | 7. 
In the `src/win-arena-container/client/run.py` file, instantiate Agent S by adding the following lines after line 187 where the if condition for NAVI agent ends 29 | 30 | ```python 31 | elif cfg_args["agent_name"] == "agent_s": 32 | if cfg_args["som_origin"] in ["a11y"]: 33 | som_config = None 34 | elif cfg_args["som_origin"] in ["oss", "mixed-oss"]: 35 | som_config = { 36 | "pipeline": ["webparse", "groundingdino", "ocr"], 37 | "groundingdino": { 38 | "prompts": ["icon", "image"] 39 | }, 40 | "ocr": { 41 | "class_name": "TesseractOCR" 42 | }, 43 | "webparse": { 44 | "cdp_url": f"http://{args.emulator_ip}:9222" 45 | } 46 | } 47 | if args.model.startswith("claude"): 48 | engine_type = "anthropic" 49 | elif args.model.startswith("gpt"): 50 | engine_type = "openai" 51 | else: 52 | engine_type = "vllm" 53 | 54 | engine_params = { 55 | "engine_type": engine_type, 56 | "model": args.model, 57 | } 58 | agent = GraphSearchAgent( 59 | engine_params=engine_params, 60 | experiment_type='windowsAgentArena', 61 | temperature=args.temperature 62 | ) 63 | ``` 64 | 65 | 8. Run Agent S on WindowsAgentArena by changing the following parameters in the `scripts/run-local.sh` file 66 | 67 | ``` 68 | agent="agent_s" 69 | model="gpt-4o" 70 | ``` -------------------------------------------------------------------------------- /gui_agents/s1/aci/ACI.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Dict, List 3 | 4 | logger = logging.getLogger("desktopenv.agent") 5 | 6 | 7 | def agent_action(func): 8 | func.is_agent_action = True 9 | return func 10 | 11 | 12 | class ACI: 13 | def __init__(self, top_app_only: bool = True, ocr: bool = False): 14 | self.top_app_only = top_app_only 15 | self.ocr = ocr 16 | self.index_out_of_range_flag = False 17 | self.notes: List[str] = [] 18 | self.clipboard = "" 19 | self.nodes: List[Any] = [] 20 | 21 | def get_active_apps(self, obs: Dict) -> List[str]: 22 | pass 23 | 24 | def get_top_app(self): 25 | pass 26 | 27 | def preserve_nodes(self, tree: Any, exclude_roles: set = None) -> List[Dict]: 28 | pass 29 | 30 | def linearize_and_annotate_tree( 31 | self, obs: Dict, show_all_elements: bool = False 32 | ) -> str: 33 | pass 34 | 35 | def find_element(self, element_id: int) -> Dict: 36 | pass 37 | -------------------------------------------------------------------------------- /gui_agents/s1/aci/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s1/aci/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/cli_app.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import io 4 | import logging 5 | import os 6 | import platform 7 | import sys 8 | import time 9 | 10 | import pyautogui 11 | 12 | from gui_agents.s1.core.AgentS import GraphSearchAgent, UIAgent 13 | 14 | current_platform = platform.system().lower() 15 | 16 | if current_platform == "darwin": 17 | from gui_agents.s1.aci.MacOSACI import MacOSACI, UIElement 18 | elif current_platform == "linux": 19 | from gui_agents.s1.aci.LinuxOSACI import LinuxACI, UIElement 20 | elif current_platform == "windows": 21 | from gui_agents.s1.aci.WindowsOSACI import WindowsACI, UIElement 22 | else: 23 | raise ValueError(f"Unsupported platform: {current_platform}") 24 | 25 | logger = 
logging.getLogger() 26 | logger.setLevel(logging.DEBUG) 27 | 28 | datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 29 | 30 | log_dir = "logs" 31 | os.makedirs(log_dir, exist_ok=True) 32 | 33 | file_handler = logging.FileHandler( 34 | os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" 35 | ) 36 | debug_handler = logging.FileHandler( 37 | os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" 38 | ) 39 | stdout_handler = logging.StreamHandler(sys.stdout) 40 | sdebug_handler = logging.FileHandler( 41 | os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8" 42 | ) 43 | 44 | file_handler.setLevel(logging.INFO) 45 | debug_handler.setLevel(logging.DEBUG) 46 | stdout_handler.setLevel(logging.INFO) 47 | sdebug_handler.setLevel(logging.DEBUG) 48 | 49 | formatter = logging.Formatter( 50 | fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" 51 | ) 52 | file_handler.setFormatter(formatter) 53 | debug_handler.setFormatter(formatter) 54 | stdout_handler.setFormatter(formatter) 55 | sdebug_handler.setFormatter(formatter) 56 | 57 | stdout_handler.addFilter(logging.Filter("desktopenv")) 58 | sdebug_handler.addFilter(logging.Filter("desktopenv")) 59 | 60 | logger.addHandler(file_handler) 61 | logger.addHandler(debug_handler) 62 | logger.addHandler(stdout_handler) 63 | logger.addHandler(sdebug_handler) 64 | 65 | platform_os = platform.system() 66 | 67 | 68 | def show_permission_dialog(code: str, action_description: str): 69 | """Show a platform-specific permission dialog and return True if approved.""" 70 | if platform.system() == "Darwin": 71 | result = os.system( 72 | f'osascript -e \'display dialog "Do you want to execute this action?\n\n{code} which will try to {action_description}" with title "Action Permission" buttons {{"Cancel", "OK"}} default button "OK" cancel button "Cancel"\'' 73 | ) 74 | return result == 0 75 | elif platform.system() == "Linux": 76 | result = os.system( 77 | f'zenity --question --title="Action Permission" --text="Do you want to execute this action?\n\n{code}" --width=400 --height=200' 78 | ) 79 | return result == 0 80 | return False 81 | 82 | 83 | def run_agent(agent: UIAgent, instruction: str): 84 | obs = {} 85 | traj = "Task:\n" + instruction 86 | subtask_traj = "" 87 | for _ in range(15): 88 | obs["accessibility_tree"] = UIElement.systemWideElement() 89 | 90 | # Get screen shot using pyautogui. 91 | # Take a screenshot 92 | screenshot = pyautogui.screenshot() 93 | 94 | # Save the screenshot to a BytesIO object 95 | buffered = io.BytesIO() 96 | screenshot.save(buffered, format="PNG") 97 | 98 | # Get the byte value of the screenshot 99 | screenshot_bytes = buffered.getvalue() 100 | # Convert to base64 string. 
101 | obs["screenshot"] = screenshot_bytes 102 | 103 | # Get next action code from the agent 104 | info, code = agent.predict(instruction=instruction, observation=obs) 105 | 106 | if "done" in code[0].lower() or "fail" in code[0].lower(): 107 | if platform.system() == "Darwin": 108 | os.system( 109 | f'osascript -e \'display dialog "Task Completed" with title "OpenACI Agent" buttons "OK" default button "OK"\'' 110 | ) 111 | elif platform.system() == "Linux": 112 | os.system( 113 | f'zenity --info --title="OpenACI Agent" --text="Task Completed" --width=200 --height=100' 114 | ) 115 | 116 | agent.update_narrative_memory(traj) 117 | break 118 | 119 | if "next" in code[0].lower(): 120 | continue 121 | 122 | if "wait" in code[0].lower(): 123 | time.sleep(5) 124 | continue 125 | 126 | else: 127 | time.sleep(1.0) 128 | print("EXECUTING CODE:", code[0]) 129 | 130 | # Ask for permission before executing 131 | exec(code[0]) 132 | time.sleep(1.0) 133 | 134 | # Update task and subtask trajectories and optionally the episodic memory 135 | traj += ( 136 | "\n\nReflection:\n" 137 | + str(info["reflection"]) 138 | + "\n\n----------------------\n\nPlan:\n" 139 | + info["executor_plan"] 140 | ) 141 | subtask_traj = agent.update_episodic_memory(info, subtask_traj) 142 | 143 | 144 | def main(): 145 | parser = argparse.ArgumentParser( 146 | description="Run GraphSearchAgent with specified model." 147 | ) 148 | parser.add_argument( 149 | "--model", 150 | type=str, 151 | default="gpt-4o-mini", 152 | help="Specify the model to use (e.g., gpt-4o)", 153 | ) 154 | args = parser.parse_args() 155 | 156 | if current_platform == "Darwin": 157 | grounding_agent = MacOSACI() 158 | elif current_platform == "Windows": 159 | grounding_agent = WindowsACI() 160 | elif current_platform == "Linux": 161 | grounding_agent = LinuxACI() 162 | else: 163 | raise ValueError("Unsupported platform") 164 | 165 | while True: 166 | query = input("Query: ") 167 | if "gpt" in args.model: 168 | engine_type = "openai" 169 | elif "claude" in args.model: 170 | engine_type = "anthropic" 171 | engine_params = { 172 | "engine_type": engine_type, 173 | "model": args.model, 174 | } 175 | 176 | agent = GraphSearchAgent( 177 | engine_params, 178 | grounding_agent, 179 | platform=current_platform, 180 | action_space="pyautogui", 181 | observation_type="mixed", 182 | ) 183 | 184 | agent.reset() 185 | 186 | # Run the agent on your own device 187 | run_agent(agent, query) 188 | 189 | response = input("Would you like to provide another query? 
(y/n): ") 190 | if response.lower() != "y": 191 | break 192 | 193 | 194 | if __name__ == "__main__": 195 | main() 196 | -------------------------------------------------------------------------------- /gui_agents/s1/core/BaseModule.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from gui_agents.s1.mllm.MultimodalAgent import LMMAgent 4 | 5 | 6 | class BaseModule: 7 | def __init__(self, engine_params: Dict, platform: str): 8 | self.engine_params = engine_params 9 | self.platform = platform 10 | 11 | def _create_agent( 12 | self, system_prompt: str = None, engine_params: Optional[Dict] = None 13 | ) -> LMMAgent: 14 | """Create a new LMMAgent instance""" 15 | agent = LMMAgent(engine_params or self.engine_params) 16 | if system_prompt: 17 | agent.add_system_prompt(system_prompt) 18 | return agent 19 | -------------------------------------------------------------------------------- /gui_agents/s1/core/Knowledge.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Dict, Tuple 4 | 5 | import numpy as np 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | from gui_agents.s1.core.BaseModule import BaseModule 9 | from gui_agents.s1.core.ProceduralMemory import PROCEDURAL_MEMORY 10 | from gui_agents.s1.mllm.MultimodalEngine import OpenAIEmbeddingEngine 11 | from gui_agents.s1.utils.common_utils import ( 12 | load_embeddings, 13 | load_knowledge_base, 14 | save_embeddings, 15 | ) 16 | from gui_agents.s1.utils.query_perplexica import query_to_perplexica 17 | 18 | 19 | class KnowledgeBase(BaseModule): 20 | def __init__( 21 | self, 22 | local_kb_path: str, 23 | platform: str, 24 | engine_params: Dict, 25 | use_image_for_search: bool = False, 26 | ): 27 | super().__init__(engine_params, platform) 28 | 29 | self.local_kb_path = local_kb_path 30 | 31 | # initialize embedding engine 32 | # TODO: Support other embedding engines 33 | self.embedding_engine = OpenAIEmbeddingEngine( 34 | api_key=( 35 | engine_params["api_key"] 36 | if "api_key" in engine_params 37 | else os.getenv("OPENAI_API_KEY") 38 | ) 39 | ) 40 | 41 | # Initialize paths for different memory types 42 | self.episodic_memory_path = os.path.join( 43 | self.local_kb_path, self.platform, "episodic_memory.json" 44 | ) 45 | self.narrative_memory_path = os.path.join( 46 | self.local_kb_path, self.platform, "narrative_memory.json" 47 | ) 48 | self.embeddings_path = os.path.join( 49 | self.local_kb_path, self.platform, "embeddings.pkl" 50 | ) 51 | 52 | self.rag_module_system_prompt = PROCEDURAL_MEMORY.RAG_AGENT.replace( 53 | "CURRENT_OS", self.platform 54 | ) 55 | 56 | # All three agent share a generic RAG prompt that ask agent to provide information for UI automation in CURRENT_OS 57 | self.query_formulator = self._create_agent(self.rag_module_system_prompt) 58 | self.llm_search_agent = self._create_agent(self.rag_module_system_prompt) 59 | self.knowledge_fusion_agent = self._create_agent(self.rag_module_system_prompt) 60 | 61 | self.use_image_for_search = use_image_for_search 62 | 63 | def retrieve_knowledge( 64 | self, instruction: str, search_query: str, search_engine: str = "llm" 65 | ) -> Tuple[str, str]: 66 | """Retrieve knowledge using search engine 67 | Args: 68 | instruction (str): task instruction 69 | observation (Dict): current observation 70 | search_engine (str): search engine to use""" 71 | 72 | # Use search engine to retrieve knowledge based on the 
formulated query 73 | search_results = self._search(instruction, search_query, search_engine) 74 | 75 | return search_query, search_results 76 | 77 | def formulate_query(self, instruction: str, observation: Dict) -> str: 78 | """Formulate search query based on instruction and current state""" 79 | query_path = os.path.join( 80 | self.local_kb_path, self.platform, "formulate_query.json" 81 | ) 82 | try: 83 | with open(query_path, "r") as f: 84 | formulate_query = json.load(f) 85 | except: 86 | formulate_query = {} 87 | 88 | if instruction in formulate_query: 89 | return formulate_query[instruction] 90 | 91 | self.query_formulator.add_message( 92 | f"The task is: {instruction}\n" 93 | f"Accessibility tree of the current desktop UI state: {observation['linearized_accessibility_tree']}\n" 94 | "To use google search to get some useful information, first carefully analyze " 95 | "the accessibility tree of the current desktop UI state, then given the task " 96 | "instruction, formulate a question that can be used to search on the Internet " 97 | "for information in helping with the task execution.\n" 98 | "The question should not be too general or too specific. Please ONLY provide " 99 | "the question.\nQuestion:", 100 | image_content=( 101 | observation["screenshot"] 102 | if self.use_image_for_search and "screenshot" in observation 103 | else None 104 | ), 105 | ) 106 | 107 | search_query = self.query_formulator.get_response().strip().replace('"', "") 108 | print("search query: ", search_query) 109 | formulate_query[instruction] = search_query 110 | with open(query_path, "w") as f: 111 | json.dump(formulate_query, f, indent=2) 112 | 113 | return search_query 114 | 115 | def _search(self, instruction: str, search_query: str, search_engine: str) -> str: 116 | """Execute search using specified engine""" 117 | 118 | # Default to perplexica rag knowledge to see if the query exists 119 | file = os.path.join( 120 | self.local_kb_path, self.platform, f"{search_engine}_rag_knowledge.json" 121 | ) 122 | 123 | try: 124 | with open(file, "r") as f: 125 | exist_search_results = json.load(f) 126 | except: 127 | exist_search_results = {} 128 | 129 | if instruction in exist_search_results: 130 | return exist_search_results[instruction] 131 | if search_engine.lower() == "llm": 132 | # Use LLM's internal knowledge like a search engine 133 | self.llm_search_agent.add_message(search_query) 134 | search_results = self.llm_search_agent.get_response() 135 | elif search_engine.lower() == "perplexica": 136 | # Use perplexica to search for the query 137 | search_results = query_to_perplexica(search_query) 138 | else: 139 | raise ValueError(f"Unsupported search engine: {search_engine}") 140 | 141 | exist_search_results[instruction] = search_results.strip() 142 | with open( 143 | os.path.join( 144 | self.local_kb_path, 145 | self.platform, 146 | f"{search_engine}_rag_knowledge.json", 147 | ), 148 | "w", 149 | ) as f: 150 | json.dump(exist_search_results, f, indent=2) 151 | 152 | return search_results 153 | 154 | def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str]: 155 | """Retrieve narrative experience using embeddings""" 156 | knowledge_base = load_knowledge_base(self.narrative_memory_path) 157 | if not knowledge_base: 158 | return "None", "None" 159 | 160 | embeddings = load_embeddings(self.embeddings_path) 161 | 162 | # Get or create instruction embedding 163 | instruction_embedding = embeddings.get(instruction) 164 | 165 | if instruction_embedding is None: 166 | instruction_embedding = 
self.embedding_engine.get_embeddings(instruction) 167 | embeddings[instruction] = instruction_embedding 168 | 169 | # Get or create embeddings for knowledge base entries 170 | candidate_embeddings = [] 171 | for key in knowledge_base: 172 | candidate_embedding = embeddings.get(key) 173 | if candidate_embedding is None: 174 | candidate_embedding = self.embedding_engine.get_embeddings(key) 175 | embeddings[key] = candidate_embedding 176 | 177 | candidate_embeddings.append(candidate_embedding) 178 | 179 | save_embeddings(self.embeddings_path, embeddings) 180 | 181 | similarities = cosine_similarity( 182 | instruction_embedding, np.vstack(candidate_embeddings) 183 | )[0] 184 | sorted_indices = np.argsort(similarities)[::-1] 185 | 186 | keys = list(knowledge_base.keys()) 187 | idx = 1 if keys[sorted_indices[0]] == instruction else 0 188 | return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]] 189 | 190 | def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str]: 191 | """Retrieve similar task experience using embeddings""" 192 | knowledge_base = load_knowledge_base(self.episodic_memory_path) 193 | if not knowledge_base: 194 | return "None", "None" 195 | 196 | embeddings = load_embeddings(self.embeddings_path) 197 | 198 | # Get or create instruction embedding 199 | instruction_embedding = embeddings.get(instruction) 200 | 201 | if instruction_embedding is None: 202 | instruction_embedding = self.embedding_engine.get_embeddings(instruction) 203 | embeddings[instruction] = instruction_embedding 204 | 205 | # Get or create embeddings for knowledge base entries 206 | candidate_embeddings = [] 207 | for key in knowledge_base: 208 | candidate_embedding = embeddings.get(key) 209 | if candidate_embedding is None: 210 | candidate_embedding = self.embedding_engine.get_embeddings(key) 211 | embeddings[key] = candidate_embedding 212 | 213 | candidate_embeddings.append(candidate_embedding) 214 | 215 | save_embeddings(self.embeddings_path, embeddings) 216 | 217 | similarities = cosine_similarity( 218 | instruction_embedding, np.vstack(candidate_embeddings) 219 | )[0] 220 | sorted_indices = np.argsort(similarities)[::-1] 221 | 222 | keys = list(knowledge_base.keys()) 223 | idx = 1 if keys[sorted_indices[0]] == instruction else 0 224 | return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]] 225 | 226 | def knowledge_fusion( 227 | self, 228 | observation: Dict, 229 | instruction: str, 230 | web_knowledge: str, 231 | similar_task: str, 232 | experience: str, 233 | ) -> str: 234 | """Combine web knowledge with similar task experience""" 235 | self.knowledge_fusion_agent.add_message( 236 | f"Task: {instruction}\n" 237 | f"Accessibility tree of the current desktop UI state: {observation['linearized_accessibility_tree']}\n" 238 | f"**Web search result**:\n{web_knowledge}\n\n" 239 | f"**Retrieved similar task experience**:\n" 240 | f"Similar task:{similar_task}\n{experience}\n\n" 241 | f"Based on the web search result and the retrieved similar task experience, " 242 | f"if you think the similar task experience is indeed useful to the main task, " 243 | f"integrate it with the web search result. 
Provide the final knowledge in a numbered list.", 244 | image_content=( 245 | observation["screenshot"] 246 | if self.use_image_for_search and "screenshot" in observation 247 | else None 248 | ), 249 | ) 250 | return self.knowledge_fusion_agent.get_response() 251 | -------------------------------------------------------------------------------- /gui_agents/s1/core/Manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import defaultdict 3 | from typing import Dict, List, Optional, Tuple 4 | import platform 5 | 6 | from gui_agents.s1.aci.ACI import ACI 7 | from gui_agents.s1.core.BaseModule import BaseModule 8 | from gui_agents.s1.core.Knowledge import KnowledgeBase 9 | from gui_agents.s1.core.ProceduralMemory import PROCEDURAL_MEMORY 10 | from gui_agents.s1.utils.common_utils import ( 11 | Dag, 12 | Node, 13 | calculate_tokens, 14 | call_llm_safe, 15 | parse_dag, 16 | ) 17 | 18 | logger = logging.getLogger("desktopenv.agent") 19 | 20 | NUM_IMAGE_TOKEN = 1105 # Value set of screen of size 1920x1080 for openai vision 21 | 22 | 23 | class Manager(BaseModule): 24 | def __init__( 25 | self, 26 | engine_params: Dict, 27 | grounding_agent: ACI, 28 | local_kb_path: str, 29 | search_engine: Optional[str] = None, 30 | multi_round: bool = False, 31 | platform: str = platform.system().lower(), 32 | ): 33 | # TODO: move the prompt to Procedural Memory 34 | super().__init__(engine_params, platform) 35 | 36 | # Initialize the ACI 37 | self.grounding_agent = grounding_agent 38 | 39 | # Initialize the submodules of the Manager 40 | self.generator_agent = self._create_agent(PROCEDURAL_MEMORY.MANAGER_PROMPT) 41 | self.dag_translator_agent = self._create_agent( 42 | PROCEDURAL_MEMORY.DAG_TRANSLATOR_PROMPT 43 | ) 44 | self.narrative_summarization_agent = self._create_agent( 45 | PROCEDURAL_MEMORY.TASK_SUMMARIZATION_PROMPT 46 | ) 47 | self.episode_summarization_agent = self._create_agent( 48 | PROCEDURAL_MEMORY.SUBTASK_SUMMARIZATION_PROMPT 49 | ) 50 | 51 | self.local_kb_path = local_kb_path 52 | 53 | self.knowledge_base = KnowledgeBase(self.local_kb_path, platform, engine_params) 54 | 55 | self.planner_history = [] 56 | 57 | self.turn_count = 0 58 | self.search_engine = search_engine 59 | self.multi_round = multi_round 60 | self.platform = platform 61 | 62 | def summarize_episode(self, trajectory): 63 | """Summarize the episode experience for lifelong learning reflection 64 | Args: 65 | trajectory: str: The episode experience to be summarized 66 | """ 67 | 68 | # Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars 69 | self.episode_summarization_agent.add_message(trajectory) 70 | subtask_summarization = call_llm_safe(self.episode_summarization_agent) 71 | self.episode_summarization_agent.add_message(subtask_summarization) 72 | 73 | return subtask_summarization 74 | 75 | def summarize_narrative(self, trajectory): 76 | """Summarize the narrative experience for lifelong learning reflection 77 | Args: 78 | trajectory: str: The narrative experience to be summarized 79 | """ 80 | # Create Reflection on whole trajectories for next round trial 81 | self.narrative_summarization_agent.add_message(trajectory) 82 | lifelong_learning_reflection = call_llm_safe(self.narrative_summarization_agent) 83 | 84 | return lifelong_learning_reflection 85 | 86 | def _generate_step_by_step_plan( 87 | self, observation: Dict, instruction: str, failure_feedback: str = "" 88 | ) -> Tuple[Dict, str]: 89 | agent = 
self.grounding_agent 90 | 91 | self.active_apps = agent.get_active_apps(observation) 92 | 93 | tree_input = agent.linearize_and_annotate_tree(observation) 94 | observation["linearized_accessibility_tree"] = tree_input 95 | 96 | # Perform Retrieval only at the first planning step 97 | if self.turn_count == 0: 98 | 99 | self.search_query = self.knowledge_base.formulate_query( 100 | instruction, observation 101 | ) 102 | 103 | retrieved_experience = "" 104 | integrated_knowledge = "" 105 | # Retrieve most similar narrative (task) experience 106 | most_similar_task, retrieved_experience = ( 107 | self.knowledge_base.retrieve_narrative_experience(instruction) 108 | ) 109 | logger.info( 110 | "SIMILAR TASK EXPERIENCE: %s", 111 | most_similar_task + "\n" + retrieved_experience.strip(), 112 | ) 113 | 114 | # Retrieve knowledge from the web if search_engine is provided 115 | if self.search_engine is not None: 116 | retrieved_knowledge = self.knowledge_base.retrieve_knowledge( 117 | instruction=instruction, 118 | search_query=self.search_query, 119 | search_engine=self.search_engine, 120 | ) 121 | logger.info("RETRIEVED KNOWLEDGE: %s", retrieved_knowledge) 122 | 123 | if retrieved_knowledge is not None: 124 | # Fuse the retrieved knowledge and experience 125 | integrated_knowledge = self.knowledge_base.knowledge_fusion( 126 | observation=observation, 127 | instruction=instruction, 128 | web_knowledge=retrieved_knowledge, 129 | similar_task=most_similar_task, 130 | experience=retrieved_experience, 131 | ) 132 | logger.info("INTEGRATED KNOWLEDGE: %s", integrated_knowledge) 133 | 134 | integrated_knowledge = integrated_knowledge or retrieved_experience 135 | 136 | # Add the integrated knowledge to the task instruction in the system prompt 137 | if integrated_knowledge: 138 | instruction += f"\nYou may refer to some retrieved knowledge if you think they are useful.{integrated_knowledge}" 139 | 140 | self.generator_agent.add_system_prompt( 141 | self.generator_agent.system_prompt.replace( 142 | "TASK_DESCRIPTION", instruction 143 | ) 144 | ) 145 | 146 | generator_message = ( 147 | f"Accessibility Tree: {tree_input}\n" 148 | f"The clipboard contains: {agent.clipboard}." 
149 | f"The current open applications are {agent.get_active_apps(observation)}" 150 | + ( 151 | f" Previous plan failed at step: {failure_feedback}" 152 | if failure_feedback 153 | else "" 154 | ) 155 | ) 156 | 157 | self.generator_agent.add_message( 158 | generator_message, image_content=observation.get("screenshot", None) 159 | ) 160 | 161 | logger.info("GENERATING HIGH LEVEL PLAN") 162 | 163 | plan = call_llm_safe(self.generator_agent) 164 | 165 | if plan == "": 166 | raise Exception("Plan Generation Failed - Fix the Prompt") 167 | 168 | logger.info("HIGH LEVEL STEP BY STEP PLAN: %s", plan) 169 | 170 | self.generator_agent.add_message(plan) 171 | 172 | self.planner_history.append(plan) 173 | 174 | self.turn_count += 1 175 | 176 | input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages) 177 | 178 | # Set Cost based on GPT-4o 179 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 180 | 181 | planner_info = { 182 | "search_query": self.search_query, 183 | "goal_plan": plan, 184 | "num_input_tokens_plan": input_tokens, 185 | "num_output_tokens_plan": output_tokens, 186 | "goal_plan_cost": cost, 187 | } 188 | 189 | assert type(plan) == str 190 | 191 | return planner_info, plan 192 | 193 | def _generate_dag(self, instruction: str, plan: str) -> Tuple[Dict, Dag]: 194 | # Add initial instruction and plan to the agent's message history 195 | self.dag_translator_agent.add_message( 196 | f"Instruction: {instruction}\nPlan: {plan}" 197 | ) 198 | 199 | logger.info("GENERATING DAG") 200 | 201 | # Generate DAG 202 | dag_raw = call_llm_safe(self.dag_translator_agent) 203 | 204 | dag = parse_dag(dag_raw) 205 | 206 | logger.info("Generated DAG: %s", dag_raw) 207 | 208 | self.dag_translator_agent.add_message(dag_raw) 209 | 210 | input_tokens, output_tokens = calculate_tokens( 211 | self.dag_translator_agent.messages 212 | ) 213 | 214 | # Set Cost based on GPT-4o 215 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 216 | 217 | dag_info = { 218 | "dag": dag_raw, 219 | "num_input_tokens_dag": input_tokens, 220 | "num_output_tokens_dag": output_tokens, 221 | "dag_cost": cost, 222 | } 223 | 224 | assert type(dag) == Dag 225 | 226 | return dag_info, dag 227 | 228 | def _topological_sort(self, dag: Dag) -> List[Node]: 229 | """Topological sort of the DAG using DFS 230 | dag: Dag: Object representation of the DAG with nodes and edges 231 | """ 232 | 233 | def dfs(node_name, visited, stack): 234 | visited[node_name] = True 235 | for neighbor in adj_list[node_name]: 236 | if not visited[neighbor]: 237 | dfs(neighbor, visited, stack) 238 | stack.append(node_name) 239 | 240 | # Convert edges to adjacency list 241 | adj_list = defaultdict(list) 242 | for u, v in dag.edges: 243 | adj_list[u.name].append(v.name) 244 | 245 | visited = {node.name: False for node in dag.nodes} 246 | stack = [] 247 | 248 | for node in dag.nodes: 249 | if not visited[node.name]: 250 | dfs(node.name, visited, stack) 251 | 252 | # Return the nodes in topologically sorted order 253 | sorted_nodes = [ 254 | next(n for n in dag.nodes if n.name == name) for name in stack[::-1] 255 | ] 256 | return sorted_nodes 257 | 258 | def get_action_queue( 259 | self, 260 | instruction: str, 261 | observation: Dict, 262 | failure_feedback: str = None, 263 | ): 264 | """Generate the action list based on the instruction 265 | instruction:str: Instruction for the task 266 | """ 267 | # Generate the high level plan 268 | planner_info, plan = self._generate_step_by_step_plan( 269 | observation, 
instruction, failure_feedback 270 | ) 271 | 272 | # Generate the DAG 273 | dag_info, dag = self._generate_dag(instruction, plan) 274 | 275 | # Topological sort of the DAG 276 | action_queue = self._topological_sort(dag) 277 | 278 | planner_info.update(dag_info) 279 | 280 | return planner_info, action_queue 281 | -------------------------------------------------------------------------------- /gui_agents/s1/core/Worker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | from typing import Dict, List, Tuple 5 | import platform 6 | 7 | from gui_agents.s1.aci.ACI import ACI 8 | from gui_agents.s1.core.BaseModule import BaseModule 9 | from gui_agents.s1.core.Knowledge import KnowledgeBase 10 | from gui_agents.s1.core.ProceduralMemory import PROCEDURAL_MEMORY 11 | from gui_agents.s1.utils import common_utils 12 | from gui_agents.s1.utils.common_utils import Node, calculate_tokens, call_llm_safe 13 | 14 | logger = logging.getLogger("desktopenv.agent") 15 | 16 | 17 | class Worker(BaseModule): 18 | def __init__( 19 | self, 20 | engine_params: Dict, 21 | grounding_agent: ACI, 22 | local_kb_path: str, 23 | platform: str = platform.system().lower(), 24 | search_engine: str = "perplexica", 25 | enable_reflection: bool = True, 26 | use_subtask_experience: bool = True, 27 | ): 28 | """ 29 | Worker receives a subtask list and active subtask and generates the next action for the to execute. 30 | Args: 31 | engine_params: Dict 32 | Parameters for the multimodal engine 33 | grounding_agent: Agent 34 | The grounding agent to use 35 | local_kb_path: str 36 | Path to knowledge base 37 | search_engine: str 38 | The search engine to use 39 | enable_reflection: bool 40 | Whether to enable reflection 41 | use_subtask_experience: bool 42 | Whether to use subtask experience 43 | """ 44 | super().__init__(engine_params, platform) 45 | 46 | self.grounding_agent = grounding_agent 47 | self.local_kb_path = local_kb_path 48 | self.enable_reflection = enable_reflection 49 | self.search_engine = search_engine 50 | self.use_subtask_experience = use_subtask_experience 51 | self.reset() 52 | 53 | def flush_messages(self, n): 54 | # After every max_trajectory_length trajectories, remove messages from the start except the system prompt 55 | for agent in [self.generator_agent]: 56 | if len(agent.messages) > 2 * n + 1: 57 | # Remove the user message and assistant message, both are 1 because the elements will move back after 1 pop 58 | agent.remove_message_at(1) 59 | agent.remove_message_at(1) 60 | 61 | def reset(self): 62 | self.generator_agent = self._create_agent( 63 | PROCEDURAL_MEMORY.construct_worker_procedural_memory( 64 | type(self.grounding_agent) 65 | ).replace("CURRENT_OS", self.platform) 66 | ) 67 | self.reflection_agent = self._create_agent( 68 | PROCEDURAL_MEMORY.REFLECTION_ON_TRAJECTORY 69 | ) 70 | 71 | self.knowledge_base = KnowledgeBase( 72 | local_kb_path=self.local_kb_path, 73 | platform=self.platform, 74 | engine_params=self.engine_params, 75 | ) 76 | 77 | self.turn_count = 0 78 | self.planner_history = [] 79 | self.reflections = [] 80 | self.cost_this_turn = 0 81 | self.tree_inputs = [] 82 | self.screenshot_inputs = [] 83 | 84 | # TODO: Experimental 85 | def remove_ids_from_history(self): 86 | for message in self.generator_agent.messages: 87 | if message["role"] == "user": 88 | for content in message["content"]: 89 | if content["type"] == "text": 90 | # Regex pattern to match lines that start with a number followed by spaces and 
remove the number 91 | pattern = r"^\d+\s+" 92 | 93 | # Apply the regex substitution on each line 94 | processed_lines = [ 95 | re.sub(pattern, "", line) 96 | for line in content["text"].splitlines() 97 | ] 98 | 99 | # Join the processed lines back into a single string 100 | result = "\n".join(processed_lines) 101 | 102 | result = result.replace("id\t", "") 103 | 104 | # replace message content 105 | content["text"] = result 106 | 107 | def generate_next_action( 108 | self, 109 | instruction: str, 110 | search_query: str, 111 | subtask: str, 112 | subtask_info: str, 113 | future_tasks: List[Node], 114 | done_task: List[Node], 115 | obs: Dict, 116 | ) -> Tuple[Dict, List]: 117 | """ 118 | Predict the next action(s) based on the current observation. 119 | """ 120 | # Provide the top_app to the Grounding Agent to remove all other applications from the tree. At t=0, top_app is None 121 | agent = self.grounding_agent 122 | 123 | self.active_apps = agent.get_active_apps(obs) 124 | 125 | # Get RAG knowledge, only update system message at t=0 126 | if self.turn_count == 0: 127 | # TODO: uncomment and fix for subtask level RAG 128 | if self.use_subtask_experience: 129 | subtask_query_key = ( 130 | "Task:\n" 131 | + search_query 132 | + "\n\nSubtask: " 133 | + subtask 134 | + "\nSubtask Instruction: " 135 | + subtask_info 136 | ) 137 | retrieved_similar_subtask, retrieved_subtask_experience = ( 138 | self.knowledge_base.retrieve_episodic_experience(subtask_query_key) 139 | ) 140 | logger.info( 141 | "SIMILAR SUBTASK EXPERIENCE: %s", 142 | retrieved_similar_subtask 143 | + "\n" 144 | + retrieved_subtask_experience.strip(), 145 | ) 146 | instruction += "\nYou may refer to some similar subtask experience if you think they are useful. {}".format( 147 | retrieved_similar_subtask + "\n" + retrieved_subtask_experience 148 | ) 149 | 150 | self.generator_agent.add_system_prompt( 151 | self.generator_agent.system_prompt.replace( 152 | "SUBTASK_DESCRIPTION", subtask 153 | ) 154 | .replace("TASK_DESCRIPTION", instruction) 155 | .replace("FUTURE_TASKS", ", ".join([f.name for f in future_tasks])) 156 | .replace("DONE_TASKS", ",".join(d.name for d in done_task)) 157 | ) 158 | 159 | # Clear older messages - we keep full context. if you want to keep only the last n messages, you can use the flush_messages function 160 | # self.flush_messages(3) # flushes generator messages 161 | 162 | # Reflection generation 163 | reflection = None 164 | if self.enable_reflection and self.turn_count > 0: 165 | # TODO: reuse planner history 166 | self.reflection_agent.add_message( 167 | "Task Description: " 168 | + subtask 169 | + " Instruction: " 170 | + subtask_info 171 | + "\n" 172 | + "Current Trajectory: " 173 | + "\n\n".join(self.planner_history) 174 | + "\n" 175 | ) 176 | reflection = call_llm_safe(self.reflection_agent) 177 | self.reflections.append(reflection) 178 | self.reflection_agent.add_message(reflection) 179 | 180 | logger.info("REFLECTION: %s", reflection) 181 | 182 | # Plan Generation 183 | tree_input = agent.linearize_and_annotate_tree(obs) 184 | 185 | self.remove_ids_from_history() 186 | 187 | # Bash terminal message. 188 | generator_message = ( 189 | ( 190 | f"\nYou may use the reflection on the previous trajectory: {reflection}\n" 191 | if reflection 192 | else "" 193 | ) 194 | + f"Accessibility Tree: {tree_input}\n" 195 | f"Text Buffer = [{','.join(agent.notes)}]. 
" 196 | f"The current open applications are {agent.get_active_apps(obs)} and the active app is {agent.get_top_app(obs)}.\n" 197 | ) 198 | 199 | print("ACTIVE APP IS: ", agent.get_top_app(obs)) 200 | # Only provide subinfo in the very first message to avoid over influence and redundancy 201 | if self.turn_count == 0: 202 | generator_message += f"Remeber only complete the subtask: {subtask}\n" 203 | generator_message += f"You can use this extra information for completing the current subtask: {subtask_info}.\n" 204 | 205 | logger.info("GENERATOR MESSAGE: %s", generator_message) 206 | 207 | self.generator_agent.add_message( 208 | generator_message, image_content=obs["screenshot"] 209 | ) 210 | 211 | plan = call_llm_safe(self.generator_agent) 212 | self.planner_history.append(plan) 213 | logger.info("PLAN: %s", plan) 214 | 215 | self.generator_agent.add_message(plan) 216 | 217 | # Calculate input and output tokens 218 | input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages) 219 | 220 | # Set Cost based on GPT-4o 221 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 222 | self.cost_this_turn += cost 223 | logger.info("EXECTUOR COST: %s", self.cost_this_turn) 224 | 225 | # Extract code block from the plan 226 | plan_code = common_utils.parse_single_code_from_string( 227 | plan.split("Grounded Action")[-1] 228 | ) 229 | plan_code = common_utils.sanitize_code(plan_code) 230 | plan_code = common_utils.extract_first_agent_function(plan_code) 231 | exec_code = eval(plan_code) 232 | 233 | # If agent selects an element that was out of range, it should not be executed just send a WAIT command. 234 | # TODO: should provide this as code feedback to the agent? 235 | if agent.index_out_of_range_flag: 236 | plan_code = "agent.wait(1.0)" 237 | exec_code = eval(plan_code) 238 | agent.index_out_of_range_flag = False 239 | 240 | executor_info = { 241 | "current_subtask": subtask, 242 | "current_subtask_info": subtask_info, 243 | "executor_plan": plan, 244 | "linearized_accessibility_tree": tree_input, 245 | "plan_code": plan_code, 246 | "reflection": reflection, 247 | "num_input_tokens_executor": input_tokens, 248 | "num_output_tokens_executor": output_tokens, 249 | "executor_cost": cost, 250 | } 251 | self.turn_count += 1 252 | 253 | self.tree_inputs.append(tree_input) 254 | self.screenshot_inputs.append(obs["screenshot"]) 255 | 256 | return executor_info, [exec_code] 257 | -------------------------------------------------------------------------------- /gui_agents/s1/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s1/core/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/mllm/MultimodalAgent.py: -------------------------------------------------------------------------------- 1 | # Author: Saaket Agashe 2 | # Date: 2021-09-15 3 | # License: MIT 4 | 5 | import base64 6 | import re 7 | 8 | from gui_agents.s1.mllm.MultimodalEngine import ( 9 | LMMEngineAnthropic, 10 | LMMEngineAzureOpenAI, 11 | LMMEngineOpenAI, 12 | LMMEnginevLLM, 13 | ) 14 | 15 | data_type_map = { 16 | "openai": {"image_url": "image_url"}, 17 | "anthropic": {"image_url": "image"}, 18 | } 19 | 20 | 21 | class LMMAgent: 22 | def __init__(self, engine_params=None, system_prompt=None, engine=None): 23 | if engine is None: 24 | if engine_params is not None: 25 | engine_type = 
engine_params.get("engine_type") 26 | if engine_type == "openai": 27 | self.engine = LMMEngineOpenAI(**engine_params) 28 | elif engine_type == "anthropic": 29 | self.engine = LMMEngineAnthropic(**engine_params) 30 | elif engine_type == "azure": 31 | self.engine = LMMEngineAzureOpenAI(**engine_params) 32 | elif engine_type == "vllm": 33 | self.engine = LMMEnginevLLM(**engine_params) 34 | else: 35 | raise ValueError("engine_type must be either 'openai' or 'azure'") 36 | else: 37 | raise ValueError("engine_params must be provided") 38 | else: 39 | self.engine = engine 40 | 41 | self.messages = [] # Empty messages 42 | 43 | if system_prompt: 44 | self.add_system_prompt(system_prompt) 45 | else: 46 | self.add_system_prompt("You are a helpful assistant.") 47 | 48 | def encode_image(self, image_content): 49 | # if image_content is a path to an image file, check type of the image_content to verify 50 | if isinstance(image_content, str): 51 | with open(image_content, "rb") as image_file: 52 | return base64.b64encode(image_file.read()).decode("utf-8") 53 | else: 54 | return base64.b64encode(image_content).decode("utf-8") 55 | 56 | def reset( 57 | self, 58 | ): 59 | 60 | self.messages = [ 61 | { 62 | "role": "system", 63 | "content": [{"type": "text", "text": self.system_prompt}], 64 | } 65 | ] 66 | 67 | def add_system_prompt(self, system_prompt): 68 | self.system_prompt = system_prompt 69 | if len(self.messages) > 0: 70 | self.messages[0] = { 71 | "role": "system", 72 | "content": [{"type": "text", "text": self.system_prompt}], 73 | } 74 | else: 75 | self.messages.append( 76 | { 77 | "role": "system", 78 | "content": [{"type": "text", "text": self.system_prompt}], 79 | } 80 | ) 81 | 82 | def remove_message_at(self, index): 83 | """Remove a message at a given index""" 84 | if index < len(self.messages): 85 | self.messages.pop(index) 86 | 87 | def replace_message_at( 88 | self, index, text_content, image_content=None, image_detail="high" 89 | ): 90 | """Replace a message at a given index""" 91 | if index < len(self.messages): 92 | self.messages[index] = { 93 | "role": self.messages[index]["role"], 94 | "content": [{"type": "text", "text": text_content}], 95 | } 96 | if image_content: 97 | base64_image = self.encode_image(image_content) 98 | self.messages[index]["content"].append( 99 | { 100 | "type": "image_url", 101 | "image_url": { 102 | "url": f"data:image/png;base64,{base64_image}", 103 | "detail": image_detail, 104 | }, 105 | } 106 | ) 107 | 108 | def add_message( 109 | self, text_content, image_content=None, role=None, image_detail="high" 110 | ): 111 | """Add a new message to the list of messages""" 112 | 113 | # API-style inference from OpenAI and AzureOpenAI 114 | if isinstance(self.engine, (LMMEngineOpenAI, LMMEngineAzureOpenAI)): 115 | # infer role from previous message 116 | if role != "user": 117 | if self.messages[-1]["role"] == "system": 118 | role = "user" 119 | elif self.messages[-1]["role"] == "user": 120 | role = "assistant" 121 | elif self.messages[-1]["role"] == "assistant": 122 | role = "user" 123 | 124 | message = { 125 | "role": role, 126 | "content": [{"type": "text", "text": text_content}], 127 | } 128 | 129 | if image_content: 130 | # Check if image_content is a list or a single image 131 | if isinstance(image_content, list): 132 | # If image_content is a list of images, loop through each image 133 | for image in image_content: 134 | base64_image = self.encode_image(image) 135 | message["content"].append( 136 | { 137 | "type": "image_url", 138 | "image_url": { 139 | "url": 
f"data:image/png;base64,{base64_image}", 140 | "detail": image_detail, 141 | }, 142 | } 143 | ) 144 | else: 145 | # If image_content is a single image, handle it directly 146 | base64_image = self.encode_image(image_content) 147 | message["content"].append( 148 | { 149 | "type": "image_url", 150 | "image_url": { 151 | "url": f"data:image/png;base64,{base64_image}", 152 | "detail": image_detail, 153 | }, 154 | } 155 | ) 156 | self.messages.append(message) 157 | 158 | # For API-style inference from Anthropic 159 | elif isinstance(self.engine, LMMEngineAnthropic): 160 | # infer role from previous message 161 | if role != "user": 162 | if self.messages[-1]["role"] == "system": 163 | role = "user" 164 | elif self.messages[-1]["role"] == "user": 165 | role = "assistant" 166 | elif self.messages[-1]["role"] == "assistant": 167 | role = "user" 168 | 169 | message = { 170 | "role": role, 171 | "content": [{"type": "text", "text": text_content}], 172 | } 173 | 174 | if image_content: 175 | # Check if image_content is a list or a single image 176 | if isinstance(image_content, list): 177 | # If image_content is a list of images, loop through each image 178 | for image in image_content: 179 | base64_image = self.encode_image(image) 180 | message["content"].append( 181 | { 182 | "type": "image", 183 | "source": { 184 | "type": "base64", 185 | "media_type": "image/png", 186 | "data": base64_image, 187 | }, 188 | } 189 | ) 190 | else: 191 | # If image_content is a single image, handle it directly 192 | base64_image = self.encode_image(image_content) 193 | message["content"].append( 194 | { 195 | "type": "image", 196 | "source": { 197 | "type": "base64", 198 | "media_type": "image/png", 199 | "data": base64_image, 200 | }, 201 | } 202 | ) 203 | self.messages.append(message) 204 | 205 | # Locally hosted vLLM model inference 206 | elif isinstance(self.engine, LMMEnginevLLM): 207 | # infer role from previous message 208 | if role != "user": 209 | if self.messages[-1]["role"] == "system": 210 | role = "user" 211 | elif self.messages[-1]["role"] == "user": 212 | role = "assistant" 213 | elif self.messages[-1]["role"] == "assistant": 214 | role = "user" 215 | 216 | message = { 217 | "role": role, 218 | "content": [{"type": "text", "text": text_content}], 219 | } 220 | 221 | if image_content: 222 | # Check if image_content is a list or a single image 223 | if isinstance(image_content, list): 224 | # If image_content is a list of images, loop through each image 225 | for image in image_content: 226 | base64_image = self.encode_image(image) 227 | message["content"].append( 228 | { 229 | "type": "image", 230 | "image": f"data:image;base64,{base64_image}", 231 | } 232 | ) 233 | else: 234 | # If image_content is a single image, handle it directly 235 | base64_image = self.encode_image(image_content) 236 | message["content"].append( 237 | {"type": "image", "image": f"data:image;base64,{base64_image}"} 238 | ) 239 | self.messages.append(message) 240 | 241 | def get_response( 242 | self, 243 | user_message=None, 244 | image=None, 245 | messages=None, 246 | temperature=0.0, 247 | max_new_tokens=None, 248 | **kwargs, 249 | ): 250 | """Generate the next response based on previous messages""" 251 | if messages is None: 252 | messages = self.messages 253 | if user_message: 254 | messages.append( 255 | {"role": "user", "content": [{"type": "text", "text": user_message}]} 256 | ) 257 | 258 | return self.engine.generate( 259 | messages, 260 | temperature=temperature, 261 | max_new_tokens=max_new_tokens, 262 | **kwargs, 263 | ) 
264 | -------------------------------------------------------------------------------- /gui_agents/s1/mllm/MultimodalEngine.py: -------------------------------------------------------------------------------- 1 | # Author: Saaket Agashe 2 | # Date: 2021-09-15 3 | # License: MIT 4 | 5 | import os 6 | import re 7 | from io import BytesIO 8 | 9 | import backoff 10 | import numpy as np 11 | import openai 12 | import requests 13 | from anthropic import Anthropic 14 | from openai import APIConnectionError, APIError, AzureOpenAI, OpenAI, RateLimitError 15 | from PIL import Image 16 | 17 | # TODO: Import only if module exists, else ignore 18 | # from llava.model.builder import load_pretrained_model 19 | # from llava.mm_utils import ( 20 | # process_images, 21 | # tokenizer_image_token, 22 | # get_model_name_from_path, 23 | # KeywordsStoppingCriteria, 24 | # ) 25 | # from llava.constants import ( 26 | # IMAGE_TOKEN_INDEX, 27 | # DEFAULT_IMAGE_TOKEN, 28 | # DEFAULT_IM_START_TOKEN, 29 | # DEFAULT_IM_END_TOKEN, 30 | # IMAGE_PLACEHOLDER, 31 | # ) 32 | # from llava.conversation import conv_templates, SeparatorStyle 33 | 34 | 35 | # from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig 36 | 37 | 38 | def image_parser(args): 39 | out = args.image_file.split(args.sep) 40 | return out 41 | 42 | 43 | def load_image(image_file): 44 | if image_file.startswith("http") or image_file.startswith("https"): 45 | response = requests.get(image_file) 46 | image = Image.open(BytesIO(response.content)).convert("RGB") 47 | else: 48 | image = Image.open(image_file).convert("RGB") 49 | return image 50 | 51 | 52 | def load_images(image_files): 53 | out = [] 54 | for image_file in image_files: 55 | image = load_image(image_file) 56 | out.append(image) 57 | return out 58 | 59 | 60 | class LMMEngine: 61 | pass 62 | 63 | 64 | class LMMEngineOpenAI(LMMEngine): 65 | def __init__(self, api_key=None, model=None, rate_limit=-1, **kwargs): 66 | assert model is not None, "model must be provided" 67 | self.model = model 68 | 69 | api_key = api_key or os.getenv("OPENAI_API_KEY") 70 | if api_key is None: 71 | raise ValueError( 72 | "An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY" 73 | ) 74 | 75 | self.api_key = api_key 76 | self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit 77 | 78 | self.llm_client = OpenAI(api_key=self.api_key) 79 | 80 | @backoff.on_exception( 81 | backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60 82 | ) 83 | def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs): 84 | """Generate the next message based on previous messages""" 85 | return ( 86 | self.llm_client.chat.completions.create( 87 | model=self.model, 88 | messages=messages, 89 | max_tokens=max_new_tokens if max_new_tokens else 4096, 90 | temperature=temperature, 91 | **kwargs, 92 | ) 93 | .choices[0] 94 | .message.content 95 | ) 96 | 97 | 98 | class LMMEngineAnthropic(LMMEngine): 99 | def __init__(self, api_key=None, model=None, **kwargs): 100 | assert model is not None, "model must be provided" 101 | self.model = model 102 | 103 | api_key = api_key or os.getenv("ANTHROPIC_API_KEY") 104 | if api_key is None: 105 | raise ValueError( 106 | "An API Key needs to be provided in either the api_key parameter or as an environment variable named ANTHROPIC_API_KEY" 107 | ) 108 | 109 | self.api_key = api_key 110 | 111 | self.llm_client = Anthropic(api_key=self.api_key) 112 | 113 | @backoff.on_exception( 114 | 
backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60 115 | ) 116 | def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs): 117 | """Generate the next message based on previous messages""" 118 | return ( 119 | self.llm_client.messages.create( 120 | system=messages[0]["content"][0]["text"], 121 | model=self.model, 122 | messages=messages[1:], 123 | max_tokens=max_new_tokens if max_new_tokens else 4096, 124 | temperature=temperature, 125 | **kwargs, 126 | ) 127 | .content[0] 128 | .text 129 | ) 130 | 131 | 132 | class OpenAIEmbeddingEngine(LMMEngine): 133 | def __init__( 134 | self, 135 | api_key=None, 136 | rate_limit: int = -1, 137 | display_cost: bool = True, 138 | ): 139 | """Init an OpenAI Embedding engine 140 | 141 | Args: 142 | api_key (_type_, optional): Auth key from OpenAI. Defaults to None. 143 | rate_limit (int, optional): Max number of requests per minute. Defaults to -1. 144 | display_cost (bool, optional): Display cost of API call. Defaults to True. 145 | """ 146 | self.model = "text-embedding-3-small" 147 | self.cost_per_thousand_tokens = 0.00002 148 | 149 | api_key = api_key or os.getenv("OPENAI_API_KEY") 150 | if api_key is None: 151 | raise ValueError( 152 | "An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY" 153 | ) 154 | self.api_key = api_key 155 | self.display_cost = display_cost 156 | self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit 157 | 158 | @backoff.on_exception( 159 | backoff.expo, 160 | ( 161 | APIError, 162 | RateLimitError, 163 | APIConnectionError, 164 | ), 165 | ) 166 | def get_embeddings(self, text: str) -> np.ndarray: 167 | client = OpenAI(api_key=self.api_key) 168 | response = client.embeddings.create(model=self.model, input=text) 169 | if self.display_cost: 170 | total_tokens = response.usage.total_tokens 171 | cost = self.cost_per_thousand_tokens * total_tokens / 1000 172 | # print(f"Total cost for this embedding API call: {cost}") 173 | return np.array([data.embedding for data in response.data]) 174 | 175 | 176 | class LMMEngineAzureOpenAI(LMMEngine): 177 | def __init__( 178 | self, 179 | api_key=None, 180 | azure_endpoint=None, 181 | model=None, 182 | api_version=None, 183 | rate_limit=-1, 184 | **kwargs 185 | ): 186 | assert model is not None, "model must be provided" 187 | self.model = model 188 | 189 | assert api_version is not None, "api_version must be provided" 190 | self.api_version = api_version 191 | 192 | api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY") 193 | if api_key is None: 194 | raise ValueError( 195 | "An API Key needs to be provided in either the api_key parameter or as an environment variable named AZURE_OPENAI_API_KEY" 196 | ) 197 | 198 | self.api_key = api_key 199 | 200 | azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_API_BASE") 201 | if azure_endpoint is None: 202 | raise ValueError( 203 | "An Azure API endpoint needs to be provided in either the azure_endpoint parameter or as an environment variable named AZURE_OPENAI_API_BASE" 204 | ) 205 | 206 | self.azure_endpoint = azure_endpoint 207 | self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit 208 | 209 | self.llm_client = AzureOpenAI( 210 | azure_endpoint=self.azure_endpoint, 211 | api_key=self.api_key, 212 | api_version=self.api_version, 213 | ) 214 | self.cost = 0.0 215 | 216 | # @backoff.on_exception(backoff.expo, (APIConnectionError, APIError, RateLimitError), max_tries=10) 217 | def generate(self, messages, 
temperature=0.0, max_new_tokens=None, **kwargs): 218 | """Generate the next message based on previous messages""" 219 | completion = self.llm_client.chat.completions.create( 220 | model=self.model, 221 | messages=messages, 222 | max_tokens=max_new_tokens if max_new_tokens else 4096, 223 | temperature=temperature, 224 | **kwargs, 225 | ) 226 | total_tokens = completion.usage.total_tokens 227 | self.cost += 0.02 * ((total_tokens + 500) / 1000) 228 | return completion.choices[0].message.content 229 | 230 | 231 | class LMMEnginevLLM(LMMEngine): 232 | def __init__( 233 | self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs 234 | ): 235 | assert model is not None, "model must be provided" 236 | self.model = model 237 | self.api_key = api_key 238 | 239 | self.base_url = base_url or os.getenv("vLLM_ENDPOINT_URL") 240 | if self.base_url is None: 241 | raise ValueError( 242 | "An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named vLLM_ENDPOINT_URL" 243 | ) 244 | 245 | self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit 246 | 247 | self.llm_client = OpenAI(base_url=self.base_url, api_key=self.api_key) 248 | 249 | # @backoff.on_exception(backoff.expo, (APIConnectionError, APIError, RateLimitError), max_tries=10) 250 | # TODO: Default params chosen for the Qwen model 251 | def generate( 252 | self, 253 | messages, 254 | temperature=0.0, 255 | top_p=0.8, 256 | repetition_penalty=1.05, 257 | max_new_tokens=512, 258 | **kwargs 259 | ): 260 | """Generate the next message based on previous messages""" 261 | completion = self.llm_client.chat.completions.create( 262 | model=self.model, 263 | messages=messages, 264 | max_tokens=max_new_tokens if max_new_tokens else 4096, 265 | temperature=temperature, 266 | top_p=top_p, 267 | extra_body={"repetition_penalty": repetition_penalty}, 268 | ) 269 | return completion.choices[0].message.content 270 | -------------------------------------------------------------------------------- /gui_agents/s1/mllm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s1/mllm/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s1/utils/__init__.py -------------------------------------------------------------------------------- /gui_agents/s1/utils/ocr_server.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import gc 3 | import io 4 | 5 | import numpy as np 6 | from fastapi import FastAPI 7 | from paddleocr import PaddleOCR 8 | from PIL import Image 9 | from pydantic import BaseModel 10 | 11 | app = FastAPI() 12 | ocr_module = PaddleOCR(use_angle_cls=True, lang="en") 13 | 14 | 15 | class ImageData(BaseModel): 16 | img_bytes: bytes 17 | 18 | 19 | def text_cvt_orc_format_paddle(paddle_result): 20 | texts = [] 21 | print("paddle_result: ", paddle_result) 22 | for i, line in enumerate(paddle_result[0]): 23 | points = np.array(line[0]) 24 | print("points: ", points) 25 | location = { 26 | "left": int(min(points[:, 0])), 27 | "top": int(min(points[:, 1])), 28 | "right": int(max(points[:, 0])), 29 | "bottom": int(max(points[:, 1])), 30 | } 31 | 
print("location: ", location) 32 | content = line[1][0] 33 | texts.append((i, content, location)) 34 | return texts 35 | 36 | 37 | def ocr_results(screenshot): 38 | screenshot_img = Image.open(io.BytesIO(screenshot)) 39 | result = ocr_module.ocr(np.array(screenshot_img), cls=True) 40 | return text_cvt_orc_format_paddle(result) 41 | 42 | 43 | @app.post("/ocr/") 44 | async def read_image(image_data: ImageData): 45 | image_bytes = base64.b64decode(image_data.img_bytes) 46 | results = ocr_results(image_bytes) 47 | 48 | # Explicitly delete unused variables and run garbage collector 49 | del image_bytes 50 | gc.collect() 51 | 52 | return {"results": results} 53 | 54 | 55 | if __name__ == "__main__": 56 | import uvicorn 57 | 58 | uvicorn.run(app, host="127.0.0.1", port=8000) 59 | -------------------------------------------------------------------------------- /gui_agents/s1/utils/query_perplexica.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import toml 3 | import os 4 | 5 | 6 | def query_to_perplexica(query): 7 | # Retrieve the URL from an environment variable 8 | url = os.getenv("PERPLEXICA_URL") 9 | if not url: 10 | raise ValueError( 11 | "PERPLEXICA_URL environment variable not set. It may take the form: 'http://localhost:{port}/api/search'. The port number is set in the config.toml in the Perplexica directory." 12 | ) 13 | 14 | # Request Message 15 | message = {"focusMode": "webSearch", "query": query, "history": [["human", query]]} 16 | 17 | response = requests.post(url, json=message) 18 | 19 | if response.status_code == 200: 20 | return response.json()["message"] 21 | elif response.status_code == 400: 22 | raise ValueError( 23 | "The request is malformed or missing required fields, such as FocusModel or query" 24 | ) 25 | else: 26 | raise ValueError("Internal Server Error") 27 | 28 | 29 | # Test Code 30 | if __name__ == "__main__": 31 | query = "What is Agent S?" 
32 | response = query_to_perplexica(query) 33 | print(response) 34 | -------------------------------------------------------------------------------- /gui_agents/s2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/agents/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/agents/manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from collections import defaultdict 4 | from typing import Dict, List, Optional, Tuple 5 | import platform 6 | 7 | from gui_agents.s2.agents.grounding import ACI 8 | from gui_agents.s2.core.module import BaseModule 9 | from gui_agents.s2.core.knowledge import KnowledgeBase 10 | from gui_agents.s2.memory.procedural_memory import PROCEDURAL_MEMORY 11 | from gui_agents.s2.core.engine import OpenAIEmbeddingEngine 12 | from gui_agents.s2.utils.common_utils import ( 13 | Dag, 14 | Node, 15 | calculate_tokens, 16 | call_llm_safe, 17 | parse_dag, 18 | ) 19 | 20 | logger = logging.getLogger("desktopenv.agent") 21 | 22 | NUM_IMAGE_TOKEN = 1105 # Value set of screen of size 1920x1080 for openai vision 23 | 24 | 25 | class Manager(BaseModule): 26 | def __init__( 27 | self, 28 | engine_params: Dict, 29 | grounding_agent: ACI, 30 | local_kb_path: str, 31 | embedding_engine=OpenAIEmbeddingEngine(), 32 | search_engine: Optional[str] = None, 33 | multi_round: bool = False, 34 | platform: str = platform.system().lower(), 35 | ): 36 | # TODO: move the prompt to Procedural Memory 37 | super().__init__(engine_params, platform) 38 | 39 | # Initialize the ACI 40 | self.grounding_agent = grounding_agent 41 | 42 | # Initialize the planner 43 | sys_prompt = PROCEDURAL_MEMORY.COMBINED_MANAGER_PROMPT 44 | 45 | self.generator_agent = self._create_agent(sys_prompt) 46 | 47 | # Initialize the remaining modules 48 | self.dag_translator_agent = self._create_agent( 49 | PROCEDURAL_MEMORY.DAG_TRANSLATOR_PROMPT 50 | ) 51 | self.narrative_summarization_agent = self._create_agent( 52 | PROCEDURAL_MEMORY.TASK_SUMMARIZATION_PROMPT 53 | ) 54 | self.episode_summarization_agent = self._create_agent( 55 | PROCEDURAL_MEMORY.SUBTASK_SUMMARIZATION_PROMPT 56 | ) 57 | 58 | self.local_kb_path = local_kb_path 59 | 60 | self.embedding_engine = embedding_engine 61 | self.knowledge_base = KnowledgeBase( 62 | embedding_engine=self.embedding_engine, 63 | local_kb_path=self.local_kb_path, 64 | platform=platform, 65 | engine_params=engine_params, 66 | ) 67 | 68 | self.planner_history = [] 69 | 70 | self.turn_count = 0 71 | self.search_engine = search_engine 72 | self.multi_round = multi_round 73 | 74 | def summarize_episode(self, trajectory): 75 | """Summarize the episode experience for lifelong learning reflection 76 | Args: 77 | trajectory: str: The episode experience to be summarized 78 | """ 79 | 80 | # Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars 81 | self.episode_summarization_agent.add_message(trajectory, role="user") 82 | subtask_summarization = 
call_llm_safe(self.episode_summarization_agent) 83 | self.episode_summarization_agent.add_message( 84 | subtask_summarization, role="assistant" 85 | ) 86 | 87 | return subtask_summarization 88 | 89 | def summarize_narrative(self, trajectory): 90 | """Summarize the narrative experience for lifelong learning reflection 91 | Args: 92 | trajectory: str: The narrative experience to be summarized 93 | """ 94 | # Create Reflection on whole trajectories for next round trial 95 | self.narrative_summarization_agent.add_message(trajectory, role="user") 96 | lifelong_learning_reflection = call_llm_safe(self.narrative_summarization_agent) 97 | 98 | return lifelong_learning_reflection 99 | 100 | def _generate_step_by_step_plan( 101 | self, 102 | observation: Dict, 103 | instruction: str, 104 | failed_subtask: Optional[Node] = None, 105 | completed_subtasks_list: List[Node] = [], 106 | remaining_subtasks_list: List[Node] = [], 107 | ) -> Tuple[Dict, str]: 108 | agent = self.grounding_agent 109 | 110 | # Converts a list of DAG Nodes into a natural langauge list 111 | def format_subtask_list(subtasks: List[Node]) -> str: 112 | res = "" 113 | for idx, node in enumerate(subtasks): 114 | res += f"{idx+1}. **{node.name}**:\n" 115 | bullets = re.split(r"(?<=[.!?;]) +", node.info) 116 | for bullet in bullets: 117 | res += f" - {bullet}\n" 118 | res += "\n" 119 | return res 120 | 121 | # Perform Retrieval only at the first planning step 122 | if self.turn_count == 0: 123 | 124 | self.search_query = self.knowledge_base.formulate_query( 125 | instruction, observation 126 | ) 127 | 128 | most_similar_task = "" 129 | retrieved_experience = "" 130 | integrated_knowledge = "" 131 | # Retrieve most similar narrative (task) experience 132 | most_similar_task, retrieved_experience = ( 133 | self.knowledge_base.retrieve_narrative_experience(instruction) 134 | ) 135 | logger.info( 136 | "SIMILAR TASK EXPERIENCE: %s", 137 | most_similar_task + "\n" + retrieved_experience.strip(), 138 | ) 139 | 140 | # Retrieve knowledge from the web if search_engine is provided 141 | if self.search_engine is not None: 142 | retrieved_knowledge = self.knowledge_base.retrieve_knowledge( 143 | instruction=instruction, 144 | search_query=self.search_query, 145 | search_engine=self.search_engine, 146 | ) 147 | logger.info("RETRIEVED KNOWLEDGE: %s", retrieved_knowledge) 148 | 149 | if retrieved_knowledge is not None: 150 | # Fuse the retrieved knowledge and experience 151 | integrated_knowledge = self.knowledge_base.knowledge_fusion( 152 | observation=observation, 153 | instruction=instruction, 154 | web_knowledge=retrieved_knowledge, 155 | similar_task=most_similar_task, 156 | experience=retrieved_experience, 157 | ) 158 | logger.info("INTEGRATED KNOWLEDGE: %s", integrated_knowledge) 159 | 160 | integrated_knowledge = integrated_knowledge or retrieved_experience 161 | 162 | # Add the integrated knowledge to the task instruction in the system prompt 163 | if integrated_knowledge: 164 | instruction += f"\nYou may refer to some retrieved knowledge if you think they are useful.{integrated_knowledge}" 165 | 166 | self.generator_agent.add_system_prompt( 167 | self.generator_agent.system_prompt.replace( 168 | "TASK_DESCRIPTION", instruction 169 | ) 170 | ) 171 | 172 | # Re-plan on failure case 173 | if failed_subtask: 174 | generator_message = ( 175 | f"The subtask {failed_subtask} cannot be completed. 
Please generate a new plan for the remainder of the trajectory.\n\n" 176 | f"Successfully Completed Subtasks:\n{format_subtask_list(completed_subtasks_list)}\n" 177 | ) 178 | # Re-plan on subtask completion case 179 | elif len(completed_subtasks_list) + len(remaining_subtasks_list) > 0: 180 | generator_message = ( 181 | "The current trajectory and desktop state is provided. Please revise the plan for the following trajectory.\n\n" 182 | f"Successfully Completed Subtasks:\n{format_subtask_list(completed_subtasks_list)}\n" 183 | f"Future Remaining Subtasks:\n{format_subtask_list(remaining_subtasks_list)}\n" 184 | ) 185 | # Initial plan case 186 | else: 187 | generator_message = "Please generate the initial plan for the task.\n" 188 | 189 | logger.info("GENERATOR MESSAGE: %s", generator_message) 190 | 191 | self.generator_agent.add_message( 192 | generator_message, 193 | image_content=observation.get("screenshot", None), 194 | role="user", 195 | ) 196 | 197 | logger.info("GENERATING HIGH LEVEL PLAN") 198 | 199 | plan = call_llm_safe(self.generator_agent) 200 | if plan == "": 201 | raise Exception("Plan Generation Failed - Fix the Prompt") 202 | 203 | logger.info("HIGH LEVEL STEP BY STEP PLAN: %s", plan) 204 | 205 | self.generator_agent.add_message(plan, role="assistant") 206 | self.planner_history.append(plan) 207 | self.turn_count += 1 208 | 209 | # Set Cost based on GPT-4o 210 | input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages) 211 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 212 | 213 | planner_info = { 214 | "search_query": self.search_query, 215 | "goal_plan": plan, 216 | "num_input_tokens_plan": input_tokens, 217 | "num_output_tokens_plan": output_tokens, 218 | "goal_plan_cost": cost, 219 | } 220 | 221 | assert type(plan) == str 222 | 223 | return planner_info, plan 224 | 225 | def _generate_dag(self, instruction: str, plan: str) -> Tuple[Dict, Dag]: 226 | # For the re-planning case, remove the prior input since this should only translate the new plan 227 | self.dag_translator_agent.reset() 228 | 229 | # Add initial instruction and plan to the agent's message history 230 | self.dag_translator_agent.add_message( 231 | f"Instruction: {instruction}\nPlan: {plan}", role="user" 232 | ) 233 | 234 | logger.info("GENERATING DAG") 235 | 236 | # Generate DAG 237 | dag_raw = call_llm_safe(self.dag_translator_agent) 238 | 239 | dag = parse_dag(dag_raw) 240 | 241 | logger.info("Generated DAG: %s", dag_raw) 242 | 243 | self.dag_translator_agent.add_message(dag_raw, role="assistant") 244 | 245 | input_tokens, output_tokens = calculate_tokens( 246 | self.dag_translator_agent.messages 247 | ) 248 | 249 | # Set Cost based on GPT-4o 250 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 251 | 252 | dag_info = { 253 | "dag": dag_raw, 254 | "num_input_tokens_dag": input_tokens, 255 | "num_output_tokens_dag": output_tokens, 256 | "dag_cost": cost, 257 | } 258 | 259 | assert type(dag) == Dag 260 | 261 | return dag_info, dag 262 | 263 | def _topological_sort(self, dag: Dag) -> List[Node]: 264 | """Topological sort of the DAG using DFS 265 | dag: Dag: Object representation of the DAG with nodes and edges 266 | """ 267 | 268 | def dfs(node_name, visited, stack): 269 | visited[node_name] = True 270 | for neighbor in adj_list[node_name]: 271 | if not visited[neighbor]: 272 | dfs(neighbor, visited, stack) 273 | stack.append(node_name) 274 | 275 | # Convert edges to adjacency list 276 | adj_list = defaultdict(list) 277 | for u, v in 
dag.edges: 278 | adj_list[u.name].append(v.name) 279 | 280 | visited = {node.name: False for node in dag.nodes} 281 | stack = [] 282 | 283 | for node in dag.nodes: 284 | if not visited[node.name]: 285 | dfs(node.name, visited, stack) 286 | 287 | # Return the nodes in topologically sorted order 288 | sorted_nodes = [ 289 | next(n for n in dag.nodes if n.name == name) for name in stack[::-1] 290 | ] 291 | return sorted_nodes 292 | 293 | def get_action_queue( 294 | self, 295 | instruction: str, 296 | observation: Dict, 297 | failed_subtask: Optional[Node] = None, 298 | completed_subtasks_list: List[Node] = [], 299 | remaining_subtasks_list: List[Node] = [], 300 | ): 301 | """Generate the action list based on the instruction 302 | instruction:str: Instruction for the task 303 | """ 304 | 305 | planner_info, plan = self._generate_step_by_step_plan( 306 | observation, 307 | instruction, 308 | failed_subtask, 309 | completed_subtasks_list, 310 | remaining_subtasks_list, 311 | ) 312 | 313 | # Generate the DAG 314 | dag_info, dag = self._generate_dag(instruction, plan) 315 | 316 | # Topological sort of the DAG 317 | action_queue = self._topological_sort(dag) 318 | 319 | planner_info.update(dag_info) 320 | 321 | return planner_info, action_queue 322 | -------------------------------------------------------------------------------- /gui_agents/s2/agents/worker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import textwrap 4 | from typing import Dict, List, Tuple 5 | import platform 6 | 7 | from gui_agents.s2.agents.grounding import ACI 8 | from gui_agents.s2.core.module import BaseModule 9 | from gui_agents.s2.core.knowledge import KnowledgeBase 10 | from gui_agents.s2.memory.procedural_memory import PROCEDURAL_MEMORY 11 | from gui_agents.s2.core.engine import OpenAIEmbeddingEngine 12 | from gui_agents.s2.utils.common_utils import ( 13 | Node, 14 | calculate_tokens, 15 | call_llm_safe, 16 | parse_single_code_from_string, 17 | sanitize_code, 18 | extract_first_agent_function, 19 | ) 20 | 21 | logger = logging.getLogger("desktopenv.agent") 22 | 23 | 24 | class Worker(BaseModule): 25 | def __init__( 26 | self, 27 | engine_params: Dict, 28 | grounding_agent: ACI, 29 | local_kb_path: str, 30 | embedding_engine=OpenAIEmbeddingEngine(), 31 | platform: str = platform.system().lower(), 32 | enable_reflection: bool = True, 33 | use_subtask_experience: bool = True, 34 | ): 35 | """ 36 | Worker receives a subtask list and active subtask and generates the next action for the to execute. 
37 | Args: 38 | engine_params: Dict 39 | Parameters for the multimodal engine 40 | grounding_agent: Agent 41 | The grounding agent to use 42 | local_kb_path: str 43 | Path to knowledge base 44 | platform: str 45 | OS platform the agent runs on (darwin, linux, windows) 46 | enable_reflection: bool 47 | Whether to enable reflection 48 | use_subtask_experience: bool 49 | Whether to use subtask experience 50 | """ 51 | super().__init__(engine_params, platform) 52 | 53 | self.grounding_agent = grounding_agent 54 | self.local_kb_path = local_kb_path 55 | self.embedding_engine = embedding_engine 56 | self.enable_reflection = enable_reflection 57 | self.use_subtask_experience = use_subtask_experience 58 | self.reset() 59 | 60 | def reset(self): 61 | if self.platform != "linux": 62 | skipped_actions = ["set_cell_values"] 63 | else: 64 | skipped_actions = [] 65 | 66 | sys_prompt = PROCEDURAL_MEMORY.construct_worker_procedural_memory( 67 | type(self.grounding_agent), skipped_actions=skipped_actions 68 | ).replace("CURRENT_OS", self.platform) 69 | 70 | self.generator_agent = self._create_agent(sys_prompt) 71 | self.reflection_agent = self._create_agent( 72 | PROCEDURAL_MEMORY.REFLECTION_ON_TRAJECTORY 73 | ) 74 | 75 | self.knowledge_base = KnowledgeBase( 76 | embedding_engine=self.embedding_engine, 77 | local_kb_path=self.local_kb_path, 78 | platform=self.platform, 79 | engine_params=self.engine_params, 80 | ) 81 | 82 | self.turn_count = 0 83 | self.worker_history = [] 84 | self.reflections = [] 85 | self.cost_this_turn = 0 86 | self.screenshot_inputs = [] 87 | self.planner_history = [] 88 | self.max_trajector_length = 8 89 | 90 | def flush_messages(self): 91 | # generator msgs are alternating [user, assistant], so 2 per round 92 | if len(self.generator_agent.messages) > 2 * self.max_trajector_length + 1: 93 | self.generator_agent.remove_message_at(1) 94 | self.generator_agent.remove_message_at(1) 95 | # reflector msgs are all [(user text, user image)], so 1 per round 96 | if len(self.reflection_agent.messages) > self.max_trajector_length + 1: 97 | self.reflection_agent.remove_message_at(1) 98 | 99 | def generate_next_action( 100 | self, 101 | instruction: str, 102 | search_query: str, 103 | subtask: str, 104 | subtask_info: Dict, 105 | future_tasks: List[Node], 106 | done_task: List[Node], 107 | obs: Dict, 108 | ) -> Tuple[Dict, List]: 109 | """ 110 | Predict the next action(s) based on the current observation. 111 | """ 112 | # Provide the top_app to the Grounding Agent to remove all other applications from the tree. 
At t=0, top_app is None 113 | agent = self.grounding_agent 114 | 115 | # Get RAG knowledge, only update system message at t=0 116 | if self.turn_count == 0: 117 | if self.use_subtask_experience: 118 | subtask_query_key = ( 119 | "Task:\n" 120 | + search_query 121 | + "\n\nSubtask: " 122 | + subtask 123 | + "\nSubtask Instruction: " 124 | + subtask_info 125 | ) 126 | retrieved_similar_subtask, retrieved_subtask_experience = ( 127 | self.knowledge_base.retrieve_episodic_experience(subtask_query_key) 128 | ) 129 | 130 | # Dirty fix to replace id with element description during subtask retrieval 131 | pattern = r"\(\d+" 132 | retrieved_subtask_experience = re.sub( 133 | pattern, "(element_description", retrieved_subtask_experience 134 | ) 135 | retrieved_subtask_experience = retrieved_subtask_experience.replace( 136 | "_id", "_description" 137 | ) 138 | 139 | logger.info( 140 | "SIMILAR SUBTASK EXPERIENCE: %s", 141 | retrieved_similar_subtask 142 | + "\n" 143 | + retrieved_subtask_experience.strip(), 144 | ) 145 | instruction += "\nYou may refer to some similar subtask experience if you think they are useful. {}".format( 146 | retrieved_similar_subtask + "\n" + retrieved_subtask_experience 147 | ) 148 | 149 | self.generator_agent.add_system_prompt( 150 | self.generator_agent.system_prompt.replace( 151 | "SUBTASK_DESCRIPTION", subtask 152 | ) 153 | .replace("TASK_DESCRIPTION", instruction) 154 | .replace("FUTURE_TASKS", ", ".join([f.name for f in future_tasks])) 155 | .replace("DONE_TASKS", ",".join(d.name for d in done_task)) 156 | ) 157 | 158 | # Reflection generation does not add its own response, it only gets the trajectory 159 | reflection = None 160 | if self.enable_reflection: 161 | # Load the initial subtask info 162 | if self.turn_count == 0: 163 | text_content = textwrap.dedent( 164 | f""" 165 | Subtask Description: {subtask} 166 | Subtask Information: {subtask_info} 167 | Current Trajectory below: 168 | """ 169 | ) 170 | updated_sys_prompt = ( 171 | self.reflection_agent.system_prompt + "\n" + text_content 172 | ) 173 | self.reflection_agent.add_system_prompt(updated_sys_prompt) 174 | self.reflection_agent.add_message( 175 | text_content="The initial screen is provided. No action has been taken yet.", 176 | image_content=obs["screenshot"], 177 | role="user", 178 | ) 179 | # Load the latest action 180 | else: 181 | text_content = self.clean_worker_generation_for_reflection( 182 | self.planner_history[-1] 183 | ) 184 | self.reflection_agent.add_message( 185 | text_content=text_content, 186 | image_content=obs["screenshot"], 187 | role="user", 188 | ) 189 | reflection = call_llm_safe(self.reflection_agent) 190 | self.reflections.append(reflection) 191 | logger.info("REFLECTION: %s", reflection) 192 | 193 | generator_message = ( 194 | f"\nYou may use this reflection on the previous action and overall trajectory: {reflection}\n" 195 | if reflection and self.turn_count > 0 196 | else "" 197 | ) + f"Text Buffer = [{','.join(agent.notes)}]." 
198 | 199 | # Only provide subinfo in the very first message to avoid over influence and redundancy 200 | if self.turn_count == 0: 201 | generator_message += f"Remember only complete the subtask: {subtask}\n" 202 | generator_message += f"You can use this extra information for completing the current subtask: {subtask_info}.\n" 203 | 204 | # logger.info("GENERATOR MESSAGE: %s", generator_message) 205 | 206 | self.generator_agent.add_message( 207 | generator_message, image_content=obs["screenshot"], role="user" 208 | ) 209 | 210 | plan = call_llm_safe(self.generator_agent) 211 | self.planner_history.append(plan) 212 | logger.info("PLAN: %s", plan) 213 | self.generator_agent.add_message(plan, role="assistant") 214 | 215 | # Calculate input/output tokens and gpt-4o cost 216 | input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages) 217 | cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000) 218 | self.cost_this_turn += cost 219 | logger.info("EXECUTOR COST: %s", self.cost_this_turn) 220 | 221 | # Use the DescriptionBasedACI to convert agent_action("desc") into agent_action([x, y]) 222 | try: 223 | agent.assign_coordinates(plan, obs) 224 | plan_code = parse_single_code_from_string(plan.split("Grounded Action")[-1]) 225 | plan_code = sanitize_code(plan_code) 226 | plan_code = extract_first_agent_function(plan_code) 227 | exec_code = eval(plan_code) 228 | except Exception as e: 229 | logger.error("Error in parsing plan code: %s", e) 230 | plan_code = "agent.wait(1.0)" 231 | exec_code = eval(plan_code) 232 | 233 | executor_info = { 234 | "current_subtask": subtask, 235 | "current_subtask_info": subtask_info, 236 | "executor_plan": plan, 237 | "plan_code": plan_code, 238 | "reflection": reflection, 239 | "num_input_tokens_executor": input_tokens, 240 | "num_output_tokens_executor": output_tokens, 241 | } 242 | self.turn_count += 1 243 | 244 | self.screenshot_inputs.append(obs["screenshot"]) 245 | self.flush_messages() 246 | 247 | return executor_info, [exec_code] 248 | 249 | # Removes the previous action verification, and removes any extraneous grounded actions 250 | def clean_worker_generation_for_reflection(self, worker_generation: str) -> str: 251 | # Remove the previous action verification 252 | res = worker_generation[worker_generation.find("(Screenshot Analysis)") :] 253 | action = extract_first_agent_function(worker_generation) 254 | # Cut off extra grounded actions 255 | res = res[: res.find("(Grounded Action)")] 256 | res += f"(Grounded Action)\n```python\n{action}\n```\n" 257 | return res 258 | -------------------------------------------------------------------------------- /gui_agents/s2/cli_app.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import io 4 | import logging 5 | import os 6 | import platform 7 | import pyautogui 8 | import sys 9 | import time 10 | 11 | from PIL import Image 12 | 13 | from gui_agents.s2.agents.grounding import OSWorldACI 14 | from gui_agents.s2.agents.agent_s import AgentS2 15 | 16 | current_platform = platform.system().lower() 17 | 18 | logger = logging.getLogger() 19 | logger.setLevel(logging.DEBUG) 20 | 21 | datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 22 | 23 | log_dir = "logs" 24 | os.makedirs(log_dir, exist_ok=True) 25 | 26 | file_handler = logging.FileHandler( 27 | os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" 28 | ) 29 | debug_handler = logging.FileHandler( 30 |
os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" 31 | ) 32 | stdout_handler = logging.StreamHandler(sys.stdout) 33 | sdebug_handler = logging.FileHandler( 34 | os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8" 35 | ) 36 | 37 | file_handler.setLevel(logging.INFO) 38 | debug_handler.setLevel(logging.DEBUG) 39 | stdout_handler.setLevel(logging.INFO) 40 | sdebug_handler.setLevel(logging.DEBUG) 41 | 42 | formatter = logging.Formatter( 43 | fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" 44 | ) 45 | file_handler.setFormatter(formatter) 46 | debug_handler.setFormatter(formatter) 47 | stdout_handler.setFormatter(formatter) 48 | sdebug_handler.setFormatter(formatter) 49 | 50 | stdout_handler.addFilter(logging.Filter("desktopenv")) 51 | sdebug_handler.addFilter(logging.Filter("desktopenv")) 52 | 53 | logger.addHandler(file_handler) 54 | logger.addHandler(debug_handler) 55 | logger.addHandler(stdout_handler) 56 | logger.addHandler(sdebug_handler) 57 | 58 | platform_os = platform.system() 59 | 60 | 61 | def show_permission_dialog(code: str, action_description: str): 62 | """Show a platform-specific permission dialog and return True if approved.""" 63 | if platform.system() == "Darwin": 64 | result = os.system( 65 | f'osascript -e \'display dialog "Do you want to execute this action?\n\n{code} which will try to {action_description}" with title "Action Permission" buttons {{"Cancel", "OK"}} default button "OK" cancel button "Cancel"\'' 66 | ) 67 | return result == 0 68 | elif platform.system() == "Linux": 69 | result = os.system( 70 | f'zenity --question --title="Action Permission" --text="Do you want to execute this action?\n\n{code}" --width=400 --height=200' 71 | ) 72 | return result == 0 73 | return False 74 | 75 | 76 | def scale_screen_dimensions(width: int, height: int, max_dim_size: int): 77 | scale_factor = min(max_dim_size / width, max_dim_size / height, 1) 78 | safe_width = int(width * scale_factor) 79 | safe_height = int(height * scale_factor) 80 | return safe_width, safe_height 81 | 82 | 83 | def run_agent(agent, instruction: str, scaled_width: int, scaled_height: int): 84 | obs = {} 85 | traj = "Task:\n" + instruction 86 | subtask_traj = "" 87 | for _ in range(15): 88 | # Get screen shot using pyautogui 89 | screenshot = pyautogui.screenshot() 90 | screenshot = screenshot.resize((scaled_width, scaled_height), Image.LANCZOS) 91 | 92 | # Save the screenshot to a BytesIO object 93 | buffered = io.BytesIO() 94 | screenshot.save(buffered, format="PNG") 95 | 96 | # Get the byte value of the screenshot 97 | screenshot_bytes = buffered.getvalue() 98 | # Convert to base64 string. 
99 | obs["screenshot"] = screenshot_bytes 100 | 101 | # Get next action code from the agent 102 | info, code = agent.predict(instruction=instruction, observation=obs) 103 | 104 | if "done" in code[0].lower() or "fail" in code[0].lower(): 105 | if platform.system() == "Darwin": 106 | os.system( 107 | f'osascript -e \'display dialog "Task Completed" with title "OpenACI Agent" buttons "OK" default button "OK"\'' 108 | ) 109 | elif platform.system() == "Linux": 110 | os.system( 111 | f'zenity --info --title="OpenACI Agent" --text="Task Completed" --width=200 --height=100' 112 | ) 113 | 114 | agent.update_narrative_memory(traj) 115 | break 116 | 117 | if "next" in code[0].lower(): 118 | continue 119 | 120 | if "wait" in code[0].lower(): 121 | time.sleep(5) 122 | continue 123 | 124 | else: 125 | time.sleep(1.0) 126 | print("EXECUTING CODE:", code[0]) 127 | 128 | # Ask for permission before executing 129 | exec(code[0]) 130 | time.sleep(1.0) 131 | 132 | # Update task and subtask trajectories and optionally the episodic memory 133 | traj += ( 134 | "\n\nReflection:\n" 135 | + str(info["reflection"]) 136 | + "\n\n----------------------\n\nPlan:\n" 137 | + info["executor_plan"] 138 | ) 139 | subtask_traj = agent.update_episodic_memory(info, subtask_traj) 140 | 141 | 142 | def main(): 143 | parser = argparse.ArgumentParser(description="Run AgentS2 with specified model.") 144 | parser.add_argument( 145 | "--provider", 146 | type=str, 147 | default="anthropic", 148 | help="Specify the provider to use (e.g., openai, anthropic, etc.)", 149 | ) 150 | parser.add_argument( 151 | "--model", 152 | type=str, 153 | default="claude-3-7-sonnet-20250219", 154 | help="Specify the model to use (e.g., gpt-4o)", 155 | ) 156 | parser.add_argument( 157 | "--model_url", 158 | type=str, 159 | default="", 160 | help="The URL of the main generation model API.", 161 | ) 162 | parser.add_argument( 163 | "--model_api_key", 164 | type=str, 165 | default="", 166 | help="The API key of the main generation model.", 167 | ) 168 | 169 | # Grounding model config option 1: API based 170 | parser.add_argument( 171 | "--grounding_model_provider", 172 | type=str, 173 | default="anthropic", 174 | help="Specify the provider to use for the grounding model (e.g., openai, anthropic, etc.)", 175 | ) 176 | parser.add_argument( 177 | "--grounding_model", 178 | type=str, 179 | default="claude-3-7-sonnet-20250219", 180 | help="Specify the grounding model to use (e.g., claude-3-5-sonnet-20241022)", 181 | ) 182 | parser.add_argument( 183 | "--grounding_model_resize_width", 184 | type=int, 185 | default=1366, 186 | help="Width of screenshot image after processor rescaling", 187 | ) 188 | parser.add_argument( 189 | "--grounding_model_resize_height", 190 | type=int, 191 | default=None, 192 | help="Height of screenshot image after processor rescaling", 193 | ) 194 | 195 | # Grounding model config option 2: Self-hosted endpoint based 196 | parser.add_argument( 197 | "--endpoint_provider", 198 | type=str, 199 | default="", 200 | help="Specify the endpoint provider for your grounding model, only HuggingFace TGI support for now", 201 | ) 202 | parser.add_argument( 203 | "--endpoint_url", 204 | type=str, 205 | default="", 206 | help="Specify the endpoint URL for your grounding model", 207 | ) 208 | parser.add_argument( 209 | "--endpoint_api_key", 210 | type=str, 211 | default="", 212 | help="The API key of the grounding model.", 213 | ) 214 | 215 | parser.add_argument( 216 | "--embedding_engine_type", 217 | type=str, 218 | default="openai", 219 | 
help="Specify the embedding engine type (supports openai, gemini)", 220 | ) 221 | 222 | args = parser.parse_args() 223 | assert ( 224 | args.grounding_model_provider and args.grounding_model 225 | ) or args.endpoint_url, "Error: No grounding model was provided. Either provide an API based model, or a self-hosted HuggingFace endpoint" 226 | 227 | # Re-scales screenshot size to ensure it fits in UI-TARS context limit 228 | screen_width, screen_height = pyautogui.size() 229 | scaled_width, scaled_height = scale_screen_dimensions( 230 | screen_width, screen_height, max_dim_size=2400 231 | ) 232 | 233 | # Load the general engine params 234 | engine_params = { 235 | "engine_type": args.provider, 236 | "model": args.model, 237 | "base_url": args.model_url, 238 | "api_key": args.model_api_key, 239 | } 240 | 241 | # Load the grounding engine from a HuggingFace TGI endpoint 242 | if args.endpoint_url: 243 | engine_params_for_grounding = { 244 | "engine_type": args.endpoint_provider, 245 | "base_url": args.endpoint_url, 246 | "api_key": args.endpoint_api_key, 247 | } 248 | else: 249 | grounding_height = args.grounding_model_resize_height 250 | # If not provided, use the aspect ratio of the screen to compute the height 251 | if grounding_height is None: 252 | grounding_height = ( 253 | screen_height * args.grounding_model_resize_width / screen_width 254 | ) 255 | 256 | engine_params_for_grounding = { 257 | "engine_type": args.grounding_model_provider, 258 | "model": args.grounding_model, 259 | "grounding_width": args.grounding_model_resize_width, 260 | "grounding_height": grounding_height, 261 | } 262 | 263 | grounding_agent = OSWorldACI( 264 | platform=current_platform, 265 | engine_params_for_generation=engine_params, 266 | engine_params_for_grounding=engine_params_for_grounding, 267 | width=screen_width, 268 | height=screen_height, 269 | ) 270 | 271 | agent = AgentS2( 272 | engine_params, 273 | grounding_agent, 274 | platform=current_platform, 275 | action_space="pyautogui", 276 | observation_type="mixed", 277 | search_engine=None, 278 | embedding_engine_type=args.embedding_engine_type, 279 | ) 280 | 281 | while True: 282 | query = input("Query: ") 283 | 284 | agent.reset() 285 | 286 | # Run the agent on your own device 287 | run_agent(agent, query, scaled_width, scaled_height) 288 | 289 | response = input("Would you like to provide another query? 
(y/n): ") 290 | if response.lower() != "y": 291 | break 292 | 293 | 294 | if __name__ == "__main__": 295 | main() 296 | -------------------------------------------------------------------------------- /gui_agents/s2/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/core/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/core/mllm.py: -------------------------------------------------------------------------------- 1 | import base64 2 | 3 | import numpy as np 4 | 5 | from gui_agents.s2.core.engine import ( 6 | LMMEngineAnthropic, 7 | LMMEngineAzureOpenAI, 8 | LMMEngineHuggingFace, 9 | LMMEngineOpenAI, 10 | LMMEngineOpenRouter, 11 | LMMEnginevLLM, 12 | LMMEngineGemini, 13 | ) 14 | 15 | 16 | class LMMAgent: 17 | def __init__(self, engine_params=None, system_prompt=None, engine=None): 18 | if engine is None: 19 | if engine_params is not None: 20 | engine_type = engine_params.get("engine_type") 21 | if engine_type == "openai": 22 | self.engine = LMMEngineOpenAI(**engine_params) 23 | elif engine_type == "anthropic": 24 | self.engine = LMMEngineAnthropic(**engine_params) 25 | elif engine_type == "azure": 26 | self.engine = LMMEngineAzureOpenAI(**engine_params) 27 | elif engine_type == "vllm": 28 | self.engine = LMMEnginevLLM(**engine_params) 29 | elif engine_type == "huggingface": 30 | self.engine = LMMEngineHuggingFace(**engine_params) 31 | elif engine_type == "gemini": 32 | self.engine = LMMEngineGemini(**engine_params) 33 | elif engine_type == "open_router": 34 | self.engine = LMMEngineOpenRouter(**engine_params) 35 | else: 36 | raise ValueError("engine_type is not supported") 37 | else: 38 | raise ValueError("engine_params must be provided") 39 | else: 40 | self.engine = engine 41 | 42 | self.messages = [] # Empty messages 43 | 44 | if system_prompt: 45 | self.add_system_prompt(system_prompt) 46 | else: 47 | self.add_system_prompt("You are a helpful assistant.") 48 | 49 | def encode_image(self, image_content): 50 | # if image_content is a path to an image file, check type of the image_content to verify 51 | if isinstance(image_content, str): 52 | with open(image_content, "rb") as image_file: 53 | return base64.b64encode(image_file.read()).decode("utf-8") 54 | else: 55 | return base64.b64encode(image_content).decode("utf-8") 56 | 57 | def reset( 58 | self, 59 | ): 60 | 61 | self.messages = [ 62 | { 63 | "role": "system", 64 | "content": [{"type": "text", "text": self.system_prompt}], 65 | } 66 | ] 67 | 68 | def add_system_prompt(self, system_prompt): 69 | self.system_prompt = system_prompt 70 | if len(self.messages) > 0: 71 | self.messages[0] = { 72 | "role": "system", 73 | "content": [{"type": "text", "text": self.system_prompt}], 74 | } 75 | else: 76 | self.messages.append( 77 | { 78 | "role": "system", 79 | "content": [{"type": "text", "text": self.system_prompt}], 80 | } 81 | ) 82 | 83 | def remove_message_at(self, index): 84 | """Remove a message at a given index""" 85 | if index < len(self.messages): 86 | self.messages.pop(index) 87 | 88 | def replace_message_at( 89 | self, index, text_content, image_content=None, image_detail="high" 90 | ): 91 | """Replace a message at a given index""" 92 | if index < len(self.messages): 93 | self.messages[index] = { 94 | "role": self.messages[index]["role"], 95 | "content": [{"type": "text", "text": text_content}], 96 | } 97 | if 
image_content: 98 | base64_image = self.encode_image(image_content) 99 | self.messages[index]["content"].append( 100 | { 101 | "type": "image_url", 102 | "image_url": { 103 | "url": f"data:image/png;base64,{base64_image}", 104 | "detail": image_detail, 105 | }, 106 | } 107 | ) 108 | 109 | def add_message( 110 | self, 111 | text_content, 112 | image_content=None, 113 | role=None, 114 | image_detail="high", 115 | put_text_last=False, 116 | ): 117 | """Add a new message to the list of messages""" 118 | 119 | # API-style inference from OpenAI and AzureOpenAI 120 | if isinstance( 121 | self.engine, 122 | ( 123 | LMMEngineOpenAI, 124 | LMMEngineAzureOpenAI, 125 | LMMEngineHuggingFace, 126 | LMMEngineGemini, 127 | LMMEngineOpenRouter, 128 | ), 129 | ): 130 | # infer role from previous message 131 | if role != "user": 132 | if self.messages[-1]["role"] == "system": 133 | role = "user" 134 | elif self.messages[-1]["role"] == "user": 135 | role = "assistant" 136 | elif self.messages[-1]["role"] == "assistant": 137 | role = "user" 138 | 139 | message = { 140 | "role": role, 141 | "content": [{"type": "text", "text": text_content}], 142 | } 143 | 144 | if isinstance(image_content, np.ndarray) or image_content: 145 | # Check if image_content is a list or a single image 146 | if isinstance(image_content, list): 147 | # If image_content is a list of images, loop through each image 148 | for image in image_content: 149 | base64_image = self.encode_image(image) 150 | message["content"].append( 151 | { 152 | "type": "image_url", 153 | "image_url": { 154 | "url": f"data:image/png;base64,{base64_image}", 155 | "detail": image_detail, 156 | }, 157 | } 158 | ) 159 | else: 160 | # If image_content is a single image, handle it directly 161 | base64_image = self.encode_image(image_content) 162 | message["content"].append( 163 | { 164 | "type": "image_url", 165 | "image_url": { 166 | "url": f"data:image/png;base64,{base64_image}", 167 | "detail": image_detail, 168 | }, 169 | } 170 | ) 171 | 172 | # Rotate text to be the last message if desired 173 | if put_text_last: 174 | text_content = message["content"].pop(0) 175 | message["content"].append(text_content) 176 | 177 | self.messages.append(message) 178 | 179 | # For API-style inference from Anthropic 180 | elif isinstance(self.engine, LMMEngineAnthropic): 181 | # infer role from previous message 182 | if role != "user": 183 | if self.messages[-1]["role"] == "system": 184 | role = "user" 185 | elif self.messages[-1]["role"] == "user": 186 | role = "assistant" 187 | elif self.messages[-1]["role"] == "assistant": 188 | role = "user" 189 | 190 | message = { 191 | "role": role, 192 | "content": [{"type": "text", "text": text_content}], 193 | } 194 | 195 | if image_content: 196 | # Check if image_content is a list or a single image 197 | if isinstance(image_content, list): 198 | # If image_content is a list of images, loop through each image 199 | for image in image_content: 200 | base64_image = self.encode_image(image) 201 | message["content"].append( 202 | { 203 | "type": "image", 204 | "source": { 205 | "type": "base64", 206 | "media_type": "image/png", 207 | "data": base64_image, 208 | }, 209 | } 210 | ) 211 | else: 212 | # If image_content is a single image, handle it directly 213 | base64_image = self.encode_image(image_content) 214 | message["content"].append( 215 | { 216 | "type": "image", 217 | "source": { 218 | "type": "base64", 219 | "media_type": "image/png", 220 | "data": base64_image, 221 | }, 222 | } 223 | ) 224 | self.messages.append(message) 225 | 226 | 
# Locally hosted vLLM model inference 227 | elif isinstance(self.engine, LMMEnginevLLM): 228 | # infer role from previous message 229 | if role != "user": 230 | if self.messages[-1]["role"] == "system": 231 | role = "user" 232 | elif self.messages[-1]["role"] == "user": 233 | role = "assistant" 234 | elif self.messages[-1]["role"] == "assistant": 235 | role = "user" 236 | 237 | message = { 238 | "role": role, 239 | "content": [{"type": "text", "text": text_content}], 240 | } 241 | 242 | if image_content: 243 | # Check if image_content is a list or a single image 244 | if isinstance(image_content, list): 245 | # If image_content is a list of images, loop through each image 246 | for image in image_content: 247 | base64_image = self.encode_image(image) 248 | message["content"].append( 249 | { 250 | "type": "image_url", 251 | "image_url": { 252 | "url": f"data:image;base64,{base64_image}" 253 | }, 254 | } 255 | ) 256 | else: 257 | # If image_content is a single image, handle it directly 258 | base64_image = self.encode_image(image_content) 259 | message["content"].append( 260 | { 261 | "type": "image_url", 262 | "image_url": {"url": f"data:image;base64,{base64_image}"}, 263 | } 264 | ) 265 | 266 | self.messages.append(message) 267 | else: 268 | raise ValueError("engine_type is not supported") 269 | 270 | def get_response( 271 | self, 272 | user_message=None, 273 | messages=None, 274 | temperature=0.0, 275 | max_new_tokens=None, 276 | **kwargs, 277 | ): 278 | """Generate the next response based on previous messages""" 279 | if messages is None: 280 | messages = self.messages 281 | if user_message: 282 | messages.append( 283 | {"role": "user", "content": [{"type": "text", "text": user_message}]} 284 | ) 285 | 286 | return self.engine.generate( 287 | messages, 288 | temperature=temperature, 289 | max_new_tokens=max_new_tokens, 290 | **kwargs, 291 | ) 292 | -------------------------------------------------------------------------------- /gui_agents/s2/core/module.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | from gui_agents.s2.core.mllm import LMMAgent 3 | 4 | 5 | class BaseModule: 6 | def __init__(self, engine_params: Dict, platform: str): 7 | self.engine_params = engine_params 8 | self.platform = platform 9 | 10 | def _create_agent( 11 | self, system_prompt: str = None, engine_params: Optional[Dict] = None 12 | ) -> LMMAgent: 13 | """Create a new LMMAgent instance""" 14 | agent = LMMAgent(engine_params or self.engine_params) 15 | if system_prompt: 16 | agent.add_system_prompt(system_prompt) 17 | return agent 18 | -------------------------------------------------------------------------------- /gui_agents/s2/memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/memory/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/gui_agents/s2/utils/__init__.py -------------------------------------------------------------------------------- /gui_agents/s2/utils/common_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from typing import List 4 | import time 
5 | import tiktoken 6 | 7 | from typing import Tuple, List, Union, Dict 8 | 9 | from pydantic import BaseModel, ValidationError 10 | 11 | import pickle 12 | 13 | 14 | class Node(BaseModel): 15 | name: str 16 | info: str 17 | 18 | 19 | class Dag(BaseModel): 20 | nodes: List[Node] 21 | edges: List[List[Node]] 22 | 23 | 24 | NUM_IMAGE_TOKEN = 1105 # Value set of screen of size 1920x1080 for openai vision 25 | 26 | 27 | def call_llm_safe(agent) -> Union[str, Dag]: 28 | # Retry if fails 29 | max_retries = 3 # Set the maximum number of retries 30 | attempt = 0 31 | response = "" 32 | while attempt < max_retries: 33 | try: 34 | response = agent.get_response() 35 | break # If successful, break out of the loop 36 | except Exception as e: 37 | attempt += 1 38 | print(f"Attempt {attempt} failed: {e}") 39 | if attempt == max_retries: 40 | print("Max retries reached. Handling failure.") 41 | time.sleep(1.0) 42 | return response 43 | 44 | 45 | def calculate_tokens(messages, num_image_token=NUM_IMAGE_TOKEN) -> Tuple[int, int]: 46 | 47 | num_input_images = 0 48 | output_message = messages[-1] 49 | 50 | input_message = messages[:-1] 51 | 52 | input_string = """""" 53 | for message in input_message: 54 | input_string += message["content"][0]["text"] + "\n" 55 | if len(message["content"]) > 1: 56 | num_input_images += 1 57 | 58 | input_text_tokens = get_input_token_length(input_string) 59 | 60 | input_image_tokens = num_image_token * num_input_images 61 | 62 | output_tokens = get_input_token_length(output_message["content"][0]["text"]) 63 | 64 | return (input_text_tokens + input_image_tokens), output_tokens 65 | 66 | 67 | # Code based on https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/agent.py 68 | 69 | 70 | def parse_dag(text): 71 | pattern = r"(.*?)" 72 | match = re.search(pattern, text, re.DOTALL) 73 | if match: 74 | json_str = match.group(1) 75 | try: 76 | json_data = json.loads(json_str) 77 | return Dag(**json_data["dag"]) 78 | except json.JSONDecodeError: 79 | print("Error: Invalid JSON") 80 | return None 81 | except KeyError: 82 | print("Error: 'dag' key not found in JSON") 83 | return None 84 | except ValidationError as e: 85 | print(f"Error: Invalid data structure - {e}") 86 | return None 87 | else: 88 | print("Error: JSON not found") 89 | return None 90 | 91 | 92 | def parse_dag(text): 93 | """ 94 | Try extracting JSON from tags first; 95 | if not found, try ```json … ``` Markdown fences. 
96 | """ 97 | 98 | def _extract(pattern): 99 | m = re.search(pattern, text, re.DOTALL) 100 | return m.group(1).strip() if m else None 101 | 102 | # 1) look for 103 | json_str = _extract(r"(.*?)") 104 | # 2) fallback to ```json … ``` 105 | if json_str is None: 106 | json_str = _extract(r"```json\s*(.*?)\s*```") 107 | 108 | if json_str is None: 109 | print("Error: JSON not found in either tags or ```json``` fence") 110 | return None 111 | 112 | try: 113 | payload = json.loads(json_str) 114 | except json.JSONDecodeError as e: 115 | print(f"Error: Invalid JSON ({e})") 116 | return None 117 | 118 | if "dag" not in payload: 119 | print("Error: 'dag' key not found in JSON") 120 | return None 121 | 122 | try: 123 | return Dag(**payload["dag"]) 124 | except ValidationError as e: 125 | print(f"Error: Invalid data structure - {e}") 126 | return None 127 | 128 | 129 | def parse_single_code_from_string(input_string): 130 | input_string = input_string.strip() 131 | if input_string.strip() in ["WAIT", "DONE", "FAIL"]: 132 | return input_string.strip() 133 | 134 | # This regular expression will match both ```code``` and ```python code``` 135 | # and capture the `code` part. It uses a non-greedy match for the content inside. 136 | pattern = r"```(?:\w+\s+)?(.*?)```" 137 | # Find all non-overlapping matches in the string 138 | matches = re.findall(pattern, input_string, re.DOTALL) 139 | 140 | # The regex above captures the content inside the triple backticks. 141 | # The `re.DOTALL` flag allows the dot `.` to match newline characters as well, 142 | # so the code inside backticks can span multiple lines. 143 | 144 | # matches now contains all the captured code snippets 145 | 146 | codes = [] 147 | 148 | for match in matches: 149 | match = match.strip() 150 | commands = [ 151 | "WAIT", 152 | "DONE", 153 | "FAIL", 154 | ] # fixme: updates this part when we have more commands 155 | 156 | if match in commands: 157 | codes.append(match.strip()) 158 | elif match.split("\n")[-1] in commands: 159 | if len(match.split("\n")) > 1: 160 | codes.append("\n".join(match.split("\n")[:-1])) 161 | codes.append(match.split("\n")[-1]) 162 | else: 163 | codes.append(match) 164 | 165 | if len(codes) <= 0: 166 | return "fail" 167 | return codes[0] 168 | 169 | 170 | def get_input_token_length(input_string): 171 | enc = tiktoken.encoding_for_model("gpt-4") 172 | tokens = enc.encode(input_string) 173 | return len(tokens) 174 | 175 | 176 | def sanitize_code(code): 177 | # This pattern captures the outermost double-quoted text 178 | if "\n" in code: 179 | pattern = r'(".*?")' 180 | # Find all matches in the text 181 | matches = re.findall(pattern, code, flags=re.DOTALL) 182 | if matches: 183 | # Replace the first occurrence only 184 | first_match = matches[0] 185 | code = code.replace(first_match, f'"""{first_match[1:-1]}"""', 1) 186 | return code 187 | 188 | 189 | def extract_first_agent_function(code_string): 190 | # Regular expression pattern to match 'agent' functions with any arguments, including nested parentheses 191 | pattern = r'agent\.[a-zA-Z_]+\((?:[^()\'"]|\'[^\']*\'|"[^"]*")*\)' 192 | 193 | # Find all matches in the string 194 | matches = re.findall(pattern, code_string) 195 | 196 | # Return the first match if found, otherwise return None 197 | return matches[0] if matches else None 198 | 199 | 200 | def load_knowledge_base(kb_path: str) -> Dict: 201 | try: 202 | with open(kb_path, "r") as f: 203 | return json.load(f) 204 | except Exception as e: 205 | print(f"Error loading knowledge base: {e}") 206 | return {} 207 | 208 | 
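# Illustrative usage of the parsing helpers above (example values only, not taken
# from a real agent run):
#
#   parse_single_code_from_string('```python\nagent.click("OK button", 1)\n```')
#       -> 'agent.click("OK button", 1)'
#   extract_first_agent_function('exec_code = agent.click("OK button", 1); agent.wait(1.0)')
#       -> 'agent.click("OK button", 1)'
#   sanitize_code('agent.type("line one\nline two")')
#       -> 'agent.type("""line one\nline two""")'
#
# load_knowledge_base above and load_embeddings below fail soft: on any read error
# they print the problem and return an empty dict; save_embeddings likewise only
# prints a message if the write fails.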
209 | def load_embeddings(embeddings_path: str) -> Dict: 210 | try: 211 | with open(embeddings_path, "rb") as f: 212 | return pickle.load(f) 213 | except Exception as e: 214 | print(f"Error loading embeddings: {e}") 215 | return {} 216 | 217 | 218 | def save_embeddings(embeddings_path: str, embeddings: Dict): 219 | try: 220 | with open(embeddings_path, "wb") as f: 221 | pickle.dump(embeddings, f) 222 | except Exception as e: 223 | print(f"Error saving embeddings: {e}") 224 | -------------------------------------------------------------------------------- /gui_agents/s2/utils/query_perplexica.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | 4 | 5 | def query_to_perplexica(query): 6 | # Retrieve the URL from an environment variable 7 | url = os.getenv("PERPLEXICA_URL") 8 | if not url: 9 | raise ValueError( 10 | "PERPLEXICA_URL environment variable not set. It may take the form: 'http://localhost:{port}/api/search'. The port number is set in the config.toml in the Perplexica directory." 11 | ) 12 | 13 | # Request Message 14 | message = {"focusMode": "webSearch", "query": query, "history": [["human", query]]} 15 | 16 | response = requests.post(url, json=message) 17 | 18 | if response.status_code == 200: 19 | return response.json()["message"] 20 | elif response.status_code == 400: 21 | raise ValueError( 22 | "The request is malformed or missing required fields, such as FocusModel or query" 23 | ) 24 | else: 25 | raise ValueError("Internal Server Error") 26 | 27 | 28 | # Test Code 29 | if __name__ == "__main__": 30 | query = "What is Agent S?" 31 | response = query_to_perplexica(query) 32 | print(response) 33 | -------------------------------------------------------------------------------- /gui_agents/utils.py: -------------------------------------------------------------------------------- 1 | """General utility.""" 2 | 3 | import platform 4 | import requests 5 | import zipfile 6 | import io 7 | import os 8 | 9 | 10 | def download_kb_data( 11 | version="s2", 12 | release_tag="v0.2.2", 13 | download_dir="kb_data", 14 | platform=platform.system().lower(), 15 | ): 16 | """Download and extract the appropriate KB ZIP file for the current OS. 17 | 18 | Args: 19 | version (str): Prefix in the asset name (e.g., "s1" or "s2") 20 | release_tag (str): Tag of the release that has the assets (e.g., "v0.2.2") 21 | download_dir (str): Where to extract the downloaded files 22 | platform (str): OS (e.g., "windows", "darwin", "linux") 23 | """ 24 | # Detect OS 25 | if platform not in ["windows", "darwin", "linux"]: 26 | raise RuntimeError(f"Unsupported OS: {platform}") 27 | 28 | # Build asset filename, e.g. "s1_windows.zip" or "s1_darwin.zip" 29 | asset_name = f"{version}_{platform}.zip" 30 | 31 | download_url = f"https://github.com/simular-ai/Agent-S/releases/download/{release_tag}/{asset_name}" 32 | 33 | # Make sure our output directory exists 34 | os.makedirs(download_dir, exist_ok=True) 35 | 36 | print(f"Downloading {asset_name} from {download_url} ...") 37 | response = requests.get(download_url) 38 | if response.status_code != 200: 39 | raise RuntimeError( 40 | f"Failed to download {asset_name}. 
" 41 | f"HTTP status: {response.status_code} - {response.reason}" 42 | ) 43 | 44 | # Extract the ZIP in-memory 45 | zip_data = io.BytesIO(response.content) 46 | with zipfile.ZipFile(zip_data, "r") as zip_ref: 47 | zip_ref.extractall(download_dir) 48 | 49 | print(f"Extracted {asset_name} to ./{download_dir}") 50 | -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/.DS_Store -------------------------------------------------------------------------------- /images/agent_s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s.png -------------------------------------------------------------------------------- /images/agent_s2_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s2_architecture.png -------------------------------------------------------------------------------- /images/agent_s2_osworld_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s2_osworld_result.png -------------------------------------------------------------------------------- /images/agent_s2_teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s2_teaser.png -------------------------------------------------------------------------------- /images/agent_s_architecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/agent_s_architecture.pdf -------------------------------------------------------------------------------- /images/osworld_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/osworld_result.png -------------------------------------------------------------------------------- /images/results.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/results.pdf -------------------------------------------------------------------------------- /images/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/results.png -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/teaser.png -------------------------------------------------------------------------------- /images/windows_result.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/simular-ai/Agent-S/6877ed5be5c7b3c90680268b5d65ab2a23900fd9/images/windows_result.png -------------------------------------------------------------------------------- /models.md: -------------------------------------------------------------------------------- 1 | We support the following APIs for MLLM inference: OpenAI, Anthropic, Gemini, Azure OpenAI, vLLM for local models, and Open Router. To use these APIs, you need to set the corresponding environment variables: 2 | 3 | 1. OpenAI 4 | 5 | ``` 6 | export OPENAI_API_KEY= 7 | ``` 8 | 9 | 2. Anthropic 10 | 11 | ``` 12 | export ANTHROPIC_API_KEY= 13 | ``` 14 | 15 | 3. Gemini 16 | 17 | ``` 18 | export GEMINI_API_KEY= 19 | export GEMINI_ENDPOINT_URL="https://generativelanguage.googleapis.com/v1beta/openai/" 20 | ``` 21 | 22 | 4. OpenAI on Azure 23 | 24 | ``` 25 | export AZURE_OPENAI_API_BASE= 26 | export AZURE_OPENAI_API_KEY= 27 | ``` 28 | 29 | 5. vLLM for Local Models 30 | 31 | ``` 32 | export vLLM_ENDPOINT_URL= 33 | ``` 34 | 35 | Alternatively you can directly pass the API keys into the engine_params argument while instantating the agent. 36 | 37 | 6. Open Router 38 | 39 | ``` 40 | export OPENROUTER_API_KEY= 41 | export OPEN_ROUTER_ENDPOINT_URL="https://openrouter.ai/api/v1" 42 | ``` 43 | 44 | ```python 45 | from gui_agents.s2.agents.agent_s import AgentS2 46 | 47 | engine_params = { 48 | "engine_type": 'anthropic', # Allowed Values: 'openai', 'anthropic', 'gemini', 'azure_openai', 'vllm', 'open_router' 49 | "model": 'claude-3-5-sonnet-20240620', # Allowed Values: Any Vision and Language Model from the supported APIs 50 | } 51 | agent = AgentS2( 52 | engine_params, 53 | grounding_agent, 54 | platform=current_platform, 55 | action_space="pyautogui", 56 | observation_type="mixed", 57 | search_engine="LLM" 58 | ) 59 | ``` 60 | 61 | To use the underlying Multimodal Agent (LMMAgent) which wraps LLMs with message handling functionality, you can use the following code snippet: 62 | 63 | ```python 64 | from gui_agents.core.mllm import LMMAgent 65 | 66 | engine_params = { 67 | "engine_type": 'anthropic', # Allowed Values: 'openai', 'anthropic', 'gemini', 'azure_openai', 'vllm', 'open_router' 68 | "model": 'claude-3-5-sonnet-20240620', # Allowed Values: Any Vision and Language Model from the supported APIs 69 | } 70 | agent = LMMAgent( 71 | engine_params=engine_params, 72 | ) 73 | ``` 74 | 75 | The `AgentS2` also utilizes this `LMMAgent` internally. -------------------------------------------------------------------------------- /osworld_setup/s1/OSWorld.md: -------------------------------------------------------------------------------- 1 | # Deplying Agent-S in OSWorld 2 | 3 | # Step 1: Set up Agent S 4 | 5 | Follow the [README.md](https://github.com/simular-ai/Agent-S/blob/main/gui_agents/s1/README.md) to set up Agent S. 6 | 7 | # Step 2: Copying Over Run Files 8 | 9 | If you haven't already, please follow the [OSWorld environment setup](https://github.com/xlang-ai/OSWorld/blob/main/README.md). We've provided the relevant OSWorld run files for evaluation in this `osworld_setup` folder. Please copy this over to your OSWorld folder. 10 | 11 | We have set the latest Agent S to use the latest Ubuntu VM image from OSWorld. However, our experiments are based on the older version of the VM. To reproduce the results, set the vm_version argument to 'old' while instantiating the agent. 
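For reference, here is a minimal sketch of that override (illustrative only; the surrounding arguments mirror `osworld_setup/s1/run.py`, and the exact place where `vm_version` is accepted may differ between releases, so check the constructor signatures in your checkout):

```
from gui_agents.s1.core.AgentS import GraphSearchAgent
from gui_agents.s1.aci.LinuxOSACI import LinuxACI

engine_params = {"engine_type": "openai", "model": "gpt-4o"}
grounding_agent = LinuxACI()

agent = GraphSearchAgent(
    engine_params,
    grounding_agent,
    platform="linux",
    action_space="pyautogui",
    observation_type="mixed",
    vm_version="old",  # reproduce the reported results on the older OSWorld VM image
)
```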
12 | 13 | 14 | # Step 3: Best Practices 15 | 16 | At this point, you will have set up the Agent-S and OSWorld environments and the VMWare Workstation Pro application. Below, we'll list some best practices, and common problems and their fixes. 17 | 18 | --- 19 | 20 | ``` 21 | from desktop_env.desktop_env import DesktopEnv 22 | 23 | example = { 24 | "id": "94d95f96-9699-4208-98ba-3c3119edf9c2", 25 | "instruction": "I want to install Spotify on my current system. Could you please help me?", 26 | "config": [ 27 | { 28 | "type": "execute", 29 | "parameters": { 30 | "command": [ 31 | "python", 32 | "-c", 33 | "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" 34 | ] 35 | } 36 | } 37 | ], 38 | "evaluator": { 39 | "func": "check_include_exclude", 40 | "result": { 41 | "type": "vm_command_line", 42 | "command": "which spotify" 43 | }, 44 | "expected": { 45 | "type": "rule", 46 | "rules": { 47 | "include": ["spotify"], 48 | "exclude": ["not found"] 49 | } 50 | } 51 | } 52 | } 53 | 54 | env = DesktopEnv(action_space="pyautogui") 55 | 56 | obs = env.reset(task_config=example) 57 | obs, reward, done, info = env.step("pyautogui.rightClick()") 58 | ``` 59 | 60 | The code above will boot up a VM and restart it. If, for whatever reason, running the starter code below leads to an infinitely long run time, cancel out of the VM. 61 | You should then see: 62 | 63 | ``` 64 | parent/ 65 | Agent-S/ 66 | OSWorld/ 67 | vmware_vm_data/ 68 | Ubuntu0/ 69 | *.lck 70 | *.vmem 71 | ... 72 | ... 73 | UbuntuX/ 74 | ``` 75 | 76 | If you happen to have any `*.lck` folder in your VM's folder, be sure to delete them. Every time you are powering on the VM from creating a new `DesktopEnv` instance, you need to 77 | delete the `*.lck` folders first. If your VM is already powered on, and your session (in a Jupyter Notebook, for example) crashes, you can keep the `*.lck` files and just re-instantiate the `DesktopEnv` instance. I'd also suggest using just a single VM (as a VM takes up a lot of space!). 78 | 79 | --- 80 | 81 | If even after rerunning the code and deleting the `*.lck` files don't work, then you should try passing in the `path_to_vm` explicitly to the `DesktopEnv` class. 82 | 83 | ``` 84 | env = DesktopEnv(action_space="pyautogui", headless=False, require_terminal=True, path_to_vm=) 85 | ``` 86 | 87 | Pass the absolute path to your VM's (Ubuntu0) `.vmx` file. This file is located here: 88 | 89 | 90 | ``` 91 | parent/ 92 | Agent-S/ 93 | OSWorld/ 94 | vmware_vm_data/ 95 | Ubuntu0/ 96 | *.lck 97 | *.vmem 98 | ... 99 | *.vmx 100 | ... 101 | UbuntuX/ 102 | ``` 103 | 104 | 📌 **Note**: If you are testing on the `os` domain, there is an [issue](https://github.com/asweigart/pyautogui/issues/198#issuecomment-1465268536) with `pyautogui`. A *hacky* way to solve this is to, inside the VM, locate where the `pyautogui` module is installed and open the `__init__.py` located under the `pyautogui` folder and remove the "<" in the `set(...)` within the following function: 105 | 106 | ``` 107 | def isShiftCharacter(character): 108 | """ 109 | Returns True if the ``character`` is a keyboard key that would require the shift key to be held down, such as 110 | uppercase letters or the symbols on the keyboard's number row. 111 | """ 112 | # NOTE TODO - This will be different for non-qwerty keyboards. 
113 | return character.isupper() or character in set('~!@#$%^&*()_+{}|:"<>?') 114 | ``` 115 | 116 | 📌 **Note**: If in case, your VM encounters an issue with "The root file system on requires a manual fsck", reset the VM to the previous snapshot. 117 | 118 | With these changes, you should be able to get up and running with VMWare, DesktopEnv, and OSWorld! 😊 -------------------------------------------------------------------------------- /osworld_setup/s1/lib_run_single.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import logging 4 | import os 5 | import time 6 | from wrapt_timeout_decorator import * 7 | 8 | logger = logging.getLogger("desktopenv.experiment") 9 | 10 | 11 | def run_single_example( 12 | agent, env, example, max_steps, instruction, args, example_result_dir, scores 13 | ): 14 | runtime_logger = setup_logger(example, example_result_dir) 15 | agent.reset() 16 | env.reset(task_config=example) 17 | time.sleep(60) # Wait for the environment to be ready 18 | obs = env._get_obs() # Get the initial observation 19 | done = False 20 | step_idx = 0 21 | env.controller.start_recording() 22 | while not done and step_idx < max_steps: 23 | response, actions = agent.predict(instruction, obs) 24 | for action in actions: 25 | # Capture the timestamp before executing the action 26 | action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 27 | logger.info("Step %d: %s", step_idx + 1, action) 28 | obs, reward, done, info = env.step(action, args.sleep_after_execution) 29 | 30 | logger.info("Reward: %.2f", reward) 31 | logger.info("Done: %s", done) 32 | # Save screenshot and trajectory information 33 | with open( 34 | os.path.join( 35 | example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png" 36 | ), 37 | "wb", 38 | ) as _f: 39 | _f.write(obs["screenshot"]) 40 | with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: 41 | f.write( 42 | json.dumps( 43 | { 44 | "step_num": step_idx + 1, 45 | "action_timestamp": action_timestamp, 46 | "action": action, 47 | "reward": reward, 48 | "done": done, 49 | "info": info, 50 | "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png", 51 | } 52 | ) 53 | ) 54 | f.write("\n") 55 | if done: 56 | logger.info("The episode is done.") 57 | break 58 | step_idx += 1 59 | result = env.evaluate() 60 | logger.info("Result: %.2f", result) 61 | scores.append(result) 62 | with open( 63 | os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8" 64 | ) as f: 65 | f.write(f"{result}\n") 66 | env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) 67 | 68 | 69 | def setup_logger(example, example_result_dir): 70 | runtime_logger = logging.getLogger(f"desktopenv.example.{example['id']}") 71 | runtime_logger.setLevel(logging.DEBUG) 72 | runtime_logger.addHandler( 73 | logging.FileHandler(os.path.join(example_result_dir, "runtime.log")) 74 | ) 75 | return runtime_logger 76 | -------------------------------------------------------------------------------- /osworld_setup/s1/run.py: -------------------------------------------------------------------------------- 1 | """OSWorld's run.py with AgentS.""" 2 | 3 | """Script to run end-to-end evaluation on the benchmark. 4 | Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py. 
5 | """ 6 | 7 | import argparse 8 | import datetime 9 | import json 10 | import logging 11 | import os 12 | import sys 13 | 14 | from gui_agents.s1.core.AgentS import GraphSearchAgent 15 | from gui_agents.s1.aci.LinuxOSACI import LinuxACI 16 | from tqdm import tqdm 17 | 18 | import lib_run_single 19 | from desktop_env.desktop_env import DesktopEnv 20 | 21 | # import wandb 22 | 23 | 24 | # Logger Configs {{{ # 25 | logger = logging.getLogger() 26 | logger.setLevel(logging.DEBUG) 27 | 28 | datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 29 | 30 | file_handler = logging.FileHandler( 31 | os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" 32 | ) 33 | debug_handler = logging.FileHandler( 34 | os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" 35 | ) 36 | stdout_handler = logging.StreamHandler(sys.stdout) 37 | sdebug_handler = logging.FileHandler( 38 | os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8" 39 | ) 40 | 41 | file_handler.setLevel(logging.INFO) 42 | debug_handler.setLevel(logging.DEBUG) 43 | stdout_handler.setLevel(logging.INFO) 44 | sdebug_handler.setLevel(logging.DEBUG) 45 | 46 | formatter = logging.Formatter( 47 | fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" 48 | ) 49 | file_handler.setFormatter(formatter) 50 | debug_handler.setFormatter(formatter) 51 | stdout_handler.setFormatter(formatter) 52 | sdebug_handler.setFormatter(formatter) 53 | 54 | stdout_handler.addFilter(logging.Filter("desktopenv")) 55 | sdebug_handler.addFilter(logging.Filter("desktopenv")) 56 | 57 | logger.addHandler(file_handler) 58 | logger.addHandler(debug_handler) 59 | logger.addHandler(stdout_handler) 60 | logger.addHandler(sdebug_handler) 61 | # }}} Logger Configs # 62 | 63 | logger = logging.getLogger("desktopenv.experiment") 64 | 65 | 66 | def config() -> argparse.Namespace: 67 | parser = argparse.ArgumentParser( 68 | description="Run end-to-end evaluation on the benchmark" 69 | ) 70 | 71 | # environment config 72 | parser.add_argument("--path_to_vm", type=str, default=None) 73 | parser.add_argument( 74 | "--headless", action="store_true", help="Run in headless machine" 75 | ) 76 | parser.add_argument( 77 | "--action_space", type=str, default="pyautogui", help="Action type" 78 | ) 79 | parser.add_argument( 80 | "--observation_type", 81 | choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"], 82 | default="a11y_tree", 83 | help="Observation type", 84 | ) 85 | parser.add_argument("--screen_width", type=int, default=1920) 86 | parser.add_argument("--screen_height", type=int, default=1080) 87 | parser.add_argument("--sleep_after_execution", type=float, default=0.0) 88 | parser.add_argument("--max_steps", type=int, default=15) 89 | 90 | # agent config 91 | parser.add_argument("--max_trajectory_length", type=int, default=3) 92 | parser.add_argument( 93 | "--test_config_base_dir", type=str, default="evaluation_examples" 94 | ) 95 | 96 | # lm config 97 | parser.add_argument("--model", type=str, default="gpt-4o") 98 | parser.add_argument("--temperature", type=float, default=1.0) 99 | parser.add_argument("--top_p", type=float, default=0.9) 100 | parser.add_argument("--max_tokens", type=int, default=1500) 101 | parser.add_argument("--stop_token", type=str, default=None) 102 | 103 | # example config 104 | parser.add_argument("--domain", type=str, default="all") 105 | parser.add_argument( 106 | "--test_all_meta_path", 
type=str, default="evaluation_examples/test_all.json" 107 | ) 108 | 109 | # logging related 110 | parser.add_argument("--result_dir", type=str, default="./results") 111 | 112 | # NEW! 113 | parser.add_argument("--huggingface_endpoint_url", type=str, required=True) 114 | parser.add_argument("--kb_name", default="kb_s2", type=str) 115 | 116 | args = parser.parse_args() 117 | 118 | return args 119 | 120 | 121 | def test(args: argparse.Namespace, test_all_meta: dict) -> None: 122 | scores = [] 123 | max_steps = args.max_steps 124 | 125 | # log args 126 | logger.info("Args: %s", args) 127 | # set wandb project 128 | cfg_args = { 129 | "path_to_vm": args.path_to_vm, 130 | "headless": args.headless, 131 | "action_space": args.action_space, 132 | "observation_type": args.observation_type, 133 | "screen_width": args.screen_width, 134 | "screen_height": args.screen_height, 135 | "sleep_after_execution": args.sleep_after_execution, 136 | "max_steps": args.max_steps, 137 | "max_trajectory_length": args.max_trajectory_length, 138 | "model": args.model, 139 | "temperature": args.temperature, 140 | "top_p": args.top_p, 141 | "max_tokens": args.max_tokens, 142 | "stop_token": args.stop_token, 143 | "result_dir": args.result_dir, 144 | } 145 | 146 | # NEW! 147 | if args.model.startswith("claude"): 148 | engine_type = "anthropic" 149 | elif args.model.startswith("gpt"): 150 | engine_type = "openai" 151 | else: 152 | engine_type = "vllm" 153 | 154 | engine_params = {"engine_type": engine_type, "model": args.model} 155 | 156 | # NEW! 157 | grounding_agent = LinuxACI() 158 | 159 | # NEW! 160 | agent = GraphSearchAgent( 161 | engine_params, 162 | grounding_agent, 163 | platform="linux", 164 | action_space="pyautogui", 165 | observation_type="mixed", 166 | search_engine="Perplexica", 167 | memory_root_path=os.getcwd(), 168 | memory_folder_name=args.kb_name, 169 | kb_release_tag="v0.2.2", 170 | ) 171 | 172 | env = DesktopEnv( 173 | path_to_vm=args.path_to_vm, 174 | action_space=agent.action_space, 175 | screen_size=(args.screen_width, args.screen_height), 176 | headless=args.headless, 177 | os_type="Ubuntu", 178 | require_a11y_tree=args.observation_type 179 | in ["a11y_tree", "screenshot_a11y_tree", "som"], 180 | ) 181 | 182 | for domain in tqdm(test_all_meta, desc="Domain"): 183 | for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False): 184 | config_file = os.path.join( 185 | args.test_config_base_dir, f"examples/{domain}/{example_id}.json" 186 | ) 187 | with open(config_file, "r", encoding="utf-8") as f: 188 | example = json.load(f) 189 | 190 | logger.info(f"[Domain]: {domain}") 191 | logger.info(f"[Example ID]: {example_id}") 192 | 193 | instruction = example["instruction"] 194 | 195 | logger.info(f"[Instruction]: {instruction}") 196 | # wandb each example config settings 197 | cfg_args["instruction"] = instruction 198 | cfg_args["start_time"] = datetime.datetime.now().strftime( 199 | "%Y:%m:%d-%H:%M:%S" 200 | ) 201 | # run.config.update(cfg_args) 202 | 203 | example_result_dir = os.path.join( 204 | args.result_dir, 205 | args.action_space, 206 | args.observation_type, 207 | args.model, 208 | domain, 209 | example_id, 210 | ) 211 | os.makedirs(example_result_dir, exist_ok=True) 212 | # example start running 213 | try: 214 | lib_run_single.run_single_example( 215 | agent, 216 | env, 217 | example, 218 | max_steps, 219 | instruction, 220 | args, 221 | example_result_dir, 222 | scores, 223 | ) 224 | except Exception as e: 225 | logger.error(f"Exception in {domain}/{example_id}: {e}") 226 | 
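# Even when an example crashes we still stop the screen recording and append an
# error entry to traj.jsonl below (note: the entry always reads "Time limit
# exceeded", regardless of the actual exception that was raised).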
env.controller.end_recording( 227 | os.path.join(example_result_dir, "recording.mp4") 228 | ) 229 | with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: 230 | f.write( 231 | json.dumps( 232 | {"Error": f"Time limit exceeded in {domain}/{example_id}"} 233 | ) 234 | ) 235 | f.write("\n") 236 | 237 | env.close() 238 | logger.info(f"Average score: {sum(scores) / len(scores)}") 239 | 240 | 241 | def get_unfinished( 242 | action_space, use_model, observation_type, result_dir, total_file_json 243 | ): 244 | target_dir = os.path.join(result_dir, action_space, observation_type, use_model) 245 | 246 | if not os.path.exists(target_dir): 247 | return total_file_json 248 | 249 | finished = {} 250 | for domain in os.listdir(target_dir): 251 | finished[domain] = [] 252 | domain_path = os.path.join(target_dir, domain) 253 | if os.path.isdir(domain_path): 254 | for example_id in os.listdir(domain_path): 255 | if example_id == "onboard": 256 | continue 257 | example_path = os.path.join(domain_path, example_id) 258 | if os.path.isdir(example_path): 259 | if "result.txt" not in os.listdir(example_path): 260 | # empty all files under example_id 261 | for file in os.listdir(example_path): 262 | os.remove(os.path.join(example_path, file)) 263 | else: 264 | finished[domain].append(example_id) 265 | 266 | if not finished: 267 | return total_file_json 268 | 269 | for domain, examples in finished.items(): 270 | if domain in total_file_json: 271 | total_file_json[domain] = [ 272 | x for x in total_file_json[domain] if x not in examples 273 | ] 274 | 275 | return total_file_json 276 | 277 | 278 | def get_result(action_space, use_model, observation_type, result_dir, total_file_json): 279 | target_dir = os.path.join(result_dir, action_space, observation_type, use_model) 280 | if not os.path.exists(target_dir): 281 | print("New experiment, no result yet.") 282 | return None 283 | 284 | all_result = [] 285 | 286 | for domain in os.listdir(target_dir): 287 | domain_path = os.path.join(target_dir, domain) 288 | if os.path.isdir(domain_path): 289 | for example_id in os.listdir(domain_path): 290 | example_path = os.path.join(domain_path, example_id) 291 | if os.path.isdir(example_path): 292 | if "result.txt" in os.listdir(example_path): 293 | # empty all files under example_id 294 | try: 295 | all_result.append( 296 | float( 297 | open( 298 | os.path.join(example_path, "result.txt"), "r" 299 | ).read() 300 | ) 301 | ) 302 | except: 303 | all_result.append(0.0) 304 | 305 | if not all_result: 306 | print("New experiment, no result yet.") 307 | return None 308 | else: 309 | print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%") 310 | return all_result 311 | 312 | 313 | if __name__ == "__main__": 314 | ####### The complete version of the list of examples ####### 315 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 316 | args = config() 317 | 318 | with open(args.test_all_meta_path, "r", encoding="utf-8") as f: 319 | test_all_meta = json.load(f) 320 | 321 | if args.domain != "all": 322 | test_all_meta = {args.domain: test_all_meta[args.domain]} 323 | 324 | test_file_list = get_unfinished( 325 | args.action_space, 326 | args.model, 327 | args.observation_type, 328 | args.result_dir, 329 | test_all_meta, 330 | ) 331 | left_info = "" 332 | for domain in test_file_list: 333 | left_info += f"{domain}: {len(test_file_list[domain])}\n" 334 | logger.info(f"Left tasks:\n{left_info}") 335 | 336 | get_result( 337 | args.action_space, 338 | args.model, 339 | args.observation_type, 340 | args.result_dir, 
341 | test_all_meta, 342 | ) 343 | test(args, test_file_list) 344 | -------------------------------------------------------------------------------- /osworld_setup/s2/OSWorld.md: -------------------------------------------------------------------------------- 1 | # Deplying Agent S2 in OSWorld 2 | 3 | # Step 1: Set up Agent S2 4 | 5 | Follow the [README.md](https://github.com/simular-ai/Agent-S/blob/main/README.md) to set up Agent S2. 6 | 7 | # Step 2: Copying Over Run Files 8 | 9 | If you haven't already, please follow the [OSWorld environment setup](https://github.com/xlang-ai/OSWorld/blob/main/README.md). We've provided the relevant OSWorld run files for evaluation in this `osworld_setup` folder. Please copy this over to your OSWorld folder. 10 | 11 | # Best Practices 12 | 13 | At this point, you will have set up the Agent S2, the OSWorld environment, and the VMWare Workstation Pro application set up. Below, we'll list some best practices, and common problems and their fixes. 14 | 15 | --- 16 | 17 | ``` 18 | from desktop_env.desktop_env import DesktopEnv 19 | 20 | example = { 21 | "id": "94d95f96-9699-4208-98ba-3c3119edf9c2", 22 | "instruction": "I want to install Spotify on my current system. Could you please help me?", 23 | "config": [ 24 | { 25 | "type": "execute", 26 | "parameters": { 27 | "command": [ 28 | "python", 29 | "-c", 30 | "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);" 31 | ] 32 | } 33 | } 34 | ], 35 | "evaluator": { 36 | "func": "check_include_exclude", 37 | "result": { 38 | "type": "vm_command_line", 39 | "command": "which spotify" 40 | }, 41 | "expected": { 42 | "type": "rule", 43 | "rules": { 44 | "include": ["spotify"], 45 | "exclude": ["not found"] 46 | } 47 | } 48 | } 49 | } 50 | 51 | env = DesktopEnv(action_space="pyautogui") 52 | 53 | obs = env.reset(task_config=example) 54 | obs, reward, done, info = env.step("pyautogui.rightClick()") 55 | ``` 56 | 57 | Note, this code is just for demonstrating how the OSWorld `DesktopEnv` is instantiated. If you're running OSWorld, this process is already part of their code base. The code above will boot up a VM and restart it. If, for whatever reason, running the starter code (or running OSWorld experiments) leads to an infinitely long run time, cancel out of the VM. 58 | You should then see: 59 | 60 | ``` 61 | parent/ 62 | OSWorld/ 63 | vmware_vm_data/ 64 | Ubuntu0/ 65 | *.lck 66 | *.vmem 67 | ... 68 | ... 69 | UbuntuX/ 70 | ``` 71 | 72 | If you happen to have any `*.lck` folder in your VM's folder, be sure to delete them. Every time you are powering on the VM from creating a new `DesktopEnv` instance, you need to 73 | delete the `*.lck` folders first. If your VM is already powered on, and your session (in a Jupyter Notebook, for example) crashes, you can keep the `*.lck` files and just re-instantiate the `DesktopEnv` instance. I'd also suggest using just a single VM (as a VM takes up a lot of space!). Also, be sure to shut down the VM when you've finished using it. Deleting the `*.lck` files should be done after every time you power off the VM (though it seems to not be an issue from testing). 74 | 75 | --- 76 | 77 | If even after rerunning the code and deleting the `*.lck` files don't work, then you should try passing in the `path_to_vm` explicitly to the `DesktopEnv` class. 78 | 79 | ``` 80 | env = DesktopEnv(action_space="pyautogui", headless=False, require_terminal=True, path_to_vm=) 81 | ``` 82 | 83 | Pass the absolute path to your VM's (Ubuntu0) `.vmx` file. 
This file is located here: 84 | 85 | 86 | ``` 87 | parent/ 88 | OSWorld/ 89 | vmware_vm_data/ 90 | Ubuntu0/ 91 | *.lck 92 | *.vmem 93 | ... 94 | *.vmx 95 | ... 96 | UbuntuX/ 97 | ``` 98 | 99 | 📌 **Note**: If you are testing on the `os` domain, there is an [issue](https://github.com/asweigart/pyautogui/issues/198#issuecomment-1465268536) with `pyautogui`. A *hacky* way to solve this is to, inside the VM, locate where the `pyautogui` module is installed and open the `__init__.py` located under the `pyautogui` folder and remove the "<" in the `set(...)` within the following function: 100 | 101 | ``` 102 | def isShiftCharacter(character): 103 | """ 104 | Returns True if the ``character`` is a keyboard key that would require the shift key to be held down, such as 105 | uppercase letters or the symbols on the keyboard's number row. 106 | """ 107 | # NOTE TODO - This will be different for non-qwerty keyboards. 108 | return character.isupper() or character in set('~!@#$%^&*()_+{}|:"<>?') 109 | ``` 110 | 111 | 📌 **Note**: If in case, your VM encounters an issue with "The root file system on requires a manual fsck", reset the VM to the previous snapshot. 112 | 113 | 📌 **Note**: OSWorld scripts will create the `DesktopEnv` instance which will create a VM for you with a specific snapshot (`snapshot_name` parameter in `DesktopEnv`). If you wish to create a new snapshot of the VM and use that for your experiments, be sure to specify the name of this snapshot where `DesktopEnv` is instantiated. 114 | 115 | With these changes, you should be able to get up and running with VMWare, DesktopEnv, and OSWorld! 😊 -------------------------------------------------------------------------------- /osworld_setup/s2/lib_run_single.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import logging 4 | import os 5 | import time 6 | from wrapt_timeout_decorator import * 7 | 8 | logger = logging.getLogger("desktopenv.experiment") 9 | 10 | 11 | def run_single_example( 12 | agent, env, example, max_steps, instruction, args, example_result_dir, scores 13 | ): 14 | runtime_logger = setup_logger(example, example_result_dir) 15 | agent.reset() 16 | env.reset(task_config=example) 17 | time.sleep(60) # Wait for the environment to be ready 18 | obs = env._get_obs() # Get the initial observation 19 | done = False 20 | step_idx = 0 21 | env.controller.start_recording() 22 | while not done and step_idx < max_steps: 23 | response, actions = agent.predict(instruction, obs) 24 | for action in actions: 25 | # Capture the timestamp before executing the action 26 | action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 27 | logger.info("Step %d: %s", step_idx + 1, action) 28 | obs, reward, done, info = env.step(action, args.sleep_after_execution) 29 | 30 | logger.info("Reward: %.2f", reward) 31 | logger.info("Done: %s", done) 32 | # Save screenshot and trajectory information 33 | with open( 34 | os.path.join( 35 | example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png" 36 | ), 37 | "wb", 38 | ) as _f: 39 | _f.write(obs["screenshot"]) 40 | with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: 41 | f.write( 42 | json.dumps( 43 | { 44 | "step_num": step_idx + 1, 45 | "action_timestamp": action_timestamp, 46 | "action": action, 47 | "reward": reward, 48 | "done": done, 49 | "info": info, 50 | "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png", 51 | } 52 | ) 53 | ) 54 | f.write("\n") 55 | if done: 56 | logger.info("The 
episode is done.") 57 | break 58 | step_idx += 1 59 | result = env.evaluate() 60 | logger.info("Result: %.2f", result) 61 | scores.append(result) 62 | with open( 63 | os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8" 64 | ) as f: 65 | f.write(f"{result}\n") 66 | env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) 67 | 68 | 69 | def setup_logger(example, example_result_dir): 70 | runtime_logger = logging.getLogger(f"desktopenv.example.{example['id']}") 71 | runtime_logger.setLevel(logging.DEBUG) 72 | runtime_logger.addHandler( 73 | logging.FileHandler(os.path.join(example_result_dir, "runtime.log")) 74 | ) 75 | return runtime_logger 76 | -------------------------------------------------------------------------------- /osworld_setup/s2/run.py: -------------------------------------------------------------------------------- 1 | """OSWorld's run.py with AgentS2.""" 2 | 3 | """Script to run end-to-end evaluation on the benchmark. 4 | Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py. 5 | """ 6 | 7 | import argparse 8 | import datetime 9 | import json 10 | import logging 11 | import os 12 | import sys 13 | 14 | from gui_agents.s2.agents.agent_s import AgentS2 15 | from gui_agents.s2.agents.grounding import OSWorldACI 16 | from tqdm import tqdm 17 | 18 | import lib_run_single 19 | from desktop_env.desktop_env import DesktopEnv 20 | 21 | 22 | # Logger Configs {{{ # 23 | logger = logging.getLogger() 24 | logger.setLevel(logging.DEBUG) 25 | 26 | datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") 27 | 28 | file_handler = logging.FileHandler( 29 | os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" 30 | ) 31 | debug_handler = logging.FileHandler( 32 | os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" 33 | ) 34 | stdout_handler = logging.StreamHandler(sys.stdout) 35 | sdebug_handler = logging.FileHandler( 36 | os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8" 37 | ) 38 | 39 | file_handler.setLevel(logging.INFO) 40 | debug_handler.setLevel(logging.DEBUG) 41 | stdout_handler.setLevel(logging.INFO) 42 | sdebug_handler.setLevel(logging.DEBUG) 43 | 44 | formatter = logging.Formatter( 45 | fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" 46 | ) 47 | file_handler.setFormatter(formatter) 48 | debug_handler.setFormatter(formatter) 49 | stdout_handler.setFormatter(formatter) 50 | sdebug_handler.setFormatter(formatter) 51 | 52 | stdout_handler.addFilter(logging.Filter("desktopenv")) 53 | sdebug_handler.addFilter(logging.Filter("desktopenv")) 54 | 55 | logger.addHandler(file_handler) 56 | logger.addHandler(debug_handler) 57 | logger.addHandler(stdout_handler) 58 | logger.addHandler(sdebug_handler) 59 | # }}} Logger Configs # 60 | 61 | logger = logging.getLogger("desktopenv.experiment") 62 | 63 | 64 | def config() -> argparse.Namespace: 65 | parser = argparse.ArgumentParser( 66 | description="Run end-to-end evaluation on the benchmark" 67 | ) 68 | 69 | # environment config 70 | parser.add_argument("--path_to_vm", type=str, default=None) 71 | parser.add_argument( 72 | "--headless", action="store_true", help="Run in headless machine" 73 | ) 74 | parser.add_argument( 75 | "--action_space", type=str, default="pyautogui", help="Action type" 76 | ) 77 | parser.add_argument( 78 | "--observation_type", 79 | choices=["screenshot", "a11y_tree", 
"screenshot_a11y_tree", "som"], 80 | default="screenshot", 81 | help="Observation type", 82 | ) 83 | parser.add_argument("--screen_width", type=int, default=1920) 84 | parser.add_argument("--screen_height", type=int, default=1080) 85 | parser.add_argument("--sleep_after_execution", type=float, default=0.0) 86 | parser.add_argument("--max_steps", type=int, default=15) 87 | 88 | # agent config 89 | parser.add_argument("--max_trajectory_length", type=int, default=3) 90 | parser.add_argument( 91 | "--test_config_base_dir", type=str, default="evaluation_examples" 92 | ) 93 | 94 | # lm config 95 | parser.add_argument("--model_provider", type=str, default="openai") 96 | parser.add_argument("--model", type=str, default="gpt-4o") 97 | parser.add_argument( 98 | "--model_url", 99 | type=str, 100 | default="", 101 | help="The URL of the main generation model API.", 102 | ) 103 | parser.add_argument( 104 | "--model_api_key", 105 | type=str, 106 | default="", 107 | help="The API key of the main generation model.", 108 | ) 109 | parser.add_argument("--temperature", type=float, default=1.0) 110 | parser.add_argument("--top_p", type=float, default=0.9) 111 | parser.add_argument("--max_tokens", type=int, default=1500) 112 | parser.add_argument("--stop_token", type=str, default=None) 113 | 114 | # example config 115 | parser.add_argument("--domain", type=str, default="all") 116 | parser.add_argument( 117 | "--test_all_meta_path", type=str, default="evaluation_examples/test_all.json" 118 | ) 119 | 120 | # logging related 121 | parser.add_argument("--result_dir", type=str, default="./results") 122 | 123 | # NEW! 124 | 125 | # Configuration 1 126 | parser.add_argument("--grounding_model_provider", type=str, default="anthropic") 127 | parser.add_argument( 128 | "--grounding_model", type=str, default="claude-3-7-sonnet-20250219" 129 | ) 130 | parser.add_argument( 131 | "--grounding_model_resize_width", 132 | type=int, 133 | default=1366, 134 | help="Width of screenshot image after processor rescaling", 135 | ) 136 | parser.add_argument( 137 | "--grounding_model_resize_height", 138 | type=int, 139 | default=None, 140 | help="Height of screenshot image after processor rescaling", 141 | ) 142 | 143 | # Configuration 2 144 | parser.add_argument("--endpoint_provider", type=str, default="") 145 | parser.add_argument("--endpoint_url", type=str, default="") 146 | parser.add_argument( 147 | "--endpoint_api_key", 148 | type=str, 149 | default="", 150 | help="The API key of the grounding model.", 151 | ) 152 | 153 | parser.add_argument("--kb_name", default="kb_s2", type=str) 154 | 155 | args = parser.parse_args() 156 | 157 | return args 158 | 159 | 160 | def test(args: argparse.Namespace, test_all_meta: dict) -> None: 161 | scores = [] 162 | max_steps = args.max_steps 163 | 164 | # log args 165 | logger.info("Args: %s", args) 166 | cfg_args = { 167 | "path_to_vm": args.path_to_vm, 168 | "headless": args.headless, 169 | "action_space": args.action_space, 170 | "observation_type": args.observation_type, 171 | "screen_width": args.screen_width, 172 | "screen_height": args.screen_height, 173 | "sleep_after_execution": args.sleep_after_execution, 174 | "max_steps": args.max_steps, 175 | "max_trajectory_length": args.max_trajectory_length, 176 | "model": args.model, 177 | "temperature": args.temperature, 178 | "top_p": args.top_p, 179 | "max_tokens": args.max_tokens, 180 | "stop_token": args.stop_token, 181 | "result_dir": args.result_dir, 182 | } 183 | 184 | # NEW! 
185 | engine_params = { 186 | "engine_type": args.model_provider, 187 | "model": args.model, 188 | "base_url": args.model_url, 189 | "api_key": args.model_api_key, 190 | } 191 | 192 | if args.endpoint_url: 193 | engine_params_for_grounding = { 194 | "engine_type": args.endpoint_provider, 195 | "base_url": args.endpoint_url, 196 | "api_key": args.endpoint_api_key, 197 | } 198 | else: 199 | grounding_height = args.grounding_model_resize_height 200 | # If not provided, use the aspect ratio of the screen to compute the height 201 | if grounding_height is None: 202 | grounding_height = ( 203 | args.screen_height 204 | * args.grounding_model_resize_width 205 | / args.screen_width 206 | ) 207 | 208 | engine_params_for_grounding = { 209 | "engine_type": args.grounding_model_provider, 210 | "model": args.grounding_model, 211 | "grounding_width": args.grounding_model_resize_width, 212 | "grounding_height": grounding_height, 213 | } 214 | 215 | # NEW! 216 | grounding_agent = OSWorldACI( 217 | platform="linux", 218 | engine_params_for_generation=engine_params, 219 | engine_params_for_grounding=engine_params_for_grounding, 220 | width=args.screen_width, 221 | height=args.screen_height, 222 | ) 223 | 224 | # NEW! 225 | agent = AgentS2( 226 | engine_params, 227 | grounding_agent, 228 | platform="linux", 229 | action_space="pyautogui", 230 | observation_type="mixed", 231 | search_engine="Perplexica", 232 | memory_root_path=os.getcwd(), 233 | memory_folder_name=args.kb_name, 234 | kb_release_tag="v0.2.2", 235 | embedding_engine_type="openai", 236 | ) 237 | 238 | env = DesktopEnv( 239 | path_to_vm=args.path_to_vm, 240 | action_space=agent.action_space, 241 | screen_size=(args.screen_width, args.screen_height), 242 | headless=args.headless, 243 | require_a11y_tree=args.observation_type 244 | in ["a11y_tree", "screenshot_a11y_tree", "som"], 245 | ) 246 | 247 | for domain in tqdm(test_all_meta, desc="Domain"): 248 | for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False): 249 | config_file = os.path.join( 250 | args.test_config_base_dir, f"examples/{domain}/{example_id}.json" 251 | ) 252 | with open(config_file, "r", encoding="utf-8") as f: 253 | example = json.load(f) 254 | 255 | logger.info(f"[Domain]: {domain}") 256 | logger.info(f"[Example ID]: {example_id}") 257 | 258 | instruction = example["instruction"] 259 | 260 | logger.info(f"[Instruction]: {instruction}") 261 | # wandb each example config settings 262 | cfg_args["instruction"] = instruction 263 | cfg_args["start_time"] = datetime.datetime.now().strftime( 264 | "%Y:%m:%d-%H:%M:%S" 265 | ) 266 | 267 | example_result_dir = os.path.join( 268 | args.result_dir, 269 | args.action_space, 270 | args.observation_type, 271 | args.model, 272 | domain, 273 | example_id, 274 | ) 275 | os.makedirs(example_result_dir, exist_ok=True) 276 | # example start running 277 | try: 278 | lib_run_single.run_single_example( 279 | agent, 280 | env, 281 | example, 282 | max_steps, 283 | instruction, 284 | args, 285 | example_result_dir, 286 | scores, 287 | ) 288 | except Exception as e: 289 | logger.error(f"Exception in {domain}/{example_id}: {e}") 290 | env.controller.end_recording( 291 | os.path.join(example_result_dir, "recording.mp4") 292 | ) 293 | with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: 294 | f.write( 295 | json.dumps( 296 | {"Error": f"Time limit exceeded in {domain}/{example_id}"} 297 | ) 298 | ) 299 | f.write("\n") 300 | 301 | env.close() 302 | logger.info(f"Average score: {sum(scores) / len(scores)}") 303 | 304 | 305 | 
def get_unfinished( 306 | action_space, use_model, observation_type, result_dir, total_file_json 307 | ): 308 | target_dir = os.path.join(result_dir, action_space, observation_type, use_model) 309 | 310 | if not os.path.exists(target_dir): 311 | return total_file_json 312 | 313 | finished = {} 314 | for domain in os.listdir(target_dir): 315 | finished[domain] = [] 316 | domain_path = os.path.join(target_dir, domain) 317 | if os.path.isdir(domain_path): 318 | for example_id in os.listdir(domain_path): 319 | if example_id == "onboard": 320 | continue 321 | example_path = os.path.join(domain_path, example_id) 322 | if os.path.isdir(example_path): 323 | if "result.txt" not in os.listdir(example_path): 324 | # empty all files under example_id 325 | for file in os.listdir(example_path): 326 | os.remove(os.path.join(example_path, file)) 327 | else: 328 | finished[domain].append(example_id) 329 | 330 | if not finished: 331 | return total_file_json 332 | 333 | for domain, examples in finished.items(): 334 | if domain in total_file_json: 335 | total_file_json[domain] = [ 336 | x for x in total_file_json[domain] if x not in examples 337 | ] 338 | 339 | return total_file_json 340 | 341 | 342 | def get_result(action_space, use_model, observation_type, result_dir, total_file_json): 343 | target_dir = os.path.join(result_dir, action_space, observation_type, use_model) 344 | if not os.path.exists(target_dir): 345 | print("New experiment, no result yet.") 346 | return None 347 | 348 | all_result = [] 349 | 350 | for domain in os.listdir(target_dir): 351 | domain_path = os.path.join(target_dir, domain) 352 | if os.path.isdir(domain_path): 353 | for example_id in os.listdir(domain_path): 354 | example_path = os.path.join(domain_path, example_id) 355 | if os.path.isdir(example_path): 356 | if "result.txt" in os.listdir(example_path): 357 | # empty all files under example_id 358 | try: 359 | all_result.append( 360 | float( 361 | open( 362 | os.path.join(example_path, "result.txt"), "r" 363 | ).read() 364 | ) 365 | ) 366 | except: 367 | all_result.append(0.0) 368 | 369 | if not all_result: 370 | print("New experiment, no result yet.") 371 | return None 372 | else: 373 | print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%") 374 | return all_result 375 | 376 | 377 | if __name__ == "__main__": 378 | ####### The complete version of the list of examples ####### 379 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 380 | args = config() 381 | 382 | with open(args.test_all_meta_path, "r", encoding="utf-8") as f: 383 | test_all_meta = json.load(f) 384 | 385 | if args.domain != "all": 386 | test_all_meta = {args.domain: test_all_meta[args.domain]} 387 | 388 | test_file_list = get_unfinished( 389 | args.action_space, 390 | args.model, 391 | args.observation_type, 392 | args.result_dir, 393 | test_all_meta, 394 | ) 395 | left_info = "" 396 | for domain in test_file_list: 397 | left_info += f"{domain}: {len(test_file_list[domain])}\n" 398 | logger.info(f"Left tasks:\n{left_info}") 399 | 400 | get_result( 401 | args.action_space, 402 | args.model, 403 | args.observation_type, 404 | args.result_dir, 405 | test_all_meta, 406 | ) 407 | test(args, test_file_list) 408 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | backoff 3 | pandas 4 | openai 5 | anthropic 6 | fastapi 7 | uvicorn 8 | paddleocr 9 | paddlepaddle 10 | together 11 | scikit-learn 12 | websockets 
13 | tiktoken 14 | pyautogui 15 | toml 16 | black 17 | pytesseract 18 | 19 | # Platform-specific dependencies 20 | pyobjc; platform_system == "Darwin" 21 | pywinauto; platform_system == "Windows" 22 | pywin32; platform_system == "Windows" -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import platform 4 | from fastapi import FastAPI, HTTPException 5 | from fastapi.responses import StreamingResponse 6 | from pydantic import BaseModel 7 | from gui_agents.s1.core.AgentS import GraphSearchAgent 8 | import io 9 | import pyautogui 10 | import time 11 | from threading import Event, Lock 12 | 13 | # Determine the operating system and select appropriate ACI 14 | current_platform = platform.system().lower() 15 | if current_platform == "linux": 16 | from gui_agents.s1.aci.LinuxOSACI import LinuxACI, UIElement 17 | 18 | grounding_agent = LinuxACI() 19 | elif current_platform == "darwin": 20 | from gui_agents.s1.aci.MacOSACI import MacOSACI, UIElement 21 | 22 | grounding_agent = MacOSACI() 23 | elif current_platform == "windows": 24 | from gui_agents.s1.aci.WindowsOSACI import WindowsACI, UIElement 25 | 26 | grounding_agent = WindowsACI() 27 | else: 28 | raise ValueError(f"Unsupported operating system: {current_platform}") 29 | 30 | app = FastAPI() 31 | 32 | # Add global lock and status tracking 33 | agent_lock = Lock() 34 | agent_status = {"is_running": False, "current_instruction": None, "start_time": None} 35 | 36 | # Add a stop event 37 | stop_event = Event() 38 | 39 | 40 | class InstructionData(BaseModel): 41 | screenshot: str 42 | accessibility_tree: str 43 | 44 | 45 | class CommandRequest(BaseModel): 46 | obs: InstructionData 47 | instruction: str 48 | 49 | 50 | class RunRequest(BaseModel): 51 | model: str 52 | instruction: str 53 | api_key: str | None = None 54 | 55 | 56 | async def stream_code(code: str): 57 | for line in code.splitlines(keepends=True): 58 | yield line 59 | await asyncio.sleep(0.1) 60 | 61 | 62 | def run_agent(agent: GraphSearchAgent, instruction: str): 63 | global stop_event 64 | stop_event.clear() # Reset the stop event 65 | obs = {} 66 | traj = "Task:\n" + instruction 67 | subtask_traj = "" 68 | for _ in range(15): 69 | # Check if stop was requested 70 | if stop_event.is_set(): 71 | print("Agent execution stopped by user") 72 | return 73 | 74 | print("iteration", _) 75 | 76 | obs["accessibility_tree"] = UIElement.systemWideElement() 77 | 78 | # Get screen shot using pyautogui. 79 | # Take a screenshot 80 | screenshot = pyautogui.screenshot() 81 | 82 | # Save the screenshot to a BytesIO object 83 | buffered = io.BytesIO() 84 | screenshot.save(buffered, format="PNG") 85 | 86 | # Get the byte value of the screenshot 87 | screenshot_bytes = buffered.getvalue() 88 | # Convert to base64 string. 
89 | obs["screenshot"] = screenshot_bytes 90 | 91 | # Get next action code from the agent 92 | info, code = agent.predict(instruction=instruction, observation=obs) 93 | 94 | if "done" in code[0].lower() or "fail" in code[0].lower(): 95 | if platform.system() == "Darwin": 96 | os.system( 97 | f'osascript -e \'display dialog "Task Completed" with title "OpenACI Agent" buttons "OK" default button "OK"\'' 98 | ) 99 | elif platform.system() == "Linux": 100 | os.system( 101 | f'zenity --info --title="OpenACI Agent" --text="Task Completed" --width=200 --height=100' 102 | ) 103 | 104 | agent.update_narrative_memory(traj) 105 | break 106 | 107 | if "next" in code[0].lower(): 108 | continue 109 | 110 | if "wait" in code[0].lower(): 111 | time.sleep(5) 112 | continue 113 | 114 | else: 115 | time.sleep(1.0) 116 | print("EXECUTING CODE:", code[0]) 117 | 118 | # Ask for permission before executing 119 | exec(code[0]) 120 | time.sleep(1.0) 121 | 122 | # Update task and subtask trajectories and optionally the episodic memory 123 | traj += ( 124 | "\n\nReflection:\n" 125 | + str(info["reflection"]) 126 | + "\n\n----------------------\n\nPlan:\n" 127 | + info["executor_plan"] 128 | ) 129 | subtask_traj = agent.update_episodic_memory(info, subtask_traj) 130 | 131 | 132 | @app.post("/run") 133 | async def run(request: RunRequest): 134 | global agent_status 135 | 136 | # Check if agent is already running 137 | if not agent_lock.acquire(blocking=False): 138 | raise HTTPException( 139 | status_code=409, 140 | detail="An agent is already running. Use /status to check current run or /stop to stop it.", 141 | ) 142 | 143 | try: 144 | agent_status = { 145 | "is_running": True, 146 | "current_instruction": request.instruction, 147 | "start_time": time.time(), 148 | "model": request.model, 149 | } 150 | 151 | if "gpt" in request.model: 152 | engine_type = "openai" 153 | elif "claude" in request.model: 154 | engine_type = "anthropic" 155 | 156 | engine_params = { 157 | "engine_type": engine_type, 158 | "model": request.model, 159 | "api_key": request.api_key, 160 | } 161 | 162 | print("engine_params", engine_params) 163 | 164 | agent = GraphSearchAgent( 165 | engine_params, 166 | grounding_agent, 167 | platform=current_platform, 168 | action_space="pyautogui", 169 | observation_type="mixed", 170 | ) 171 | 172 | agent.reset() 173 | print("start the agent") 174 | run_agent(agent, request.instruction) 175 | 176 | return {"status": "completed"} 177 | 178 | finally: 179 | agent_status = { 180 | "is_running": False, 181 | "current_instruction": None, 182 | "start_time": None, 183 | } 184 | agent_lock.release() 185 | 186 | 187 | @app.get("/status") 188 | async def get_status(): 189 | if agent_status["is_running"]: 190 | duration = time.time() - agent_status["start_time"] 191 | return { 192 | "status": "running", 193 | "instruction": agent_status["current_instruction"], 194 | "model": agent_status["model"], 195 | "running_for_seconds": round(duration, 2), 196 | } 197 | return {"status": "idle"} 198 | 199 | 200 | @app.post("/execute") 201 | async def execute_command_stream(cmd: CommandRequest): 202 | engine_params = { 203 | "engine_type": "openai", 204 | "model": "gpt-4o", 205 | } 206 | 207 | agent = GraphSearchAgent( 208 | engine_params, 209 | grounding_agent, 210 | platform=current_platform, 211 | action_space="pyautogui", 212 | observation_type="mixed", 213 | ) 214 | 215 | obs = { 216 | "screenshot": cmd.obs.screenshot, 217 | "accessibility_tree": cmd.obs.accessibility_tree, 218 | } 219 | instruction = cmd.instruction 220 | 
info, code = agent.predict(instruction=instruction, observation=obs) 221 | 222 | return StreamingResponse(stream_code(code), media_type="text/plain") 223 | 224 | 225 | @app.post("/stop") 226 | async def stop_agent(): 227 | if not agent_status["is_running"]: 228 | raise HTTPException(status_code=404, detail="No agent is currently running") 229 | 230 | global stop_event 231 | stop_event.set() 232 | return {"status": "stop signal sent"} 233 | 234 | 235 | import uvicorn 236 | 237 | if __name__ == "__main__": 238 | uvicorn.run( 239 | "server:app", 240 | host="0.0.0.0", # Allows external access 241 | port=8000, # Default port for FastAPI 242 | reload=True, # Auto-reload on code changes 243 | ) 244 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="gui-agents", 5 | version="0.2.5", 6 | description="A library for creating general purpose GUI agents using multimodal LLMs.", 7 | long_description=open("README.md", encoding="utf-8").read(), 8 | long_description_content_type="text/markdown", 9 | author="Simular AI", 10 | author_email="eric@simular.ai", 11 | packages=find_packages(), 12 | install_requires=[ 13 | "numpy", 14 | "backoff", 15 | "pandas", 16 | "openai", 17 | "anthropic", 18 | "fastapi", 19 | "uvicorn", 20 | "paddleocr", 21 | "paddlepaddle", 22 | "together", 23 | "scikit-learn", 24 | "websockets", 25 | "tiktoken", 26 | "selenium", 27 | 'pyobjc; platform_system == "Darwin"', 28 | "pyautogui", 29 | "toml", 30 | "pytesseract", 31 | "google-genai", 32 | 'pywinauto; platform_system == "Windows"', # Only for Windows 33 | 'pywin32; platform_system == "Windows"', # Only for Windows 34 | ], 35 | extras_require={"dev": ["black"]}, # Code formatter for linting 36 | entry_points={ 37 | "console_scripts": [ 38 | "agent_s1=gui_agents.s1.cli_app:main", 39 | "agent_s2=gui_agents.s2.cli_app:main", 40 | ], 41 | }, 42 | classifiers=[ 43 | "Programming Language :: Python :: 3", 44 | "Programming Language :: Python :: 3.9", 45 | "License :: OSI Approved :: Apache Software License", 46 | "Operating System :: Microsoft :: Windows", 47 | "Operating System :: POSIX :: Linux", 48 | "Operating System :: MacOS :: MacOS X", 49 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 50 | ], 51 | keywords="ai, llm, gui, agent, multimodal", 52 | project_urls={ 53 | "Source": "https://github.com/simular-ai/Agent-S", 54 | "Bug Reports": "https://github.com/simular-ai/Agent-S/issues", 55 | }, 56 | python_requires=">=3.9, <=3.12", 57 | ) 58 | -------------------------------------------------------------------------------- /tests/test_aci.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | import pytest 4 | 5 | from gui_agents.s1.aci.ACI import ACI, _normalize_key 6 | 7 | 8 | @pytest.fixture 9 | def aci(): 10 | return ACI(top_app_only=True, ocr=False) 11 | 12 | 13 | def test_normalize_key(): 14 | """Test key normalization""" 15 | assert _normalize_key("cmd") == "command" 16 | assert _normalize_key("ctrl") == "ctrl" 17 | assert _normalize_key("shift") == "shift" 18 | 19 | 20 | def test_hotkey_cmd_normalization(aci): 21 | """Test cmd normalization in hotkey command""" 22 | command = aci.hotkey(["cmd", "c"]) 23 | assert "command" in command 24 | assert "cmd" not in command 25 | 26 | 27 | def test_click_with_cmd_key(aci): 28 | """Test cmd 
normalization in click command""" 29 | aci.nodes = [{"position": (100, 200), "size": (50, 50)}] 30 | command = aci.click(0, hold_keys=["cmd"]) 31 | assert "command" in command 32 | assert "cmd" not in command 33 | 34 | 35 | def test_type_with_overwrite(aci): 36 | """Test type command with overwrite""" 37 | aci.nodes = [{"position": (100, 200), "size": (50, 50)}] 38 | command = aci.type(0, "test", overwrite=True) 39 | assert "command" in command or "ctrl" in command 40 | assert "backspace" in command 41 | -------------------------------------------------------------------------------- /tests/test_app_switching.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import pyautogui 4 | from AppKit import NSWorkspace 5 | 6 | from gui_agents.s1.aci.MacOSACI import MacOSACI 7 | 8 | agent = MacOSACI() 9 | 10 | 11 | def test_app_switching(): 12 | app_or_file_name = "Safari" 13 | 14 | exec(agent.switch_applications(app_or_file_name)) 15 | 16 | # Checking the frontmost application 17 | frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication().localizedName() 18 | print(frontmost_app) 19 | 20 | # Assert to confirm Safari is the frontmost application 21 | assert frontmost_app == "Safari", f"Expected Safari, but got {frontmost_app}" 22 | 23 | 24 | # Run the test 25 | test_app_switching() 26 | -------------------------------------------------------------------------------- /tests/test_uielement_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gui_agents.aci.UIElementBase import UIElementBase 4 | 5 | 6 | def test_uielement_base_is_abstract(): 7 | """Test that UIElementBase cannot be instantiated directly""" 8 | with pytest.raises(TypeError): 9 | UIElementBase() 10 | -------------------------------------------------------------------------------- /tests/test_uielement_linux.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | import pyatspi 4 | import pytest 5 | 6 | from gui_agents.aci.UIElementLinux import UIElement 7 | 8 | 9 | @pytest.fixture 10 | def mock_accessible(): 11 | mock = Mock() 12 | mock.name = "Test Window" 13 | mock.getRole.return_value = pyatspi.ROLE_WINDOW 14 | mock.getState.return_value.contains.return_value = True 15 | return mock 16 | 17 | 18 | @pytest.fixture 19 | def ui_element(mock_accessible): 20 | return UIElement(mock_accessible) 21 | 22 | 23 | def test_role(ui_element, mock_accessible): 24 | """Test role retrieval""" 25 | mock_accessible.getRoleName.return_value = "window" 26 | assert ui_element.role() == "window" 27 | 28 | 29 | def test_position(ui_element, mock_accessible): 30 | """Test position retrieval""" 31 | mock_accessible.getPosition.return_value = (100, 200) 32 | assert ui_element.position() == (100, 200) 33 | 34 | 35 | def test_size(ui_element, mock_accessible): 36 | """Test size retrieval""" 37 | mock_accessible.getSize.return_value = (300, 400) 38 | assert ui_element.size() == (300, 400) 39 | -------------------------------------------------------------------------------- /tests/test_uielement_macos.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from unittest.mock import Mock, patch 3 | 4 | import pytest 5 | 6 | from gui_agents.s1.aci.MacOSACI import UIElement 7 | 8 | 9 | @pytest.fixture 10 | def mock_ax_element(): 11 | mock_element = Mock() 12 | mock_element.__repr__ = lambda x: 
"x:100 y:200" 13 | return mock_element 14 | 15 | 16 | @pytest.fixture 17 | def mock_size_element(): 18 | mock_element = Mock() 19 | mock_element.__repr__ = lambda x: "w:300 h:400" 20 | return mock_element 21 | 22 | 23 | @pytest.fixture 24 | def ui_element(mock_ax_element): 25 | element = UIElement(mock_ax_element) 26 | return element 27 | 28 | 29 | def test_position_parsing(ui_element, mock_ax_element): 30 | """Test position parsing from AX element""" 31 | with patch.object(ui_element, "attribute", return_value=mock_ax_element): 32 | pos = ui_element.position() 33 | assert pos == (100.0, 200.0) 34 | 35 | 36 | def test_size_parsing(ui_element, mock_size_element): 37 | """Test size parsing from AX element""" 38 | with patch.object(ui_element, "attribute", return_value=mock_size_element): 39 | size = ui_element.size() 40 | assert size == (300.0, 400.0) 41 | 42 | 43 | def test_get_current_applications(obs: Dict): 44 | """Test getting list of current applications""" 45 | with patch("AppKit.NSWorkspace") as mock_workspace: 46 | mock_app = Mock() 47 | mock_app.activationPolicy.return_value = 0 48 | mock_app.localizedName.return_value = "TestApp" 49 | mock_workspace.sharedWorkspace.return_value.runningApplications.return_value = [ 50 | mock_app 51 | ] 52 | 53 | apps = UIElement.get_current_applications(obs) 54 | assert apps == ["TestApp"] 55 | -------------------------------------------------------------------------------- /tests/test_uielement_osworld.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | import pytest 4 | 5 | from gui_agents.aci.UIElementOSWorld import UIElement 6 | 7 | 8 | @pytest.fixture 9 | def sample_xml(): 10 | return """ 11 | 12 | 13 | 14 | 18 | 19 | 20 | 21 | """ 22 | 23 | 24 | @pytest.fixture 25 | def ui_element(sample_xml): 26 | tree = ET.ElementTree(ET.fromstring(sample_xml)) 27 | return UIElement(tree.getroot()) 28 | 29 | 30 | def test_nodeFromTree(sample_xml): 31 | """Test creating UIElement from XML string""" 32 | element = UIElement.nodeFromTree(sample_xml) 33 | assert element is not None 34 | assert isinstance(element, UIElement) 35 | 36 | 37 | def test_position(ui_element): 38 | """Test position extraction from XML""" 39 | button = ui_element.children()[0].children()[0] 40 | assert button.position() == (100, 200) 41 | 42 | 43 | def test_size(ui_element): 44 | """Test size extraction from XML""" 45 | button = ui_element.children()[0].children()[0] 46 | assert button.size() == (300, 400) 47 | --------------------------------------------------------------------------------