├── .gitattributes
├── .gitignore
├── .zed
    └── settings.json
├── Dockerfile
├── LICENSE
├── LICENSE.QucikStart
├── README.md
├── computer_use_demo
    ├── __init__.py
    ├── loop.py
    ├── requirements.txt
    ├── streamlit.py
    └── tools
    │   ├── __init__.py
    │   ├── base.py
    │   ├── bash.py
    │   ├── collection.py
    │   ├── computer.py
    │   ├── edit.py
    │   ├── game.py
    │   └── run.py
├── dev-requirements.txt
├── image
    ├── .config
    │   └── tint2
    │   │   ├── applications
    │   │       ├── firefox-custom.desktop
    │   │       ├── gedit.desktop
    │   │       └── terminal.desktop
    │   │   └── tint2rc
    ├── .streamlit
    │   └── config.toml
    ├── entrypoint.sh
    ├── http_server.py
    ├── index.html
    ├── mutter_startup.sh
    ├── novnc_startup.sh
    ├── start_all.sh
    ├── static_content
    │   └── index.html
    ├── tint2_startup.sh
    ├── x11vnc_startup.sh
    └── xvfb_startup.sh
├── main.py
├── pyproject.toml
├── ruff.toml
├── setup.sh
└── tests
    ├── conftest.py
    ├── loop_test.py
    ├── streamlit_test.py
    └── tools
        ├── bash_test.py
        ├── computer_test.py
        └── edit_test.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | .venv
  6 | .ruff_cache
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | share/python-wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .nox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | *.py,cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | cover/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | .pybuilder/
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | #   For a library or package, you might want to ignore these files since the code is
 89 | #   intended to run in multiple environments; otherwise, check them in:
 90 | # .python-version
 91 | 
 92 | # pipenv
 93 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 94 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 95 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 96 | #   install all needed dependencies.
 97 | #Pipfile.lock
 98 | 
 99 | # poetry
100 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
102 | #   commonly ignored for libraries.
103 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 | 
106 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
107 | __pypackages__/
108 | 
109 | # Celery stuff
110 | celerybeat-schedule
111 | celerybeat.pid
112 | 
113 | # SageMath parsed files
114 | *.sage.py
115 | 
116 | # Environments
117 | .env
118 | .venv
119 | env/
120 | venv/
121 | ENV/
122 | env.bak/
123 | venv.bak/
124 | 
125 | # Spyder project settings
126 | .spyderproject
127 | .spyproject
128 | 
129 | # Rope project settings
130 | .ropeproject
131 | 
132 | # mkdocs documentation
133 | /site
134 | 
135 | # mypy
136 | .mypy_cache/
137 | .dmypy.json
138 | dmypy.json
139 | 
140 | # Pyre type checker
141 | .pyre/
142 | 
143 | # pytype static type analyzer
144 | .pytype/
145 | 
146 | # Cython debug symbols
147 | cython_debug/
148 | 
149 | # PyCharm
150 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
151 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
152 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
153 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
154 | #.idea/
155 | 


--------------------------------------------------------------------------------
/.zed/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "preferred_line_length": 88,
 3 |   "languages": {
 4 |     "Python": {
 5 |       "language_servers": ["pyright", "ruff"]
 6 |     }
 7 |   },
 8 |   "telemetry": {
 9 |     "diagnostics": false,
10 |     "metrics": false
11 |   }
12 | }
13 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
  1 | FROM docker.io/ubuntu:22.04
  2 | 
  3 | ENV DEBIAN_FRONTEND=noninteractive
  4 | ENV DEBIAN_PRIORITY=high
  5 | 
  6 | RUN apt-get update && \
  7 |     apt-get -y upgrade && \
  8 |     apt-get -y install \
  9 |     build-essential \
 10 |     # UI Requirements
 11 |     xvfb \
 12 |     xterm \
 13 |     xdotool \
 14 |     scrot \
 15 |     imagemagick \
 16 |     sudo \
 17 |     mutter \
 18 |     x11vnc \
 19 |     # Python/pyenv reqs
 20 |     build-essential \
 21 |     libssl-dev  \
 22 |     zlib1g-dev \
 23 |     libbz2-dev \
 24 |     libreadline-dev \
 25 |     libsqlite3-dev \
 26 |     curl \
 27 |     git \
 28 |     libncursesw5-dev \
 29 |     xz-utils \
 30 |     tk-dev \
 31 |     libxml2-dev \
 32 |     libxmlsec1-dev \
 33 |     libffi-dev \
 34 |     liblzma-dev \
 35 |     # Network tools
 36 |     net-tools \
 37 |     netcat \
 38 |     # PPA req
 39 |     software-properties-common && \
 40 |     # Userland apps
 41 |     sudo add-apt-repository ppa:mozillateam/ppa && \
 42 |     sudo apt-get install -y --no-install-recommends \
 43 |     libreoffice \
 44 |     firefox-esr \
 45 |     x11-apps \
 46 |     xpdf \
 47 |     gedit \
 48 |     xpaint \
 49 |     tint2 \
 50 |     galculator \
 51 |     pcmanfm \
 52 |     unzip && \
 53 |     apt-get clean
 54 | 
 55 | # Install noVNC
 56 | RUN git clone --branch v1.5.0 https://github.com/novnc/noVNC.git /opt/noVNC && \
 57 |     git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
 58 |     ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html
 59 | 
 60 | # setup user
 61 | ENV USERNAME=computeruse
 62 | ENV HOME=/home/$USERNAME
 63 | RUN useradd -m -s /bin/bash -d $HOME $USERNAME
 64 | RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
 65 | USER computeruse
 66 | WORKDIR $HOME
 67 | 
 68 | # setup python
 69 | RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \
 70 |     cd ~/.pyenv && src/configure && make -C src && cd .. && \
 71 |     echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \
 72 |     echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \
 73 |     echo 'eval "$(pyenv init -)"' >> ~/.bashrc
 74 | ENV PYENV_ROOT="$HOME/.pyenv"
 75 | ENV PATH="$PYENV_ROOT/bin:$PATH"
 76 | ENV PYENV_VERSION_MAJOR=3
 77 | ENV PYENV_VERSION_MINOR=11
 78 | ENV PYENV_VERSION_PATCH=6
 79 | ENV PYENV_VERSION=$PYENV_VERSION_MAJOR.$PYENV_VERSION_MINOR.$PYENV_VERSION_PATCH
 80 | RUN eval "$(pyenv init -)" && \
 81 |     pyenv install $PYENV_VERSION && \
 82 |     pyenv global $PYENV_VERSION && \
 83 |     pyenv rehash
 84 | 
 85 | ENV PATH="$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH"
 86 | 
 87 | RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \
 88 |     python -m pip config set global.disable-pip-version-check true
 89 | 
 90 | # only reinstall if requirements.txt changes
 91 | COPY --chown=$USERNAME:$USERNAME computer_use_demo/requirements.txt $HOME/computer_use_demo/requirements.txt
 92 | RUN python -m pip install -r $HOME/computer_use_demo/requirements.txt
 93 | 
 94 | # setup desktop env & app
 95 | COPY --chown=$USERNAME:$USERNAME image/ $HOME
 96 | COPY --chown=$USERNAME:$USERNAME computer_use_demo/ $HOME/computer_use_demo/
 97 | 
 98 | ARG DISPLAY_NUM=1
 99 | ARG HEIGHT=768
100 | ARG WIDTH=1024
101 | ENV DISPLAY_NUM=$DISPLAY_NUM
102 | ENV HEIGHT=$HEIGHT
103 | ENV WIDTH=$WIDTH
104 | 
105 | ENTRYPOINT [ "./entrypoint.sh" ]
106 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 observedobserver
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/LICENSE.QucikStart:
--------------------------------------------------------------------------------
1 | Copyright 2024 Anthropic, PBC.
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # claude-minecraft-use
 2 | 
 3 | This repo uses Claude computer use [quick starts](https://github.com/anthropics/anthropic-quickstarts) as base templates with modifications:
 4 | - Direct control of the local MacOS machine (requires some changes to the action tools)
 5 | - Allows AI to control Minecraft
 6 | 
 7 | This repo is just for fun and testing the capabilities of Claude computer use. Thanks to the great work of the Claude team for making this possible. Also thanks to [BlueM](https://github.com/BlueM) for cliclick which makes this demo work on Mac.
 8 | 
 9 | Limitations discovered during testing:
10 | 1. Coordinate control is not accurate enough
11 | 2. Cannot handle complex tasks step by step unless prompted well
12 | 
13 | https://github.com/user-attachments/assets/39e74c82-d4fe-4cb2-b213-b0b504d64772
14 | 
15 | 


--------------------------------------------------------------------------------
/computer_use_demo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ObservedObserver/claude-minecraft-use/8b52ef2a5aa175a49475db07ad7168b33089f8b6/computer_use_demo/__init__.py


--------------------------------------------------------------------------------
/computer_use_demo/loop.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools.
  3 | """
  4 | 
  5 | import platform
  6 | from collections.abc import Callable
  7 | from datetime import datetime
  8 | from enum import StrEnum
  9 | from typing import Any, cast
 10 | 
 11 | from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse
 12 | from anthropic.types import (
 13 |     ToolResultBlockParam,
 14 | )
 15 | from anthropic.types.beta import (
 16 |     BetaContentBlock,
 17 |     BetaContentBlockParam,
 18 |     BetaImageBlockParam,
 19 |     BetaMessage,
 20 |     BetaMessageParam,
 21 |     BetaTextBlockParam,
 22 |     BetaToolResultBlockParam,
 23 | )
 24 | 
 25 | from .tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult, GameTool
 26 | 
 27 | BETA_FLAG = "computer-use-2024-10-22"  # beta identifier; the same string is passed as `betas` in sampling_loop
 28 | 
 29 | 
 30 | class APIProvider(StrEnum):  # selects which API client sampling_loop constructs
 31 |     ANTHROPIC = "anthropic"  # -> Anthropic(api_key=...)
 32 |     BEDROCK = "bedrock"  # -> AnthropicBedrock()
 33 |     VERTEX = "vertex"  # -> AnthropicVertex()
 34 | 
 35 | 
 36 | PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {  # default model id per provider
 37 |     APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
 38 |     APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
 39 |     APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
 40 | }
 41 | 
 42 | 
 43 | # Base system prompt; sampling_loop appends the caller's optional suffix to it.
 44 | # The text below describes a MacOS host with Chrome and Minecraft installed.
 45 | # NOTE(review): the DISPLAY=:1 / xterm guidance looks like a leftover from an X11/Docker
 46 | # setup - confirm it still applies on MacOS. `%-d` in the date format is a
 47 | # platform-specific strftime extension (not supported on Windows).
 48 | SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
 49 | * You are utilising an MacOS machine using {platform.machine()} architecture with internet access.
 50 | * You can feel free to install MacOS applications with your bash tool. Use curl instead of wget.
 51 | * To open chrome, please just click on the chrome icon.  Note, chrome is what is installed on your system.
 52 | * Minecraft is installed on your system.  You can use the computer tool to interact with it.
 53 | * Using bash tool you can start GUI applications, but you need to set export DISPLAY=:1 and use a subshell. For example "(DISPLAY=:1 xterm &)". GUI apps run with bash tool will appear within your desktop environment, but they may take some time to appear. Take a screenshot to confirm it did.
 54 | * When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `grep -n -B <lines before> -A <lines after> <query> <filename>` to confirm output.
 55 | * When viewing a page it can be helpful to zoom out so that you can see everything on the page.  Either that, or make sure you scroll down to see everything before deciding something isn't available.
 56 | * When using your computer function calls, they take a while to run and send back to you.  Where possible/feasible, try to chain multiple of these calls all into one function calls request.
 57 | * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
 58 | </SYSTEM_CAPABILITY>
 59 | 
 60 | <IMPORTANT>
 61 | * When using Chrome, if a startup wizard appears, IGNORE IT.  Do not even click "skip this step".  Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there.
 62 | * If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your StrReplaceEditTool.
 63 | </IMPORTANT>"""
 64 | 
 65 | 
 66 | async def sampling_loop(
 67 |     *,
 68 |     model: str,
 69 |     provider: APIProvider,
 70 |     system_prompt_suffix: str,
 71 |     messages: list[BetaMessageParam],
 72 |     output_callback: Callable[[BetaContentBlock], None],
 73 |     tool_output_callback: Callable[[ToolResult, str], None],
 74 |     api_response_callback: Callable[[APIResponse[BetaMessage]], None],
 75 |     api_key: str,
 76 |     only_n_most_recent_images: int | None = None,
 77 |     max_tokens: int = 4096,
 78 | ):
 79 |     """
 80 |     Agentic sampling loop for the assistant/tool interaction of computer use.
 81 |     """
 82 |     tool_collection = ToolCollection(
 83 |         # ComputerTool(),
 84 |         GameTool(),
 85 |         BashTool(),
 86 |         EditTool(),
 87 |     )
 88 |     system = (
 89 |         f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}"
 90 |     )
 91 | 
 92 |     while True:
 93 |         if only_n_most_recent_images:
 94 |             _maybe_filter_to_n_most_recent_images(messages, only_n_most_recent_images)
 95 | 
 96 |         if provider == APIProvider.ANTHROPIC:
 97 |             client = Anthropic(api_key=api_key)
 98 |         elif provider == APIProvider.VERTEX:
 99 |             client = AnthropicVertex()
100 |         elif provider == APIProvider.BEDROCK:
101 |             client = AnthropicBedrock()
102 | 
103 |         # Call the API
104 |         # we use raw_response to provide debug information to streamlit. Your
105 |         # implementation may be able call the SDK directly with:
106 |         # `response = client.messages.create(...)` instead.
107 |         raw_response = client.beta.messages.with_raw_response.create(
108 |             max_tokens=max_tokens,
109 |             messages=messages,
110 |             model=model,
111 |             system=system,
112 |             tools=tool_collection.to_params(),
113 |             betas=["computer-use-2024-10-22"],
114 |         )
115 | 
116 |         api_response_callback(cast(APIResponse[BetaMessage], raw_response))
117 | 
118 |         response = raw_response.parse()
119 | 
120 |         messages.append(
121 |             {
122 |                 "role": "assistant",
123 |                 "content": cast(list[BetaContentBlockParam], response.content),
124 |             }
125 |         )
126 | 
127 |         tool_result_content: list[BetaToolResultBlockParam] = []
128 |         for content_block in cast(list[BetaContentBlock], response.content):
129 |             output_callback(content_block)
130 |             if content_block.type == "tool_use":
131 |                 result = await tool_collection.run(
132 |                     name=content_block.name,
133 |                     tool_input=cast(dict[str, Any], content_block.input),
134 |                 )
135 |                 tool_result_content.append(
136 |                     _make_api_tool_result(result, content_block.id)
137 |                 )
138 |                 tool_output_callback(result, content_block.id)
139 | 
140 |         if not tool_result_content:
141 |             return messages
142 | 
143 |         messages.append({"content": tool_result_content, "role": "user"})
144 | 
145 | 
146 | def _maybe_filter_to_n_most_recent_images(
147 |     messages: list[BetaMessageParam],
148 |     images_to_keep: int,
149 |     min_removal_threshold: int = 10,
150 | ):
151 |     """
152 |     With the assumption that images are screenshots that are of diminishing value as
153 |     the conversation progresses, remove all but the final `images_to_keep` tool_result
154 |     images in place, with a chunk of min_removal_threshold to reduce the amount we
155 |     break the implicit prompt cache.
156 |     """
157 |     if images_to_keep is None:
158 |         return messages
159 | 
160 |     tool_result_blocks = cast(
161 |         list[ToolResultBlockParam],
162 |         [
163 |             item
164 |             for message in messages
165 |             for item in (
166 |                 message["content"] if isinstance(message["content"], list) else []
167 |             )
168 |             if isinstance(item, dict) and item.get("type") == "tool_result"
169 |         ],
170 |     )
171 | 
172 |     total_images = sum(
173 |         1
174 |         for tool_result in tool_result_blocks
175 |         for content in tool_result.get("content", [])
176 |         if isinstance(content, dict) and content.get("type") == "image"
177 |     )
178 | 
179 |     images_to_remove = total_images - images_to_keep
180 |     # for better cache behavior, we want to remove in chunks
181 |     images_to_remove -= images_to_remove % min_removal_threshold
182 | 
183 |     for tool_result in tool_result_blocks:
184 |         if isinstance(tool_result.get("content"), list):
185 |             new_content = []
186 |             for content in tool_result.get("content", []):
187 |                 if isinstance(content, dict) and content.get("type") == "image":
188 |                     if images_to_remove > 0:
189 |                         images_to_remove -= 1
190 |                         continue
191 |                 new_content.append(content)
192 |             tool_result["content"] = new_content
193 | 
194 | 
195 | def _make_api_tool_result(
196 |     result: ToolResult, tool_use_id: str
197 | ) -> BetaToolResultBlockParam:
198 |     """Convert an agent ToolResult to an API ToolResultBlockParam."""
199 |     tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = []
200 |     is_error = False
201 |     if result.error:
202 |         is_error = True
203 |         tool_result_content = _maybe_prepend_system_tool_result(result, result.error)
204 |     else:
205 |         if result.output:
206 |             tool_result_content.append(
207 |                 {
208 |                     "type": "text",
209 |                     "text": _maybe_prepend_system_tool_result(result, result.output),
210 |                 }
211 |             )
212 |         if result.base64_image:
213 |             tool_result_content.append(
214 |                 {
215 |                     "type": "image",
216 |                     "source": {
217 |                         "type": "base64",
218 |                         "media_type": "image/png",
219 |                         "data": result.base64_image,
220 |                     },
221 |                 }
222 |             )
223 |     return {
224 |         "type": "tool_result",
225 |         "content": tool_result_content,
226 |         "tool_use_id": tool_use_id,
227 |         "is_error": is_error,
228 |     }
229 | 
230 | 
231 | def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
232 |     if result.system:
233 |         result_text = f"<system>{result.system}</system>\n{result_text}"
234 |     return result_text
235 | 


--------------------------------------------------------------------------------
/computer_use_demo/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit>=1.38.0
2 | anthropic[bedrock,vertex]>=0.37.1
3 | jsonschema==4.22.0
4 | boto3>=1.28.57
5 | google-auth<3,>=2
6 | 


--------------------------------------------------------------------------------
/computer_use_demo/streamlit.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Entrypoint for streamlit, see https://docs.streamlit.io/
  3 | """
  4 | 
  5 | import asyncio
  6 | import base64
  7 | import os
  8 | import subprocess
  9 | from datetime import datetime
 10 | from enum import StrEnum
 11 | from functools import partial
 12 | from pathlib import PosixPath
 13 | from typing import cast
 14 | 
 15 | import streamlit as st
 16 | from anthropic import APIResponse
 17 | from anthropic.types import (
 18 |     TextBlock,
 19 | )
 20 | from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock
 21 | from anthropic.types.tool_use_block import ToolUseBlock
 22 | from streamlit.delta_generator import DeltaGenerator
 23 | 
 24 | from computer_use_demo.loop import (
 25 |     PROVIDER_TO_DEFAULT_MODEL_NAME,
 26 |     APIProvider,
 27 |     sampling_loop,
 28 | )
 29 | from computer_use_demo.tools import ToolResult
 30 | 
 31 | from dotenv import load_dotenv
 32 | # load env vars from a .env file (NOTE(review): python-dotenv is not listed in computer_use_demo/requirements.txt)
 33 | load_dotenv()
 34 | 
 35 | CONFIG_DIR = PosixPath("~/.anthropic").expanduser()
 36 | API_KEY_FILE = CONFIG_DIR / "api_key"
 37 | STREAMLIT_STYLE = """
 38 | <style>
 39 |     /* Hide chat input while agent loop is running */
 40 |     .stApp[data-teststate=running] .stChatInput textarea,
 41 |     .stApp[data-test-script-state=running] .stChatInput textarea {
 42 |         display: none;
 43 |     }
 44 |      /* Hide the streamlit deploy button */
 45 |     .stDeployButton {
 46 |         visibility: hidden;
 47 |     }
 48 | </style>
 49 | """
 50 | # The st.warning(WARNING_TEXT) call near the top of main() is currently commented out.
 51 | WARNING_TEXT = "⚠️ Security Alert: Never provide access to sensitive accounts or data, as malicious web content can hijack Claude's behavior"
 52 | 
 53 | 
 54 | class Sender(StrEnum):  # role labels used for chat messages
 55 |     USER = "user"
 56 |     BOT = "assistant"
 57 |     TOOL = "tool"  # used when rendering stored tool results
 58 | 
 59 | 
 60 | def setup_state():
 61 |     if "messages" not in st.session_state:
 62 |         st.session_state.messages = []
 63 |     if "api_key" not in st.session_state:
 64 |         # Try to load API key from file first, then environment
 65 |         st.session_state.api_key = load_from_storage("api_key") or os.getenv(
 66 |             "ANTHROPIC_API_KEY", ""
 67 |         )
 68 |     if "provider" not in st.session_state:
 69 |         st.session_state.provider = (
 70 |             os.getenv("API_PROVIDER", "anthropic") or APIProvider.ANTHROPIC
 71 |         )
 72 |     if "provider_radio" not in st.session_state:
 73 |         st.session_state.provider_radio = st.session_state.provider
 74 |     if "model" not in st.session_state:
 75 |         _reset_model()
 76 |     if "auth_validated" not in st.session_state:
 77 |         st.session_state.auth_validated = False
 78 |     if "responses" not in st.session_state:
 79 |         st.session_state.responses = {}
 80 |     if "tools" not in st.session_state:
 81 |         st.session_state.tools = {}
 82 |     if "only_n_most_recent_images" not in st.session_state:
 83 |         st.session_state.only_n_most_recent_images = 10
 84 |     if "custom_system_prompt" not in st.session_state:
 85 |         st.session_state.custom_system_prompt = load_from_storage("system_prompt") or ""
 86 |     if "hide_images" not in st.session_state:
 87 |         st.session_state.hide_images = False
 88 | 
 89 | 
 90 | def _reset_model():
 91 |     st.session_state.model = PROVIDER_TO_DEFAULT_MODEL_NAME[
 92 |         cast(APIProvider, st.session_state.provider)
 93 |     ]
 94 | 
 95 | 
 96 | async def main():
 97 |     """Render loop for streamlit"""
 98 |     setup_state()
 99 | 
100 |     st.markdown(STREAMLIT_STYLE, unsafe_allow_html=True)
101 | 
102 |     st.title("Claude Computer Use Minecraft")
103 |     st.markdown("""
104 |     This is a demo of using Claude to control a computer running Minecraft.
105 |     Github Repo: [github.com/ObservedObserver/claude-minecraft-use](https://github.com/ObservedObserver/claude-minecraft-use)
106 |     """)
107 | 
108 |     # if not os.getenv("HIDE_WARNING", False):
109 |     #     st.warning(WARNING_TEXT)
110 | 
111 |     with st.sidebar:
112 | 
113 |         def _reset_api_provider():
114 |             if st.session_state.provider_radio != st.session_state.provider:
115 |                 _reset_model()
116 |                 st.session_state.provider = st.session_state.provider_radio
117 |                 st.session_state.auth_validated = False
118 | 
119 |         provider_options = [option.value for option in APIProvider]
120 |         st.radio(
121 |             "API Provider",
122 |             options=provider_options,
123 |             key="provider_radio",
124 |             format_func=lambda x: x.title(),
125 |             on_change=_reset_api_provider,
126 |         )
127 | 
128 |         st.text_input("Model", key="model")
129 | 
130 |         if st.session_state.provider == APIProvider.ANTHROPIC:
131 |             st.text_input(
132 |                 "Anthropic API Key",
133 |                 type="password",
134 |                 key="api_key",
135 |                 on_change=lambda: save_to_storage("api_key", st.session_state.api_key),
136 |             )
137 | 
138 |         st.number_input(
139 |             "Only send N most recent images",
140 |             min_value=0,
141 |             key="only_n_most_recent_images",
142 |             help="To decrease the total tokens sent, remove older screenshots from the conversation",
143 |         )
144 |         st.text_area(
145 |             "Custom System Prompt Suffix",
146 |             key="custom_system_prompt",
147 |             help="Additional instructions to append to the system prompt. see computer_use_demo/loop.py for the base system prompt.",
148 |             on_change=lambda: save_to_storage(
149 |                 "system_prompt", st.session_state.custom_system_prompt
150 |             ),
151 |         )
152 |         st.checkbox("Hide screenshots", key="hide_images")
153 | 
154 |         if st.button("Reset", type="primary"):
155 |             with st.spinner("Resetting..."):
156 |                 st.session_state.clear()
157 |                 setup_state()
158 | 
159 |                 subprocess.run("pkill Xvfb; pkill tint2", shell=True)  # noqa: ASYNC221
160 |                 await asyncio.sleep(1)
161 |                 subprocess.run("./start_all.sh", shell=True)  # noqa: ASYNC221
162 | 
163 |     if not st.session_state.auth_validated:
164 |         if auth_error := validate_auth(
165 |             st.session_state.provider, st.session_state.api_key
166 |         ):
167 |             st.warning(f"Please resolve the following auth issue:\n\n{auth_error}")
168 |             return
169 |         else:
170 |             st.session_state.auth_validated = True
171 | 
172 |     chat, http_logs = st.tabs(["Chat", "HTTP Exchange Logs"])
173 |     new_message = st.chat_input(
174 |         "Type a message to send to Claude to control the computer..."
175 |     )
176 | 
177 |     with chat:
178 |         # render past chats
179 |         for message in st.session_state.messages:
180 |             if isinstance(message["content"], str):
181 |                 _render_message(message["role"], message["content"])
182 |             elif isinstance(message["content"], list):
183 |                 for block in message["content"]:
184 |                     # the tool result we send back to the Anthropic API isn't sufficient to render all details,
185 |                     # so we store the tool use responses
186 |                     if isinstance(block, dict) and block["type"] == "tool_result":
187 |                         _render_message(
188 |                             Sender.TOOL, st.session_state.tools[block["tool_use_id"]]
189 |                         )
190 |                     else:
191 |                         _render_message(
192 |                             message["role"],
193 |                             cast(BetaTextBlock | BetaToolUseBlock, block),
194 |                         )
195 | 
196 |         # render past http exchanges
197 |         for identity, response in st.session_state.responses.items():
198 |             _render_api_response(response, identity, http_logs)
199 | 
200 |         # render past chats
201 |         if new_message:
202 |             st.session_state.messages.append(
203 |                 {
204 |                     "role": Sender.USER,
205 |                     "content": [TextBlock(type="text", text=new_message)],
206 |                 }
207 |             )
208 |             _render_message(Sender.USER, new_message)
209 | 
210 |         try:
211 |             most_recent_message = st.session_state["messages"][-1]
212 |         except IndexError:
213 |             return
214 | 
215 |         if most_recent_message["role"] is not Sender.USER:
216 |             # we don't have a user message to respond to, exit early
217 |             return
218 | 
219 |         with st.spinner("Running Agent..."):
220 |             # run the agent sampling loop with the newest message
221 |             st.session_state.messages = await sampling_loop(
222 |                 system_prompt_suffix=st.session_state.custom_system_prompt,
223 |                 model=st.session_state.model,
224 |                 provider=st.session_state.provider,
225 |                 messages=st.session_state.messages,
226 |                 output_callback=partial(_render_message, Sender.BOT),
227 |                 tool_output_callback=partial(
228 |                     _tool_output_callback, tool_state=st.session_state.tools
229 |                 ),
230 |                 api_response_callback=partial(
231 |                     _api_response_callback,
232 |                     tab=http_logs,
233 |                     response_state=st.session_state.responses,
234 |                 ),
235 |                 api_key=st.session_state.api_key,
236 |                 only_n_most_recent_images=st.session_state.only_n_most_recent_images,
237 |             )
238 | 
239 | 
def validate_auth(provider: APIProvider, api_key: str | None):
    """Check whether the selected provider has usable credentials.

    Args:
        provider: The API provider chosen in the sidebar.
        api_key: The Anthropic API key entered by the user, if any.

    Returns:
        A user-facing message describing the first problem found, or
        ``None`` when no problem was detected.
    """
    # Anthropic: the only requirement is a non-empty API key.
    if provider == APIProvider.ANTHROPIC and not api_key:
        return "Enter your Anthropic API key in the sidebar to continue."

    if provider == APIProvider.BEDROCK:
        # imported lazily, only when the Bedrock branch is taken
        import boto3

        aws_credentials = boto3.Session().get_credentials()
        if not aws_credentials:
            return "You must have AWS credentials set up to use the Bedrock API."

    if provider == APIProvider.VERTEX:
        # imported lazily, only when the Vertex branch is taken
        import google.auth
        from google.auth.exceptions import DefaultCredentialsError

        region = os.environ.get("CLOUD_ML_REGION")
        if not region:
            return "Set the CLOUD_ML_REGION environment variable to use the Vertex API."

        cloud_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
        try:
            google.auth.default(scopes=cloud_scopes)
        except DefaultCredentialsError:
            return "Your google cloud credentials are not set up correctly."

    return None
261 | 
262 | 
def load_from_storage(filename: str) -> str | None:
    """Read a stored value back from the storage directory.

    Args:
        filename: Name of the file inside ``CONFIG_DIR``.

    Returns:
        The file's text with surrounding whitespace stripped, or ``None`` when
        the file is missing, empty after stripping, or could not be read.
    """
    try:
        target = CONFIG_DIR / filename
        if not target.exists():
            return None
        contents = target.read_text().strip()
        # an empty (or whitespace-only) file counts as "nothing stored"
        return contents if contents else None
    except Exception as exc:
        # best effort: surface the problem in the UI and fall back to None
        st.write(f"Debug: Error loading {filename}: {exc}")
        return None
274 | 
275 | 
def save_to_storage(filename: str, data: str) -> None:
    """Save data to a file in the storage directory.

    The file is created (if needed) and restricted to owner read/write
    *before* the data is written, so the contents are never on disk with
    wider permissions than 0o600.

    Args:
        filename: Name of the file inside ``CONFIG_DIR``.
        data: Text to store; replaces any previous contents.

    Any exception is caught and reported through ``st.write`` rather than
    raised (saving is best effort).
    """
    try:
        CONFIG_DIR.mkdir(parents=True, exist_ok=True)
        file_path = CONFIG_DIR / filename
        # Create the file with a restrictive mode first; for a pre-existing
        # file touch() leaves the mode untouched, so chmod() tightens it too.
        file_path.touch(mode=0o600, exist_ok=True)
        # Ensure only user can read/write the file
        file_path.chmod(0o600)
        file_path.write_text(data)
    except Exception as e:
        st.write(f"Debug: Error saving {filename}: {e}")
286 | 
287 | 
def _api_response_callback(
    response: APIResponse[BetaMessage],
    tab: DeltaGenerator,
    response_state: dict[str, APIResponse[BetaMessage]],
):
    """
    Record an API response and draw it immediately.

    The response is stored in ``response_state`` under the current local
    timestamp (ISO-8601 string) and then rendered into ``tab``.
    """
    timestamp_key = datetime.now().isoformat()
    response_state[timestamp_key] = response
    _render_api_response(response, timestamp_key, tab)
299 | 
300 | 
def _tool_output_callback(
    tool_output: ToolResult,
    tool_id: str,
    tool_state: dict[str, ToolResult],
):
    """Record a tool result under its tool-use id, then draw it in the chat.

    The stored result is what the chat history loop looks up (by
    ``tool_use_id``) when it re-renders past ``tool_result`` blocks.
    """
    tool_state[tool_id] = tool_output
    _render_message(Sender.TOOL, tool_output)
307 | 
308 | 
def _render_api_response(
    response: APIResponse[BetaMessage], response_id: str, tab: DeltaGenerator
):
    """Draw one HTTP request/response pair inside a collapsible section of ``tab``.

    For both the request and the response a markdown summary line plus the
    headers is shown, followed by the body rendered through ``st.json``.
    """
    separator = "\n\n"

    def _format_headers(headers) -> str:
        # one inline-code entry per header, separated by blank lines
        return separator.join(f"`{k}: {v}`" for k, v in headers.items())

    with tab, st.expander(f"Request/Response ({response_id})"):
        request = response.http_request
        request_headers = _format_headers(request.headers)
        st.markdown(f"`{request.method} {request.url}`{separator}{request_headers}")
        st.json(request.read().decode())

        response_headers = _format_headers(response.headers)
        st.markdown(
            f"`{response.http_response.status_code}`{separator}{response_headers}"
        )
        st.json(response.http_response.text)
324 | 
325 | 
def _render_message(
    sender: Sender,
    message: str | BetaTextBlock | BetaToolUseBlock | ToolResult,
):
    """Convert input from the user or output from the agent to a streamlit message.

    Args:
        sender: Role used for the chat bubble.
        message: Plain text, an API text / tool-use block, or a tool result.

    Nothing is rendered when ``message`` is falsy, or when it is a tool result
    that carries neither output nor error while screenshots are hidden (such a
    result would otherwise produce an empty chat bubble).
    """
    # streamlit's hotreloading breaks isinstance checks, so we need to check for class names
    is_tool_result = not isinstance(message, str) and (
        isinstance(message, ToolResult)
        or message.__class__.__name__ == "ToolResult"
        or message.__class__.__name__ == "CLIResult"
    )
    if not message:
        return
    # Tool results always *have* `error` / `output` attributes, so test their
    # values (not their existence) to detect an image-only result.
    if (
        is_tool_result
        and st.session_state.hide_images
        and not getattr(message, "error", None)
        and not getattr(message, "output", None)
    ):
        return
    with st.chat_message(sender):
        if is_tool_result:
            message = cast(ToolResult, message)
            if message.output:
                # CLI output is shown verbatim in a code block
                if message.__class__.__name__ == "CLIResult":
                    st.code(message.output)
                else:
                    st.markdown(message.output)
            if message.error:
                st.error(message.error)
            if message.base64_image and not st.session_state.hide_images:
                st.image(base64.b64decode(message.base64_image))
        elif isinstance(message, BetaTextBlock) or isinstance(message, TextBlock):
            st.write(message.text)
        elif isinstance(message, BetaToolUseBlock) or isinstance(message, ToolUseBlock):
            st.code(f"Tool Use: {message.name}\nInput: {message.input}")
        else:
            st.markdown(message)
362 | 
363 | 
# Entry point when the file is executed as a script: run the async main()
# coroutine to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
366 | 


--------------------------------------------------------------------------------
/computer_use_demo/tools/__init__.py:
--------------------------------------------------------------------------------
 1 | from .base import CLIResult, ToolResult
 2 | from .bash import BashTool
 3 | from .collection import ToolCollection
 4 | from .computer import ComputerTool
 5 | from .edit import EditTool
 6 | from .game import GameTool
 7 | 
 8 | __ALL__ = [
 9 |     BashTool,
10 |     CLIResult,
11 |     ComputerTool,
12 |     EditTool,
13 |     ToolCollection,
14 |     ToolResult,
15 |     GameTool,
16 | ]
17 | 


--------------------------------------------------------------------------------
/computer_use_demo/tools/base.py:
--------------------------------------------------------------------------------
 1 | from abc import ABCMeta, abstractmethod
 2 | from dataclasses import dataclass, fields, replace
 3 | from typing import Any
 4 | 
 5 | from anthropic.types.beta import BetaToolUnionParam
 6 | 
 7 | 
 8 | class BaseAnthropicTool(metaclass=ABCMeta):
 9 |     """Abstract base class for Anthropic-defined tools."""
10 | 
11 |     @abstractmethod
12 |     def __call__(self, **kwargs) -> Any:
13 |         """Executes the tool with the given arguments."""
14 |         ...
15 | 
16 |     @abstractmethod
17 |     def to_params(
18 |         self,
19 |     ) -> BetaToolUnionParam:
20 |         raise NotImplementedError
21 | 
22 | 
23 | @dataclass(kw_only=True, frozen=True)
24 | class ToolResult:
25 |     """Represents the result of a tool execution."""
26 | 
27 |     output: str | None = None
28 |     error: str | None = None
29 |     base64_image: str | None = None
30 |     system: str | None = None
31 | 
32 |     def __bool__(self):
33 |         return any(getattr(self, field.name) for field in fields(self))
34 | 
35 |     def __add__(self, other: "ToolResult"):
36 |         def combine_fields(
37 |             field: str | None, other_field: str | None, concatenate: bool = True
38 |         ):
39 |             if field and other_field:
40 |                 if concatenate:
41 |                     return field + other_field
42 |                 raise ValueError("Cannot combine tool results")
43 |             return field or other_field
44 | 
45 |         return ToolResult(
46 |             output=combine_fields(self.output, other.output),
47 |             error=combine_fields(self.error, other.error),
48 |             base64_image=combine_fields(self.base64_image, other.base64_image, False),
49 |             system=combine_fields(self.system, other.system),
50 |         )
51 | 
52 |     def replace(self, **kwargs):
53 |         """Returns a new ToolResult with the given fields replaced."""
54 |         return replace(self, **kwargs)
55 | 
56 | 
class CLIResult(ToolResult):
    """A ToolResult that can be rendered as a CLI output.

    Adds no fields or behavior; the subclass only acts as a marker so that
    consumers can tell command-line output apart from other tool results.
    """
59 | 
60 | 
class ToolFailure(ToolResult):
    """A ToolResult that represents a failure.

    Adds no fields or behavior of its own; the failure description is carried
    in the inherited ``error`` field.
    """
63 | 
64 | 
class ToolError(Exception):
    """Raised when a tool encounters an error.

    Attributes:
        message: Human-readable description of what went wrong.
    """

    def __init__(self, message):
        # Forward to Exception so str(err) and err.args carry the message
        # regardless of whether it was passed positionally or by keyword.
        super().__init__(message)
        self.message = message
70 | 


--------------------------------------------------------------------------------
/computer_use_demo/tools/bash.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import os
  3 | from typing import ClassVar, Literal
  4 | 
  5 | from anthropic.types.beta import BetaToolBash20241022Param
  6 | 
  7 | from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult
  8 | 
  9 | 
class _BashSession:
    """A session of a bash shell.

    Wraps one long-lived shell subprocess. Commands are written to its stdin
    followed by an ``echo`` of a sentinel string; the output of a command is
    everything that appears on stdout before that sentinel.
    """

    _started: bool
    _process: asyncio.subprocess.Process

    # executable launched by start()
    command: str = "/bin/bash"
    # polling interval used by run() while waiting for the sentinel
    _output_delay: float = 0.2  # seconds
    # maximum time run() waits for a command before flagging the session
    _timeout: float = 120.0  # seconds
    # marker echoed after every command to detect that it has finished
    _sentinel: str = "<<exit>>"

    def __init__(self):
        self._started = False
        # set once a command exceeded _timeout; run() then refuses new commands
        self._timed_out = False

    async def start(self):
        """Launch the shell subprocess; a no-op if it was already started."""
        if self._started:
            return

        self._process = await asyncio.create_subprocess_shell(
            self.command,
            # os.setsid makes the child the leader of a new session
            preexec_fn=os.setsid,
            shell=True,
            bufsize=0,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )

        self._started = True

    def stop(self):
        """Terminate the bash shell.

        Raises:
            ToolError: If the session was never started.
        """
        if not self._started:
            raise ToolError("Session has not started.")
        # already exited: nothing to terminate
        if self._process.returncode is not None:
            return
        self._process.terminate()

    async def run(self, command: str):
        """Execute a command in the bash shell.

        Returns:
            A CLIResult with the command's stdout and stderr, or a ToolResult
            asking for a restart when the shell process has already exited.

        Raises:
            ToolError: If the session was not started, or if this or an
                earlier command timed out.
        """
        if not self._started:
            raise ToolError("Session has not started.")
        if self._process.returncode is not None:
            return ToolResult(
                system="tool must be restarted",
                error=f"bash has exited with returncode {self._process.returncode}",
            )
        if self._timed_out:
            raise ToolError(
                f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
            )

        # we know these are not None because we created the process with PIPEs
        assert self._process.stdin
        assert self._process.stdout
        assert self._process.stderr

        # send command to the process, followed by an echo of the sentinel so
        # the end of the command's output can be recognised
        self._process.stdin.write(
            command.encode() + f"; echo '{self._sentinel}'\n".encode()
        )
        await self._process.stdin.drain()

        # read output from the process, until the sentinel is found
        try:
            async with asyncio.timeout(self._timeout):
                while True:
                    await asyncio.sleep(self._output_delay)
                    # if we read directly from stdout/stderr, it will wait forever for
                    # EOF. use the StreamReader buffer directly instead.
                    # NOTE(review): _buffer is a private StreamReader attribute.
                    output = self._process.stdout._buffer.decode()  # pyright: ignore[reportAttributeAccessIssue]
                    if self._sentinel in output:
                        # strip the sentinel and break
                        output = output[: output.index(self._sentinel)]
                        break
        except asyncio.TimeoutError:
            # remember the timeout so later calls fail fast until a restart
            self._timed_out = True
            raise ToolError(
                f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
            ) from None

        # drop a single trailing newline from stdout
        if output.endswith("\n"):
            output = output[:-1]

        # stderr is taken as whatever has accumulated by the time stdout is done
        error = self._process.stderr._buffer.decode()  # pyright: ignore[reportAttributeAccessIssue]
        if error.endswith("\n"):
            error = error[:-1]

        # clear the buffers so that the next output can be read correctly
        self._process.stdout._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
        self._process.stderr._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]

        return CLIResult(output=output, error=error)
104 | 
105 | 
class BashTool(BaseAnthropicTool):
    """
    Tool exposing a persistent bash session to the agent.
    The tool parameters are defined by Anthropic and are not editable.
    """

    _session: _BashSession | None
    name: ClassVar[Literal["bash"]] = "bash"
    api_type: ClassVar[Literal["bash_20241022"]] = "bash_20241022"

    def __init__(self):
        # the shell is created lazily on first use (or on an explicit restart)
        self._session = None
        super().__init__()

    async def _open_session(self) -> None:
        """Replace the current session with a freshly started one."""
        self._session = _BashSession()
        await self._session.start()

    async def __call__(
        self, command: str | None = None, restart: bool = False, **kwargs
    ):
        """Run ``command`` in the session, or restart the session.

        A restart takes precedence over ``command``. Raises ToolError when
        neither a restart nor a command was requested.
        """
        if restart:
            current = self._session
            if current:
                current.stop()
            await self._open_session()
            return ToolResult(system="tool has been restarted.")

        if self._session is None:
            await self._open_session()

        if command is None:
            raise ToolError("no command provided.")
        return await self._session.run(command)

    def to_params(self) -> BetaToolBash20241022Param:
        """Describe this tool to the API (type and name only)."""
        params = {
            "type": self.api_type,
            "name": self.name,
        }
        return params
145 | 


--------------------------------------------------------------------------------
/computer_use_demo/tools/collection.py:
--------------------------------------------------------------------------------
 1 | """Collection classes for managing multiple tools."""
 2 | 
 3 | from typing import Any
 4 | 
 5 | from anthropic.types.beta import BetaToolUnionParam
 6 | 
 7 | from .base import (
 8 |     BaseAnthropicTool,
 9 |     ToolError,
10 |     ToolFailure,
11 |     ToolResult,
12 | )
13 | 
14 | 
class ToolCollection:
    """Groups several anthropic-defined tools and dispatches calls by name."""

    def __init__(self, *tools: BaseAnthropicTool):
        self.tools = tools
        # index the tools by the name each one reports in its API params
        lookup = {}
        for member in tools:
            lookup[member.to_params()["name"]] = member
        self.tool_map = lookup

    def to_params(self) -> list[BetaToolUnionParam]:
        """Return the API parameter dicts of all tools, in registration order."""
        params = []
        for member in self.tools:
            params.append(member.to_params())
        return params

    async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
        """Invoke the tool called ``name`` with ``tool_input`` as keyword arguments.

        An unknown name or a ToolError raised by the tool is reported as a
        ToolFailure instead of propagating.
        """
        selected = self.tool_map.get(name)
        if not selected:
            return ToolFailure(error=f"Tool {name} is invalid")
        try:
            outcome = await selected(**tool_input)
        except ToolError as failure:
            return ToolFailure(error=failure.message)
        return outcome
35 | 


--------------------------------------------------------------------------------
/computer_use_demo/tools/computer.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import base64
  3 | import os
  4 | import shlex
  5 | import shutil
  6 | from enum import StrEnum
  7 | from pathlib import Path
  8 | from typing import Literal, TypedDict
  9 | from uuid import uuid4
 10 | 
 11 | from anthropic.types.beta import BetaToolComputerUse20241022Param
 12 | 
 13 | from .base import BaseAnthropicTool, ToolError, ToolResult
 14 | from .run import run
 15 | 
# Directory in which ComputerTool.screenshot() writes its PNG files.
OUTPUT_DIR = "/tmp/outputs"

# Value passed as the `-w` argument of the typing command for the "type" action.
TYPING_DELAY_MS = 12
# The "type" action splits its text into chunks of at most this many characters.
TYPING_GROUP_SIZE = 50

# Every action name ComputerTool.__call__ knows how to handle.
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "screenshot",
    "cursor_position",
    # minecraft: hold / release style actions for the mouse button and arrow keys
    "hold_down_left_button",
    "release_left_button",
    "hold_down_arrow_up",
    "release_arrow_up",
    "hold_down_arrow_down",
    "release_arrow_down",
    "hold_down_arrow_left",
    "release_arrow_left",
    "hold_down_arrow_right",
    "release_arrow_right",
]
 44 | 
 45 | 
class Resolution(TypedDict):
    """Width and height of a screen resolution (see MAX_SCALING_TARGETS)."""

    width: int
    height: int
 49 | 
 50 | 
# sizes above XGA/WXGA are not recommended (see README.md)
# scale down to one of these targets if ComputerTool._scaling_enabled is set;
# the target is picked by matching the screen's aspect ratio
# (see ComputerTool.scale_coordinates)
MAX_SCALING_TARGETS: dict[str, Resolution] = {
    "XGA": Resolution(width=1024, height=768),  # 4:3
    "WXGA": Resolution(width=1280, height=800),  # 16:10
    "FWXGA": Resolution(width=1366, height=768),  # ~16:9
}
 58 | 
 59 | 
class ScalingSource(StrEnum):
    """Origin of the coordinates handed to ComputerTool.scale_coordinates.

    COMPUTER coordinates are scaled down to the target resolution; API
    coordinates are bounds-checked and scaled up.
    """

    COMPUTER = "computer"
    API = "api"
 63 | 
 64 | 
class ComputerToolOptions(TypedDict):
    """Display settings returned by ComputerTool.options and merged into to_params()."""

    display_height_px: int
    display_width_px: int
    display_number: int | None
 69 | 
 70 | 
 71 | def chunks(s: str, chunk_size: int) -> list[str]:
 72 |     return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]
 73 | 
 74 | 
 75 | class ComputerTool(BaseAnthropicTool):
 76 |     """
 77 |     A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
 78 |     The tool parameters are defined by Anthropic and are not editable.
 79 |     """
 80 | 
 81 |     name: Literal["computer"] = "computer"
 82 |     api_type: Literal["computer_20241022"] = "computer_20241022"
 83 |     width: int
 84 |     height: int
 85 |     display_num: int | None
 86 | 
 87 |     _screenshot_delay = 2.0
 88 |     _scaling_enabled = True
 89 | 
 90 |     @property
 91 |     def options(self) -> ComputerToolOptions:
 92 |         width, height = self.scale_coordinates(
 93 |             ScalingSource.COMPUTER, self.width, self.height
 94 |         )
 95 |         return {
 96 |             "display_width_px": width,
 97 |             "display_height_px": height,
 98 |             "display_number": self.display_num,
 99 |         }
100 | 
101 |     def to_params(self) -> BetaToolComputerUse20241022Param:
102 |         return {"name": self.name, "type": self.api_type, **self.options}
103 | 
104 |     def __init__(self):
105 |         super().__init__()
106 | 
107 |         self.width = int(os.getenv("WIDTH") or 0)
108 |         self.height = int(os.getenv("HEIGHT") or 0)
109 |         assert self.width and self.height, "WIDTH, HEIGHT must be set"
110 |         if (display_num := os.getenv("DISPLAY_NUM")) is not None:
111 |             self.display_num = int(display_num)
112 |             self._display_prefix = f"DISPLAY=:{self.display_num} "
113 |         else:
114 |             self.display_num = None
115 |             self._display_prefix = ""
116 | 
117 |         self.xdotool = f"{self._display_prefix}cliclick"
118 | 
119 |     async def __call__(
120 |         self,
121 |         *,
122 |         action: Action,
123 |         text: str | None = None,
124 |         coordinate: tuple[int, int] | None = None,
125 |         **kwargs,
126 |     ):
127 |         if action in ("mouse_move", "left_click_drag"):
128 |             if coordinate is None:
129 |                 raise ToolError(f"coordinate is required for {action}")
130 |             if text is not None:
131 |                 raise ToolError(f"text is not accepted for {action}")
132 |             if not isinstance(coordinate, list) or len(coordinate) != 2:
133 |                 raise ToolError(f"{coordinate} must be a tuple of length 2")
134 |             if not all(isinstance(i, int) and i >= 0 for i in coordinate):
135 |                 raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
136 | 
137 |             x, y = self.scale_coordinates(
138 |                 ScalingSource.API, coordinate[0], coordinate[1]
139 |             )
140 | 
141 |             if action == "mouse_move":
142 |                 return await self.shell(f"{self.xdotool} m:{x},{y}")
143 |             elif action == "left_click_drag":
144 |                 return await self.shell(f"{self.xdotool} dd:. dm:{x},{y} du:{x},{y}")
145 | 
146 |         if action in ("key", "type"):
147 |             if text is None:
148 |                 raise ToolError(f"text is required for {action}")
149 |             if coordinate is not None:
150 |                 raise ToolError(f"coordinate is not accepted for {action}")
151 |             if not isinstance(text, str):
152 |                 raise ToolError(output=f"{text} must be a string")
153 | 
154 |             if action == "key":
155 |                 return await self.shell(f"{self.xdotool} kp:{text}")
156 |             elif action == "type":
157 |                 results: list[ToolResult] = []
158 |                 for chunk in chunks(text, TYPING_GROUP_SIZE):
159 |                     cmd = f"{self.xdotool} t:{shlex.quote(chunk)} -w {TYPING_DELAY_MS}"
160 |                     results.append(await self.shell(cmd, take_screenshot=False))
161 |                 screenshot_base64 = (await self.screenshot()).base64_image
162 |                 return ToolResult(
163 |                     output="".join(result.output or "" for result in results),
164 |                     error="".join(result.error or "" for result in results),
165 |                     base64_image=screenshot_base64,
166 |                 )
167 | 
168 |         if action in (
169 |             "left_click",
170 |             "right_click",
171 |             "double_click",
172 |             "middle_click",
173 |             "screenshot",
174 |             "cursor_position",
175 |         ):
176 |             if text is not None:
177 |                 raise ToolError(f"text is not accepted for {action}")
178 |             if coordinate is not None:
179 |                 raise ToolError(f"coordinate is not accepted for {action}")
180 | 
181 |             if action == "screenshot":
182 |                 return await self.screenshot()
183 |             elif action == "cursor_position":
184 |                 result = await self.shell(
185 |                     f"{self.xdotool} p",
186 |                     take_screenshot=False,
187 |                 )
188 |                 output = result.output or ""
189 |                 x, y = self.scale_coordinates(
190 |                     ScalingSource.COMPUTER,
191 |                     int(output.split(",")[0]),
192 |                     int(output.split(",")[1]),
193 |                 )
194 |                 return result.replace(output=f"X={x},Y={y}")
195 |             else:
196 |                 click_arg = {
197 |                     "left_click": "c:.",
198 |                     "right_click": "rc:.",
199 |                     "middle_click": "mc:.",
200 |                     "double_click": "dc:.",
201 |                 }[action]
202 |                 return await self.shell(f"{self.xdotool} {click_arg}")
203 | 
204 |         # minecraft
205 |         if action == "hold_down_left_button":
206 |             return await self.shell(f"{self.xdotool} m:d")
207 |         elif action == "release_left_button":
208 |             return await self.shell(f"{self.xdotool} m:u")
209 |         elif action == "hold_down_arrow_up":
210 |             return await self.shell(f"{self.xdotool} kp:up")
211 |         elif action == "release_arrow_up":
212 |             return await self.shell(f"{self.xdotool} ku:up")
213 |         elif action == "hold_down_arrow_down":
214 |             return await self.shell(f"{self.xdotool} kp:down")
215 |         elif action == "release_arrow_down":
216 |             return await self.shell(f"{self.xdotool} ku:down")
217 |         elif action == "hold_down_arrow_left":
218 |             return await self.shell(f"{self.xdotool} kp:left")
219 |         elif action == "release_arrow_left":
220 |             return await self.shell(f"{self.xdotool} ku:left")
221 |         elif action == "hold_down_arrow_right":
222 |             return await self.shell(f"{self.xdotool} kp:right")
223 |         elif action == "release_arrow_right":
224 |             return await self.shell(f"{self.xdotool} ku:right")
225 | 
226 |         raise ToolError(f"Invalid action: {action}")
227 | 
228 |     async def screenshot(self):
229 |         """Take a screenshot of the current screen and return the base64 encoded image."""
230 |         output_dir = Path(OUTPUT_DIR)
231 |         output_dir.mkdir(parents=True, exist_ok=True)
232 |         path = output_dir / f"screenshot_{uuid4().hex}.png"
233 | 
234 |         screenshot_cmd = f"{self._display_prefix}screencapture -f {path} -p"
235 | 
236 |         result = await self.shell(screenshot_cmd, take_screenshot=False)
237 |         if self._scaling_enabled:
238 |             x, y = self.scale_coordinates(
239 |                 ScalingSource.COMPUTER, self.width, self.height
240 |             )
241 |             await self.shell(
242 |                 f"convert {path} -resize {x}x{y}! {path}", take_screenshot=False
243 |             )
244 | 
245 |         if path.exists():
246 |             return result.replace(
247 |                 base64_image=base64.b64encode(path.read_bytes()).decode()
248 |             )
249 |         raise ToolError(f"Failed to take screenshot: {result.error}")
250 | 
251 |     async def shell(self, command: str, take_screenshot=True) -> ToolResult:
252 |         """Run a shell command and return the output, error, and optionally a screenshot."""
253 |         _, stdout, stderr = await run(command)
254 |         base64_image = None
255 | 
256 |         if take_screenshot:
257 |             # delay to let things settle before taking a screenshot
258 |             await asyncio.sleep(self._screenshot_delay)
259 |             base64_image = (await self.screenshot()).base64_image
260 | 
261 |         return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
262 | 
263 |     def scale_coordinates(self, source: ScalingSource, x: int, y: int):
264 |         """Scale coordinates to a target maximum resolution."""
265 |         if not self._scaling_enabled:
266 |             return x, y
267 |         ratio = self.width / self.height
268 |         target_dimension = None
269 |         for dimension in MAX_SCALING_TARGETS.values():
270 |             # allow some error in the aspect ratio - not ratios are exactly 16:9
271 |             if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
272 |                 if dimension["width"] < self.width:
273 |                     target_dimension = dimension
274 |                 break
275 |         if target_dimension is None:
276 |             return x, y
277 |         # should be less than 1
278 |         x_scaling_factor = target_dimension["width"] / self.width
279 |         y_scaling_factor = target_dimension["height"] / self.height
280 |         if source == ScalingSource.API:
281 |             if x > self.width or y > self.height:
282 |                 raise ToolError(f"Coordinates {x}, {y} are out of bounds")
283 |             # scale up
284 |             return round(x / x_scaling_factor), round(y / y_scaling_factor)
285 |         # scale down
286 |         return round(x * x_scaling_factor), round(y * y_scaling_factor)
287 | 


--------------------------------------------------------------------------------
/computer_use_demo/tools/edit.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | from pathlib import Path
  3 | from typing import Literal, get_args
  4 | 
  5 | from anthropic.types.beta import BetaToolTextEditor20241022Param
  6 | 
  7 | from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult
  8 | from .run import maybe_truncate, run
  9 | 
# Command names accepted by EditTool.__call__ through its `command` argument.
Command = Literal[
    "view",
    "create",
    "str_replace",
    "insert",
    "undo_edit",
]
# NOTE(review): presumably the number of context lines shown around an edit;
# its use is not visible in this part of the file - confirm.
SNIPPET_LINES: int = 4
 18 | 
 19 | 
 20 | class EditTool(BaseAnthropicTool):
 21 |     """
 22 |     An filesystem editor tool that allows the agent to view, create, and edit files.
 23 |     The tool parameters are defined by Anthropic and are not editable.
 24 |     """
 25 | 
 26 |     api_type: Literal["text_editor_20241022"] = "text_editor_20241022"
 27 |     name: Literal["str_replace_editor"] = "str_replace_editor"
 28 | 
 29 |     _file_history: dict[Path, list[str]]
 30 | 
 31 |     def __init__(self):
 32 |         self._file_history = defaultdict(list)
 33 |         super().__init__()
 34 | 
 35 |     def to_params(self) -> BetaToolTextEditor20241022Param:
 36 |         return {
 37 |             "name": self.name,
 38 |             "type": self.api_type,
 39 |         }
 40 | 
 41 |     async def __call__(
 42 |         self,
 43 |         *,
 44 |         command: Command,
 45 |         path: str,
 46 |         file_text: str | None = None,
 47 |         view_range: list[int] | None = None,
 48 |         old_str: str | None = None,
 49 |         new_str: str | None = None,
 50 |         insert_line: int | None = None,
 51 |         **kwargs,
 52 |     ):
 53 |         _path = Path(path)
 54 |         self.validate_path(command, _path)
 55 |         if command == "view":
 56 |             return await self.view(_path, view_range)
 57 |         elif command == "create":
 58 |             if not file_text:
 59 |                 raise ToolError("Parameter `file_text` is required for command: create")
 60 |             self.write_file(_path, file_text)
 61 |             self._file_history[_path].append(file_text)
 62 |             return ToolResult(output=f"File created successfully at: {_path}")
 63 |         elif command == "str_replace":
 64 |             if not old_str:
 65 |                 raise ToolError(
 66 |                     "Parameter `old_str` is required for command: str_replace"
 67 |                 )
 68 |             return self.str_replace(_path, old_str, new_str)
 69 |         elif command == "insert":
 70 |             if insert_line is None:
 71 |                 raise ToolError(
 72 |                     "Parameter `insert_line` is required for command: insert"
 73 |                 )
 74 |             if not new_str:
 75 |                 raise ToolError("Parameter `new_str` is required for command: insert")
 76 |             return self.insert(_path, insert_line, new_str)
 77 |         elif command == "undo_edit":
 78 |             return self.undo_edit(_path)
 79 |         raise ToolError(
 80 |             f'Unrecognized command {command}. The allowed commands for the {self.name} tool are: {", ".join(get_args(Command))}'
 81 |         )
 82 | 
 83 |     def validate_path(self, command: str, path: Path):
 84 |         """
 85 |         Check that the path/command combination is valid.
 86 |         """
 87 |         # Check if its an absolute path
 88 |         if not path.is_absolute():
 89 |             suggested_path = Path("") / path
 90 |             raise ToolError(
 91 |                 f"The path {path} is not an absolute path, it should start with `/`. Maybe you meant {suggested_path}?"
 92 |             )
 93 |         # Check if path exists
 94 |         if not path.exists() and command != "create":
 95 |             raise ToolError(
 96 |                 f"The path {path} does not exist. Please provide a valid path."
 97 |             )
 98 |         if path.exists() and command == "create":
 99 |             raise ToolError(
100 |                 f"File already exists at: {path}. Cannot overwrite files using command `create`."
101 |             )
102 |         # Check if the path points to a directory
103 |         if path.is_dir():
104 |             if command != "view":
105 |                 raise ToolError(
106 |                     f"The path {path} is a directory and only the `view` command can be used on directories"
107 |                 )
108 | 
109 |     async def view(self, path: Path, view_range: list[int] | None = None):
110 |         """Implement the view command"""
111 |         if path.is_dir():
112 |             if view_range:
113 |                 raise ToolError(
114 |                     "The `view_range` parameter is not allowed when `path` points to a directory."
115 |                 )
116 | 
117 |             _, stdout, stderr = await run(
118 |                 rf"find {path} -maxdepth 2 -not -path '*/\.*'"
119 |             )
120 |             if not stderr:
121 |                 stdout = f"Here's the files and directories up to 2 levels deep in {path}, excluding hidden items:\n{stdout}\n"
122 |             return CLIResult(output=stdout, error=stderr)
123 | 
124 |         file_content = self.read_file(path)
125 |         init_line = 1
126 |         if view_range:
127 |             if len(view_range) != 2 or not all(isinstance(i, int) for i in view_range):
128 |                 raise ToolError(
129 |                     "Invalid `view_range`. It should be a list of two integers."
130 |                 )
131 |             file_lines = file_content.split("\n")
132 |             n_lines_file = len(file_lines)
133 |             init_line, final_line = view_range
134 |             if init_line < 1 or init_line > n_lines_file:
135 |                 raise ToolError(
136 |                     f"Invalid `view_range`: {view_range}. It's first element `{init_line}` should be within the range of lines of the file: {[1, n_lines_file]}"
137 |                 )
138 |             if final_line > n_lines_file:
139 |                 raise ToolError(
140 |                     f"Invalid `view_range`: {view_range}. It's second element `{final_line}` should be smaller than the number of lines in the file: `{n_lines_file}`"
141 |                 )
142 |             if final_line != -1 and final_line < init_line:
143 |                 raise ToolError(
144 |                     f"Invalid `view_range`: {view_range}. It's second element `{final_line}` should be larger or equal than its first `{init_line}`"
145 |                 )
146 | 
147 |             if final_line == -1:
148 |                 file_content = "\n".join(file_lines[init_line - 1 :])
149 |             else:
150 |                 file_content = "\n".join(file_lines[init_line - 1 : final_line])
151 | 
152 |         return CLIResult(
153 |             output=self._make_output(file_content, str(path), init_line=init_line)
154 |         )
155 | 
156 |     def str_replace(self, path: Path, old_str: str, new_str: str | None):
157 |         """Implement the str_replace command, which replaces old_str with new_str in the file content"""
158 |         # Read the file content
159 |         file_content = self.read_file(path).expandtabs()
160 |         old_str = old_str.expandtabs()
161 |         new_str = new_str.expandtabs() if new_str is not None else ""
162 | 
163 |         # Check if old_str is unique in the file
164 |         occurrences = file_content.count(old_str)
165 |         if occurrences == 0:
166 |             raise ToolError(
167 |                 f"No replacement was performed, old_str `{old_str}` did not appear verbatim in {path}."
168 |             )
169 |         elif occurrences > 1:
170 |             file_content_lines = file_content.split("\n")
171 |             lines = [
172 |                 idx + 1
173 |                 for idx, line in enumerate(file_content_lines)
174 |                 if old_str in line
175 |             ]
176 |             raise ToolError(
177 |                 f"No replacement was performed. Multiple occurrences of old_str `{old_str}` in lines {lines}. Please ensure it is unique"
178 |             )
179 | 
180 |         # Replace old_str with new_str
181 |         new_file_content = file_content.replace(old_str, new_str)
182 | 
183 |         # Write the new content to the file
184 |         self.write_file(path, new_file_content)
185 | 
186 |         # Save the content to history
187 |         self._file_history[path].append(file_content)
188 | 
189 |         # Create a snippet of the edited section
190 |         replacement_line = file_content.split(old_str)[0].count("\n")
191 |         start_line = max(0, replacement_line - SNIPPET_LINES)
192 |         end_line = replacement_line + SNIPPET_LINES + new_str.count("\n")
193 |         snippet = "\n".join(new_file_content.split("\n")[start_line : end_line + 1])
194 | 
195 |         # Prepare the success message
196 |         success_msg = f"The file {path} has been edited. "
197 |         success_msg += self._make_output(
198 |             snippet, f"a snippet of {path}", start_line + 1
199 |         )
200 |         success_msg += "Review the changes and make sure they are as expected. Edit the file again if necessary."
201 | 
202 |         return CLIResult(output=success_msg)
203 | 
204 |     def insert(self, path: Path, insert_line: int, new_str: str):
205 |         """Implement the insert command, which inserts new_str at the specified line in the file content."""
206 |         file_text = self.read_file(path).expandtabs()
207 |         new_str = new_str.expandtabs()
208 |         file_text_lines = file_text.split("\n")
209 |         n_lines_file = len(file_text_lines)
210 | 
211 |         if insert_line < 0 or insert_line > n_lines_file:
212 |             raise ToolError(
213 |                 f"Invalid `insert_line` parameter: {insert_line}. It should be within the range of lines of the file: {[0, n_lines_file]}"
214 |             )
215 | 
216 |         new_str_lines = new_str.split("\n")
217 |         new_file_text_lines = (
218 |             file_text_lines[:insert_line]
219 |             + new_str_lines
220 |             + file_text_lines[insert_line:]
221 |         )
222 |         snippet_lines = (
223 |             file_text_lines[max(0, insert_line - SNIPPET_LINES) : insert_line]
224 |             + new_str_lines
225 |             + file_text_lines[insert_line : insert_line + SNIPPET_LINES]
226 |         )
227 | 
228 |         new_file_text = "\n".join(new_file_text_lines)
229 |         snippet = "\n".join(snippet_lines)
230 | 
231 |         self.write_file(path, new_file_text)
232 |         self._file_history[path].append(file_text)
233 | 
234 |         success_msg = f"The file {path} has been edited. "
235 |         success_msg += self._make_output(
236 |             snippet,
237 |             "a snippet of the edited file",
238 |             max(1, insert_line - SNIPPET_LINES + 1),
239 |         )
240 |         success_msg += "Review the changes and make sure they are as expected (correct indentation, no duplicate lines, etc). Edit the file again if necessary."
241 |         return CLIResult(output=success_msg)
242 | 
243 |     def undo_edit(self, path: Path):
244 |         """Implement the undo_edit command."""
245 |         if not self._file_history[path]:
246 |             raise ToolError(f"No edit history found for {path}.")
247 | 
248 |         old_text = self._file_history[path].pop()
249 |         self.write_file(path, old_text)
250 | 
251 |         return CLIResult(
252 |             output=f"Last edit to {path} undone successfully. {self._make_output(old_text, str(path))}"
253 |         )
254 | 
255 |     def read_file(self, path: Path):
256 |         """Read the content of a file from a given path; raise a ToolError if an error occurs."""
257 |         try:
258 |             return path.read_text()
259 |         except Exception as e:
260 |             raise ToolError(f"Ran into {e} while trying to read {path}") from None
261 | 
262 |     def write_file(self, path: Path, file: str):
263 |         """Write the content of a file to a given path; raise a ToolError if an error occurs."""
264 |         try:
265 |             path.write_text(file)
266 |         except Exception as e:
267 |             raise ToolError(f"Ran into {e} while trying to write to {path}") from None
268 | 
269 |     def _make_output(
270 |         self,
271 |         file_content: str,
272 |         file_descriptor: str,
273 |         init_line: int = 1,
274 |         expand_tabs: bool = True,
275 |     ):
276 |         """Generate output for the CLI based on the content of a file."""
277 |         file_content = maybe_truncate(file_content)
278 |         if expand_tabs:
279 |             file_content = file_content.expandtabs()
280 |         file_content = "\n".join(
281 |             [
282 |                 f"{i + init_line:6}\t{line}"
283 |                 for i, line in enumerate(file_content.split("\n"))
284 |             ]
285 |         )
286 |         return (
287 |             f"Here's the result of running `cat -n` on {file_descriptor}:\n"
288 |             + file_content
289 |             + "\n"
290 |         )
291 | 


--------------------------------------------------------------------------------
/computer_use_demo/tools/game.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import base64
  3 | import os
  4 | import pyautogui
  5 | from enum import StrEnum
  6 | from pathlib import Path
  7 | from typing import Literal, TypedDict
  8 | from uuid import uuid4
  9 | 
 10 | from anthropic.types.beta import BetaToolComputerUse20241022Param
 11 | 
 12 | from .base import BaseAnthropicTool, ToolError, ToolResult
 13 | from .run import run
 14 | 
# Directory that GameTool.screenshot() writes its PNG files into.
OUTPUT_DIR = "/tmp/outputs"

# Per-character delay (milliseconds) used when typing text; converted to seconds
# at the call site.
TYPING_DELAY_MS = 12
# Text to type is split into chunks of this many characters.
TYPING_GROUP_SIZE = 50

# Every value accepted for the tool's `action` parameter.
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "screenshot",
    "cursor_position",
    # minecraft
    "left_down",
    "left_up",
    "hold_arrow_up",
    "release_arrow_up",
    "hold_arrow_down",
    "release_arrow_down",
    "hold_arrow_left",
    "release_arrow_left",
    "hold_arrow_right",
    "release_arrow_right",
]
 43 | 
 44 | 
class Resolution(TypedDict):
    """A screen size in pixels."""

    width: int
    height: int
 48 | 
 49 | 
# sizes above XGA/WXGA are not recommended (see README.md)
# scale down to one of these targets if GameTool._scaling_enabled is set;
# the target is chosen by matching the screen's aspect ratio (see scale_coordinates)
MAX_SCALING_TARGETS: dict[str, Resolution] = {
    "XGA": Resolution(width=1024, height=768),  # 4:3
    "WXGA": Resolution(width=1280, height=800),  # 16:10
    "FWXGA": Resolution(width=1366, height=768),  # ~16:9
}
 57 | 
 58 | 
class ScalingSource(StrEnum):
    """Which coordinate space a value passed to scale_coordinates comes from."""

    # Real-screen coordinates; scale_coordinates scales these down.
    COMPUTER = "computer"
    # Coordinates received from the API; scale_coordinates scales these up.
    API = "api"
 62 | 
 63 | 
class ComputerToolOptions(TypedDict):
    """Display options merged into the tool declaration by GameTool.to_params()."""

    display_height_px: int
    display_width_px: int
    display_number: int | None
 68 | 
# <MINECRAFT_GAME_INSTRUCTIONS>
# You may be asked to play a game of minecraft.  Here are some instructions:
# * For movement, WASD is disabled.  Instead, use the hold_arrow_left, release_arrow_left, hold_arrow_right, release_arrow_right, hold_arrow_up, release_arrow_up, hold_arrow_down, release_arrow_down tool calls.
# * You can control the minecraft game with your computer tool.  Use the left_down tool call to hold down the left mouse button, and the left_up tool call to release it.
# * Use the hold_arrow_up tool call to hold down the up arrow key, and the release_arrow_up tool call to release it.
# * Use the hold_arrow_down tool call to hold down the down arrow key, and the release_arrow_down tool call to release it.
# * Use the hold_arrow_left tool call to hold down the left arrow key, and the release_arrow_left tool call to release it.
# * Use the hold_arrow_right tool call to hold down the right arrow key, and the release_arrow_right tool call to release it.
# </MINECRAFT_GAME_INSTRUCTIONS>
 78 | def chunks(s: str, chunk_size: int) -> list[str]:
 79 |     return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]
 80 | 
 81 | class GameTool(BaseAnthropicTool):
 82 |     """
 83 |     A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer. Also allows the agent to control a minecraft game.
 84 |     The tool parameters are defined by Anthropic and are not editable.
 85 |     """
 86 | 
 87 |     name: Literal["computer"] = "computer"
 88 |     api_type: Literal["computer_20241022"] = "computer_20241022"
 89 |     width: int
 90 |     height: int
 91 |     description: str = """
 92 |     Use a mouse and keyboard to interact with a computer, and take screenshots.
 93 |     * This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.
 94 |     * Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try taking another screenshot.
 95 |     * The screen's resolution is {{ display_width_px }}x{{ display_height_px }}.
 96 |     * The display number is {{ display_number }}
 97 |     * Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
 98 |     * If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.
 99 |     * Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.
100 |     In Minecraft, the standard controls are:
101 |     space: Jump
102 |     mouse_move: Look around
103 |     left_down: Break blocks/attack
104 |     right click: Place blocks/interact
105 |     For movement in minecraft, WASD is disabled.  Instead, use the hold_arrow_left, release_arrow_left, hold_arrow_right, release_arrow_right, hold_arrow_up, release_arrow_up, hold_arrow_down, release_arrow_down tool calls.
106 |     """
107 |     input_schema = {
108 |     "properties": {
109 |         "action": {
110 |             "description": """The action to perform. The available actions are:
111 |                 * `key`: Press a key or key-combination on the keyboard.
112 |                   - This supports cliclick's `key` syntax.
113 |                   - All possible keys are: arrow-up, brightness-down, brightness-up, delete, end, enter, esc, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, fwd-delete, home, keys-light-down, keys-light-toggle, keys-light-up, mute, num-0, num-1, num-2, num-3, num-4, num-5, num-6, num-7, num-8, num-9, num-clear, num-divide, num-enter, num-equals, num-minus, num-multiply, num-plus, page-down, page-up, play-next, play-pause, play-previous, return, space, tab, volume-down, volume-up
114 |                 * `type`: Type a string of text on the keyboard.
115 |                 * `cursor_position`: Get the current (x, y) pixel coordinate of the cursor on the screen.
116 |                 * `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
117 |                 * `left_click`: Click the left mouse button.
118 |                 * `right_click`: Click the right mouse button.
119 |                 * `middle_click`: Click the middle mouse button.
120 |                 * `double_click`: Double-click the left mouse button.
121 |                 * `screenshot`: Take a screenshot of the screen.
122 |             """,
123 |             "enum": [
124 |                 "key",
125 |                 "type",
126 |                 "mouse_move",
127 |                 "left_click",
128 |                 # "left_click_drag",
129 |                 "right_click",
130 |                 "middle_click",
131 |                 "double_click",
132 |                 "screenshot",
133 |                 "cursor_position",
134 |                 # minecraft
135 |                 "left_down",
136 |                 "left_up",
137 |                 "hold_arrow_up",
138 |                 "release_arrow_up",
139 |                 "hold_arrow_down",
140 |                 "release_arrow_down",
141 |                 "hold_arrow_left",
142 |                 "release_arrow_left",
143 |                 "hold_arrow_right",
144 |                 "release_arrow_right",
145 |             ],
146 |             "type": "string",
147 |         },
148 |         "coordinate": {
149 |             "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=mouse_move` and `action=left_click_drag`.",
150 |             "type": "array",
151 |         },
152 |         "text": {
153 |             "description": "Required only by `action=type` and `action=key`.",
154 |             "type": "string",
155 |         },
156 |     },
157 |     "required": ["action"],
158 |     "type": "object",
159 |     }
160 | 
161 |     display_num: int | None
162 | 
163 |     _screenshot_delay = 2.0
164 |     _scaling_enabled = True
165 | 
166 |     @property
167 |     def options(self) -> ComputerToolOptions:
168 |         width, height = self.scale_coordinates(
169 |             ScalingSource.COMPUTER, self.width, self.height
170 |         )
171 |         return {
172 |             "display_width_px": width,
173 |             "display_height_px": height,
174 |             "display_number": self.display_num,
175 |         }
176 | 
177 |     def to_params(self) -> BetaToolComputerUse20241022Param:
178 |         return {"name": self.name, "type": self.api_type, **self.options}
179 | 
180 |     def __init__(self):
181 |         super().__init__()
182 | 
183 |         self.width = int(os.getenv("WIDTH") or 0)
184 |         self.height = int(os.getenv("HEIGHT") or 0)
185 |         assert self.width and self.height, "WIDTH, HEIGHT must be set"
186 |         if (display_num := os.getenv("DISPLAY_NUM")) is not None:
187 |             self.display_num = int(display_num)
188 |             self._display_prefix = f"DISPLAY=:{self.display_num} "
189 |         else:
190 |             self.display_num = None
191 |             self._display_prefix = ""
192 | 
193 |     async def __call__(
194 |         self,
195 |         *,
196 |         action: Action,
197 |         text: str | None = None,
198 |         coordinate: tuple[int, int] | None = None,
199 |         **kwargs,
200 |     ):
201 |         if action in ("mouse_move", "left_click_drag"):
202 |             if coordinate is None:
203 |                 raise ToolError(f"coordinate is required for {action}")
204 |             if text is not None:
205 |                 raise ToolError(f"text is not accepted for {action}")
206 |             if not isinstance(coordinate, list) or len(coordinate) != 2:
207 |                 raise ToolError(f"{coordinate} must be a tuple of length 2")
208 |             if not all(isinstance(i, int) and i >= 0 for i in coordinate):
209 |                 raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
210 | 
211 |             x, y = self.scale_coordinates(
212 |                 ScalingSource.API, coordinate[0], coordinate[1]
213 |             )
214 | 
215 |             if action == "mouse_move":
216 |                 pyautogui.moveTo(x, y)
217 |                 return ToolResult(output=f"Moved mouse to {x},{y}")
218 |             elif action == "left_click_drag":
219 |                 pyautogui.dragTo(x, y)
220 |                 return ToolResult(output=f"Dragged mouse to {x},{y}")
221 | 
222 |         if action in ("key", "type"):
223 |             if text is None:
224 |                 raise ToolError(f"text is required for {action}")
225 |             if coordinate is not None:
226 |                 raise ToolError(f"coordinate is not accepted for {action}")
227 |             if not isinstance(text, str):
228 |                 raise ToolError(output=f"{text} must be a string")
229 | 
230 |             if action == "key":
231 |                 if text.lower() in 'wasd':
232 |                     pyautogui.keyDown(text.lower())
233 |                     pyautogui.sleep(1)
234 |                     pyautogui.keyUp(text.lower())
235 |                 elif text.lower() in 'abcdefghijklmnopqrstuvwxyz':
236 |                     pyautogui.press(text.lower())
237 |                 elif text in "1234567890":
238 |                     pyautogui.press(text)
239 |                 elif text.lower() == "return":
240 |                     pyautogui.press('enter')
241 |                 elif text.lower() in ("right-arrow", "right", "left-arrow", "left", "up-arrow", "up", "down-arrow", "down"):
242 |                     pyautogui.press(text.split('-')[0].lower())
243 |                 else:
244 |                     pyautogui.press(text.lower())
245 |                 return ToolResult(output=f"Pressed key: {text}")
246 |             elif action == "type":
247 |                 for chunk in chunks(text, TYPING_GROUP_SIZE):
248 |                     pyautogui.write(chunk, interval=TYPING_DELAY_MS/1000)
249 |                 screenshot_base64 = (await self.screenshot()).base64_image
250 |                 return ToolResult(
251 |                     output=f"Typed: {text}",
252 |                     base64_image=screenshot_base64,
253 |                 )
254 | 
255 |         if action in (
256 |             "left_click",
257 |             "right_click",
258 |             "double_click",
259 |             "middle_click",
260 |             "screenshot",
261 |             "cursor_position",
262 |         ):
263 |             if text is not None:
264 |                 raise ToolError(f"text is not accepted for {action}")
265 |             if coordinate is not None:
266 |                 raise ToolError(f"coordinate is not accepted for {action}")
267 | 
268 |             if action == "screenshot":
269 |                 return await self.screenshot()
270 |             elif action == "cursor_position":
271 |                 x, y = pyautogui.position()
272 |                 x, y = self.scale_coordinates(
273 |                     ScalingSource.COMPUTER, x, y
274 |                 )
275 |                 return ToolResult(output=f"X={x},Y={y}")
276 |             else:
277 |                 click_map = {
278 |                     "left_click": pyautogui.click,
279 |                     "right_click": pyautogui.rightClick,
280 |                     "middle_click": pyautogui.middleClick,
281 |                     "double_click": pyautogui.doubleClick,
282 |                 }
283 |                 click_map[action]()
284 |                 return ToolResult(output=f"Performed {action}")
285 | 
286 |         # minecraft
287 |         if action == "left_down":
288 |             pyautogui.mouseDown()
289 |         elif action == "left_up":
290 |             pyautogui.mouseUp()
291 |         elif action.startswith("hold_arrow_"):
292 |             pyautogui.keyDown(action.split('_')[-1])
293 |         elif action.startswith("release_arrow_"):
294 |             pyautogui.keyUp(action.split('_')[-1])
295 |         else:
296 |             raise ToolError(f"Invalid action: {action}")
297 | 
298 |         return ToolResult(output=f"Performed {action}")
299 | 
300 |     async def screenshot(self):
301 |         """Take a screenshot of the current screen and return the base64 encoded image."""
302 |         output_dir = Path(OUTPUT_DIR)
303 |         output_dir.mkdir(parents=True, exist_ok=True)
304 |         path = output_dir / f"screenshot_{uuid4().hex}.png"
305 | 
306 |         screenshot_cmd = f"{self._display_prefix}screencapture -C {path} -p"
307 | 
308 |         # resize the screenshot to default width and height
309 |         await self.shell(f"convert {path} -resize {self.width}x{self.height}! {path}", take_screenshot=False)
310 | 
311 |         result = await self.shell(screenshot_cmd, take_screenshot=False)
312 |         if self._scaling_enabled:
313 |             x, y = self.scale_coordinates(
314 |                 ScalingSource.COMPUTER, self.width, self.height
315 |             )
316 |             await self.shell(
317 |                 f"convert {path} -resize {x}x{y}! {path}", take_screenshot=False
318 |             )
319 | 
320 |         if path.exists():
321 |             return result.replace(
322 |                 base64_image=base64.b64encode(path.read_bytes()).decode()
323 |             )
324 |         raise ToolError(f"Failed to take screenshot: {result.error}")
325 | 
326 |     async def shell(self, command: str, take_screenshot=True) -> ToolResult:
327 |         """Run a shell command and return the output, error, and optionally a screenshot."""
328 |         _, stdout, stderr = await run(command)
329 |         base64_image = None
330 | 
331 |         if take_screenshot:
332 |             # delay to let things settle before taking a screenshot
333 |             await asyncio.sleep(self._screenshot_delay)
334 |             base64_image = (await self.screenshot()).base64_image
335 | 
336 |         return ToolResult(output=stdout, error=stderr, base64_image=base64_image)
337 | 
338 |     def scale_coordinates(self, source: ScalingSource, x: int, y: int):
339 |         """Scale coordinates to a target maximum resolution."""
340 |         if not self._scaling_enabled:
341 |             return x, y
342 |         ratio = self.width / self.height
343 |         target_dimension = None
344 |         for dimension in MAX_SCALING_TARGETS.values():
345 |             # allow some error in the aspect ratio - not ratios are exactly 16:9
346 |             if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
347 |                 if dimension["width"] < self.width:
348 |                     target_dimension = dimension
349 |                 break
350 |         if target_dimension is None:
351 |             return x, y
352 |         # should be less than 1
353 |         x_scaling_factor = target_dimension["width"] / self.width
354 |         y_scaling_factor = target_dimension["height"] / self.height
355 |         if source == ScalingSource.API:
356 |             if x > self.width or y > self.height:
357 |                 raise ToolError(f"Coordinates {x}, {y} are out of bounds")
358 |             # scale up
359 |             return round(x / x_scaling_factor), round(y / y_scaling_factor)
360 |         # scale down
361 |         return round(x * x_scaling_factor), round(y * y_scaling_factor)
362 | 
363 | 


--------------------------------------------------------------------------------
/computer_use_demo/tools/run.py:
--------------------------------------------------------------------------------
 1 | """Utility to run shell commands asynchronously with a timeout."""
 2 | 
 3 | import asyncio
 4 | 
 5 | TRUNCATED_MESSAGE: str = "<response clipped><NOTE>To save on context only part of this file has been shown to you. You should retry this tool after you have searched inside the file with `grep -n` in order to find the line numbers of what you are looking for.</NOTE>"
 6 | MAX_RESPONSE_LEN: int = 16000
 7 | 
 8 | 
 9 | def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN):
10 |     """Truncate content and append a notice if content exceeds the specified length."""
11 |     return (
12 |         content
13 |         if not truncate_after or len(content) <= truncate_after
14 |         else content[:truncate_after] + TRUNCATED_MESSAGE
15 |     )
16 | 
17 | 
async def run(
    cmd: str,
    timeout: float | None = 120.0,  # seconds
    truncate_after: int | None = MAX_RESPONSE_LEN,
):
    """
    Run a shell command asynchronously with a timeout.

    Args:
        cmd: Command line handed to the shell.
        timeout: Seconds to wait for completion; `None` waits indefinitely.
        truncate_after: Maximum length of each output stream, forwarded to
            `maybe_truncate`.

    Returns:
        A `(returncode, stdout, stderr)` tuple with both streams decoded and
        truncated. A return code of `None` is reported as `0`.

    Raises:
        TimeoutError: if the command does not finish within `timeout` seconds;
            the process is killed first.
    """
    process = await asyncio.create_subprocess_shell(
        cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )

    try:
        stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
        return (
            process.returncode or 0,
            # Commands may emit bytes that are not valid UTF-8; substitute them
            # rather than failing the whole call with UnicodeDecodeError.
            maybe_truncate(
                stdout.decode(errors="replace"), truncate_after=truncate_after
            ),
            maybe_truncate(
                stderr.decode(errors="replace"), truncate_after=truncate_after
            ),
        )
    except asyncio.TimeoutError as exc:
        try:
            process.kill()
        except ProcessLookupError:
            # The process exited on its own between the timeout and the kill.
            pass
        # Reap the child so it does not linger as a zombie.
        await process.wait()
        raise TimeoutError(
            f"Command '{cmd}' timed out after {timeout} seconds"
        ) from exc
43 | 


--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | -r computer_use_demo/requirements.txt
2 | ruff==0.6.7
3 | pre-commit==3.8.0
4 | pytest==8.3.3
5 | pytest-asyncio==0.23.6
6 | 


--------------------------------------------------------------------------------
/image/.config/tint2/applications/firefox-custom.desktop:
--------------------------------------------------------------------------------
1 | [Desktop Entry]
2 | Name=Firefox Custom
3 | Comment=Open Firefox with custom URL
4 | Exec=firefox-esr -new-window
5 | Icon=firefox-esr
6 | Terminal=false
7 | Type=Application
8 | Categories=Network;WebBrowser;
9 | 


--------------------------------------------------------------------------------
/image/.config/tint2/applications/gedit.desktop:
--------------------------------------------------------------------------------
1 | [Desktop Entry]
2 | Name=Gedit
3 | Comment=Open gedit
4 | Exec=gedit
5 | Icon=text-editor-symbolic
6 | Terminal=false
7 | Type=Application
8 | Categories=TextEditor;
9 | 


--------------------------------------------------------------------------------
/image/.config/tint2/applications/terminal.desktop:
--------------------------------------------------------------------------------
1 | [Desktop Entry]
2 | Name=Terminal
3 | Comment=Open Terminal
4 | Exec=xterm
5 | Icon=utilities-terminal
6 | Terminal=false
7 | Type=Application
8 | Categories=System;TerminalEmulator;
9 | 


--------------------------------------------------------------------------------
/image/.config/tint2/tint2rc:
--------------------------------------------------------------------------------
  1 | #-------------------------------------
  2 | # Panel
  3 | panel_items = TL
  4 | panel_size = 100% 60
  5 | panel_margin = 0 0
  6 | panel_padding = 2 0 2
  7 | panel_background_id = 1
  8 | wm_menu = 0
  9 | panel_dock = 0
 10 | panel_position = bottom center horizontal
 11 | panel_layer = top
 12 | panel_monitor = all
 13 | panel_shrink = 0
 14 | autohide = 0
 15 | autohide_show_timeout = 0
 16 | autohide_hide_timeout = 0.5
 17 | autohide_height = 2
 18 | strut_policy = follow_size
 19 | panel_window_name = tint2
 20 | disable_transparency = 1
 21 | mouse_effects = 1
 22 | font_shadow = 0
 23 | mouse_hover_icon_asb = 100 0 10
 24 | mouse_pressed_icon_asb = 100 0 0
 25 | scale_relative_to_dpi = 0
 26 | scale_relative_to_screen_height = 0
 27 | 
 28 | #-------------------------------------
 29 | # Taskbar
 30 | taskbar_mode = single_desktop
 31 | taskbar_hide_if_empty = 0
 32 | taskbar_padding = 0 0 2
 33 | taskbar_background_id = 0
 34 | taskbar_active_background_id = 0
 35 | taskbar_name = 1
 36 | taskbar_hide_inactive_tasks = 0
 37 | taskbar_hide_different_monitor = 0
 38 | taskbar_hide_different_desktop = 0
 39 | taskbar_always_show_all_desktop_tasks = 0
 40 | taskbar_name_padding = 4 2
 41 | taskbar_name_background_id = 0
 42 | taskbar_name_active_background_id = 0
 43 | taskbar_name_font_color = #e3e3e3 100
 44 | taskbar_name_active_font_color = #ffffff 100
 45 | taskbar_distribute_size = 0
 46 | taskbar_sort_order = none
 47 | task_align = left
 48 | 
 49 | #-------------------------------------
 50 | # Launcher
 51 | launcher_padding = 4 8 4
 52 | launcher_background_id = 0
 53 | launcher_icon_background_id = 0
 54 | launcher_icon_size = 48
 55 | launcher_icon_asb = 100 0 0
 56 | launcher_icon_theme_override = 0
 57 | startup_notifications = 1
 58 | launcher_tooltip = 1
 59 | 
 60 | #-------------------------------------
 61 | # Launcher icon
 62 | launcher_item_app = /usr/share/applications/libreoffice-calc.desktop
 63 | launcher_item_app = /home/computeruse/.config/tint2/applications/terminal.desktop
 64 | launcher_item_app = /home/computeruse/.config/tint2/applications/firefox-custom.desktop
 65 | launcher_item_app = /usr/share/applications/xpaint.desktop
 66 | launcher_item_app = /usr/share/applications/xpdf.desktop
 67 | launcher_item_app = /home/computeruse/.config/tint2/applications/gedit.desktop
 68 | launcher_item_app = /usr/share/applications/galculator.desktop
 69 | 
 70 | #-------------------------------------
 71 | # Background definitions
 72 | # ID 1
 73 | rounded = 0
 74 | border_width = 0
 75 | background_color = #000000 60
 76 | border_color = #000000 30
 77 | 
 78 | # ID 2
 79 | rounded = 4
 80 | border_width = 1
 81 | background_color = #777777 20
 82 | border_color = #777777 30
 83 | 
 84 | # ID 3
 85 | rounded = 4
 86 | border_width = 1
 87 | background_color = #777777 20
 88 | border_color = #ffffff 40
 89 | 
 90 | # ID 4
 91 | rounded = 4
 92 | border_width = 1
 93 | background_color = #aa4400 100
 94 | border_color = #aa7733 100
 95 | 
 96 | # ID 5
 97 | rounded = 4
 98 | border_width = 1
 99 | background_color = #aaaa00 100
100 | border_color = #aaaa00 100
101 | 


--------------------------------------------------------------------------------
/image/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [server]
2 | fileWatcherType = "auto"
3 | runOnSave = true
4 | 
5 | [browser]
6 | gatherUsageStats = false
7 | 


--------------------------------------------------------------------------------
/image/entrypoint.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | ./start_all.sh
 5 | ./novnc_startup.sh
 6 | 
 7 | python http_server.py > /tmp/server_logs.txt 2>&1 &
 8 | 
 9 | STREAMLIT_SERVER_PORT=8501 python -m streamlit run computer_use_demo/streamlit.py > /tmp/streamlit_stdout.log &
10 | 
11 | echo "✨ Computer Use Demo is ready!"
12 | echo "➡️  Open http://localhost:8080 in your browser to begin"
13 | 
14 | # Keep the container running
15 | tail -f /dev/null
16 | 


--------------------------------------------------------------------------------
/image/http_server.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import socket
 3 | from http.server import HTTPServer, SimpleHTTPRequestHandler
 4 | 
 5 | 
 6 | class HTTPServerV6(HTTPServer):
 7 |     address_family = socket.AF_INET6
 8 | 
 9 | 
10 | def run_server():
11 |     os.chdir(os.path.dirname(__file__) + "/static_content")
12 |     server_address = ("::", 8080)
13 |     httpd = HTTPServerV6(server_address, SimpleHTTPRequestHandler)
14 |     print("Starting HTTP server on port 8080...")  # noqa: T201
15 |     httpd.serve_forever()
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     run_server()
20 | 


--------------------------------------------------------------------------------
/image/index.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 |     <head>
 4 |         <title>Computer Use Demo</title>
 5 |         <meta name="permissions-policy" content="fullscreen=*" />
 6 |         <style>
 7 |             body {
 8 |                 margin: 0;
 9 |                 padding: 0;
10 |                 overflow: hidden;
11 |             }
12 |             .container {
13 |                 display: flex;
14 |                 height: 100vh;
15 |                 width: 100vw;
16 |             }
17 |             .left {
18 |                 flex: 1;
19 |                 border: none;
20 |                 height: 100vh;
21 |             }
22 |             .right {
23 |                 flex: 2;
24 |                 border: none;
25 |                 height: 100vh;
26 |             }
27 |         </style>
28 |     </head>
29 |     <body>
30 |         <div class="container">
31 |             <iframe
32 |                 src="http://localhost:8501"
33 |                 class="left"
34 |                 allow="fullscreen"
35 |             ></iframe>
36 |             <iframe
37 |                 src="http://localhost:6080/vnc.html?view_only=1&autoconnect=1&resize=scale"
38 |                 class="right"
39 |                 allow="fullscreen"
40 |             ></iframe>
41 |         </div>
42 |     </body>
43 | </html>
44 | 


--------------------------------------------------------------------------------
/image/mutter_startup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Start mutter and wait until xdotool can find a window with class "mutter".
 3 | echo "starting mutter"
 4 | XDG_SESSION_TYPE=x11 mutter --replace --sm-disable 2>/tmp/mutter_stderr.log &
 5 | 
 6 | # Wait for mutter window properties to appear
 7 | timeout=30
 8 | while [ $timeout -gt 0 ]; do
 9 |     if xdotool search --class "mutter" >/dev/null 2>&1; then
10 |         break
11 |     fi
12 |     sleep 1
13 |     ((timeout--))
14 | done
15 | 
16 | # The counter only reaches zero when the window never appeared: report and fail
17 | if [ $timeout -eq 0 ]; then
18 |     echo "mutter stderr output:" >&2
19 |     cat /tmp/mutter_stderr.log >&2
20 |     exit 1
21 | fi
22 | 
23 | # Remove the temporary stderr log file
24 | rm /tmp/mutter_stderr.log
25 | 


--------------------------------------------------------------------------------
/image/novnc_startup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | echo "starting noVNC"
 3 | 
 4 | # Start noVNC with explicit websocket settings
 5 | /opt/noVNC/utils/novnc_proxy \
 6 |     --vnc localhost:5900 \
 7 |     --listen 6080 \
 8 |     --web /opt/noVNC \
 9 |     > /tmp/novnc.log 2>&1 &
10 | 
11 | # Wait for noVNC to start
12 | timeout=10
13 | while [ $timeout -gt 0 ]; do
14 |     if netstat -tuln | grep -q ":6080 "; then
15 |         break
16 |     fi
17 |     sleep 1
18 |     ((timeout--))
19 | done
20 | 
21 | # The counter only reaches zero when port 6080 never started listening
22 | if [ $timeout -eq 0 ]; then
23 |     echo "noVNC failed to start, log output:" >&2
24 |     cat /tmp/novnc.log >&2
25 |     exit 1
26 | fi
27 | 
28 | echo "noVNC started successfully"
29 | 


--------------------------------------------------------------------------------
/image/start_all.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | export DISPLAY=:${DISPLAY_NUM}
 6 | ./xvfb_startup.sh
 7 | ./tint2_startup.sh
 8 | ./mutter_startup.sh
 9 | ./x11vnc_startup.sh
10 | 


--------------------------------------------------------------------------------
/image/static_content/index.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 |     <head>
 4 |         <title>Computer Use Demo</title>
 5 |         <meta name="permissions-policy" content="fullscreen=*" />
 6 |         <style>
 7 |             body {
 8 |                 margin: 0;
 9 |                 padding: 0;
10 |                 overflow: hidden;
11 |             }
12 |             .container {
13 |                 display: flex;
14 |                 height: 100vh;
15 |                 width: 100vw;
16 |             }
17 |             .left {
18 |                 flex: 1;
19 |                 border: none;
20 |                 height: 100vh;
21 |             }
22 |             .right {
23 |                 flex: 2;
24 |                 border: none;
25 |                 height: 100vh;
26 |             }
27 |         </style>
28 |     </head>
29 |     <body>
30 |         <div class="container">
31 |             <iframe
32 |                 src="http://localhost:8501"
33 |                 class="left"
34 |                 allow="fullscreen"
35 |             ></iframe>
36 |             <iframe
37 |                 id="vnc"
38 |                 src="http://127.0.0.1:6080/vnc.html?&resize=scale&autoconnect=1&view_only=1&reconnect=1&reconnect_delay=2000"
39 |                 class="right"
40 |                 allow="fullscreen"
41 |             ></iframe>
42 |             <button
43 |                 id="toggleViewOnly"
44 |                 style="position: absolute; top: 10px; right: 10px; z-index: 1000"
45 |             >
46 |                 Toggle Screen Control (Off)
47 |             </button>
48 |             <script>
49 |                 document
50 |                     .getElementById("toggleViewOnly")
51 |                     .addEventListener("click", function () {
52 |                         var vncIframe = document.getElementById("vnc");
53 |                         var button = document.getElementById("toggleViewOnly");
54 |                         var currentSrc = vncIframe.src;
55 |                         if (currentSrc.includes("view_only=1")) {
56 |                             vncIframe.src = currentSrc.replace(
57 |                                 "view_only=1",
58 |                                 "view_only=0",
59 |                             );
60 |                             button.innerText = "Toggle Screen Control (On)";
61 |                         } else {
62 |                             vncIframe.src = currentSrc.replace(
63 |                                 "view_only=0",
64 |                                 "view_only=1",
65 |                             );
66 |                             button.innerText = "Toggle Screen Control (Off)";
67 |                         }
68 |                     });
69 |             </script>
70 |         </div>
71 |     </body>
72 | </html>
73 | 


--------------------------------------------------------------------------------
/image/tint2_startup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | echo "starting tint2 on display :$DISPLAY_NUM ..."
 3 | 
 4 | # Start tint2 and capture its stderr
 5 | tint2 -c $HOME/.config/tint2/tint2rc 2>/tmp/tint2_stderr.log &
 6 | 
 7 | # Wait for tint2 window properties to appear
 8 | timeout=30
 9 | while [ $timeout -gt 0 ]; do
10 |     if xdotool search --class "tint2" >/dev/null 2>&1; then
11 |         break
12 |     fi
13 |     sleep 1
14 |     ((timeout--))
15 | done
16 | 
17 | if [ $timeout -eq 0 ]; then
18 |     echo "tint2 stderr output:" >&2
19 |     cat /tmp/tint2_stderr.log >&2
20 |     exit 1
21 | fi
22 | 
23 | # Remove the temporary stderr log file
24 | rm /tmp/tint2_stderr.log
25 | 


--------------------------------------------------------------------------------
/image/x11vnc_startup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | echo "starting vnc"
 3 | 
 4 | (x11vnc -display $DISPLAY \
 5 |     -forever \
 6 |     -shared \
 7 |     -wait 50 \
 8 |     -timeout 60 \
 9 |     -noxrecord \
10 |     -noxfixes \
11 |     -noxdamage \
12 |     -rfbport 5900 \
13 |     2>/tmp/x11vnc_stderr.log) &
14 | 
15 | x11vnc_pid=$!
16 | 
17 | # Wait for x11vnc to start
18 | timeout=10
19 | while [ $timeout -gt 0 ]; do
20 |     if netstat -tuln | grep -q ":5900 "; then
21 |         break
22 |     fi
23 |     sleep 1
24 |     ((timeout--))
25 | done
26 | 
27 | if [ $timeout -eq 0 ]; then
28 |     echo "x11vnc failed to start, stderr output:" >&2
29 |     cat /tmp/x11vnc_stderr.log >&2
30 |     exit 1
31 | fi
32 | 
33 | : > /tmp/x11vnc_stderr.log
34 | 
35 | # Monitor x11vnc process in the background
36 | (
37 |     while true; do
38 |         if ! kill -0 $x11vnc_pid 2>/dev/null; then
39 |             echo "x11vnc process crashed, restarting..." >&2
40 |             if [ -f /tmp/x11vnc_stderr.log ]; then
41 |                 echo "x11vnc stderr output:" >&2
42 |                 cat /tmp/x11vnc_stderr.log >&2
43 |                 rm /tmp/x11vnc_stderr.log
44 |             fi
45 |             exec "$0"
46 |         fi
47 |         sleep 5
48 |     done
49 | ) &
50 | 


--------------------------------------------------------------------------------
/image/xvfb_startup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e  # Exit on error
 3 | 
 4 | DPI=96
 5 | RES_AND_DEPTH=${WIDTH}x${HEIGHT}x24
 6 | 
 7 | # Function to check if Xvfb is already running
 8 | check_xvfb_running() {
 9 |     if [ -e /tmp/.X${DISPLAY_NUM}-lock ]; then
10 |         return 0  # Xvfb is already running
11 |     else
12 |         return 1  # Xvfb is not running
13 |     fi
14 | }
15 | 
16 | # Function to check if Xvfb is ready
17 | wait_for_xvfb() {
18 |     local timeout=10
19 |     local start_time=$(date +%s)
20 |     while ! xdpyinfo >/dev/null 2>&1; do
21 |         if [ $(($(date +%s) - start_time)) -gt $timeout ]; then
22 |             echo "Xvfb failed to start within $timeout seconds" >&2
23 |             return 1
24 |         fi
25 |         sleep 0.1
26 |     done
27 |     return 0
28 | }
29 | 
30 | # Check if Xvfb is already running
31 | if check_xvfb_running; then
32 |     echo "Xvfb is already running on display ${DISPLAY}"
33 |     exit 0
34 | fi
35 | 
36 | # Start Xvfb
37 | Xvfb $DISPLAY -ac -screen 0 $RES_AND_DEPTH -retro -dpi $DPI -nolisten tcp -nolisten unix &
38 | XVFB_PID=$!
39 | 
40 | # Wait for Xvfb to start
41 | if wait_for_xvfb; then
42 |     echo "Xvfb started successfully on display ${DISPLAY}"
43 |     echo "Xvfb PID: $XVFB_PID"
44 | else
45 |     echo "Xvfb failed to start"
46 |     kill $XVFB_PID
47 |     exit 1
48 | fi
49 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ObservedObserver/claude-minecraft-use/8b52ef2a5aa175a49475db07ad7168b33089f8b6/main.py


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.pyright]
2 | venvPath = "."
3 | venv = ".venv"
4 | useLibraryCodeForTypes = false
5 | 
6 | [tool.pytest.ini_options]
7 | pythonpath = "."
8 | asyncio_mode = "auto"
9 | 


--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
 1 | extend-exclude = [".venv"]
 2 | 
 3 | [format]
 4 | docstring-code-format = true
 5 | 
 6 | [lint]
 7 | select = [
 8 |     "A",
 9 |     "ASYNC",
10 |     "B",
11 |     "E",
12 |     "F",
13 |     "I",
14 |     "PIE",
15 |     "RUF200",
16 |     "T20",
17 |     "UP",
18 |     "W",
19 | ]
20 | 
21 | ignore = ["E501", "ASYNC230"]
22 | 
23 | [lint.isort]
24 | combine-as-imports = true
25 | 


--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | PYTHON_MINOR_VERSION=$(python3 --version | awk -F. '{print $2}')
 3 | 
 4 | if [ "$PYTHON_MINOR_VERSION" -gt 12 ]; then
 5 |     echo "Python version 3.$PYTHON_MINOR_VERSION detected. Python 3.12 or lower is required for setup to complete."
 6 |     echo "If you have multiple versions of Python installed, you can set the correct one by adjusting setup.sh to use a specific version, for example:"
 7 |     echo "'python3 -m venv .venv' -> 'python3.12 -m venv .venv'"
 8 |     exit 1
 9 | fi
10 | 
11 | if ! command -v cargo &> /dev/null; then
12 |     echo "Cargo (the package manager for Rust) is not present.  This is required for one of this module's dependencies."
13 |     echo "See https://www.rust-lang.org/tools/install for installation instructions."
14 |     exit 1
15 | fi
16 | 
17 | python3 -m venv .venv
18 | source .venv/bin/activate
19 | pip install --upgrade pip
20 | pip install -r dev-requirements.txt
21 | pre-commit install
22 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from unittest import mock
 3 | 
 4 | import pytest
 5 | 
 6 | 
 7 | @pytest.fixture(autouse=True)
 8 | def mock_screen_dimensions():
 9 |     with mock.patch.dict(
10 |         os.environ, {"HEIGHT": "768", "WIDTH": "1024", "DISPLAY_NUM": "1"}
11 |     ):
12 |         yield
13 | 


--------------------------------------------------------------------------------
/tests/loop_test.py:
--------------------------------------------------------------------------------
 1 | from unittest import mock
 2 | 
 3 | from anthropic.types import TextBlock, ToolUseBlock
 4 | from anthropic.types.beta import BetaMessage, BetaMessageParam
 5 | 
 6 | from computer_use_demo.loop import APIProvider, sampling_loop
 7 | 
 8 | 
 9 | async def test_loop():
10 |     client = mock.Mock()
11 |     client.beta.messages.with_raw_response.create.return_value = mock.Mock()
12 |     client.beta.messages.with_raw_response.create.return_value.parse.side_effect = [
13 |         mock.Mock(
14 |             spec=BetaMessage,
15 |             content=[
16 |                 TextBlock(type="text", text="Hello"),
17 |                 ToolUseBlock(
18 |                     type="tool_use", id="1", name="computer", input={"action": "test"}
19 |                 ),
20 |             ],
21 |         ),
22 |         mock.Mock(spec=BetaMessage, content=[TextBlock(type="text", text="Done!")]),
23 |     ]
24 | 
25 |     tool_collection = mock.AsyncMock()
26 |     tool_collection.run.return_value = mock.Mock(
27 |         output="Tool output", error=None, base64_image=None
28 |     )
29 | 
30 |     output_callback = mock.Mock()
31 |     tool_output_callback = mock.Mock()
32 |     api_response_callback = mock.Mock()
33 | 
34 |     with mock.patch(
35 |         "computer_use_demo.loop.Anthropic", return_value=client
36 |     ), mock.patch(
37 |         "computer_use_demo.loop.ToolCollection", return_value=tool_collection
38 |     ):
39 |         messages: list[BetaMessageParam] = [{"role": "user", "content": "Test message"}]
40 |         result = await sampling_loop(
41 |             model="test-model",
42 |             provider=APIProvider.ANTHROPIC,
43 |             system_prompt_suffix="",
44 |             messages=messages,
45 |             output_callback=output_callback,
46 |             tool_output_callback=tool_output_callback,
47 |             api_response_callback=api_response_callback,
48 |             api_key="test-key",
49 |         )
50 | 
51 |         assert len(result) == 4
52 |         assert result[0] == {"role": "user", "content": "Test message"}
53 |         assert result[1]["role"] == "assistant"
54 |         assert result[2]["role"] == "user"
55 |         assert result[3]["role"] == "assistant"
56 | 
57 |         assert client.beta.messages.with_raw_response.create.call_count == 2
58 |         tool_collection.run.assert_called_once_with(
59 |             name="computer", tool_input={"action": "test"}
60 |         )
61 |         output_callback.assert_called_with(TextBlock(text="Done!", type="text"))
62 |         assert output_callback.call_count == 3
63 |         assert tool_output_callback.call_count == 1
64 |         assert api_response_callback.call_count == 2
65 | 


--------------------------------------------------------------------------------
/tests/streamlit_test.py:
--------------------------------------------------------------------------------
 1 | from unittest import mock
 2 | 
 3 | import pytest
 4 | from streamlit.testing.v1 import AppTest
 5 | 
 6 | from computer_use_demo.streamlit import Sender, TextBlock
 7 | 
 8 | 
 9 | @pytest.fixture
10 | def streamlit_app():
11 |     return AppTest.from_file("computer_use_demo/streamlit.py")
12 | 
13 | 
14 | def test_streamlit(streamlit_app: AppTest):
15 |     streamlit_app.run()
16 |     streamlit_app.text_input[1].set_value("sk-ant-0000000000000").run()
17 |     with mock.patch("computer_use_demo.loop.sampling_loop") as patch:
18 |         streamlit_app.chat_input[0].set_value("Hello").run()
19 |         assert patch.called
20 |         assert patch.call_args.kwargs["messages"] == [
21 |             {"role": Sender.USER, "content": [TextBlock(text="Hello", type="text")]}
22 |         ]
23 |         assert not streamlit_app.exception
24 | 


--------------------------------------------------------------------------------
/tests/tools/bash_test.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from computer_use_demo.tools.bash import BashTool, ToolError
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def bash_tool():
 8 |     return BashTool()
 9 | 
10 | 
11 | @pytest.mark.asyncio
12 | async def test_bash_tool_restart(bash_tool):
13 |     result = await bash_tool(restart=True)
14 |     assert result.system == "tool has been restarted."
15 | 
16 |     # Verify the tool can be used after restart
17 |     result = await bash_tool(command="echo 'Hello after restart'")
18 |     assert "Hello after restart" in result.output
19 | 
20 | 
21 | @pytest.mark.asyncio
22 | async def test_bash_tool_run_command(bash_tool):
23 |     result = await bash_tool(command="echo 'Hello, World!'")
24 |     assert result.output.strip() == "Hello, World!"
25 |     assert result.error == ""
26 | 
27 | 
28 | @pytest.mark.asyncio
29 | async def test_bash_tool_no_command(bash_tool):
30 |     with pytest.raises(ToolError, match="no command provided."):
31 |         await bash_tool()
32 | 
33 | 
34 | @pytest.mark.asyncio
35 | async def test_bash_tool_session_creation(bash_tool):
36 |     result = await bash_tool(command="echo 'Session created'")
37 |     assert bash_tool._session is not None
38 |     assert "Session created" in result.output
39 | 
40 | 
41 | @pytest.mark.asyncio
42 | async def test_bash_tool_session_reuse(bash_tool):
43 |     result1 = await bash_tool(command="echo 'First command'")
44 |     result2 = await bash_tool(command="echo 'Second command'")
45 | 
46 |     assert "First command" in result1.output
47 |     assert "Second command" in result2.output
48 | 
49 | 
50 | @pytest.mark.asyncio
51 | async def test_bash_tool_session_error(bash_tool):
52 |     result = await bash_tool(command="invalid_command_that_does_not_exist")
53 |     assert "command not found" in result.error
54 | 
55 | 
56 | @pytest.mark.asyncio
57 | async def test_bash_tool_non_zero_exit(bash_tool):
58 |     result = await bash_tool(command="bash -c 'exit 1'")
59 |     assert result.error.strip() == ""
60 |     assert result.output.strip() == ""
61 | 
62 | 
63 | @pytest.mark.asyncio
64 | async def test_bash_tool_timeout(bash_tool):
65 |     await bash_tool(command="echo 'Hello, World!'")
66 |     bash_tool._session._timeout = 0.1  # Set a very short timeout for testing
67 |     with pytest.raises(
68 |         ToolError,
69 |         match="timed out: bash has not returned in 0.1 seconds and must be restarted",
70 |     ):
71 |         await bash_tool(command="sleep 1")
72 | 


--------------------------------------------------------------------------------
/tests/tools/computer_test.py:
--------------------------------------------------------------------------------
  1 | from unittest.mock import AsyncMock, patch
  2 | 
  3 | import pytest
  4 | 
  5 | from computer_use_demo.tools.computer import (
  6 |     ComputerTool,
  7 |     ScalingSource,
  8 |     ToolError,
  9 |     ToolResult,
 10 | )
 11 | 
 12 | 
 13 | @pytest.fixture
 14 | def computer_tool():
 15 |     return ComputerTool()
 16 | 
 17 | 
 18 | @pytest.mark.asyncio
 19 | async def test_computer_tool_mouse_move(computer_tool):
 20 |     with patch.object(computer_tool, "shell", new_callable=AsyncMock) as mock_shell:
 21 |         mock_shell.return_value = ToolResult(output="Mouse moved")
 22 |         result = await computer_tool(action="mouse_move", coordinate=[100, 200])
 23 |         mock_shell.assert_called_once_with(
 24 |             f"{computer_tool.xdotool} mousemove --sync 100 200"
 25 |         )
 26 |         assert result.output == "Mouse moved"
 27 | 
 28 | 
 29 | @pytest.mark.asyncio
 30 | async def test_computer_tool_type(computer_tool):
 31 |     with (
 32 |         patch.object(computer_tool, "shell", new_callable=AsyncMock) as mock_shell,
 33 |         patch.object(
 34 |             computer_tool, "screenshot", new_callable=AsyncMock
 35 |         ) as mock_screenshot,
 36 |     ):
 37 |         mock_shell.return_value = ToolResult(output="Text typed")
 38 |         mock_screenshot.return_value = ToolResult(base64_image="base64_screenshot")
 39 |         result = await computer_tool(action="type", text="Hello, World!")
 40 |         assert mock_shell.call_count == 1
 41 |         assert "type --delay 12 -- 'Hello, World!'" in mock_shell.call_args[0][0]
 42 |         assert result.output == "Text typed"
 43 |         assert result.base64_image == "base64_screenshot"
 44 | 
 45 | 
 46 | @pytest.mark.asyncio
 47 | async def test_computer_tool_screenshot(computer_tool):
 48 |     with patch.object(
 49 |         computer_tool, "screenshot", new_callable=AsyncMock
 50 |     ) as mock_screenshot:
 51 |         mock_screenshot.return_value = ToolResult(base64_image="base64_screenshot")
 52 |         result = await computer_tool(action="screenshot")
 53 |         mock_screenshot.assert_called_once()
 54 |         assert result.base64_image == "base64_screenshot"
 55 | 
 56 | 
 57 | @pytest.mark.asyncio
 58 | async def test_computer_tool_scaling(computer_tool):
 59 |     computer_tool._scaling_enabled = True
 60 |     computer_tool.width = 1920
 61 |     computer_tool.height = 1080
 62 | 
 63 |     # Test scaling from API to computer
 64 |     x, y = computer_tool.scale_coordinates(ScalingSource.API, 1366, 768)
 65 |     assert x == 1920
 66 |     assert y == 1080
 67 | 
 68 |     # Test scaling from computer to API
 69 |     x, y = computer_tool.scale_coordinates(ScalingSource.COMPUTER, 1920, 1080)
 70 |     assert x == 1366
 71 |     assert y == 768
 72 | 
 73 |     # Test no scaling when disabled
 74 |     computer_tool._scaling_enabled = False
 75 |     x, y = computer_tool.scale_coordinates(ScalingSource.API, 1366, 768)
 76 |     assert x == 1366
 77 |     assert y == 768
 78 | 
 79 | 
 80 | @pytest.mark.asyncio
 81 | async def test_computer_tool_scaling_with_different_aspect_ratio(computer_tool):
 82 |     computer_tool._scaling_enabled = True
 83 |     computer_tool.width = 1920
 84 |     computer_tool.height = 1200  # 16:10 aspect ratio
 85 | 
 86 |     # Test scaling from API to computer
 87 |     x, y = computer_tool.scale_coordinates(ScalingSource.API, 1280, 800)
 88 |     assert x == 1920
 89 |     assert y == 1200
 90 | 
 91 |     # Test scaling from computer to API
 92 |     x, y = computer_tool.scale_coordinates(ScalingSource.COMPUTER, 1920, 1200)
 93 |     assert x == 1280
 94 |     assert y == 800
 95 | 
 96 | 
 97 | @pytest.mark.asyncio
 98 | async def test_computer_tool_no_scaling_for_unsupported_resolution(computer_tool):
 99 |     computer_tool._scaling_enabled = True
100 |     computer_tool.width = 4096
101 |     computer_tool.height = 2160
102 | 
103 |     # Test no scaling for unsupported resolution
104 |     x, y = computer_tool.scale_coordinates(ScalingSource.API, 4096, 2160)
105 |     assert x == 4096
106 |     assert y == 2160
107 | 
108 |     x, y = computer_tool.scale_coordinates(ScalingSource.COMPUTER, 4096, 2160)
109 |     assert x == 4096
110 |     assert y == 2160
111 | 
112 | 
113 | @pytest.mark.asyncio
114 | async def test_computer_tool_scaling_out_of_bounds(computer_tool):
115 |     computer_tool._scaling_enabled = True
116 |     computer_tool.width = 1920
117 |     computer_tool.height = 1080
118 | 
119 |     # Test scaling from API with out of bounds coordinates
120 |     with pytest.raises(ToolError, match="Coordinates .*, .* are out of bounds"):
121 |         x, y = computer_tool.scale_coordinates(ScalingSource.API, 2000, 1500)
122 | 
123 | 
124 | @pytest.mark.asyncio
125 | async def test_computer_tool_invalid_action(computer_tool):
126 |     with pytest.raises(ToolError, match="Invalid action: invalid_action"):
127 |         await computer_tool(action="invalid_action")
128 | 
129 | 
130 | @pytest.mark.asyncio
131 | async def test_computer_tool_missing_coordinate(computer_tool):
132 |     with pytest.raises(ToolError, match="coordinate is required for mouse_move"):
133 |         await computer_tool(action="mouse_move")  # coordinate deliberately omitted
134 | 
135 | 
136 | @pytest.mark.asyncio
137 | async def test_computer_tool_missing_text(computer_tool):
138 |     with pytest.raises(ToolError, match="text is required for type"):
139 |         await computer_tool(action="type")  # text deliberately omitted
140 | 


--------------------------------------------------------------------------------
/tests/tools/edit_test.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from unittest.mock import patch
  3 | 
  4 | import pytest
  5 | 
  6 | from computer_use_demo.tools.base import CLIResult, ToolError, ToolResult
  7 | from computer_use_demo.tools.edit import EditTool
  8 | 
  9 | 
 10 | @pytest.mark.asyncio
 11 | async def test_view_command():
 12 |     edit_tool = EditTool()
 13 | 
 14 |     # Test viewing a file that exists
 15 |     with patch("pathlib.Path.exists", return_value=True), patch(
 16 |         "pathlib.Path.is_dir", return_value=False
 17 |     ), patch("pathlib.Path.read_text") as mock_read_text:
 18 |         mock_read_text.return_value = "File content"
 19 |         result = await edit_tool(command="view", path="/test/file.txt")
 20 |         assert isinstance(result, CLIResult)
 21 |         assert result.output
 22 |         assert "File content" in result.output
 23 | 
 24 |     # Test viewing a directory
 25 |     with patch("pathlib.Path.exists", return_value=True), patch(
 26 |         "pathlib.Path.is_dir", return_value=True
 27 |     ), patch("computer_use_demo.tools.edit.run") as mock_run:
 28 |         mock_run.return_value = (None, "file1.txt\nfile2.txt", None)
 29 |         result = await edit_tool(command="view", path="/test/dir")
 30 |         assert isinstance(result, CLIResult)
 31 |         assert result.output
 32 |         assert "file1.txt" in result.output
 33 |         assert "file2.txt" in result.output
 34 | 
 35 |     # Test viewing a file with a specific range
 36 |     with patch("pathlib.Path.exists", return_value=True), patch(
 37 |         "pathlib.Path.is_dir", return_value=False
 38 |     ), patch("pathlib.Path.read_text") as mock_read_text:
 39 |         mock_read_text.return_value = "Line 1\nLine 2\nLine 3\nLine 4"
 40 |         result = await edit_tool(
 41 |             command="view", path="/test/file.txt", view_range=[2, 3]
 42 |         )
 43 |         assert isinstance(result, CLIResult)
 44 |         assert result.output
 45 |         assert "\n     2\tLine 2\n     3\tLine 3\n" in result.output
 46 | 
 47 |     # Test viewing a file with an invalid range
 48 |     with patch("pathlib.Path.exists", return_value=True), patch(
 49 |         "pathlib.Path.is_dir", return_value=False
 50 |     ), patch("pathlib.Path.read_text") as mock_read_text:
 51 |         mock_read_text.return_value = "Line 1\nLine 2\nLine 3\nLine 4"
 52 |         with pytest.raises(ToolError, match="Invalid `view_range`"):
 53 |             await edit_tool(command="view", path="/test/file.txt", view_range=[3, 2])
 54 | 
 55 |     # Test viewing a non-existent file
 56 |     with patch("pathlib.Path.exists", return_value=False):
 57 |         with pytest.raises(ToolError, match="does not exist"):
 58 |             await edit_tool(command="view", path="/nonexistent/file.txt")
 59 | 
 60 |     # Test viewing a directory with a view_range
 61 |     with patch("pathlib.Path.exists", return_value=True), patch(
 62 |         "pathlib.Path.is_dir", return_value=True
 63 |     ):
 64 |         with pytest.raises(ToolError, match="view_range` parameter is not allowed"):
 65 |             await edit_tool(command="view", path="/test/dir", view_range=[1, 2])
 66 | 
 67 | 
 68 | @pytest.mark.asyncio
 69 | async def test_create_command():
 70 |     edit_tool = EditTool()
 71 | 
 72 |     # Test creating a new file with content
 73 |     with patch("pathlib.Path.exists", return_value=False), patch(
 74 |         "pathlib.Path.write_text"
 75 |     ) as mock_write_text:
 76 |         result = await edit_tool(
 77 |             command="create", path="/test/newfile.txt", file_text="New file content"
 78 |         )
 79 |         assert isinstance(result, ToolResult)
 80 |         assert result.output
 81 |         assert "File created successfully" in result.output
 82 |         mock_write_text.assert_called_once_with("New file content")
 83 | 
 84 |     # Test attempting to create a file without content
 85 |     with patch("pathlib.Path.exists", return_value=False):
 86 |         with pytest.raises(ToolError, match="Parameter `file_text` is required"):
 87 |             await edit_tool(command="create", path="/test/newfile.txt")
 88 | 
 89 |     # Test attempting to create a file that already exists
 90 |     with patch("pathlib.Path.exists", return_value=True):
 91 |         with pytest.raises(ToolError, match="File already exists"):
 92 |             await edit_tool(
 93 |                 command="create", path="/test/existingfile.txt", file_text="Content"
 94 |             )
 95 | 
 96 | 
 97 | @pytest.mark.asyncio
 98 | async def test_str_replace_command():
 99 |     edit_tool = EditTool()  # one instance is shared by every scenario below
100 | 
101 |     # A unique `old_str` is replaced and the new text is written back.
102 |     with patch("pathlib.Path.exists", return_value=True), patch(
103 |         "pathlib.Path.is_dir", return_value=False
104 |     ), patch("pathlib.Path.read_text") as mock_read_text, patch(
105 |         "pathlib.Path.write_text"
106 |     ) as mock_write_text:
107 |         mock_read_text.return_value = "Original content"
108 |         result = await edit_tool(
109 |             command="str_replace",
110 |             path="/test/file.txt",
111 |             old_str="Original",
112 |             new_str="New",
113 |         )
114 |         assert isinstance(result, CLIResult)
115 |         assert result.output
116 |         assert "has been edited" in result.output
117 |         mock_write_text.assert_called_once_with("New content")
118 | 
119 |     # An `old_str` that is absent from the file raises ToolError.
120 |     with patch("pathlib.Path.exists", return_value=True), patch(
121 |         "pathlib.Path.is_dir", return_value=False
122 |     ), patch("pathlib.Path.read_text") as mock_read_text:
123 |         mock_read_text.return_value = "Original content"
124 |         with pytest.raises(ToolError, match="did not appear verbatim"):
125 |             await edit_tool(
126 |                 command="str_replace",
127 |                 path="/test/file.txt",
128 |                 old_str="Nonexistent",
129 |                 new_str="New",
130 |             )
131 | 
132 |     # An ambiguous `old_str` (lowercase "test" occurs twice here) raises ToolError.
133 |     with patch("pathlib.Path.exists", return_value=True), patch(
134 |         "pathlib.Path.is_dir", return_value=False
135 |     ), patch("pathlib.Path.read_text") as mock_read_text:
136 |         mock_read_text.return_value = "Test test test"
137 |         with pytest.raises(ToolError, match="Multiple occurrences"):
138 |             await edit_tool(
139 |                 command="str_replace",
140 |                 path="/test/file.txt",
141 |                 old_str="test",
142 |                 new_str="example",
143 |             )
144 | 
145 |     edit_tool._file_history.clear()  # so the assertion below sees only this edit
146 |     # After a successful replacement the pre-edit text is stored in the history.
147 |     with patch("pathlib.Path.exists", return_value=True), patch(
148 |         "pathlib.Path.is_dir", return_value=False
149 |     ), patch("pathlib.Path.read_text") as mock_read_text, patch(
150 |         "pathlib.Path.write_text"
151 |     ):
152 |         mock_read_text.return_value = "Original content"
153 |         await edit_tool(
154 |             command="str_replace",
155 |             path="/test/file.txt",
156 |             old_str="Original",
157 |             new_str="New",
158 |         )
159 |         assert edit_tool._file_history[Path("/test/file.txt")] == ["Original content"]
160 | 
161 | 
162 | @pytest.mark.asyncio
163 | async def test_insert_command():
164 |     edit_tool = EditTool()  # one instance is shared by every scenario below
165 | 
166 |     # insert_line=2 on a three-line file puts the new text after the second line.
167 |     with patch("pathlib.Path.exists", return_value=True), patch(
168 |         "pathlib.Path.is_dir", return_value=False
169 |     ), patch("pathlib.Path.read_text") as mock_read_text, patch(
170 |         "pathlib.Path.write_text"
171 |     ) as mock_write_text:
172 |         mock_read_text.return_value = "Line 1\nLine 2\nLine 3"
173 |         result = await edit_tool(
174 |             command="insert", path="/test/file.txt", insert_line=2, new_str="New Line"
175 |         )
176 |         assert isinstance(result, CLIResult)
177 |         assert result.output
178 |         assert "has been edited" in result.output
179 |         mock_write_text.assert_called_once_with("Line 1\nLine 2\nNew Line\nLine 3")
180 | 
181 |     # insert_line=0 puts the new text before the first line.
182 |     with patch("pathlib.Path.exists", return_value=True), patch(
183 |         "pathlib.Path.is_dir", return_value=False
184 |     ), patch("pathlib.Path.read_text") as mock_read_text, patch(
185 |         "pathlib.Path.write_text"
186 |     ) as mock_write_text:
187 |         mock_read_text.return_value = "Line 1\nLine 2"
188 |         result = await edit_tool(
189 |             command="insert",
190 |             path="/test/file.txt",
191 |             insert_line=0,
192 |             new_str="New First Line",
193 |         )
194 |         assert isinstance(result, CLIResult)
195 |         assert result.output
196 |         assert "has been edited" in result.output
197 |         mock_write_text.assert_called_once_with("New First Line\nLine 1\nLine 2")
198 | 
199 |     # insert_line=2 on a two-line file appends the new text at the end.
200 |     with patch("pathlib.Path.exists", return_value=True), patch(
201 |         "pathlib.Path.is_dir", return_value=False
202 |     ), patch("pathlib.Path.read_text") as mock_read_text, patch(
203 |         "pathlib.Path.write_text"
204 |     ) as mock_write_text:
205 |         mock_read_text.return_value = "Line 1\nLine 2"
206 |         result = await edit_tool(
207 |             command="insert",
208 |             path="/test/file.txt",
209 |             insert_line=2,
210 |             new_str="New Last Line",
211 |         )
212 |         assert isinstance(result, CLIResult)
213 |         assert result.output
214 |         assert "has been edited" in result.output
215 |         mock_write_text.assert_called_once_with("Line 1\nLine 2\nNew Last Line")
216 | 
217 |     # insert_line=5 on a two-line file is rejected with ToolError.
218 |     with patch("pathlib.Path.exists", return_value=True), patch(
219 |         "pathlib.Path.is_dir", return_value=False
220 |     ), patch("pathlib.Path.read_text") as mock_read_text:
221 |         mock_read_text.return_value = "Line 1\nLine 2"
222 |         with pytest.raises(ToolError, match="Invalid `insert_line` parameter"):
223 |             await edit_tool(
224 |                 command="insert",
225 |                 path="/test/file.txt",
226 |                 insert_line=5,
227 |                 new_str="Invalid Line",
228 |             )
229 | 
230 |     # After a successful insert the pre-edit text is stored in the history.
231 |     edit_tool._file_history.clear()  # so the assertion below sees only this edit
232 |     with patch("pathlib.Path.exists", return_value=True), patch(
233 |         "pathlib.Path.is_dir", return_value=False
234 |     ), patch("pathlib.Path.read_text") as mock_read_text, patch(
235 |         "pathlib.Path.write_text"
236 |     ):
237 |         mock_read_text.return_value = "Original content"
238 |         await edit_tool(
239 |             command="insert", path="/test/file.txt", insert_line=1, new_str="New Line"
240 |         )
241 |         assert edit_tool._file_history[Path("/test/file.txt")] == ["Original content"]
242 | 
243 | 
244 | @pytest.mark.asyncio
245 | async def test_undo_edit_command():
246 |     edit_tool = EditTool()  # one instance is shared by every scenario below
247 | 
248 |     # Undo after str_replace: the pre-edit text is written back.
249 |     with patch("pathlib.Path.exists", return_value=True), patch(
250 |         "pathlib.Path.is_dir", return_value=False
251 |     ), patch("pathlib.Path.read_text") as mock_read_text, patch(
252 |         "pathlib.Path.write_text"
253 |     ) as mock_write_text:
254 |         mock_read_text.return_value = "Original content"
255 |         await edit_tool(
256 |             command="str_replace",
257 |             path="/test/file.txt",
258 |             old_str="Original",
259 |             new_str="New",
260 |         )
261 |         mock_read_text.return_value = "New content"  # simulated post-edit file text
262 |         result = await edit_tool(command="undo_edit", path="/test/file.txt")
263 |         assert isinstance(result, CLIResult)
264 |         assert result.output
265 |         assert "Last edit to /test/file.txt undone successfully" in result.output
266 |         mock_write_text.assert_called_with("Original content")  # checks the latest write
267 | 
268 |     # Undo after insert: the pre-edit text is written back.
269 |     edit_tool._file_history.clear()  # start this scenario with an empty history
270 |     with patch("pathlib.Path.exists", return_value=True), patch(
271 |         "pathlib.Path.is_dir", return_value=False
272 |     ), patch("pathlib.Path.read_text") as mock_read_text, patch(
273 |         "pathlib.Path.write_text"
274 |     ) as mock_write_text:
275 |         mock_read_text.return_value = "Line 1\nLine 2"
276 |         await edit_tool(
277 |             command="insert", path="/test/file.txt", insert_line=1, new_str="New Line"
278 |         )
279 |         mock_read_text.return_value = "Line 1\nNew Line\nLine 2"  # simulated post-edit text
280 |         result = await edit_tool(command="undo_edit", path="/test/file.txt")
281 |         assert isinstance(result, CLIResult)
282 |         assert result.output
283 |         assert "Last edit to /test/file.txt undone successfully" in result.output
284 |         mock_write_text.assert_called_with("Line 1\nLine 2")  # checks the latest write
285 | 
286 |     # Undo with an empty history raises ToolError.
287 |     edit_tool._file_history.clear()
288 |     with patch("pathlib.Path.exists", return_value=True), patch(
289 |         "pathlib.Path.is_dir", return_value=False
290 |     ):
291 |         with pytest.raises(ToolError, match="No edit history found"):
292 |             await edit_tool(command="undo_edit", path="/test/file.txt")
293 | 
294 | 
295 | @pytest.mark.asyncio
296 | async def test_validate_path():
297 |     edit_tool = EditTool()
298 | 
299 |     # Test with valid absolute paths
300 |     with patch("pathlib.Path.exists", return_value=True), patch(
301 |         "pathlib.Path.is_dir", return_value=False
302 |     ):
303 |         edit_tool.validate_path("view", Path("/valid/path.txt"))
304 | 
305 |     # Test with relative paths (should raise an error)
306 |     with pytest.raises(ToolError, match="not an absolute path"):
307 |         edit_tool.validate_path("view", Path("relative/path.txt"))
308 | 
309 |     # Test with non-existent paths for non-create commands (should raise an error)
310 |     with patch("pathlib.Path.exists", return_value=False):
311 |         with pytest.raises(ToolError, match="does not exist"):
312 |             edit_tool.validate_path("view", Path("/nonexistent/file.txt"))
313 | 
314 |     # Test with existing paths for create command (should raise an error)
315 |     with patch("pathlib.Path.exists", return_value=True):
316 |         with pytest.raises(ToolError, match="File already exists"):
317 |             edit_tool.validate_path("create", Path("/existing/file.txt"))
318 | 
319 |     # Test with directory paths for non-view commands (should raise an error)
320 |     with patch("pathlib.Path.exists", return_value=True), patch(
321 |         "pathlib.Path.is_dir", return_value=True
322 |     ):
323 |         with pytest.raises(ToolError, match="is a directory"):
324 |             edit_tool.validate_path("str_replace", Path("/directory/path"))
325 | 
326 |     # Test with directory path for view command (should not raise an error)
327 |     with patch("pathlib.Path.exists", return_value=True), patch(
328 |         "pathlib.Path.is_dir", return_value=True
329 |     ):
330 |         edit_tool.validate_path("view", Path("/directory/path"))
331 | 


--------------------------------------------------------------------------------