├── .gitattributes ├── .gitignore ├── .zed └── settings.json ├── Dockerfile ├── LICENSE ├── LICENSE.QucikStart ├── README.md ├── computer_use_demo ├── __init__.py ├── loop.py ├── requirements.txt ├── streamlit.py └── tools │ ├── __init__.py │ ├── base.py │ ├── bash.py │ ├── collection.py │ ├── computer.py │ ├── edit.py │ ├── game.py │ └── run.py ├── dev-requirements.txt ├── image ├── .config │ └── tint2 │ │ ├── applications │ │ ├── firefox-custom.desktop │ │ ├── gedit.desktop │ │ └── terminal.desktop │ │ └── tint2rc ├── .streamlit │ └── config.toml ├── entrypoint.sh ├── http_server.py ├── index.html ├── mutter_startup.sh ├── novnc_startup.sh ├── start_all.sh ├── static_content │ └── index.html ├── tint2_startup.sh ├── x11vnc_startup.sh └── xvfb_startup.sh ├── main.py ├── pyproject.toml ├── ruff.toml ├── setup.sh └── tests ├── conftest.py ├── loop_test.py ├── streamlit_test.py └── tools ├── bash_test.py ├── computer_test.py └── edit_test.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .venv 6 | .ruff_cache 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 107 | __pypackages__/ 108 | 109 | # Celery stuff 110 | celerybeat-schedule 111 | celerybeat.pid 112 | 113 | # SageMath parsed files 114 | *.sage.py 115 | 116 | # Environments 117 | .env 118 | .venv 119 | env/ 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | 143 | # pytype static type analyzer 144 | .pytype/ 145 | 146 | # Cython debug symbols 147 | cython_debug/ 148 | 149 | # PyCharm 150 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 151 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 152 | # and can be added to the global gitignore or merged into this file. For a more nuclear 153 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
154 | #.idea/ 155 | -------------------------------------------------------------------------------- /.zed/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "preferred_line_length": 88, 3 | "languages": { 4 | "Python": { 5 | "language_servers": ["pyright", "ruff"] 6 | } 7 | }, 8 | "telemetry": { 9 | "diagnostics": false, 10 | "metrics": false 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/ubuntu:22.04 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | ENV DEBIAN_PRIORITY=high 5 | 6 | RUN apt-get update && \ 7 | apt-get -y upgrade && \ 8 | apt-get -y install \ 9 | build-essential \ 10 | # UI Requirements 11 | xvfb \ 12 | xterm \ 13 | xdotool \ 14 | scrot \ 15 | imagemagick \ 16 | sudo \ 17 | mutter \ 18 | x11vnc \ 19 | # Python/pyenv reqs 20 | build-essential \ 21 | libssl-dev \ 22 | zlib1g-dev \ 23 | libbz2-dev \ 24 | libreadline-dev \ 25 | libsqlite3-dev \ 26 | curl \ 27 | git \ 28 | libncursesw5-dev \ 29 | xz-utils \ 30 | tk-dev \ 31 | libxml2-dev \ 32 | libxmlsec1-dev \ 33 | libffi-dev \ 34 | liblzma-dev \ 35 | # Network tools 36 | net-tools \ 37 | netcat \ 38 | # PPA req 39 | software-properties-common && \ 40 | # Userland apps 41 | sudo add-apt-repository ppa:mozillateam/ppa && \ 42 | sudo apt-get install -y --no-install-recommends \ 43 | libreoffice \ 44 | firefox-esr \ 45 | x11-apps \ 46 | xpdf \ 47 | gedit \ 48 | xpaint \ 49 | tint2 \ 50 | galculator \ 51 | pcmanfm \ 52 | unzip && \ 53 | apt-get clean 54 | 55 | # Install noVNC 56 | RUN git clone --branch v1.5.0 https://github.com/novnc/noVNC.git /opt/noVNC && \ 57 | git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \ 58 | ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html 59 | 60 | # setup user 61 | ENV USERNAME=computeruse 62 | ENV HOME=/home/$USERNAME 
63 | RUN useradd -m -s /bin/bash -d $HOME $USERNAME 64 | RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers 65 | USER computeruse 66 | WORKDIR $HOME 67 | 68 | # setup python 69 | RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ 70 | cd ~/.pyenv && src/configure && make -C src && cd .. && \ 71 | echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \ 72 | echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \ 73 | echo 'eval "$(pyenv init -)"' >> ~/.bashrc 74 | ENV PYENV_ROOT="$HOME/.pyenv" 75 | ENV PATH="$PYENV_ROOT/bin:$PATH" 76 | ENV PYENV_VERSION_MAJOR=3 77 | ENV PYENV_VERSION_MINOR=11 78 | ENV PYENV_VERSION_PATCH=6 79 | ENV PYENV_VERSION=$PYENV_VERSION_MAJOR.$PYENV_VERSION_MINOR.$PYENV_VERSION_PATCH 80 | RUN eval "$(pyenv init -)" && \ 81 | pyenv install $PYENV_VERSION && \ 82 | pyenv global $PYENV_VERSION && \ 83 | pyenv rehash 84 | 85 | ENV PATH="$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH" 86 | 87 | RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \ 88 | python -m pip config set global.disable-pip-version-check true 89 | 90 | # only reinstall if requirements.txt changes 91 | COPY --chown=$USERNAME:$USERNAME computer_use_demo/requirements.txt $HOME/computer_use_demo/requirements.txt 92 | RUN python -m pip install -r $HOME/computer_use_demo/requirements.txt 93 | 94 | # setup desktop env & app 95 | COPY --chown=$USERNAME:$USERNAME image/ $HOME 96 | COPY --chown=$USERNAME:$USERNAME computer_use_demo/ $HOME/computer_use_demo/ 97 | 98 | ARG DISPLAY_NUM=1 99 | ARG HEIGHT=768 100 | ARG WIDTH=1024 101 | ENV DISPLAY_NUM=$DISPLAY_NUM 102 | ENV HEIGHT=$HEIGHT 103 | ENV WIDTH=$WIDTH 104 | 105 | ENTRYPOINT [ "./entrypoint.sh" ] 106 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 observedobserver 4 | 5 | 
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE.QucikStart: -------------------------------------------------------------------------------- 1 | Copyright 2024 Anthropic, PBC. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # claude-minecraft-use 2 | 3 | This repo uses the Claude computer use [quick starts](https://github.com/anthropics/anthropic-quickstarts) as a base template, with the following modifications: 4 | - Direct control of the local macOS machine (requires some changes to the tools that perform actions) 5 | - Allows the AI to control Minecraft 6 | 7 | This repo is just for fun and for testing the capabilities of Claude computer use. Thanks to the great work of the Claude team for making this possible. Also thanks to [BlueM](https://github.com/BlueM) for cliclick, which makes this demo work on macOS. 8 | 9 | Limitations discovered during testing: 10 | 1. Coordinate control is not accurate enough 11 | 2. 
Cannot handle complex tasks step by step unless prompted well 12 | 13 | https://github.com/user-attachments/assets/39e74c82-d4fe-4cb2-b213-b0b504d64772 14 | 15 | -------------------------------------------------------------------------------- /computer_use_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ObservedObserver/claude-minecraft-use/8b52ef2a5aa175a49475db07ad7168b33089f8b6/computer_use_demo/__init__.py -------------------------------------------------------------------------------- /computer_use_demo/loop.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agentic sampling loop that calls the Anthropic API and local implementation of Anthropic-defined computer use tools. 3 | """ 4 | 5 | import platform 6 | from collections.abc import Callable 7 | from datetime import datetime 8 | from enum import StrEnum 9 | from typing import Any, cast 10 | 11 | from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse 12 | from anthropic.types import ( 13 | ToolResultBlockParam, 14 | ) 15 | from anthropic.types.beta import ( 16 | BetaContentBlock, 17 | BetaContentBlockParam, 18 | BetaImageBlockParam, 19 | BetaMessage, 20 | BetaMessageParam, 21 | BetaTextBlockParam, 22 | BetaToolResultBlockParam, 23 | ) 24 | 25 | from .tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult, GameTool 26 | 27 | BETA_FLAG = "computer-use-2024-10-22" 28 | 29 | 30 | class APIProvider(StrEnum): 31 | ANTHROPIC = "anthropic" 32 | BEDROCK = "bedrock" 33 | VERTEX = "vertex" 34 | 35 | 36 | PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { 37 | APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022", 38 | APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0", 39 | APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022", 40 | } 41 | 42 | 43 | # This system prompt is optimized for the Docker environment in this repository 
and 44 | # specific tool combinations enabled. 45 | # We encourage modifying this system prompt to ensure the model has context for the 46 | # environment it is running in, and to provide any additional information that may be 47 | # helpful for the task at hand. 48 | SYSTEM_PROMPT = f""" 49 | * You are utilising an MacOS machine using {platform.machine()} architecture with internet access. 50 | * You can feel free to install MacOS applications with your bash tool. Use curl instead of wget. 51 | * To open chrome, please just click on the chrome icon. Note, chrome is what is installed on your system. 52 | * Minecraft is installed on your system. You can use the computer tool to interact with it. 53 | * Using bash tool you can start GUI applications, but you need to set export DISPLAY=:1 and use a subshell. For example "(DISPLAY=:1 xterm &)". GUI apps run with bash tool will appear within your desktop environment, but they may take some time to appear. Take a screenshot to confirm it did. 54 | * When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `grep -n -B -A ` to confirm output. 55 | * When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. 56 | * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. 57 | * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}. 58 | 59 | 60 | 61 | * When using Chrome, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there. 
62 | * If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your StrReplaceEditTool. 63 | """ 64 | 65 | 66 | async def sampling_loop( 67 | *, 68 | model: str, 69 | provider: APIProvider, 70 | system_prompt_suffix: str, 71 | messages: list[BetaMessageParam], 72 | output_callback: Callable[[BetaContentBlock], None], 73 | tool_output_callback: Callable[[ToolResult, str], None], 74 | api_response_callback: Callable[[APIResponse[BetaMessage]], None], 75 | api_key: str, 76 | only_n_most_recent_images: int | None = None, 77 | max_tokens: int = 4096, 78 | ): 79 | """ 80 | Agentic sampling loop for the assistant/tool interaction of computer use. 81 | """ 82 | tool_collection = ToolCollection( 83 | # ComputerTool(), 84 | GameTool(), 85 | BashTool(), 86 | EditTool(), 87 | ) 88 | system = ( 89 | f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}" 90 | ) 91 | 92 | while True: 93 | if only_n_most_recent_images: 94 | _maybe_filter_to_n_most_recent_images(messages, only_n_most_recent_images) 95 | 96 | if provider == APIProvider.ANTHROPIC: 97 | client = Anthropic(api_key=api_key) 98 | elif provider == APIProvider.VERTEX: 99 | client = AnthropicVertex() 100 | elif provider == APIProvider.BEDROCK: 101 | client = AnthropicBedrock() 102 | 103 | # Call the API 104 | # we use raw_response to provide debug information to streamlit. Your 105 | # implementation may be able call the SDK directly with: 106 | # `response = client.messages.create(...)` instead. 
107 | raw_response = client.beta.messages.with_raw_response.create( 108 | max_tokens=max_tokens, 109 | messages=messages, 110 | model=model, 111 | system=system, 112 | tools=tool_collection.to_params(), 113 | betas=["computer-use-2024-10-22"], 114 | ) 115 | 116 | api_response_callback(cast(APIResponse[BetaMessage], raw_response)) 117 | 118 | response = raw_response.parse() 119 | 120 | messages.append( 121 | { 122 | "role": "assistant", 123 | "content": cast(list[BetaContentBlockParam], response.content), 124 | } 125 | ) 126 | 127 | tool_result_content: list[BetaToolResultBlockParam] = [] 128 | for content_block in cast(list[BetaContentBlock], response.content): 129 | output_callback(content_block) 130 | if content_block.type == "tool_use": 131 | result = await tool_collection.run( 132 | name=content_block.name, 133 | tool_input=cast(dict[str, Any], content_block.input), 134 | ) 135 | tool_result_content.append( 136 | _make_api_tool_result(result, content_block.id) 137 | ) 138 | tool_output_callback(result, content_block.id) 139 | 140 | if not tool_result_content: 141 | return messages 142 | 143 | messages.append({"content": tool_result_content, "role": "user"}) 144 | 145 | 146 | def _maybe_filter_to_n_most_recent_images( 147 | messages: list[BetaMessageParam], 148 | images_to_keep: int, 149 | min_removal_threshold: int = 10, 150 | ): 151 | """ 152 | With the assumption that images are screenshots that are of diminishing value as 153 | the conversation progresses, remove all but the final `images_to_keep` tool_result 154 | images in place, with a chunk of min_removal_threshold to reduce the amount we 155 | break the implicit prompt cache. 
156 | """ 157 | if images_to_keep is None: 158 | return messages 159 | 160 | tool_result_blocks = cast( 161 | list[ToolResultBlockParam], 162 | [ 163 | item 164 | for message in messages 165 | for item in ( 166 | message["content"] if isinstance(message["content"], list) else [] 167 | ) 168 | if isinstance(item, dict) and item.get("type") == "tool_result" 169 | ], 170 | ) 171 | 172 | total_images = sum( 173 | 1 174 | for tool_result in tool_result_blocks 175 | for content in tool_result.get("content", []) 176 | if isinstance(content, dict) and content.get("type") == "image" 177 | ) 178 | 179 | images_to_remove = total_images - images_to_keep 180 | # for better cache behavior, we want to remove in chunks 181 | images_to_remove -= images_to_remove % min_removal_threshold 182 | 183 | for tool_result in tool_result_blocks: 184 | if isinstance(tool_result.get("content"), list): 185 | new_content = [] 186 | for content in tool_result.get("content", []): 187 | if isinstance(content, dict) and content.get("type") == "image": 188 | if images_to_remove > 0: 189 | images_to_remove -= 1 190 | continue 191 | new_content.append(content) 192 | tool_result["content"] = new_content 193 | 194 | 195 | def _make_api_tool_result( 196 | result: ToolResult, tool_use_id: str 197 | ) -> BetaToolResultBlockParam: 198 | """Convert an agent ToolResult to an API ToolResultBlockParam.""" 199 | tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = [] 200 | is_error = False 201 | if result.error: 202 | is_error = True 203 | tool_result_content = _maybe_prepend_system_tool_result(result, result.error) 204 | else: 205 | if result.output: 206 | tool_result_content.append( 207 | { 208 | "type": "text", 209 | "text": _maybe_prepend_system_tool_result(result, result.output), 210 | } 211 | ) 212 | if result.base64_image: 213 | tool_result_content.append( 214 | { 215 | "type": "image", 216 | "source": { 217 | "type": "base64", 218 | "media_type": "image/png", 219 | "data": 
result.base64_image, 220 | }, 221 | } 222 | ) 223 | return { 224 | "type": "tool_result", 225 | "content": tool_result_content, 226 | "tool_use_id": tool_use_id, 227 | "is_error": is_error, 228 | } 229 | 230 | 231 | def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str): 232 | if result.system: 233 | result_text = f"{result.system}\n{result_text}" 234 | return result_text 235 | -------------------------------------------------------------------------------- /computer_use_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit>=1.38.0 2 | anthropic[bedrock,vertex]>=0.37.1 3 | jsonschema==4.22.0 4 | boto3>=1.28.57 5 | google-auth<3,>=2 6 | -------------------------------------------------------------------------------- /computer_use_demo/streamlit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entrypoint for streamlit, see https://docs.streamlit.io/ 3 | """ 4 | 5 | import asyncio 6 | import base64 7 | import os 8 | import subprocess 9 | from datetime import datetime 10 | from enum import StrEnum 11 | from functools import partial 12 | from pathlib import PosixPath 13 | from typing import cast 14 | 15 | import streamlit as st 16 | from anthropic import APIResponse 17 | from anthropic.types import ( 18 | TextBlock, 19 | ) 20 | from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock 21 | from anthropic.types.tool_use_block import ToolUseBlock 22 | from streamlit.delta_generator import DeltaGenerator 23 | 24 | from computer_use_demo.loop import ( 25 | PROVIDER_TO_DEFAULT_MODEL_NAME, 26 | APIProvider, 27 | sampling_loop, 28 | ) 29 | from computer_use_demo.tools import ToolResult 30 | 31 | from dotenv import load_dotenv 32 | # load env vars 33 | load_dotenv() 34 | 35 | CONFIG_DIR = PosixPath("~/.anthropic").expanduser() 36 | API_KEY_FILE = CONFIG_DIR / "api_key" 37 | STREAMLIT_STYLE = """ 38 | 49 | """ 50 | 51 | 
WARNING_TEXT = "⚠️ Security Alert: Never provide access to sensitive accounts or data, as malicious web content can hijack Claude's behavior" 52 | 53 | 54 | class Sender(StrEnum): 55 | USER = "user" 56 | BOT = "assistant" 57 | TOOL = "tool" 58 | 59 | 60 | def setup_state(): 61 | if "messages" not in st.session_state: 62 | st.session_state.messages = [] 63 | if "api_key" not in st.session_state: 64 | # Try to load API key from file first, then environment 65 | st.session_state.api_key = load_from_storage("api_key") or os.getenv( 66 | "ANTHROPIC_API_KEY", "" 67 | ) 68 | if "provider" not in st.session_state: 69 | st.session_state.provider = ( 70 | os.getenv("API_PROVIDER", "anthropic") or APIProvider.ANTHROPIC 71 | ) 72 | if "provider_radio" not in st.session_state: 73 | st.session_state.provider_radio = st.session_state.provider 74 | if "model" not in st.session_state: 75 | _reset_model() 76 | if "auth_validated" not in st.session_state: 77 | st.session_state.auth_validated = False 78 | if "responses" not in st.session_state: 79 | st.session_state.responses = {} 80 | if "tools" not in st.session_state: 81 | st.session_state.tools = {} 82 | if "only_n_most_recent_images" not in st.session_state: 83 | st.session_state.only_n_most_recent_images = 10 84 | if "custom_system_prompt" not in st.session_state: 85 | st.session_state.custom_system_prompt = load_from_storage("system_prompt") or "" 86 | if "hide_images" not in st.session_state: 87 | st.session_state.hide_images = False 88 | 89 | 90 | def _reset_model(): 91 | st.session_state.model = PROVIDER_TO_DEFAULT_MODEL_NAME[ 92 | cast(APIProvider, st.session_state.provider) 93 | ] 94 | 95 | 96 | async def main(): 97 | """Render loop for streamlit""" 98 | setup_state() 99 | 100 | st.markdown(STREAMLIT_STYLE, unsafe_allow_html=True) 101 | 102 | st.title("Claude Computer Use Minecraft") 103 | st.markdown(""" 104 | This is a demo of using Claude to control a computer running Minecraft. 
105 | Github Repo: [github.com/ObservedObserver/claude-minecraft-use](https://github.com/ObservedObserver/claude-minecraft-use) 106 | """) 107 | 108 | # if not os.getenv("HIDE_WARNING", False): 109 | # st.warning(WARNING_TEXT) 110 | 111 | with st.sidebar: 112 | 113 | def _reset_api_provider(): 114 | if st.session_state.provider_radio != st.session_state.provider: 115 | _reset_model() 116 | st.session_state.provider = st.session_state.provider_radio 117 | st.session_state.auth_validated = False 118 | 119 | provider_options = [option.value for option in APIProvider] 120 | st.radio( 121 | "API Provider", 122 | options=provider_options, 123 | key="provider_radio", 124 | format_func=lambda x: x.title(), 125 | on_change=_reset_api_provider, 126 | ) 127 | 128 | st.text_input("Model", key="model") 129 | 130 | if st.session_state.provider == APIProvider.ANTHROPIC: 131 | st.text_input( 132 | "Anthropic API Key", 133 | type="password", 134 | key="api_key", 135 | on_change=lambda: save_to_storage("api_key", st.session_state.api_key), 136 | ) 137 | 138 | st.number_input( 139 | "Only send N most recent images", 140 | min_value=0, 141 | key="only_n_most_recent_images", 142 | help="To decrease the total tokens sent, remove older screenshots from the conversation", 143 | ) 144 | st.text_area( 145 | "Custom System Prompt Suffix", 146 | key="custom_system_prompt", 147 | help="Additional instructions to append to the system prompt. 
see computer_use_demo/loop.py for the base system prompt.", 148 | on_change=lambda: save_to_storage( 149 | "system_prompt", st.session_state.custom_system_prompt 150 | ), 151 | ) 152 | st.checkbox("Hide screenshots", key="hide_images") 153 | 154 | if st.button("Reset", type="primary"): 155 | with st.spinner("Resetting..."): 156 | st.session_state.clear() 157 | setup_state() 158 | 159 | subprocess.run("pkill Xvfb; pkill tint2", shell=True) # noqa: ASYNC221 160 | await asyncio.sleep(1) 161 | subprocess.run("./start_all.sh", shell=True) # noqa: ASYNC221 162 | 163 | if not st.session_state.auth_validated: 164 | if auth_error := validate_auth( 165 | st.session_state.provider, st.session_state.api_key 166 | ): 167 | st.warning(f"Please resolve the following auth issue:\n\n{auth_error}") 168 | return 169 | else: 170 | st.session_state.auth_validated = True 171 | 172 | chat, http_logs = st.tabs(["Chat", "HTTP Exchange Logs"]) 173 | new_message = st.chat_input( 174 | "Type a message to send to Claude to control the computer..." 
175 | ) 176 | 177 | with chat: 178 | # render past chats 179 | for message in st.session_state.messages: 180 | if isinstance(message["content"], str): 181 | _render_message(message["role"], message["content"]) 182 | elif isinstance(message["content"], list): 183 | for block in message["content"]: 184 | # the tool result we send back to the Anthropic API isn't sufficient to render all details, 185 | # so we store the tool use responses 186 | if isinstance(block, dict) and block["type"] == "tool_result": 187 | _render_message( 188 | Sender.TOOL, st.session_state.tools[block["tool_use_id"]] 189 | ) 190 | else: 191 | _render_message( 192 | message["role"], 193 | cast(BetaTextBlock | BetaToolUseBlock, block), 194 | ) 195 | 196 | # render past http exchanges 197 | for identity, response in st.session_state.responses.items(): 198 | _render_api_response(response, identity, http_logs) 199 | 200 | # render past chats 201 | if new_message: 202 | st.session_state.messages.append( 203 | { 204 | "role": Sender.USER, 205 | "content": [TextBlock(type="text", text=new_message)], 206 | } 207 | ) 208 | _render_message(Sender.USER, new_message) 209 | 210 | try: 211 | most_recent_message = st.session_state["messages"][-1] 212 | except IndexError: 213 | return 214 | 215 | if most_recent_message["role"] is not Sender.USER: 216 | # we don't have a user message to respond to, exit early 217 | return 218 | 219 | with st.spinner("Running Agent..."): 220 | # run the agent sampling loop with the newest message 221 | st.session_state.messages = await sampling_loop( 222 | system_prompt_suffix=st.session_state.custom_system_prompt, 223 | model=st.session_state.model, 224 | provider=st.session_state.provider, 225 | messages=st.session_state.messages, 226 | output_callback=partial(_render_message, Sender.BOT), 227 | tool_output_callback=partial( 228 | _tool_output_callback, tool_state=st.session_state.tools 229 | ), 230 | api_response_callback=partial( 231 | _api_response_callback, 232 | 
tab=http_logs, 233 | response_state=st.session_state.responses, 234 | ), 235 | api_key=st.session_state.api_key, 236 | only_n_most_recent_images=st.session_state.only_n_most_recent_images, 237 | ) 238 | 239 | 240 | def validate_auth(provider: APIProvider, api_key: str | None): 241 | if provider == APIProvider.ANTHROPIC: 242 | if not api_key: 243 | return "Enter your Anthropic API key in the sidebar to continue." 244 | if provider == APIProvider.BEDROCK: 245 | import boto3 246 | 247 | if not boto3.Session().get_credentials(): 248 | return "You must have AWS credentials set up to use the Bedrock API." 249 | if provider == APIProvider.VERTEX: 250 | import google.auth 251 | from google.auth.exceptions import DefaultCredentialsError 252 | 253 | if not os.environ.get("CLOUD_ML_REGION"): 254 | return "Set the CLOUD_ML_REGION environment variable to use the Vertex API." 255 | try: 256 | google.auth.default( 257 | scopes=["https://www.googleapis.com/auth/cloud-platform"], 258 | ) 259 | except DefaultCredentialsError: 260 | return "Your google cloud credentials are not set up correctly." 
261 | 262 | 263 | def load_from_storage(filename: str) -> str | None: 264 | """Load data from a file in the storage directory.""" 265 | try: 266 | file_path = CONFIG_DIR / filename 267 | if file_path.exists(): 268 | data = file_path.read_text().strip() 269 | if data: 270 | return data 271 | except Exception as e: 272 | st.write(f"Debug: Error loading {filename}: {e}") 273 | return None 274 | 275 | 276 | def save_to_storage(filename: str, data: str) -> None: 277 | """Save data to a file in the storage directory.""" 278 | try: 279 | CONFIG_DIR.mkdir(parents=True, exist_ok=True) 280 | file_path = CONFIG_DIR / filename 281 | file_path.write_text(data) 282 | # Ensure only user can read/write the file 283 | file_path.chmod(0o600) 284 | except Exception as e: 285 | st.write(f"Debug: Error saving {filename}: {e}") 286 | 287 | 288 | def _api_response_callback( 289 | response: APIResponse[BetaMessage], 290 | tab: DeltaGenerator, 291 | response_state: dict[str, APIResponse[BetaMessage]], 292 | ): 293 | """ 294 | Handle an API response by storing it to state and rendering it. 
295 | """ 296 | response_id = datetime.now().isoformat() 297 | response_state[response_id] = response 298 | _render_api_response(response, response_id, tab) 299 | 300 | 301 | def _tool_output_callback( 302 | tool_output: ToolResult, tool_id: str, tool_state: dict[str, ToolResult] 303 | ): 304 | """Handle a tool output by storing it to state and rendering it.""" 305 | tool_state[tool_id] = tool_output 306 | _render_message(Sender.TOOL, tool_output) 307 | 308 | 309 | def _render_api_response( 310 | response: APIResponse[BetaMessage], response_id: str, tab: DeltaGenerator 311 | ): 312 | """Render an API response to a streamlit tab""" 313 | with tab: 314 | with st.expander(f"Request/Response ({response_id})"): 315 | newline = "\n\n" 316 | st.markdown( 317 | f"`{response.http_request.method} {response.http_request.url}`{newline}{newline.join(f'`{k}: {v}`' for k, v in response.http_request.headers.items())}" 318 | ) 319 | st.json(response.http_request.read().decode()) 320 | st.markdown( 321 | f"`{response.http_response.status_code}`{newline}{newline.join(f'`{k}: {v}`' for k, v in response.headers.items())}" 322 | ) 323 | st.json(response.http_response.text) 324 | 325 | 326 | def _render_message( 327 | sender: Sender, 328 | message: str | BetaTextBlock | BetaToolUseBlock | ToolResult, 329 | ): 330 | """Convert input from the user or output from the agent to a streamlit message.""" 331 | # streamlit's hotreloading breaks isinstance checks, so we need to check for class names 332 | is_tool_result = not isinstance(message, str) and ( 333 | isinstance(message, ToolResult) 334 | or message.__class__.__name__ == "ToolResult" 335 | or message.__class__.__name__ == "CLIResult" 336 | ) 337 | if not message or ( 338 | is_tool_result 339 | and st.session_state.hide_images 340 | and not hasattr(message, "error") 341 | and not hasattr(message, "output") 342 | ): 343 | return 344 | with st.chat_message(sender): 345 | if is_tool_result: 346 | message = cast(ToolResult, message) 347 | if 
# Public API of the `tools` package.
# Python only honours the lowercase name `__all__`, and its entries must be
# strings naming module attributes (not the objects themselves); otherwise
# `from computer_use_demo.tools import *` ignores the list entirely.
__all__ = [
    "BashTool",
    "CLIResult",
    "ComputerTool",
    "EditTool",
    "ToolCollection",
    "ToolResult",
    "GameTool",
]
15 | 16 | @abstractmethod 17 | def to_params( 18 | self, 19 | ) -> BetaToolUnionParam: 20 | raise NotImplementedError 21 | 22 | 23 | @dataclass(kw_only=True, frozen=True) 24 | class ToolResult: 25 | """Represents the result of a tool execution.""" 26 | 27 | output: str | None = None 28 | error: str | None = None 29 | base64_image: str | None = None 30 | system: str | None = None 31 | 32 | def __bool__(self): 33 | return any(getattr(self, field.name) for field in fields(self)) 34 | 35 | def __add__(self, other: "ToolResult"): 36 | def combine_fields( 37 | field: str | None, other_field: str | None, concatenate: bool = True 38 | ): 39 | if field and other_field: 40 | if concatenate: 41 | return field + other_field 42 | raise ValueError("Cannot combine tool results") 43 | return field or other_field 44 | 45 | return ToolResult( 46 | output=combine_fields(self.output, other.output), 47 | error=combine_fields(self.error, other.error), 48 | base64_image=combine_fields(self.base64_image, other.base64_image, False), 49 | system=combine_fields(self.system, other.system), 50 | ) 51 | 52 | def replace(self, **kwargs): 53 | """Returns a new ToolResult with the given fields replaced.""" 54 | return replace(self, **kwargs) 55 | 56 | 57 | class CLIResult(ToolResult): 58 | """A ToolResult that can be rendered as a CLI output.""" 59 | 60 | 61 | class ToolFailure(ToolResult): 62 | """A ToolResult that represents a failure.""" 63 | 64 | 65 | class ToolError(Exception): 66 | """Raised when a tool encounters an error.""" 67 | 68 | def __init__(self, message): 69 | self.message = message 70 | -------------------------------------------------------------------------------- /computer_use_demo/tools/bash.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | from typing import ClassVar, Literal 4 | 5 | from anthropic.types.beta import BetaToolBash20241022Param 6 | 7 | from .base import BaseAnthropicTool, CLIResult, 
class _BashSession:
    """A session of a bash shell.

    Wraps a single long-lived bash subprocess. Commands are written to its
    stdin followed by an `echo` of a sentinel string; output is collected by
    polling the stdout buffer until that sentinel shows up.
    """

    _started: bool
    _process: asyncio.subprocess.Process

    command: str = "/bin/bash"
    _output_delay: float = 0.2  # seconds
    _timeout: float = 120.0  # seconds
    # Marker echoed after every command to detect that the command finished.
    # NOTE(review): "<>" is very short and could also occur in ordinary command
    # output, which would cut the output short - confirm this is the intended value.
    _sentinel: str = "<>"

    def __init__(self):
        self._started = False
        self._timed_out = False

    async def start(self):
        """Spawn the bash subprocess; does nothing if already started."""
        if self._started:
            return

        self._process = await asyncio.create_subprocess_shell(
            self.command,
            preexec_fn=os.setsid,
            shell=True,
            bufsize=0,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )

        self._started = True

    def stop(self):
        """Terminate the bash shell.

        Raises:
            ToolError: if the session was never started.
        """
        if not self._started:
            raise ToolError("Session has not started.")
        # process already exited on its own: nothing to terminate
        if self._process.returncode is not None:
            return
        self._process.terminate()

    async def run(self, command: str):
        """Execute a command in the bash shell.

        Returns a `CLIResult` with the command's stdout and stderr, or a
        `ToolResult` asking for a restart if bash has already exited.

        Raises:
            ToolError: if the session was not started, if a previous command
                timed out, or if this command does not finish within
                `_timeout` seconds.
        """
        if not self._started:
            raise ToolError("Session has not started.")
        if self._process.returncode is not None:
            return ToolResult(
                system="tool must be restarted",
                error=f"bash has exited with returncode {self._process.returncode}",
            )
        # once a command has timed out, every later call fails until a new session is created
        if self._timed_out:
            raise ToolError(
                f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
            )

        # we know these are not None because we created the process with PIPEs
        assert self._process.stdin
        assert self._process.stdout
        assert self._process.stderr

        # send command to the process
        self._process.stdin.write(
            command.encode() + f"; echo '{self._sentinel}'\n".encode()
        )
        await self._process.stdin.drain()

        # read output from the process, until the sentinel is found
        try:
            async with asyncio.timeout(self._timeout):
                while True:
                    # poll: give the command time to produce output before checking the buffer
                    await asyncio.sleep(self._output_delay)
                    # if we read directly from stdout/stderr, it will wait forever for
                    # EOF. use the StreamReader buffer directly instead.
                    output = self._process.stdout._buffer.decode()  # pyright: ignore[reportAttributeAccessIssue]
                    if self._sentinel in output:
                        # strip the sentinel and break
                        output = output[: output.index(self._sentinel)]
                        break
        except asyncio.TimeoutError:
            self._timed_out = True
            raise ToolError(
                f"timed out: bash has not returned in {self._timeout} seconds and must be restarted",
            ) from None

        # drop a single trailing newline from stdout
        if output.endswith("\n"):
            output = output[:-1]

        # stderr is only collected after the sentinel has been seen on stdout
        error = self._process.stderr._buffer.decode()  # pyright: ignore[reportAttributeAccessIssue]
        if error.endswith("\n"):
            error = error[:-1]

        # clear the buffers so that the next output can be read correctly
        self._process.stdout._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
        self._process.stderr._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]

        return CLIResult(output=output, error=error)
class ToolCollection:
    """Holds a fixed set of anthropic-defined tools and dispatches calls by name."""

    def __init__(self, *tools: BaseAnthropicTool):
        self.tools = tools
        # Index each tool under the name it advertises in its API params.
        self.tool_map = {}
        for tool in tools:
            self.tool_map[tool.to_params()["name"]] = tool

    def to_params(
        self,
    ) -> list[BetaToolUnionParam]:
        """Return the API parameter dicts of all tools, in registration order."""
        params = []
        for tool in self.tools:
            params.append(tool.to_params())
        return params

    async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
        """Invoke the tool registered as `name` with `tool_input` as keyword arguments.

        An unknown name, or a `ToolError` raised by the tool, is reported as a
        `ToolFailure` instead of propagating.
        """
        selected = self.tool_map.get(name)
        if selected:
            try:
                return await selected(**tool_input)
            except ToolError as exc:
                return ToolFailure(error=exc.message)
        return ToolFailure(error=f"Tool {name} is invalid")
class ComputerTool(BaseAnthropicTool):
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
    The tool parameters are defined by Anthropic and are not editable.

    Mouse and keyboard actions are issued through the `cliclick` command line
    (stored in `self.xdotool`), screenshots through `screencapture`, and both
    are executed via `self.shell`.
    """

    name: Literal["computer"] = "computer"
    api_type: Literal["computer_20241022"] = "computer_20241022"
    width: int
    height: int
    display_num: int | None

    _screenshot_delay = 2.0
    _scaling_enabled = True

    # cliclick arguments for click actions; "." means "at the current cursor position".
    _CLICK_ARGS = {
        "left_click": "c:.",
        "right_click": "rc:.",
        "middle_click": "mc:.",
        "double_click": "dc:.",
    }

    # cliclick arguments for the minecraft hold/release actions.
    # NOTE(review): these are passed through unchanged from the original
    # implementation; verify `m:d` / `m:u` and `kp:`/`ku:` with these key names
    # against the cliclick command reference - they may not hold keys as intended.
    _HOLD_RELEASE_ARGS = {
        "hold_down_left_button": "m:d",
        "release_left_button": "m:u",
        "hold_down_arrow_up": "kp:up",
        "release_arrow_up": "ku:up",
        "hold_down_arrow_down": "kp:down",
        "release_arrow_down": "ku:down",
        "hold_down_arrow_left": "kp:left",
        "release_arrow_left": "ku:left",
        "hold_down_arrow_right": "kp:right",
        "release_arrow_right": "ku:right",
    }

    @property
    def options(self) -> ComputerToolOptions:
        """Display size (scaled down to the API-facing resolution) and display number."""
        width, height = self.scale_coordinates(
            ScalingSource.COMPUTER, self.width, self.height
        )
        return {
            "display_width_px": width,
            "display_height_px": height,
            "display_number": self.display_num,
        }

    def to_params(self) -> BetaToolComputerUse20241022Param:
        """Return the tool definition sent to the API."""
        return {"name": self.name, "type": self.api_type, **self.options}

    def __init__(self):
        """Read the screen geometry from the WIDTH / HEIGHT / DISPLAY_NUM environment variables."""
        super().__init__()

        self.width = int(os.getenv("WIDTH") or 0)
        self.height = int(os.getenv("HEIGHT") or 0)
        assert self.width and self.height, "WIDTH, HEIGHT must be set"
        if (display_num := os.getenv("DISPLAY_NUM")) is not None:
            self.display_num = int(display_num)
            self._display_prefix = f"DISPLAY=:{self.display_num} "
        else:
            self.display_num = None
            self._display_prefix = ""

        # attribute name kept for compatibility; the command actually run is cliclick
        self.xdotool = f"{self._display_prefix}cliclick"

    async def __call__(
        self,
        *,
        action: Action,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """Perform `action` and return a `ToolResult`.

        Args:
            action: the action to perform (see `Action`).
            text: required for `key` and `type`, rejected for every other action.
            coordinate: required (x, y) pair in API coordinates for
                `mouse_move` and `left_click_drag`, rejected otherwise.

        Raises:
            ToolError: on a missing/unexpected/invalid argument, an unparsable
                cursor position, or an unknown action.
        """
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            # JSON input arrives as a list, but the signature promises a tuple: accept both
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ToolError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) and i >= 0 for i in coordinate):
                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

            x, y = self.scale_coordinates(
                ScalingSource.API, coordinate[0], coordinate[1]
            )

            if action == "mouse_move":
                return await self.shell(f"{self.xdotool} m:{x},{y}")
            # drag from the current cursor position to (x, y)
            return await self.shell(f"{self.xdotool} dd:. dm:{x},{y} du:{x},{y}")

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                # ToolError only takes a positional message
                raise ToolError(f"{text} must be a string")

            if action == "key":
                # quote the key name: it is model-supplied and goes through a shell
                return await self.shell(f"{self.xdotool} kp:{shlex.quote(text)}")

            # "type": send the text in groups, then take a single screenshot at the end
            results: list[ToolResult] = []
            for chunk in chunks(text, TYPING_GROUP_SIZE):
                cmd = f"{self.xdotool} t:{shlex.quote(chunk)} -w {TYPING_DELAY_MS}"
                results.append(await self.shell(cmd, take_screenshot=False))
            screenshot_base64 = (await self.screenshot()).base64_image
            return ToolResult(
                output="".join(result.output or "" for result in results),
                error="".join(result.error or "" for result in results),
                base64_image=screenshot_base64,
            )

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")

            if action == "screenshot":
                return await self.screenshot()

            if action == "cursor_position":
                result = await self.shell(
                    f"{self.xdotool} p",
                    take_screenshot=False,
                )
                output = result.output or ""
                try:
                    # expected form is "x,y"; parse once
                    raw_x, raw_y = (int(part) for part in output.split(",")[:2])
                except ValueError:
                    raise ToolError(
                        f"Could not parse cursor position from output: {output!r}"
                    ) from None
                x, y = self.scale_coordinates(ScalingSource.COMPUTER, raw_x, raw_y)
                return result.replace(output=f"X={x},Y={y}")

            return await self.shell(f"{self.xdotool} {self._CLICK_ARGS[action]}")

        # minecraft
        if isinstance(action, str) and action in self._HOLD_RELEASE_ARGS:
            return await self.shell(
                f"{self.xdotool} {self._HOLD_RELEASE_ARGS[action]}"
            )

        raise ToolError(f"Invalid action: {action}")

    async def screenshot(self):
        """Take a screenshot of the current screen and return the base64 encoded image.

        The image is written to a uniquely named PNG under `OUTPUT_DIR` and,
        when scaling is enabled, resized in place to the scaled resolution.

        Raises:
            ToolError: if no screenshot file was produced.
        """
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)
        path = output_dir / f"screenshot_{uuid4().hex}.png"

        screenshot_cmd = f"{self._display_prefix}screencapture -f {path} -p"

        result = await self.shell(screenshot_cmd, take_screenshot=False)
        if self._scaling_enabled:
            x, y = self.scale_coordinates(
                ScalingSource.COMPUTER, self.width, self.height
            )
            await self.shell(
                f"convert {path} -resize {x}x{y}! {path}", take_screenshot=False
            )

        if path.exists():
            return result.replace(
                base64_image=base64.b64encode(path.read_bytes()).decode()
            )
        raise ToolError(f"Failed to take screenshot: {result.error}")

    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot."""
        _, stdout, stderr = await run(command)
        base64_image = None

        if take_screenshot:
            # delay to let things settle before taking a screenshot
            await asyncio.sleep(self._screenshot_delay)
            base64_image = (await self.screenshot()).base64_image

        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)

    def scale_coordinates(self, source: ScalingSource, x: int, y: int):
        """Scale coordinates to a target maximum resolution.

        `ScalingSource.API` coordinates are scaled up to real screen pixels;
        `ScalingSource.COMPUTER` coordinates are scaled down to the target
        resolution. Coordinates are returned unchanged when scaling is
        disabled or no target in `MAX_SCALING_TARGETS` applies.

        Raises:
            ToolError: if API coordinates exceed the screen size.
        """
        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                # only scale down, never up
                if dimension["width"] < self.width:
                    target_dimension = dimension
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == ScalingSource.API:
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
async def __call__(
    self,
    *,
    command: Command,
    path: str,
    file_text: str | None = None,
    view_range: list[int] | None = None,
    old_str: str | None = None,
    new_str: str | None = None,
    insert_line: int | None = None,
    **kwargs,
):
    """Validate `path` for `command` and dispatch to the matching editor operation.

    Args:
        command: one of the `Command` literals.
        path: absolute path of the file or directory to operate on.
        file_text: full content for `create`; may be empty, but must be given.
        view_range: optional `[first, last]` line range for `view`.
        old_str: text to replace for `str_replace`; must be non-empty.
        new_str: replacement text for `str_replace` (optional), or the text
            to insert for `insert` (required, may be empty to add a blank line).
        insert_line: line number after which `insert` adds `new_str`.

    Raises:
        ToolError: if the path/command combination is invalid, a required
            parameter is missing, or the command is not recognized.
    """
    _path = Path(path)
    self.validate_path(command, _path)
    if command == "view":
        return await self.view(_path, view_range)
    elif command == "create":
        # only a missing value is an error: an empty string creates an empty file
        if file_text is None:
            raise ToolError("Parameter `file_text` is required for command: create")
        self.write_file(_path, file_text)
        self._file_history[_path].append(file_text)
        return ToolResult(output=f"File created successfully at: {_path}")
    elif command == "str_replace":
        # an empty old_str would match everywhere, so it is rejected as well
        if not old_str:
            raise ToolError(
                "Parameter `old_str` is required for command: str_replace"
            )
        return self.str_replace(_path, old_str, new_str)
    elif command == "insert":
        if insert_line is None:
            raise ToolError(
                "Parameter `insert_line` is required for command: insert"
            )
        # only a missing value is an error: an empty string inserts a blank line
        if new_str is None:
            raise ToolError("Parameter `new_str` is required for command: insert")
        return self.insert(_path, insert_line, new_str)
    elif command == "undo_edit":
        return self.undo_edit(_path)
    raise ToolError(
        f'Unrecognized command {command}. The allowed commands for the {self.name} tool are: {", ".join(get_args(Command))}'
    )
def str_replace(self, path: Path, old_str: str, new_str: str | None):
    """Replace the single occurrence of `old_str` in the file at `path` with `new_str`.

    Tabs are expanded in the file content and in both strings before
    matching. A missing `new_str` deletes `old_str`. The previous content is
    pushed onto the file's history, and the returned `CLIResult` contains a
    numbered snippet around the edit.

    Raises:
        ToolError: if `old_str` does not occur, or occurs more than once.
    """
    original = self.read_file(path).expandtabs()
    old_str = old_str.expandtabs()
    new_str = "" if new_str is None else new_str.expandtabs()

    # old_str has to identify exactly one location
    match_count = original.count(old_str)
    if match_count == 0:
        raise ToolError(
            f"No replacement was performed, old_str `{old_str}` did not appear verbatim in {path}."
        )
    if match_count > 1:
        lines = []
        for number, text in enumerate(original.split("\n"), start=1):
            if old_str in text:
                lines.append(number)
        raise ToolError(
            f"No replacement was performed. Multiple occurrences of old_str `{old_str}` in lines {lines}. Please ensure it is unique"
        )

    updated = original.replace(old_str, new_str)

    # persist first, then remember the pre-edit content for undo
    self.write_file(path, updated)
    self._file_history[path].append(original)

    # 0-based index of the line where the replacement starts
    before_match, _, _ = original.partition(old_str)
    replacement_line = before_match.count("\n")
    start_line = max(0, replacement_line - SNIPPET_LINES)
    end_line = replacement_line + SNIPPET_LINES + new_str.count("\n")
    updated_lines = updated.split("\n")
    snippet = "\n".join(updated_lines[start_line : end_line + 1])

    parts = [
        f"The file {path} has been edited. ",
        self._make_output(snippet, f"a snippet of {path}", start_line + 1),
        "Review the changes and make sure they are as expected. Edit the file again if necessary.",
    ]
    return CLIResult(output="".join(parts))
It should be within the range of lines of the file: {[0, n_lines_file]}" 214 | ) 215 | 216 | new_str_lines = new_str.split("\n") 217 | new_file_text_lines = ( 218 | file_text_lines[:insert_line] 219 | + new_str_lines 220 | + file_text_lines[insert_line:] 221 | ) 222 | snippet_lines = ( 223 | file_text_lines[max(0, insert_line - SNIPPET_LINES) : insert_line] 224 | + new_str_lines 225 | + file_text_lines[insert_line : insert_line + SNIPPET_LINES] 226 | ) 227 | 228 | new_file_text = "\n".join(new_file_text_lines) 229 | snippet = "\n".join(snippet_lines) 230 | 231 | self.write_file(path, new_file_text) 232 | self._file_history[path].append(file_text) 233 | 234 | success_msg = f"The file {path} has been edited. " 235 | success_msg += self._make_output( 236 | snippet, 237 | "a snippet of the edited file", 238 | max(1, insert_line - SNIPPET_LINES + 1), 239 | ) 240 | success_msg += "Review the changes and make sure they are as expected (correct indentation, no duplicate lines, etc). Edit the file again if necessary." 241 | return CLIResult(output=success_msg) 242 | 243 | def undo_edit(self, path: Path): 244 | """Implement the undo_edit command.""" 245 | if not self._file_history[path]: 246 | raise ToolError(f"No edit history found for {path}.") 247 | 248 | old_text = self._file_history[path].pop() 249 | self.write_file(path, old_text) 250 | 251 | return CLIResult( 252 | output=f"Last edit to {path} undone successfully. 
def _make_output(
    self,
    file_content: str,
    file_descriptor: str,
    init_line: int = 1,
    expand_tabs: bool = True,
):
    """Format `file_content` as numbered lines, in the manner of `cat -n`.

    The content is passed through `maybe_truncate`, optionally tab-expanded,
    and each line is prefixed with its number (starting at `init_line`,
    right-aligned to width 6) and a tab. A header naming `file_descriptor`
    is prepended.
    """
    text = maybe_truncate(file_content)
    if expand_tabs:
        text = text.expandtabs()

    numbered = []
    for offset, line in enumerate(text.split("\n")):
        numbered.append(f"{offset + init_line:6}\t{line}")

    header = f"Here's the result of running `cat -n` on {file_descriptor}:\n"
    return header + "\n".join(numbered) + "\n"
"mouse_move", 24 | "left_click", 25 | "left_click_drag", 26 | "right_click", 27 | "middle_click", 28 | "double_click", 29 | "screenshot", 30 | "cursor_position", 31 | # minecraft 32 | "left_down", 33 | "left_up", 34 | "hold_arrow_up", 35 | "release_arrow_up", 36 | "hold_arrow_down", 37 | "release_arrow_down", 38 | "hold_arrow_left", 39 | "release_arrow_left", 40 | "hold_arrow_right", 41 | "release_arrow_right", 42 | ] 43 | 44 | 45 | class Resolution(TypedDict): 46 | width: int 47 | height: int 48 | 49 | 50 | # sizes above XGA/WXGA are not recommended (see README.md) 51 | # scale down to one of these targets if ComputerTool._scaling_enabled is set 52 | MAX_SCALING_TARGETS: dict[str, Resolution] = { 53 | "XGA": Resolution(width=1024, height=768), # 4:3 54 | "WXGA": Resolution(width=1280, height=800), # 16:10 55 | "FWXGA": Resolution(width=1366, height=768), # ~16:9 56 | } 57 | 58 | 59 | class ScalingSource(StrEnum): 60 | COMPUTER = "computer" 61 | API = "api" 62 | 63 | 64 | class ComputerToolOptions(TypedDict): 65 | display_height_px: int 66 | display_width_px: int 67 | display_number: int | None 68 | 69 | # 70 | # You may be asked to play a game of minecraft. Here are some instructions: 71 | # * For movement, WASD is disabled. Instead, use the hold_arrow_left, release_arrow_left, hold_arrow_right, release_arrow_right, hold_arrow_up, release_arrow_up, hold_arrow_down, release_arrow_down tool calls. 72 | # * You can control the minecraft game with your computer tool. Use the minecraft_button tool call to hold down the left mouse button, and the left_up tool call to release it. 73 | # * Use the hold_arrow_up tool call to hold down the up arrow key, and the release_arrow_up tool call to release it. 74 | # * Use the hold_arrow_down tool call to hold down the down arrow key, and the release_arrow_down tool call to release it. 75 | # * Use the hold_arrow_left tool call to hold down the left arrow key, and the release_arrow_left tool call to release it. 
76 | # * Use the hold_arrow_right tool call to hold down the right arrow key, and the release_arrow_right tool call to release it. 77 | # 78 | def chunks(s: str, chunk_size: int) -> list[str]: 79 | return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)] 80 | 81 | class GameTool(BaseAnthropicTool): 82 | """ 83 | A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer. Also allows the agent to control a minecraft game. 84 | The tool parameters are defined by Anthropic and are not editable. 85 | """ 86 | 87 | name: Literal["computer"] = "computer" 88 | api_type: Literal["computer_20241022"] = "computer_20241022" 89 | width: int 90 | height: int 91 | description: str = """ 92 | Use a mouse and keyboard to interact with a computer, and take screenshots. 93 | * This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications. 94 | * Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try taking another screenshot. 95 | * The screen's resolution is {{ display_width_px }}x{{ display_height_px }}. 96 | * The display number is {{ display_number }} 97 | * Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor. 98 | * If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click. 99 | * Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked. 
100 | In Minecraft, the standard controls are: 101 | space: Jump 102 | mouse_move: Look around 103 | left_down: Break blocks/attack 104 | right click: Place blocks/interact 105 | For movement in minecraft, WASD is disabled. Instead, use the hold_arrow_left, release_arrow_left, hold_arrow_right, release_arrow_right, hold_arrow_up, release_arrow_up, hold_arrow_down, release_arrow_down tool calls. 106 | """ 107 | input_schema = { 108 | "properties": { 109 | "action": { 110 | "description": """The action to perform. The available actions are: 111 | * `key`: Press a key or key-combination on the keyboard. 112 | - This supports cliclick's `key` syntax. 113 | - All possible keys are: arrow-up, brightness-down, brightness-up, delete, end, enter, esc, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, fwd-delete, home, keys-light-down, keys-light-toggle, keys-light-up, mute, num-0, num-1, num-2, num-3, num-4, num-5, num-6, num-7, num-8, num-9, num-clear, num-divide, num-enter, num-equals, num-minus, num-multiply, num-plus, page-down, page-up, play-next, play-pause, play-previous, return, space, tab, volume-down, volume-up 114 | * `type`: Type a string of text on the keyboard. 115 | * `cursor_position`: Get the current (x, y) pixel coordinate of the cursor on the screen. 116 | * `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen. 117 | * `left_click`: Click the left mouse button. 118 | * `right_click`: Click the right mouse button. 119 | * `middle_click`: Click the middle mouse button. 120 | * `double_click`: Double-click the left mouse button. 121 | * `screenshot`: Take a screenshot of the screen. 
122 | """, 123 | "enum": [ 124 | "key", 125 | "type", 126 | "mouse_move", 127 | "left_click", 128 | # "left_click_drag", 129 | "right_click", 130 | "middle_click", 131 | "double_click", 132 | "screenshot", 133 | "cursor_position", 134 | # minecraft 135 | "left_down", 136 | "left_up", 137 | "hold_arrow_up", 138 | "release_arrow_up", 139 | "hold_arrow_down", 140 | "release_arrow_down", 141 | "hold_arrow_left", 142 | "release_arrow_left", 143 | "hold_arrow_right", 144 | "release_arrow_right", 145 | ], 146 | "type": "string", 147 | }, 148 | "coordinate": { 149 | "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=mouse_move` and `action=left_click_drag`.", 150 | "type": "array", 151 | }, 152 | "text": { 153 | "description": "Required only by `action=type` and `action=key`.", 154 | "type": "string", 155 | }, 156 | }, 157 | "required": ["action"], 158 | "type": "object", 159 | } 160 | 161 | display_num: int | None 162 | 163 | _screenshot_delay = 2.0 164 | _scaling_enabled = True 165 | 166 | @property 167 | def options(self) -> ComputerToolOptions: 168 | width, height = self.scale_coordinates( 169 | ScalingSource.COMPUTER, self.width, self.height 170 | ) 171 | return { 172 | "display_width_px": width, 173 | "display_height_px": height, 174 | "display_number": self.display_num, 175 | } 176 | 177 | def to_params(self) -> BetaToolComputerUse20241022Param: 178 | return {"name": self.name, "type": self.api_type, **self.options} 179 | 180 | def __init__(self): 181 | super().__init__() 182 | 183 | self.width = int(os.getenv("WIDTH") or 0) 184 | self.height = int(os.getenv("HEIGHT") or 0) 185 | assert self.width and self.height, "WIDTH, HEIGHT must be set" 186 | if (display_num := os.getenv("DISPLAY_NUM")) is not None: 187 | self.display_num = int(display_num) 188 | self._display_prefix = f"DISPLAY=:{self.display_num} " 189 | else: 190 | self.display_num = None 191 | 
self._display_prefix = "" 192 | 193 | async def __call__( 194 | self, 195 | *, 196 | action: Action, 197 | text: str | None = None, 198 | coordinate: tuple[int, int] | None = None, 199 | **kwargs, 200 | ): 201 | if action in ("mouse_move", "left_click_drag"): 202 | if coordinate is None: 203 | raise ToolError(f"coordinate is required for {action}") 204 | if text is not None: 205 | raise ToolError(f"text is not accepted for {action}") 206 | if not isinstance(coordinate, list) or len(coordinate) != 2: 207 | raise ToolError(f"{coordinate} must be a tuple of length 2") 208 | if not all(isinstance(i, int) and i >= 0 for i in coordinate): 209 | raise ToolError(f"{coordinate} must be a tuple of non-negative ints") 210 | 211 | x, y = self.scale_coordinates( 212 | ScalingSource.API, coordinate[0], coordinate[1] 213 | ) 214 | 215 | if action == "mouse_move": 216 | pyautogui.moveTo(x, y) 217 | return ToolResult(output=f"Moved mouse to {x},{y}") 218 | elif action == "left_click_drag": 219 | pyautogui.dragTo(x, y) 220 | return ToolResult(output=f"Dragged mouse to {x},{y}") 221 | 222 | if action in ("key", "type"): 223 | if text is None: 224 | raise ToolError(f"text is required for {action}") 225 | if coordinate is not None: 226 | raise ToolError(f"coordinate is not accepted for {action}") 227 | if not isinstance(text, str): 228 | raise ToolError(output=f"{text} must be a string") 229 | 230 | if action == "key": 231 | if text.lower() in 'wasd': 232 | pyautogui.keyDown(text.lower()) 233 | pyautogui.sleep(1) 234 | pyautogui.keyUp(text.lower()) 235 | elif text.lower() in 'abcdefghijklmnopqrstuvwxyz': 236 | pyautogui.press(text.lower()) 237 | elif text in "1234567890": 238 | pyautogui.press(text) 239 | elif text.lower() == "return": 240 | pyautogui.press('enter') 241 | elif text.lower() in ("right-arrow", "right", "left-arrow", "left", "up-arrow", "up", "down-arrow", "down"): 242 | pyautogui.press(text.split('-')[0].lower()) 243 | else: 244 | pyautogui.press(text.lower()) 245 | 
return ToolResult(output=f"Pressed key: {text}") 246 | elif action == "type": 247 | for chunk in chunks(text, TYPING_GROUP_SIZE): 248 | pyautogui.write(chunk, interval=TYPING_DELAY_MS/1000) 249 | screenshot_base64 = (await self.screenshot()).base64_image 250 | return ToolResult( 251 | output=f"Typed: {text}", 252 | base64_image=screenshot_base64, 253 | ) 254 | 255 | if action in ( 256 | "left_click", 257 | "right_click", 258 | "double_click", 259 | "middle_click", 260 | "screenshot", 261 | "cursor_position", 262 | ): 263 | if text is not None: 264 | raise ToolError(f"text is not accepted for {action}") 265 | if coordinate is not None: 266 | raise ToolError(f"coordinate is not accepted for {action}") 267 | 268 | if action == "screenshot": 269 | return await self.screenshot() 270 | elif action == "cursor_position": 271 | x, y = pyautogui.position() 272 | x, y = self.scale_coordinates( 273 | ScalingSource.COMPUTER, x, y 274 | ) 275 | return ToolResult(output=f"X={x},Y={y}") 276 | else: 277 | click_map = { 278 | "left_click": pyautogui.click, 279 | "right_click": pyautogui.rightClick, 280 | "middle_click": pyautogui.middleClick, 281 | "double_click": pyautogui.doubleClick, 282 | } 283 | click_map[action]() 284 | return ToolResult(output=f"Performed {action}") 285 | 286 | # minecraft 287 | if action == "left_down": 288 | pyautogui.mouseDown() 289 | elif action == "left_up": 290 | pyautogui.mouseUp() 291 | elif action.startswith("hold_arrow_"): 292 | pyautogui.keyDown(action.split('_')[-1]) 293 | elif action.startswith("release_arrow_"): 294 | pyautogui.keyUp(action.split('_')[-1]) 295 | else: 296 | raise ToolError(f"Invalid action: {action}") 297 | 298 | return ToolResult(output=f"Performed {action}") 299 | 300 | async def screenshot(self): 301 | """Take a screenshot of the current screen and return the base64 encoded image.""" 302 | output_dir = Path(OUTPUT_DIR) 303 | output_dir.mkdir(parents=True, exist_ok=True) 304 | path = output_dir / 
f"screenshot_{uuid4().hex}.png" 305 | 306 | screenshot_cmd = f"{self._display_prefix}screencapture -C {path} -p" 307 | 308 | # resize the screenshot to default width and height 309 | await self.shell(f"convert {path} -resize {self.width}x{self.height}! {path}", take_screenshot=False) 310 | 311 | result = await self.shell(screenshot_cmd, take_screenshot=False) 312 | if self._scaling_enabled: 313 | x, y = self.scale_coordinates( 314 | ScalingSource.COMPUTER, self.width, self.height 315 | ) 316 | await self.shell( 317 | f"convert {path} -resize {x}x{y}! {path}", take_screenshot=False 318 | ) 319 | 320 | if path.exists(): 321 | return result.replace( 322 | base64_image=base64.b64encode(path.read_bytes()).decode() 323 | ) 324 | raise ToolError(f"Failed to take screenshot: {result.error}") 325 | 326 | async def shell(self, command: str, take_screenshot=True) -> ToolResult: 327 | """Run a shell command and return the output, error, and optionally a screenshot.""" 328 | _, stdout, stderr = await run(command) 329 | base64_image = None 330 | 331 | if take_screenshot: 332 | # delay to let things settle before taking a screenshot 333 | await asyncio.sleep(self._screenshot_delay) 334 | base64_image = (await self.screenshot()).base64_image 335 | 336 | return ToolResult(output=stdout, error=stderr, base64_image=base64_image) 337 | 338 | def scale_coordinates(self, source: ScalingSource, x: int, y: int): 339 | """Scale coordinates to a target maximum resolution.""" 340 | if not self._scaling_enabled: 341 | return x, y 342 | ratio = self.width / self.height 343 | target_dimension = None 344 | for dimension in MAX_SCALING_TARGETS.values(): 345 | # allow some error in the aspect ratio - not ratios are exactly 16:9 346 | if abs(dimension["width"] / dimension["height"] - ratio) < 0.02: 347 | if dimension["width"] < self.width: 348 | target_dimension = dimension 349 | break 350 | if target_dimension is None: 351 | return x, y 352 | # should be less than 1 353 | x_scaling_factor = 
target_dimension["width"] / self.width 354 | y_scaling_factor = target_dimension["height"] / self.height 355 | if source == ScalingSource.API: 356 | if x > self.width or y > self.height: 357 | raise ToolError(f"Coordinates {x}, {y} are out of bounds") 358 | # scale up 359 | return round(x / x_scaling_factor), round(y / y_scaling_factor) 360 | # scale down 361 | return round(x * x_scaling_factor), round(y * y_scaling_factor) 362 | 363 | -------------------------------------------------------------------------------- /computer_use_demo/tools/run.py: -------------------------------------------------------------------------------- 1 | """Utility to run shell commands asynchronously with a timeout.""" 2 | 3 | import asyncio 4 | 5 | TRUNCATED_MESSAGE: str = "To save on context only part of this file has been shown to you. You should retry this tool after you have searched inside the file with `grep -n` in order to find the line numbers of what you are looking for." 6 | MAX_RESPONSE_LEN: int = 16000 7 | 8 | 9 | def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN): 10 | """Truncate content and append a notice if content exceeds the specified length.""" 11 | return ( 12 | content 13 | if not truncate_after or len(content) <= truncate_after 14 | else content[:truncate_after] + TRUNCATED_MESSAGE 15 | ) 16 | 17 | 18 | async def run( 19 | cmd: str, 20 | timeout: float | None = 120.0, # seconds 21 | truncate_after: int | None = MAX_RESPONSE_LEN, 22 | ): 23 | """Run a shell command asynchronously with a timeout.""" 24 | process = await asyncio.create_subprocess_shell( 25 | cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE 26 | ) 27 | 28 | try: 29 | stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout) 30 | return ( 31 | process.returncode or 0, 32 | maybe_truncate(stdout.decode(), truncate_after=truncate_after), 33 | maybe_truncate(stderr.decode(), truncate_after=truncate_after), 34 | ) 35 | except 
asyncio.TimeoutError as exc: 36 | try: 37 | process.kill() 38 | except ProcessLookupError: 39 | pass 40 | raise TimeoutError( 41 | f"Command '{cmd}' timed out after {timeout} seconds" 42 | ) from exc 43 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | -r computer_use_demo/requirements.txt 2 | ruff==0.6.7 3 | pre-commit==3.8.0 4 | pytest==8.3.3 5 | pytest-asyncio==0.23.6 6 | -------------------------------------------------------------------------------- /image/.config/tint2/applications/firefox-custom.desktop: -------------------------------------------------------------------------------- 1 | [Desktop Entry] 2 | Name=Firefox Custom 3 | Comment=Open Firefox with custom URL 4 | Exec=firefox-esr -new-window 5 | Icon=firefox-esr 6 | Terminal=false 7 | Type=Application 8 | Categories=Network;WebBrowser; 9 | -------------------------------------------------------------------------------- /image/.config/tint2/applications/gedit.desktop: -------------------------------------------------------------------------------- 1 | [Desktop Entry] 2 | Name=Gedit 3 | Comment=Open gedit 4 | Exec=gedit 5 | Icon=text-editor-symbolic 6 | Terminal=false 7 | Type=Application 8 | Categories=TextEditor; 9 | -------------------------------------------------------------------------------- /image/.config/tint2/applications/terminal.desktop: -------------------------------------------------------------------------------- 1 | [Desktop Entry] 2 | Name=Terminal 3 | Comment=Open Terminal 4 | Exec=xterm 5 | Icon=utilities-terminal 6 | Terminal=false 7 | Type=Application 8 | Categories=System;TerminalEmulator; 9 | -------------------------------------------------------------------------------- /image/.config/tint2/tint2rc: -------------------------------------------------------------------------------- 1 | #------------------------------------- 2 | # Panel 3 | 
panel_items = TL 4 | panel_size = 100% 60 5 | panel_margin = 0 0 6 | panel_padding = 2 0 2 7 | panel_background_id = 1 8 | wm_menu = 0 9 | panel_dock = 0 10 | panel_position = bottom center horizontal 11 | panel_layer = top 12 | panel_monitor = all 13 | panel_shrink = 0 14 | autohide = 0 15 | autohide_show_timeout = 0 16 | autohide_hide_timeout = 0.5 17 | autohide_height = 2 18 | strut_policy = follow_size 19 | panel_window_name = tint2 20 | disable_transparency = 1 21 | mouse_effects = 1 22 | font_shadow = 0 23 | mouse_hover_icon_asb = 100 0 10 24 | mouse_pressed_icon_asb = 100 0 0 25 | scale_relative_to_dpi = 0 26 | scale_relative_to_screen_height = 0 27 | 28 | #------------------------------------- 29 | # Taskbar 30 | taskbar_mode = single_desktop 31 | taskbar_hide_if_empty = 0 32 | taskbar_padding = 0 0 2 33 | taskbar_background_id = 0 34 | taskbar_active_background_id = 0 35 | taskbar_name = 1 36 | taskbar_hide_inactive_tasks = 0 37 | taskbar_hide_different_monitor = 0 38 | taskbar_hide_different_desktop = 0 39 | taskbar_always_show_all_desktop_tasks = 0 40 | taskbar_name_padding = 4 2 41 | taskbar_name_background_id = 0 42 | taskbar_name_active_background_id = 0 43 | taskbar_name_font_color = #e3e3e3 100 44 | taskbar_name_active_font_color = #ffffff 100 45 | taskbar_distribute_size = 0 46 | taskbar_sort_order = none 47 | task_align = left 48 | 49 | #------------------------------------- 50 | # Launcher 51 | launcher_padding = 4 8 4 52 | launcher_background_id = 0 53 | launcher_icon_background_id = 0 54 | launcher_icon_size = 48 55 | launcher_icon_asb = 100 0 0 56 | launcher_icon_theme_override = 0 57 | startup_notifications = 1 58 | launcher_tooltip = 1 59 | 60 | #------------------------------------- 61 | # Launcher icon 62 | launcher_item_app = /usr/share/applications/libreoffice-calc.desktop 63 | launcher_item_app = /home/computeruse/.config/tint2/applications/terminal.desktop 64 | launcher_item_app = 
/home/computeruse/.config/tint2/applications/firefox-custom.desktop 65 | launcher_item_app = /usr/share/applications/xpaint.desktop 66 | launcher_item_app = /usr/share/applications/xpdf.desktop 67 | launcher_item_app = /home/computeruse/.config/tint2/applications/gedit.desktop 68 | launcher_item_app = /usr/share/applications/galculator.desktop 69 | 70 | #------------------------------------- 71 | # Background definitions 72 | # ID 1 73 | rounded = 0 74 | border_width = 0 75 | background_color = #000000 60 76 | border_color = #000000 30 77 | 78 | # ID 2 79 | rounded = 4 80 | border_width = 1 81 | background_color = #777777 20 82 | border_color = #777777 30 83 | 84 | # ID 3 85 | rounded = 4 86 | border_width = 1 87 | background_color = #777777 20 88 | border_color = #ffffff 40 89 | 90 | # ID 4 91 | rounded = 4 92 | border_width = 1 93 | background_color = #aa4400 100 94 | border_color = #aa7733 100 95 | 96 | # ID 5 97 | rounded = 4 98 | border_width = 1 99 | background_color = #aaaa00 100 100 | border_color = #aaaa00 100 101 | -------------------------------------------------------------------------------- /image/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [server] 2 | fileWatcherType = "auto" 3 | runOnSave = true 4 | 5 | [browser] 6 | gatherUsageStats = false 7 | -------------------------------------------------------------------------------- /image/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | ./start_all.sh 5 | ./novnc_startup.sh 6 | 7 | python http_server.py > /tmp/server_logs.txt 2>&1 & 8 | 9 | STREAMLIT_SERVER_PORT=8501 python -m streamlit run computer_use_demo/streamlit.py > /tmp/streamlit_stdout.log & 10 | 11 | echo "✨ Computer Use Demo is ready!" 
12 | echo "➡️ Open http://localhost:8080 in your browser to begin" 13 | 14 | # Keep the container running 15 | tail -f /dev/null 16 | -------------------------------------------------------------------------------- /image/http_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | from http.server import HTTPServer, SimpleHTTPRequestHandler 4 | 5 | 6 | class HTTPServerV6(HTTPServer): 7 | address_family = socket.AF_INET6 8 | 9 | 10 | def run_server(): 11 | os.chdir(os.path.dirname(__file__) + "/static_content") 12 | server_address = ("::", 8080) 13 | httpd = HTTPServerV6(server_address, SimpleHTTPRequestHandler) 14 | print("Starting HTTP server on port 8080...") # noqa: T201 15 | httpd.serve_forever() 16 | 17 | 18 | if __name__ == "__main__": 19 | run_server() 20 | -------------------------------------------------------------------------------- /image/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Computer Use Demo 5 | 6 | 28 | 29 | 30 |
31 | 36 | 41 |
42 | 43 | 44 | -------------------------------------------------------------------------------- /image/mutter_startup.sh: -------------------------------------------------------------------------------- 1 | echo "starting mutter" 2 | XDG_SESSION_TYPE=x11 mutter --replace --sm-disable 2>/tmp/mutter_stderr.log & 3 | 4 | # Wait for mutter window properties to appear 5 | timeout=30 6 | while [ $timeout -gt 0 ]; do 7 | if xdotool search --class "mutter" >/dev/null 2>&1; then 8 | break 9 | fi 10 | sleep 1 11 | ((timeout--)) 12 | done 13 | 14 | if [ $timeout -eq 0 ]; then 15 | echo "mutter stderr output:" >&2 16 | cat /tmp/mutter_stderr.log >&2 17 | exit 1 18 | fi 19 | 20 | rm /tmp/mutter_stderr.log 21 | -------------------------------------------------------------------------------- /image/novnc_startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "starting noVNC" 3 | 4 | # Start noVNC with explicit websocket settings 5 | /opt/noVNC/utils/novnc_proxy \ 6 | --vnc localhost:5900 \ 7 | --listen 6080 \ 8 | --web /opt/noVNC \ 9 | > /tmp/novnc.log 2>&1 & 10 | 11 | # Wait for noVNC to start 12 | timeout=10 13 | while [ $timeout -gt 0 ]; do 14 | if netstat -tuln | grep -q ":6080 "; then 15 | break 16 | fi 17 | sleep 1 18 | ((timeout--)) 19 | done 20 | 21 | echo "noVNC started successfully" 22 | -------------------------------------------------------------------------------- /image/start_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | export DISPLAY=:${DISPLAY_NUM} 6 | ./xvfb_startup.sh 7 | ./tint2_startup.sh 8 | ./mutter_startup.sh 9 | ./x11vnc_startup.sh 10 | -------------------------------------------------------------------------------- /image/static_content/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Computer Use Demo 5 | 6 | 28 | 29 | 30 |
31 | 36 | 42 | 48 | 70 |
71 | 72 | 73 | -------------------------------------------------------------------------------- /image/tint2_startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "starting tint2 on display :$DISPLAY_NUM ..." 3 | 4 | # Start tint2 and capture its stderr 5 | tint2 -c $HOME/.config/tint2/tint2rc 2>/tmp/tint2_stderr.log & 6 | 7 | # Wait for tint2 window properties to appear 8 | timeout=30 9 | while [ $timeout -gt 0 ]; do 10 | if xdotool search --class "tint2" >/dev/null 2>&1; then 11 | break 12 | fi 13 | sleep 1 14 | ((timeout--)) 15 | done 16 | 17 | if [ $timeout -eq 0 ]; then 18 | echo "tint2 stderr output:" >&2 19 | cat /tmp/tint2_stderr.log >&2 20 | exit 1 21 | fi 22 | 23 | # Remove the temporary stderr log file 24 | rm /tmp/tint2_stderr.log 25 | -------------------------------------------------------------------------------- /image/x11vnc_startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "starting vnc" 3 | 4 | (x11vnc -display $DISPLAY \ 5 | -forever \ 6 | -shared \ 7 | -wait 50 \ 8 | -timeout 60 \ 9 | -noxrecord \ 10 | -noxfixes \ 11 | -noxdamage \ 12 | -rfbport 5900 \ 13 | 2>/tmp/x11vnc_stderr.log) & 14 | 15 | x11vnc_pid=$! 16 | 17 | # Wait for x11vnc to start 18 | timeout=10 19 | while [ $timeout -gt 0 ]; do 20 | if netstat -tuln | grep -q ":5900 "; then 21 | break 22 | fi 23 | sleep 1 24 | ((timeout--)) 25 | done 26 | 27 | if [ $timeout -eq 0 ]; then 28 | echo "x11vnc failed to start, stderr output:" >&2 29 | cat /tmp/x11vnc_stderr.log >&2 30 | exit 1 31 | fi 32 | 33 | : > /tmp/x11vnc_stderr.log 34 | 35 | # Monitor x11vnc process in the background 36 | ( 37 | while true; do 38 | if ! kill -0 $x11vnc_pid 2>/dev/null; then 39 | echo "x11vnc process crashed, restarting..." 
>&2 40 | if [ -f /tmp/x11vnc_stderr.log ]; then 41 | echo "x11vnc stderr output:" >&2 42 | cat /tmp/x11vnc_stderr.log >&2 43 | rm /tmp/x11vnc_stderr.log 44 | fi 45 | exec "$0" 46 | fi 47 | sleep 5 48 | done 49 | ) & 50 | -------------------------------------------------------------------------------- /image/xvfb_startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # Exit on error 3 | 4 | DPI=96 5 | RES_AND_DEPTH=${WIDTH}x${HEIGHT}x24 6 | 7 | # Function to check if Xvfb is already running 8 | check_xvfb_running() { 9 | if [ -e /tmp/.X${DISPLAY_NUM}-lock ]; then 10 | return 0 # Xvfb is already running 11 | else 12 | return 1 # Xvfb is not running 13 | fi 14 | } 15 | 16 | # Function to check if Xvfb is ready 17 | wait_for_xvfb() { 18 | local timeout=10 19 | local start_time=$(date +%s) 20 | while ! xdpyinfo >/dev/null 2>&1; do 21 | if [ $(($(date +%s) - start_time)) -gt $timeout ]; then 22 | echo "Xvfb failed to start within $timeout seconds" >&2 23 | return 1 24 | fi 25 | sleep 0.1 26 | done 27 | return 0 28 | } 29 | 30 | # Check if Xvfb is already running 31 | if check_xvfb_running; then 32 | echo "Xvfb is already running on display ${DISPLAY}" 33 | exit 0 34 | fi 35 | 36 | # Start Xvfb 37 | Xvfb $DISPLAY -ac -screen 0 $RES_AND_DEPTH -retro -dpi $DPI -nolisten tcp -nolisten unix & 38 | XVFB_PID=$! 
39 | 40 | # Wait for Xvfb to start 41 | if wait_for_xvfb; then 42 | echo "Xvfb started successfully on display ${DISPLAY}" 43 | echo "Xvfb PID: $XVFB_PID" 44 | else 45 | echo "Xvfb failed to start" 46 | kill $XVFB_PID 47 | exit 1 48 | fi 49 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ObservedObserver/claude-minecraft-use/8b52ef2a5aa175a49475db07ad7168b33089f8b6/main.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pyright] 2 | venvPath = "." 3 | venv = ".venv" 4 | useLibraryCodeForTypes = false 5 | 6 | [tool.pytest.ini_options] 7 | pythonpath = "." 8 | asyncio_mode = "auto" 9 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | extend-exclude = [".venv"] 2 | 3 | [format] 4 | docstring-code-format = true 5 | 6 | [lint] 7 | select = [ 8 | "A", 9 | "ASYNC", 10 | "B", 11 | "E", 12 | "F", 13 | "I", 14 | "PIE", 15 | "RUF200", 16 | "T20", 17 | "UP", 18 | "W", 19 | ] 20 | 21 | ignore = ["E501", "ASYNC230"] 22 | 23 | [lint.isort] 24 | combine-as-imports = true 25 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PYTHON_MINOR_VERSION=$(python3 --version | awk -F. '{print $2}') 3 | 4 | if [ "$PYTHON_MINOR_VERSION" -gt 12 ]; then 5 | echo "Python version 3.$PYTHON_MINOR_VERSION detected. Python 3.12 or lower is required for setup to complete." 
6 | echo "If you have multiple versions of Python installed, you can set the correct one by adjusting setup.sh to use a specific version, for example:" 7 | echo "'python3 -m venv .venv' -> 'python3.12 -m venv .venv'" 8 | exit 1 9 | fi 10 | 11 | if ! command -v cargo &> /dev/null; then 12 | echo "Cargo (the package manager for Rust) is not present. This is required for one of this module's dependencies." 13 | echo "See https://www.rust-lang.org/tools/install for installation instructions." 14 | exit 1 15 | fi 16 | 17 | python3 -m venv .venv 18 | source .venv/bin/activate 19 | pip install --upgrade pip 20 | pip install -r dev-requirements.txt 21 | pre-commit install 22 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import mock 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture(autouse=True) 8 | def mock_screen_dimensions(): 9 | with mock.patch.dict( 10 | os.environ, {"HEIGHT": "768", "WIDTH": "1024", "DISPLAY_NUM": "1"} 11 | ): 12 | yield 13 | -------------------------------------------------------------------------------- /tests/loop_test.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | from anthropic.types import TextBlock, ToolUseBlock 4 | from anthropic.types.beta import BetaMessage, BetaMessageParam 5 | 6 | from computer_use_demo.loop import APIProvider, sampling_loop 7 | 8 | 9 | async def test_loop(): 10 | client = mock.Mock() 11 | client.beta.messages.with_raw_response.create.return_value = mock.Mock() 12 | client.beta.messages.with_raw_response.create.return_value.parse.side_effect = [ 13 | mock.Mock( 14 | spec=BetaMessage, 15 | content=[ 16 | TextBlock(type="text", text="Hello"), 17 | ToolUseBlock( 18 | type="tool_use", id="1", name="computer", input={"action": "test"} 19 | ), 20 | ], 21 | ), 22 | 
mock.Mock(spec=BetaMessage, content=[TextBlock(type="text", text="Done!")]), 23 | ] 24 | 25 | tool_collection = mock.AsyncMock() 26 | tool_collection.run.return_value = mock.Mock( 27 | output="Tool output", error=None, base64_image=None 28 | ) 29 | 30 | output_callback = mock.Mock() 31 | tool_output_callback = mock.Mock() 32 | api_response_callback = mock.Mock() 33 | 34 | with mock.patch( 35 | "computer_use_demo.loop.Anthropic", return_value=client 36 | ), mock.patch( 37 | "computer_use_demo.loop.ToolCollection", return_value=tool_collection 38 | ): 39 | messages: list[BetaMessageParam] = [{"role": "user", "content": "Test message"}] 40 | result = await sampling_loop( 41 | model="test-model", 42 | provider=APIProvider.ANTHROPIC, 43 | system_prompt_suffix="", 44 | messages=messages, 45 | output_callback=output_callback, 46 | tool_output_callback=tool_output_callback, 47 | api_response_callback=api_response_callback, 48 | api_key="test-key", 49 | ) 50 | 51 | assert len(result) == 4 52 | assert result[0] == {"role": "user", "content": "Test message"} 53 | assert result[1]["role"] == "assistant" 54 | assert result[2]["role"] == "user" 55 | assert result[3]["role"] == "assistant" 56 | 57 | assert client.beta.messages.with_raw_response.create.call_count == 2 58 | tool_collection.run.assert_called_once_with( 59 | name="computer", tool_input={"action": "test"} 60 | ) 61 | output_callback.assert_called_with(TextBlock(text="Done!", type="text")) 62 | assert output_callback.call_count == 3 63 | assert tool_output_callback.call_count == 1 64 | assert api_response_callback.call_count == 2 65 | -------------------------------------------------------------------------------- /tests/streamlit_test.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | import pytest 4 | from streamlit.testing.v1 import AppTest 5 | 6 | from computer_use_demo.streamlit import Sender, TextBlock 7 | 8 | 9 | @pytest.fixture 10 | def 
@pytest.fixture
def streamlit_app():
    """Load the Streamlit demo script into Streamlit's ``AppTest`` harness."""
    return AppTest.from_file("computer_use_demo/streamlit.py")


def test_streamlit(streamlit_app: AppTest):
    """A submitted chat message reaches ``sampling_loop`` as a single user turn."""
    streamlit_app.run()
    # NOTE(review): text_input[1] is presumably the API-key field — confirm
    # against the widget order in computer_use_demo/streamlit.py.
    streamlit_app.text_input[1].set_value("sk-ant-0000000000000").run()
    with mock.patch("computer_use_demo.loop.sampling_loop") as mock_loop:
        streamlit_app.chat_input[0].set_value("Hello").run()
        assert mock_loop.called
        assert mock_loop.call_args.kwargs["messages"] == [
            {"role": Sender.USER, "content": [TextBlock(text="Hello", type="text")]}
        ]
        assert not streamlit_app.exception


# --------------------------------------------------------------------------------
# /tests/tools/bash_test.py:
# --------------------------------------------------------------------------------
import pytest

from computer_use_demo.tools.bash import BashTool, ToolError


@pytest.fixture
def bash_tool():
    """Provide a fresh ``BashTool`` for each test."""
    return BashTool()


@pytest.mark.asyncio
async def test_bash_tool_restart(bash_tool):
    """Restarting reports success and leaves the tool usable afterwards."""
    result = await bash_tool(restart=True)
    assert result.system == "tool has been restarted."

    # Verify the tool can be used after restart
    result = await bash_tool(command="echo 'Hello after restart'")
    assert "Hello after restart" in result.output


@pytest.mark.asyncio
async def test_bash_tool_run_command(bash_tool):
    """A simple command returns its stdout and an empty error string."""
    result = await bash_tool(command="echo 'Hello, World!'")
    assert result.output.strip() == "Hello, World!"
    assert result.error == ""


@pytest.mark.asyncio
async def test_bash_tool_no_command(bash_tool):
    """Calling the tool with no arguments raises ``ToolError``."""
    with pytest.raises(ToolError, match="no command provided."):
        await bash_tool()


@pytest.mark.asyncio
async def test_bash_tool_session_creation(bash_tool):
    """Running a command leaves a session object on the tool."""
    result = await bash_tool(command="echo 'Session created'")
    assert bash_tool._session is not None
    assert "Session created" in result.output


@pytest.mark.asyncio
async def test_bash_tool_session_reuse(bash_tool):
    """Two consecutive commands on the same tool both return their output."""
    result1 = await bash_tool(command="echo 'First command'")
    result2 = await bash_tool(command="echo 'Second command'")

    assert "First command" in result1.output
    assert "Second command" in result2.output


@pytest.mark.asyncio
async def test_bash_tool_session_error(bash_tool):
    """An unknown command is reported through ``result.error``."""
    result = await bash_tool(command="invalid_command_that_does_not_exist")
    assert "command not found" in result.error


@pytest.mark.asyncio
async def test_bash_tool_non_zero_exit(bash_tool):
    """A silent non-zero exit produces neither output nor error text."""
    result = await bash_tool(command="bash -c 'exit 1'")
    assert result.error.strip() == ""
    assert result.output.strip() == ""


@pytest.mark.asyncio
async def test_bash_tool_timeout(bash_tool):
    """A command that outlives the session timeout raises ``ToolError``."""
    # Run one command first so that a session exists to adjust.
    await bash_tool(command="echo 'Hello, World!'")
    bash_tool._session._timeout = 0.1  # Set a very short timeout for testing
    with pytest.raises(
        ToolError,
        match="timed out: bash has not returned in 0.1 seconds and must be restarted",
    ):
        await bash_tool(command="sleep 1")


# --------------------------------------------------------------------------------
# /tests/tools/computer_test.py:
# --------------------------------------------------------------------------------
from unittest.mock import AsyncMock, patch

import pytest

from computer_use_demo.tools.computer import (
    ComputerTool,
    ScalingSource,
    ToolError,
    ToolResult,
)


@pytest.fixture
def computer_tool():
    """Provide a fresh ``ComputerTool`` for each test."""
    return ComputerTool()


@pytest.mark.asyncio
async def test_computer_tool_mouse_move(computer_tool):
    """``mouse_move`` issues one xdotool ``mousemove`` shell command."""
    with patch.object(computer_tool, "shell", new_callable=AsyncMock) as mock_shell:
        mock_shell.return_value = ToolResult(output="Mouse moved")
        result = await computer_tool(action="mouse_move", coordinate=[100, 200])
        mock_shell.assert_called_once_with(
            f"{computer_tool.xdotool} mousemove --sync 100 200"
        )
        assert result.output == "Mouse moved"


@pytest.mark.asyncio
async def test_computer_tool_type(computer_tool):
    """``type`` sends the text through the shell and attaches a screenshot."""
    with (
        patch.object(computer_tool, "shell", new_callable=AsyncMock) as mock_shell,
        patch.object(
            computer_tool, "screenshot", new_callable=AsyncMock
        ) as mock_screenshot,
    ):
        mock_shell.return_value = ToolResult(output="Text typed")
        mock_screenshot.return_value = ToolResult(base64_image="base64_screenshot")
        result = await computer_tool(action="type", text="Hello, World!")
        assert mock_shell.call_count == 1
        assert "type --delay 12 -- 'Hello, World!'" in mock_shell.call_args[0][0]
        assert result.output == "Text typed"
        assert result.base64_image == "base64_screenshot"


@pytest.mark.asyncio
async def test_computer_tool_screenshot(computer_tool):
    """``screenshot`` delegates to the tool's ``screenshot`` coroutine once."""
    with patch.object(
        computer_tool, "screenshot", new_callable=AsyncMock
    ) as mock_screenshot:
        mock_screenshot.return_value = ToolResult(base64_image="base64_screenshot")
        result = await computer_tool(action="screenshot")
        mock_screenshot.assert_called_once()
        assert result.base64_image == "base64_screenshot"


@pytest.mark.asyncio
async def test_computer_tool_scaling(computer_tool):
    """1920x1080 maps to and from 1366x768, and is untouched when disabled."""
    computer_tool._scaling_enabled = True
    computer_tool.width = 1920
    computer_tool.height = 1080

    # Test scaling from API to computer
    x, y = computer_tool.scale_coordinates(ScalingSource.API, 1366, 768)
    assert x == 1920
    assert y == 1080

    # Test scaling from computer to API
    x, y = computer_tool.scale_coordinates(ScalingSource.COMPUTER, 1920, 1080)
    assert x == 1366
    assert y == 768

    # Test no scaling when disabled
    computer_tool._scaling_enabled = False
    x, y = computer_tool.scale_coordinates(ScalingSource.API, 1366, 768)
    assert x == 1366
    assert y == 768


@pytest.mark.asyncio
async def test_computer_tool_scaling_with_different_aspect_ratio(computer_tool):
    """A 1920x1200 screen maps to and from 1280x800."""
    computer_tool._scaling_enabled = True
    computer_tool.width = 1920
    computer_tool.height = 1200  # 16:10 aspect ratio

    # Test scaling from API to computer
    x, y = computer_tool.scale_coordinates(ScalingSource.API, 1280, 800)
    assert x == 1920
    assert y == 1200

    # Test scaling from computer to API
    x, y = computer_tool.scale_coordinates(ScalingSource.COMPUTER, 1920, 1200)
    assert x == 1280
    assert y == 800


@pytest.mark.asyncio
async def test_computer_tool_no_scaling_for_unsupported_resolution(computer_tool):
    """A 4096x2160 screen passes coordinates through unchanged both ways."""
    computer_tool._scaling_enabled = True
    computer_tool.width = 4096
    computer_tool.height = 2160

    # Test no scaling for unsupported resolution
    x, y = computer_tool.scale_coordinates(ScalingSource.API, 4096, 2160)
    assert x == 4096
    assert y == 2160

    x, y = computer_tool.scale_coordinates(ScalingSource.COMPUTER, 4096, 2160)
    assert x == 4096
    assert y == 2160


@pytest.mark.asyncio
async def test_computer_tool_scaling_out_of_bounds(computer_tool):
    """API coordinates outside the expected bounds raise ``ToolError``."""
    computer_tool._scaling_enabled = True
    computer_tool.width = 1920
    computer_tool.height = 1080

    # Test scaling from API with out of bounds coordinates.
    # The call is expected to raise, so its result is deliberately not bound.
    with pytest.raises(ToolError, match="Coordinates .*, .* are out of bounds"):
        computer_tool.scale_coordinates(ScalingSource.API, 2000, 1500)


@pytest.mark.asyncio
async def test_computer_tool_invalid_action(computer_tool):
    """An unknown action name raises ``ToolError``."""
    with pytest.raises(ToolError, match="Invalid action: invalid_action"):
        await computer_tool(action="invalid_action")


@pytest.mark.asyncio
async def test_computer_tool_missing_coordinate(computer_tool):
    """``mouse_move`` without a coordinate raises ``ToolError``."""
    with pytest.raises(ToolError, match="coordinate is required for mouse_move"):
        await computer_tool(action="mouse_move")


@pytest.mark.asyncio
async def test_computer_tool_missing_text(computer_tool):
    """``type`` without text raises ``ToolError``."""
    with pytest.raises(ToolError, match="text is required for type"):
        await computer_tool(action="type")


# --------------------------------------------------------------------------------
# /tests/tools/edit_test.py:
# --------------------------------------------------------------------------------
from pathlib import Path
from unittest.mock import patch

import pytest

from computer_use_demo.tools.base import CLIResult, ToolError, ToolResult
from computer_use_demo.tools.edit import EditTool


@pytest.mark.asyncio
async def test_view_command():
    """Exercise ``view`` on files, directories, line ranges and error paths."""
    edit_tool = EditTool()

    # Test viewing a file that exists
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
    ):
        mock_read_text.return_value = "File content"
        result = await edit_tool(command="view", path="/test/file.txt")
        assert isinstance(result, CLIResult)
        assert result.output
        assert "File content" in result.output

    # Test viewing a directory
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=True),
        patch("computer_use_demo.tools.edit.run") as mock_run,
    ):
        mock_run.return_value = (None, "file1.txt\nfile2.txt", None)
        result = await edit_tool(command="view", path="/test/dir")
        assert isinstance(result, CLIResult)
        assert result.output
        assert "file1.txt" in result.output
        assert "file2.txt" in result.output

    # Test viewing a file with a specific range
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
    ):
        mock_read_text.return_value = "Line 1\nLine 2\nLine 3\nLine 4"
        result = await edit_tool(
            command="view", path="/test/file.txt", view_range=[2, 3]
        )
        assert isinstance(result, CLIResult)
        assert result.output
        # NOTE(review): this text came from a whitespace-flattened digest, so a
        # run of padding spaces before each line number may have been collapsed
        # to one — confirm the literal against the repository copy.
        assert "\n 2\tLine 2\n 3\tLine 3\n" in result.output

    # Test viewing a file with an invalid range
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
    ):
        mock_read_text.return_value = "Line 1\nLine 2\nLine 3\nLine 4"
        with pytest.raises(ToolError, match="Invalid `view_range`"):
            await edit_tool(command="view", path="/test/file.txt", view_range=[3, 2])

    # Test viewing a non-existent file
    with patch("pathlib.Path.exists", return_value=False):
        with pytest.raises(ToolError, match="does not exist"):
            await edit_tool(command="view", path="/nonexistent/file.txt")

    # Test viewing a directory with a view_range
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=True),
    ):
        with pytest.raises(ToolError, match="view_range` parameter is not allowed"):
            await edit_tool(command="view", path="/test/dir", view_range=[1, 2])


@pytest.mark.asyncio
async def test_create_command():
    """Exercise ``create`` for new files, missing content and existing files."""
    edit_tool = EditTool()

    # Test creating a new file with content
    with (
        patch("pathlib.Path.exists", return_value=False),
        patch("pathlib.Path.write_text") as mock_write_text,
    ):
        result = await edit_tool(
            command="create", path="/test/newfile.txt", file_text="New file content"
        )
        assert isinstance(result, ToolResult)
        assert result.output
        assert "File created successfully" in result.output
        mock_write_text.assert_called_once_with("New file content")

    # Test attempting to create a file without content
    with patch("pathlib.Path.exists", return_value=False):
        with pytest.raises(ToolError, match="Parameter `file_text` is required"):
            await edit_tool(command="create", path="/test/newfile.txt")

    # Test attempting to create a file that already exists
    with patch("pathlib.Path.exists", return_value=True):
        with pytest.raises(ToolError, match="File already exists"):
            await edit_tool(
                command="create", path="/test/existingfile.txt", file_text="Content"
            )


@pytest.mark.asyncio
async def test_str_replace_command():
    """Exercise ``str_replace`` for unique, missing and repeated matches."""
    edit_tool = EditTool()

    # Test replacing a unique string in a file
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
        patch("pathlib.Path.write_text") as mock_write_text,
    ):
        mock_read_text.return_value = "Original content"
        result = await edit_tool(
            command="str_replace",
            path="/test/file.txt",
            old_str="Original",
            new_str="New",
        )
        assert isinstance(result, CLIResult)
        assert result.output
        assert "has been edited" in result.output
        mock_write_text.assert_called_once_with("New content")

    # Test attempting to replace a non-existent string
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
    ):
        mock_read_text.return_value = "Original content"
        with pytest.raises(ToolError, match="did not appear verbatim"):
            await edit_tool(
                command="str_replace",
                path="/test/file.txt",
                old_str="Nonexistent",
                new_str="New",
            )

    # Test attempting to replace a string that appears multiple times
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
    ):
        mock_read_text.return_value = "Test test test"
        with pytest.raises(ToolError, match="Multiple occurrences"):
            await edit_tool(
                command="str_replace",
                path="/test/file.txt",
                old_str="test",
                new_str="example",
            )

    edit_tool._file_history.clear()
    # Verify that the file history is updated after replacement
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
        patch("pathlib.Path.write_text"),
    ):
        mock_read_text.return_value = "Original content"
        await edit_tool(
            command="str_replace",
            path="/test/file.txt",
            old_str="Original",
            new_str="New",
        )
        assert edit_tool._file_history[Path("/test/file.txt")] == ["Original content"]


@pytest.mark.asyncio
async def test_insert_command():
    """Exercise ``insert`` at the middle, start, end and an invalid line."""
    edit_tool = EditTool()

    # Test inserting a string at a valid line number
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
        patch("pathlib.Path.write_text") as mock_write_text,
    ):
        mock_read_text.return_value = "Line 1\nLine 2\nLine 3"
        result = await edit_tool(
            command="insert", path="/test/file.txt", insert_line=2, new_str="New Line"
        )
        assert isinstance(result, CLIResult)
        assert result.output
        assert "has been edited" in result.output
        mock_write_text.assert_called_once_with("Line 1\nLine 2\nNew Line\nLine 3")

    # Test inserting a string at the beginning of the file (line 0)
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
        patch("pathlib.Path.write_text") as mock_write_text,
    ):
        mock_read_text.return_value = "Line 1\nLine 2"
        result = await edit_tool(
            command="insert",
            path="/test/file.txt",
            insert_line=0,
            new_str="New First Line",
        )
        assert isinstance(result, CLIResult)
        assert result.output
        assert "has been edited" in result.output
        mock_write_text.assert_called_once_with("New First Line\nLine 1\nLine 2")

    # Test inserting a string at the end of the file
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
        patch("pathlib.Path.write_text") as mock_write_text,
    ):
        mock_read_text.return_value = "Line 1\nLine 2"
        result = await edit_tool(
            command="insert",
            path="/test/file.txt",
            insert_line=2,
            new_str="New Last Line",
        )
        assert isinstance(result, CLIResult)
        assert result.output
        assert "has been edited" in result.output
        mock_write_text.assert_called_once_with("Line 1\nLine 2\nNew Last Line")

    # Test attempting to insert at an invalid line number
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
    ):
        mock_read_text.return_value = "Line 1\nLine 2"
        with pytest.raises(ToolError, match="Invalid `insert_line` parameter"):
            await edit_tool(
                command="insert",
                path="/test/file.txt",
                insert_line=5,
                new_str="Invalid Line",
            )

    # Verify that the file history is updated after insertion
    edit_tool._file_history.clear()
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
        patch("pathlib.Path.write_text"),
    ):
        mock_read_text.return_value = "Original content"
        await edit_tool(
            command="insert", path="/test/file.txt", insert_line=1, new_str="New Line"
        )
        assert edit_tool._file_history[Path("/test/file.txt")] == ["Original content"]


@pytest.mark.asyncio
async def test_undo_edit_command():
    """Exercise ``undo_edit`` after a replace, after an insert, and with no history."""
    edit_tool = EditTool()

    # Test undoing a str_replace operation
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
        patch("pathlib.Path.write_text") as mock_write_text,
    ):
        mock_read_text.return_value = "Original content"
        await edit_tool(
            command="str_replace",
            path="/test/file.txt",
            old_str="Original",
            new_str="New",
        )
        # Writes are mocked, so simulate the edited file for the next read.
        mock_read_text.return_value = "New content"
        result = await edit_tool(command="undo_edit", path="/test/file.txt")
        assert isinstance(result, CLIResult)
        assert result.output
        assert "Last edit to /test/file.txt undone successfully" in result.output
        mock_write_text.assert_called_with("Original content")

    # Test undoing an insert operation
    edit_tool._file_history.clear()
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
        patch("pathlib.Path.read_text") as mock_read_text,
        patch("pathlib.Path.write_text") as mock_write_text,
    ):
        mock_read_text.return_value = "Line 1\nLine 2"
        await edit_tool(
            command="insert", path="/test/file.txt", insert_line=1, new_str="New Line"
        )
        # Writes are mocked, so simulate the edited file for the next read.
        mock_read_text.return_value = "Line 1\nNew Line\nLine 2"
        result = await edit_tool(command="undo_edit", path="/test/file.txt")
        assert isinstance(result, CLIResult)
        assert result.output
        assert "Last edit to /test/file.txt undone successfully" in result.output
        mock_write_text.assert_called_with("Line 1\nLine 2")

    # Test attempting to undo when there's no history
    edit_tool._file_history.clear()
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
    ):
        with pytest.raises(ToolError, match="No edit history found"):
            await edit_tool(command="undo_edit", path="/test/file.txt")


@pytest.mark.asyncio
async def test_validate_path():
    """Exercise ``validate_path`` across absolute, relative, missing and directory paths."""
    edit_tool = EditTool()

    # Test with valid absolute paths
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=False),
    ):
        edit_tool.validate_path("view", Path("/valid/path.txt"))

    # Test with relative paths (should raise an error)
    with pytest.raises(ToolError, match="not an absolute path"):
        edit_tool.validate_path("view", Path("relative/path.txt"))

    # Test with non-existent paths for non-create commands (should raise an error)
    with patch("pathlib.Path.exists", return_value=False):
        with pytest.raises(ToolError, match="does not exist"):
            edit_tool.validate_path("view", Path("/nonexistent/file.txt"))

    # Test with existing paths for create command (should raise an error)
    with patch("pathlib.Path.exists", return_value=True):
        with pytest.raises(ToolError, match="File already exists"):
            edit_tool.validate_path("create", Path("/existing/file.txt"))

    # Test with directory paths for non-view commands (should raise an error)
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=True),
    ):
        with pytest.raises(ToolError, match="is a directory"):
            edit_tool.validate_path("str_replace", Path("/directory/path"))

    # Test with directory path for view command (should not raise an error)
    with (
        patch("pathlib.Path.exists", return_value=True),
        patch("pathlib.Path.is_dir", return_value=True),
    ):
        edit_tool.validate_path("view", Path("/directory/path"))


# --------------------------------------------------------------------------------