├── computer_use_demo ├── __init__.py ├── tools │ ├── __init__.py │ ├── collection.py │ ├── run.py │ ├── base.py │ ├── bash.py │ ├── computer.py │ └── edit.py └── loop.py ├── requirements.txt ├── .gitignore ├── LICENSE ├── README.md └── main.py /computer_use_demo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anthropic[bedrock,vertex]>=0.37.1 2 | pillow 3 | PyAutoGUI 4 | -------------------------------------------------------------------------------- /computer_use_demo/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import CLIResult, ToolResult 2 | from .bash import BashTool 3 | from .collection import ToolCollection 4 | from .computer import ComputerTool 5 | from .edit import EditTool 6 | 7 | __ALL__ = [ 8 | BashTool, 9 | CLIResult, 10 | ComputerTool, 11 | EditTool, 12 | ToolCollection, 13 | ToolResult, 14 | ] 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS-specific files/folders 2 | .DS_Store 3 | .AppleDouble 4 | .LSOverride 5 | Icon? 6 | ._* 7 | 8 | # Common Python artifacts 9 | __pycache__/ 10 | *.pyc 11 | *.pyo 12 | *.pyd 13 | .Python 14 | env/ 15 | venv/ 16 | *.egg-info/ 17 | .tox/ 18 | build/ 19 | dist/ 20 | 21 | # IDE/Editor files 22 | .idea/ 23 | *.iml 24 | .vscode/ 25 | 26 | # Notebook checkpoints 27 | .ipynb_checkpoints 28 | 29 | # macOS Trash folder 30 | .Trash/ 31 | -------------------------------------------------------------------------------- /computer_use_demo/tools/collection.py: -------------------------------------------------------------------------------- 1 | """Collection classes for managing multiple tools.""" 2 | 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | from .base import ( 8 | BaseAnthropicTool, 9 | ToolError, 10 | ToolFailure, 11 | ToolResult, 12 | ) 13 | 14 | 15 | class ToolCollection: 16 | """A collection of anthropic-defined tools.""" 17 | 18 | def __init__(self, *tools: BaseAnthropicTool): 19 | self.tools = tools 20 | self.tool_map = {tool.to_params()["name"]: tool for tool in tools} 21 | 22 | def to_params( 23 | self, 24 | ) -> list[BetaToolUnionParam]: 25 | return [tool.to_params() for tool in self.tools] 26 | 27 | async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult: 28 | tool = self.tool_map.get(name) 29 | if not tool: 30 | return ToolFailure(error=f"Tool {name} is invalid") 31 | try: 32 | return await tool(**tool_input) 33 | except ToolError as e: 34 | return ToolFailure(error=e.message) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Pallav Agarwal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /computer_use_demo/tools/run.py: -------------------------------------------------------------------------------- 1 | """Utility to run shell commands asynchronously with a timeout.""" 2 | 3 | import asyncio 4 | 5 | TRUNCATED_MESSAGE: str = "To save on context only part of this file has been shown to you. You should retry this tool after you have searched inside the file with `grep -n` in order to find the line numbers of what you are looking for." 6 | MAX_RESPONSE_LEN: int = 16000 7 | 8 | 9 | def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN): 10 | """Truncate content and append a notice if content exceeds the specified length.""" 11 | return ( 12 | content 13 | if not truncate_after or len(content) <= truncate_after 14 | else content[:truncate_after] + TRUNCATED_MESSAGE 15 | ) 16 | 17 | 18 | async def run( 19 | cmd: str, 20 | timeout: float | None = 120.0, # seconds 21 | truncate_after: int | None = MAX_RESPONSE_LEN, 22 | ): 23 | """Run a shell command asynchronously with a timeout.""" 24 | process = await asyncio.create_subprocess_shell( 25 | cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE 26 | ) 27 | 28 | try: 29 | stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout) 30 | return ( 31 | process.returncode or 0, 32 | maybe_truncate(stdout.decode(), truncate_after=truncate_after), 33 | maybe_truncate(stderr.decode(), truncate_after=truncate_after), 34 | ) 35 | except asyncio.TimeoutError as exc: 36 | try: 37 | process.kill() 38 | except ProcessLookupError: 39 | pass 40 | raise TimeoutError( 41 | f"Command '{cmd}' timed out after {timeout} seconds" 42 | ) from exc 43 | -------------------------------------------------------------------------------- /computer_use_demo/tools/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from dataclasses import dataclass, fields, replace 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | 8 | class BaseAnthropicTool(metaclass=ABCMeta): 9 | """Abstract base class for Anthropic-defined tools.""" 10 | 11 | @abstractmethod 12 | def __call__(self, **kwargs) -> Any: 13 | """Executes the tool with the given arguments.""" 14 | ... 15 | 16 | @abstractmethod 17 | def to_params( 18 | self, 19 | ) -> BetaToolUnionParam: 20 | raise NotImplementedError 21 | 22 | 23 | @dataclass(kw_only=True, frozen=True) 24 | class ToolResult: 25 | """Represents the result of a tool execution.""" 26 | 27 | output: str | None = None 28 | error: str | None = None 29 | base64_image: str | None = None 30 | system: str | None = None 31 | 32 | def __bool__(self): 33 | return any(getattr(self, field.name) for field in fields(self)) 34 | 35 | def __add__(self, other: "ToolResult"): 36 | def combine_fields( 37 | field: str | None, other_field: str | None, concatenate: bool = True 38 | ): 39 | if field and other_field: 40 | if concatenate: 41 | return field + other_field 42 | raise ValueError("Cannot combine tool results") 43 | return field or other_field 44 | 45 | return ToolResult( 46 | output=combine_fields(self.output, other.output), 47 | error=combine_fields(self.error, other.error), 48 | base64_image=combine_fields(self.base64_image, other.base64_image, False), 49 | system=combine_fields(self.system, other.system), 50 | ) 51 | 52 | def replace(self, **kwargs): 53 | """Returns a new ToolResult with the given fields replaced.""" 54 | return replace(self, **kwargs) 55 | 56 | 57 | class CLIResult(ToolResult): 58 | """A ToolResult that can be rendered as a CLI output.""" 59 | 60 | 61 | class ToolFailure(ToolResult): 62 | """A ToolResult that represents a failure.""" 63 | 64 | 65 | class ToolError(Exception): 66 | """Raised when a tool encounters an error.""" 67 | 68 | def __init__(self, message): 69 | self.message = message 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Claude Computer Use Demo for MacOS 2 | 3 | This repository contains a Python script that demonstrates Anthropic's Computer Use capabilities, modified to run on MacOS without requiring a Docker container. The script allows Claude 3.5 Sonnet to perform tasks on your Mac by simulating mouse and keyboard actions as well as running bash command. 4 | 5 | Forked from Anthropic's [computer use demo](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) - optimized for MacOS. 6 | View Anthropic's docs [here](https://docs.anthropic.com/en/docs/build-with-claude/computer-use). 7 | 8 | > [!WARNING] 9 | > Use this script with caution. Allowing Claude to control your computer can be risky. By running this script, you assume all responsibility and liability. 10 | 11 | ## Installation and Setup 12 | 13 | 1. **Clone the repository:** 14 | 15 | ```bash 16 | git clone https://github.com/PallavAg/claude-computer-use-macos.git 17 | cd claude-computer-use-macos 18 | ``` 19 | 20 | 2. **Create a virtual environment + install dependencies:** 21 | 22 | ```bash 23 | python3.12 -m venv venv 24 | source venv/bin/activate 25 | pip3.12 install -r requirements.txt 26 | ``` 27 | 28 | 3. **Set your Anthropic API key as an environment variable:** 29 | 30 | ```bash 31 | export ANTHROPIC_API_KEY="CLAUDE_API_KEY" 32 | ``` 33 | 34 | Replace `CLAUDE_API_KEY` with your actual Anthropic API key. You find yours [here](https://console.anthropic.com/settings/keys). 35 | 36 | 4. **Grant Accessibility Permissions:** 37 | 38 | The script uses `pyautogui` to control mouse and keyboard events. On MacOS, you need to grant accessibility permissions. These popups should show automatically the first time you run the script so you can skip this step. But to manually provide permissions: 39 | 40 | - Go to **System Preferences** > **Security & Privacy** > **Privacy** tab. 41 | - Select **Accessibility** from the list on the left. 42 | - Add your terminal application or Python interpreter to the list of allowed apps. 43 | 44 | ## Usage 45 | 46 | You can run the script by passing the instruction directly via the command line or by editing the `main.py` file. 47 | 48 | **Example using command line instruction:** 49 | 50 | ```bash 51 | python3.12 main.py 'Open Safari and look up Anthropic' 52 | ``` 53 | 54 | Replace `'Open Safari and look up Anthropic'` with your desired instruction. 55 | 56 | **Note:** If you do not provide an instruction via the command line, the script will use the default instruction specified in `main.py`. You can edit `main.py` to change this default instruction. 57 | 58 | ## Exiting the Script 59 | 60 | You can quit the script at any time by pressing `Ctrl+C` in the terminal. 61 | 62 | ## ⚠ Disclaimer 63 | 64 | > [!CAUTION] 65 | > - **Security Risks:** This script allows claude to control your computer's mouse and keyboard and run bash commands. Use it at your own risk. 66 | > - **Responsibility:** By running this script, you assume all responsibility and liability for any results. 67 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import sys 4 | import json 5 | import base64 6 | 7 | from computer_use_demo.loop import sampling_loop, APIProvider 8 | from computer_use_demo.tools import ToolResult 9 | from anthropic.types.beta import BetaMessage, BetaMessageParam 10 | from anthropic import APIResponse 11 | 12 | 13 | async def main(): 14 | # Set up your Anthropic API key and model 15 | api_key = os.getenv("ANTHROPIC_API_KEY", "YOUR_API_KEY_HERE") 16 | if api_key == "YOUR_API_KEY_HERE": 17 | raise ValueError( 18 | "Please first set your API key in the ANTHROPIC_API_KEY environment variable" 19 | ) 20 | provider = APIProvider.ANTHROPIC 21 | 22 | # Check if the instruction is provided via command line arguments 23 | if len(sys.argv) > 1: 24 | instruction = " ".join(sys.argv[1:]) 25 | else: 26 | instruction = "Save an image of a cat to the desktop." 27 | 28 | print( 29 | f"Starting Claude 'Computer Use'.\nPress ctrl+c to stop.\nInstructions provided: '{instruction}'" 30 | ) 31 | 32 | # Set up the initial messages 33 | messages: list[BetaMessageParam] = [ 34 | { 35 | "role": "user", 36 | "content": instruction, 37 | } 38 | ] 39 | 40 | # Define callbacks (you can customize these) 41 | def output_callback(content_block): 42 | if isinstance(content_block, dict) and content_block.get("type") == "text": 43 | print("Assistant:", content_block.get("text")) 44 | 45 | def tool_output_callback(result: ToolResult, tool_use_id: str): 46 | if result.output: 47 | print(f"> Tool Output [{tool_use_id}]:", result.output) 48 | if result.error: 49 | print(f"!!! Tool Error [{tool_use_id}]:", result.error) 50 | if result.base64_image: 51 | # Save the image to a file if needed 52 | os.makedirs("screenshots", exist_ok=True) 53 | image_data = result.base64_image 54 | with open(f"screenshots/screenshot_{tool_use_id}.png", "wb") as f: 55 | f.write(base64.b64decode(image_data)) 56 | print(f"Took screenshot screenshot_{tool_use_id}.png") 57 | 58 | def api_response_callback(response: APIResponse[BetaMessage]): 59 | print( 60 | "\n---------------\nAPI Response:\n", 61 | json.dumps(json.loads(response.text)["content"], indent=4), # type: ignore 62 | "\n", 63 | ) 64 | 65 | # Run the sampling loop 66 | messages = await sampling_loop( 67 | model="claude-3-5-sonnet-20241022", 68 | provider=provider, 69 | system_prompt_suffix="", 70 | messages=messages, 71 | output_callback=output_callback, 72 | tool_output_callback=tool_output_callback, 73 | api_response_callback=api_response_callback, 74 | api_key=api_key, 75 | only_n_most_recent_images=10, 76 | max_tokens=4096, 77 | ) 78 | 79 | 80 | if __name__ == "__main__": 81 | try: 82 | asyncio.run(main()) 83 | except Exception as e: 84 | print(f"Encountered Error:\n{e}") 85 | -------------------------------------------------------------------------------- /computer_use_demo/tools/bash.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | from typing import ClassVar, Literal 4 | 5 | from anthropic.types.beta import BetaToolBash20241022Param 6 | 7 | from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult 8 | 9 | 10 | class _BashSession: 11 | """A session of a bash shell.""" 12 | 13 | _started: bool 14 | _process: asyncio.subprocess.Process 15 | 16 | command: str = "/bin/bash" 17 | _output_delay: float = 0.2 # seconds 18 | _timeout: float = 120.0 # seconds 19 | _sentinel: str = "<>" 20 | 21 | def __init__(self): 22 | self._started = False 23 | self._timed_out = False 24 | 25 | async def start(self): 26 | if self._started: 27 | return 28 | 29 | self._process = await asyncio.create_subprocess_shell( 30 | self.command, 31 | preexec_fn=os.setsid, 32 | shell=True, 33 | bufsize=0, 34 | stdin=asyncio.subprocess.PIPE, 35 | stdout=asyncio.subprocess.PIPE, 36 | stderr=asyncio.subprocess.PIPE, 37 | ) 38 | 39 | self._started = True 40 | 41 | def stop(self): 42 | """Terminate the bash shell.""" 43 | if not self._started: 44 | raise ToolError("Session has not started.") 45 | if self._process.returncode is not None: 46 | return 47 | self._process.terminate() 48 | 49 | async def run(self, command: str): 50 | """Execute a command in the bash shell.""" 51 | if not self._started: 52 | raise ToolError("Session has not started.") 53 | if self._process.returncode is not None: 54 | return ToolResult( 55 | system="tool must be restarted", 56 | error=f"bash has exited with returncode {self._process.returncode}", 57 | ) 58 | if self._timed_out: 59 | raise ToolError( 60 | f"timed out: bash has not returned in {self._timeout} seconds and must be restarted", 61 | ) 62 | 63 | # we know these are not None because we created the process with PIPEs 64 | assert self._process.stdin 65 | assert self._process.stdout 66 | assert self._process.stderr 67 | 68 | # send command to the process 69 | self._process.stdin.write( 70 | command.encode() + f"; echo '{self._sentinel}'\n".encode() 71 | ) 72 | await self._process.stdin.drain() 73 | 74 | # read output from the process, until the sentinel is found 75 | try: 76 | async with asyncio.timeout(self._timeout): 77 | while True: 78 | await asyncio.sleep(self._output_delay) 79 | # if we read directly from stdout/stderr, it will wait forever for 80 | # EOF. use the StreamReader buffer directly instead. 81 | output = ( 82 | self._process.stdout._buffer.decode() # pyright: ignore[reportAttributeAccessIssue] 83 | ) 84 | if self._sentinel in output: 85 | # strip the sentinel and break 86 | output = output[: output.index(self._sentinel)] 87 | break 88 | except asyncio.TimeoutError: 89 | self._timed_out = True 90 | raise ToolError( 91 | f"timed out: bash has not returned in {self._timeout} seconds and must be restarted", 92 | ) from None 93 | 94 | if output.endswith("\n"): 95 | output = output[:-1] 96 | 97 | error = ( 98 | self._process.stderr._buffer.decode() # pyright: ignore[reportAttributeAccessIssue] 99 | ) 100 | if error.endswith("\n"): 101 | error = error[:-1] 102 | 103 | # clear the buffers so that the next output can be read correctly 104 | self._process.stdout._buffer.clear() # pyright: ignore[reportAttributeAccessIssue] 105 | self._process.stderr._buffer.clear() # pyright: ignore[reportAttributeAccessIssue] 106 | 107 | return CLIResult(output=output, error=error) 108 | 109 | 110 | class BashTool(BaseAnthropicTool): 111 | """ 112 | A tool that allows the agent to run bash commands. 113 | The tool parameters are defined by Anthropic and are not editable. 114 | """ 115 | 116 | _session: _BashSession | None 117 | name: ClassVar[Literal["bash"]] = "bash" 118 | api_type: ClassVar[Literal["bash_20241022"]] = "bash_20241022" 119 | 120 | def __init__(self): 121 | self._session = None 122 | super().__init__() 123 | 124 | async def __call__( 125 | self, command: str | None = None, restart: bool = False, **kwargs 126 | ): 127 | print("### Running bash command:", command) 128 | if restart: 129 | if self._session: 130 | self._session.stop() 131 | self._session = _BashSession() 132 | await self._session.start() 133 | 134 | return ToolResult(system="tool has been restarted.") 135 | 136 | if self._session is None: 137 | self._session = _BashSession() 138 | await self._session.start() 139 | 140 | if command is not None: 141 | return await self._session.run(command) 142 | 143 | raise ToolError("no command provided.") 144 | 145 | def to_params(self) -> BetaToolBash20241022Param: 146 | return { 147 | "type": self.api_type, 148 | "name": self.name, 149 | } 150 | -------------------------------------------------------------------------------- /computer_use_demo/tools/computer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import io 4 | from enum import StrEnum 5 | from typing import Literal, TypedDict 6 | import pyautogui 7 | from anthropic.types.beta import BetaToolComputerUse20241022Param 8 | 9 | from .base import BaseAnthropicTool, ToolError, ToolResult 10 | 11 | OUTPUT_DIR = "/tmp/outputs" 12 | 13 | TYPING_DELAY_MS = 12 14 | TYPING_GROUP_SIZE = 50 15 | 16 | Action = Literal[ 17 | "key", 18 | "type", 19 | "mouse_move", 20 | "left_click", 21 | "left_click_drag", 22 | "right_click", 23 | "middle_click", 24 | "double_click", 25 | "screenshot", 26 | "cursor_position", 27 | ] 28 | 29 | 30 | class ScalingSource(StrEnum): 31 | COMPUTER = "computer" 32 | API = "api" 33 | 34 | 35 | class ComputerToolOptions(TypedDict): 36 | display_height_px: int 37 | display_width_px: int 38 | display_number: int | None 39 | 40 | 41 | def chunks(s: str, chunk_size: int) -> list[str]: 42 | return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)] 43 | 44 | 45 | class ComputerTool(BaseAnthropicTool): 46 | """ 47 | A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer. 48 | The tool parameters are defined by Anthropic and are not editable. 49 | """ 50 | 51 | name: Literal["computer"] = "computer" 52 | api_type: Literal["computer_20241022"] = "computer_20241022" 53 | width: int 54 | height: int 55 | display_num: int | None 56 | 57 | _screenshot_delay = 1.0 58 | _scaling_enabled = True 59 | 60 | @property 61 | def options(self) -> ComputerToolOptions: 62 | return { 63 | "display_width_px": self.target_width, 64 | "display_height_px": self.target_height, 65 | "display_number": self.display_num, 66 | } 67 | 68 | def to_params(self) -> BetaToolComputerUse20241022Param: 69 | return {"name": self.name, "type": self.api_type, **self.options} 70 | 71 | def __init__(self): 72 | super().__init__() 73 | 74 | self.width = int(pyautogui.size()[0]) 75 | self.height = int(pyautogui.size()[1]) 76 | 77 | self.display_num = None # Not used on MacOS 78 | 79 | MAX_WIDTH = 1280 # Max screenshot width 80 | if self.width > MAX_WIDTH: 81 | self.scale_factor = MAX_WIDTH / self.width 82 | self.target_width = MAX_WIDTH 83 | self.target_height = int(self.height * self.scale_factor) 84 | else: 85 | self.scale_factor = 1.0 86 | self.target_width = self.width 87 | self.target_height = self.height 88 | 89 | async def __call__( 90 | self, 91 | *, 92 | action: Action, 93 | text: str | None = None, 94 | coordinate: list[int] | None = None, 95 | **kwargs, 96 | ): 97 | print( 98 | f"### Performing action: {action}{f", text: {text}" if text else ''}{f", coordinate: {coordinate}" if coordinate else ''}" 99 | ) 100 | if action in ("mouse_move", "left_click_drag"): 101 | if coordinate is None: 102 | raise ToolError(f"coordinate is required for {action}") 103 | if text is not None: 104 | raise ToolError(f"text is not accepted for {action}") 105 | if not isinstance(coordinate, list) or len(coordinate) != 2: 106 | raise ToolError(f"coordinate must be a list of length 2") 107 | if not all(isinstance(i, int) and i >= 0 for i in coordinate): 108 | raise ToolError(f"coordinate must be a list of non-negative integers") 109 | 110 | x, y = self.scale_coordinates( 111 | ScalingSource.API, coordinate[0], coordinate[1] 112 | ) 113 | 114 | if action == "mouse_move": 115 | await asyncio.to_thread(pyautogui.moveTo, x, y) 116 | return ToolResult(output=f"Mouse moved successfully to X={x}, Y={y}") 117 | elif action == "left_click_drag": 118 | await asyncio.to_thread(pyautogui.mouseDown) 119 | await asyncio.to_thread(pyautogui.moveTo, x, y) 120 | await asyncio.to_thread(pyautogui.mouseUp) 121 | return ToolResult(output="Mouse drag action completed.") 122 | 123 | if action in ("key", "type"): 124 | if text is None: 125 | raise ToolError(f"text is required for {action}") 126 | if coordinate is not None: 127 | raise ToolError(f"coordinate is not accepted for {action}") 128 | if not isinstance(text, str): 129 | raise ToolError(f"text must be a string") 130 | 131 | if action == "key": 132 | # Handle key combinations and modifiers 133 | # Replace 'super' with 'command' 134 | key_sequence = text.lower().replace("super", "command").split("+") 135 | key_sequence = [key.strip() for key in key_sequence] 136 | # Map 'cmd' to 'command' for MacOS 137 | key_sequence = [ 138 | "command" if key == "cmd" else key for key in key_sequence 139 | ] 140 | # Handle special keys that pyautogui expects 141 | special_keys = { 142 | "ctrl": "ctrl", 143 | "control": "ctrl", 144 | "alt": "alt", 145 | "option": "alt", 146 | "shift": "shift", 147 | "command": "command", 148 | "tab": "tab", 149 | "enter": "enter", 150 | "return": "enter", 151 | "esc": "esc", 152 | "escape": "esc", 153 | "space": "space", 154 | "spacebar": "space", 155 | "up": "up", 156 | "down": "down", 157 | "left": "left", 158 | "right": "right", 159 | # Add more special keys as needed 160 | } 161 | key_sequence = [special_keys.get(key, key) for key in key_sequence] 162 | await asyncio.to_thread(pyautogui.hotkey, *key_sequence) 163 | return ToolResult(output=f"Key combination '{text}' pressed.") 164 | elif action == "type": 165 | await asyncio.to_thread( 166 | pyautogui.write, text, interval=TYPING_DELAY_MS / 1000.0 167 | ) 168 | return ToolResult(output=f"Typed text: {text}") 169 | 170 | if action in ( 171 | "left_click", 172 | "right_click", 173 | "double_click", 174 | "screenshot", 175 | "cursor_position", 176 | ): 177 | if text is not None: 178 | raise ToolError(f"text is not accepted for {action}") 179 | if coordinate is not None: 180 | raise ToolError(f"coordinate is not accepted for {action}") 181 | 182 | if action == "screenshot": 183 | return await self.screenshot() 184 | elif action == "cursor_position": 185 | x, y = pyautogui.position() 186 | x, y = self.scale_coordinates(ScalingSource.COMPUTER, int(x), int(y)) 187 | return ToolResult(output=f"X={x},Y={y}") 188 | else: 189 | if action == "left_click": 190 | await asyncio.to_thread(pyautogui.click, button="left") 191 | return ToolResult(output="Left click performed.") 192 | elif action == "right_click": 193 | await asyncio.to_thread(pyautogui.click, button="right") 194 | return ToolResult(output="Right click performed.") 195 | elif action == "double_click": 196 | await asyncio.to_thread(pyautogui.doubleClick) 197 | return ToolResult(output="Double click performed.") 198 | 199 | raise ToolError(f"Invalid action: {action}") 200 | 201 | async def screenshot(self): 202 | """Take a screenshot of the current screen and return the base64 encoded image.""" 203 | # Capture screenshot using PyAutoGUI 204 | screenshot = await asyncio.to_thread(pyautogui.screenshot) 205 | 206 | if self._scaling_enabled and self.scale_factor < 1.0: 207 | screenshot = screenshot.resize((self.target_width, self.target_height)) 208 | 209 | img_buffer = io.BytesIO() 210 | # Save the image to an in-memory buffer 211 | screenshot.save(img_buffer, format="PNG", optimize=True) 212 | img_buffer.seek(0) 213 | base64_image = base64.b64encode(img_buffer.read()).decode() 214 | 215 | return ToolResult(base64_image=base64_image) 216 | 217 | def scale_coordinates(self, source: ScalingSource, x: int, y: int): 218 | """Scale coordinates between the assistant's coordinate system and the real screen coordinates.""" 219 | if not self._scaling_enabled: 220 | return x, y 221 | x_scaling_factor = self.width / self.target_width 222 | y_scaling_factor = self.height / self.target_height 223 | if source == ScalingSource.API: 224 | # Assistant's coordinates -> real screen coordinates 225 | return round(x * x_scaling_factor), round(y * y_scaling_factor) 226 | else: 227 | # Real screen coordinates -> assistant's coordinate system 228 | return round(x / x_scaling_factor), round(y / y_scaling_factor) 229 | -------------------------------------------------------------------------------- /computer_use_demo/loop.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agentic sampling loop that calls the Anthropic API and local implenmentation of anthropic-defined computer use tools. 3 | """ 4 | 5 | import platform 6 | from collections.abc import Callable 7 | from datetime import datetime 8 | from enum import StrEnum 9 | from typing import Any, cast 10 | 11 | from anthropic import Anthropic, AnthropicBedrock, AnthropicVertex, APIResponse 12 | from anthropic.types import ( 13 | ToolResultBlockParam, 14 | ) 15 | from anthropic.types.beta import ( 16 | BetaContentBlock, 17 | BetaContentBlockParam, 18 | BetaImageBlockParam, 19 | BetaMessage, 20 | BetaMessageParam, 21 | BetaTextBlockParam, 22 | BetaToolResultBlockParam, 23 | ) 24 | 25 | from .tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult 26 | 27 | BETA_FLAG = "computer-use-2024-10-22" 28 | 29 | 30 | class APIProvider(StrEnum): 31 | ANTHROPIC = "anthropic" 32 | BEDROCK = "bedrock" 33 | VERTEX = "vertex" 34 | 35 | 36 | PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { 37 | APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022", 38 | APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0", 39 | APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022", 40 | } 41 | 42 | 43 | # This system prompt is optimized for the Docker environment in this repository and 44 | # specific tool combinations enabled. 45 | # We encourage modifying this system prompt to ensure the model has context for the 46 | # environment it is running in, and to provide any additional information that may be 47 | # helpful for the task at hand. 48 | SYSTEM_PROMPT = f""" 49 | * You are utilizing a MacOS computer using {platform.machine()} architecture with internet access. 50 | * You can use the bash tool to execute commands in the terminal. 51 | * To open applications, you can use the `open` command in the bash tool. For example, `open -a Safari` to open the Safari browser. 52 | * When using your bash tool with commands that are expected to output very large quantities of text, redirect the output into a temporary file and use `str_replace_editor` or `grep -n -B -A ` to inspect the output. 53 | * When viewing a page, it can be helpful to zoom out so that you can see everything on the page. Alternatively, ensure you scroll down to see everything before deciding something isn't available. 54 | * When using your computer function calls, they may take a while to run and send back to you. Where possible and feasible, try to chain multiple of these calls into one function call request. 55 | * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}. 56 | 57 | 58 | 59 | * When using Safari or other applications, if any startup wizards or prompts appear, **IGNORE THEM**. Do not interact with them. Instead, click on the address bar or the area where you can enter commands or URLs, and proceed with your task. 60 | * If the item you are looking at is a PDF, and after taking a single screenshot of the PDF it seems you want to read the entire document, instead of trying to continue to read the PDF from your screenshots and navigation, determine the URL, use `curl` to download the PDF, install and use `pdftotext` (you may need to install it via `brew install poppler`) to convert it to a text file, and then read that text file directly with your `str_replace_editor` tool. 61 | """ 62 | 63 | 64 | async def sampling_loop( 65 | *, 66 | model: str, 67 | provider: APIProvider, 68 | system_prompt_suffix: str, 69 | messages: list[BetaMessageParam], 70 | output_callback: Callable[[BetaContentBlock], None], 71 | tool_output_callback: Callable[[ToolResult, str], None], 72 | api_response_callback: Callable[[APIResponse[BetaMessage]], None], 73 | api_key: str, 74 | only_n_most_recent_images: int | None = None, 75 | max_tokens: int = 4096, 76 | ): 77 | """ 78 | Agentic sampling loop for the assistant/tool interaction of computer use. 79 | """ 80 | tool_collection = ToolCollection( 81 | ComputerTool(), 82 | BashTool(), 83 | EditTool(), 84 | ) 85 | system = ( 86 | f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}" 87 | ) 88 | 89 | while True: 90 | if only_n_most_recent_images: 91 | _maybe_filter_to_n_most_recent_images(messages, only_n_most_recent_images) 92 | 93 | if provider == APIProvider.ANTHROPIC: 94 | client = Anthropic(api_key=api_key) 95 | elif provider == APIProvider.VERTEX: 96 | client = AnthropicVertex() 97 | elif provider == APIProvider.BEDROCK: 98 | client = AnthropicBedrock() 99 | 100 | # Call the API 101 | # we use raw_response to provide debug information to streamlit. Your 102 | # implementation may be able call the SDK directly with: 103 | # `response = client.messages.create(...)` instead. 104 | raw_response = client.beta.messages.with_raw_response.create( 105 | max_tokens=max_tokens, 106 | messages=messages, 107 | model=model, 108 | system=system, 109 | tools=tool_collection.to_params(), 110 | betas=["computer-use-2024-10-22"], 111 | ) 112 | 113 | api_response_callback(cast(APIResponse[BetaMessage], raw_response)) 114 | 115 | response = raw_response.parse() 116 | 117 | messages.append( 118 | { 119 | "role": "assistant", 120 | "content": cast(list[BetaContentBlockParam], response.content), 121 | } 122 | ) 123 | 124 | tool_result_content: list[BetaToolResultBlockParam] = [] 125 | for content_block in cast(list[BetaContentBlock], response.content): 126 | output_callback(content_block) 127 | if content_block.type == "tool_use": 128 | result = await tool_collection.run( 129 | name=content_block.name, 130 | tool_input=cast(dict[str, Any], content_block.input), 131 | ) 132 | tool_result_content.append( 133 | _make_api_tool_result(result, content_block.id) 134 | ) 135 | tool_output_callback(result, content_block.id) 136 | 137 | if not tool_result_content: 138 | return messages 139 | 140 | messages.append({"content": tool_result_content, "role": "user"}) 141 | 142 | 143 | def _maybe_filter_to_n_most_recent_images( 144 | messages: list[BetaMessageParam], 145 | images_to_keep: int, 146 | min_removal_threshold: int = 10, 147 | ): 148 | """ 149 | With the assumption that images are screenshots that are of diminishing value as 150 | the conversation progresses, remove all but the final `images_to_keep` tool_result 151 | images in place, with a chunk of min_removal_threshold to reduce the amount we 152 | break the implicit prompt cache. 153 | """ 154 | if images_to_keep is None: 155 | return messages 156 | 157 | tool_result_blocks = cast( 158 | list[ToolResultBlockParam], 159 | [ 160 | item 161 | for message in messages 162 | for item in ( 163 | message["content"] if isinstance(message["content"], list) else [] 164 | ) 165 | if isinstance(item, dict) and item.get("type") == "tool_result" 166 | ], 167 | ) 168 | 169 | total_images = sum( 170 | 1 171 | for tool_result in tool_result_blocks 172 | for content in tool_result.get("content", []) 173 | if isinstance(content, dict) and content.get("type") == "image" 174 | ) 175 | 176 | images_to_remove = total_images - images_to_keep 177 | # for better cache behavior, we want to remove in chunks 178 | images_to_remove -= images_to_remove % min_removal_threshold 179 | 180 | for tool_result in tool_result_blocks: 181 | if isinstance(tool_result.get("content"), list): 182 | new_content = [] 183 | for content in tool_result.get("content", []): 184 | if isinstance(content, dict) and content.get("type") == "image": 185 | if images_to_remove > 0: 186 | images_to_remove -= 1 187 | continue 188 | new_content.append(content) 189 | tool_result["content"] = new_content 190 | 191 | 192 | def _make_api_tool_result( 193 | result: ToolResult, tool_use_id: str 194 | ) -> BetaToolResultBlockParam: 195 | """Convert an agent ToolResult to an API ToolResultBlockParam.""" 196 | tool_result_content: list[BetaTextBlockParam | BetaImageBlockParam] | str = [] 197 | is_error = False 198 | if result.error: 199 | is_error = True 200 | tool_result_content = _maybe_prepend_system_tool_result(result, result.error) 201 | else: 202 | if result.output: 203 | tool_result_content.append( 204 | { 205 | "type": "text", 206 | "text": _maybe_prepend_system_tool_result(result, result.output), 207 | } 208 | ) 209 | if result.base64_image: 210 | tool_result_content.append( 211 | { 212 | "type": "image", 213 | "source": { 214 | "type": "base64", 215 | "media_type": "image/png", 216 | "data": result.base64_image, 217 | }, 218 | } 219 | ) 220 | return { 221 | "type": "tool_result", 222 | "content": tool_result_content, 223 | "tool_use_id": tool_use_id, 224 | "is_error": is_error, 225 | } 226 | 227 | 228 | def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str): 229 | if result.system: 230 | result_text = f"{result.system}\n{result_text}" 231 | return result_text 232 | -------------------------------------------------------------------------------- /computer_use_demo/tools/edit.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from pathlib import Path 3 | from typing import Literal, get_args 4 | 5 | from anthropic.types.beta import BetaToolTextEditor20241022Param 6 | 7 | from .base import BaseAnthropicTool, CLIResult, ToolError, ToolResult 8 | from .run import maybe_truncate, run 9 | 10 | Command = Literal[ 11 | "view", 12 | "create", 13 | "str_replace", 14 | "insert", 15 | "undo_edit", 16 | ] 17 | SNIPPET_LINES: int = 4 18 | 19 | 20 | class EditTool(BaseAnthropicTool): 21 | """ 22 | An filesystem editor tool that allows the agent to view, create, and edit files. 23 | The tool parameters are defined by Anthropic and are not editable. 24 | """ 25 | 26 | api_type: Literal["text_editor_20241022"] = "text_editor_20241022" 27 | name: Literal["str_replace_editor"] = "str_replace_editor" 28 | 29 | _file_history: dict[Path, list[str]] 30 | 31 | def __init__(self): 32 | self._file_history = defaultdict(list) 33 | super().__init__() 34 | 35 | def to_params(self) -> BetaToolTextEditor20241022Param: 36 | return { 37 | "name": self.name, 38 | "type": self.api_type, 39 | } 40 | 41 | async def __call__( 42 | self, 43 | *, 44 | command: Command, 45 | path: str, 46 | file_text: str | None = None, 47 | view_range: list[int] | None = None, 48 | old_str: str | None = None, 49 | new_str: str | None = None, 50 | insert_line: int | None = None, 51 | **kwargs, 52 | ): 53 | _path = Path(path) 54 | self.validate_path(command, _path) 55 | if command == "view": 56 | return await self.view(_path, view_range) 57 | elif command == "create": 58 | if not file_text: 59 | raise ToolError("Parameter `file_text` is required for command: create") 60 | self.write_file(_path, file_text) 61 | self._file_history[_path].append(file_text) 62 | return ToolResult(output=f"File created successfully at: {_path}") 63 | elif command == "str_replace": 64 | if not old_str: 65 | raise ToolError( 66 | "Parameter `old_str` is required for command: str_replace" 67 | ) 68 | return self.str_replace(_path, old_str, new_str) 69 | elif command == "insert": 70 | if insert_line is None: 71 | raise ToolError( 72 | "Parameter `insert_line` is required for command: insert" 73 | ) 74 | if not new_str: 75 | raise ToolError("Parameter `new_str` is required for command: insert") 76 | return self.insert(_path, insert_line, new_str) 77 | elif command == "undo_edit": 78 | return self.undo_edit(_path) 79 | raise ToolError( 80 | f'Unrecognized command {command}. The allowed commands for the {self.name} tool are: {", ".join(get_args(Command))}' 81 | ) 82 | 83 | def validate_path(self, command: str, path: Path): 84 | """ 85 | Check that the path/command combination is valid. 86 | """ 87 | # Check if its an absolute path 88 | if not path.is_absolute(): 89 | suggested_path = Path("") / path 90 | raise ToolError( 91 | f"The path {path} is not an absolute path, it should start with `/`. Maybe you meant {suggested_path}?" 92 | ) 93 | # Check if path exists 94 | if not path.exists() and command != "create": 95 | raise ToolError( 96 | f"The path {path} does not exist. Please provide a valid path." 97 | ) 98 | if path.exists() and command == "create": 99 | raise ToolError( 100 | f"File already exists at: {path}. Cannot overwrite files using command `create`." 101 | ) 102 | # Check if the path points to a directory 103 | if path.is_dir(): 104 | if command != "view": 105 | raise ToolError( 106 | f"The path {path} is a directory and only the `view` command can be used on directories" 107 | ) 108 | 109 | async def view(self, path: Path, view_range: list[int] | None = None): 110 | """Implement the view command""" 111 | if path.is_dir(): 112 | if view_range: 113 | raise ToolError( 114 | "The `view_range` parameter is not allowed when `path` points to a directory." 115 | ) 116 | 117 | _, stdout, stderr = await run( 118 | rf"find {path} -maxdepth 2 -not -path '*/\.*'" 119 | ) 120 | if not stderr: 121 | stdout = f"Here's the files and directories up to 2 levels deep in {path}, excluding hidden items:\n{stdout}\n" 122 | return CLIResult(output=stdout, error=stderr) 123 | 124 | file_content = self.read_file(path) 125 | init_line = 1 126 | if view_range: 127 | if len(view_range) != 2 or not all(isinstance(i, int) for i in view_range): 128 | raise ToolError( 129 | "Invalid `view_range`. It should be a list of two integers." 130 | ) 131 | file_lines = file_content.split("\n") 132 | n_lines_file = len(file_lines) 133 | init_line, final_line = view_range 134 | if init_line < 1 or init_line > n_lines_file: 135 | raise ToolError( 136 | f"Invalid `view_range`: {view_range}. It's first element `{init_line}` should be within the range of lines of the file: {[1, n_lines_file]}" 137 | ) 138 | if final_line > n_lines_file: 139 | raise ToolError( 140 | f"Invalid `view_range`: {view_range}. It's second element `{final_line}` should be smaller than the number of lines in the file: `{n_lines_file}`" 141 | ) 142 | if final_line != -1 and final_line < init_line: 143 | raise ToolError( 144 | f"Invalid `view_range`: {view_range}. It's second element `{final_line}` should be larger or equal than its first `{init_line}`" 145 | ) 146 | 147 | if final_line == -1: 148 | file_content = "\n".join(file_lines[init_line - 1 :]) 149 | else: 150 | file_content = "\n".join(file_lines[init_line - 1 : final_line]) 151 | 152 | return CLIResult( 153 | output=self._make_output(file_content, str(path), init_line=init_line) 154 | ) 155 | 156 | def str_replace(self, path: Path, old_str: str, new_str: str | None): 157 | """Implement the str_replace command, which replaces old_str with new_str in the file content""" 158 | # Read the file content 159 | file_content = self.read_file(path).expandtabs() 160 | old_str = old_str.expandtabs() 161 | new_str = new_str.expandtabs() if new_str is not None else "" 162 | 163 | # Check if old_str is unique in the file 164 | occurrences = file_content.count(old_str) 165 | if occurrences == 0: 166 | raise ToolError( 167 | f"No replacement was performed, old_str `{old_str}` did not appear verbatim in {path}." 168 | ) 169 | elif occurrences > 1: 170 | file_content_lines = file_content.split("\n") 171 | lines = [ 172 | idx + 1 173 | for idx, line in enumerate(file_content_lines) 174 | if old_str in line 175 | ] 176 | raise ToolError( 177 | f"No replacement was performed. Multiple occurrences of old_str `{old_str}` in lines {lines}. Please ensure it is unique" 178 | ) 179 | 180 | # Replace old_str with new_str 181 | new_file_content = file_content.replace(old_str, new_str) 182 | 183 | # Write the new content to the file 184 | self.write_file(path, new_file_content) 185 | 186 | # Save the content to history 187 | self._file_history[path].append(file_content) 188 | 189 | # Create a snippet of the edited section 190 | replacement_line = file_content.split(old_str)[0].count("\n") 191 | start_line = max(0, replacement_line - SNIPPET_LINES) 192 | end_line = replacement_line + SNIPPET_LINES + new_str.count("\n") 193 | snippet = "\n".join(new_file_content.split("\n")[start_line : end_line + 1]) 194 | 195 | # Prepare the success message 196 | success_msg = f"The file {path} has been edited. " 197 | success_msg += self._make_output( 198 | snippet, f"a snippet of {path}", start_line + 1 199 | ) 200 | success_msg += "Review the changes and make sure they are as expected. Edit the file again if necessary." 201 | 202 | return CLIResult(output=success_msg) 203 | 204 | def insert(self, path: Path, insert_line: int, new_str: str): 205 | """Implement the insert command, which inserts new_str at the specified line in the file content.""" 206 | file_text = self.read_file(path).expandtabs() 207 | new_str = new_str.expandtabs() 208 | file_text_lines = file_text.split("\n") 209 | n_lines_file = len(file_text_lines) 210 | 211 | if insert_line < 0 or insert_line > n_lines_file: 212 | raise ToolError( 213 | f"Invalid `insert_line` parameter: {insert_line}. It should be within the range of lines of the file: {[0, n_lines_file]}" 214 | ) 215 | 216 | new_str_lines = new_str.split("\n") 217 | new_file_text_lines = ( 218 | file_text_lines[:insert_line] 219 | + new_str_lines 220 | + file_text_lines[insert_line:] 221 | ) 222 | snippet_lines = ( 223 | file_text_lines[max(0, insert_line - SNIPPET_LINES) : insert_line] 224 | + new_str_lines 225 | + file_text_lines[insert_line : insert_line + SNIPPET_LINES] 226 | ) 227 | 228 | new_file_text = "\n".join(new_file_text_lines) 229 | snippet = "\n".join(snippet_lines) 230 | 231 | self.write_file(path, new_file_text) 232 | self._file_history[path].append(file_text) 233 | 234 | success_msg = f"The file {path} has been edited. " 235 | success_msg += self._make_output( 236 | snippet, 237 | "a snippet of the edited file", 238 | max(1, insert_line - SNIPPET_LINES + 1), 239 | ) 240 | success_msg += "Review the changes and make sure they are as expected (correct indentation, no duplicate lines, etc). Edit the file again if necessary." 241 | return CLIResult(output=success_msg) 242 | 243 | def undo_edit(self, path: Path): 244 | """Implement the undo_edit command.""" 245 | if not self._file_history[path]: 246 | raise ToolError(f"No edit history found for {path}.") 247 | 248 | old_text = self._file_history[path].pop() 249 | self.write_file(path, old_text) 250 | 251 | return CLIResult( 252 | output=f"Last edit to {path} undone successfully. {self._make_output(old_text, str(path))}" 253 | ) 254 | 255 | def read_file(self, path: Path): 256 | """Read the content of a file from a given path; raise a ToolError if an error occurs.""" 257 | try: 258 | return path.read_text() 259 | except Exception as e: 260 | raise ToolError(f"Ran into {e} while trying to read {path}") from None 261 | 262 | def write_file(self, path: Path, file: str): 263 | """Write the content of a file to a given path; raise a ToolError if an error occurs.""" 264 | try: 265 | path.write_text(file) 266 | except Exception as e: 267 | raise ToolError(f"Ran into {e} while trying to write to {path}") from None 268 | 269 | def _make_output( 270 | self, 271 | file_content: str, 272 | file_descriptor: str, 273 | init_line: int = 1, 274 | expand_tabs: bool = True, 275 | ): 276 | """Generate output for the CLI based on the content of a file.""" 277 | file_content = maybe_truncate(file_content) 278 | if expand_tabs: 279 | file_content = file_content.expandtabs() 280 | file_content = "\n".join( 281 | [ 282 | f"{i + init_line:6}\t{line}" 283 | for i, line in enumerate(file_content.split("\n")) 284 | ] 285 | ) 286 | return ( 287 | f"Here's the result of running `cat -n` on {file_descriptor}:\n" 288 | + file_content 289 | + "\n" 290 | ) 291 | --------------------------------------------------------------------------------