├── operate ├── __init__.py ├── models │ ├── __init__.py │ ├── weights │ │ ├── __init__.py │ │ └── best.pt │ ├── prompts.py │ └── apis.py ├── utils │ ├── __init__.py │ ├── style.py │ ├── misc.py │ ├── screenshot.py │ ├── operating_system.py │ ├── ocr.py │ └── label.py ├── exceptions.py ├── main.py ├── operate.py └── config.py ├── requirements-audio.txt ├── readme ├── key.png ├── terminal-access-1.png ├── terminal-access-2.png └── self-operating-computer.png ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── workflows │ └── upload-package.yml └── PULL_REQUEST_TEMPLATE.md ├── setup.py ├── LICENSE ├── requirements.txt ├── CONTRIBUTING.md ├── .gitignore ├── evaluate.py └── README.md /operate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /operate/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /operate/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements-audio.txt: -------------------------------------------------------------------------------- 1 | whisper-mic -------------------------------------------------------------------------------- /operate/models/weights/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /readme/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/readme/key.png -------------------------------------------------------------------------------- /readme/terminal-access-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/readme/terminal-access-1.png -------------------------------------------------------------------------------- /readme/terminal-access-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/readme/terminal-access-2.png -------------------------------------------------------------------------------- /operate/models/weights/best.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/operate/models/weights/best.pt -------------------------------------------------------------------------------- /readme/self-operating-computer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/readme/self-operating-computer.png -------------------------------------------------------------------------------- /operate/exceptions.py: -------------------------------------------------------------------------------- 1 | class ModelNotRecognizedException(Exception): 2 | """Exception raised for unrecognized models. 
3 | 4 | Attributes: 5 | model -- the unrecognized model 6 | message -- explanation of the error 7 | """ 8 | 9 | def __init__(self, model, message="Model not recognized"): 10 | self.model = model 11 | self.message = message 12 | super().__init__(self.message) 13 | 14 | def __str__(self): 15 | return f"{self.message} : {self.model} " -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '[FEATURE] Short Description of the Feature' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Is your feature request related to a problem? Please describe. 11 | 12 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 13 | 14 | ### Describe the solution you'd like 15 | A clear and concise description of what you want to happen. 16 | 17 | ### Describe alternatives you've considered 18 | A clear and concise description of any alternative solutions or features you've considered. 19 | 20 | ### Additional context 21 | Add any other context or screenshots about the feature request here. -------------------------------------------------------------------------------- /.github/workflows/upload-package.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v3 16 | with: 17 | python-version: '3.8' 18 | 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install setuptools wheel twine 23 | 24 | - name: Build and check package 25 | run: | 26 | python setup.py sdist bdist_wheel 27 | twine check dist/* 28 | 29 | - name: Upload to PyPi 30 | uses: pypa/gh-action-pypi-publish@v1.4.2 31 | with: 32 | user: __token__ 33 | password: ${{ secrets.PYPI_API_TOKEN }} 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '[BUG] Brief Description of the Issue' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | Found a bug? Please fill out the sections below. 👍 11 | 12 | 13 | ### Describe the bug 14 | 15 | A clear and concise description of what the bug is. 16 | 17 | ### Steps to Reproduce 18 | 19 | 1. (for ex.) went to... 20 | 2. clicked on this point 21 | 3. not working 22 | 23 | ### Expected Behavior 24 | A brief description of what you expected to happen. 25 | 26 | ### Actual Behavior: 27 | what actually happened. 28 | 29 | ### Environment 30 | - OS: 31 | - Model Used (e.g., GPT-4v, Gemini Pro Vision): 32 | - Framework Version (optional): 33 | 34 | ### Screenshots 35 | If applicable, add screenshots to help explain your problem. 36 | 37 | ### Additional context 38 | Add any other context about the problem here. 
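A note on the `ModelNotRecognizedException` defined in `operate/exceptions.py` above: it is the error the framework surfaces when an unsupported `-m` model name reaches the dispatch layer, and it is caught in `operate/operate.py` further down in this dump. A minimal usage sketch — the `dispatch` helper and the set of model names below are illustrative only; the real routing lives in `operate/models/apis.py`:

```
from operate.exceptions import ModelNotRecognizedException

def dispatch(model):
    # Hypothetical stand-in for the real model routing in operate/models/apis.py
    known = {"gpt-4-with-ocr", "gpt-4-with-som", "gemini-pro-vision", "claude-3", "qwen-vl", "llava"}
    if model not in known:
        raise ModelNotRecognizedException(model)
    return f"routing request to {model}"

try:
    dispatch("not-a-model")
except ModelNotRecognizedException as e:
    print(e)  # prints: Model not recognized : not-a-model
```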
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # Read the contents of your requirements.txt file 4 | with open("requirements.txt") as f: 5 | required = f.read().splitlines() 6 | 7 | # Read the contents of your README.md file for the project description 8 | with open("README.md", "r", encoding="utf-8") as readme_file: 9 | long_description = readme_file.read() 10 | 11 | setup( 12 | name="self-operating-computer", 13 | version="1.5.8", 14 | packages=find_packages(), 15 | install_requires=required, # Add dependencies here 16 | entry_points={ 17 | "console_scripts": [ 18 | "operate=operate.main:main_entry", 19 | ], 20 | }, 21 | package_data={ 22 | # Include the file in the operate.models.weights package 23 | "operate.models.weights": ["best.pt"], 24 | }, 25 | long_description=long_description, # Add project description here 26 | long_description_content_type="text/markdown", # Specify Markdown format 27 | # include any other necessary setup options here 28 | ) 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OthersideAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
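The `console_scripts` entry point declared in `setup.py` above is what makes the `operate` command appear on your PATH after `pip install self-operating-computer`; the wrapper script that pip generates behaves roughly like the stub below (a sketch of the generated launcher, not a file in the repository):

```
# Rough approximation of the console script setuptools generates for `operate`
import sys

from operate.main import main_entry

if __name__ == "__main__":
    sys.exit(main_entry())
```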
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | annotated-types==0.6.0 2 | anyio==3.7.1 3 | certifi==2023.7.22 4 | charset-normalizer==3.3.2 5 | colorama==0.4.6 6 | contourpy==1.2.0 7 | cycler==0.12.1 8 | distro==1.8.0 9 | EasyProcess==1.1 10 | entrypoint2==1.1 11 | exceptiongroup==1.1.3 12 | fonttools==4.44.0 13 | h11==0.14.0 14 | httpcore==1.0.2 15 | httpx>=0.25.2 16 | idna==3.4 17 | importlib-resources==6.1.1 18 | kiwisolver==1.4.5 19 | matplotlib==3.8.1 20 | MouseInfo==0.1.3 21 | mss==9.0.1 22 | numpy==1.26.1 23 | openai==1.2.3 24 | packaging==23.2 25 | Pillow==10.1.0 26 | prompt-toolkit==3.0.39 27 | PyAutoGUI==0.9.54 28 | pydantic==2.4.2 29 | pydantic_core==2.10.1 30 | PyGetWindow==0.0.9 31 | PyMsgBox==1.0.9 32 | pyparsing==3.1.1 33 | pyperclip==1.8.2 34 | PyRect==0.2.0 35 | pyscreenshot==3.1 36 | PyScreeze==0.1.29 37 | python3-xlib==0.15 38 | python-dateutil==2.8.2 39 | python-dotenv==1.0.0 40 | pytweening==1.0.7 41 | requests==2.31.0 42 | rubicon-objc==0.4.7 43 | six==1.16.0 44 | sniffio==1.3.0 45 | tqdm==4.66.1 46 | typing_extensions==4.8.0 47 | urllib3==2.0.7 48 | wcwidth==0.2.9 49 | zipp==3.17.0 50 | google-generativeai==0.3.0 51 | aiohttp==3.9.1 52 | ultralytics==8.0.227 53 | easyocr==1.7.1 54 | ollama==0.1.6 55 | anthropic -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## What does this PR do? 2 | 3 | 4 | 5 | Fixes # (issue) 6 | 7 | ## Requirement/Documentation 8 | 9 | 10 | 11 | - If there is a requirement document, please, share it here. 12 | 13 | ## Type of change 14 | 15 | 16 | 17 | - [ ] Bug fix (non-breaking change which fixes an issue) 18 | - [ ] Chore (refactoring code, technical debt, workflow improvements) 19 | - [ ] New feature (non-breaking change which adds functionality) 20 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 21 | - [ ] Tests (Unit/Integration/E2E or any other test) 22 | - [ ] This change requires a documentation update 23 | 24 | 25 | ## Mandatory Tasks 26 | 27 | - [ ] Make sure you have self-reviewed the code. A decent size PR without self-review might be rejected. 
Make sure before submmiting this PR you run tests with evaluate.py 28 | -------------------------------------------------------------------------------- /operate/utils/style.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import platform 3 | import os 4 | from prompt_toolkit.styles import Style as PromptStyle 5 | 6 | 7 | # Define style 8 | style = PromptStyle.from_dict( 9 | { 10 | "dialog": "bg:#88ff88", 11 | "button": "bg:#ffffff #000000", 12 | "dialog.body": "bg:#44cc44 #ffffff", 13 | "dialog shadow": "bg:#003800", 14 | } 15 | ) 16 | 17 | 18 | # Check if on a windows terminal that supports ANSI escape codes 19 | def supports_ansi(): 20 | """ 21 | Check if the terminal supports ANSI escape codes 22 | """ 23 | plat = platform.system() 24 | supported_platform = plat != "Windows" or "ANSICON" in os.environ 25 | is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() 26 | return supported_platform and is_a_tty 27 | 28 | 29 | # Define ANSI color codes 30 | ANSI_GREEN = "\033[32m" if supports_ansi() else "" # Standard green text 31 | ANSI_BRIGHT_GREEN = "\033[92m" if supports_ansi() else "" # Bright/bold green text 32 | ANSI_RESET = "\033[0m" if supports_ansi() else "" # Reset to default text color 33 | ANSI_BLUE = "\033[94m" if supports_ansi() else "" # Bright blue 34 | ANSI_YELLOW = "\033[33m" if supports_ansi() else "" # Standard yellow text 35 | ANSI_RED = "\033[31m" if supports_ansi() else "" 36 | ANSI_BRIGHT_MAGENTA = "\033[95m" if supports_ansi() else "" # Bright magenta text 37 | -------------------------------------------------------------------------------- /operate/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Self-Operating Computer 3 | """ 4 | import argparse 5 | from operate.utils.style import ANSI_BRIGHT_MAGENTA 6 | from operate.operate import main 7 | 8 | 9 | def main_entry(): 10 | parser = argparse.ArgumentParser( 11 | description="Run the self-operating-computer with a specified model." 
12 | ) 13 | parser.add_argument( 14 | "-m", 15 | "--model", 16 | help="Specify the model to use", 17 | required=False, 18 | default="gpt-4-with-ocr", 19 | ) 20 | 21 | # Add a voice flag 22 | parser.add_argument( 23 | "--voice", 24 | help="Use voice input mode", 25 | action="store_true", 26 | ) 27 | 28 | # Add a flag for verbose mode 29 | parser.add_argument( 30 | "--verbose", 31 | help="Run operate in verbose mode", 32 | action="store_true", 33 | ) 34 | 35 | # Allow for direct input of prompt 36 | parser.add_argument( 37 | "--prompt", 38 | help="Directly input the objective prompt", 39 | type=str, 40 | required=False, 41 | ) 42 | 43 | try: 44 | args = parser.parse_args() 45 | main( 46 | args.model, 47 | terminal_prompt=args.prompt, 48 | voice_mode=args.voice, 49 | verbose_mode=args.verbose 50 | ) 51 | except KeyboardInterrupt: 52 | print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...") 53 | 54 | 55 | if __name__ == "__main__": 56 | main_entry() 57 | -------------------------------------------------------------------------------- /operate/utils/misc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | 5 | def convert_percent_to_decimal(percent): 6 | try: 7 | # Remove the '%' sign and convert to float 8 | decimal_value = float(percent) 9 | 10 | # Convert to decimal (e.g., 20% -> 0.20) 11 | return decimal_value 12 | except ValueError as e: 13 | print(f"[convert_percent_to_decimal] error: {e}") 14 | return None 15 | 16 | 17 | def parse_operations(response): 18 | if response == "DONE": 19 | return {"type": "DONE", "data": None} 20 | elif response.startswith("CLICK"): 21 | # Adjust the regex to match the correct format 22 | click_data = re.search(r"CLICK \{ (.+) \}", response).group(1) 23 | click_data_json = json.loads(f"{{{click_data}}}") 24 | return {"type": "CLICK", "data": click_data_json} 25 | 26 | elif response.startswith("TYPE"): 27 | # Extract the text to type 28 | try: 29 | type_data = re.search(r"TYPE (.+)", response, re.DOTALL).group(1) 30 | except: 31 | type_data = re.search(r'TYPE "(.+)"', response, re.DOTALL).group(1) 32 | return {"type": "TYPE", "data": type_data} 33 | 34 | elif response.startswith("SEARCH"): 35 | # Extract the search query 36 | try: 37 | search_data = re.search(r'SEARCH "(.+)"', response).group(1) 38 | except: 39 | search_data = re.search(r"SEARCH (.+)", response).group(1) 40 | return {"type": "SEARCH", "data": search_data} 41 | 42 | return {"type": "UNKNOWN", "data": response} 43 | -------------------------------------------------------------------------------- /operate/utils/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | import pyautogui 5 | from PIL import Image, ImageDraw, ImageGrab 6 | import Xlib.display 7 | import Xlib.X 8 | import Xlib.Xutil # not sure if Xutil is necessary 9 | 10 | 11 | def capture_screen_with_cursor(file_path): 12 | user_platform = platform.system() 13 | 14 | if user_platform == "Windows": 15 | screenshot = pyautogui.screenshot() 16 | screenshot.save(file_path) 17 | elif user_platform == "Linux": 18 | # Use xlib to prevent scrot dependency for Linux 19 | screen = Xlib.display.Display().screen() 20 | size = screen.width_in_pixels, screen.height_in_pixels 21 | screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1])) 22 | screenshot.save(file_path) 23 | elif user_platform == "Darwin": # (Mac OS) 24 | # Use the screencapture utility to capture the screen with the cursor 25 
| subprocess.run(["screencapture", "-C", file_path]) 26 | else: 27 | print(f"The platform you're using ({user_platform}) is not currently supported") 28 | 29 | 30 | def compress_screenshot(raw_screenshot_filename, screenshot_filename): 31 | with Image.open(raw_screenshot_filename) as img: 32 | # Check if the image has an alpha channel (transparency) 33 | if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info): 34 | # Create a white background image 35 | background = Image.new('RGB', img.size, (255, 255, 255)) 36 | # Paste the image onto the background, using the alpha channel as mask 37 | background.paste(img, mask=img.split()[3]) # 3 is the alpha channel 38 | # Save the result as JPEG 39 | background.save(screenshot_filename, 'JPEG', quality=85) # Adjust quality as needed 40 | else: 41 | # If no alpha channel, simply convert and save 42 | img.convert('RGB').save(screenshot_filename, 'JPEG', quality=85) 43 | -------------------------------------------------------------------------------- /operate/utils/operating_system.py: -------------------------------------------------------------------------------- 1 | import pyautogui 2 | import platform 3 | import time 4 | import math 5 | 6 | from operate.utils.misc import convert_percent_to_decimal 7 | 8 | 9 | class OperatingSystem: 10 | def write(self, content): 11 | try: 12 | content = content.replace("\\n", "\n") 13 | for char in content: 14 | pyautogui.write(char) 15 | except Exception as e: 16 | print("[OperatingSystem][write] error:", e) 17 | 18 | def press(self, keys): 19 | try: 20 | for key in keys: 21 | pyautogui.keyDown(key) 22 | time.sleep(0.1) 23 | for key in keys: 24 | pyautogui.keyUp(key) 25 | except Exception as e: 26 | print("[OperatingSystem][press] error:", e) 27 | 28 | def mouse(self, click_detail): 29 | try: 30 | x = convert_percent_to_decimal(click_detail.get("x")) 31 | y = convert_percent_to_decimal(click_detail.get("y")) 32 | 33 | if click_detail and isinstance(x, float) and isinstance(y, float): 34 | self.click_at_percentage(x, y) 35 | 36 | except Exception as e: 37 | print("[OperatingSystem][mouse] error:", e) 38 | 39 | def click_at_percentage( 40 | self, 41 | x_percentage, 42 | y_percentage, 43 | duration=0.2, 44 | circle_radius=50, 45 | circle_duration=0.5, 46 | ): 47 | try: 48 | screen_width, screen_height = pyautogui.size() 49 | x_pixel = int(screen_width * float(x_percentage)) 50 | y_pixel = int(screen_height * float(y_percentage)) 51 | 52 | pyautogui.moveTo(x_pixel, y_pixel, duration=duration) 53 | 54 | start_time = time.time() 55 | while time.time() - start_time < circle_duration: 56 | angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi 57 | x = x_pixel + math.cos(angle) * circle_radius 58 | y = y_pixel + math.sin(angle) * circle_radius 59 | pyautogui.moveTo(x, y, duration=0.1) 60 | 61 | pyautogui.click(x_pixel, y_pixel) 62 | except Exception as e: 63 | print("[OperatingSystem][click_at_percentage] error:", e) 64 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | We appreciate your contributions! 3 | 4 | ## Process 5 | 1. Fork it 6 | 2. Create your feature branch (`git checkout -b my-new-feature`) 7 | 3. Commit your changes (`git commit -am 'Add some feature'`) 8 | 4. Push to the branch (`git push origin my-new-feature`) 9 | 5. Create new Pull Request 10 | 11 | ## Modifying and Running Code 12 | 1. 
Make changes in `operate/main.py` 13 | 2. Run `pip install .` again 14 | 3. Run `operate` to see your changes 15 | 16 | ## Testing Changes 17 | **After making significant changes, it's important to verify that SOC can still successfully perform a set of common test cases.** 18 | In the root directory of the project, run: 19 | ``` 20 | python3 evaluate.py 21 | ``` 22 | This will automatically prompt `operate` to perform several simple objectives. 23 | Upon completion of each objective, GPT-4v will give an evaluation and determine if the objective was successfully reached. 24 | 25 | `evaluate.py` will print out if each test case `[PASSED]` or `[FAILED]`. In addition, a justification will be given on why the pass/fail was given. 26 | 27 | It is recommended that a screenshot of the `evaluate.py` output is included in any PR which could impact the performance of SOC. 28 | 29 | ## Contribution Ideas 30 | - **Improve performance by finding optimal screenshot grid**: A primary element of the framework is that it overlays a percentage grid on the screenshot which GPT-4v uses to estimate click locations. If someone is able to find the optimal grid and some evaluation metrics to confirm it is an improvement on the current method then we will merge that PR. 31 | - **Improve the `SUMMARY_PROMPT`** 32 | - **Improve Linux and Windows compatibility**: There are still some issues with Linux and Windows compatibility. PRs to fix the issues are encouraged. 33 | - **Adding New Multimodal Models**: Integration of new multimodal models is welcomed. If you have a specific model in mind that you believe would be a valuable addition, please feel free to integrate it and submit a PR. 34 | - **Iterate `--accurate` flag functionality**: Look at https://github.com/OthersideAI/self-operating-computer/pull/57 for previous iteration 35 | - **Enhanced Security**: A feature request to implement a _robust security feature_ that prompts users for _confirmation before executing potentially harmful actions_. This feature aims to _prevent unintended actions_ and _safeguard user data_ as mentioned here in this [OtherSide#25](https://github.com/OthersideAI/self-operating-computer/issues/25) 36 | 37 | 38 | ## Guidelines 39 | This will primarily be a [Software 2.0](https://karpathy.medium.com/software-2-0-a64152b37c35) project. For this reason: 40 | 41 | - Let's try to hold off refactors into separate files until `main.py` is more than 1000 lines 42 | 43 | -------------------------------------------------------------------------------- /operate/utils/ocr.py: -------------------------------------------------------------------------------- 1 | from operate.config import Config 2 | from PIL import Image, ImageDraw 3 | import os 4 | from datetime import datetime 5 | 6 | # Load configuration 7 | config = Config() 8 | 9 | 10 | def get_text_element(result, search_text, image_path): 11 | """ 12 | Searches for a text element in the OCR results and returns its index. Also draws bounding boxes on the image. 13 | Args: 14 | result (list): The list of results returned by EasyOCR. 15 | search_text (str): The text to search for in the OCR results. 16 | image_path (str): Path to the original image. 17 | 18 | Returns: 19 | int: The index of the element containing the search text. 20 | 21 | Raises: 22 | Exception: If the text element is not found in the results. 
23 | """ 24 | if config.verbose: 25 | print("[get_text_element]") 26 | print("[get_text_element] search_text", search_text) 27 | # Create /ocr directory if it doesn't exist 28 | ocr_dir = "ocr" 29 | if not os.path.exists(ocr_dir): 30 | os.makedirs(ocr_dir) 31 | 32 | # Open the original image 33 | image = Image.open(image_path) 34 | draw = ImageDraw.Draw(image) 35 | 36 | found_index = None 37 | for index, element in enumerate(result): 38 | text = element[1] 39 | box = element[0] 40 | 41 | if config.verbose: 42 | # Draw bounding box in blue 43 | draw.polygon([tuple(point) for point in box], outline="blue") 44 | 45 | if search_text in text: 46 | found_index = index 47 | if config.verbose: 48 | print("[get_text_element][loop] found search_text, index:", index) 49 | 50 | if found_index is not None: 51 | if config.verbose: 52 | # Draw bounding box of the found text in red 53 | box = result[found_index][0] 54 | draw.polygon([tuple(point) for point in box], outline="red") 55 | # Save the image with bounding boxes 56 | datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") 57 | ocr_image_path = os.path.join(ocr_dir, f"ocr_image_{datetime_str}.png") 58 | image.save(ocr_image_path) 59 | print("[get_text_element] OCR image saved at:", ocr_image_path) 60 | 61 | return found_index 62 | 63 | raise Exception("The text element was not found in the image") 64 | 65 | 66 | def get_text_coordinates(result, index, image_path): 67 | """ 68 | Gets the coordinates of the text element at the specified index as a percentage of screen width and height. 69 | Args: 70 | result (list): The list of results returned by EasyOCR. 71 | index (int): The index of the text element in the results list. 72 | image_path (str): Path to the screenshot image. 73 | 74 | Returns: 75 | dict: A dictionary containing the 'x' and 'y' coordinates as percentages of the screen width and height. 76 | """ 77 | if index >= len(result): 78 | raise Exception("Index out of range in OCR results") 79 | 80 | # Get the bounding box of the text element 81 | bounding_box = result[index][0] 82 | 83 | # Calculate the center of the bounding box 84 | min_x = min([coord[0] for coord in bounding_box]) 85 | max_x = max([coord[0] for coord in bounding_box]) 86 | min_y = min([coord[1] for coord in bounding_box]) 87 | max_y = max([coord[1] for coord in bounding_box]) 88 | 89 | center_x = (min_x + max_x) / 2 90 | center_y = (min_y + max_y) / 2 91 | 92 | # Get image dimensions 93 | with Image.open(image_path) as img: 94 | width, height = img.size 95 | 96 | # Convert to percentages 97 | percent_x = round((center_x / width), 3) 98 | percent_y = round((center_y / height), 3) 99 | 100 | return {"x": percent_x, "y": percent_y} 101 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | .DS_Store 163 | 164 | # Avoid sending testing screenshots up 165 | *.png 166 | operate/screenshots/ 167 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import subprocess 4 | import platform 5 | import base64 6 | import json 7 | import openai 8 | import argparse 9 | 10 | from dotenv import load_dotenv 11 | 12 | # "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v" 13 | TEST_CASES = { 14 | "Go to Github.com": "A Github page is visible.", 15 | "Go to Youtube.com and play a video": "The YouTube video player is visible.", 16 | } 17 | 18 | EVALUATION_PROMPT = """ 19 | Your job is to look at the given screenshot and determine if the following guideline is met in the image. 20 | You must respond in the following format ONLY. Do not add anything else: 21 | {{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }} 22 | guideline_met must be set to a JSON boolean. True if the image meets the given guideline. 23 | reason must be a string containing a justification for your decision. 24 | 25 | Guideline: {guideline} 26 | """ 27 | 28 | SCREENSHOT_PATH = os.path.join("screenshots", "screenshot.png") 29 | 30 | 31 | # Check if on a windows terminal that supports ANSI escape codes 32 | def supports_ansi(): 33 | """ 34 | Check if the terminal supports ANSI escape codes 35 | """ 36 | plat = platform.system() 37 | supported_platform = plat != "Windows" or "ANSICON" in os.environ 38 | is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() 39 | return supported_platform and is_a_tty 40 | 41 | 42 | if supports_ansi(): 43 | # Standard green text 44 | ANSI_GREEN = "\033[32m" 45 | # Bright/bold green text 46 | ANSI_BRIGHT_GREEN = "\033[92m" 47 | # Reset to default text color 48 | ANSI_RESET = "\033[0m" 49 | # ANSI escape code for blue text 50 | ANSI_BLUE = "\033[94m" # This is for bright blue 51 | 52 | # Standard yellow text 53 | ANSI_YELLOW = "\033[33m" 54 | 55 | ANSI_RED = "\033[31m" 56 | 57 | # Bright magenta text 58 | ANSI_BRIGHT_MAGENTA = "\033[95m" 59 | else: 60 | ANSI_GREEN = "" 61 | ANSI_BRIGHT_GREEN = "" 62 | ANSI_RESET = "" 63 | ANSI_BLUE = "" 64 | ANSI_YELLOW = "" 65 | ANSI_RED = "" 66 | ANSI_BRIGHT_MAGENTA = "" 67 | 68 | 69 | def format_evaluation_prompt(guideline): 70 | prompt = EVALUATION_PROMPT.format(guideline=guideline) 71 | return prompt 72 | 73 | 74 | def parse_eval_content(content): 75 | try: 76 | res = json.loads(content) 77 | 78 | print(res["reason"]) 79 | 80 | return res["guideline_met"] 81 | except: 82 | print( 83 | "The model gave a bad evaluation response and it couldn't be parsed. Exiting..." 
84 | ) 85 | exit(1) 86 | 87 | 88 | def evaluate_final_screenshot(guideline): 89 | """Load the final screenshot and return True or False if it meets the given guideline.""" 90 | with open(SCREENSHOT_PATH, "rb") as img_file: 91 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 92 | 93 | eval_message = [ 94 | { 95 | "role": "user", 96 | "content": [ 97 | {"type": "text", "text": format_evaluation_prompt(guideline)}, 98 | { 99 | "type": "image_url", 100 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 101 | }, 102 | ], 103 | } 104 | ] 105 | 106 | response = openai.chat.completions.create( 107 | model="gpt-4o", 108 | messages=eval_message, 109 | presence_penalty=1, 110 | frequency_penalty=1, 111 | temperature=0.7, 112 | ) 113 | 114 | eval_content = response.choices[0].message.content 115 | 116 | return parse_eval_content(eval_content) 117 | 118 | 119 | def run_test_case(objective, guideline, model): 120 | """Returns True if the result of the test with the given prompt meets the given guideline for the given model.""" 121 | # Run `operate` with the model to evaluate and the test case prompt 122 | subprocess.run( 123 | ["operate", "-m", model, "--prompt", f'"{objective}"'], 124 | stdout=subprocess.DEVNULL, 125 | ) 126 | 127 | try: 128 | result = evaluate_final_screenshot(guideline) 129 | except OSError: 130 | print("[Error] Couldn't open the screenshot for evaluation") 131 | return False 132 | 133 | return result 134 | 135 | 136 | def get_test_model(): 137 | parser = argparse.ArgumentParser( 138 | description="Run the self-operating-computer with a specified model." 139 | ) 140 | 141 | parser.add_argument( 142 | "-m", 143 | "--model", 144 | help="Specify the model to evaluate.", 145 | required=False, 146 | default="gpt-4-with-ocr", 147 | ) 148 | 149 | return parser.parse_args().model 150 | 151 | 152 | def main(): 153 | load_dotenv() 154 | openai.api_key = os.getenv("OPENAI_API_KEY") 155 | 156 | model = get_test_model() 157 | 158 | print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}") 159 | print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}") 160 | 161 | passed = 0 162 | failed = 0 163 | for objective, guideline in TEST_CASES.items(): 164 | print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'") 165 | 166 | result = run_test_case(objective, guideline, model) 167 | if result: 168 | print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'") 169 | passed += 1 170 | else: 171 | print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'") 172 | failed += 1 173 | 174 | print( 175 | f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed" 176 | ) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | -------------------------------------------------------------------------------- /operate/utils/label.py: -------------------------------------------------------------------------------- 1 | import io 2 | import base64 3 | import json 4 | import os 5 | import time 6 | import asyncio 7 | from PIL import Image, ImageDraw 8 | 9 | 10 | def validate_and_extract_image_data(data): 11 | if not data or "messages" not in data: 12 | raise ValueError("Invalid request, no messages found") 13 | 14 | messages = data["messages"] 15 | if ( 16 | not messages 17 | or not isinstance(messages, list) 18 | or not messages[-1].get("image_url") 19 | ): 20 | raise ValueError("No image provided or incorrect format") 21 | 22 | image_data = messages[-1]["image_url"]["url"] 23 | if not 
image_data.startswith("data:image"): 24 | raise ValueError("Invalid image format") 25 | 26 | return image_data.split("base64,")[-1], messages 27 | 28 | 29 | def get_label_coordinates(label, label_coordinates): 30 | """ 31 | Retrieves the coordinates for a given label. 32 | 33 | :param label: The label to find coordinates for (e.g., "~1"). 34 | :param label_coordinates: Dictionary containing labels and their coordinates. 35 | :return: Coordinates of the label or None if the label is not found. 36 | """ 37 | return label_coordinates.get(label) 38 | 39 | 40 | def is_overlapping(box1, box2): 41 | x1_box1, y1_box1, x2_box1, y2_box1 = box1 42 | x1_box2, y1_box2, x2_box2, y2_box2 = box2 43 | 44 | # Check if there is no overlap 45 | if x1_box1 > x2_box2 or x1_box2 > x2_box1: 46 | return False 47 | if ( 48 | y1_box1 > y2_box2 or y1_box2 > y2_box1 49 | ): # Adjusted to check 100px proximity above 50 | return False 51 | 52 | return True 53 | 54 | 55 | def add_labels(base64_data, yolo_model): 56 | image_bytes = base64.b64decode(base64_data) 57 | image_labeled = Image.open(io.BytesIO(image_bytes)) # Corrected this line 58 | image_debug = image_labeled.copy() # Create a copy for the debug image 59 | image_original = ( 60 | image_labeled.copy() 61 | ) # Copy of the original image for base64 return 62 | 63 | results = yolo_model(image_labeled) 64 | 65 | draw = ImageDraw.Draw(image_labeled) 66 | debug_draw = ImageDraw.Draw( 67 | image_debug 68 | ) # Create a separate draw object for the debug image 69 | font_size = 45 70 | 71 | labeled_images_dir = "labeled_images" 72 | label_coordinates = {} # Dictionary to store coordinates 73 | 74 | if not os.path.exists(labeled_images_dir): 75 | os.makedirs(labeled_images_dir) 76 | 77 | counter = 0 78 | drawn_boxes = [] # List to keep track of boxes already drawn 79 | for result in results: 80 | if hasattr(result, "boxes"): 81 | for det in result.boxes: 82 | bbox = det.xyxy[0] 83 | x1, y1, x2, y2 = bbox.tolist() 84 | 85 | debug_label = "D_" + str(counter) 86 | debug_index_position = (x1, y1 - font_size) 87 | debug_draw.rectangle([(x1, y1), (x2, y2)], outline="blue", width=1) 88 | debug_draw.text( 89 | debug_index_position, 90 | debug_label, 91 | fill="blue", 92 | font_size=font_size, 93 | ) 94 | 95 | overlap = any( 96 | is_overlapping((x1, y1, x2, y2), box) for box in drawn_boxes 97 | ) 98 | 99 | if not overlap: 100 | draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=1) 101 | label = "~" + str(counter) 102 | index_position = (x1, y1 - font_size) 103 | draw.text( 104 | index_position, 105 | label, 106 | fill="red", 107 | font_size=font_size, 108 | ) 109 | 110 | # Add the non-overlapping box to the drawn_boxes list 111 | drawn_boxes.append((x1, y1, x2, y2)) 112 | label_coordinates[label] = (x1, y1, x2, y2) 113 | 114 | counter += 1 115 | 116 | # Save the image 117 | timestamp = time.strftime("%Y%m%d-%H%M%S") 118 | 119 | output_path = os.path.join(labeled_images_dir, f"img_{timestamp}_labeled.png") 120 | output_path_debug = os.path.join(labeled_images_dir, f"img_{timestamp}_debug.png") 121 | output_path_original = os.path.join( 122 | labeled_images_dir, f"img_{timestamp}_original.png" 123 | ) 124 | 125 | image_labeled.save(output_path) 126 | image_debug.save(output_path_debug) 127 | image_original.save(output_path_original) 128 | 129 | buffered_original = io.BytesIO() 130 | image_original.save(buffered_original, format="PNG") # I guess this is needed 131 | img_base64_original = base64.b64encode(buffered_original.getvalue()).decode("utf-8") 132 | 133 | # Convert 
image to base64 for return 134 | buffered_labeled = io.BytesIO() 135 | image_labeled.save(buffered_labeled, format="PNG") # I guess this is needed 136 | img_base64_labeled = base64.b64encode(buffered_labeled.getvalue()).decode("utf-8") 137 | 138 | return img_base64_labeled, label_coordinates 139 | 140 | 141 | def get_click_position_in_percent(coordinates, image_size): 142 | """ 143 | Calculates the click position at the center of the bounding box and converts it to percentages. 144 | 145 | :param coordinates: A tuple of the bounding box coordinates (x1, y1, x2, y2). 146 | :param image_size: A tuple of the image dimensions (width, height). 147 | :return: A tuple of the click position in percentages (x_percent, y_percent). 148 | """ 149 | if not coordinates or not image_size: 150 | return None 151 | 152 | # Calculate the center of the bounding box 153 | x_center = (coordinates[0] + coordinates[2]) / 2 154 | y_center = (coordinates[1] + coordinates[3]) / 2 155 | 156 | # Convert to percentages 157 | x_percent = x_center / image_size[0] 158 | y_percent = y_center / image_size[1] 159 | 160 | return x_percent, y_percent 161 | -------------------------------------------------------------------------------- /operate/operate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import asyncio 5 | from prompt_toolkit.shortcuts import message_dialog 6 | from prompt_toolkit import prompt 7 | from operate.exceptions import ModelNotRecognizedException 8 | import platform 9 | 10 | # from operate.models.prompts import USER_QUESTION, get_system_prompt 11 | from operate.models.prompts import ( 12 | USER_QUESTION, 13 | get_system_prompt, 14 | ) 15 | from operate.config import Config 16 | from operate.utils.style import ( 17 | ANSI_GREEN, 18 | ANSI_RESET, 19 | ANSI_YELLOW, 20 | ANSI_RED, 21 | ANSI_BRIGHT_MAGENTA, 22 | ANSI_BLUE, 23 | style, 24 | ) 25 | from operate.utils.operating_system import OperatingSystem 26 | from operate.models.apis import get_next_action 27 | 28 | # Load configuration 29 | config = Config() 30 | operating_system = OperatingSystem() 31 | 32 | 33 | def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): 34 | """ 35 | Main function for the Self-Operating Computer. 36 | 37 | Parameters: 38 | - model: The model used for generating responses. 39 | - terminal_prompt: A string representing the prompt provided in the terminal. 40 | - voice_mode: A boolean indicating whether to enable voice mode. 41 | 42 | Returns: 43 | None 44 | """ 45 | 46 | mic = None 47 | # Initialize `WhisperMic`, if `voice_mode` is True 48 | 49 | config.verbose = verbose_mode 50 | config.validation(model, voice_mode) 51 | 52 | if voice_mode: 53 | try: 54 | from whisper_mic import WhisperMic 55 | 56 | # Initialize WhisperMic if import is successful 57 | mic = WhisperMic() 58 | except ImportError: 59 | print( 60 | "Voice mode requires the 'whisper_mic' module. 
Please install it using 'pip install -r requirements-audio.txt'" 61 | ) 62 | sys.exit(1) 63 | 64 | # Skip message dialog if prompt was given directly 65 | if not terminal_prompt: 66 | message_dialog( 67 | title="Self-Operating Computer", 68 | text="An experimental framework to enable multimodal models to operate computers", 69 | style=style, 70 | ).run() 71 | 72 | else: 73 | print("Running direct prompt...") 74 | 75 | # # Clear the console 76 | if platform.system() == "Windows": 77 | os.system("cls") 78 | else: 79 | print("\033c", end="") 80 | 81 | if terminal_prompt: # Skip objective prompt if it was given as an argument 82 | objective = terminal_prompt 83 | elif voice_mode: 84 | print( 85 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)" 86 | ) 87 | try: 88 | objective = mic.listen() 89 | except Exception as e: 90 | print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}") 91 | return # Exit if voice input fails 92 | else: 93 | print( 94 | f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]\n{USER_QUESTION}" 95 | ) 96 | print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") 97 | objective = prompt(style=style) 98 | 99 | system_prompt = get_system_prompt(model, objective) 100 | system_message = {"role": "system", "content": system_prompt} 101 | messages = [system_message] 102 | 103 | loop_count = 0 104 | 105 | session_id = None 106 | 107 | while True: 108 | if config.verbose: 109 | print("[Self Operating Computer] loop_count", loop_count) 110 | try: 111 | operations, session_id = asyncio.run( 112 | get_next_action(model, messages, objective, session_id) 113 | ) 114 | 115 | stop = operate(operations, model) 116 | if stop: 117 | break 118 | 119 | loop_count += 1 120 | if loop_count > 10: 121 | break 122 | except ModelNotRecognizedException as e: 123 | print( 124 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" 125 | ) 126 | break 127 | except Exception as e: 128 | print( 129 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" 130 | ) 131 | break 132 | 133 | 134 | def operate(operations, model): 135 | if config.verbose: 136 | print("[Self Operating Computer][operate]") 137 | for operation in operations: 138 | if config.verbose: 139 | print("[Self Operating Computer][operate] operation", operation) 140 | # wait one second 141 | time.sleep(1) 142 | operate_type = operation.get("operation").lower() 143 | operate_thought = operation.get("thought") 144 | operate_detail = "" 145 | if config.verbose: 146 | print("[Self Operating Computer][operate] operate_type", operate_type) 147 | 148 | if operate_type == "press" or operate_type == "hotkey": 149 | keys = operation.get("keys") 150 | operate_detail = keys 151 | operating_system.press(keys) 152 | elif operate_type == "write": 153 | content = operation.get("content") 154 | operate_detail = content 155 | operating_system.write(content) 156 | elif operate_type == "click": 157 | x = operation.get("x") 158 | y = operation.get("y") 159 | click_detail = {"x": x, "y": y} 160 | operate_detail = click_detail 161 | 162 | operating_system.mouse(click_detail) 163 | elif operate_type == "done": 164 | summary = operation.get("summary") 165 | 166 | print( 167 | f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" 168 | ) 169 | print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") 170 | return True 171 | 172 | else: 173 | print( 174 | f"{ANSI_GREEN}[Self-Operating 
Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" 175 | ) 176 | print( 177 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response {ANSI_RESET}{operation}" 178 | ) 179 | return True 180 | 181 | print( 182 | f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" 183 | ) 184 | print(f"{operate_thought}") 185 | print(f"{ANSI_BLUE}Action: {ANSI_RESET}{operate_type} {operate_detail}\n") 186 | 187 | return False 188 | -------------------------------------------------------------------------------- /operate/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import google.generativeai as genai 5 | from dotenv import load_dotenv 6 | from ollama import Client 7 | from openai import OpenAI 8 | import anthropic 9 | from prompt_toolkit.shortcuts import input_dialog 10 | 11 | 12 | class Config: 13 | """ 14 | Configuration class for managing settings. 15 | 16 | Attributes: 17 | verbose (bool): Flag indicating whether verbose mode is enabled. 18 | openai_api_key (str): API key for OpenAI. 19 | google_api_key (str): API key for Google. 20 | ollama_host (str): url to ollama running remotely. 21 | """ 22 | 23 | _instance = None 24 | 25 | def __new__(cls): 26 | if cls._instance is None: 27 | cls._instance = super(Config, cls).__new__(cls) 28 | # Put any initialization here 29 | return cls._instance 30 | 31 | def __init__(self): 32 | load_dotenv() 33 | self.verbose = False 34 | self.openai_api_key = ( 35 | None # instance variables are backups in case saving to a `.env` fails 36 | ) 37 | self.google_api_key = ( 38 | None # instance variables are backups in case saving to a `.env` fails 39 | ) 40 | self.ollama_host = ( 41 | None # instance variables are backups in case savint to a `.env` fails 42 | ) 43 | self.anthropic_api_key = ( 44 | None # instance variables are backups in case saving to a `.env` fails 45 | ) 46 | self.qwen_api_key = ( 47 | None # instance variables are backups in case saving to a `.env` fails 48 | ) 49 | 50 | def initialize_openai(self): 51 | if self.verbose: 52 | print("[Config][initialize_openai]") 53 | 54 | if self.openai_api_key: 55 | if self.verbose: 56 | print("[Config][initialize_openai] using cached openai_api_key") 57 | api_key = self.openai_api_key 58 | else: 59 | if self.verbose: 60 | print( 61 | "[Config][initialize_openai] no cached openai_api_key, try to get from env." 62 | ) 63 | api_key = os.getenv("OPENAI_API_KEY") 64 | 65 | client = OpenAI( 66 | api_key=api_key, 67 | ) 68 | client.api_key = api_key 69 | client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) 70 | return client 71 | 72 | def initialize_qwen(self): 73 | if self.verbose: 74 | print("[Config][initialize_qwen]") 75 | 76 | if self.qwen_api_key: 77 | if self.verbose: 78 | print("[Config][initialize_qwen] using cached qwen_api_key") 79 | api_key = self.qwen_api_key 80 | else: 81 | if self.verbose: 82 | print( 83 | "[Config][initialize_qwen] no cached qwen_api_key, try to get from env." 
84 | ) 85 | api_key = os.getenv("QWEN_API_KEY") 86 | 87 | client = OpenAI( 88 | api_key=api_key, 89 | base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", 90 | ) 91 | client.api_key = api_key 92 | client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" 93 | return client 94 | 95 | def initialize_google(self): 96 | if self.google_api_key: 97 | if self.verbose: 98 | print("[Config][initialize_google] using cached google_api_key") 99 | api_key = self.google_api_key 100 | else: 101 | if self.verbose: 102 | print( 103 | "[Config][initialize_google] no cached google_api_key, try to get from env." 104 | ) 105 | api_key = os.getenv("GOOGLE_API_KEY") 106 | genai.configure(api_key=api_key, transport="rest") 107 | model = genai.GenerativeModel("gemini-pro-vision") 108 | 109 | return model 110 | 111 | def initialize_ollama(self): 112 | if self.ollama_host: 113 | if self.verbose: 114 | print("[Config][initialize_ollama] using cached ollama host") 115 | else: 116 | if self.verbose: 117 | print( 118 | "[Config][initialize_ollama] no cached ollama host. Assuming ollama running locally." 119 | ) 120 | self.ollama_host = os.getenv("OLLAMA_HOST", None) 121 | model = Client(host=self.ollama_host) 122 | return model 123 | 124 | def initialize_anthropic(self): 125 | if self.anthropic_api_key: 126 | api_key = self.anthropic_api_key 127 | else: 128 | api_key = os.getenv("ANTHROPIC_API_KEY") 129 | return anthropic.Anthropic(api_key=api_key) 130 | 131 | def validation(self, model, voice_mode): 132 | """ 133 | Validate the input parameters for the dialog operation. 134 | """ 135 | self.require_api_key( 136 | "OPENAI_API_KEY", 137 | "OpenAI API key", 138 | model == "gpt-4" 139 | or voice_mode 140 | or model == "gpt-4-with-som" 141 | or model == "gpt-4-with-ocr" 142 | or model == "gpt-4.1-with-ocr" 143 | or model == "o1-with-ocr", 144 | ) 145 | self.require_api_key( 146 | "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision" 147 | ) 148 | self.require_api_key( 149 | "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" 150 | ) 151 | self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") 152 | 153 | def require_api_key(self, key_name, key_description, is_required): 154 | key_exists = bool(os.environ.get(key_name)) 155 | if self.verbose: 156 | print("[Config] require_api_key") 157 | print("[Config] key_name", key_name) 158 | print("[Config] key_description", key_description) 159 | print("[Config] key_exists", key_exists) 160 | if is_required and not key_exists: 161 | self.prompt_and_save_api_key(key_name, key_description) 162 | 163 | def prompt_and_save_api_key(self, key_name, key_description): 164 | key_value = input_dialog( 165 | title="API Key Required", text=f"Please enter your {key_description}:" 166 | ).run() 167 | 168 | if key_value is None: # User pressed cancel or closed the dialog 169 | sys.exit("Operation cancelled by user.") 170 | 171 | if key_value: 172 | if key_name == "OPENAI_API_KEY": 173 | self.openai_api_key = key_value 174 | elif key_name == "GOOGLE_API_KEY": 175 | self.google_api_key = key_value 176 | elif key_name == "ANTHROPIC_API_KEY": 177 | self.anthropic_api_key = key_value 178 | elif key_name == "QWEN_API_KEY": 179 | self.qwen_api_key = key_value 180 | self.save_api_key_to_env(key_name, key_value) 181 | load_dotenv() # Reload environment variables 182 | # Update the instance attribute with the new key 183 | 184 | @staticmethod 185 | def save_api_key_to_env(key_name, key_value): 186 | with open(".env", "a") as file: 187 | 
file.write(f"\n{key_name}='{key_value}'")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Self-Operating Computer Framework
2 | 
3 | 
4 | 
5 | A framework to enable multimodal models to operate a computer.
6 | 
7 | 
8 | Using the same inputs and outputs as a human operator, the model views the screen and decides on a series of mouse and keyboard actions to reach an objective. Released Nov 2023, the Self-Operating Computer Framework was one of the first examples of full computer-use.
9 | 
10 | 
11 | ![Self-Operating Computer](readme/self-operating-computer.png)
12 | 
13 | 
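Concretely, on each turn the model replies with a small JSON array of operations that the framework executes with `pyautogui` (the schema and examples are defined in `operate/models/prompts.py` and dispatched in `operate/operate.py`, both shown later in this dump). A representative step — assuming macOS, where the `command` key is used — might look like:

```
[
  { "thought": "Focus the browser address bar", "operation": "press", "keys": ["command", "l"] },
  { "thought": "Type the URL", "operation": "write", "content": "https://news.ycombinator.com/" },
  { "thought": "Press enter to load the page", "operation": "press", "keys": ["enter"] }
]
```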
14 | 15 | 19 | 20 | 21 | ## Key Features 22 | - **Compatibility**: Designed for various multimodal models. 23 | - **Integration**: Currently integrated with **GPT-4o, GPT-4.1, o1, Gemini Pro Vision, Claude 3, Qwen-VL and LLaVa.** 24 | - **Future Plans**: Support for additional models. 25 | 26 | ## Demo 27 | https://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-c76a-46fb-9b13-03678b3c67e0 28 | 29 | 30 | ## Run `Self-Operating Computer` 31 | 32 | 1. **Install the project** 33 | ``` 34 | pip install self-operating-computer 35 | ``` 36 | 2. **Run the project** 37 | ``` 38 | operate 39 | ``` 40 | 3. **Enter your OpenAI Key**: If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys). If you need you change your key at a later point, run `vim .env` to open the `.env` and replace the old key. 41 | 42 |
43 | ![OpenAI API key setup](readme/key.png)
44 | 
45 | 46 | 4. **Give Terminal app the required permissions**: As a last step, the Terminal app will ask for permission for "Screen Recording" and "Accessibility" in the "Security & Privacy" page of Mac's "System Preferences". 47 | 48 |
49 | ![Terminal access permissions 1](readme/terminal-access-1.png)
50 | ![Terminal access permissions 2](readme/terminal-access-2.png)
51 | 
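The `operate` command is a thin argparse wrapper (see `operate/main.py`); if you would rather drive the framework from your own Python code, you can call the same entry point directly. A minimal sketch — the keyword arguments mirror `main()`'s signature in `operate/operate.py`, but calling it this way is not a documented public API:

```
from operate.operate import main

# Roughly equivalent to: operate -m gpt-4-with-ocr --prompt "Go to Github.com"
main(
    "gpt-4-with-ocr",
    terminal_prompt="Go to Github.com",
    voice_mode=False,
    verbose_mode=False,
)
```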
52 | 53 | ## Using `operate` Modes 54 | 55 | #### OpenAI models 56 | 57 | The default model for the project is gpt-4o which you can use by simply typing `operate`. To try running OpenAI's new `o1` model, use the command below. 58 | 59 | ``` 60 | operate -m o1-with-ocr 61 | ``` 62 | 63 | To experiment with OpenAI's latest `gpt-4.1` model, run: 64 | 65 | ``` 66 | operate -m gpt-4.1-with-ocr 67 | ``` 68 | 69 | 70 | ### Multimodal Models `-m` 71 | Try Google's `gemini-pro-vision` by following the instructions below. Start `operate` with the Gemini model 72 | ``` 73 | operate -m gemini-pro-vision 74 | ``` 75 | 76 | **Enter your Google AI Studio API key when terminal prompts you for it** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working, if anyone knows a simpler way, please make a PR. 77 | 78 | #### Try Claude `-m claude-3` 79 | Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Claude dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it. 80 | 81 | ``` 82 | operate -m claude-3 83 | ``` 84 | 85 | #### Try qwen `-m qwen-vl` 86 | Use Qwen-vl with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Qwen dashboard](https://bailian.console.aliyun.com/) to get an API key and run the command below to try it. 87 | 88 | ``` 89 | operate -m qwen-vl 90 | ``` 91 | 92 | #### Try LLaVa Hosted Through Ollama `-m llava` 93 | If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama! 94 | *Note: Ollama currently only supports MacOS and Linux. Windows now in Preview* 95 | 96 | First, install Ollama on your machine from https://ollama.ai/download. 97 | 98 | Once Ollama is installed, pull the LLaVA model: 99 | ``` 100 | ollama pull llava 101 | ``` 102 | This will download the model on your machine which takes approximately 5 GB of storage. 103 | 104 | When Ollama has finished pulling LLaVA, start the server: 105 | ``` 106 | ollama serve 107 | ``` 108 | 109 | That's it! Now start `operate` and select the LLaVA model: 110 | ``` 111 | operate -m llava 112 | ``` 113 | **Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time. 114 | 115 | Learn more about Ollama at its [GitHub Repository](https://www.github.com/ollama/ollama) 116 | 117 | ### Voice Mode `--voice` 118 | The framework supports voice inputs for the objective. Try voice by following the instructions below. 
119 | **Clone the repo** to a directory on your computer: 120 | ``` 121 | git clone https://github.com/OthersideAI/self-operating-computer.git 122 | ``` 123 | **`cd` into the directory**: 124 | ``` 125 | cd self-operating-computer 126 | ``` 127 | **Install the additional requirements from `requirements-audio.txt`**: 128 | ``` 129 | pip install -r requirements-audio.txt 130 | ``` 131 | **Install device requirements** 132 | For Mac users: 133 | ``` 134 | brew install portaudio 135 | ``` 136 | For Linux users: 137 | ``` 138 | sudo apt install portaudio19-dev python3-pyaudio 139 | ``` 140 | **Run with voice mode**: 141 | ``` 142 | operate --voice 143 | ``` 144 | 145 | ### Optical Character Recognition Mode `-m gpt-4-with-ocr` 146 | The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable elements and their coordinates. GPT-4 can decide to `click` an element by its text, and the code then looks up that element in the hash map to get the coordinates to click. 147 | 148 | Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use OCR mode, simply run: 149 | 150 | `operate` on its own; `operate -m gpt-4-with-ocr` will also work. 151 | 152 | ### Set-of-Mark Prompting `-m gpt-4-with-som` 153 | The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models. 154 | 155 | Learn more about SoM Prompting in the detailed [arXiv paper](https://arxiv.org/abs/2310.11441). 156 | 157 | For this initial version, a simple YOLOv8 model is trained for button detection, and the `best.pt` file is included under `operate/models/weights/`. Users are encouraged to swap in their own `best.pt` file to evaluate performance improvements. If your model outperforms the existing one, please contribute by creating a pull request (PR). 158 | 159 | Start `operate` with the SoM model: 160 | 161 | ``` 162 | operate -m gpt-4-with-som 163 | ``` 164 | 165 | 166 | 167 | ## Contributions are Welcomed! 168 | 169 | If you want to contribute yourself, see [CONTRIBUTING.md](https://github.com/OthersideAI/self-operating-computer/blob/main/CONTRIBUTING.md). 170 | 171 | ## Feedback 172 | 173 | For any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter. 174 | 175 | ## Join Our Discord Community 176 | 177 | For real-time discussions and community support, join our Discord server. 178 | - If you're already a member, join the discussion in [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157). 179 | - If you're new, first [join our Discord Server](https://discord.gg/YqaKtyBEzM) and then navigate to the [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157) channel. 180 | 181 | ## Follow HyperWriteAI for More Updates 182 | 183 | Stay updated with the latest developments: 184 | - Follow HyperWriteAI on [Twitter](https://twitter.com/HyperWriteAI). 185 | - Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/). 186 | 187 | ## Compatibility 188 | - This project is compatible with macOS, Windows, and Linux (with X server installed). 189 | 190 | ## OpenAI Rate Limiting Note 191 | The `gpt-4o` model is required.
To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5. 192 | Learn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)** 193 | -------------------------------------------------------------------------------- /operate/models/prompts.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from operate.config import Config 3 | 4 | # Load configuration 5 | config = Config() 6 | 7 | # General user Prompts 8 | USER_QUESTION = "Hello, I can help you with anything. What would you like done?" 9 | 10 | 11 | SYSTEM_PROMPT_STANDARD = """ 12 | You are operating a {operating_system} computer, using the same operating system as a human. 13 | 14 | From looking at the screen, the objective, and your previous actions, take the next best series of action. 15 | 16 | You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 17 | 18 | 1. click - Move mouse and click 19 | ``` 20 | [{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format 21 | ``` 22 | 23 | 2. write - Write with your keyboard 24 | ``` 25 | [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] 26 | ``` 27 | 28 | 3. press - Use a hotkey or press key to operate the computer 29 | ``` 30 | [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] 31 | ``` 32 | 33 | 4. done - The objective is completed 34 | ``` 35 | [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] 36 | ``` 37 | 38 | Return the actions in array format `[]`. You can take just one action or multiple actions. 39 | 40 | Here a helpful example: 41 | 42 | Example 1: Searches for Google Chrome on the OS and opens it 43 | ``` 44 | [ 45 | {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, 46 | {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, 47 | {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }} 48 | ] 49 | ``` 50 | 51 | Example 2: Focuses on the address bar in a browser before typing a website 52 | ``` 53 | [ 54 | {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, 55 | {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, 56 | {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }} 57 | ] 58 | ``` 59 | 60 | A few important notes: 61 | 62 | - Go to Google Docs and Google Sheets by typing in the Chrome Address bar 63 | - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. 
64 | 65 | Objective: {objective} 66 | """ 67 | 68 | 69 | SYSTEM_PROMPT_LABELED = """ 70 | You are operating a {operating_system} computer, using the same operating system as a human. 71 | 72 | From looking at the screen, the objective, and your previous actions, take the next best series of action. 73 | 74 | You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 75 | 76 | 1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` 77 | ``` 78 | [{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format 79 | ``` 80 | 2. write - Write with your keyboard 81 | ``` 82 | [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] 83 | ``` 84 | 3. press - Use a hotkey or press key to operate the computer 85 | ``` 86 | [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] 87 | ``` 88 | 89 | 4. done - The objective is completed 90 | ``` 91 | [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] 92 | ``` 93 | Return the actions in array format `[]`. You can take just one action or multiple actions. 94 | 95 | Here a helpful example: 96 | 97 | Example 1: Searches for Google Chrome on the OS and opens it 98 | ``` 99 | [ 100 | {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, 101 | {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, 102 | ] 103 | ``` 104 | 105 | Example 2: Focuses on the address bar in a browser before typing a website 106 | ``` 107 | [ 108 | {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, 109 | {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, 110 | {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }} 111 | ] 112 | ``` 113 | 114 | Example 3: Send a "Hello World" message in the chat 115 | ``` 116 | [ 117 | {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }}, 118 | {{ "thought": "Now that I am focused on the message field, I'll go ahead and write ", "operation": "write", "content": "Hello World" }}, 119 | ] 120 | ``` 121 | 122 | A few important notes: 123 | 124 | - Go to Google Docs and Google Sheets by typing in the Chrome Address bar 125 | - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. 126 | 127 | Objective: {objective} 128 | """ 129 | 130 | 131 | # TODO: Add an example or instruction about `Action: press ['pagedown']` to scroll 132 | SYSTEM_PROMPT_OCR = """ 133 | You are operating a {operating_system} computer, using the same operating system as a human. 134 | 135 | From looking at the screen, the objective, and your previous actions, take the next best series of action. 
136 | 137 | You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 138 | 139 | 1. click - Move mouse and click - Look for text to click. Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method. 140 | ``` 141 | [{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] 142 | ``` 143 | 2. write - Write with your keyboard 144 | ``` 145 | [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] 146 | ``` 147 | 3. press - Use a hotkey or press key to operate the computer 148 | ``` 149 | [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] 150 | ``` 151 | 4. done - The objective is completed 152 | ``` 153 | [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] 154 | ``` 155 | 156 | Return the actions in array format `[]`. You can take just one action or multiple actions. 157 | 158 | Here a helpful example: 159 | 160 | Example 1: Searches for Google Chrome on the OS and opens it 161 | ``` 162 | [ 163 | {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, 164 | {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, 165 | {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }} 166 | ] 167 | ``` 168 | 169 | Example 2: Open a new Google Docs when the browser is already open 170 | ``` 171 | [ 172 | {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "t"] }}, 173 | {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://docs.new/" }}, 174 | {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }} 175 | ] 176 | ``` 177 | 178 | Example 3: Search for someone on Linkedin when already on linkedin.com 179 | ``` 180 | [ 181 | {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }}, 182 | {{ "thought": "Now that the field is active I can write the name of the person I'd like to search for", "operation": "write", "content": "John Doe" }}, 183 | {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }} 184 | ] 185 | ``` 186 | 187 | A few important notes: 188 | 189 | - Default to Google Chrome as the browser 190 | - Go to websites by opening a new tab with `press` and then `write` the URL 191 | - Reflect on previous actions and the screenshot to ensure they align and that your previous actions worked. 192 | - If the first time clicking a button or link doesn't work, don't try again to click it. Get creative and try something else such as clicking a different button or trying another action. 193 | - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. 
194 | 195 | Objective: {objective} 196 | """ 197 | 198 | OPERATE_FIRST_MESSAGE_PROMPT = """ 199 | Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done 200 | 201 | You just started so you are in the terminal app and your code is running in this terminal tab. To leave the terminal, search for a new program on the OS. 202 | 203 | Action:""" 204 | 205 | OPERATE_PROMPT = """ 206 | Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done 207 | Action:""" 208 | 209 | 210 | def get_system_prompt(model, objective): 211 | """ 212 | Format the vision prompt more efficiently and print the name of the prompt used 213 | """ 214 | 215 | if platform.system() == "Darwin": 216 | cmd_string = "\"command\"" 217 | os_search_str = "[\"command\", \"space\"]" 218 | operating_system = "Mac" 219 | elif platform.system() == "Windows": 220 | cmd_string = "\"ctrl\"" 221 | os_search_str = "[\"win\"]" 222 | operating_system = "Windows" 223 | else: 224 | cmd_string = "\"ctrl\"" 225 | os_search_str = "[\"win\"]" 226 | operating_system = "Linux" 227 | 228 | if model == "gpt-4-with-som": 229 | prompt = SYSTEM_PROMPT_LABELED.format( 230 | objective=objective, 231 | cmd_string=cmd_string, 232 | os_search_str=os_search_str, 233 | operating_system=operating_system, 234 | ) 235 | elif model == "gpt-4-with-ocr" or model == "gpt-4.1-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl": 236 | 237 | prompt = SYSTEM_PROMPT_OCR.format( 238 | objective=objective, 239 | cmd_string=cmd_string, 240 | os_search_str=os_search_str, 241 | operating_system=operating_system, 242 | ) 243 | 244 | else: 245 | prompt = SYSTEM_PROMPT_STANDARD.format( 246 | objective=objective, 247 | cmd_string=cmd_string, 248 | os_search_str=os_search_str, 249 | operating_system=operating_system, 250 | ) 251 | 252 | # Optional verbose output 253 | if config.verbose: 254 | print("[get_system_prompt] model:", model) 255 | # print("[get_system_prompt] prompt:", prompt) 256 | 257 | return prompt 258 | 259 | 260 | def get_user_prompt(): 261 | prompt = OPERATE_PROMPT 262 | return prompt 263 | 264 | 265 | def get_user_first_message_prompt(): 266 | prompt = OPERATE_FIRST_MESSAGE_PROMPT 267 | return prompt 268 | -------------------------------------------------------------------------------- /operate/models/apis.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | import traceback 7 | 8 | import easyocr 9 | import ollama 10 | import pkg_resources 11 | from PIL import Image 12 | from ultralytics import YOLO 13 | 14 | from operate.config import Config 15 | from operate.exceptions import ModelNotRecognizedException 16 | from operate.models.prompts import ( 17 | get_system_prompt, 18 | get_user_first_message_prompt, 19 | get_user_prompt, 20 | ) 21 | from operate.utils.label import ( 22 | add_labels, 23 | get_click_position_in_percent, 24 | get_label_coordinates, 25 | ) 26 | from operate.utils.ocr import get_text_coordinates, get_text_element 27 | from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot 28 | from operate.utils.style import 
ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET 29 | 30 | # Load configuration 31 | config = Config() 32 | 33 | 34 | async def get_next_action(model, messages, objective, session_id): 35 | if config.verbose: 36 | print("[Self-Operating Computer][get_next_action]") 37 | print("[Self-Operating Computer][get_next_action] model", model) 38 | if model == "gpt-4": 39 | return call_gpt_4o(messages), None 40 | if model == "qwen-vl": 41 | operation = await call_qwen_vl_with_ocr(messages, objective, model) 42 | return operation, None 43 | if model == "gpt-4-with-som": 44 | operation = await call_gpt_4o_labeled(messages, objective, model) 45 | return operation, None 46 | if model == "gpt-4-with-ocr": 47 | operation = await call_gpt_4o_with_ocr(messages, objective, model) 48 | return operation, None 49 | if model == "gpt-4.1-with-ocr": 50 | operation = await call_gpt_4_1_with_ocr(messages, objective, model) 51 | return operation, None 52 | if model == "o1-with-ocr": 53 | operation = await call_o1_with_ocr(messages, objective, model) 54 | return operation, None 55 | if model == "agent-1": 56 | return "coming soon" 57 | if model == "gemini-pro-vision": 58 | return call_gemini_pro_vision(messages, objective), None 59 | if model == "llava": 60 | operation = call_ollama_llava(messages) 61 | return operation, None 62 | if model == "claude-3": 63 | operation = await call_claude_3_with_ocr(messages, objective, model) 64 | return operation, None 65 | raise ModelNotRecognizedException(model) 66 | 67 | 68 | def call_gpt_4o(messages): 69 | if config.verbose: 70 | print("[call_gpt_4_v]") 71 | time.sleep(1) 72 | client = config.initialize_openai() 73 | try: 74 | screenshots_dir = "screenshots" 75 | if not os.path.exists(screenshots_dir): 76 | os.makedirs(screenshots_dir) 77 | 78 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 79 | # Call the function to capture the screen with the cursor 80 | capture_screen_with_cursor(screenshot_filename) 81 | 82 | with open(screenshot_filename, "rb") as img_file: 83 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 84 | 85 | if len(messages) == 1: 86 | user_prompt = get_user_first_message_prompt() 87 | else: 88 | user_prompt = get_user_prompt() 89 | 90 | if config.verbose: 91 | print( 92 | "[call_gpt_4_v] user_prompt", 93 | user_prompt, 94 | ) 95 | 96 | vision_message = { 97 | "role": "user", 98 | "content": [ 99 | {"type": "text", "text": user_prompt}, 100 | { 101 | "type": "image_url", 102 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 103 | }, 104 | ], 105 | } 106 | messages.append(vision_message) 107 | 108 | response = client.chat.completions.create( 109 | model="gpt-4o", 110 | messages=messages, 111 | presence_penalty=1, 112 | frequency_penalty=1, 113 | ) 114 | 115 | content = response.choices[0].message.content 116 | 117 | content = clean_json(content) 118 | 119 | assistant_message = {"role": "assistant", "content": content} 120 | if config.verbose: 121 | print( 122 | "[call_gpt_4_v] content", 123 | content, 124 | ) 125 | content = json.loads(content) 126 | 127 | messages.append(assistant_message) 128 | 129 | return content 130 | 131 | except Exception as e: 132 | print( 133 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. 
Trying again {ANSI_RESET}", 134 | e, 135 | ) 136 | print( 137 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", 138 | content, 139 | ) 140 | if config.verbose: 141 | traceback.print_exc() 142 | return call_gpt_4o(messages) 143 | 144 | 145 | async def call_qwen_vl_with_ocr(messages, objective, model): 146 | if config.verbose: 147 | print("[call_qwen_vl_with_ocr]") 148 | 149 | # Construct the path to the file within the package 150 | try: 151 | time.sleep(1) 152 | client = config.initialize_qwen() 153 | 154 | confirm_system_prompt(messages, objective, model) 155 | screenshots_dir = "screenshots" 156 | if not os.path.exists(screenshots_dir): 157 | os.makedirs(screenshots_dir) 158 | 159 | # Call the function to capture the screen with the cursor 160 | raw_screenshot_filename = os.path.join(screenshots_dir, "raw_screenshot.png") 161 | capture_screen_with_cursor(raw_screenshot_filename) 162 | 163 | # Compress screenshot image to make size be smaller 164 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.jpeg") 165 | compress_screenshot(raw_screenshot_filename, screenshot_filename) 166 | 167 | with open(screenshot_filename, "rb") as img_file: 168 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 169 | 170 | if len(messages) == 1: 171 | user_prompt = get_user_first_message_prompt() 172 | else: 173 | user_prompt = get_user_prompt() 174 | 175 | vision_message = { 176 | "role": "user", 177 | "content": [ 178 | {"type": "text", 179 | "text": f"{user_prompt}**REMEMBER** Only output json format, do not append any other text."}, 180 | { 181 | "type": "image_url", 182 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 183 | }, 184 | ], 185 | } 186 | messages.append(vision_message) 187 | 188 | response = client.chat.completions.create( 189 | model="qwen2.5-vl-72b-instruct", 190 | messages=messages, 191 | ) 192 | 193 | content = response.choices[0].message.content 194 | 195 | content = clean_json(content) 196 | 197 | # used later for the messages 198 | content_str = content 199 | 200 | content = json.loads(content) 201 | 202 | processed_content = [] 203 | 204 | for operation in content: 205 | if operation.get("operation") == "click": 206 | text_to_click = operation.get("text") 207 | if config.verbose: 208 | print( 209 | "[call_qwen_vl_with_ocr][click] text_to_click", 210 | text_to_click, 211 | ) 212 | # Initialize EasyOCR Reader 213 | reader = easyocr.Reader(["en"]) 214 | 215 | # Read the screenshot 216 | result = reader.readtext(screenshot_filename) 217 | 218 | text_element_index = get_text_element( 219 | result, text_to_click, screenshot_filename 220 | ) 221 | coordinates = get_text_coordinates( 222 | result, text_element_index, screenshot_filename 223 | ) 224 | 225 | # add `coordinates`` to `content` 226 | operation["x"] = coordinates["x"] 227 | operation["y"] = coordinates["y"] 228 | 229 | if config.verbose: 230 | print( 231 | "[call_qwen_vl_with_ocr][click] text_element_index", 232 | text_element_index, 233 | ) 234 | print( 235 | "[call_qwen_vl_with_ocr][click] coordinates", 236 | coordinates, 237 | ) 238 | print( 239 | "[call_qwen_vl_with_ocr][click] final operation", 240 | operation, 241 | ) 242 | processed_content.append(operation) 243 | 244 | else: 245 | processed_content.append(operation) 246 | 247 | # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history 248 | assistant_message = {"role": "assistant", "content": content_str} 249 | 
messages.append(assistant_message) 250 | 251 | return processed_content 252 | 253 | except Exception as e: 254 | print( 255 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}" 256 | ) 257 | if config.verbose: 258 | print("[Self-Operating Computer][Operate] error", e) 259 | traceback.print_exc() 260 | return gpt_4_fallback(messages, objective, model) 261 | 262 | def call_gemini_pro_vision(messages, objective): 263 | """ 264 | Get the next action for Self-Operating Computer using Gemini Pro Vision 265 | """ 266 | if config.verbose: 267 | print( 268 | "[Self Operating Computer][call_gemini_pro_vision]", 269 | ) 270 | # sleep for a second 271 | time.sleep(1) 272 | try: 273 | screenshots_dir = "screenshots" 274 | if not os.path.exists(screenshots_dir): 275 | os.makedirs(screenshots_dir) 276 | 277 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 278 | # Call the function to capture the screen with the cursor 279 | capture_screen_with_cursor(screenshot_filename) 280 | # sleep for a second 281 | time.sleep(1) 282 | prompt = get_system_prompt("gemini-pro-vision", objective) 283 | 284 | model = config.initialize_google() 285 | if config.verbose: 286 | print("[call_gemini_pro_vision] model", model) 287 | 288 | response = model.generate_content([prompt, Image.open(screenshot_filename)]) 289 | 290 | content = response.text[1:] 291 | if config.verbose: 292 | print("[call_gemini_pro_vision] response", response) 293 | print("[call_gemini_pro_vision] content", content) 294 | 295 | content = json.loads(content) 296 | if config.verbose: 297 | print( 298 | "[get_next_action][call_gemini_pro_vision] content", 299 | content, 300 | ) 301 | 302 | return content 303 | 304 | except Exception as e: 305 | print( 306 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. 
Trying another method {ANSI_RESET}" 307 | ) 308 | if config.verbose: 309 | print("[Self-Operating Computer][Operate] error", e) 310 | traceback.print_exc() 311 | return call_gpt_4o(messages) 312 | 313 | 314 | async def call_gpt_4o_with_ocr(messages, objective, model): 315 | if config.verbose: 316 | print("[call_gpt_4o_with_ocr]") 317 | 318 | # Construct the path to the file within the package 319 | try: 320 | time.sleep(1) 321 | client = config.initialize_openai() 322 | 323 | confirm_system_prompt(messages, objective, model) 324 | screenshots_dir = "screenshots" 325 | if not os.path.exists(screenshots_dir): 326 | os.makedirs(screenshots_dir) 327 | 328 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 329 | # Call the function to capture the screen with the cursor 330 | capture_screen_with_cursor(screenshot_filename) 331 | 332 | with open(screenshot_filename, "rb") as img_file: 333 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 334 | 335 | if len(messages) == 1: 336 | user_prompt = get_user_first_message_prompt() 337 | else: 338 | user_prompt = get_user_prompt() 339 | 340 | vision_message = { 341 | "role": "user", 342 | "content": [ 343 | {"type": "text", "text": user_prompt}, 344 | { 345 | "type": "image_url", 346 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 347 | }, 348 | ], 349 | } 350 | messages.append(vision_message) 351 | 352 | response = client.chat.completions.create( 353 | model="gpt-4o", 354 | messages=messages, 355 | ) 356 | 357 | content = response.choices[0].message.content 358 | 359 | content = clean_json(content) 360 | 361 | # used later for the messages 362 | content_str = content 363 | 364 | content = json.loads(content) 365 | 366 | processed_content = [] 367 | 368 | for operation in content: 369 | if operation.get("operation") == "click": 370 | text_to_click = operation.get("text") 371 | if config.verbose: 372 | print( 373 | "[call_gpt_4o_with_ocr][click] text_to_click", 374 | text_to_click, 375 | ) 376 | # Initialize EasyOCR Reader 377 | reader = easyocr.Reader(["en"]) 378 | 379 | # Read the screenshot 380 | result = reader.readtext(screenshot_filename) 381 | 382 | text_element_index = get_text_element( 383 | result, text_to_click, screenshot_filename 384 | ) 385 | coordinates = get_text_coordinates( 386 | result, text_element_index, screenshot_filename 387 | ) 388 | 389 | # add `coordinates`` to `content` 390 | operation["x"] = coordinates["x"] 391 | operation["y"] = coordinates["y"] 392 | 393 | if config.verbose: 394 | print( 395 | "[call_gpt_4o_with_ocr][click] text_element_index", 396 | text_element_index, 397 | ) 398 | print( 399 | "[call_gpt_4o_with_ocr][click] coordinates", 400 | coordinates, 401 | ) 402 | print( 403 | "[call_gpt_4o_with_ocr][click] final operation", 404 | operation, 405 | ) 406 | processed_content.append(operation) 407 | 408 | else: 409 | processed_content.append(operation) 410 | 411 | # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history 412 | assistant_message = {"role": "assistant", "content": content_str} 413 | messages.append(assistant_message) 414 | 415 | return processed_content 416 | 417 | except Exception as e: 418 | print( 419 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. 
Trying another method {ANSI_RESET}" 420 | ) 421 | if config.verbose: 422 | print("[Self-Operating Computer][Operate] error", e) 423 | traceback.print_exc() 424 | return gpt_4_fallback(messages, objective, model) 425 | 426 | 427 | async def call_gpt_4_1_with_ocr(messages, objective, model): 428 | if config.verbose: 429 | print("[call_gpt_4_1_with_ocr]") 430 | 431 | try: 432 | time.sleep(1) 433 | client = config.initialize_openai() 434 | 435 | confirm_system_prompt(messages, objective, model) 436 | screenshots_dir = "screenshots" 437 | if not os.path.exists(screenshots_dir): 438 | os.makedirs(screenshots_dir) 439 | 440 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 441 | capture_screen_with_cursor(screenshot_filename) 442 | 443 | with open(screenshot_filename, "rb") as img_file: 444 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 445 | 446 | if len(messages) == 1: 447 | user_prompt = get_user_first_message_prompt() 448 | else: 449 | user_prompt = get_user_prompt() 450 | 451 | vision_message = { 452 | "role": "user", 453 | "content": [ 454 | {"type": "text", "text": user_prompt}, 455 | { 456 | "type": "image_url", 457 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 458 | }, 459 | ], 460 | } 461 | messages.append(vision_message) 462 | 463 | response = client.chat.completions.create( 464 | model="gpt-4.1", 465 | messages=messages, 466 | ) 467 | 468 | content = response.choices[0].message.content 469 | 470 | content = clean_json(content) 471 | 472 | content_str = content 473 | 474 | content = json.loads(content) 475 | 476 | processed_content = [] 477 | 478 | for operation in content: 479 | if operation.get("operation") == "click": 480 | text_to_click = operation.get("text") 481 | if config.verbose: 482 | print( 483 | "[call_gpt_4_1_with_ocr][click] text_to_click", 484 | text_to_click, 485 | ) 486 | reader = easyocr.Reader(["en"]) 487 | 488 | result = reader.readtext(screenshot_filename) 489 | 490 | text_element_index = get_text_element( 491 | result, text_to_click, screenshot_filename 492 | ) 493 | coordinates = get_text_coordinates( 494 | result, text_element_index, screenshot_filename 495 | ) 496 | 497 | operation["x"] = coordinates["x"] 498 | operation["y"] = coordinates["y"] 499 | 500 | if config.verbose: 501 | print( 502 | "[call_gpt_4_1_with_ocr][click] text_element_index", 503 | text_element_index, 504 | ) 505 | print( 506 | "[call_gpt_4_1_with_ocr][click] coordinates", 507 | coordinates, 508 | ) 509 | print( 510 | "[call_gpt_4_1_with_ocr][click] final operation", 511 | operation, 512 | ) 513 | processed_content.append(operation) 514 | 515 | else: 516 | processed_content.append(operation) 517 | 518 | assistant_message = {"role": "assistant", "content": content_str} 519 | messages.append(assistant_message) 520 | 521 | return processed_content 522 | 523 | except Exception as e: 524 | print( 525 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. 
Trying another method {ANSI_RESET}" 526 | ) 527 | if config.verbose: 528 | print("[Self-Operating Computer][Operate] error", e) 529 | traceback.print_exc() 530 | return gpt_4_fallback(messages, objective, model) 531 | 532 | 533 | async def call_o1_with_ocr(messages, objective, model): 534 | if config.verbose: 535 | print("[call_o1_with_ocr]") 536 | 537 | # Construct the path to the file within the package 538 | try: 539 | time.sleep(1) 540 | client = config.initialize_openai() 541 | 542 | confirm_system_prompt(messages, objective, model) 543 | screenshots_dir = "screenshots" 544 | if not os.path.exists(screenshots_dir): 545 | os.makedirs(screenshots_dir) 546 | 547 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 548 | # Call the function to capture the screen with the cursor 549 | capture_screen_with_cursor(screenshot_filename) 550 | 551 | with open(screenshot_filename, "rb") as img_file: 552 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 553 | 554 | if len(messages) == 1: 555 | user_prompt = get_user_first_message_prompt() 556 | else: 557 | user_prompt = get_user_prompt() 558 | 559 | vision_message = { 560 | "role": "user", 561 | "content": [ 562 | {"type": "text", "text": user_prompt}, 563 | { 564 | "type": "image_url", 565 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 566 | }, 567 | ], 568 | } 569 | messages.append(vision_message) 570 | 571 | response = client.chat.completions.create( 572 | model="o1", 573 | messages=messages, 574 | ) 575 | 576 | content = response.choices[0].message.content 577 | 578 | content = clean_json(content) 579 | 580 | # used later for the messages 581 | content_str = content 582 | 583 | content = json.loads(content) 584 | 585 | processed_content = [] 586 | 587 | for operation in content: 588 | if operation.get("operation") == "click": 589 | text_to_click = operation.get("text") 590 | if config.verbose: 591 | print( 592 | "[call_o1_with_ocr][click] text_to_click", 593 | text_to_click, 594 | ) 595 | # Initialize EasyOCR Reader 596 | reader = easyocr.Reader(["en"]) 597 | 598 | # Read the screenshot 599 | result = reader.readtext(screenshot_filename) 600 | 601 | text_element_index = get_text_element( 602 | result, text_to_click, screenshot_filename 603 | ) 604 | coordinates = get_text_coordinates( 605 | result, text_element_index, screenshot_filename 606 | ) 607 | 608 | # add `coordinates`` to `content` 609 | operation["x"] = coordinates["x"] 610 | operation["y"] = coordinates["y"] 611 | 612 | if config.verbose: 613 | print( 614 | "[call_o1_with_ocr][click] text_element_index", 615 | text_element_index, 616 | ) 617 | print( 618 | "[call_o1_with_ocr][click] coordinates", 619 | coordinates, 620 | ) 621 | print( 622 | "[call_o1_with_ocr][click] final operation", 623 | operation, 624 | ) 625 | processed_content.append(operation) 626 | 627 | else: 628 | processed_content.append(operation) 629 | 630 | # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history 631 | assistant_message = {"role": "assistant", "content": content_str} 632 | messages.append(assistant_message) 633 | 634 | return processed_content 635 | 636 | except Exception as e: 637 | print( 638 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. 
Trying another method {ANSI_RESET}" 639 | ) 640 | if config.verbose: 641 | print("[Self-Operating Computer][Operate] error", e) 642 | traceback.print_exc() 643 | return gpt_4_fallback(messages, objective, model) 644 | 645 | 646 | async def call_gpt_4o_labeled(messages, objective, model): 647 | time.sleep(1) 648 | 649 | try: 650 | client = config.initialize_openai() 651 | 652 | confirm_system_prompt(messages, objective, model) 653 | file_path = pkg_resources.resource_filename("operate.models.weights", "best.pt") 654 | yolo_model = YOLO(file_path) # Load your trained model 655 | screenshots_dir = "screenshots" 656 | if not os.path.exists(screenshots_dir): 657 | os.makedirs(screenshots_dir) 658 | 659 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 660 | # Call the function to capture the screen with the cursor 661 | capture_screen_with_cursor(screenshot_filename) 662 | 663 | with open(screenshot_filename, "rb") as img_file: 664 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 665 | 666 | img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model) 667 | 668 | if len(messages) == 1: 669 | user_prompt = get_user_first_message_prompt() 670 | else: 671 | user_prompt = get_user_prompt() 672 | 673 | if config.verbose: 674 | print( 675 | "[call_gpt_4_vision_preview_labeled] user_prompt", 676 | user_prompt, 677 | ) 678 | 679 | vision_message = { 680 | "role": "user", 681 | "content": [ 682 | {"type": "text", "text": user_prompt}, 683 | { 684 | "type": "image_url", 685 | "image_url": { 686 | "url": f"data:image/jpeg;base64,{img_base64_labeled}" 687 | }, 688 | }, 689 | ], 690 | } 691 | messages.append(vision_message) 692 | 693 | response = client.chat.completions.create( 694 | model="gpt-4o", 695 | messages=messages, 696 | presence_penalty=1, 697 | frequency_penalty=1, 698 | ) 699 | 700 | content = response.choices[0].message.content 701 | 702 | content = clean_json(content) 703 | 704 | assistant_message = {"role": "assistant", "content": content} 705 | 706 | messages.append(assistant_message) 707 | 708 | content = json.loads(content) 709 | if config.verbose: 710 | print( 711 | "[call_gpt_4_vision_preview_labeled] content", 712 | content, 713 | ) 714 | 715 | processed_content = [] 716 | 717 | for operation in content: 718 | print( 719 | "[call_gpt_4_vision_preview_labeled] for operation in content", 720 | operation, 721 | ) 722 | if operation.get("operation") == "click": 723 | label = operation.get("label") 724 | if config.verbose: 725 | print( 726 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] label", 727 | label, 728 | ) 729 | 730 | coordinates = get_label_coordinates(label, label_coordinates) 731 | if config.verbose: 732 | print( 733 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates", 734 | coordinates, 735 | ) 736 | image = Image.open( 737 | io.BytesIO(base64.b64decode(img_base64)) 738 | ) # Load the image to get its size 739 | image_size = image.size # Get the size of the image (width, height) 740 | click_position_percent = get_click_position_in_percent( 741 | coordinates, image_size 742 | ) 743 | if config.verbose: 744 | print( 745 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent", 746 | click_position_percent, 747 | ) 748 | if not click_position_percent: 749 | print( 750 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. 
Trying another method {ANSI_RESET}" 751 | ) 752 | return call_gpt_4o(messages) 753 | 754 | x_percent = f"{click_position_percent[0]:.2f}" 755 | y_percent = f"{click_position_percent[1]:.2f}" 756 | operation["x"] = x_percent 757 | operation["y"] = y_percent 758 | if config.verbose: 759 | print( 760 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation", 761 | operation, 762 | ) 763 | processed_content.append(operation) 764 | else: 765 | if config.verbose: 766 | print( 767 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] .append none click operation", 768 | operation, 769 | ) 770 | 771 | processed_content.append(operation) 772 | 773 | if config.verbose: 774 | print( 775 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content", 776 | processed_content, 777 | ) 778 | return processed_content 779 | 780 | except Exception as e: 781 | print( 782 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}" 783 | ) 784 | if config.verbose: 785 | print("[Self-Operating Computer][Operate] error", e) 786 | traceback.print_exc() 787 | return call_gpt_4o(messages) 788 | 789 | 790 | def call_ollama_llava(messages): 791 | if config.verbose: 792 | print("[call_ollama_llava]") 793 | time.sleep(1) 794 | try: 795 | model = config.initialize_ollama() 796 | screenshots_dir = "screenshots" 797 | if not os.path.exists(screenshots_dir): 798 | os.makedirs(screenshots_dir) 799 | 800 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 801 | # Call the function to capture the screen with the cursor 802 | capture_screen_with_cursor(screenshot_filename) 803 | 804 | if len(messages) == 1: 805 | user_prompt = get_user_first_message_prompt() 806 | else: 807 | user_prompt = get_user_prompt() 808 | 809 | if config.verbose: 810 | print( 811 | "[call_ollama_llava] user_prompt", 812 | user_prompt, 813 | ) 814 | 815 | vision_message = { 816 | "role": "user", 817 | "content": user_prompt, 818 | "images": [screenshot_filename], 819 | } 820 | messages.append(vision_message) 821 | 822 | response = model.chat( 823 | model="llava", 824 | messages=messages, 825 | ) 826 | 827 | # Important: Remove the image path from the message history. 828 | # Ollama will attempt to load each image reference and will 829 | # eventually timeout. 830 | messages[-1]["images"] = None 831 | 832 | content = response["message"]["content"].strip() 833 | 834 | content = clean_json(content) 835 | 836 | assistant_message = {"role": "assistant", "content": content} 837 | if config.verbose: 838 | print( 839 | "[call_ollama_llava] content", 840 | content, 841 | ) 842 | content = json.loads(content) 843 | 844 | messages.append(assistant_message) 845 | 846 | return content 847 | 848 | except ollama.ResponseError as e: 849 | print( 850 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}", 851 | e, 852 | ) 853 | 854 | except Exception as e: 855 | print( 856 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[llava] That did not work. 
Trying again {ANSI_RESET}", 857 | e, 858 | ) 859 | print( 860 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", 861 | content, 862 | ) 863 | if config.verbose: 864 | traceback.print_exc() 865 | return call_ollama_llava(messages) 866 | 867 | 868 | async def call_claude_3_with_ocr(messages, objective, model): 869 | if config.verbose: 870 | print("[call_claude_3_with_ocr]") 871 | 872 | try: 873 | time.sleep(1) 874 | client = config.initialize_anthropic() 875 | 876 | confirm_system_prompt(messages, objective, model) 877 | screenshots_dir = "screenshots" 878 | if not os.path.exists(screenshots_dir): 879 | os.makedirs(screenshots_dir) 880 | 881 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 882 | capture_screen_with_cursor(screenshot_filename) 883 | 884 | # downsize screenshot due to 5MB size limit 885 | with open(screenshot_filename, "rb") as img_file: 886 | img = Image.open(img_file) 887 | 888 | # Convert RGBA to RGB 889 | if img.mode == "RGBA": 890 | img = img.convert("RGB") 891 | 892 | # Calculate the new dimensions while maintaining the aspect ratio 893 | original_width, original_height = img.size 894 | aspect_ratio = original_width / original_height 895 | new_width = 2560 # Adjust this value to achieve the desired file size 896 | new_height = int(new_width / aspect_ratio) 897 | if config.verbose: 898 | print("[call_claude_3_with_ocr] resizing claude") 899 | 900 | # Resize the image 901 | img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS) 902 | 903 | # Save the resized and converted image to a BytesIO object for JPEG format 904 | img_buffer = io.BytesIO() 905 | img_resized.save( 906 | img_buffer, format="JPEG", quality=85 907 | ) # Adjust the quality parameter as needed 908 | img_buffer.seek(0) 909 | 910 | # Encode the resized image as base64 911 | img_data = base64.b64encode(img_buffer.getvalue()).decode("utf-8") 912 | 913 | if len(messages) == 1: 914 | user_prompt = get_user_first_message_prompt() 915 | else: 916 | user_prompt = get_user_prompt() 917 | 918 | vision_message = { 919 | "role": "user", 920 | "content": [ 921 | { 922 | "type": "image", 923 | "source": { 924 | "type": "base64", 925 | "media_type": "image/jpeg", 926 | "data": img_data, 927 | }, 928 | }, 929 | { 930 | "type": "text", 931 | "text": user_prompt 932 | + "**REMEMBER** Only output json format, do not append any other text.", 933 | }, 934 | ], 935 | } 936 | messages.append(vision_message) 937 | 938 | # anthropic api expect system prompt as an separate argument 939 | response = client.messages.create( 940 | model="claude-3-opus-20240229", 941 | max_tokens=3000, 942 | system=messages[0]["content"], 943 | messages=messages[1:], 944 | ) 945 | 946 | content = response.content[0].text 947 | content = clean_json(content) 948 | content_str = content 949 | try: 950 | content = json.loads(content) 951 | # rework for json mode output 952 | except json.JSONDecodeError as e: 953 | if config.verbose: 954 | print( 955 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] JSONDecodeError: {e} {ANSI_RESET}" 956 | ) 957 | response = client.messages.create( 958 | model="claude-3-opus-20240229", 959 | max_tokens=3000, 960 | system=f"This json string is not valid, when using with json.loads(content) \ 961 | it throws the following error: {e}, return correct json string. 
\ 962 | **REMEMBER** Only output json format, do not append any other text.", 963 | messages=[{"role": "user", "content": content}], 964 | ) 965 | content = response.content[0].text 966 | content = clean_json(content) 967 | content_str = content 968 | content = json.loads(content) 969 | 970 | if config.verbose: 971 | print( 972 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] content: {content} {ANSI_RESET}" 973 | ) 974 | processed_content = [] 975 | 976 | for operation in content: 977 | if operation.get("operation") == "click": 978 | text_to_click = operation.get("text") 979 | if config.verbose: 980 | print( 981 | "[call_claude_3_ocr][click] text_to_click", 982 | text_to_click, 983 | ) 984 | # Initialize EasyOCR Reader 985 | reader = easyocr.Reader(["en"]) 986 | 987 | # Read the screenshot 988 | result = reader.readtext(screenshot_filename) 989 | 990 | # limit the text to extract has a higher success rate 991 | text_element_index = get_text_element( 992 | result, text_to_click[:3], screenshot_filename 993 | ) 994 | coordinates = get_text_coordinates( 995 | result, text_element_index, screenshot_filename 996 | ) 997 | 998 | # add `coordinates`` to `content` 999 | operation["x"] = coordinates["x"] 1000 | operation["y"] = coordinates["y"] 1001 | 1002 | if config.verbose: 1003 | print( 1004 | "[call_claude_3_ocr][click] text_element_index", 1005 | text_element_index, 1006 | ) 1007 | print( 1008 | "[call_claude_3_ocr][click] coordinates", 1009 | coordinates, 1010 | ) 1011 | print( 1012 | "[call_claude_3_ocr][click] final operation", 1013 | operation, 1014 | ) 1015 | processed_content.append(operation) 1016 | 1017 | else: 1018 | processed_content.append(operation) 1019 | 1020 | assistant_message = {"role": "assistant", "content": content_str} 1021 | messages.append(assistant_message) 1022 | 1023 | return processed_content 1024 | 1025 | except Exception as e: 1026 | print( 1027 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}" 1028 | ) 1029 | if config.verbose: 1030 | print("[Self-Operating Computer][Operate] error", e) 1031 | traceback.print_exc() 1032 | print("message before convertion ", messages) 1033 | 1034 | # Convert the messages to the GPT-4 format 1035 | gpt4_messages = [messages[0]] # Include the system message 1036 | for message in messages[1:]: 1037 | if message["role"] == "user": 1038 | # Update the image type format from "source" to "url" 1039 | updated_content = [] 1040 | for item in message["content"]: 1041 | if isinstance(item, dict) and "type" in item: 1042 | if item["type"] == "image": 1043 | updated_content.append( 1044 | { 1045 | "type": "image_url", 1046 | "image_url": { 1047 | "url": f"data:image/png;base64,{item['source']['data']}" 1048 | }, 1049 | } 1050 | ) 1051 | else: 1052 | updated_content.append(item) 1053 | 1054 | gpt4_messages.append({"role": "user", "content": updated_content}) 1055 | elif message["role"] == "assistant": 1056 | gpt4_messages.append( 1057 | {"role": "assistant", "content": message["content"]} 1058 | ) 1059 | 1060 | return gpt_4_fallback(gpt4_messages, objective, model) 1061 | 1062 | 1063 | def get_last_assistant_message(messages): 1064 | """ 1065 | Retrieve the last message from the assistant in the messages array. 1066 | If the last assistant message is the first message in the array, return None. 
1067 | """ 1068 | for index in reversed(range(len(messages))): 1069 | if messages[index]["role"] == "assistant": 1070 | if index == 0: # Check if the assistant message is the first in the array 1071 | return None 1072 | else: 1073 | return messages[index] 1074 | return None # Return None if no assistant message is found 1075 | 1076 | 1077 | def gpt_4_fallback(messages, objective, model): 1078 | if config.verbose: 1079 | print("[gpt_4_fallback]") 1080 | system_prompt = get_system_prompt("gpt-4o", objective) 1081 | new_system_message = {"role": "system", "content": system_prompt} 1082 | # remove and replace the first message in `messages` with `new_system_message` 1083 | 1084 | messages[0] = new_system_message 1085 | 1086 | if config.verbose: 1087 | print("[gpt_4_fallback][updated]") 1088 | print("[gpt_4_fallback][updated] len(messages)", len(messages)) 1089 | 1090 | return call_gpt_4o(messages) 1091 | 1092 | 1093 | def confirm_system_prompt(messages, objective, model): 1094 | """ 1095 | On `Exception` we default to `call_gpt_4_vision_preview` so we have this function to reassign system prompt in case of a previous failure 1096 | """ 1097 | if config.verbose: 1098 | print("[confirm_system_prompt] model", model) 1099 | 1100 | system_prompt = get_system_prompt(model, objective) 1101 | new_system_message = {"role": "system", "content": system_prompt} 1102 | # remove and replace the first message in `messages` with `new_system_message` 1103 | 1104 | messages[0] = new_system_message 1105 | 1106 | if config.verbose: 1107 | print("[confirm_system_prompt]") 1108 | print("[confirm_system_prompt] len(messages)", len(messages)) 1109 | for m in messages: 1110 | if m["role"] != "user": 1111 | print("--------------------[message]--------------------") 1112 | print("[confirm_system_prompt][message] role", m["role"]) 1113 | print("[confirm_system_prompt][message] content", m["content"]) 1114 | print("------------------[end message]------------------") 1115 | 1116 | 1117 | def clean_json(content): 1118 | if config.verbose: 1119 | print("\n\n[clean_json] content before cleaning", content) 1120 | if content.startswith("```json"): 1121 | content = content[ 1122 | len("```json") : 1123 | ].strip() # Remove starting ```json and trim whitespace 1124 | elif content.startswith("```"): 1125 | content = content[ 1126 | len("```") : 1127 | ].strip() # Remove starting ``` and trim whitespace 1128 | if content.endswith("```"): 1129 | content = content[ 1130 | : -len("```") 1131 | ].strip() # Remove ending ``` and trim whitespace 1132 | 1133 | # Normalize line breaks and remove any unwanted characters 1134 | content = "\n".join(line.strip() for line in content.splitlines()) 1135 | 1136 | if config.verbose: 1137 | print("\n\n[clean_json] content after cleaning", content) 1138 | 1139 | return content 1140 | --------------------------------------------------------------------------------