├── operate ├── __init__.py ├── models │ ├── __init__.py │ ├── weights │ │ ├── __init__.py │ │ └── best.pt │ ├── prompts.py │ └── apis.py ├── utils │ ├── __init__.py │ ├── style.py │ ├── misc.py │ ├── screenshot.py │ ├── operating_system.py │ ├── ocr.py │ └── label.py ├── exceptions.py ├── main.py ├── operate.py └── config.py ├── requirements-audio.txt ├── readme ├── key.png ├── terminal-access-1.png ├── terminal-access-2.png └── self-operating-computer.png ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── workflows │ └── upload-package.yml └── PULL_REQUEST_TEMPLATE.md ├── setup.py ├── LICENSE ├── requirements.txt ├── CONTRIBUTING.md ├── .gitignore ├── evaluate.py └── README.md /operate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /operate/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /operate/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements-audio.txt: -------------------------------------------------------------------------------- 1 | whisper-mic -------------------------------------------------------------------------------- /operate/models/weights/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /readme/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/readme/key.png -------------------------------------------------------------------------------- /readme/terminal-access-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/readme/terminal-access-1.png -------------------------------------------------------------------------------- /readme/terminal-access-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/readme/terminal-access-2.png -------------------------------------------------------------------------------- /operate/models/weights/best.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/operate/models/weights/best.pt -------------------------------------------------------------------------------- /readme/self-operating-computer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/self-operating-computer/HEAD/readme/self-operating-computer.png -------------------------------------------------------------------------------- /operate/exceptions.py: -------------------------------------------------------------------------------- 1 | class ModelNotRecognizedException(Exception): 2 | """Exception raised for unrecognized models. 
3 | 4 | Attributes: 5 | model -- the unrecognized model 6 | message -- explanation of the error 7 | """ 8 | 9 | def __init__(self, model, message="Model not recognized"): 10 | self.model = model 11 | self.message = message 12 | super().__init__(self.message) 13 | 14 | def __str__(self): 15 | return f"{self.message} : {self.model} " -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '[FEATURE] Short Description of the Feature' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Is your feature request related to a problem? Please describe. 11 | 12 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 13 | 14 | ### Describe the solution you'd like 15 | A clear and concise description of what you want to happen. 16 | 17 | ### Describe alternatives you've considered 18 | A clear and concise description of any alternative solutions or features you've considered. 19 | 20 | ### Additional context 21 | Add any other context or screenshots about the feature request here. -------------------------------------------------------------------------------- /.github/workflows/upload-package.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v3 16 | with: 17 | python-version: '3.8' 18 | 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install setuptools wheel twine 23 | 24 | - name: Build and check package 25 | run: | 26 | python setup.py sdist bdist_wheel 27 | twine check dist/* 28 | 29 | - name: Upload to PyPi 30 | uses: pypa/gh-action-pypi-publish@v1.4.2 31 | with: 32 | user: __token__ 33 | password: ${{ secrets.PYPI_API_TOKEN }} 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '[BUG] Brief Description of the Issue' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | Found a bug? Please fill out the sections below. 👍 11 | 12 | 13 | ### Describe the bug 14 | 15 | A clear and concise description of what the bug is. 16 | 17 | ### Steps to Reproduce 18 | 19 | 1. (for ex.) went to... 20 | 2. clicked on this point 21 | 3. not working 22 | 23 | ### Expected Behavior 24 | A brief description of what you expected to happen. 25 | 26 | ### Actual Behavior: 27 | what actually happened. 28 | 29 | ### Environment 30 | - OS: 31 | - Model Used (e.g., GPT-4v, Gemini Pro Vision): 32 | - Framework Version (optional): 33 | 34 | ### Screenshots 35 | If applicable, add screenshots to help explain your problem. 36 | 37 | ### Additional context 38 | Add any other context about the problem here. 
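A note on the `ModelNotRecognizedException` defined in `operate/exceptions.py` above: it is the error the framework surfaces when an unsupported `-m` model name reaches the dispatch layer, and it is caught in `operate/operate.py` further down in this dump. A minimal usage sketch — the `dispatch` helper and the set of model names below are illustrative only; the real routing lives in `operate/models/apis.py`:

```
from operate.exceptions import ModelNotRecognizedException

def dispatch(model):
    # Hypothetical stand-in for the real model routing in operate/models/apis.py
    known = {"gpt-4-with-ocr", "gpt-4-with-som", "gemini-pro-vision", "claude-3", "qwen-vl", "llava"}
    if model not in known:
        raise ModelNotRecognizedException(model)
    return f"routing request to {model}"

try:
    dispatch("not-a-model")
except ModelNotRecognizedException as e:
    print(e)  # prints: Model not recognized : not-a-model
```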
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # Read the contents of your requirements.txt file 4 | with open("requirements.txt") as f: 5 | required = f.read().splitlines() 6 | 7 | # Read the contents of your README.md file for the project description 8 | with open("README.md", "r", encoding="utf-8") as readme_file: 9 | long_description = readme_file.read() 10 | 11 | setup( 12 | name="self-operating-computer", 13 | version="1.5.8", 14 | packages=find_packages(), 15 | install_requires=required, # Add dependencies here 16 | entry_points={ 17 | "console_scripts": [ 18 | "operate=operate.main:main_entry", 19 | ], 20 | }, 21 | package_data={ 22 | # Include the file in the operate.models.weights package 23 | "operate.models.weights": ["best.pt"], 24 | }, 25 | long_description=long_description, # Add project description here 26 | long_description_content_type="text/markdown", # Specify Markdown format 27 | # include any other necessary setup options here 28 | ) 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OthersideAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
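The `console_scripts` entry point declared in `setup.py` above is what makes the `operate` command appear on your PATH after `pip install self-operating-computer`; the wrapper script that pip generates behaves roughly like the stub below (a sketch of the generated launcher, not a file in the repository):

```
# Rough approximation of the console script setuptools generates for `operate`
import sys

from operate.main import main_entry

if __name__ == "__main__":
    sys.exit(main_entry())
```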
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | annotated-types==0.6.0 2 | anyio==3.7.1 3 | certifi==2023.7.22 4 | charset-normalizer==3.3.2 5 | colorama==0.4.6 6 | contourpy==1.2.0 7 | cycler==0.12.1 8 | distro==1.8.0 9 | EasyProcess==1.1 10 | entrypoint2==1.1 11 | exceptiongroup==1.1.3 12 | fonttools==4.44.0 13 | h11==0.14.0 14 | httpcore==1.0.2 15 | httpx>=0.25.2 16 | idna==3.4 17 | importlib-resources==6.1.1 18 | kiwisolver==1.4.5 19 | matplotlib==3.8.1 20 | MouseInfo==0.1.3 21 | mss==9.0.1 22 | numpy==1.26.1 23 | openai==1.2.3 24 | packaging==23.2 25 | Pillow==10.1.0 26 | prompt-toolkit==3.0.39 27 | PyAutoGUI==0.9.54 28 | pydantic==2.4.2 29 | pydantic_core==2.10.1 30 | PyGetWindow==0.0.9 31 | PyMsgBox==1.0.9 32 | pyparsing==3.1.1 33 | pyperclip==1.8.2 34 | PyRect==0.2.0 35 | pyscreenshot==3.1 36 | PyScreeze==0.1.29 37 | python3-xlib==0.15 38 | python-dateutil==2.8.2 39 | python-dotenv==1.0.0 40 | pytweening==1.0.7 41 | requests==2.31.0 42 | rubicon-objc==0.4.7 43 | six==1.16.0 44 | sniffio==1.3.0 45 | tqdm==4.66.1 46 | typing_extensions==4.8.0 47 | urllib3==2.0.7 48 | wcwidth==0.2.9 49 | zipp==3.17.0 50 | google-generativeai==0.3.0 51 | aiohttp==3.9.1 52 | ultralytics==8.0.227 53 | easyocr==1.7.1 54 | ollama==0.1.6 55 | anthropic -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## What does this PR do? 2 | 3 | 4 | 5 | Fixes # (issue) 6 | 7 | ## Requirement/Documentation 8 | 9 | 10 | 11 | - If there is a requirement document, please, share it here. 12 | 13 | ## Type of change 14 | 15 | 16 | 17 | - [ ] Bug fix (non-breaking change which fixes an issue) 18 | - [ ] Chore (refactoring code, technical debt, workflow improvements) 19 | - [ ] New feature (non-breaking change which adds functionality) 20 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 21 | - [ ] Tests (Unit/Integration/E2E or any other test) 22 | - [ ] This change requires a documentation update 23 | 24 | 25 | ## Mandatory Tasks 26 | 27 | - [ ] Make sure you have self-reviewed the code. A decent size PR without self-review might be rejected. 
Make sure before submmiting this PR you run tests with evaluate.py 28 | -------------------------------------------------------------------------------- /operate/utils/style.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import platform 3 | import os 4 | from prompt_toolkit.styles import Style as PromptStyle 5 | 6 | 7 | # Define style 8 | style = PromptStyle.from_dict( 9 | { 10 | "dialog": "bg:#88ff88", 11 | "button": "bg:#ffffff #000000", 12 | "dialog.body": "bg:#44cc44 #ffffff", 13 | "dialog shadow": "bg:#003800", 14 | } 15 | ) 16 | 17 | 18 | # Check if on a windows terminal that supports ANSI escape codes 19 | def supports_ansi(): 20 | """ 21 | Check if the terminal supports ANSI escape codes 22 | """ 23 | plat = platform.system() 24 | supported_platform = plat != "Windows" or "ANSICON" in os.environ 25 | is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() 26 | return supported_platform and is_a_tty 27 | 28 | 29 | # Define ANSI color codes 30 | ANSI_GREEN = "\033[32m" if supports_ansi() else "" # Standard green text 31 | ANSI_BRIGHT_GREEN = "\033[92m" if supports_ansi() else "" # Bright/bold green text 32 | ANSI_RESET = "\033[0m" if supports_ansi() else "" # Reset to default text color 33 | ANSI_BLUE = "\033[94m" if supports_ansi() else "" # Bright blue 34 | ANSI_YELLOW = "\033[33m" if supports_ansi() else "" # Standard yellow text 35 | ANSI_RED = "\033[31m" if supports_ansi() else "" 36 | ANSI_BRIGHT_MAGENTA = "\033[95m" if supports_ansi() else "" # Bright magenta text 37 | -------------------------------------------------------------------------------- /operate/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Self-Operating Computer 3 | """ 4 | import argparse 5 | from operate.utils.style import ANSI_BRIGHT_MAGENTA 6 | from operate.operate import main 7 | 8 | 9 | def main_entry(): 10 | parser = argparse.ArgumentParser( 11 | description="Run the self-operating-computer with a specified model." 
12 | ) 13 | parser.add_argument( 14 | "-m", 15 | "--model", 16 | help="Specify the model to use", 17 | required=False, 18 | default="gpt-4-with-ocr", 19 | ) 20 | 21 | # Add a voice flag 22 | parser.add_argument( 23 | "--voice", 24 | help="Use voice input mode", 25 | action="store_true", 26 | ) 27 | 28 | # Add a flag for verbose mode 29 | parser.add_argument( 30 | "--verbose", 31 | help="Run operate in verbose mode", 32 | action="store_true", 33 | ) 34 | 35 | # Allow for direct input of prompt 36 | parser.add_argument( 37 | "--prompt", 38 | help="Directly input the objective prompt", 39 | type=str, 40 | required=False, 41 | ) 42 | 43 | try: 44 | args = parser.parse_args() 45 | main( 46 | args.model, 47 | terminal_prompt=args.prompt, 48 | voice_mode=args.voice, 49 | verbose_mode=args.verbose 50 | ) 51 | except KeyboardInterrupt: 52 | print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...") 53 | 54 | 55 | if __name__ == "__main__": 56 | main_entry() 57 | -------------------------------------------------------------------------------- /operate/utils/misc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | 5 | def convert_percent_to_decimal(percent): 6 | try: 7 | # Remove the '%' sign and convert to float 8 | decimal_value = float(percent) 9 | 10 | # Convert to decimal (e.g., 20% -> 0.20) 11 | return decimal_value 12 | except ValueError as e: 13 | print(f"[convert_percent_to_decimal] error: {e}") 14 | return None 15 | 16 | 17 | def parse_operations(response): 18 | if response == "DONE": 19 | return {"type": "DONE", "data": None} 20 | elif response.startswith("CLICK"): 21 | # Adjust the regex to match the correct format 22 | click_data = re.search(r"CLICK \{ (.+) \}", response).group(1) 23 | click_data_json = json.loads(f"{{{click_data}}}") 24 | return {"type": "CLICK", "data": click_data_json} 25 | 26 | elif response.startswith("TYPE"): 27 | # Extract the text to type 28 | try: 29 | type_data = re.search(r"TYPE (.+)", response, re.DOTALL).group(1) 30 | except: 31 | type_data = re.search(r'TYPE "(.+)"', response, re.DOTALL).group(1) 32 | return {"type": "TYPE", "data": type_data} 33 | 34 | elif response.startswith("SEARCH"): 35 | # Extract the search query 36 | try: 37 | search_data = re.search(r'SEARCH "(.+)"', response).group(1) 38 | except: 39 | search_data = re.search(r"SEARCH (.+)", response).group(1) 40 | return {"type": "SEARCH", "data": search_data} 41 | 42 | return {"type": "UNKNOWN", "data": response} 43 | -------------------------------------------------------------------------------- /operate/utils/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | import pyautogui 5 | from PIL import Image, ImageDraw, ImageGrab 6 | import Xlib.display 7 | import Xlib.X 8 | import Xlib.Xutil # not sure if Xutil is necessary 9 | 10 | 11 | def capture_screen_with_cursor(file_path): 12 | user_platform = platform.system() 13 | 14 | if user_platform == "Windows": 15 | screenshot = pyautogui.screenshot() 16 | screenshot.save(file_path) 17 | elif user_platform == "Linux": 18 | # Use xlib to prevent scrot dependency for Linux 19 | screen = Xlib.display.Display().screen() 20 | size = screen.width_in_pixels, screen.height_in_pixels 21 | screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1])) 22 | screenshot.save(file_path) 23 | elif user_platform == "Darwin": # (Mac OS) 24 | # Use the screencapture utility to capture the screen with the cursor 25 
| subprocess.run(["screencapture", "-C", file_path]) 26 | else: 27 | print(f"The platform you're using ({user_platform}) is not currently supported") 28 | 29 | 30 | def compress_screenshot(raw_screenshot_filename, screenshot_filename): 31 | with Image.open(raw_screenshot_filename) as img: 32 | # Check if the image has an alpha channel (transparency) 33 | if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info): 34 | # Create a white background image 35 | background = Image.new('RGB', img.size, (255, 255, 255)) 36 | # Paste the image onto the background, using the alpha channel as mask 37 | background.paste(img, mask=img.split()[3]) # 3 is the alpha channel 38 | # Save the result as JPEG 39 | background.save(screenshot_filename, 'JPEG', quality=85) # Adjust quality as needed 40 | else: 41 | # If no alpha channel, simply convert and save 42 | img.convert('RGB').save(screenshot_filename, 'JPEG', quality=85) 43 | -------------------------------------------------------------------------------- /operate/utils/operating_system.py: -------------------------------------------------------------------------------- 1 | import pyautogui 2 | import platform 3 | import time 4 | import math 5 | 6 | from operate.utils.misc import convert_percent_to_decimal 7 | 8 | 9 | class OperatingSystem: 10 | def write(self, content): 11 | try: 12 | content = content.replace("\\n", "\n") 13 | for char in content: 14 | pyautogui.write(char) 15 | except Exception as e: 16 | print("[OperatingSystem][write] error:", e) 17 | 18 | def press(self, keys): 19 | try: 20 | for key in keys: 21 | pyautogui.keyDown(key) 22 | time.sleep(0.1) 23 | for key in keys: 24 | pyautogui.keyUp(key) 25 | except Exception as e: 26 | print("[OperatingSystem][press] error:", e) 27 | 28 | def mouse(self, click_detail): 29 | try: 30 | x = convert_percent_to_decimal(click_detail.get("x")) 31 | y = convert_percent_to_decimal(click_detail.get("y")) 32 | 33 | if click_detail and isinstance(x, float) and isinstance(y, float): 34 | self.click_at_percentage(x, y) 35 | 36 | except Exception as e: 37 | print("[OperatingSystem][mouse] error:", e) 38 | 39 | def click_at_percentage( 40 | self, 41 | x_percentage, 42 | y_percentage, 43 | duration=0.2, 44 | circle_radius=50, 45 | circle_duration=0.5, 46 | ): 47 | try: 48 | screen_width, screen_height = pyautogui.size() 49 | x_pixel = int(screen_width * float(x_percentage)) 50 | y_pixel = int(screen_height * float(y_percentage)) 51 | 52 | pyautogui.moveTo(x_pixel, y_pixel, duration=duration) 53 | 54 | start_time = time.time() 55 | while time.time() - start_time < circle_duration: 56 | angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi 57 | x = x_pixel + math.cos(angle) * circle_radius 58 | y = y_pixel + math.sin(angle) * circle_radius 59 | pyautogui.moveTo(x, y, duration=0.1) 60 | 61 | pyautogui.click(x_pixel, y_pixel) 62 | except Exception as e: 63 | print("[OperatingSystem][click_at_percentage] error:", e) 64 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | We appreciate your contributions! 3 | 4 | ## Process 5 | 1. Fork it 6 | 2. Create your feature branch (`git checkout -b my-new-feature`) 7 | 3. Commit your changes (`git commit -am 'Add some feature'`) 8 | 4. Push to the branch (`git push origin my-new-feature`) 9 | 5. Create new Pull Request 10 | 11 | ## Modifying and Running Code 12 | 1. 
Make changes in `operate/main.py` 13 | 2. Run `pip install .` again 14 | 3. Run `operate` to see your changes 15 | 16 | ## Testing Changes 17 | **After making significant changes, it's important to verify that SOC can still successfully perform a set of common test cases.** 18 | In the root directory of the project, run: 19 | ``` 20 | python3 evaluate.py 21 | ``` 22 | This will automatically prompt `operate` to perform several simple objectives. 23 | Upon completion of each objective, GPT-4v will give an evaluation and determine if the objective was successfully reached. 24 | 25 | `evaluate.py` will print out if each test case `[PASSED]` or `[FAILED]`. In addition, a justification will be given on why the pass/fail was given. 26 | 27 | It is recommended that a screenshot of the `evaluate.py` output is included in any PR which could impact the performance of SOC. 28 | 29 | ## Contribution Ideas 30 | - **Improve performance by finding optimal screenshot grid**: A primary element of the framework is that it overlays a percentage grid on the screenshot which GPT-4v uses to estimate click locations. If someone is able to find the optimal grid and some evaluation metrics to confirm it is an improvement on the current method then we will merge that PR. 31 | - **Improve the `SUMMARY_PROMPT`** 32 | - **Improve Linux and Windows compatibility**: There are still some issues with Linux and Windows compatibility. PRs to fix the issues are encouraged. 33 | - **Adding New Multimodal Models**: Integration of new multimodal models is welcomed. If you have a specific model in mind that you believe would be a valuable addition, please feel free to integrate it and submit a PR. 34 | - **Iterate `--accurate` flag functionality**: Look at https://github.com/OthersideAI/self-operating-computer/pull/57 for previous iteration 35 | - **Enhanced Security**: A feature request to implement a _robust security feature_ that prompts users for _confirmation before executing potentially harmful actions_. This feature aims to _prevent unintended actions_ and _safeguard user data_ as mentioned here in this [OtherSide#25](https://github.com/OthersideAI/self-operating-computer/issues/25) 36 | 37 | 38 | ## Guidelines 39 | This will primarily be a [Software 2.0](https://karpathy.medium.com/software-2-0-a64152b37c35) project. For this reason: 40 | 41 | - Let's try to hold off refactors into separate files until `main.py` is more than 1000 lines 42 | 43 | -------------------------------------------------------------------------------- /operate/utils/ocr.py: -------------------------------------------------------------------------------- 1 | from operate.config import Config 2 | from PIL import Image, ImageDraw 3 | import os 4 | from datetime import datetime 5 | 6 | # Load configuration 7 | config = Config() 8 | 9 | 10 | def get_text_element(result, search_text, image_path): 11 | """ 12 | Searches for a text element in the OCR results and returns its index. Also draws bounding boxes on the image. 13 | Args: 14 | result (list): The list of results returned by EasyOCR. 15 | search_text (str): The text to search for in the OCR results. 16 | image_path (str): Path to the original image. 17 | 18 | Returns: 19 | int: The index of the element containing the search text. 20 | 21 | Raises: 22 | Exception: If the text element is not found in the results. 
23 | """ 24 | if config.verbose: 25 | print("[get_text_element]") 26 | print("[get_text_element] search_text", search_text) 27 | # Create /ocr directory if it doesn't exist 28 | ocr_dir = "ocr" 29 | if not os.path.exists(ocr_dir): 30 | os.makedirs(ocr_dir) 31 | 32 | # Open the original image 33 | image = Image.open(image_path) 34 | draw = ImageDraw.Draw(image) 35 | 36 | found_index = None 37 | for index, element in enumerate(result): 38 | text = element[1] 39 | box = element[0] 40 | 41 | if config.verbose: 42 | # Draw bounding box in blue 43 | draw.polygon([tuple(point) for point in box], outline="blue") 44 | 45 | if search_text in text: 46 | found_index = index 47 | if config.verbose: 48 | print("[get_text_element][loop] found search_text, index:", index) 49 | 50 | if found_index is not None: 51 | if config.verbose: 52 | # Draw bounding box of the found text in red 53 | box = result[found_index][0] 54 | draw.polygon([tuple(point) for point in box], outline="red") 55 | # Save the image with bounding boxes 56 | datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") 57 | ocr_image_path = os.path.join(ocr_dir, f"ocr_image_{datetime_str}.png") 58 | image.save(ocr_image_path) 59 | print("[get_text_element] OCR image saved at:", ocr_image_path) 60 | 61 | return found_index 62 | 63 | raise Exception("The text element was not found in the image") 64 | 65 | 66 | def get_text_coordinates(result, index, image_path): 67 | """ 68 | Gets the coordinates of the text element at the specified index as a percentage of screen width and height. 69 | Args: 70 | result (list): The list of results returned by EasyOCR. 71 | index (int): The index of the text element in the results list. 72 | image_path (str): Path to the screenshot image. 73 | 74 | Returns: 75 | dict: A dictionary containing the 'x' and 'y' coordinates as percentages of the screen width and height. 76 | """ 77 | if index >= len(result): 78 | raise Exception("Index out of range in OCR results") 79 | 80 | # Get the bounding box of the text element 81 | bounding_box = result[index][0] 82 | 83 | # Calculate the center of the bounding box 84 | min_x = min([coord[0] for coord in bounding_box]) 85 | max_x = max([coord[0] for coord in bounding_box]) 86 | min_y = min([coord[1] for coord in bounding_box]) 87 | max_y = max([coord[1] for coord in bounding_box]) 88 | 89 | center_x = (min_x + max_x) / 2 90 | center_y = (min_y + max_y) / 2 91 | 92 | # Get image dimensions 93 | with Image.open(image_path) as img: 94 | width, height = img.size 95 | 96 | # Convert to percentages 97 | percent_x = round((center_x / width), 3) 98 | percent_y = round((center_y / height), 3) 99 | 100 | return {"x": percent_x, "y": percent_y} 101 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | .DS_Store 163 | 164 | # Avoid sending testing screenshots up 165 | *.png 166 | operate/screenshots/ 167 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import subprocess 4 | import platform 5 | import base64 6 | import json 7 | import openai 8 | import argparse 9 | 10 | from dotenv import load_dotenv 11 | 12 | # "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v" 13 | TEST_CASES = { 14 | "Go to Github.com": "A Github page is visible.", 15 | "Go to Youtube.com and play a video": "The YouTube video player is visible.", 16 | } 17 | 18 | EVALUATION_PROMPT = """ 19 | Your job is to look at the given screenshot and determine if the following guideline is met in the image. 20 | You must respond in the following format ONLY. Do not add anything else: 21 | {{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }} 22 | guideline_met must be set to a JSON boolean. True if the image meets the given guideline. 23 | reason must be a string containing a justification for your decision. 24 | 25 | Guideline: {guideline} 26 | """ 27 | 28 | SCREENSHOT_PATH = os.path.join("screenshots", "screenshot.png") 29 | 30 | 31 | # Check if on a windows terminal that supports ANSI escape codes 32 | def supports_ansi(): 33 | """ 34 | Check if the terminal supports ANSI escape codes 35 | """ 36 | plat = platform.system() 37 | supported_platform = plat != "Windows" or "ANSICON" in os.environ 38 | is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() 39 | return supported_platform and is_a_tty 40 | 41 | 42 | if supports_ansi(): 43 | # Standard green text 44 | ANSI_GREEN = "\033[32m" 45 | # Bright/bold green text 46 | ANSI_BRIGHT_GREEN = "\033[92m" 47 | # Reset to default text color 48 | ANSI_RESET = "\033[0m" 49 | # ANSI escape code for blue text 50 | ANSI_BLUE = "\033[94m" # This is for bright blue 51 | 52 | # Standard yellow text 53 | ANSI_YELLOW = "\033[33m" 54 | 55 | ANSI_RED = "\033[31m" 56 | 57 | # Bright magenta text 58 | ANSI_BRIGHT_MAGENTA = "\033[95m" 59 | else: 60 | ANSI_GREEN = "" 61 | ANSI_BRIGHT_GREEN = "" 62 | ANSI_RESET = "" 63 | ANSI_BLUE = "" 64 | ANSI_YELLOW = "" 65 | ANSI_RED = "" 66 | ANSI_BRIGHT_MAGENTA = "" 67 | 68 | 69 | def format_evaluation_prompt(guideline): 70 | prompt = EVALUATION_PROMPT.format(guideline=guideline) 71 | return prompt 72 | 73 | 74 | def parse_eval_content(content): 75 | try: 76 | res = json.loads(content) 77 | 78 | print(res["reason"]) 79 | 80 | return res["guideline_met"] 81 | except: 82 | print( 83 | "The model gave a bad evaluation response and it couldn't be parsed. Exiting..." 
84 | ) 85 | exit(1) 86 | 87 | 88 | def evaluate_final_screenshot(guideline): 89 | """Load the final screenshot and return True or False if it meets the given guideline.""" 90 | with open(SCREENSHOT_PATH, "rb") as img_file: 91 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 92 | 93 | eval_message = [ 94 | { 95 | "role": "user", 96 | "content": [ 97 | {"type": "text", "text": format_evaluation_prompt(guideline)}, 98 | { 99 | "type": "image_url", 100 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 101 | }, 102 | ], 103 | } 104 | ] 105 | 106 | response = openai.chat.completions.create( 107 | model="gpt-4o", 108 | messages=eval_message, 109 | presence_penalty=1, 110 | frequency_penalty=1, 111 | temperature=0.7, 112 | ) 113 | 114 | eval_content = response.choices[0].message.content 115 | 116 | return parse_eval_content(eval_content) 117 | 118 | 119 | def run_test_case(objective, guideline, model): 120 | """Returns True if the result of the test with the given prompt meets the given guideline for the given model.""" 121 | # Run `operate` with the model to evaluate and the test case prompt 122 | subprocess.run( 123 | ["operate", "-m", model, "--prompt", f'"{objective}"'], 124 | stdout=subprocess.DEVNULL, 125 | ) 126 | 127 | try: 128 | result = evaluate_final_screenshot(guideline) 129 | except OSError: 130 | print("[Error] Couldn't open the screenshot for evaluation") 131 | return False 132 | 133 | return result 134 | 135 | 136 | def get_test_model(): 137 | parser = argparse.ArgumentParser( 138 | description="Run the self-operating-computer with a specified model." 139 | ) 140 | 141 | parser.add_argument( 142 | "-m", 143 | "--model", 144 | help="Specify the model to evaluate.", 145 | required=False, 146 | default="gpt-4-with-ocr", 147 | ) 148 | 149 | return parser.parse_args().model 150 | 151 | 152 | def main(): 153 | load_dotenv() 154 | openai.api_key = os.getenv("OPENAI_API_KEY") 155 | 156 | model = get_test_model() 157 | 158 | print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}") 159 | print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}") 160 | 161 | passed = 0 162 | failed = 0 163 | for objective, guideline in TEST_CASES.items(): 164 | print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'") 165 | 166 | result = run_test_case(objective, guideline, model) 167 | if result: 168 | print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'") 169 | passed += 1 170 | else: 171 | print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'") 172 | failed += 1 173 | 174 | print( 175 | f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed" 176 | ) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | -------------------------------------------------------------------------------- /operate/utils/label.py: -------------------------------------------------------------------------------- 1 | import io 2 | import base64 3 | import json 4 | import os 5 | import time 6 | import asyncio 7 | from PIL import Image, ImageDraw 8 | 9 | 10 | def validate_and_extract_image_data(data): 11 | if not data or "messages" not in data: 12 | raise ValueError("Invalid request, no messages found") 13 | 14 | messages = data["messages"] 15 | if ( 16 | not messages 17 | or not isinstance(messages, list) 18 | or not messages[-1].get("image_url") 19 | ): 20 | raise ValueError("No image provided or incorrect format") 21 | 22 | image_data = messages[-1]["image_url"]["url"] 23 | if not 
image_data.startswith("data:image"): 24 | raise ValueError("Invalid image format") 25 | 26 | return image_data.split("base64,")[-1], messages 27 | 28 | 29 | def get_label_coordinates(label, label_coordinates): 30 | """ 31 | Retrieves the coordinates for a given label. 32 | 33 | :param label: The label to find coordinates for (e.g., "~1"). 34 | :param label_coordinates: Dictionary containing labels and their coordinates. 35 | :return: Coordinates of the label or None if the label is not found. 36 | """ 37 | return label_coordinates.get(label) 38 | 39 | 40 | def is_overlapping(box1, box2): 41 | x1_box1, y1_box1, x2_box1, y2_box1 = box1 42 | x1_box2, y1_box2, x2_box2, y2_box2 = box2 43 | 44 | # Check if there is no overlap 45 | if x1_box1 > x2_box2 or x1_box2 > x2_box1: 46 | return False 47 | if ( 48 | y1_box1 > y2_box2 or y1_box2 > y2_box1 49 | ): # Adjusted to check 100px proximity above 50 | return False 51 | 52 | return True 53 | 54 | 55 | def add_labels(base64_data, yolo_model): 56 | image_bytes = base64.b64decode(base64_data) 57 | image_labeled = Image.open(io.BytesIO(image_bytes)) # Corrected this line 58 | image_debug = image_labeled.copy() # Create a copy for the debug image 59 | image_original = ( 60 | image_labeled.copy() 61 | ) # Copy of the original image for base64 return 62 | 63 | results = yolo_model(image_labeled) 64 | 65 | draw = ImageDraw.Draw(image_labeled) 66 | debug_draw = ImageDraw.Draw( 67 | image_debug 68 | ) # Create a separate draw object for the debug image 69 | font_size = 45 70 | 71 | labeled_images_dir = "labeled_images" 72 | label_coordinates = {} # Dictionary to store coordinates 73 | 74 | if not os.path.exists(labeled_images_dir): 75 | os.makedirs(labeled_images_dir) 76 | 77 | counter = 0 78 | drawn_boxes = [] # List to keep track of boxes already drawn 79 | for result in results: 80 | if hasattr(result, "boxes"): 81 | for det in result.boxes: 82 | bbox = det.xyxy[0] 83 | x1, y1, x2, y2 = bbox.tolist() 84 | 85 | debug_label = "D_" + str(counter) 86 | debug_index_position = (x1, y1 - font_size) 87 | debug_draw.rectangle([(x1, y1), (x2, y2)], outline="blue", width=1) 88 | debug_draw.text( 89 | debug_index_position, 90 | debug_label, 91 | fill="blue", 92 | font_size=font_size, 93 | ) 94 | 95 | overlap = any( 96 | is_overlapping((x1, y1, x2, y2), box) for box in drawn_boxes 97 | ) 98 | 99 | if not overlap: 100 | draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=1) 101 | label = "~" + str(counter) 102 | index_position = (x1, y1 - font_size) 103 | draw.text( 104 | index_position, 105 | label, 106 | fill="red", 107 | font_size=font_size, 108 | ) 109 | 110 | # Add the non-overlapping box to the drawn_boxes list 111 | drawn_boxes.append((x1, y1, x2, y2)) 112 | label_coordinates[label] = (x1, y1, x2, y2) 113 | 114 | counter += 1 115 | 116 | # Save the image 117 | timestamp = time.strftime("%Y%m%d-%H%M%S") 118 | 119 | output_path = os.path.join(labeled_images_dir, f"img_{timestamp}_labeled.png") 120 | output_path_debug = os.path.join(labeled_images_dir, f"img_{timestamp}_debug.png") 121 | output_path_original = os.path.join( 122 | labeled_images_dir, f"img_{timestamp}_original.png" 123 | ) 124 | 125 | image_labeled.save(output_path) 126 | image_debug.save(output_path_debug) 127 | image_original.save(output_path_original) 128 | 129 | buffered_original = io.BytesIO() 130 | image_original.save(buffered_original, format="PNG") # I guess this is needed 131 | img_base64_original = base64.b64encode(buffered_original.getvalue()).decode("utf-8") 132 | 133 | # Convert 
image to base64 for return 134 | buffered_labeled = io.BytesIO() 135 | image_labeled.save(buffered_labeled, format="PNG") # I guess this is needed 136 | img_base64_labeled = base64.b64encode(buffered_labeled.getvalue()).decode("utf-8") 137 | 138 | return img_base64_labeled, label_coordinates 139 | 140 | 141 | def get_click_position_in_percent(coordinates, image_size): 142 | """ 143 | Calculates the click position at the center of the bounding box and converts it to percentages. 144 | 145 | :param coordinates: A tuple of the bounding box coordinates (x1, y1, x2, y2). 146 | :param image_size: A tuple of the image dimensions (width, height). 147 | :return: A tuple of the click position in percentages (x_percent, y_percent). 148 | """ 149 | if not coordinates or not image_size: 150 | return None 151 | 152 | # Calculate the center of the bounding box 153 | x_center = (coordinates[0] + coordinates[2]) / 2 154 | y_center = (coordinates[1] + coordinates[3]) / 2 155 | 156 | # Convert to percentages 157 | x_percent = x_center / image_size[0] 158 | y_percent = y_center / image_size[1] 159 | 160 | return x_percent, y_percent 161 | -------------------------------------------------------------------------------- /operate/operate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import asyncio 5 | from prompt_toolkit.shortcuts import message_dialog 6 | from prompt_toolkit import prompt 7 | from operate.exceptions import ModelNotRecognizedException 8 | import platform 9 | 10 | # from operate.models.prompts import USER_QUESTION, get_system_prompt 11 | from operate.models.prompts import ( 12 | USER_QUESTION, 13 | get_system_prompt, 14 | ) 15 | from operate.config import Config 16 | from operate.utils.style import ( 17 | ANSI_GREEN, 18 | ANSI_RESET, 19 | ANSI_YELLOW, 20 | ANSI_RED, 21 | ANSI_BRIGHT_MAGENTA, 22 | ANSI_BLUE, 23 | style, 24 | ) 25 | from operate.utils.operating_system import OperatingSystem 26 | from operate.models.apis import get_next_action 27 | 28 | # Load configuration 29 | config = Config() 30 | operating_system = OperatingSystem() 31 | 32 | 33 | def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): 34 | """ 35 | Main function for the Self-Operating Computer. 36 | 37 | Parameters: 38 | - model: The model used for generating responses. 39 | - terminal_prompt: A string representing the prompt provided in the terminal. 40 | - voice_mode: A boolean indicating whether to enable voice mode. 41 | 42 | Returns: 43 | None 44 | """ 45 | 46 | mic = None 47 | # Initialize `WhisperMic`, if `voice_mode` is True 48 | 49 | config.verbose = verbose_mode 50 | config.validation(model, voice_mode) 51 | 52 | if voice_mode: 53 | try: 54 | from whisper_mic import WhisperMic 55 | 56 | # Initialize WhisperMic if import is successful 57 | mic = WhisperMic() 58 | except ImportError: 59 | print( 60 | "Voice mode requires the 'whisper_mic' module. 
Please install it using 'pip install -r requirements-audio.txt'" 61 | ) 62 | sys.exit(1) 63 | 64 | # Skip message dialog if prompt was given directly 65 | if not terminal_prompt: 66 | message_dialog( 67 | title="Self-Operating Computer", 68 | text="An experimental framework to enable multimodal models to operate computers", 69 | style=style, 70 | ).run() 71 | 72 | else: 73 | print("Running direct prompt...") 74 | 75 | # # Clear the console 76 | if platform.system() == "Windows": 77 | os.system("cls") 78 | else: 79 | print("\033c", end="") 80 | 81 | if terminal_prompt: # Skip objective prompt if it was given as an argument 82 | objective = terminal_prompt 83 | elif voice_mode: 84 | print( 85 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)" 86 | ) 87 | try: 88 | objective = mic.listen() 89 | except Exception as e: 90 | print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}") 91 | return # Exit if voice input fails 92 | else: 93 | print( 94 | f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]\n{USER_QUESTION}" 95 | ) 96 | print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") 97 | objective = prompt(style=style) 98 | 99 | system_prompt = get_system_prompt(model, objective) 100 | system_message = {"role": "system", "content": system_prompt} 101 | messages = [system_message] 102 | 103 | loop_count = 0 104 | 105 | session_id = None 106 | 107 | while True: 108 | if config.verbose: 109 | print("[Self Operating Computer] loop_count", loop_count) 110 | try: 111 | operations, session_id = asyncio.run( 112 | get_next_action(model, messages, objective, session_id) 113 | ) 114 | 115 | stop = operate(operations, model) 116 | if stop: 117 | break 118 | 119 | loop_count += 1 120 | if loop_count > 10: 121 | break 122 | except ModelNotRecognizedException as e: 123 | print( 124 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" 125 | ) 126 | break 127 | except Exception as e: 128 | print( 129 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" 130 | ) 131 | break 132 | 133 | 134 | def operate(operations, model): 135 | if config.verbose: 136 | print("[Self Operating Computer][operate]") 137 | for operation in operations: 138 | if config.verbose: 139 | print("[Self Operating Computer][operate] operation", operation) 140 | # wait one second 141 | time.sleep(1) 142 | operate_type = operation.get("operation").lower() 143 | operate_thought = operation.get("thought") 144 | operate_detail = "" 145 | if config.verbose: 146 | print("[Self Operating Computer][operate] operate_type", operate_type) 147 | 148 | if operate_type == "press" or operate_type == "hotkey": 149 | keys = operation.get("keys") 150 | operate_detail = keys 151 | operating_system.press(keys) 152 | elif operate_type == "write": 153 | content = operation.get("content") 154 | operate_detail = content 155 | operating_system.write(content) 156 | elif operate_type == "click": 157 | x = operation.get("x") 158 | y = operation.get("y") 159 | click_detail = {"x": x, "y": y} 160 | operate_detail = click_detail 161 | 162 | operating_system.mouse(click_detail) 163 | elif operate_type == "done": 164 | summary = operation.get("summary") 165 | 166 | print( 167 | f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" 168 | ) 169 | print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") 170 | return True 171 | 172 | else: 173 | print( 174 | f"{ANSI_GREEN}[Self-Operating 
Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" 175 | ) 176 | print( 177 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response {ANSI_RESET}{operation}" 178 | ) 179 | return True 180 | 181 | print( 182 | f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" 183 | ) 184 | print(f"{operate_thought}") 185 | print(f"{ANSI_BLUE}Action: {ANSI_RESET}{operate_type} {operate_detail}\n") 186 | 187 | return False 188 | -------------------------------------------------------------------------------- /operate/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import google.generativeai as genai 5 | from dotenv import load_dotenv 6 | from ollama import Client 7 | from openai import OpenAI 8 | import anthropic 9 | from prompt_toolkit.shortcuts import input_dialog 10 | 11 | 12 | class Config: 13 | """ 14 | Configuration class for managing settings. 15 | 16 | Attributes: 17 | verbose (bool): Flag indicating whether verbose mode is enabled. 18 | openai_api_key (str): API key for OpenAI. 19 | google_api_key (str): API key for Google. 20 | ollama_host (str): url to ollama running remotely. 21 | """ 22 | 23 | _instance = None 24 | 25 | def __new__(cls): 26 | if cls._instance is None: 27 | cls._instance = super(Config, cls).__new__(cls) 28 | # Put any initialization here 29 | return cls._instance 30 | 31 | def __init__(self): 32 | load_dotenv() 33 | self.verbose = False 34 | self.openai_api_key = ( 35 | None # instance variables are backups in case saving to a `.env` fails 36 | ) 37 | self.google_api_key = ( 38 | None # instance variables are backups in case saving to a `.env` fails 39 | ) 40 | self.ollama_host = ( 41 | None # instance variables are backups in case savint to a `.env` fails 42 | ) 43 | self.anthropic_api_key = ( 44 | None # instance variables are backups in case saving to a `.env` fails 45 | ) 46 | self.qwen_api_key = ( 47 | None # instance variables are backups in case saving to a `.env` fails 48 | ) 49 | 50 | def initialize_openai(self): 51 | if self.verbose: 52 | print("[Config][initialize_openai]") 53 | 54 | if self.openai_api_key: 55 | if self.verbose: 56 | print("[Config][initialize_openai] using cached openai_api_key") 57 | api_key = self.openai_api_key 58 | else: 59 | if self.verbose: 60 | print( 61 | "[Config][initialize_openai] no cached openai_api_key, try to get from env." 62 | ) 63 | api_key = os.getenv("OPENAI_API_KEY") 64 | 65 | client = OpenAI( 66 | api_key=api_key, 67 | ) 68 | client.api_key = api_key 69 | client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) 70 | return client 71 | 72 | def initialize_qwen(self): 73 | if self.verbose: 74 | print("[Config][initialize_qwen]") 75 | 76 | if self.qwen_api_key: 77 | if self.verbose: 78 | print("[Config][initialize_qwen] using cached qwen_api_key") 79 | api_key = self.qwen_api_key 80 | else: 81 | if self.verbose: 82 | print( 83 | "[Config][initialize_qwen] no cached qwen_api_key, try to get from env." 
84 | ) 85 | api_key = os.getenv("QWEN_API_KEY") 86 | 87 | client = OpenAI( 88 | api_key=api_key, 89 | base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", 90 | ) 91 | client.api_key = api_key 92 | client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" 93 | return client 94 | 95 | def initialize_google(self): 96 | if self.google_api_key: 97 | if self.verbose: 98 | print("[Config][initialize_google] using cached google_api_key") 99 | api_key = self.google_api_key 100 | else: 101 | if self.verbose: 102 | print( 103 | "[Config][initialize_google] no cached google_api_key, try to get from env." 104 | ) 105 | api_key = os.getenv("GOOGLE_API_KEY") 106 | genai.configure(api_key=api_key, transport="rest") 107 | model = genai.GenerativeModel("gemini-pro-vision") 108 | 109 | return model 110 | 111 | def initialize_ollama(self): 112 | if self.ollama_host: 113 | if self.verbose: 114 | print("[Config][initialize_ollama] using cached ollama host") 115 | else: 116 | if self.verbose: 117 | print( 118 | "[Config][initialize_ollama] no cached ollama host. Assuming ollama running locally." 119 | ) 120 | self.ollama_host = os.getenv("OLLAMA_HOST", None) 121 | model = Client(host=self.ollama_host) 122 | return model 123 | 124 | def initialize_anthropic(self): 125 | if self.anthropic_api_key: 126 | api_key = self.anthropic_api_key 127 | else: 128 | api_key = os.getenv("ANTHROPIC_API_KEY") 129 | return anthropic.Anthropic(api_key=api_key) 130 | 131 | def validation(self, model, voice_mode): 132 | """ 133 | Validate the input parameters for the dialog operation. 134 | """ 135 | self.require_api_key( 136 | "OPENAI_API_KEY", 137 | "OpenAI API key", 138 | model == "gpt-4" 139 | or voice_mode 140 | or model == "gpt-4-with-som" 141 | or model == "gpt-4-with-ocr" 142 | or model == "gpt-4.1-with-ocr" 143 | or model == "o1-with-ocr", 144 | ) 145 | self.require_api_key( 146 | "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision" 147 | ) 148 | self.require_api_key( 149 | "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" 150 | ) 151 | self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") 152 | 153 | def require_api_key(self, key_name, key_description, is_required): 154 | key_exists = bool(os.environ.get(key_name)) 155 | if self.verbose: 156 | print("[Config] require_api_key") 157 | print("[Config] key_name", key_name) 158 | print("[Config] key_description", key_description) 159 | print("[Config] key_exists", key_exists) 160 | if is_required and not key_exists: 161 | self.prompt_and_save_api_key(key_name, key_description) 162 | 163 | def prompt_and_save_api_key(self, key_name, key_description): 164 | key_value = input_dialog( 165 | title="API Key Required", text=f"Please enter your {key_description}:" 166 | ).run() 167 | 168 | if key_value is None: # User pressed cancel or closed the dialog 169 | sys.exit("Operation cancelled by user.") 170 | 171 | if key_value: 172 | if key_name == "OPENAI_API_KEY": 173 | self.openai_api_key = key_value 174 | elif key_name == "GOOGLE_API_KEY": 175 | self.google_api_key = key_value 176 | elif key_name == "ANTHROPIC_API_KEY": 177 | self.anthropic_api_key = key_value 178 | elif key_name == "QWEN_API_KEY": 179 | self.qwen_api_key = key_value 180 | self.save_api_key_to_env(key_name, key_value) 181 | load_dotenv() # Reload environment variables 182 | # Update the instance attribute with the new key 183 | 184 | @staticmethod 185 | def save_api_key_to_env(key_name, key_value): 186 | with open(".env", "a") as file: 187 | 
file.write(f"\n{key_name}='{key_value}'")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Self-Operating Computer Framework
2 | 
3 | 
4 | 
5 | A framework to enable multimodal models to operate a computer.
6 | 
7 | 
8 | Using the same inputs and outputs as a human operator, the model views the screen and decides on a series of mouse and keyboard actions to reach an objective. Released Nov 2023, the Self-Operating Computer Framework was one of the first examples of full computer-use.
9 | 
10 | 
11 | ![Self-Operating Computer](readme/self-operating-computer.png)
12 | 
13 | 
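Concretely, on each turn the model replies with a small JSON array of operations that the framework executes with `pyautogui` (the schema and examples are defined in `operate/models/prompts.py` and dispatched in `operate/operate.py`, both shown later in this dump). A representative step — assuming macOS, where the `command` key is used — might look like:

```
[
  { "thought": "Focus the browser address bar", "operation": "press", "keys": ["command", "l"] },
  { "thought": "Type the URL", "operation": "write", "content": "https://news.ycombinator.com/" },
  { "thought": "Press enter to load the page", "operation": "press", "keys": ["enter"] }
]
```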
14 | 15 | 19 | 20 | 21 | ## Key Features 22 | - **Compatibility**: Designed for various multimodal models. 23 | - **Integration**: Currently integrated with **GPT-4o, GPT-4.1, o1, Gemini Pro Vision, Claude 3, Qwen-VL and LLaVa.** 24 | - **Future Plans**: Support for additional models. 25 | 26 | ## Demo 27 | https://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-c76a-46fb-9b13-03678b3c67e0 28 | 29 | 30 | ## Run `Self-Operating Computer` 31 | 32 | 1. **Install the project** 33 | ``` 34 | pip install self-operating-computer 35 | ``` 36 | 2. **Run the project** 37 | ``` 38 | operate 39 | ``` 40 | 3. **Enter your OpenAI Key**: If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys). If you need you change your key at a later point, run `vim .env` to open the `.env` and replace the old key. 41 | 42 |
43 | ![OpenAI API key setup](readme/key.png)
44 | 
45 | 46 | 4. **Give Terminal app the required permissions**: As a last step, the Terminal app will ask for permission for "Screen Recording" and "Accessibility" in the "Security & Privacy" page of Mac's "System Preferences". 47 | 48 |
49 | ![Terminal access permissions 1](readme/terminal-access-1.png)
50 | ![Terminal access permissions 2](readme/terminal-access-2.png)
51 | 
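The `operate` command is a thin argparse wrapper (see `operate/main.py`); if you would rather drive the framework from your own Python code, you can call the same entry point directly. A minimal sketch — the keyword arguments mirror `main()`'s signature in `operate/operate.py`, but calling it this way is not a documented public API:

```
from operate.operate import main

# Roughly equivalent to: operate -m gpt-4-with-ocr --prompt "Go to Github.com"
main(
    "gpt-4-with-ocr",
    terminal_prompt="Go to Github.com",
    voice_mode=False,
    verbose_mode=False,
)
```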
52 | 53 | ## Using `operate` Modes 54 | 55 | #### OpenAI models 56 | 57 | The default model for the project is gpt-4o which you can use by simply typing `operate`. To try running OpenAI's new `o1` model, use the command below. 58 | 59 | ``` 60 | operate -m o1-with-ocr 61 | ``` 62 | 63 | To experiment with OpenAI's latest `gpt-4.1` model, run: 64 | 65 | ``` 66 | operate -m gpt-4.1-with-ocr 67 | ``` 68 | 69 | 70 | ### Multimodal Models `-m` 71 | Try Google's `gemini-pro-vision` by following the instructions below. Start `operate` with the Gemini model 72 | ``` 73 | operate -m gemini-pro-vision 74 | ``` 75 | 76 | **Enter your Google AI Studio API key when terminal prompts you for it** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working, if anyone knows a simpler way, please make a PR. 77 | 78 | #### Try Claude `-m claude-3` 79 | Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Claude dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it. 80 | 81 | ``` 82 | operate -m claude-3 83 | ``` 84 | 85 | #### Try qwen `-m qwen-vl` 86 | Use Qwen-vl with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Qwen dashboard](https://bailian.console.aliyun.com/) to get an API key and run the command below to try it. 87 | 88 | ``` 89 | operate -m qwen-vl 90 | ``` 91 | 92 | #### Try LLaVa Hosted Through Ollama `-m llava` 93 | If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama! 94 | *Note: Ollama currently only supports MacOS and Linux. Windows now in Preview* 95 | 96 | First, install Ollama on your machine from https://ollama.ai/download. 97 | 98 | Once Ollama is installed, pull the LLaVA model: 99 | ``` 100 | ollama pull llava 101 | ``` 102 | This will download the model on your machine which takes approximately 5 GB of storage. 103 | 104 | When Ollama has finished pulling LLaVA, start the server: 105 | ``` 106 | ollama serve 107 | ``` 108 | 109 | That's it! Now start `operate` and select the LLaVA model: 110 | ``` 111 | operate -m llava 112 | ``` 113 | **Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time. 114 | 115 | Learn more about Ollama at its [GitHub Repository](https://www.github.com/ollama/ollama) 116 | 117 | ### Voice Mode `--voice` 118 | The framework supports voice inputs for the objective. Try voice by following the instructions below. 
119 | **Clone the repo** to a directory on your computer: 120 | ``` 121 | git clone https://github.com/OthersideAI/self-operating-computer.git 122 | ``` 123 | **`cd` into the directory**: 124 | ``` 125 | cd self-operating-computer 126 | ``` 127 | **Install the additional requirements from `requirements-audio.txt`**: 128 | ``` 129 | pip install -r requirements-audio.txt 130 | ``` 131 | **Install device requirements** 132 | For Mac users: 133 | ``` 134 | brew install portaudio 135 | ``` 136 | For Linux users: 137 | ``` 138 | sudo apt install portaudio19-dev python3-pyaudio 139 | ``` 140 | **Run with voice mode**: 141 | ``` 142 | operate --voice 143 | ``` 144 | 145 | ### Optical Character Recognition Mode `-m gpt-4-with-ocr` 146 | The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable elements and their coordinates. GPT-4 can decide to `click` an element by its text, and the code then looks up that element in the hash map to get the coordinates to click. 147 | 148 | Based on recent tests, OCR performs better than `som` and vanilla GPT-4, so we made it the default for the project. To use OCR mode, simply run: 149 | 150 | `operate` on its own; `operate -m gpt-4-with-ocr` will also work. 151 | 152 | ### Set-of-Mark Prompting `-m gpt-4-with-som` 153 | The Self-Operating Computer Framework now supports Set-of-Mark (SoM) Prompting with the `gpt-4-with-som` command. This new visual prompting method enhances the visual grounding capabilities of large multimodal models. 154 | 155 | Learn more about SoM Prompting in the detailed [arXiv paper](https://arxiv.org/abs/2310.11441). 156 | 157 | For this initial version, a simple YOLOv8 model is trained for button detection, and the `best.pt` file is included under `operate/models/weights/`. Users are encouraged to swap in their own `best.pt` file to evaluate performance improvements. If your model outperforms the existing one, please contribute by creating a pull request (PR). 158 | 159 | Start `operate` with the SoM model: 160 | 161 | ``` 162 | operate -m gpt-4-with-som 163 | ``` 164 | 165 | 166 | 167 | ## Contributions are Welcomed! 168 | 169 | If you want to contribute yourself, see [CONTRIBUTING.md](https://github.com/OthersideAI/self-operating-computer/blob/main/CONTRIBUTING.md). 170 | 171 | ## Feedback 172 | 173 | For any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter. 174 | 175 | ## Join Our Discord Community 176 | 177 | For real-time discussions and community support, join our Discord server. 178 | - If you're already a member, join the discussion in [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157). 179 | - If you're new, first [join our Discord Server](https://discord.gg/YqaKtyBEzM) and then navigate to the [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157) channel. 180 | 181 | ## Follow HyperWriteAI for More Updates 182 | 183 | Stay updated with the latest developments: 184 | - Follow HyperWriteAI on [Twitter](https://twitter.com/HyperWriteAI). 185 | - Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/). 186 | 187 | ## Compatibility 188 | - This project is compatible with macOS, Windows, and Linux (with X server installed). 189 | 190 | ## OpenAI Rate Limiting Note 191 | The `gpt-4o` model is required.
To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5. 192 | Learn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)** 193 | -------------------------------------------------------------------------------- /operate/models/prompts.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from operate.config import Config 3 | 4 | # Load configuration 5 | config = Config() 6 | 7 | # General user Prompts 8 | USER_QUESTION = "Hello, I can help you with anything. What would you like done?" 9 | 10 | 11 | SYSTEM_PROMPT_STANDARD = """ 12 | You are operating a {operating_system} computer, using the same operating system as a human. 13 | 14 | From looking at the screen, the objective, and your previous actions, take the next best series of action. 15 | 16 | You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 17 | 18 | 1. click - Move mouse and click 19 | ``` 20 | [{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format 21 | ``` 22 | 23 | 2. write - Write with your keyboard 24 | ``` 25 | [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] 26 | ``` 27 | 28 | 3. press - Use a hotkey or press key to operate the computer 29 | ``` 30 | [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] 31 | ``` 32 | 33 | 4. done - The objective is completed 34 | ``` 35 | [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] 36 | ``` 37 | 38 | Return the actions in array format `[]`. You can take just one action or multiple actions. 39 | 40 | Here a helpful example: 41 | 42 | Example 1: Searches for Google Chrome on the OS and opens it 43 | ``` 44 | [ 45 | {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, 46 | {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, 47 | {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }} 48 | ] 49 | ``` 50 | 51 | Example 2: Focuses on the address bar in a browser before typing a website 52 | ``` 53 | [ 54 | {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, 55 | {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, 56 | {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }} 57 | ] 58 | ``` 59 | 60 | A few important notes: 61 | 62 | - Go to Google Docs and Google Sheets by typing in the Chrome Address bar 63 | - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. 
64 | 65 | Objective: {objective} 66 | """ 67 | 68 | 69 | SYSTEM_PROMPT_LABELED = """ 70 | You are operating a {operating_system} computer, using the same operating system as a human. 71 | 72 | From looking at the screen, the objective, and your previous actions, take the next best series of action. 73 | 74 | You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 75 | 76 | 1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` 77 | ``` 78 | [{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format 79 | ``` 80 | 2. write - Write with your keyboard 81 | ``` 82 | [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] 83 | ``` 84 | 3. press - Use a hotkey or press key to operate the computer 85 | ``` 86 | [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] 87 | ``` 88 | 89 | 4. done - The objective is completed 90 | ``` 91 | [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] 92 | ``` 93 | Return the actions in array format `[]`. You can take just one action or multiple actions. 94 | 95 | Here a helpful example: 96 | 97 | Example 1: Searches for Google Chrome on the OS and opens it 98 | ``` 99 | [ 100 | {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, 101 | {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, 102 | ] 103 | ``` 104 | 105 | Example 2: Focuses on the address bar in a browser before typing a website 106 | ``` 107 | [ 108 | {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, 109 | {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, 110 | {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }} 111 | ] 112 | ``` 113 | 114 | Example 3: Send a "Hello World" message in the chat 115 | ``` 116 | [ 117 | {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }}, 118 | {{ "thought": "Now that I am focused on the message field, I'll go ahead and write ", "operation": "write", "content": "Hello World" }}, 119 | ] 120 | ``` 121 | 122 | A few important notes: 123 | 124 | - Go to Google Docs and Google Sheets by typing in the Chrome Address bar 125 | - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. 126 | 127 | Objective: {objective} 128 | """ 129 | 130 | 131 | # TODO: Add an example or instruction about `Action: press ['pagedown']` to scroll 132 | SYSTEM_PROMPT_OCR = """ 133 | You are operating a {operating_system} computer, using the same operating system as a human. 134 | 135 | From looking at the screen, the objective, and your previous actions, take the next best series of action. 
136 | 137 | You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 138 | 139 | 1. click - Move mouse and click - Look for text to click. Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method. 140 | ``` 141 | [{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] 142 | ``` 143 | 2. write - Write with your keyboard 144 | ``` 145 | [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] 146 | ``` 147 | 3. press - Use a hotkey or press key to operate the computer 148 | ``` 149 | [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] 150 | ``` 151 | 4. done - The objective is completed 152 | ``` 153 | [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] 154 | ``` 155 | 156 | Return the actions in array format `[]`. You can take just one action or multiple actions. 157 | 158 | Here a helpful example: 159 | 160 | Example 1: Searches for Google Chrome on the OS and opens it 161 | ``` 162 | [ 163 | {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, 164 | {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, 165 | {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }} 166 | ] 167 | ``` 168 | 169 | Example 2: Open a new Google Docs when the browser is already open 170 | ``` 171 | [ 172 | {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "t"] }}, 173 | {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://docs.new/" }}, 174 | {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }} 175 | ] 176 | ``` 177 | 178 | Example 3: Search for someone on Linkedin when already on linkedin.com 179 | ``` 180 | [ 181 | {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }}, 182 | {{ "thought": "Now that the field is active I can write the name of the person I'd like to search for", "operation": "write", "content": "John Doe" }}, 183 | {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }} 184 | ] 185 | ``` 186 | 187 | A few important notes: 188 | 189 | - Default to Google Chrome as the browser 190 | - Go to websites by opening a new tab with `press` and then `write` the URL 191 | - Reflect on previous actions and the screenshot to ensure they align and that your previous actions worked. 192 | - If the first time clicking a button or link doesn't work, don't try again to click it. Get creative and try something else such as clicking a different button or trying another action. 193 | - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. 
194 | 195 | Objective: {objective} 196 | """ 197 | 198 | OPERATE_FIRST_MESSAGE_PROMPT = """ 199 | Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done 200 | 201 | You just started so you are in the terminal app and your code is running in this terminal tab. To leave the terminal, search for a new program on the OS. 202 | 203 | Action:""" 204 | 205 | OPERATE_PROMPT = """ 206 | Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done 207 | Action:""" 208 | 209 | 210 | def get_system_prompt(model, objective): 211 | """ 212 | Format the vision prompt more efficiently and print the name of the prompt used 213 | """ 214 | 215 | if platform.system() == "Darwin": 216 | cmd_string = "\"command\"" 217 | os_search_str = "[\"command\", \"space\"]" 218 | operating_system = "Mac" 219 | elif platform.system() == "Windows": 220 | cmd_string = "\"ctrl\"" 221 | os_search_str = "[\"win\"]" 222 | operating_system = "Windows" 223 | else: 224 | cmd_string = "\"ctrl\"" 225 | os_search_str = "[\"win\"]" 226 | operating_system = "Linux" 227 | 228 | if model == "gpt-4-with-som": 229 | prompt = SYSTEM_PROMPT_LABELED.format( 230 | objective=objective, 231 | cmd_string=cmd_string, 232 | os_search_str=os_search_str, 233 | operating_system=operating_system, 234 | ) 235 | elif model == "gpt-4-with-ocr" or model == "gpt-4.1-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl": 236 | 237 | prompt = SYSTEM_PROMPT_OCR.format( 238 | objective=objective, 239 | cmd_string=cmd_string, 240 | os_search_str=os_search_str, 241 | operating_system=operating_system, 242 | ) 243 | 244 | else: 245 | prompt = SYSTEM_PROMPT_STANDARD.format( 246 | objective=objective, 247 | cmd_string=cmd_string, 248 | os_search_str=os_search_str, 249 | operating_system=operating_system, 250 | ) 251 | 252 | # Optional verbose output 253 | if config.verbose: 254 | print("[get_system_prompt] model:", model) 255 | # print("[get_system_prompt] prompt:", prompt) 256 | 257 | return prompt 258 | 259 | 260 | def get_user_prompt(): 261 | prompt = OPERATE_PROMPT 262 | return prompt 263 | 264 | 265 | def get_user_first_message_prompt(): 266 | prompt = OPERATE_FIRST_MESSAGE_PROMPT 267 | return prompt 268 | -------------------------------------------------------------------------------- /operate/models/apis.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | import os 5 | import time 6 | import traceback 7 | 8 | import easyocr 9 | import ollama 10 | import pkg_resources 11 | from PIL import Image 12 | from ultralytics import YOLO 13 | 14 | from operate.config import Config 15 | from operate.exceptions import ModelNotRecognizedException 16 | from operate.models.prompts import ( 17 | get_system_prompt, 18 | get_user_first_message_prompt, 19 | get_user_prompt, 20 | ) 21 | from operate.utils.label import ( 22 | add_labels, 23 | get_click_position_in_percent, 24 | get_label_coordinates, 25 | ) 26 | from operate.utils.ocr import get_text_coordinates, get_text_element 27 | from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot 28 | from operate.utils.style import 
ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET 29 | 30 | # Load configuration 31 | config = Config() 32 | 33 | 34 | async def get_next_action(model, messages, objective, session_id): 35 | if config.verbose: 36 | print("[Self-Operating Computer][get_next_action]") 37 | print("[Self-Operating Computer][get_next_action] model", model) 38 | if model == "gpt-4": 39 | return call_gpt_4o(messages), None 40 | if model == "qwen-vl": 41 | operation = await call_qwen_vl_with_ocr(messages, objective, model) 42 | return operation, None 43 | if model == "gpt-4-with-som": 44 | operation = await call_gpt_4o_labeled(messages, objective, model) 45 | return operation, None 46 | if model == "gpt-4-with-ocr": 47 | operation = await call_gpt_4o_with_ocr(messages, objective, model) 48 | return operation, None 49 | if model == "gpt-4.1-with-ocr": 50 | operation = await call_gpt_4_1_with_ocr(messages, objective, model) 51 | return operation, None 52 | if model == "o1-with-ocr": 53 | operation = await call_o1_with_ocr(messages, objective, model) 54 | return operation, None 55 | if model == "agent-1": 56 | return "coming soon" 57 | if model == "gemini-pro-vision": 58 | return call_gemini_pro_vision(messages, objective), None 59 | if model == "llava": 60 | operation = call_ollama_llava(messages) 61 | return operation, None 62 | if model == "claude-3": 63 | operation = await call_claude_3_with_ocr(messages, objective, model) 64 | return operation, None 65 | raise ModelNotRecognizedException(model) 66 | 67 | 68 | def call_gpt_4o(messages): 69 | if config.verbose: 70 | print("[call_gpt_4_v]") 71 | time.sleep(1) 72 | client = config.initialize_openai() 73 | try: 74 | screenshots_dir = "screenshots" 75 | if not os.path.exists(screenshots_dir): 76 | os.makedirs(screenshots_dir) 77 | 78 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 79 | # Call the function to capture the screen with the cursor 80 | capture_screen_with_cursor(screenshot_filename) 81 | 82 | with open(screenshot_filename, "rb") as img_file: 83 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 84 | 85 | if len(messages) == 1: 86 | user_prompt = get_user_first_message_prompt() 87 | else: 88 | user_prompt = get_user_prompt() 89 | 90 | if config.verbose: 91 | print( 92 | "[call_gpt_4_v] user_prompt", 93 | user_prompt, 94 | ) 95 | 96 | vision_message = { 97 | "role": "user", 98 | "content": [ 99 | {"type": "text", "text": user_prompt}, 100 | { 101 | "type": "image_url", 102 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 103 | }, 104 | ], 105 | } 106 | messages.append(vision_message) 107 | 108 | response = client.chat.completions.create( 109 | model="gpt-4o", 110 | messages=messages, 111 | presence_penalty=1, 112 | frequency_penalty=1, 113 | ) 114 | 115 | content = response.choices[0].message.content 116 | 117 | content = clean_json(content) 118 | 119 | assistant_message = {"role": "assistant", "content": content} 120 | if config.verbose: 121 | print( 122 | "[call_gpt_4_v] content", 123 | content, 124 | ) 125 | content = json.loads(content) 126 | 127 | messages.append(assistant_message) 128 | 129 | return content 130 | 131 | except Exception as e: 132 | print( 133 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. 
Trying again {ANSI_RESET}", 134 | e, 135 | ) 136 | print( 137 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", 138 | content, 139 | ) 140 | if config.verbose: 141 | traceback.print_exc() 142 | return call_gpt_4o(messages) 143 | 144 | 145 | async def call_qwen_vl_with_ocr(messages, objective, model): 146 | if config.verbose: 147 | print("[call_qwen_vl_with_ocr]") 148 | 149 | # Construct the path to the file within the package 150 | try: 151 | time.sleep(1) 152 | client = config.initialize_qwen() 153 | 154 | confirm_system_prompt(messages, objective, model) 155 | screenshots_dir = "screenshots" 156 | if not os.path.exists(screenshots_dir): 157 | os.makedirs(screenshots_dir) 158 | 159 | # Call the function to capture the screen with the cursor 160 | raw_screenshot_filename = os.path.join(screenshots_dir, "raw_screenshot.png") 161 | capture_screen_with_cursor(raw_screenshot_filename) 162 | 163 | # Compress screenshot image to make size be smaller 164 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.jpeg") 165 | compress_screenshot(raw_screenshot_filename, screenshot_filename) 166 | 167 | with open(screenshot_filename, "rb") as img_file: 168 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 169 | 170 | if len(messages) == 1: 171 | user_prompt = get_user_first_message_prompt() 172 | else: 173 | user_prompt = get_user_prompt() 174 | 175 | vision_message = { 176 | "role": "user", 177 | "content": [ 178 | {"type": "text", 179 | "text": f"{user_prompt}**REMEMBER** Only output json format, do not append any other text."}, 180 | { 181 | "type": "image_url", 182 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 183 | }, 184 | ], 185 | } 186 | messages.append(vision_message) 187 | 188 | response = client.chat.completions.create( 189 | model="qwen2.5-vl-72b-instruct", 190 | messages=messages, 191 | ) 192 | 193 | content = response.choices[0].message.content 194 | 195 | content = clean_json(content) 196 | 197 | # used later for the messages 198 | content_str = content 199 | 200 | content = json.loads(content) 201 | 202 | processed_content = [] 203 | 204 | for operation in content: 205 | if operation.get("operation") == "click": 206 | text_to_click = operation.get("text") 207 | if config.verbose: 208 | print( 209 | "[call_qwen_vl_with_ocr][click] text_to_click", 210 | text_to_click, 211 | ) 212 | # Initialize EasyOCR Reader 213 | reader = easyocr.Reader(["en"]) 214 | 215 | # Read the screenshot 216 | result = reader.readtext(screenshot_filename) 217 | 218 | text_element_index = get_text_element( 219 | result, text_to_click, screenshot_filename 220 | ) 221 | coordinates = get_text_coordinates( 222 | result, text_element_index, screenshot_filename 223 | ) 224 | 225 | # add `coordinates`` to `content` 226 | operation["x"] = coordinates["x"] 227 | operation["y"] = coordinates["y"] 228 | 229 | if config.verbose: 230 | print( 231 | "[call_qwen_vl_with_ocr][click] text_element_index", 232 | text_element_index, 233 | ) 234 | print( 235 | "[call_qwen_vl_with_ocr][click] coordinates", 236 | coordinates, 237 | ) 238 | print( 239 | "[call_qwen_vl_with_ocr][click] final operation", 240 | operation, 241 | ) 242 | processed_content.append(operation) 243 | 244 | else: 245 | processed_content.append(operation) 246 | 247 | # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history 248 | assistant_message = {"role": "assistant", "content": content_str} 249 | 
messages.append(assistant_message) 250 | 251 | return processed_content 252 | 253 | except Exception as e: 254 | print( 255 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}" 256 | ) 257 | if config.verbose: 258 | print("[Self-Operating Computer][Operate] error", e) 259 | traceback.print_exc() 260 | return gpt_4_fallback(messages, objective, model) 261 | 262 | def call_gemini_pro_vision(messages, objective): 263 | """ 264 | Get the next action for Self-Operating Computer using Gemini Pro Vision 265 | """ 266 | if config.verbose: 267 | print( 268 | "[Self Operating Computer][call_gemini_pro_vision]", 269 | ) 270 | # sleep for a second 271 | time.sleep(1) 272 | try: 273 | screenshots_dir = "screenshots" 274 | if not os.path.exists(screenshots_dir): 275 | os.makedirs(screenshots_dir) 276 | 277 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 278 | # Call the function to capture the screen with the cursor 279 | capture_screen_with_cursor(screenshot_filename) 280 | # sleep for a second 281 | time.sleep(1) 282 | prompt = get_system_prompt("gemini-pro-vision", objective) 283 | 284 | model = config.initialize_google() 285 | if config.verbose: 286 | print("[call_gemini_pro_vision] model", model) 287 | 288 | response = model.generate_content([prompt, Image.open(screenshot_filename)]) 289 | 290 | content = response.text[1:] 291 | if config.verbose: 292 | print("[call_gemini_pro_vision] response", response) 293 | print("[call_gemini_pro_vision] content", content) 294 | 295 | content = json.loads(content) 296 | if config.verbose: 297 | print( 298 | "[get_next_action][call_gemini_pro_vision] content", 299 | content, 300 | ) 301 | 302 | return content 303 | 304 | except Exception as e: 305 | print( 306 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. 
Trying another method {ANSI_RESET}" 307 | ) 308 | if config.verbose: 309 | print("[Self-Operating Computer][Operate] error", e) 310 | traceback.print_exc() 311 | return call_gpt_4o(messages) 312 | 313 | 314 | async def call_gpt_4o_with_ocr(messages, objective, model): 315 | if config.verbose: 316 | print("[call_gpt_4o_with_ocr]") 317 | 318 | # Construct the path to the file within the package 319 | try: 320 | time.sleep(1) 321 | client = config.initialize_openai() 322 | 323 | confirm_system_prompt(messages, objective, model) 324 | screenshots_dir = "screenshots" 325 | if not os.path.exists(screenshots_dir): 326 | os.makedirs(screenshots_dir) 327 | 328 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 329 | # Call the function to capture the screen with the cursor 330 | capture_screen_with_cursor(screenshot_filename) 331 | 332 | with open(screenshot_filename, "rb") as img_file: 333 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 334 | 335 | if len(messages) == 1: 336 | user_prompt = get_user_first_message_prompt() 337 | else: 338 | user_prompt = get_user_prompt() 339 | 340 | vision_message = { 341 | "role": "user", 342 | "content": [ 343 | {"type": "text", "text": user_prompt}, 344 | { 345 | "type": "image_url", 346 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 347 | }, 348 | ], 349 | } 350 | messages.append(vision_message) 351 | 352 | response = client.chat.completions.create( 353 | model="gpt-4o", 354 | messages=messages, 355 | ) 356 | 357 | content = response.choices[0].message.content 358 | 359 | content = clean_json(content) 360 | 361 | # used later for the messages 362 | content_str = content 363 | 364 | content = json.loads(content) 365 | 366 | processed_content = [] 367 | 368 | for operation in content: 369 | if operation.get("operation") == "click": 370 | text_to_click = operation.get("text") 371 | if config.verbose: 372 | print( 373 | "[call_gpt_4o_with_ocr][click] text_to_click", 374 | text_to_click, 375 | ) 376 | # Initialize EasyOCR Reader 377 | reader = easyocr.Reader(["en"]) 378 | 379 | # Read the screenshot 380 | result = reader.readtext(screenshot_filename) 381 | 382 | text_element_index = get_text_element( 383 | result, text_to_click, screenshot_filename 384 | ) 385 | coordinates = get_text_coordinates( 386 | result, text_element_index, screenshot_filename 387 | ) 388 | 389 | # add `coordinates`` to `content` 390 | operation["x"] = coordinates["x"] 391 | operation["y"] = coordinates["y"] 392 | 393 | if config.verbose: 394 | print( 395 | "[call_gpt_4o_with_ocr][click] text_element_index", 396 | text_element_index, 397 | ) 398 | print( 399 | "[call_gpt_4o_with_ocr][click] coordinates", 400 | coordinates, 401 | ) 402 | print( 403 | "[call_gpt_4o_with_ocr][click] final operation", 404 | operation, 405 | ) 406 | processed_content.append(operation) 407 | 408 | else: 409 | processed_content.append(operation) 410 | 411 | # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history 412 | assistant_message = {"role": "assistant", "content": content_str} 413 | messages.append(assistant_message) 414 | 415 | return processed_content 416 | 417 | except Exception as e: 418 | print( 419 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. 
Trying another method {ANSI_RESET}" 420 | ) 421 | if config.verbose: 422 | print("[Self-Operating Computer][Operate] error", e) 423 | traceback.print_exc() 424 | return gpt_4_fallback(messages, objective, model) 425 | 426 | 427 | async def call_gpt_4_1_with_ocr(messages, objective, model): 428 | if config.verbose: 429 | print("[call_gpt_4_1_with_ocr]") 430 | 431 | try: 432 | time.sleep(1) 433 | client = config.initialize_openai() 434 | 435 | confirm_system_prompt(messages, objective, model) 436 | screenshots_dir = "screenshots" 437 | if not os.path.exists(screenshots_dir): 438 | os.makedirs(screenshots_dir) 439 | 440 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 441 | capture_screen_with_cursor(screenshot_filename) 442 | 443 | with open(screenshot_filename, "rb") as img_file: 444 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 445 | 446 | if len(messages) == 1: 447 | user_prompt = get_user_first_message_prompt() 448 | else: 449 | user_prompt = get_user_prompt() 450 | 451 | vision_message = { 452 | "role": "user", 453 | "content": [ 454 | {"type": "text", "text": user_prompt}, 455 | { 456 | "type": "image_url", 457 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 458 | }, 459 | ], 460 | } 461 | messages.append(vision_message) 462 | 463 | response = client.chat.completions.create( 464 | model="gpt-4.1", 465 | messages=messages, 466 | ) 467 | 468 | content = response.choices[0].message.content 469 | 470 | content = clean_json(content) 471 | 472 | content_str = content 473 | 474 | content = json.loads(content) 475 | 476 | processed_content = [] 477 | 478 | for operation in content: 479 | if operation.get("operation") == "click": 480 | text_to_click = operation.get("text") 481 | if config.verbose: 482 | print( 483 | "[call_gpt_4_1_with_ocr][click] text_to_click", 484 | text_to_click, 485 | ) 486 | reader = easyocr.Reader(["en"]) 487 | 488 | result = reader.readtext(screenshot_filename) 489 | 490 | text_element_index = get_text_element( 491 | result, text_to_click, screenshot_filename 492 | ) 493 | coordinates = get_text_coordinates( 494 | result, text_element_index, screenshot_filename 495 | ) 496 | 497 | operation["x"] = coordinates["x"] 498 | operation["y"] = coordinates["y"] 499 | 500 | if config.verbose: 501 | print( 502 | "[call_gpt_4_1_with_ocr][click] text_element_index", 503 | text_element_index, 504 | ) 505 | print( 506 | "[call_gpt_4_1_with_ocr][click] coordinates", 507 | coordinates, 508 | ) 509 | print( 510 | "[call_gpt_4_1_with_ocr][click] final operation", 511 | operation, 512 | ) 513 | processed_content.append(operation) 514 | 515 | else: 516 | processed_content.append(operation) 517 | 518 | assistant_message = {"role": "assistant", "content": content_str} 519 | messages.append(assistant_message) 520 | 521 | return processed_content 522 | 523 | except Exception as e: 524 | print( 525 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. 
Trying another method {ANSI_RESET}" 526 | ) 527 | if config.verbose: 528 | print("[Self-Operating Computer][Operate] error", e) 529 | traceback.print_exc() 530 | return gpt_4_fallback(messages, objective, model) 531 | 532 | 533 | async def call_o1_with_ocr(messages, objective, model): 534 | if config.verbose: 535 | print("[call_o1_with_ocr]") 536 | 537 | # Construct the path to the file within the package 538 | try: 539 | time.sleep(1) 540 | client = config.initialize_openai() 541 | 542 | confirm_system_prompt(messages, objective, model) 543 | screenshots_dir = "screenshots" 544 | if not os.path.exists(screenshots_dir): 545 | os.makedirs(screenshots_dir) 546 | 547 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 548 | # Call the function to capture the screen with the cursor 549 | capture_screen_with_cursor(screenshot_filename) 550 | 551 | with open(screenshot_filename, "rb") as img_file: 552 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 553 | 554 | if len(messages) == 1: 555 | user_prompt = get_user_first_message_prompt() 556 | else: 557 | user_prompt = get_user_prompt() 558 | 559 | vision_message = { 560 | "role": "user", 561 | "content": [ 562 | {"type": "text", "text": user_prompt}, 563 | { 564 | "type": "image_url", 565 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 566 | }, 567 | ], 568 | } 569 | messages.append(vision_message) 570 | 571 | response = client.chat.completions.create( 572 | model="o1", 573 | messages=messages, 574 | ) 575 | 576 | content = response.choices[0].message.content 577 | 578 | content = clean_json(content) 579 | 580 | # used later for the messages 581 | content_str = content 582 | 583 | content = json.loads(content) 584 | 585 | processed_content = [] 586 | 587 | for operation in content: 588 | if operation.get("operation") == "click": 589 | text_to_click = operation.get("text") 590 | if config.verbose: 591 | print( 592 | "[call_o1_with_ocr][click] text_to_click", 593 | text_to_click, 594 | ) 595 | # Initialize EasyOCR Reader 596 | reader = easyocr.Reader(["en"]) 597 | 598 | # Read the screenshot 599 | result = reader.readtext(screenshot_filename) 600 | 601 | text_element_index = get_text_element( 602 | result, text_to_click, screenshot_filename 603 | ) 604 | coordinates = get_text_coordinates( 605 | result, text_element_index, screenshot_filename 606 | ) 607 | 608 | # add `coordinates`` to `content` 609 | operation["x"] = coordinates["x"] 610 | operation["y"] = coordinates["y"] 611 | 612 | if config.verbose: 613 | print( 614 | "[call_o1_with_ocr][click] text_element_index", 615 | text_element_index, 616 | ) 617 | print( 618 | "[call_o1_with_ocr][click] coordinates", 619 | coordinates, 620 | ) 621 | print( 622 | "[call_o1_with_ocr][click] final operation", 623 | operation, 624 | ) 625 | processed_content.append(operation) 626 | 627 | else: 628 | processed_content.append(operation) 629 | 630 | # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history 631 | assistant_message = {"role": "assistant", "content": content_str} 632 | messages.append(assistant_message) 633 | 634 | return processed_content 635 | 636 | except Exception as e: 637 | print( 638 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. 
Trying another method {ANSI_RESET}" 639 | ) 640 | if config.verbose: 641 | print("[Self-Operating Computer][Operate] error", e) 642 | traceback.print_exc() 643 | return gpt_4_fallback(messages, objective, model) 644 | 645 | 646 | async def call_gpt_4o_labeled(messages, objective, model): 647 | time.sleep(1) 648 | 649 | try: 650 | client = config.initialize_openai() 651 | 652 | confirm_system_prompt(messages, objective, model) 653 | file_path = pkg_resources.resource_filename("operate.models.weights", "best.pt") 654 | yolo_model = YOLO(file_path) # Load your trained model 655 | screenshots_dir = "screenshots" 656 | if not os.path.exists(screenshots_dir): 657 | os.makedirs(screenshots_dir) 658 | 659 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 660 | # Call the function to capture the screen with the cursor 661 | capture_screen_with_cursor(screenshot_filename) 662 | 663 | with open(screenshot_filename, "rb") as img_file: 664 | img_base64 = base64.b64encode(img_file.read()).decode("utf-8") 665 | 666 | img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model) 667 | 668 | if len(messages) == 1: 669 | user_prompt = get_user_first_message_prompt() 670 | else: 671 | user_prompt = get_user_prompt() 672 | 673 | if config.verbose: 674 | print( 675 | "[call_gpt_4_vision_preview_labeled] user_prompt", 676 | user_prompt, 677 | ) 678 | 679 | vision_message = { 680 | "role": "user", 681 | "content": [ 682 | {"type": "text", "text": user_prompt}, 683 | { 684 | "type": "image_url", 685 | "image_url": { 686 | "url": f"data:image/jpeg;base64,{img_base64_labeled}" 687 | }, 688 | }, 689 | ], 690 | } 691 | messages.append(vision_message) 692 | 693 | response = client.chat.completions.create( 694 | model="gpt-4o", 695 | messages=messages, 696 | presence_penalty=1, 697 | frequency_penalty=1, 698 | ) 699 | 700 | content = response.choices[0].message.content 701 | 702 | content = clean_json(content) 703 | 704 | assistant_message = {"role": "assistant", "content": content} 705 | 706 | messages.append(assistant_message) 707 | 708 | content = json.loads(content) 709 | if config.verbose: 710 | print( 711 | "[call_gpt_4_vision_preview_labeled] content", 712 | content, 713 | ) 714 | 715 | processed_content = [] 716 | 717 | for operation in content: 718 | print( 719 | "[call_gpt_4_vision_preview_labeled] for operation in content", 720 | operation, 721 | ) 722 | if operation.get("operation") == "click": 723 | label = operation.get("label") 724 | if config.verbose: 725 | print( 726 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] label", 727 | label, 728 | ) 729 | 730 | coordinates = get_label_coordinates(label, label_coordinates) 731 | if config.verbose: 732 | print( 733 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates", 734 | coordinates, 735 | ) 736 | image = Image.open( 737 | io.BytesIO(base64.b64decode(img_base64)) 738 | ) # Load the image to get its size 739 | image_size = image.size # Get the size of the image (width, height) 740 | click_position_percent = get_click_position_in_percent( 741 | coordinates, image_size 742 | ) 743 | if config.verbose: 744 | print( 745 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent", 746 | click_position_percent, 747 | ) 748 | if not click_position_percent: 749 | print( 750 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. 
Trying another method {ANSI_RESET}" 751 | ) 752 | return call_gpt_4o(messages) 753 | 754 | x_percent = f"{click_position_percent[0]:.2f}" 755 | y_percent = f"{click_position_percent[1]:.2f}" 756 | operation["x"] = x_percent 757 | operation["y"] = y_percent 758 | if config.verbose: 759 | print( 760 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation", 761 | operation, 762 | ) 763 | processed_content.append(operation) 764 | else: 765 | if config.verbose: 766 | print( 767 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] .append none click operation", 768 | operation, 769 | ) 770 | 771 | processed_content.append(operation) 772 | 773 | if config.verbose: 774 | print( 775 | "[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content", 776 | processed_content, 777 | ) 778 | return processed_content 779 | 780 | except Exception as e: 781 | print( 782 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}" 783 | ) 784 | if config.verbose: 785 | print("[Self-Operating Computer][Operate] error", e) 786 | traceback.print_exc() 787 | return call_gpt_4o(messages) 788 | 789 | 790 | def call_ollama_llava(messages): 791 | if config.verbose: 792 | print("[call_ollama_llava]") 793 | time.sleep(1) 794 | try: 795 | model = config.initialize_ollama() 796 | screenshots_dir = "screenshots" 797 | if not os.path.exists(screenshots_dir): 798 | os.makedirs(screenshots_dir) 799 | 800 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 801 | # Call the function to capture the screen with the cursor 802 | capture_screen_with_cursor(screenshot_filename) 803 | 804 | if len(messages) == 1: 805 | user_prompt = get_user_first_message_prompt() 806 | else: 807 | user_prompt = get_user_prompt() 808 | 809 | if config.verbose: 810 | print( 811 | "[call_ollama_llava] user_prompt", 812 | user_prompt, 813 | ) 814 | 815 | vision_message = { 816 | "role": "user", 817 | "content": user_prompt, 818 | "images": [screenshot_filename], 819 | } 820 | messages.append(vision_message) 821 | 822 | response = model.chat( 823 | model="llava", 824 | messages=messages, 825 | ) 826 | 827 | # Important: Remove the image path from the message history. 828 | # Ollama will attempt to load each image reference and will 829 | # eventually timeout. 830 | messages[-1]["images"] = None 831 | 832 | content = response["message"]["content"].strip() 833 | 834 | content = clean_json(content) 835 | 836 | assistant_message = {"role": "assistant", "content": content} 837 | if config.verbose: 838 | print( 839 | "[call_ollama_llava] content", 840 | content, 841 | ) 842 | content = json.loads(content) 843 | 844 | messages.append(assistant_message) 845 | 846 | return content 847 | 848 | except ollama.ResponseError as e: 849 | print( 850 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}", 851 | e, 852 | ) 853 | 854 | except Exception as e: 855 | print( 856 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[llava] That did not work. 
Trying again {ANSI_RESET}", 857 | e, 858 | ) 859 | print( 860 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", 861 | content, 862 | ) 863 | if config.verbose: 864 | traceback.print_exc() 865 | return call_ollama_llava(messages) 866 | 867 | 868 | async def call_claude_3_with_ocr(messages, objective, model): 869 | if config.verbose: 870 | print("[call_claude_3_with_ocr]") 871 | 872 | try: 873 | time.sleep(1) 874 | client = config.initialize_anthropic() 875 | 876 | confirm_system_prompt(messages, objective, model) 877 | screenshots_dir = "screenshots" 878 | if not os.path.exists(screenshots_dir): 879 | os.makedirs(screenshots_dir) 880 | 881 | screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") 882 | capture_screen_with_cursor(screenshot_filename) 883 | 884 | # downsize screenshot due to 5MB size limit 885 | with open(screenshot_filename, "rb") as img_file: 886 | img = Image.open(img_file) 887 | 888 | # Convert RGBA to RGB 889 | if img.mode == "RGBA": 890 | img = img.convert("RGB") 891 | 892 | # Calculate the new dimensions while maintaining the aspect ratio 893 | original_width, original_height = img.size 894 | aspect_ratio = original_width / original_height 895 | new_width = 2560 # Adjust this value to achieve the desired file size 896 | new_height = int(new_width / aspect_ratio) 897 | if config.verbose: 898 | print("[call_claude_3_with_ocr] resizing claude") 899 | 900 | # Resize the image 901 | img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS) 902 | 903 | # Save the resized and converted image to a BytesIO object for JPEG format 904 | img_buffer = io.BytesIO() 905 | img_resized.save( 906 | img_buffer, format="JPEG", quality=85 907 | ) # Adjust the quality parameter as needed 908 | img_buffer.seek(0) 909 | 910 | # Encode the resized image as base64 911 | img_data = base64.b64encode(img_buffer.getvalue()).decode("utf-8") 912 | 913 | if len(messages) == 1: 914 | user_prompt = get_user_first_message_prompt() 915 | else: 916 | user_prompt = get_user_prompt() 917 | 918 | vision_message = { 919 | "role": "user", 920 | "content": [ 921 | { 922 | "type": "image", 923 | "source": { 924 | "type": "base64", 925 | "media_type": "image/jpeg", 926 | "data": img_data, 927 | }, 928 | }, 929 | { 930 | "type": "text", 931 | "text": user_prompt 932 | + "**REMEMBER** Only output json format, do not append any other text.", 933 | }, 934 | ], 935 | } 936 | messages.append(vision_message) 937 | 938 | # anthropic api expect system prompt as an separate argument 939 | response = client.messages.create( 940 | model="claude-3-opus-20240229", 941 | max_tokens=3000, 942 | system=messages[0]["content"], 943 | messages=messages[1:], 944 | ) 945 | 946 | content = response.content[0].text 947 | content = clean_json(content) 948 | content_str = content 949 | try: 950 | content = json.loads(content) 951 | # rework for json mode output 952 | except json.JSONDecodeError as e: 953 | if config.verbose: 954 | print( 955 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] JSONDecodeError: {e} {ANSI_RESET}" 956 | ) 957 | response = client.messages.create( 958 | model="claude-3-opus-20240229", 959 | max_tokens=3000, 960 | system=f"This json string is not valid, when using with json.loads(content) \ 961 | it throws the following error: {e}, return correct json string. 
\ 962 | **REMEMBER** Only output json format, do not append any other text.", 963 | messages=[{"role": "user", "content": content}], 964 | ) 965 | content = response.content[0].text 966 | content = clean_json(content) 967 | content_str = content 968 | content = json.loads(content) 969 | 970 | if config.verbose: 971 | print( 972 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] content: {content} {ANSI_RESET}" 973 | ) 974 | processed_content = [] 975 | 976 | for operation in content: 977 | if operation.get("operation") == "click": 978 | text_to_click = operation.get("text") 979 | if config.verbose: 980 | print( 981 | "[call_claude_3_ocr][click] text_to_click", 982 | text_to_click, 983 | ) 984 | # Initialize EasyOCR Reader 985 | reader = easyocr.Reader(["en"]) 986 | 987 | # Read the screenshot 988 | result = reader.readtext(screenshot_filename) 989 | 990 | # limit the text to extract has a higher success rate 991 | text_element_index = get_text_element( 992 | result, text_to_click[:3], screenshot_filename 993 | ) 994 | coordinates = get_text_coordinates( 995 | result, text_element_index, screenshot_filename 996 | ) 997 | 998 | # add `coordinates`` to `content` 999 | operation["x"] = coordinates["x"] 1000 | operation["y"] = coordinates["y"] 1001 | 1002 | if config.verbose: 1003 | print( 1004 | "[call_claude_3_ocr][click] text_element_index", 1005 | text_element_index, 1006 | ) 1007 | print( 1008 | "[call_claude_3_ocr][click] coordinates", 1009 | coordinates, 1010 | ) 1011 | print( 1012 | "[call_claude_3_ocr][click] final operation", 1013 | operation, 1014 | ) 1015 | processed_content.append(operation) 1016 | 1017 | else: 1018 | processed_content.append(operation) 1019 | 1020 | assistant_message = {"role": "assistant", "content": content_str} 1021 | messages.append(assistant_message) 1022 | 1023 | return processed_content 1024 | 1025 | except Exception as e: 1026 | print( 1027 | f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}" 1028 | ) 1029 | if config.verbose: 1030 | print("[Self-Operating Computer][Operate] error", e) 1031 | traceback.print_exc() 1032 | print("message before convertion ", messages) 1033 | 1034 | # Convert the messages to the GPT-4 format 1035 | gpt4_messages = [messages[0]] # Include the system message 1036 | for message in messages[1:]: 1037 | if message["role"] == "user": 1038 | # Update the image type format from "source" to "url" 1039 | updated_content = [] 1040 | for item in message["content"]: 1041 | if isinstance(item, dict) and "type" in item: 1042 | if item["type"] == "image": 1043 | updated_content.append( 1044 | { 1045 | "type": "image_url", 1046 | "image_url": { 1047 | "url": f"data:image/png;base64,{item['source']['data']}" 1048 | }, 1049 | } 1050 | ) 1051 | else: 1052 | updated_content.append(item) 1053 | 1054 | gpt4_messages.append({"role": "user", "content": updated_content}) 1055 | elif message["role"] == "assistant": 1056 | gpt4_messages.append( 1057 | {"role": "assistant", "content": message["content"]} 1058 | ) 1059 | 1060 | return gpt_4_fallback(gpt4_messages, objective, model) 1061 | 1062 | 1063 | def get_last_assistant_message(messages): 1064 | """ 1065 | Retrieve the last message from the assistant in the messages array. 1066 | If the last assistant message is the first message in the array, return None. 
1067 | """ 1068 | for index in reversed(range(len(messages))): 1069 | if messages[index]["role"] == "assistant": 1070 | if index == 0: # Check if the assistant message is the first in the array 1071 | return None 1072 | else: 1073 | return messages[index] 1074 | return None # Return None if no assistant message is found 1075 | 1076 | 1077 | def gpt_4_fallback(messages, objective, model): 1078 | if config.verbose: 1079 | print("[gpt_4_fallback]") 1080 | system_prompt = get_system_prompt("gpt-4o", objective) 1081 | new_system_message = {"role": "system", "content": system_prompt} 1082 | # remove and replace the first message in `messages` with `new_system_message` 1083 | 1084 | messages[0] = new_system_message 1085 | 1086 | if config.verbose: 1087 | print("[gpt_4_fallback][updated]") 1088 | print("[gpt_4_fallback][updated] len(messages)", len(messages)) 1089 | 1090 | return call_gpt_4o(messages) 1091 | 1092 | 1093 | def confirm_system_prompt(messages, objective, model): 1094 | """ 1095 | On `Exception` we default to `call_gpt_4_vision_preview` so we have this function to reassign system prompt in case of a previous failure 1096 | """ 1097 | if config.verbose: 1098 | print("[confirm_system_prompt] model", model) 1099 | 1100 | system_prompt = get_system_prompt(model, objective) 1101 | new_system_message = {"role": "system", "content": system_prompt} 1102 | # remove and replace the first message in `messages` with `new_system_message` 1103 | 1104 | messages[0] = new_system_message 1105 | 1106 | if config.verbose: 1107 | print("[confirm_system_prompt]") 1108 | print("[confirm_system_prompt] len(messages)", len(messages)) 1109 | for m in messages: 1110 | if m["role"] != "user": 1111 | print("--------------------[message]--------------------") 1112 | print("[confirm_system_prompt][message] role", m["role"]) 1113 | print("[confirm_system_prompt][message] content", m["content"]) 1114 | print("------------------[end message]------------------") 1115 | 1116 | 1117 | def clean_json(content): 1118 | if config.verbose: 1119 | print("\n\n[clean_json] content before cleaning", content) 1120 | if content.startswith("```json"): 1121 | content = content[ 1122 | len("```json") : 1123 | ].strip() # Remove starting ```json and trim whitespace 1124 | elif content.startswith("```"): 1125 | content = content[ 1126 | len("```") : 1127 | ].strip() # Remove starting ``` and trim whitespace 1128 | if content.endswith("```"): 1129 | content = content[ 1130 | : -len("```") 1131 | ].strip() # Remove ending ``` and trim whitespace 1132 | 1133 | # Normalize line breaks and remove any unwanted characters 1134 | content = "\n".join(line.strip() for line in content.splitlines()) 1135 | 1136 | if config.verbose: 1137 | print("\n\n[clean_json] content after cleaning", content) 1138 | 1139 | return content 1140 | --------------------------------------------------------------------------------