├── robbieg2 ├── __init__.py ├── easyocr.py ├── cheap_critic.py ├── server.py ├── grid.py ├── canny_composite.py ├── tool.py ├── img.py ├── clicker.py └── agent.py ├── .gitattributes ├── font └── arialbd.ttf ├── images ├── robbie.jpg └── meet-robbie-g2.jpg ├── agent.yaml ├── pyproject.toml ├── LICENSE ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── README.md └── .gitignore /robbieg2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /font/arialbd.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agentsea/robbie-g2/HEAD/font/arialbd.ttf -------------------------------------------------------------------------------- /images/robbie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agentsea/robbie-g2/HEAD/images/robbie.jpg -------------------------------------------------------------------------------- /images/meet-robbie-g2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agentsea/robbie-g2/HEAD/images/meet-robbie-g2.jpg -------------------------------------------------------------------------------- /robbieg2/easyocr.py: -------------------------------------------------------------------------------- 1 | import easyocr 2 | 3 | def find_all_text_with_bounding_boxes(path: str) -> [dict]: 4 | try: 5 | reader = easyocr.Reader(['en']) 6 | results = reader.readtext(path) 7 | processed_results = [] 8 | for box in results: 9 | result = { 10 | "x": int(box[0][0][0]), 11 | "y": int(box[0][0][1]), 12 | "w": int(box[0][2][0] - box[0][0][0]), 13 | "h": int(box[0][2][1] - box[0][0][1]), 14 | "text": box[1], 15 | "confidence": float(box[2]) 16 | } 17 | processed_results.append(result) 18 | return processed_results 19 | except Exception as e: 20 | print(f"EasyOCR failed: {str(e)}") 21 | return [] 22 | -------------------------------------------------------------------------------- /agent.yaml: -------------------------------------------------------------------------------- 1 | api_version: v1 2 | kind: TaskAgent 3 | name: "RobbieG2" 4 | description: "A Gen 2 AI Agent that uses OCR, Canny Composite, and Grid to navigate GUIs" 5 | tags: 6 | - "gui" 7 | supports: 8 | - "desktop" 9 | cmd: "poetry run python -m robbieg2.server" 10 | img_repo: "us-central1-docker.pkg.dev/agentsea-dev/guisurfer/robbieg2" 11 | versions: 12 | latest: "us-central1-docker.pkg.dev/agentsea-dev/guisurfer/robbieg2:latest" 13 | runtimes: 14 | - type: "agent" 15 | preference: 16 | - "process" 17 | - "docker" 18 | - "kube" 19 | llm_providers: 20 | preference: 21 | - "gpt-4o" 22 | public: True 23 | icon: https://storage.googleapis.com/guisurfer-assets/SurfPizza.webp 24 | resource_requests: 25 | cpu: "1" 26 | memory: "2Gi" 27 | resource_limits: 28 | cpu: "2" 29 | memory: "4Gi" 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [tool.poetry] 3 | name = "RobbieG2" 4 | version = "0.1.0" 5 | description = "A Gen 2 AI Agent 
that uses OCR, Canny Composite, and Grid to navigate GUIs." 6 | authors = ["Kentauros AI "] 7 | license = "MIT" 8 | readme = "README.md" 9 | packages = [{include = "robbieg2"}] 10 | 11 | 12 | [tool.poetry.dependencies] 13 | python = "^3.10" 14 | pydantic = "^2.6.3" 15 | opencv-python = "^4.10.0.84" 16 | numpy = "^1.26.4" 17 | scikit-learn = "^1.5.0" 18 | easyocr = "^1.7.1" 19 | torch = { version = "2.2.2", source = "pypi" } 20 | pillow = "^10.3.0" 21 | surfkit = "^0.1.309" 22 | 23 | 24 | [tool.poetry.group.dev.dependencies] 25 | flake8 = "^7.0.0" 26 | black = "^24.2.0" 27 | pytest = "^8.0.2" 28 | ipykernel = "^6.29.3" 29 | pytest-env = "^1.1.3" 30 | 31 | 32 | [build-system] 33 | requires = ["poetry-core"] 34 | build-backend = "poetry.core.masonry.api" 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 AgentSea 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /robbieg2/cheap_critic.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from PIL import Image 3 | import numpy as np 4 | from skimage.metrics import structural_similarity as ssim 5 | 6 | def assess_action_result(starting_image: Image.Image, updated_image: Image.Image) -> (float, bool): 7 | """Cheap critic returns True if the chain of actions can be continued and False otherwise. 8 | In the current version, we continue if the SSIM is above a threshold (i.e. the images are visually similar). 
9 | """ 10 | threshold = 0.9 11 | ssim = compare_images(starting_image, updated_image) 12 | if ssim > threshold: 13 | return ssim, True 14 | else: 15 | return ssim, False 16 | 17 | def _pil_to_cv2(pil_image): 18 | # Ensure the image is in RGB mode 19 | if pil_image.mode != 'RGB': 20 | pil_image = pil_image.convert('RGB') 21 | # Convert the PIL Image to a NumPy array 22 | np_image = np.array(pil_image) 23 | # Convert RGB to BGR format (OpenCV uses BGR) 24 | cv2_image = cv2.cvtColor(np_image, cv2.COLOR_RGB2BGR) 25 | return cv2_image 26 | 27 | def compare_images(image1, image2): 28 | image1 = _pil_to_cv2(image1) 29 | image2 = _pil_to_cv2(image2) 30 | 31 | # Convert the images to grayscale 32 | gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY) 33 | gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY) 34 | 35 | # Compute SSIM between the two images 36 | similarity_index, diff = ssim(gray1, gray2, full=True) 37 | 38 | print(f"SSIM: {similarity_index}") 39 | return similarity_index 40 | -------------------------------------------------------------------------------- /robbieg2/server.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from contextlib import asynccontextmanager 5 | from typing import Final 6 | 7 | import uvicorn 8 | from fastapi import FastAPI 9 | from fastapi.middleware.cors import CORSMiddleware 10 | from surfkit.server.routes import task_router 11 | 12 | from .agent import Agent 13 | 14 | # Configure logging 15 | logger: Final = logging.getLogger("robbieg2") 16 | logger.setLevel(int(os.getenv("LOG_LEVEL", str(logging.DEBUG)))) 17 | handler = logging.StreamHandler(sys.stdout) 18 | handler.setLevel(logging.INFO) 19 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 20 | handler.setFormatter(formatter) 21 | 22 | # Ensure logs are flushed immediately 23 | handler.flush = sys.stdout.flush 24 | logger.addHandler(handler) 25 | logger.propagate = False 26 | 27 | ALLOW_ORIGINS = os.getenv("ALLOW_ORIGINS", "*").split(",") 28 | ALLOW_METHODS = os.getenv("ALLOW_METHODS", "*").split(",") 29 | ALLOW_HEADERS = os.getenv("ALLOW_HEADERS", "*").split(",") 30 | 31 | 32 | @asynccontextmanager 33 | async def lifespan(app: FastAPI): 34 | # Initialize the agent type before the server comes live 35 | Agent.init() 36 | yield 37 | 38 | 39 | app = FastAPI(lifespan=lifespan) # type: ignore 40 | 41 | app.add_middleware( 42 | CORSMiddleware, 43 | allow_origins=ALLOW_ORIGINS, 44 | allow_credentials=True, 45 | allow_methods=ALLOW_METHODS, 46 | allow_headers=ALLOW_HEADERS, 47 | ) 48 | 49 | app.include_router(task_router(Agent)) 50 | 51 | if __name__ == "__main__": 52 | port = os.getenv("SERVER_PORT", "9090") 53 | reload = os.getenv("SERVER_RELOAD", "true") == "true" 54 | host = os.getenv("SERVER_HOST", "0.0.0.0") 55 | uvicorn.run( 56 | "robbieg2.server:app", 57 | host=host, 58 | port=int(port), 59 | reload=reload, 60 | reload_excludes=[".data"], 61 | log_config=None, # Disable default Uvicorn log configuration 62 | ) 63 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | First off, thank you for considering contributing to this project. It's people like you that make it such a great tool. 4 | 5 | ## Code of Conduct 6 | 7 | This project adheres to a Code of Conduct that we expect project participants to adhere to. 
Please read [the full text](CODE_OF_CONDUCT.md) so that you can understand what actions will and will not be tolerated. 8 | 9 | ## What we are looking for 10 | 11 | This is an open-source project, and we welcome contributions of all kinds: new features, bug fixes, documentation, examples, or enhancements to existing features. We are always thrilled to receive contributions from the community. 12 | 13 | ## How to contribute 14 | 15 | If you've never contributed to an open-source project before, here are a few steps to get you started: 16 | 17 | ### Reporting Issues 18 | 19 | Before submitting a bug report or feature request, check to make sure it hasn't already been submitted. You can search through existing issues and pull requests to see if someone has reported one similar to yours. 20 | 21 | When you are creating a bug report, please include as much detail as possible. 22 | 23 | ### Pull Requests 24 | 25 | - Fork the repository and create your branch from `main`. 26 | - If you've added code that should be tested, add tests. 27 | - If you've changed APIs, update the documentation. 28 | - Ensure the test suite passes. 29 | - Make sure your code lints. 30 | - Issue that pull request! 31 | 32 | ### Getting started 33 | 34 | For something that is bigger than a one or two-line fix: 35 | 36 | 1. Create your own fork of the code. 37 | 2. Do the changes in your fork. 38 | 3. If you like the change and think the project could use it: 39 | - Be sure you have followed the code style for the project. 40 | - Note the Code of Conduct. 41 | - Send a pull request. 42 | 43 | ## How to report a bug 44 | 45 | If you find a security vulnerability, do NOT open an issue. Email github@kentauros.ai instead. 46 | 47 | In order to help us understand and resolve your issue quickly, please include as much information as possible, including: 48 | 49 | - A quick summary and/or background 50 | - Steps to reproduce 51 | - Be specific! 52 | - Give a sample code if you can. 53 | - What you expected would happen 54 | - What actually happens 55 | - Notes (possibly including why you think this might be happening or stuff you tried that didn't work) 56 | 57 | People *love* thorough bug reports. I'm not even kidding. 58 | 59 | ## How to suggest a feature or enhancement 60 | 61 | If you find yourself wishing for a feature that doesn't exist in the project, you are probably not alone. There are bound to be others out there with similar needs. Open an issue on our issues list on GitHub, which describes the feature you would like to see, why you need it, and how it should work. 62 | 63 | ## Code review process 64 | 65 | The core team looks at Pull Requests on a regular basis in a bi-weekly triage meeting. After feedback has been given, we expect responses within two weeks. After two weeks, we may close the pull request if it isn't showing any activity. 66 | 67 | ## Community 68 | 69 | Discussions about the project take place in this repository's Issues and Pull Requests sections. Anybody is welcome to join these conversations. 70 | 71 | Wherever possible, we use GitHub to discuss changes and keep the decision-making process open. 72 | 73 | ## Thank you! 74 | 75 | Thank you for contributing! 
76 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | - Using welcoming and inclusive language 12 | - Being respectful of differing viewpoints and experiences 13 | - Gracefully accepting constructive criticism 14 | - Focusing on what is best for the community 15 | - Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | - Trolling, insulting/derogatory comments, and personal or political attacks 21 | - Public or private harassment 22 | - Publishing others' private information, such as a physical or email address, without explicit permission 23 | - Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies within all project spaces, including GitHub, and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at github@kentauros.ai. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality regarding the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 
44 | 45 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). 46 | 47 | For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 48 | -------------------------------------------------------------------------------- /robbieg2/grid.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw, ImageFont 2 | 3 | # We need a simple grid: numbers from 1 to 9 in points on an intersection of nxn grid. 4 | # The font size may be 1/5 of the size of the height of the cell. 5 | # Therefore, we need the size of the image and colors, and the file_name. 6 | 7 | def create_grid_image(image_width, image_height, color_circle, color_number, n, file_name): 8 | cell_width = image_width // n 9 | cell_height = image_height // n 10 | font_size = max(cell_height // 5, 20) 11 | circle_radius = font_size * 7 // 10 12 | 13 | # Create a blank image with transparent background 14 | img = Image.new('RGBA', (image_width, image_height), (0, 0, 0, 0)) 15 | draw = ImageDraw.Draw(img) 16 | 17 | # Load a font 18 | font = ImageFont.truetype("font/arialbd.ttf", font_size) 19 | 20 | # Set the number of cells in each dimension 21 | num_cells_x = n - 1 22 | num_cells_y = n - 1 23 | 24 | # Draw the numbers in the center of each cell 25 | for i in range(num_cells_x): 26 | for j in range(num_cells_y): 27 | number = i * num_cells_y + j + 1 28 | text = str(number) 29 | x = (i + 1) * cell_width 30 | y = (j + 1) * cell_height 31 | draw.ellipse([x - circle_radius, y - circle_radius, 32 | x + circle_radius, y + circle_radius], 33 | fill=color_circle) 34 | offset_x = font_size / 4 if number < 10 else font_size / 2 35 | draw.text((x - offset_x, y - font_size / 2), text, font=font, fill=color_number) 36 | 37 | # Save the image 38 | img.save(file_name) 39 | 40 | def zoom_in(image_path, n, index, upscale): 41 | img = Image.open(image_path) 42 | width, height = img.size 43 | # we need to calculate the cell size 44 | cell_width = width // n 45 | cell_height = height // n 46 | # we need to calculate the x and y coordinates of the cell 47 | x = ((index - 1) // (n - 1)) * cell_width 48 | y = ((index - 1) % (n - 1)) * cell_height 49 | # we need to calculate the x and y coordinates of the top left corner of the cell 50 | top_left = (x, y) 51 | # we need to calculate the x and y coordinates of the bottom right corner of the cell 52 | bottom_right = (x + 2 * cell_width, y + 2 * cell_height) 53 | # we need to crop the image 54 | 55 | cropped_img = img.crop(top_left + bottom_right) 56 | cropped_img = cropped_img.resize((cropped_img.width * upscale, cropped_img.height * upscale), resample=0) 57 | return cropped_img, top_left, bottom_right 58 | # cropped_img.save(new_image_path) 59 | # return 2 * cell_width, 2 * cell_height 60 | 61 | def superimpose_images(image1_path, image2_path, opacity): 62 | # Open the images 63 | image1 = Image.open(image1_path) 64 | image2 = Image.open(image2_path) 65 | 66 | # Ensure both images have the same size 67 | if image1.size != image2.size: 68 | raise ValueError("Images must have the same dimensions.") 69 | 70 | # Convert the images to RGBA mode if they are not already 71 | image1 = image1.convert("RGBA") 72 | image2 = image2.convert("RGBA") 73 | 74 | # Create a new image with the same size as the input images 75 | merged_image = Image.new("RGBA", image1.size) 
76 | 77 | # Convert image1 to grayscale 78 | image1 = image1.convert("L") 79 | 80 | # Paste image1 onto the merged image 81 | merged_image.paste(image1, (0, 0)) 82 | 83 | # Create a new image for image2 with adjusted opacity 84 | image2_with_opacity = Image.blend(Image.new("RGBA", image2.size, (0, 0, 0, 0)), image2, opacity) 85 | 86 | # Paste image2 with opacity onto the merged image 87 | merged_image = Image.alpha_composite(merged_image, image2_with_opacity) 88 | 89 | return merged_image 90 | 91 | 92 | 93 | # Example usage 94 | if __name__ == "__main__": 95 | create_grid_image(2880, 1712, 'yellow', 'green', 6, 'test.png') 96 | 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 | 
4 | # Robbie G2
5 | 
6 | Robbie G2 Logo
7 | 
8 | 
9 | Gen 2 AI Agent that uses OCR, Canny Composite, and Grid to navigate GUIs
10 | 
11 | 
12 | Explore the docs »
13 | 
14 | 
15 | View Demo
16 | ·
17 | Report Bug
18 | ·
19 | Request Feature
20 | 
21 | 
22 | 
23 | Meet Robbie, our Gen 2 agent.
24 | 
25 | Robbie navigates GUIs to solve tasks for you.
26 | 
27 | Unlike other bots, he doesn't just work on the web, because he doesn't use Playwright. Robbie is a pure multimodal bot: he can navigate the web or a desktop.
28 | 
29 | That means he can navigate SaaS apps, or he can work on a remote desktop to send emails, search for flights, check Slack, do research, and more.
30 | 
31 | Robbie-g2, aka Gen 2, is a leap from our first-gen agents, SurfPizza and SurfSlicer. He is very capable at navigating complex, never-before-seen GUIs via a remote virtual desktop, which the AgentSea stack serves up as a device to him via DeviceBay. He connects to it via ToolFuse and AgentDesk, which let him know what he can do with it, like move the mouse, send key commands, etc.
32 | 
33 | ‣ Check out our community on [Discord](https://discord.gg/hhaq7XYPS6), where we develop in the open, share research, and connect with other developers who are building cutting-edge agents or who just want to use them to get things done!
34 | 
35 | ‣ Check out the deep-dive video right here:
36 | 
37 | 
38 | 
39 | Watch the video
40 | 
41 | 
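
If you want to poke at the three perception primitives on their own, outside the full agent loop, the snippet below is a rough illustrative sketch that calls the helpers defined in `robbieg2/` directly on a saved screenshot. It is not part of the repo; `screenshot.png` is a placeholder path, and it assumes you run it from the repo root so the grid helper can find `font/arialbd.ttf`.

```python
# Illustrative only — not part of the agent loop. "screenshot.png" is a
# placeholder for any saved screenshot of a GUI; run from the repo root.
from PIL import Image

from robbieg2.easyocr import find_all_text_with_bounding_boxes
from robbieg2.canny_composite import create_composite
from robbieg2.grid import create_grid_image, superimpose_images

screenshot = "screenshot.png"

# 1. OCR pass: every piece of text on the screen with its bounding box and confidence.
for box in find_all_text_with_bounding_boxes(screenshot):
    print(box["text"], box["x"], box["y"], box["w"], box["h"], box["confidence"])

# 2. Canny composite: edge-detect the screen, cluster the detected elements,
#    and build a numbered strip of candidate regions for the model to pick from.
composite, regions = create_composite(screenshot, num_clusters=10)
composite.save("composite.png")

# 3. Grid: draw numbered circles on the intersections of an 8x8 grid and
#    superimpose them on the (grayscaled) screenshot.
width, height = Image.open(screenshot).size
create_grid_image(width, height, "red", "yellow", 8, "grid.png")
superimpose_images(screenshot, "grid.png", 1).save("merged.png")
```

In the agent itself these passes are orchestrated by `robbieg2/clicker.py`, which combines the OCR, composite, and grid results with progressive zooming to land on exact click coordinates.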
42 | 43 | 44 | ## Quick Start 45 | 46 | ### Prerequisites 47 | 48 | * [Install Docker](https://docs.docker.com/engine/install/) - you need it to run a Tracker 49 | * [Install QEMU](https://docs.hub.agentsea.ai/configuration/qemu) OR [Configure GCP](https://docs.hub.agentsea.ai/configuration/gcp) OR [Configure AWS](https://docs.hub.agentsea.ai/configuration/aws) - you need one of these to host a Device 50 | 51 | ### Setup 52 | 53 | 1. Setup your OpenAI API key: 54 | 55 | ```sh 56 | export OPENAI_API_KEY= 57 | ``` 58 | 59 | 2. Install/upgrade SurfKit: 60 | 61 | ```sh 62 | pip install -U surfkit 63 | ``` 64 | 65 | 3. Clone the repository and go to the root folder: 66 | 67 | ```sh 68 | git clone git@github.com:agentsea/robbie-g2.git && cd robbie-g2 69 | ``` 70 | 71 | 4. Install dependencies: 72 | 73 | ```sh 74 | poetry install 75 | ``` 76 | 77 | ### Creating required entities 78 | 79 | 5. Create a tracker: 80 | 81 | ```sh 82 | surfkit create tracker --name tracker01 83 | ``` 84 | 85 | 6. Create a device: 86 | 87 | - If you are using QEMU: 88 | 89 | ```sh 90 | surfkit create device --provider qemu --name device01 91 | ``` 92 | 93 | - If you are using GCE: 94 | 95 | ```sh 96 | surfkit create device --provider gce --name device01 97 | ``` 98 | 99 | - If you are using AWS: 100 | 101 | ```sh 102 | surfkit create device --provider aws --name device01 103 | ``` 104 | 105 | 7. Create an agent: 106 | 107 | ```sh 108 | surfkit create agent --name agent01 109 | ``` 110 | 111 | ### Solving a task 112 | 113 | ```sh 114 | surfkit solve "Search for common varieties of french ducks" \ 115 | --tracker tracker01 \ 116 | --device device01 \ 117 | --agent agent01 118 | ``` 119 | 120 | ## Documentation 121 | 122 | See our [docs](https://docs.hub.agentsea.ai) for more information on how to use Surfkit. 123 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .data/ 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
163 | #.idea/ 164 | 165 | .data/ 166 | cidata.iso 167 | .agentsea 168 | -------------------------------------------------------------------------------- /robbieg2/canny_composite.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from sklearn.cluster import AgglomerativeClustering 4 | from PIL import Image, ImageDraw, ImageFont 5 | 6 | 7 | def improved_canny(image): 8 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 9 | blurred = cv2.GaussianBlur(gray, (5, 5), 0) 10 | edges = cv2.Canny(blurred, 50, 150) 11 | dilated = cv2.dilate(edges, None, iterations=2) 12 | return dilated 13 | 14 | def group_elements(binary_image): 15 | contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 16 | 17 | grouped_contours = [] 18 | for contour in contours: 19 | x, y, w, h = cv2.boundingRect(contour) 20 | if w * h < 100: # Ignore very small contours 21 | continue 22 | merged = False 23 | for group in grouped_contours: 24 | if any(cv2.boundingRect(c)[0] - 10 <= x <= cv2.boundingRect(c)[0] + cv2.boundingRect(c)[2] + 10 and 25 | cv2.boundingRect(c)[1] - 10 <= y <= cv2.boundingRect(c)[1] + cv2.boundingRect(c)[3] + 10 for c in group): 26 | group.append(contour) 27 | merged = True 28 | break 29 | if not merged: 30 | grouped_contours.append([contour]) 31 | 32 | return grouped_contours 33 | 34 | def extract_bounding_boxes(grouped_contours): 35 | bounding_boxes = [] 36 | for group in grouped_contours: 37 | x, y, w, h = cv2.boundingRect(np.concatenate(group)) 38 | bounding_boxes.append((x, y, w, h)) 39 | return bounding_boxes 40 | 41 | def cluster_bounding_boxes(bounding_boxes, num_clusters): 42 | if len(bounding_boxes) <= num_clusters: 43 | return bounding_boxes 44 | 45 | # Calculate the centroids of the bounding boxes 46 | centroids = np.array([ 47 | [box[0] + box[2] / 2, box[1] + box[3] / 2] for box in bounding_boxes 48 | ]) 49 | 50 | # Apply Hierarchical clustering (Agglomerative Clustering) 51 | agg_clustering = AgglomerativeClustering(n_clusters=num_clusters) 52 | labels = agg_clustering.fit_predict(centroids) 53 | 54 | # Calculate bounding boxes for each cluster 55 | cluster_bounding_boxes = [] 56 | for i in range(num_clusters): 57 | cluster_boxes = [box for box, label in zip(bounding_boxes, labels) if label == i] 58 | if cluster_boxes: 59 | # Find the minimum and maximum coordinates for all boxes in the cluster 60 | min_x = min(box[0] for box in cluster_boxes) 61 | min_y = min(box[1] for box in cluster_boxes) 62 | max_x = max(box[0] + box[2] for box in cluster_boxes) 63 | max_y = max(box[1] + box[3] for box in cluster_boxes) 64 | 65 | # Calculate the dimensions of the bounding box that encompasses all boxes 66 | width = max_x - min_x 67 | height = max_y - min_y 68 | 69 | # Create a new bounding box that encompasses all boxes in the cluster 70 | cluster_box = (min_x, min_y, width, height) 71 | cluster_bounding_boxes.append(cluster_box) 72 | 73 | return cluster_bounding_boxes 74 | 75 | def create_composite_image(bounding_boxes, image): 76 | number_column_width = 100 77 | image_column_width = max(box[2] for box in bounding_boxes) 78 | row_heights = [box[3] + 4 for box in bounding_boxes] 79 | total_width = number_column_width + image_column_width + 1 # +1 for rightmost line 80 | total_height = sum(row_heights) + len(bounding_boxes) + 1 # +1 for each row separator and bottom line 81 | 82 | composite = Image.new('RGB', (total_width, total_height), color='white') 83 | draw = ImageDraw.Draw(composite) 84 | 85 | 
try: 86 | font = ImageFont.truetype("./font/arialbd.ttf", 30) 87 | except IOError: 88 | font = ImageFont.load_default() 89 | print("Arial font not found in ./font directory. Using default font.") 90 | 91 | # Draw grid lines 92 | for i in range(len(bounding_boxes) + 1): 93 | y = sum(row_heights[:i]) + i 94 | draw.line([(0, y), (total_width, y)], fill='black', width=1) 95 | draw.line([(number_column_width, 0), (number_column_width, total_height)], fill='black', width=1) 96 | draw.line([(total_width - 1, 0), (total_width - 1, total_height)], fill='black', width=1) 97 | 98 | y_offset = 1 # Start after the top line 99 | for i, box in enumerate(bounding_boxes): 100 | # Draw number 101 | number_text = str(i+1) 102 | text_bbox = draw.textbbox((0, 0), number_text, font=font) 103 | text_width = text_bbox[2] - text_bbox[0] 104 | text_height = text_bbox[3] - text_bbox[1] 105 | text_x = (number_column_width - text_width) // 2 106 | text_y = y_offset + (row_heights[i] - text_height) // 2 + 2 107 | draw.text((text_x, text_y), number_text, font=font, fill='red') 108 | 109 | # Paste image slice 110 | box_pil = image.crop((box[0], box[1], box[0] + box[2], box[1] + box[3])) 111 | paste_x = number_column_width + 1 112 | composite.paste(box_pil, (paste_x, y_offset + 2)) 113 | # Draw a rectangle around the pasted image 114 | draw.rectangle( 115 | [ 116 | (paste_x, y_offset + 2), 117 | (paste_x + box[2] - 1, y_offset + 2 + box[3] - 1) 118 | ], 119 | outline="green", 120 | width=2 121 | ) 122 | 123 | y_offset += row_heights[i] + 1 # Move to next row, accounting for grid line 124 | 125 | return composite 126 | 127 | def create_composite(image_path, num_clusters): 128 | image = cv2.imread(image_path) 129 | pil_image = Image.open(image_path) 130 | 131 | edges = improved_canny(image) 132 | grouped_contours = group_elements(edges) 133 | bounding_boxes = extract_bounding_boxes(grouped_contours) 134 | clustered_boxes = cluster_bounding_boxes(bounding_boxes, num_clusters) 135 | composite_image = create_composite_image(clustered_boxes, pil_image) 136 | return composite_image, clustered_boxes 137 | -------------------------------------------------------------------------------- /robbieg2/tool.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import requests 5 | 6 | from agentdesk.device import Desktop 7 | from mllm import Router 8 | from rich.console import Console 9 | from taskara import Task 10 | from toolfuse import Tool, action 11 | 12 | from .clicker import find_coordinates 13 | 14 | router = Router.from_env() 15 | console = Console() 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(int(os.getenv("LOG_LEVEL", logging.DEBUG))) 19 | 20 | 21 | class SemanticDesktop(Tool): 22 | """A semantic desktop replaces click actions with semantic description rather than coordinates""" 23 | 24 | def __init__( 25 | self, task: Task, desktop: Desktop, data_path: str = "./.data" 26 | ) -> None: 27 | """ 28 | Initialize and open a URL in the application. 29 | 30 | Args: 31 | task: Agent task. Defaults to None. 32 | desktop: Desktop instance to wrap. 33 | data_path (str, optional): Path to data. Defaults to "./.data". 
34 | """ 35 | super().__init__(wraps=desktop) 36 | self.desktop = desktop 37 | 38 | self.data_path = data_path 39 | self.img_path = os.path.join(self.data_path, "images", task.id) 40 | os.makedirs(self.img_path, exist_ok=True) 41 | 42 | self.task = task 43 | self.last_click_failed = False 44 | 45 | self.results = { 46 | "first_ocr": 0, 47 | "second_ocr": 0, 48 | "full_grid": 0, 49 | "failure": 0, 50 | } 51 | 52 | @action 53 | def clean_text(self) -> str: 54 | """Clean the text input or area currently in focus. 55 | Use when you see wrong data in the text input or area which is currently in focus 56 | and need to clean it before entering new text. 57 | """ 58 | self.desktop.hot_key(["ctrl", "a"]) 59 | self.desktop.hot_key(["del"]) 60 | 61 | @action 62 | def click_object(self, description: str, text: str, type: str, button: str = "left") -> None: 63 | """Click on an object on the screen 64 | 65 | Args: 66 | description (str): The description of the object including its general location, for example 67 | "a round dark blue icon with the text 'Home' in the top-right of the image", please be a generic as possible 68 | text (str): The text written on the object to click on. For example, 69 | for "a round dark blue icon with the text 'Home' in the top-right of the image", 70 | the text is 'Home'. For input with its name written right inside it, write here the name of the input, 71 | for example, 'Where to'. For the calendar date, put here only a day, for example, '15'. 72 | If the object doesn't have any text on or in it, return emply string. 73 | type (str): Type of click, can be 'single' for a single click or 74 | 'double' for a double click. If you need to launch an application from the desktop choose 'double' 75 | button (str, optional): Mouse button to click. Options are 'left' or 'right'. Defaults to 'left'. 76 | """ 77 | if type not in ["single", "double"]: 78 | raise ValueError("type must be 'single' or 'double'") 79 | 80 | self.task.post_message( 81 | role="Clicker", 82 | msg=f"Current statistics: {self.results}", 83 | thread="debug", 84 | ) 85 | 86 | coords = find_coordinates(self, description, text) 87 | 88 | if coords: 89 | click_x, click_y = coords["x"], coords["y"] 90 | message = f"Attempting to click coordinates {click_x}, {click_y}." 91 | self.task.post_message( 92 | role="Clicker", 93 | msg=message, 94 | thread="debug", 95 | ) 96 | self.last_debug_message = message 97 | self._click_coords(x=click_x, y=click_y, type=type, button=button) 98 | else: 99 | # Note: Given that GRID almost always returns something, we should almost never be here 100 | self.results["failure"] += 1 101 | self.task.post_message( 102 | role="Clicker", 103 | msg=f"No coordinates found for {description}.", 104 | thread="debug", 105 | ) 106 | self.last_click_failed = True 107 | 108 | def _click_coords( 109 | self, x: int, y: int, type: str = "single", button: str = "left" 110 | ) -> None: 111 | """Click mouse button 112 | 113 | Args: 114 | x (Optional[int], optional): X coordinate to move to, if not provided 115 | it will click on current location. Defaults to None. 116 | y (Optional[int], optional): Y coordinate to move to, if not provided 117 | it will click on current location. Defaults to None. 118 | type (str, optional): Type of click, can be single or double. Defaults to "single". 119 | button (str, optional): Button to click. Defaults to "left". 
120 | """ 121 | # TODO: fix click cords in agentd 122 | logging.debug("moving mouse") 123 | body = {"x": int(x), "y": int(y)} 124 | resp = requests.post(f"{self.desktop.base_url}/v1/move_mouse", json=body) 125 | resp.raise_for_status() 126 | time.sleep(2) 127 | 128 | if type == "single": 129 | logging.debug("clicking") 130 | resp = requests.post( 131 | f"{self.desktop.base_url}/v1/click", json={"button": button} 132 | ) 133 | resp.raise_for_status() 134 | time.sleep(2) 135 | elif type == "double": 136 | logging.debug("double clicking") 137 | resp = requests.post( 138 | f"{self.desktop.base_url}/v1/double_click", json={"button": button} 139 | ) 140 | resp.raise_for_status() 141 | time.sleep(2) 142 | else: 143 | raise ValueError(f"unkown click type {type}") 144 | return 145 | -------------------------------------------------------------------------------- /robbieg2/img.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | from PIL import Image, ImageDraw, ImageFont 3 | import base64 4 | from io import BytesIO 5 | 6 | 7 | class Box: 8 | def __init__(self, left: int, top: int, right: int, bottom: int): 9 | self.left = left 10 | self.top = top 11 | self.right = right 12 | self.bottom = bottom 13 | 14 | def width(self) -> int: 15 | return self.right - self.left 16 | 17 | def height(self) -> int: 18 | return self.bottom - self.top 19 | 20 | def zoom_in(self, cell_index: int, num_cells: int) -> "Box": 21 | cell_width = self.width() // num_cells 22 | cell_height = self.height() // num_cells 23 | col = (cell_index - 1) % num_cells 24 | row = (cell_index - 1) // num_cells 25 | return Box( 26 | self.left + col * cell_width, 27 | self.top + row * cell_height, 28 | self.left + (col + 1) * cell_width, 29 | self.top + (row + 1) * cell_height, 30 | ) 31 | 32 | def center(self) -> Tuple[int, int]: 33 | return ((self.left + self.right) // 2, (self.top + self.bottom) // 2) 34 | 35 | def crop_image(self, img: Image.Image) -> Image.Image: 36 | return img.crop((self.left, self.top, self.right, self.bottom)) 37 | 38 | def draw( 39 | self, 40 | draw_context, 41 | outline: str = "red", 42 | width: int = 3, 43 | ) -> None: 44 | draw_context.rectangle( 45 | [self.left, self.top, self.right, self.bottom], outline=outline, width=width 46 | ) 47 | 48 | def to_absolute(self, parent_box: "Box") -> "Box": 49 | return Box( 50 | self.left + parent_box.left, 51 | self.top + parent_box.top, 52 | self.right + parent_box.left, 53 | self.bottom + parent_box.top, 54 | ) 55 | 56 | def to_absolute_with_upscale(self, parent_box: "Box", upscale: int = 1) -> "Box": 57 | return Box( 58 | round(self.left / upscale + parent_box.left), 59 | round(self.top / upscale + parent_box.top), 60 | round(self.right / upscale + parent_box.left), 61 | round(self.bottom / upscale + parent_box.top), 62 | ) 63 | 64 | 65 | def create_grid_image_by_num_cells( 66 | image_width: int, 67 | image_height: int, 68 | color_circle: str = "red", 69 | color_text: str = "yellow", 70 | num_cells: int = 6, 71 | ) -> Image.Image: 72 | """Create the pizza grid image. 73 | 74 | Args: 75 | image_width (int): Width of the image. 76 | image_height (int): Height of the image. 77 | color_circle (str): Color of the circles. Defaults to 'red' 78 | color_text (str): Color of the text. Defaults to 'yellow' 79 | num_cells (int): The number of cells in each dimension. Defaults to 6. 
80 | 81 | Returns: 82 | Image.Image: The image grid 83 | """ 84 | cell_width = image_width // num_cells 85 | cell_height = image_height // num_cells 86 | font_size = max(cell_height // 5, 30) 87 | circle_radius = font_size * 7 // 10 88 | 89 | # Create a blank image with transparent background 90 | img = Image.new("RGBA", (image_width, image_height), (0, 0, 0, 0)) 91 | draw = ImageDraw.Draw(img) 92 | 93 | # Load a font 94 | font = ImageFont.truetype("fonts/arialbd.ttf", font_size) 95 | 96 | # Set the number of cells in each dimension 97 | num_cells_x = num_cells - 1 98 | num_cells_y = num_cells - 1 99 | 100 | # Draw the numbers in the center of each cell 101 | for i in range(num_cells_x): 102 | for j in range(num_cells_y): 103 | number = i * num_cells_y + j + 1 104 | text = str(number) 105 | x = (i + 1) * cell_width 106 | y = (j + 1) * cell_height 107 | draw.ellipse( 108 | [ 109 | x - circle_radius, 110 | y - circle_radius, 111 | x + circle_radius, 112 | y + circle_radius, 113 | ], 114 | fill=color_circle, 115 | ) 116 | offset_x = font_size / 4 if number < 10 else font_size / 2 117 | draw.text( 118 | (x - offset_x, y - font_size / 2), text, font=font, fill=color_text 119 | ) 120 | 121 | return img 122 | 123 | 124 | def create_grid_image_by_size( 125 | image_width: int, 126 | image_height: int, 127 | cell_size: int = 10, 128 | color_circle: str = "red", 129 | color_text: str = "yellow", 130 | ) -> Image.Image: 131 | """Create a grid image with numbered cells. 132 | 133 | Args: 134 | image_width (int): Total width of the image. 135 | image_height (int): Total height of the image. 136 | cell_size (int): Width and height of each cell. 137 | color_circle (str): Color of the circles. Defaults to 'red' 138 | color_text (str): Color of the text. Defaults to 'yellow' 139 | 140 | Returns: 141 | Image.Image: The image with a grid. 142 | """ 143 | num_cells_x = image_width // cell_size 144 | num_cells_y = image_height // cell_size 145 | font_size = max(cell_size // 5, 10) 146 | circle_radius = ( 147 | cell_size // 2 - 2 148 | ) # Slightly smaller than half the cell for visual appeal 149 | 150 | # Create a blank image 151 | img = Image.new("RGBA", (image_width, image_height), (0, 0, 0, 0)) 152 | draw = ImageDraw.Draw(img) 153 | 154 | # Load a font 155 | try: 156 | font = ImageFont.truetype("arialbd.ttf", font_size) 157 | except IOError: 158 | font = ImageFont.load_default() 159 | print("Custom font not found. Using default font.") 160 | 161 | # Draw the grid 162 | for i in range(num_cells_x): 163 | for j in range(num_cells_y): 164 | number = i * num_cells_y + j + 1 165 | text = str(number) 166 | x_center = (i + 0.5) * cell_size 167 | y_center = (j + 0.5) * cell_size 168 | 169 | # Draw circle 170 | draw.ellipse( 171 | [ 172 | x_center - circle_radius, 173 | y_center - circle_radius, 174 | x_center + circle_radius, 175 | y_center + circle_radius, 176 | ], 177 | fill=color_circle, 178 | ) 179 | 180 | # Calculate text offset for centering using getbbox() 181 | bbox = font.getbbox(text) 182 | text_width = bbox[2] - bbox[0] 183 | text_height = bbox[3] - bbox[1] 184 | draw.text( 185 | (x_center - text_width / 2, y_center - text_height / 2), 186 | text, 187 | font=font, 188 | fill=color_text, 189 | ) 190 | 191 | return img 192 | 193 | 194 | def zoom_in( 195 | img: Image.Image, box: Box, num_cells: int, selected: int 196 | ) -> Tuple[Image.Image, Box]: 197 | """Zoom in on the selected cell. 
198 | 199 | Args: 200 | img (Image.Image): The image to zoom in 201 | box (Box): The box to zoom into 202 | num_cells (int): Number of cells to use. 203 | selected (int): The selected cell 204 | 205 | Returns: 206 | Tuple[Image.Image, Box]: Cropped image and asociated box 207 | """ 208 | new_box = box.zoom_in(selected, num_cells) 209 | absolute_box = new_box.to_absolute(box) 210 | cropped_img = new_box.crop_image(img) 211 | return cropped_img, absolute_box 212 | 213 | 214 | def superimpose_images( 215 | base: Image.Image, layer: Image.Image, opacity: float = 1 216 | ) -> Image.Image: 217 | """ 218 | 219 | Args: 220 | base (Image.Image): Base image 221 | layer (Image.Image): Layered image 222 | opacity (float): How much opacity the layer should have. Defaults to 1. 223 | 224 | Returns: 225 | Image.Image: The superimposed image 226 | """ 227 | # Ensure both images have the same size 228 | if base.size != layer.size: 229 | raise ValueError("Images must have the same dimensions.") 230 | 231 | # Convert the images to RGBA mode if they are not already 232 | base = base.convert("RGBA") 233 | layer = layer.convert("RGBA") 234 | 235 | # Create a new image with the same size as the input images 236 | merged_image = Image.new("RGBA", base.size) 237 | 238 | # Convert image1 to grayscale 239 | base = base.convert("L") 240 | 241 | # Paste image1 onto the merged image 242 | merged_image.paste(base, (0, 0)) 243 | 244 | # Create a new image for image2 with adjusted opacity 245 | image2_with_opacity = Image.blend( 246 | Image.new("RGBA", layer.size, (0, 0, 0, 0)), layer, opacity 247 | ) 248 | 249 | # Paste image2 with opacity onto the merged image 250 | merged_image = Image.alpha_composite(merged_image, image2_with_opacity) 251 | 252 | return merged_image 253 | 254 | 255 | def image_to_b64(img: Image.Image, image_format="PNG") -> str: 256 | """Converts a PIL Image to a base64-encoded string with MIME type included. 257 | 258 | Args: 259 | img (Image.Image): The PIL Image object to convert. 260 | image_format (str): The format to use when saving the image (e.g., 'PNG', 'JPEG'). 261 | 262 | Returns: 263 | str: A base64-encoded string of the image with MIME type. 264 | """ 265 | buffer = BytesIO() 266 | img.save(buffer, format=image_format) 267 | image_data = buffer.getvalue() 268 | buffer.close() 269 | 270 | mime_type = f"image/{image_format.lower()}" 271 | base64_encoded_data = base64.b64encode(image_data).decode("utf-8") 272 | return f"data:{mime_type};base64,{base64_encoded_data}" 273 | 274 | 275 | def b64_to_image(base64_str: str) -> Image.Image: 276 | """Converts a base64 string to a PIL Image object. 277 | 278 | Args: 279 | base64_str (str): The base64 string, potentially with MIME type as part of a data URI. 280 | 281 | Returns: 282 | Image.Image: The converted PIL Image object. 
283 | """ 284 | # Strip the MIME type prefix if present 285 | if "," in base64_str: 286 | base64_str = base64_str.split(",")[1] 287 | 288 | image_data = base64.b64decode(base64_str) 289 | image = Image.open(BytesIO(image_data)) 290 | return image 291 | 292 | 293 | def load_image_base64(filepath: str) -> str: 294 | # Load the image from the file path 295 | image = Image.open(filepath) 296 | buffered = BytesIO() 297 | 298 | # Save image to the buffer 299 | image_format = image.format if image.format else "PNG" 300 | image.save(buffered, format=image_format) 301 | 302 | # Encode the image to base64 303 | img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") 304 | 305 | # Prepare the mime type 306 | mime_type = f"image/{image_format.lower()}" 307 | 308 | # Return base64 string with mime type 309 | return f"data:{mime_type};base64,{img_str}" 310 | -------------------------------------------------------------------------------- /robbieg2/clicker.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | import os 4 | import re 5 | 6 | from typing import List, Optional, Tuple 7 | 8 | from PIL import Image, ImageDraw 9 | from difflib import SequenceMatcher 10 | from mllm import RoleMessage, RoleThread, Router 11 | from pydantic import BaseModel, Field 12 | from rich.console import Console 13 | from rich.json import JSON 14 | 15 | from .img import Box, image_to_b64 16 | from .grid import create_grid_image, zoom_in, superimpose_images 17 | from .easyocr import find_all_text_with_bounding_boxes 18 | from .canny_composite import create_composite 19 | 20 | router = Router.from_env() 21 | console = Console() 22 | 23 | logger = logging.getLogger(__name__) 24 | logger.setLevel(int(os.getenv("LOG_LEVEL", logging.DEBUG))) 25 | 26 | COLOR_NUMBER = os.getenv("COLOR_NUMBER", "yellow") 27 | COLOR_CIRCLE = os.getenv("COLOR_CIRCLE", "red") 28 | GRID_SIZE = int(os.getenv("GRID_SIZE", 8)) # Amound of grid points equals (N-1)^2 29 | NUM_CLUSTERS = int( 30 | os.getenv("NUM_CLUSTERS", 10) 31 | ) # Number of clusters to use for the composite method 32 | UPSCALE_FACTOR = int( 33 | os.getenv("UPSCALE_FACTOR", 3) 34 | ) # How much we upscale the image on each step of zooming in 35 | FIRST_OCR_THRESHOLD = float( 36 | os.getenv("FIRST_OCR_THRESHOLD", 0.9) 37 | ) # Threshold for the first OCR pass 38 | SECOND_OCR_THRESHOLD = float( 39 | os.getenv("SECOND_OCR_THRESHOLD", 0.7) 40 | ) # Threshold for the second OCR pass 41 | 42 | 43 | class ZoomSelection(BaseModel): 44 | """Zoom selection model""" 45 | 46 | number: int = Field(..., description="The number of the selected circle") 47 | 48 | 49 | class CompositeSelection(BaseModel): 50 | """Composite selection model""" 51 | 52 | number: int = Field( 53 | ..., description="The number of the selected section of the composite" 54 | ) 55 | 56 | 57 | def recall_best_method_on_first_iteration(description: str) -> str: 58 | memory = { 59 | "calendar": run_grid, 60 | "date": run_grid, 61 | "link": run_composite, 62 | "icon": run_composite, 63 | } 64 | for key, value in memory.items(): 65 | if key in description.lower(): 66 | return value 67 | return run_composite 68 | 69 | 70 | def recall_best_method_on_second_iteration(description: str) -> str: 71 | memory = { 72 | "calendar": run_grid, 73 | "date": run_grid, 74 | "link": run_grid, 75 | "icon": run_composite, 76 | } 77 | for key, value in memory.items(): 78 | if key in description.lower(): 79 | return value 80 | return run_grid 81 | 82 | 83 | def 
find_coordinates(semdesk, description: str, text: str) -> dict: 84 | click_hash = hashlib.md5(description.encode()).hexdigest()[:5] 85 | bounding_boxes = [] 86 | search_text = text 87 | if not search_text: 88 | matches = re.findall(r"['\"](.*?)['\"]", description) 89 | if len(matches) >= 2: 90 | search_text = None 91 | elif len(matches) == 1: 92 | search_text = matches[0] 93 | else: 94 | search_text = None 95 | 96 | # - Setting up the stage 97 | 98 | starting_img = semdesk.desktop.take_screenshots()[0] 99 | starting_img_path = os.path.join(semdesk.img_path, f"{click_hash}_starting.png") 100 | starting_img.save(starting_img_path) 101 | bounding_boxes.append(Box(0, 0, starting_img.width, starting_img.height)) 102 | 103 | method = recall_best_method_on_first_iteration(description) 104 | 105 | # - OCR on the starting image 106 | 107 | if search_text and len(search_text) > 3: 108 | semdesk.task.post_message( 109 | role="Clicker", 110 | msg=f"Attempting OCR for: {search_text}", 111 | thread="debug", 112 | ) 113 | ocr_results = find_all_text_with_bounding_boxes(starting_img_path) 114 | 115 | best_matches = [ 116 | box 117 | for box in ocr_results 118 | if similarity_ratio(box["text"], search_text) >= FIRST_OCR_THRESHOLD 119 | ] 120 | if len(best_matches) == 1: 121 | best_match = best_matches[0] 122 | x_mid = best_match["x"] + best_match["w"] // 2 123 | y_mid = best_match["y"] + best_match["h"] // 2 124 | bounding_boxes.append( 125 | Box( 126 | best_match["x"], 127 | best_match["y"], 128 | best_match["x"] + best_match["w"], 129 | best_match["y"] + best_match["h"], 130 | ) 131 | ) 132 | semdesk.task.post_message( 133 | role="Clicker", 134 | msg=f"Found best matching text: '{best_match['text']}'", 135 | thread="debug", 136 | ) 137 | # FIRST POINT OF POTENTIAL RESULT RETURN 138 | semdesk.results["first_ocr"] += 1 139 | debug_img = _debug_image( 140 | starting_img.copy(), bounding_boxes, (x_mid, y_mid) 141 | ) 142 | semdesk.task.post_message( 143 | role="Clicker", 144 | msg="Final debug img", 145 | thread="debug", 146 | images=[image_to_b64(debug_img)], 147 | ) 148 | return {"x": x_mid, "y": y_mid} 149 | else: 150 | semdesk.task.post_message( 151 | role="Clicker", 152 | msg=f"Found {len(best_matches)} best matches for text '{search_text}'. 
Continue...", 153 | thread="debug", 154 | ) 155 | else: 156 | semdesk.task.post_message( 157 | role="Clicker", 158 | msg="No text to look for in the starting image.", 159 | thread="debug", 160 | ) 161 | 162 | # - Finding a region where the element of interest is located 163 | 164 | semdesk.task.post_message( 165 | role="Clicker", 166 | msg="Looking for a region of interest...", 167 | thread="debug", 168 | ) 169 | region_of_interest, bounding_box = method( 170 | semdesk, starting_img, starting_img_path, description, click_hash, "region" 171 | ) 172 | 173 | # Escape exit, if we didn't find the region of interest because the element is not on a screen: 174 | # we fall back to bruteforce method 175 | if region_of_interest is None: 176 | return backup_find_coordinates(semdesk, description) 177 | 178 | region_of_interest_b64 = image_to_b64(region_of_interest) 179 | semdesk.task.post_message( 180 | role="Clicker", 181 | msg="Found region of interest", 182 | thread="debug", 183 | images=[region_of_interest_b64], 184 | ) 185 | bounding_boxes.append(bounding_box) 186 | region_of_interest_path = os.path.join( 187 | semdesk.img_path, f"{click_hash}_region_of_interest.png" 188 | ) 189 | region_of_interest.save(region_of_interest_path) 190 | 191 | # - OCR on the region we found 192 | if search_text: 193 | semdesk.task.post_message( 194 | role="Clicker", 195 | msg=f"Attempting OCR for: {search_text} on a region of interest", 196 | thread="debug", 197 | ) 198 | zoomed_region_of_interest = region_of_interest.copy() 199 | zoomed_region_of_interest = zoomed_region_of_interest.resize( 200 | ( 201 | zoomed_region_of_interest.width * UPSCALE_FACTOR, 202 | zoomed_region_of_interest.height * UPSCALE_FACTOR, 203 | ), 204 | resample=0, 205 | ) 206 | zoomed_region_of_interest_path = os.path.join( 207 | semdesk.img_path, f"{click_hash}_zoomed_region_of_interest.png" 208 | ) 209 | zoomed_region_of_interest.save(zoomed_region_of_interest_path) 210 | ocr_results = find_all_text_with_bounding_boxes(zoomed_region_of_interest_path) 211 | best_matches = [ 212 | box 213 | for box in ocr_results 214 | if similarity_ratio(box["text"], search_text) >= SECOND_OCR_THRESHOLD 215 | ] 216 | 217 | # We trust OCR only of exactly one match over the threshold is found. Otherwise, we fall back to Grid/Composite. 218 | if len(best_matches) != 1: 219 | semdesk.task.post_message( 220 | role="Clicker", 221 | msg=f"No sufficiently similar text found. 
Found {len(best_matches)} ({best_matches})'", 222 | thread="debug", 223 | ) 224 | else: 225 | best_match = best_matches[0] 226 | relative_box = Box( 227 | best_match["x"], 228 | best_match["y"], 229 | best_match["x"] + best_match["w"], 230 | best_match["y"] + best_match["h"], 231 | ) 232 | absolute_box = relative_box.to_absolute_with_upscale( 233 | bounding_boxes[-1], UPSCALE_FACTOR 234 | ) 235 | x_mid, y_mid = absolute_box.center() 236 | bounding_boxes.append(absolute_box) 237 | semdesk.task.post_message( 238 | role="Clicker", 239 | msg=f"Found best matching text: '{best_match['text']}'", 240 | thread="debug", 241 | ) 242 | # SECOND POINT OF POTENTIAL RESULT RETURN 243 | semdesk.results["second_ocr"] += 1 244 | debug_img = _debug_image( 245 | starting_img.copy(), bounding_boxes, (x_mid, y_mid) 246 | ) 247 | semdesk.task.post_message( 248 | role="Clicker", 249 | msg="Final debug img", 250 | thread="debug", 251 | images=[image_to_b64(debug_img)], 252 | ) 253 | return {"x": x_mid, "y": y_mid} 254 | 255 | # - Two passes of Grid/Composite + Zoom 256 | 257 | total_upscale = 1 258 | method = recall_best_method_on_second_iteration(description) 259 | 260 | region = region_of_interest.copy() 261 | region = region.resize( 262 | (region.width * UPSCALE_FACTOR, region.height * UPSCALE_FACTOR), resample=0 263 | ) 264 | region_of_interest_path = os.path.join( 265 | semdesk.img_path, f"{click_hash}_region_of_interest_zoom_1.png" 266 | ) 267 | region.save(region_of_interest_path) 268 | total_upscale *= UPSCALE_FACTOR 269 | new_region_of_interest, relative_bounding_box = method( 270 | semdesk, region, region_of_interest_path, description, click_hash, "zoom_1" 271 | ) 272 | 273 | # Escape exit, if we didn't find the region of interest because the element is not on a screen: 274 | # we fall back to bruteforce method 275 | if new_region_of_interest is None: 276 | return backup_find_coordinates(semdesk, description) 277 | 278 | absolute_box_zoomed = relative_bounding_box.to_absolute_with_upscale( 279 | bounding_boxes[-1], total_upscale 280 | ) 281 | bounding_boxes.append(absolute_box_zoomed) 282 | 283 | region = new_region_of_interest.copy() 284 | region = region.resize( 285 | (region.width * UPSCALE_FACTOR, region.height * UPSCALE_FACTOR), resample=0 286 | ) 287 | region_of_interest_path = os.path.join( 288 | semdesk.img_path, f"{click_hash}_region_of_interest_zoom_2.png" 289 | ) 290 | region.save(region_of_interest_path) 291 | total_upscale *= UPSCALE_FACTOR 292 | last_region_of_interest, relative_bounding_box = method( 293 | semdesk, region, region_of_interest_path, description, click_hash, "zoom_2" 294 | ) 295 | 296 | # Escape exit, if we didn't find the region of interest because the element is not on a screen: 297 | # we fall back to bruteforce method 298 | if last_region_of_interest is None: 299 | return backup_find_coordinates(semdesk, description) 300 | 301 | absolute_box_zoomed = relative_bounding_box.to_absolute_with_upscale( 302 | bounding_boxes[-1], total_upscale 303 | ) 304 | bounding_boxes.append(absolute_box_zoomed) 305 | 306 | x_mid, y_mid = bounding_boxes[-1].center() 307 | logger.info(f"clicking exact coords {x_mid}, {y_mid}") 308 | semdesk.task.post_message( 309 | role="Clicker", 310 | msg=f"Clicking coordinates {x_mid}, {y_mid}", 311 | thread="debug", 312 | ) 313 | 314 | # LAST POINT OF POTENTIAL RETURN - WE ALWAYS RETURN SOMETHING FROM HERE, UNLESS THERE WAS AN EXCEPTION 315 | semdesk.results["full_grid"] += 1 316 | debug_img = _debug_image(starting_img.copy(), bounding_boxes, (x_mid, 
y_mid)) 317 | semdesk.task.post_message( 318 | role="Clicker", 319 | msg="Final debug img", 320 | thread="debug", 321 | images=[image_to_b64(debug_img)], 322 | ) 323 | return {"x": x_mid, "y": y_mid} 324 | 325 | 326 | def backup_find_coordinates(semdesk, description: str) -> dict: 327 | # This is a backup method of finding coordinates to click on. If the core method above fails at some point, 328 | # with a region_of_interest being 0 (i.e. not found), we try once again through the most brute-force mechanics that we have: 329 | # running three levels of Grid + Zoom In; if that fails too, then we go back to Big Brain and ask some questions. 330 | semdesk.task.post_message( 331 | role="Clicker", 332 | msg="Coordinates not found. Falling back to brute-force 3-level Grid Zoom In.", 333 | thread="debug", 334 | ) 335 | 336 | click_hash = hashlib.md5(description.encode()).hexdigest()[:5] 337 | bounding_boxes = [] 338 | total_upscale = 1 339 | method = run_grid 340 | 341 | starting_img = semdesk.desktop.take_screenshots()[0] 342 | starting_img_path = os.path.join(semdesk.img_path, f"{click_hash}_starting.png") 343 | starting_img.save(starting_img_path) 344 | bounding_boxes.append(Box(0, 0, starting_img.width, starting_img.height)) 345 | 346 | region_of_interest = starting_img.copy() 347 | 348 | for i in [0, 1, 2]: 349 | semdesk.task.post_message( 350 | role="Clicker", 351 | msg=f"Zooming in, level {i}...", 352 | thread="debug", 353 | ) 354 | 355 | region = region_of_interest.copy() 356 | region = region.resize( 357 | (region.width * UPSCALE_FACTOR, region.height * UPSCALE_FACTOR), resample=0 358 | ) 359 | region_of_interest_path = os.path.join( 360 | semdesk.img_path, f"{click_hash}_grid_region_{i}.png" 361 | ) 362 | region.save(region_of_interest_path) 363 | total_upscale *= UPSCALE_FACTOR 364 | region_of_interest, relative_bounding_box = method( 365 | semdesk, 366 | region, 367 | region_of_interest_path, 368 | description, 369 | click_hash, 370 | f"zoom_{i}", 371 | ) 372 | 373 | # Escape exit, if we didn't find the region of interest because the element is not on the screen. 374 | if region_of_interest is None: 375 | semdesk.task.post_message( 376 | role="Clicker", 377 | msg=f"Failed to find {description} on the image. 
Getting back to Actor.", 378 | thread="debug", 379 | ) 380 | return None 381 | 382 | absolute_box_zoomed = relative_bounding_box.to_absolute_with_upscale( 383 | bounding_boxes[-1], total_upscale 384 | ) 385 | bounding_boxes.append(absolute_box_zoomed) 386 | 387 | x_mid, y_mid = bounding_boxes[-1].center() 388 | logger.info(f"clicking exact coords {x_mid}, {y_mid}") 389 | semdesk.task.post_message( 390 | role="Clicker", 391 | msg=f"Clicking coordinates {x_mid}, {y_mid}", 392 | thread="debug", 393 | ) 394 | 395 | # LAST POINT OF POTENTIAL RETURN - WE ALWAYS RETURN SOMETHING FROM HERE, UNLESS THERE WAS AN EXCEPTION 396 | semdesk.results["full_grid"] += 1 397 | debug_img = _debug_image(starting_img.copy(), bounding_boxes, (x_mid, y_mid)) 398 | semdesk.task.post_message( 399 | role="Clicker", 400 | msg="Final debug img", 401 | thread="debug", 402 | images=[image_to_b64(debug_img)], 403 | ) 404 | return {"x": x_mid, "y": y_mid} 405 | 406 | 407 | def similarity_ratio(a, b): 408 | return SequenceMatcher(None, a.lower(), b.lower()).ratio() 409 | 410 | 411 | def run_grid( 412 | semdesk, 413 | starting_image: Image.Image, 414 | starting_path: str, 415 | description: str, 416 | click_hash: str, 417 | postfix: str, 418 | ) -> dict: 419 | img_width, img_height = starting_image.size 420 | starting_image_b64 = image_to_b64(starting_image) 421 | 422 | grid_path = os.path.join(semdesk.img_path, f"{click_hash}_grid_{postfix}.png") 423 | create_grid_image( 424 | img_width, img_height, COLOR_CIRCLE, COLOR_NUMBER, GRID_SIZE, grid_path 425 | ) 426 | 427 | merged_image_path = os.path.join( 428 | semdesk.img_path, f"{click_hash}_merge_{postfix}.png" 429 | ) 430 | merged_image = superimpose_images(starting_path, grid_path, 1) 431 | merged_image.save(merged_image_path) 432 | 433 | merged_image_b64 = image_to_b64(merged_image) 434 | semdesk.task.post_message( 435 | role="Clicker", 436 | msg="Merged image", 437 | thread="debug", 438 | images=[merged_image_b64], 439 | ) 440 | 441 | thread = RoleThread() 442 | 443 | prompt = f""" 444 | You are an experienced AI trained to find the elements on the screen. 445 | You see a screenshot of the web application. 446 | I have drawn some big {COLOR_NUMBER} numbers on {COLOR_CIRCLE} circles on this image 447 | to help you to find required elements. 448 | Please tell me the closest big {COLOR_NUMBER} number on a {COLOR_CIRCLE} circle to the center of the {description}. 449 | 450 | It may be the case, there is no {description} anywhere on the screenshot that you see. 451 | If you are very sure that there is no {description} anywhere on the screenshot that you see, please return {{"number": 0}}. 452 | 453 | Please note that some circles may lay on the {description}. If that's the case, return the number in any of these circles. 454 | If the {description} is a long object, please pick the circle that is closest to the left top corner of the {description}. 455 | I have also attached the entire screenshot without these numbers for your reference. 
456 | 457 | Please return your response as raw JSON following the schema {ZoomSelection.model_json_schema()} 458 | Be concise and only return the raw JSON; for example, if the circle you wanted to select had the number 3 in it, 459 | you would return {{"number": 3}} 460 | """ 461 | 462 | msg = RoleMessage( 463 | role="user", 464 | text=prompt, 465 | images=[merged_image_b64, starting_image_b64], 466 | ) 467 | thread.add_msg(msg) 468 | 469 | try: 470 | response = router.chat( 471 | thread, 472 | namespace="grid", 473 | expect=ZoomSelection, 474 | agent_id="RobbieG2", 475 | retries=1, 476 | ) 477 | if not response.parsed: 478 | raise SystemError("No response parsed from zoom") 479 | 480 | semdesk.task.add_prompt(response.prompt) 481 | 482 | zoom_resp = response.parsed 483 | semdesk.task.post_message( 484 | role="Clicker", 485 | msg=f"Selection {zoom_resp.model_dump_json()}", 486 | thread="debug", 487 | ) 488 | console.print(JSON(zoom_resp.model_dump_json())) 489 | chosen_number = zoom_resp.number 490 | except Exception as e: 491 | logger.info(f"Error in analyzing grid: {e}.") 492 | return None, None 493 | if chosen_number == 0: 494 | return None, None 495 | 496 | region_of_interest, top_left, bottom_right = zoom_in( 497 | starting_path, GRID_SIZE, chosen_number, 1 498 | ) 499 | bounding_box = Box(top_left[0], top_left[1], bottom_right[0], bottom_right[1]) 500 | return region_of_interest, bounding_box 501 | 502 | 503 | def run_composite( 504 | semdesk, 505 | starting_image: Image.Image, 506 | starting_path: str, 507 | description: str, 508 | click_hash: str, 509 | postfix: str, 510 | ) -> dict: 511 | composite_path = os.path.join( 512 | semdesk.img_path, f"{click_hash}_composite_{postfix}.png" 513 | ) 514 | composite_pil, bounding_boxes = create_composite(starting_path, NUM_CLUSTERS) 515 | composite_pil.save(composite_path) 516 | composite_b64 = image_to_b64(composite_pil) 517 | 518 | starting_image_b64 = image_to_b64(starting_image) 519 | 520 | semdesk.task.post_message( 521 | role="Clicker", 522 | msg="Composite image", 523 | thread="debug", 524 | images=[composite_b64], 525 | ) 526 | 527 | thread = RoleThread() 528 | 529 | prompt = f""" 530 | You are an experienced AI trained to find the elements on the screen. 531 | You see a composite of several sections of the screenshot of the web application. 532 | You also see the entire screenshot for reference. 533 | 534 | I have drawn some big {COLOR_NUMBER} numbers on the left panel of the composite image. 535 | Please tell me the number of the section of the composite image that contains the {description}. 536 | 537 | It may be the case that there is no {description} anywhere on the screenshot that you see. 538 | If you are very sure that there is no {description} anywhere on the screenshot that you see, please return {{"number": 0}}. 
539 | 540 | Please return you response as raw JSON following the schema {CompositeSelection.model_json_schema()} 541 | Be concise and only return the raw json, for example if the section has a number 3, 542 | you should return {{"number": 3}} 543 | """ 544 | 545 | msg = RoleMessage( 546 | role="user", 547 | text=prompt, 548 | images=[composite_b64, starting_image_b64], 549 | ) 550 | thread.add_msg(msg) 551 | 552 | try: 553 | response = router.chat( 554 | thread, 555 | namespace="composite", 556 | expect=CompositeSelection, 557 | agent_id="RobbieG2", 558 | retries=1, 559 | ) 560 | if not response.parsed: 561 | raise SystemError("No response parsed from zoom") 562 | 563 | semdesk.task.add_prompt(response.prompt) 564 | 565 | composite_resp = response.parsed 566 | semdesk.task.post_message( 567 | role="Clicker", 568 | msg=f"Selection {composite_resp.model_dump_json()}", 569 | thread="debug", 570 | ) 571 | console.print(JSON(composite_resp.model_dump_json())) 572 | chosen_number = composite_resp.number 573 | except Exception as e: 574 | logger.info(f"Error in analyzing composite: {e}.") 575 | return None, None 576 | 577 | if chosen_number == 0: 578 | return None, None 579 | 580 | bounding_box = bounding_boxes[chosen_number - 1] 581 | top_left = (bounding_box[0], bounding_box[1]) 582 | bottom_right = ( 583 | bounding_box[0] + bounding_box[2], 584 | bounding_box[1] + bounding_box[3], 585 | ) 586 | region_of_interest = starting_image.crop( 587 | (top_left[0], top_left[1], bottom_right[0], bottom_right[1]) 588 | ) 589 | box = Box(top_left[0], top_left[1], bottom_right[0], bottom_right[1]) 590 | 591 | return region_of_interest, box 592 | 593 | 594 | def _debug_image( 595 | img: Image.Image, 596 | boxes: List[Box], 597 | final_click: Optional[Tuple[int, int]] = None, 598 | ) -> Image.Image: 599 | draw = ImageDraw.Draw(img) 600 | for box in boxes: 601 | box.draw(draw) 602 | 603 | if final_click: 604 | draw.ellipse( 605 | [ 606 | final_click[0] - 5, 607 | final_click[1] - 5, 608 | final_click[0] + 5, 609 | final_click[1] + 5, 610 | ], 611 | fill="red", 612 | outline="red", 613 | ) 614 | return img 615 | -------------------------------------------------------------------------------- /robbieg2/agent.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import traceback 5 | from typing import Final, List, Optional, Tuple, Type 6 | 7 | from agentdesk.device_v1 import Desktop 8 | from devicebay import Device 9 | from pydantic import BaseModel, Field 10 | from rich.console import Console 11 | from rich.json import JSON 12 | from skillpacks.server.models import V1Action 13 | from surfkit.agent import TaskAgent 14 | from taskara import Task, TaskStatus 15 | from tenacity import before_sleep_log, retry, stop_after_attempt 16 | from threadmem import RoleMessage, RoleThread 17 | from mllm import ChatResponse 18 | from toolfuse.util import AgentUtils 19 | 20 | from .tool import SemanticDesktop, router 21 | from .clicker import similarity_ratio 22 | from .cheap_critic import assess_action_result 23 | 24 | 25 | logging.basicConfig(level=logging.INFO) 26 | logger: Final = logging.getLogger(__name__) 27 | logger.setLevel(int(os.getenv("LOG_LEVEL", str(logging.DEBUG)))) 28 | 29 | console = Console(force_terminal=True) 30 | 31 | 32 | class RobbieG2Config(BaseModel): 33 | pass 34 | 35 | 36 | class ActorThoughts(BaseModel): 37 | """An represention of thoughts of the Actor part of the brain.""" 38 | 39 | observation: str = Field( 40 | ..., 
description="Observations of the current state of the environment" 41 | ) 42 | reason: str = Field( 43 | ..., 44 | description="The reason why this action was chosen, explaining the logic or rationale behind the decision.", 45 | ) 46 | action: V1Action = Field( 47 | ..., 48 | description="The action object detailing the specific action to be taken, including its name and parameters.", 49 | ) 50 | 51 | class NeocortexPrediction(BaseModel): 52 | """An represention of thoughts of the Neocortex part of the brain.""" 53 | 54 | prediction: str = Field( 55 | ..., description="Prediction about the state of the environment after the current action" 56 | ) 57 | reason: str = Field( 58 | ..., 59 | description="The reason why the next action is chosen, explaining the logic or rationale behind the decision.", 60 | ) 61 | action: V1Action = Field( 62 | ..., 63 | description="The action object detailing the next action to be taken after the current action takes place, including its name and parameters.", 64 | ) 65 | 66 | class NeocortexThoughts(BaseModel): 67 | """An represention of thoughts of the Neocortex part of the brain.""" 68 | 69 | prediction_1: NeocortexPrediction = Field( 70 | ..., description="Prediction about the state of the environment after the current action, chosen by Actor, and the most appropriate next action" 71 | ) 72 | prediction_2: NeocortexPrediction = Field( 73 | ..., description="Prediction about the state of the environment after the first predicted action, and the most appropriate action after that" 74 | ) 75 | 76 | class CriticThoughts(BaseModel): 77 | """An represention of thoughts of the Critic part of the brain.""" 78 | 79 | critic: str = Field(..., description="Critic's thoughts about whether the current state of environment corresponds to a given task, and if not, now to recover.") 80 | 81 | class BrainThoughts(BaseModel): 82 | """An represention of thoughts of the whole brain.""" 83 | 84 | critic: CriticThoughts = Field(..., description="Thoughts of the Critic part of the brain.") 85 | 86 | actor: ActorThoughts = Field(..., description="Thoughts of the Actor part of the brain.") 87 | 88 | neocortex: NeocortexThoughts = Field(..., description="Thoughts of the Neocortex part of the brain.") 89 | 90 | class InterruptionCriticThoughts(BaseModel): 91 | """A representation of thoughts of the Critic which was interrupted because we repeat the same actions again and again.""" 92 | 93 | critic: str = Field(..., description="Critic's assessment on whether taking the current action is a good idea and whether the previous similar action were appropriate and successful.") 94 | 95 | action: V1Action = Field(..., description="The most appropripriate next action given the entire situation.") 96 | 97 | 98 | class RobbieG2(TaskAgent): 99 | """A GUI desktop agent that slices up the image""" 100 | 101 | def __init__(self, *args, **kwargs): 102 | super().__init__(*args, **kwargs) 103 | self.past_actions = [] 104 | 105 | 106 | def record_action(self, action: dict) -> None: 107 | self.past_actions.append(action) 108 | 109 | 110 | def find_the_closest_actions(self, action: V1Action, depth: int = 10, threshold: float = 0.8) -> [V1Action]: 111 | recent_actions = self.past_actions[-depth:] 112 | closest_actions = [] 113 | 114 | for past_action in reversed(recent_actions): 115 | if action.name == "type_text" or action.name == "click_object": 116 | action_params = str(action.parameters) 117 | past_action_parames = str(past_action.parameters) 118 | similarity = similarity_ratio(action_params, 
past_action_parames) 119 | if similarity > threshold: 120 | closest_actions.append(past_action) 121 | else: 122 | action_str = str(action) 123 | past_action_str = str(past_action) 124 | similarity = similarity_ratio(action_str, past_action_str) 125 | if similarity > 0.95: 126 | closest_actions.append(past_action) 127 | 128 | return closest_actions 129 | 130 | 131 | def solve_task( 132 | self, 133 | task: Task, 134 | device: Optional[Device] = None, 135 | max_steps: int = 30, 136 | ) -> Task: 137 | """Solve a task 138 | 139 | Args: 140 | task (Task): Task to solve. 141 | device (Device): Device to perform the task on. 142 | max_steps (int, optional): Max steps to try and solve. Defaults to 30. 143 | 144 | Returns: 145 | Task: The task 146 | """ 147 | start_time = time.time() # Start time measurement 148 | 149 | # Post a message to the default thread to let the user know the task is in progress 150 | task.post_message("Actor", f"Starting task '{task.description}'") 151 | 152 | # Create threads in the task to update the user 153 | console.print("creating threads...") 154 | task.ensure_thread("debug") 155 | task.post_message("Actor", "I'll post debug messages here", thread="debug") 156 | 157 | # Check that the device we received is one we support 158 | if not isinstance(device, Desktop): 159 | raise ValueError("Only desktop devices supported") 160 | 161 | # Wrap the standard desktop in our special tool 162 | semdesk = SemanticDesktop(task=task, desktop=device) 163 | 164 | # Add standard agent utils to the device 165 | semdesk.merge(AgentUtils()) 166 | 167 | # Open a site if present in the parameters 168 | site = task._parameters.get("site") if task._parameters else None 169 | if site: 170 | console.print(f"▶️ opening site url: {site}", style="blue") 171 | task.post_message("Body", f"opening site url {site}...") 172 | semdesk.desktop.open_url(site) 173 | console.print("waiting for browser to open...", style="blue") 174 | time.sleep(10) 175 | 176 | # Get info about the desktop 177 | info = semdesk.desktop.info() 178 | screen_size = info["screen_size"] 179 | console.print(f"Screen size: {screen_size}") 180 | 181 | # Get the json schema for the tools, excluding actions that aren't useful 182 | tools = semdesk.json_schema( 183 | exclude_names=[ 184 | "move_mouse", 185 | "click", 186 | "drag_mouse", 187 | "mouse_coordinates", 188 | "take_screenshots", 189 | "open_url", 190 | "double_click", 191 | ] 192 | ) 193 | console.print("tools: ", style="purple") 194 | console.print(JSON.from_data(tools)) 195 | 196 | starting_prompt = f""" 197 | You are RobbieG2, an advanced AI agent designed to navigate and interact with web interfaces. Your capabilities include: 198 | 199 | 1. Mouse control: 200 | - Move the mouse to specific coordinates 201 | - Click (single or double) at current or specified locations 202 | - Retrieve current mouse coordinates 203 | 204 | 2. Keyboard input: 205 | - Send key commands, including special keys like Tab, Enter, and arrow keys 206 | - Type text into form fields 207 | 208 | 3. Navigation: 209 | - Use Tab key to move through form elements 210 | - Scroll web pages 211 | 212 | 4. Visual analysis: 213 | - Take screenshots of the current view 214 | 215 | 5. 
Advanced interaction: 216 | - Click on objects based on semantic descriptions 217 | 218 | *** Firefox Commands 219 | 220 | Specifically, if you are using the Firefox browser, remember that you can use the following key commands: 221 | 222 | * Press Ctrl + L or Alt + D to highlight the URL, then press Delete to clear it if there is incorrect text in the URL bar that you need to clear out. 223 | * To clear the text in a field do the following: First, ensure the field is in focus BEFORE using this command. Then use Ctrl + A and then Backspace or Delete: This command first highlights all text in a field and then deletes that text. 224 | * Ctrl + Shift + Tab switches to the previous tab 225 | * Ctrl + Tab switches to the next tab 226 | * Press Backspace or Alt + Left Arrow to go to the previous page in your browsing history for the tab 227 | * Press Shift + Backspace or Alt + Right Arrow to go to the next page in your browsing history for the tab 228 | * Press F6 or Shift + F6 to switch focus to the next keyboard-accessible pane, which includes: 229 | 230 | Highlights the URL in the address bar 231 | Bookmarks bar (if visible) 232 | The main web content 233 | Downloads bar (if visible) 234 | 235 | 236 | *** Chrome Commands 237 | 238 | Specifically, if you are using the Chrome browser, remember that you can use the following key commands: 239 | 240 | * Press Ctrl + L or Alt + D to highlight the URL, then press Delete to clear it if there is incorrect text in the URL bar that you need to clear out. 241 | * `clean_text` is also a special command to clear fields but you MUST ensure the field is IN FOCUS first before using this command. 242 | * Ctrl + Shift + Tab switches to the previous tab which is very useful if a new tab you don't want is opened and you need to get back to the last tab. 243 | * Ctrl + Tab switches to the next tab. 244 | * Press Backspace or Alt and the left arrow together - to go to the previous page in your browsing history for the tab. 245 | * Press Shift+Backspace, or Alt and the right arrow together - to go to the next page in your browsing history for the tab. 246 | # `Ctrl + A` and then `Backspace` or `Delete` - This command first highlights all text in a field and then deletes that text. It only works IF YOU ARE ALREADY IN THAT FIELD so be sure the field is in focus or clicked already and click it if you are unsure - this is one of the MOST IMPORTANT commands. You can use it to clear existing text from a field that is filled with incorrect information in a web form. 247 | * F6 or Shift+F6 - Switches focus to the next keyboard-accessible pane. Panes include: 248 | 249 | Highlights the URL in the address bar 250 | Bookmarks bar (if visible) 251 | The main web content (including any infobars) 252 | Downloads bar (if visible) 253 | 254 | If you are unsure about whether a field is selected you can try to click it to ensure it is highlighted. If you take an action several times in a row 255 | and the result has not changed, for example, if a field has not changed to meet you expectation, then explore the idea of clicking it again to change it 256 | and ensure it has the correct text that you want there. 257 | 258 | Remember, remember that you DO NOT have the ability to take a screenshot to verify your results. 259 | 260 | Sometimes a page isn't fully loaded yet. If that is the case feel free to wait or pause briefly for the screenshot to indicate a fully loaded page. 
261 | 262 | If you get stuck in a loop of actions, use your curiosity to explore and try new things with trial and error. Learn from your mistakes and get better with 263 | each new action. 264 | 265 | The complete list of available tools is: {tools} 266 | 267 | Your goal is to efficiently navigate web interfaces and complete tasks by leveraging these capabilities. 268 | Always consider the most appropriate method for each action, and be prepared to adapt your approach based 269 | on the results of your actions. 270 | 271 | When faced with a task, think step-by-step about the best approach, considering all your available tools and methods. 272 | 273 | Your brain consists of three major parts. 274 | 275 | 1. The Critic is responsible for evaluating the current state of the environment and deciding whether it corresponds to a given task. 276 | If it doesn't, the Critic explains how to recover the environment to a state where it can complete the task. Always start with the Critic's 277 | assessment before choosing the next actions and predicting the next steps. 278 | 279 | 2. The Actor is responsible for picking the next action based on the current state of the environment and the tools available. 280 | 281 | 3. The Neocortex is responsible for thinking ahead, predicting the state of the environment after the action that the Actor picked, 282 | choosing the next action after that, and so on. The Neocortex makes two predictions for the actions to be taken AFTER the one that 283 | the Actor picked. 284 | 285 | Your current task is {task.description}. 286 | 287 | For each screenshot I will send you, please return the complete thoughts of all three parts of your brain as 288 | raw JSON adhering to the schema {BrainThoughts.model_json_schema()}. 289 | 290 | Let me know when you are ready and I'll send you the first screenshot. 
291 | """ 292 | 293 | # Create our thread and start with a system prompt 294 | thread = RoleThread() 295 | thread.post( 296 | role="user", 297 | msg=starting_prompt, 298 | ) 299 | response = router.chat(thread, namespace="system") 300 | console.print(f"system prompt response: {response}", style="blue") 301 | thread.add_msg(response.msg) 302 | 303 | # Loop to run actions 304 | for i in range(max_steps): 305 | console.print(f"-------step {i + 1}", style="green") 306 | 307 | try: 308 | thread, done = self.take_action(semdesk, task, thread) 309 | except Exception as e: 310 | console.print(f"Error: {e}", style="red") 311 | task.status = TaskStatus.FAILED 312 | task.error = str(e) 313 | task.save() 314 | task.post_message("Actor", f"❗ Error taking action: {e}") 315 | end_time = time.time() # End time measurement 316 | elapsed_time = end_time - start_time 317 | console.print(f"Time taken to solve task: {elapsed_time:.2f} seconds", style="green") 318 | return task 319 | 320 | if done: 321 | console.print("task is done", style="green") 322 | end_time = time.time() # End time measurement 323 | elapsed_time = end_time - start_time 324 | console.print(f"Time taken to solve task: {elapsed_time:.2f} seconds", style="green") 325 | return task 326 | 327 | time.sleep(2) 328 | 329 | task.status = TaskStatus.FAILED 330 | task.save() 331 | task.post_message("Actor", "❗ Max steps reached without solving task") 332 | console.print("Reached max steps without solving task", style="red") 333 | 334 | end_time = time.time() # End time measurement 335 | elapsed_time = end_time - start_time 336 | console.print(f"Time taken to solve task: {elapsed_time:.2f} seconds", style="green") 337 | 338 | return task 339 | 340 | @retry( 341 | stop=stop_after_attempt(5), 342 | before_sleep=before_sleep_log(logger, logging.INFO), 343 | ) 344 | def interrupt_flow_and_ask_critic( 345 | self, 346 | semdesk: SemanticDesktop, 347 | task: Task, 348 | thread: RoleThread, 349 | current_action: dict 350 | ) -> dict: 351 | try: 352 | _thread = thread.copy() 353 | screenshot_img = semdesk.desktop.take_screenshots()[0] 354 | critic_prompt = f""" 355 | Your task is {task.description}. The screenshot is attached. 356 | You are attempting to do the following action: {current_action}. 357 | You have already attempted to do very similar actions very recently. 358 | Please assess whether the previous actions were successful, and whether you are sure that this action is exactly what needs to be done next. 359 | If you are not sure, please consider various alternative options and pick the action that is most likely to lead us toward completing 360 | the above-mentioned task. 361 | Give me the action to be done next, along with your reasons for that. 362 | 363 | Unlike other messages in this thread, please return your thoughts as 364 | raw JSON adhering to the schema {InterruptionCriticThoughts.model_json_schema()}. 365 | 366 | Please return just the raw JSON. 
367 | """ 368 | # Craft the message asking the MLLM for an action 369 | msg = RoleMessage( 370 | role="user", 371 | text=critic_prompt, 372 | images=[screenshot_img], 373 | ) 374 | _thread.add_msg(msg) 375 | 376 | # Make the action selection 377 | response = router.chat( 378 | _thread, 379 | namespace="action", 380 | expect=InterruptionCriticThoughts, 381 | agent_id=self.name(), 382 | ) 383 | task.add_prompt(response.prompt) 384 | 385 | try: 386 | # Post to the user letting them know what the modle selected 387 | selection = response.parsed 388 | if not selection: 389 | raise ValueError("No action selection parsed") 390 | 391 | task.post_message("Critic", f"🤔 {selection.critic}") 392 | task.post_message("Critic", f"▶️ I suggest to take action '{selection.action.name}' "+ 393 | f"with parameters: {selection.action.parameters}") 394 | return selection.action 395 | 396 | except Exception as e: 397 | console.print(f"Response failed to parse: {e}", style="red") 398 | raise 399 | 400 | except Exception as e: 401 | console.print("Exception taking action: ", e) 402 | traceback.print_exc() 403 | task.post_message("Actor", f"⚠️ Error taking action: {e} -- retrying...") 404 | raise e 405 | 406 | 407 | @retry( 408 | stop=stop_after_attempt(5), 409 | before_sleep=before_sleep_log(logger, logging.INFO), 410 | ) 411 | def take_action( 412 | self, 413 | semdesk: SemanticDesktop, 414 | task: Task, 415 | thread: RoleThread, 416 | ) -> Tuple[RoleThread, bool]: 417 | """Take an action 418 | 419 | Args: 420 | desktop (SemanticDesktop): Desktop to use 421 | task (str): Task to accomplish 422 | thread (RoleThread): Role thread for the task 423 | 424 | Returns: 425 | bool: Whether the task is complete 426 | """ 427 | try: 428 | # Check to see if the task has been cancelled 429 | if task.remote: 430 | task.refresh() 431 | console.print("task status: ", task.status.value) 432 | if ( 433 | task.status == TaskStatus.CANCELING 434 | or task.status == TaskStatus.CANCELED 435 | ): 436 | console.print(f"task is {task.status}", style="red") 437 | if task.status == TaskStatus.CANCELING: 438 | task.status = TaskStatus.CANCELED 439 | task.save() 440 | return thread, True 441 | 442 | console.print("taking action...", style="white") 443 | 444 | # Create a copy of the thread, and remove old images 445 | _thread = thread.copy() 446 | _thread.remove_images() 447 | 448 | task.post_message("Actor", "🤔 I'm thinking...") 449 | 450 | # Take a screenshot of the desktop and post a message with it 451 | screenshot_img = semdesk.desktop.take_screenshots()[0] 452 | task.post_message( 453 | "Actor", 454 | "Current image", 455 | images=[screenshot_img], 456 | thread="debug", 457 | ) 458 | 459 | # Get the current mouse coordinates 460 | x, y = semdesk.desktop.mouse_coordinates() 461 | console.print(f"mouse coordinates: ({x}, {y})", style="white") 462 | 463 | step_prompt = f""" 464 | Here is a screenshot of the current desktop, please select next and the one after next action from the provided schema. 465 | 466 | Critic: Carefully analyze the screenshot and check if the state corresponds to the task we are solving. Remember that 467 | the task is {task.description}. 468 | Actor: Select a next action and explain why. 469 | Neocortex: Predict the result of the action picked by Actor and pick the next ones. 470 | 471 | Watch out for elements that are different from others, for example, have the border of the different color. 472 | Such elements are usually already in focus, and you can try to type text in them right away. 
473 | However, if you tried to type on a previous step and want to type the same input again, you better 474 | focus on the input field first by clicking on it. 475 | 476 | Please return just the raw JSON. 477 | """ 478 | 479 | # Craft the message asking the MLLM for an action 480 | msg = RoleMessage( 481 | role="user", 482 | text=step_prompt, 483 | images=[screenshot_img], 484 | ) 485 | _thread.add_msg(msg) 486 | 487 | # Make the action selection 488 | response = router.chat( 489 | _thread, 490 | namespace="action", 491 | expect=BrainThoughts, 492 | agent_id=self.name(), 493 | ) 494 | task.add_prompt(response.prompt) 495 | 496 | try: 497 | # Post to the user letting them know what the modle selected 498 | selection = response.parsed 499 | if not selection: 500 | raise ValueError("No action selection parsed") 501 | 502 | task.post_message("Critic", f"🤔 {selection.critic.critic}") 503 | 504 | task.post_message("Actor", f"👁️ {selection.actor.observation}\n" + 505 | f"💡 {selection.actor.reason}\n" + 506 | f"▶️ I'm going to take action '{selection.actor.action.name}' "+ 507 | f"with parameters: {selection.actor.action.parameters}") 508 | 509 | task.post_message("Neocortex", f"🔮 {selection.neocortex.prediction_1.prediction}\n" + 510 | f"💡 {selection.neocortex.prediction_1.reason}\n" + 511 | f"🔜 The next action to take is '{selection.neocortex.prediction_1.action.name}' "+ 512 | f"with parameters: {selection.neocortex.prediction_1.action.parameters}") 513 | 514 | task.post_message("Neocortex", f"🔮 {selection.neocortex.prediction_2.prediction}\n" + 515 | f"💡 {selection.neocortex.prediction_2.reason}\n" + 516 | f"🔜 The last action to take after that is '{selection.neocortex.prediction_2.action.name}' "+ 517 | f"with parameters: {selection.neocortex.prediction_2.action.parameters}") 518 | 519 | except Exception as e: 520 | console.print(f"Response failed to parse: {e}", style="red") 521 | raise 522 | 523 | # The agent will return 'result' if it believes it's finished 524 | if selection.actor.action.name == "result": 525 | console.print(f"The final result is: {selection.actor.action.parameters['value']}", style="green") 526 | task.post_message( 527 | "Actor", 528 | f"✅ I think the task is done, please review the result: {selection.actor.action.parameters['value']}", 529 | ) 530 | task.status = TaskStatus.FINISHED 531 | task.save() 532 | return _thread, True 533 | 534 | im_start = screenshot_img 535 | continue_chain = True 536 | interruption_requested = False 537 | 538 | for next_action in [selection.actor.action, 539 | selection.neocortex.prediction_1.action, 540 | selection.neocortex.prediction_2.action]: 541 | if not continue_chain or next_action.name == "result" or interruption_requested: 542 | # Time to think again! 
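# Worked example of the hot-key normalization a few lines below: a press_key action whose
# "key" parameter is "ctrl+s" becomes a hot_key action with parameters {"keys": ["ctrl", "s"]},
# since "ctrl+s".split("+") yields ["ctrl", "s"].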
543 | break 544 | 545 | # Hack for the cases when the AI wants to press "ctrl+s" or something similar 546 | if next_action.name == "press_key" and "+" in next_action.parameters["key"]: 547 | next_action.name = "hot_key" 548 | next_action.parameters = {"keys": next_action.parameters["key"].split("+")} 549 | 550 | # Additional check to make sure that we are not trapped in a loop, doing the same action again and again 551 | depth = 5 if (next_action.name == "press_key" or next_action.name == "hot_key") else 10 552 | closest_actions = self.find_the_closest_actions(next_action, depth=depth) 553 | if len(closest_actions) > 0: 554 | task.post_message( 555 | "Body", 556 | f"Closest actions to the current one: {closest_actions}", 557 | thread="debug" 558 | ) 559 | if len(closest_actions) >= 2: 560 | task.post_message( 561 | "Body", 562 | "Too many repeated actions. Getting back to Critic.", 563 | thread="debug" 564 | ) 565 | # Well, looks like it's time to interrupt the flow and reconsider our life choices. 566 | new_action = self.interrupt_flow_and_ask_critic(semdesk, task, thread, next_action) 567 | next_action = new_action 568 | # We'll run this updated action and get out of the cycle. 569 | interruption_requested = True 570 | 571 | task.post_message( 572 | "Body", 573 | f"▶️ Taking action '{next_action.name}' with parameters: {next_action.parameters}", 574 | ) 575 | self._take_selected_action(semdesk, next_action, task, _thread, response) 576 | self.record_action(next_action) 577 | 578 | # If we know for certain that the click was not successful, it's time to stop the chain 579 | # and think again 580 | if semdesk.last_click_failed: 581 | semdesk.last_click_failed = False 582 | break 583 | 584 | # Pressing keys changes the environment for sure, so we may just stop the execution here and think again 585 | if next_action.name == "press_key": 586 | break 587 | 588 | # We analyze if we want to continue to the next action here. A cheap critic looks at the new screenshot and 589 | # decides if we should continue the chain or not. 590 | screenshot_upd = semdesk.desktop.take_screenshots()[0] 591 | ssim, continue_chain = assess_action_result(im_start, screenshot_upd) 592 | 593 | # If we were typing text, and the screen changed too much, then we probably hit some hot keys by accident 594 | # and scrolled down. We should stop and scroll back up, forcing recovery. 595 | if next_action.name == "type_text" and ssim < 0.9: 596 | semdesk.desktop.scroll(30) # we may need to adjust this number 597 | break 598 | 599 | # There is a chance that if the last action was a click, then the result didn't load yet, and the SSIM will be high 600 | # while it should be low. 
To avoid this, we check once again for this specific case in 5 seconds: 601 | if next_action.name == "click_object" and ssim > 0.95: 602 | task.post_message("Critic", "😴 Waiting to be sure that the result is loaded...", thread="debug") 603 | time.sleep(5) 604 | screenshot_upd = semdesk.desktop.take_screenshots()[0] 605 | ssim, continue_chain = assess_action_result(im_start, screenshot_upd) 606 | task.post_message("Critic", f"🔍 SSIM: {ssim}", thread="debug") 607 | 608 | return _thread, False 609 | 610 | except Exception as e: 611 | console.print("Exception taking action: ", e) 612 | traceback.print_exc() 613 | task.post_message("Actor", f"⚠️ Error taking action: {e} -- retrying...") 614 | raise e 615 | 616 | def _take_selected_action(self, semdesk: SemanticDesktop, action: V1Action, 617 | task: Task, thread: RoleThread, response: ChatResponse) -> None: 618 | """Take the selected action 619 | 620 | Args: 621 | semdesk (SemanticDesktop): Desktop to use 622 | action (V1Action): Action to take 623 | """ 624 | console.log(f"taking action: {action}") 625 | 626 | # Find the selected action in the tool 627 | desktop_action = semdesk.find_action(action.name) 628 | console.print(f"found action: {desktop_action}", style="blue") 629 | if not desktop_action: 630 | console.print(f"action returned not found: {action.name}") 631 | raise SystemError("action not found") 632 | 633 | # Take the selected action 634 | try: 635 | action_response = semdesk.use(desktop_action, **action.parameters) 636 | except Exception as e: 637 | raise ValueError(f"Trouble using action: {e}") 638 | 639 | console.print(f"action output: {action_response}", style="blue") 640 | if action_response: 641 | task.post_message( 642 | "Actor", f"👁️ Result from taking action: {action_response}" 643 | ) 644 | 645 | thread.add_msg(response.msg) 646 | 647 | @classmethod 648 | def supported_devices(cls) -> List[Type[Device]]: 649 | """Devices this agent supports 650 | 651 | Returns: 652 | List[Type[Device]]: A list of supported devices 653 | """ 654 | return [Desktop] 655 | 656 | @classmethod 657 | def config_type(cls) -> Type[RobbieG2Config]: 658 | """Type of config 659 | 660 | Returns: 661 | Type[DinoConfig]: Config type 662 | """ 663 | return RobbieG2Config 664 | 665 | @classmethod 666 | def from_config(cls, config: RobbieG2Config) -> "RobbieG2": 667 | """Create an agent from a config 668 | 669 | Args: 670 | config (RobbieG2Config): Agent config 671 | 672 | Returns: 673 | RobbieG2: The agent 674 | """ 675 | return RobbieG2() 676 | 677 | @classmethod 678 | def default(cls) -> "RobbieG2": 679 | """Create a default agent 680 | 681 | Returns: 682 | RobbieG2: The agent 683 | """ 684 | return RobbieG2() 685 | 686 | @classmethod 687 | def init(cls) -> None: 688 | """Initialize the agent class""" 689 | return 690 | 691 | 692 | Agent = RobbieG2 693 | --------------------------------------------------------------------------------
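Note: the Box helper used throughout robbieg2/clicker.py (to_absolute_with_upscale, center, draw) is defined elsewhere in the repository and is not shown above. The sketch below is a hypothetical, minimal illustration of the coordinate bookkeeping the clicker relies on: a box found on an upscaled crop is scaled back down by the accumulated upscale factor and shifted by the parent region's top-left corner. The class name BoxSketch and its field names are illustrative assumptions; the actual implementation may differ.

from dataclasses import dataclass

@dataclass
class BoxSketch:
    """Hypothetical stand-in for the Box class used in robbieg2/clicker.py."""
    left: int
    top: int
    right: int
    bottom: int

    def center(self):
        # Midpoint of the box, used as the final click target.
        return (self.left + self.right) // 2, (self.top + self.bottom) // 2

    def to_absolute_with_upscale(self, parent: "BoxSketch", upscale: int) -> "BoxSketch":
        # The box was measured on a crop of `parent` that had been resized by `upscale`,
        # so scale the coordinates back down and shift them by the parent's origin.
        return BoxSketch(
            parent.left + self.left // upscale,
            parent.top + self.top // upscale,
            parent.left + self.right // upscale,
            parent.top + self.bottom // upscale,
        )

# Example: an OCR hit at (100, 40)-(180, 70) inside a crop upscaled 2x, whose parent region
# starts at (300, 200) on the full screenshot, maps back to (350, 220)-(390, 235); its
# center (370, 227) is what would be clicked.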