├── robbieg2 ├── __init__.py ├── easyocr.py ├── cheap_critic.py ├── server.py ├── grid.py ├── canny_composite.py ├── tool.py ├── img.py ├── clicker.py └── agent.py ├── .gitattributes ├── font └── arialbd.ttf ├── images ├── robbie.jpg └── meet-robbie-g2.jpg ├── agent.yaml ├── pyproject.toml ├── LICENSE ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── README.md └── .gitignore /robbieg2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /font/arialbd.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agentsea/robbie-g2/HEAD/font/arialbd.ttf -------------------------------------------------------------------------------- /images/robbie.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agentsea/robbie-g2/HEAD/images/robbie.jpg -------------------------------------------------------------------------------- /images/meet-robbie-g2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agentsea/robbie-g2/HEAD/images/meet-robbie-g2.jpg -------------------------------------------------------------------------------- /robbieg2/easyocr.py: -------------------------------------------------------------------------------- 1 | import easyocr 2 | 3 | def find_all_text_with_bounding_boxes(path: str) -> [dict]: 4 | try: 5 | reader = easyocr.Reader(['en']) 6 | results = reader.readtext(path) 7 | processed_results = [] 8 | for box in results: 9 | result = { 10 | "x": int(box[0][0][0]), 11 | "y": int(box[0][0][1]), 12 | "w": int(box[0][2][0] - box[0][0][0]), 13 | "h": int(box[0][2][1] - box[0][0][1]), 14 | "text": box[1], 15 | "confidence": float(box[2]) 16 | } 17 | processed_results.append(result) 18 | return processed_results 19 | except Exception as e: 20 | print(f"EasyOCR failed: {str(e)}") 21 | return [] 22 | -------------------------------------------------------------------------------- /agent.yaml: -------------------------------------------------------------------------------- 1 | api_version: v1 2 | kind: TaskAgent 3 | name: "RobbieG2" 4 | description: "A Gen 2 AI Agent that uses OCR, Canny Composite, and Grid to navigate GUIs" 5 | tags: 6 | - "gui" 7 | supports: 8 | - "desktop" 9 | cmd: "poetry run python -m robbieg2.server" 10 | img_repo: "us-central1-docker.pkg.dev/agentsea-dev/guisurfer/robbieg2" 11 | versions: 12 | latest: "us-central1-docker.pkg.dev/agentsea-dev/guisurfer/robbieg2:latest" 13 | runtimes: 14 | - type: "agent" 15 | preference: 16 | - "process" 17 | - "docker" 18 | - "kube" 19 | llm_providers: 20 | preference: 21 | - "gpt-4o" 22 | public: True 23 | icon: https://storage.googleapis.com/guisurfer-assets/SurfPizza.webp 24 | resource_requests: 25 | cpu: "1" 26 | memory: "2Gi" 27 | resource_limits: 28 | cpu: "2" 29 | memory: "4Gi" 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [tool.poetry] 3 | name = "RobbieG2" 4 | version = "0.1.0" 5 | description = "A Gen 2 AI Agent 
that uses OCR, Canny Composite, and Grid to navigate GUIs." 6 | authors = ["Kentauros AI "] 7 | license = "MIT" 8 | readme = "README.md" 9 | packages = [{include = "robbieg2"}] 10 | 11 | 12 | [tool.poetry.dependencies] 13 | python = "^3.10" 14 | pydantic = "^2.6.3" 15 | opencv-python = "^4.10.0.84" 16 | numpy = "^1.26.4" 17 | scikit-learn = "^1.5.0" 18 | easyocr = "^1.7.1" 19 | torch = { version = "2.2.2", source = "pypi" } 20 | pillow = "^10.3.0" 21 | surfkit = "^0.1.309" 22 | 23 | 24 | [tool.poetry.group.dev.dependencies] 25 | flake8 = "^7.0.0" 26 | black = "^24.2.0" 27 | pytest = "^8.0.2" 28 | ipykernel = "^6.29.3" 29 | pytest-env = "^1.1.3" 30 | 31 | 32 | [build-system] 33 | requires = ["poetry-core"] 34 | build-backend = "poetry.core.masonry.api" 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 AgentSea 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /robbieg2/cheap_critic.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from PIL import Image 3 | import numpy as np 4 | from skimage.metrics import structural_similarity as ssim 5 | 6 | def assess_action_result(starting_image: Image.Image, updated_image: Image.Image) -> (float, bool): 7 | """Cheap critic returns True if the chain of actions can be continued and False otherwise. 8 | In the current version, we continue if the SSIM is above a threshold (i.e. the images are visually similar). 
9 | """ 10 | threshold = 0.9 11 | ssim = compare_images(starting_image, updated_image) 12 | if ssim > threshold: 13 | return ssim, True 14 | else: 15 | return ssim, False 16 | 17 | def _pil_to_cv2(pil_image): 18 | # Ensure the image is in RGB mode 19 | if pil_image.mode != 'RGB': 20 | pil_image = pil_image.convert('RGB') 21 | # Convert the PIL Image to a NumPy array 22 | np_image = np.array(pil_image) 23 | # Convert RGB to BGR format (OpenCV uses BGR) 24 | cv2_image = cv2.cvtColor(np_image, cv2.COLOR_RGB2BGR) 25 | return cv2_image 26 | 27 | def compare_images(image1, image2): 28 | image1 = _pil_to_cv2(image1) 29 | image2 = _pil_to_cv2(image2) 30 | 31 | # Convert the images to grayscale 32 | gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY) 33 | gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY) 34 | 35 | # Compute SSIM between the two images 36 | similarity_index, diff = ssim(gray1, gray2, full=True) 37 | 38 | print(f"SSIM: {similarity_index}") 39 | return similarity_index 40 | -------------------------------------------------------------------------------- /robbieg2/server.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from contextlib import asynccontextmanager 5 | from typing import Final 6 | 7 | import uvicorn 8 | from fastapi import FastAPI 9 | from fastapi.middleware.cors import CORSMiddleware 10 | from surfkit.server.routes import task_router 11 | 12 | from .agent import Agent 13 | 14 | # Configure logging 15 | logger: Final = logging.getLogger("robbieg2") 16 | logger.setLevel(int(os.getenv("LOG_LEVEL", str(logging.DEBUG)))) 17 | handler = logging.StreamHandler(sys.stdout) 18 | handler.setLevel(logging.INFO) 19 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 20 | handler.setFormatter(formatter) 21 | 22 | # Ensure logs are flushed immediately 23 | handler.flush = sys.stdout.flush 24 | logger.addHandler(handler) 25 | logger.propagate = False 26 | 27 | ALLOW_ORIGINS = os.getenv("ALLOW_ORIGINS", "*").split(",") 28 | ALLOW_METHODS = os.getenv("ALLOW_METHODS", "*").split(",") 29 | ALLOW_HEADERS = os.getenv("ALLOW_HEADERS", "*").split(",") 30 | 31 | 32 | @asynccontextmanager 33 | async def lifespan(app: FastAPI): 34 | # Initialize the agent type before the server comes live 35 | Agent.init() 36 | yield 37 | 38 | 39 | app = FastAPI(lifespan=lifespan) # type: ignore 40 | 41 | app.add_middleware( 42 | CORSMiddleware, 43 | allow_origins=ALLOW_ORIGINS, 44 | allow_credentials=True, 45 | allow_methods=ALLOW_METHODS, 46 | allow_headers=ALLOW_HEADERS, 47 | ) 48 | 49 | app.include_router(task_router(Agent)) 50 | 51 | if __name__ == "__main__": 52 | port = os.getenv("SERVER_PORT", "9090") 53 | reload = os.getenv("SERVER_RELOAD", "true") == "true" 54 | host = os.getenv("SERVER_HOST", "0.0.0.0") 55 | uvicorn.run( 56 | "robbieg2.server:app", 57 | host=host, 58 | port=int(port), 59 | reload=reload, 60 | reload_excludes=[".data"], 61 | log_config=None, # Disable default Uvicorn log configuration 62 | ) 63 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | First off, thank you for considering contributing to this project. It's people like you that make it such a great tool. 4 | 5 | ## Code of Conduct 6 | 7 | This project adheres to a Code of Conduct that we expect project participants to adhere to. 
Please read [the full text](CODE_OF_CONDUCT.md) so that you can understand what actions will and will not be tolerated. 8 | 9 | ## What we are looking for 10 | 11 | This is an open-source project, and we welcome contributions of all kinds: new features, bug fixes, documentation, examples, or enhancements to existing features. We are always thrilled to receive contributions from the community. 12 | 13 | ## How to contribute 14 | 15 | If you've never contributed to an open-source project before, here are a few steps to get you started: 16 | 17 | ### Reporting Issues 18 | 19 | Before submitting a bug report or feature request, check to make sure it hasn't already been submitted. You can search through existing issues and pull requests to see if someone has reported one similar to yours. 20 | 21 | When you are creating a bug report, please include as much detail as possible. 22 | 23 | ### Pull Requests 24 | 25 | - Fork the repository and create your branch from `main`. 26 | - If you've added code that should be tested, add tests. 27 | - If you've changed APIs, update the documentation. 28 | - Ensure the test suite passes. 29 | - Make sure your code lints. 30 | - Issue that pull request! 31 | 32 | ### Getting started 33 | 34 | For something that is bigger than a one or two-line fix: 35 | 36 | 1. Create your own fork of the code. 37 | 2. Do the changes in your fork. 38 | 3. If you like the change and think the project could use it: 39 | - Be sure you have followed the code style for the project. 40 | - Note the Code of Conduct. 41 | - Send a pull request. 42 | 43 | ## How to report a bug 44 | 45 | If you find a security vulnerability, do NOT open an issue. Email github@kentauros.ai instead. 46 | 47 | In order to help us understand and resolve your issue quickly, please include as much information as possible, including: 48 | 49 | - A quick summary and/or background 50 | - Steps to reproduce 51 | - Be specific! 52 | - Give a sample code if you can. 53 | - What you expected would happen 54 | - What actually happens 55 | - Notes (possibly including why you think this might be happening or stuff you tried that didn't work) 56 | 57 | People *love* thorough bug reports. I'm not even kidding. 58 | 59 | ## How to suggest a feature or enhancement 60 | 61 | If you find yourself wishing for a feature that doesn't exist in the project, you are probably not alone. There are bound to be others out there with similar needs. Open an issue on our issues list on GitHub, which describes the feature you would like to see, why you need it, and how it should work. 62 | 63 | ## Code review process 64 | 65 | The core team looks at Pull Requests on a regular basis in a bi-weekly triage meeting. After feedback has been given, we expect responses within two weeks. After two weeks, we may close the pull request if it isn't showing any activity. 66 | 67 | ## Community 68 | 69 | Discussions about the project take place in this repository's Issues and Pull Requests sections. Anybody is welcome to join these conversations. 70 | 71 | Wherever possible, we use GitHub to discuss changes and keep the decision-making process open. 72 | 73 | ## Thank you! 74 | 75 | Thank you for contributing! 
76 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | - Using welcoming and inclusive language 12 | - Being respectful of differing viewpoints and experiences 13 | - Gracefully accepting constructive criticism 14 | - Focusing on what is best for the community 15 | - Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | - Trolling, insulting/derogatory comments, and personal or political attacks 21 | - Public or private harassment 22 | - Publishing others' private information, such as a physical or email address, without explicit permission 23 | - Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies within all project spaces, including GitHub, and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at github@kentauros.ai. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality regarding the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 
44 | 45 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). 46 | 47 | For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 48 | -------------------------------------------------------------------------------- /robbieg2/grid.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw, ImageFont 2 | 3 | # We need a simple grid: numbers from 1 to 9 in points on an intersection of nxn grid. 4 | # The font size may be 1/5 of the size of the height of the cell. 5 | # Therefore, we need the size of the image and colors, and the file_name. 6 | 7 | def create_grid_image(image_width, image_height, color_circle, color_number, n, file_name): 8 | cell_width = image_width // n 9 | cell_height = image_height // n 10 | font_size = max(cell_height // 5, 20) 11 | circle_radius = font_size * 7 // 10 12 | 13 | # Create a blank image with transparent background 14 | img = Image.new('RGBA', (image_width, image_height), (0, 0, 0, 0)) 15 | draw = ImageDraw.Draw(img) 16 | 17 | # Load a font 18 | font = ImageFont.truetype("font/arialbd.ttf", font_size) 19 | 20 | # Set the number of cells in each dimension 21 | num_cells_x = n - 1 22 | num_cells_y = n - 1 23 | 24 | # Draw the numbers in the center of each cell 25 | for i in range(num_cells_x): 26 | for j in range(num_cells_y): 27 | number = i * num_cells_y + j + 1 28 | text = str(number) 29 | x = (i + 1) * cell_width 30 | y = (j + 1) * cell_height 31 | draw.ellipse([x - circle_radius, y - circle_radius, 32 | x + circle_radius, y + circle_radius], 33 | fill=color_circle) 34 | offset_x = font_size / 4 if number < 10 else font_size / 2 35 | draw.text((x - offset_x, y - font_size / 2), text, font=font, fill=color_number) 36 | 37 | # Save the image 38 | img.save(file_name) 39 | 40 | def zoom_in(image_path, n, index, upscale): 41 | img = Image.open(image_path) 42 | width, height = img.size 43 | # we need to calculate the cell size 44 | cell_width = width // n 45 | cell_height = height // n 46 | # we need to calculate the x and y coordinates of the cell 47 | x = ((index - 1) // (n - 1)) * cell_width 48 | y = ((index - 1) % (n - 1)) * cell_height 49 | # we need to calculate the x and y coordinates of the top left corner of the cell 50 | top_left = (x, y) 51 | # we need to calculate the x and y coordinates of the bottom right corner of the cell 52 | bottom_right = (x + 2 * cell_width, y + 2 * cell_height) 53 | # we need to crop the image 54 | 55 | cropped_img = img.crop(top_left + bottom_right) 56 | cropped_img = cropped_img.resize((cropped_img.width * upscale, cropped_img.height * upscale), resample=0) 57 | return cropped_img, top_left, bottom_right 58 | # cropped_img.save(new_image_path) 59 | # return 2 * cell_width, 2 * cell_height 60 | 61 | def superimpose_images(image1_path, image2_path, opacity): 62 | # Open the images 63 | image1 = Image.open(image1_path) 64 | image2 = Image.open(image2_path) 65 | 66 | # Ensure both images have the same size 67 | if image1.size != image2.size: 68 | raise ValueError("Images must have the same dimensions.") 69 | 70 | # Convert the images to RGBA mode if they are not already 71 | image1 = image1.convert("RGBA") 72 | image2 = image2.convert("RGBA") 73 | 74 | # Create a new image with the same size as the input images 75 | merged_image = Image.new("RGBA", image1.size) 
76 | 77 | # Convert image1 to grayscale 78 | image1 = image1.convert("L") 79 | 80 | # Paste image1 onto the merged image 81 | merged_image.paste(image1, (0, 0)) 82 | 83 | # Create a new image for image2 with adjusted opacity 84 | image2_with_opacity = Image.blend(Image.new("RGBA", image2.size, (0, 0, 0, 0)), image2, opacity) 85 | 86 | # Paste image2 with opacity onto the merged image 87 | merged_image = Image.alpha_composite(merged_image, image2_with_opacity) 88 | 89 | return merged_image 90 | 91 | 92 | 93 | # Example usage 94 | if __name__ == "__main__": 95 | create_grid_image(2880, 1712, 'yellow', 'green', 6, 'test.png') 96 | 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 | 
4 | # Robbie G2
5 | 
6 | Robbie G2 Logo
7 | 
8 | 
9 | Gen 2 AI Agent that uses OCR, Canny Composite, and Grid to navigate GUIs
10 | 
11 | 
12 | Explore the docs »
13 | 
14 | 
15 | View Demo
16 | ·
17 | Report Bug
18 | ·
19 | Request Feature
20 | 
21 | 
22 | 
23 | Meet Robbie, our Gen 2 agent.
24 | 
25 | Robbie navigates GUIs to solve tasks for you.
26 | 
27 | Unlike other bots, he doesn't just work on the web, because he doesn't use Playwright. Robbie is a pure multimodal bot: he can navigate the web or a desktop.
28 | 
29 | That means he can navigate SaaS apps, or he can work on a remote desktop to send emails, search for flights, check Slack, do research, and more.
30 | 
31 | Robbie-g2, aka Gen 2, is a leap from our first-gen agents, SurfPizza and SurfSlicer. He is very capable at navigating complex, never-before-seen GUIs via a remote virtual desktop, which the AgentSea stack serves up as a device to him via DeviceBay. He connects to it via ToolFuse and AgentDesk, which let him know what he can do with it, like move the mouse, send key commands, etc.
32 | 
33 | ‣ Check out our community on [Discord](https://discord.gg/hhaq7XYPS6), where we develop in the open, share research, and connect with other developers who are building cutting-edge agents or who just want to use them to get things done!
34 | 
35 | ‣ Check out the deep-dive video right here:
36 | 
37 | 
38 | 
39 | Watch the video
40 | 
41 | 
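
If you want to poke at the three perception primitives on their own, outside the full agent loop, the snippet below is a rough illustrative sketch that calls the helpers defined in `robbieg2/` directly on a saved screenshot. It is not part of the repo; `screenshot.png` is a placeholder path, and it assumes you run it from the repo root so the grid helper can find `font/arialbd.ttf`.

```python
# Illustrative only — not part of the agent loop. "screenshot.png" is a
# placeholder for any saved screenshot of a GUI; run from the repo root.
from PIL import Image

from robbieg2.easyocr import find_all_text_with_bounding_boxes
from robbieg2.canny_composite import create_composite
from robbieg2.grid import create_grid_image, superimpose_images

screenshot = "screenshot.png"

# 1. OCR pass: every piece of text on the screen with its bounding box and confidence.
for box in find_all_text_with_bounding_boxes(screenshot):
    print(box["text"], box["x"], box["y"], box["w"], box["h"], box["confidence"])

# 2. Canny composite: edge-detect the screen, cluster the detected elements,
#    and build a numbered strip of candidate regions for the model to pick from.
composite, regions = create_composite(screenshot, num_clusters=10)
composite.save("composite.png")

# 3. Grid: draw numbered circles on the intersections of an 8x8 grid and
#    superimpose them on the (grayscaled) screenshot.
width, height = Image.open(screenshot).size
create_grid_image(width, height, "red", "yellow", 8, "grid.png")
superimpose_images(screenshot, "grid.png", 1).save("merged.png")
```

In the agent itself these passes are orchestrated by `robbieg2/clicker.py`, which combines the OCR, composite, and grid results with progressive zooming to land on exact click coordinates.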
42 | 43 | 44 | ## Quick Start 45 | 46 | ### Prerequisites 47 | 48 | * [Install Docker](https://docs.docker.com/engine/install/) - you need it to run a Tracker 49 | * [Install QEMU](https://docs.hub.agentsea.ai/configuration/qemu) OR [Configure GCP](https://docs.hub.agentsea.ai/configuration/gcp) OR [Configure AWS](https://docs.hub.agentsea.ai/configuration/aws) - you need one of these to host a Device 50 | 51 | ### Setup 52 | 53 | 1. Setup your OpenAI API key: 54 | 55 | ```sh 56 | export OPENAI_API_KEY= 57 | ``` 58 | 59 | 2. Install/upgrade SurfKit: 60 | 61 | ```sh 62 | pip install -U surfkit 63 | ``` 64 | 65 | 3. Clone the repository and go to the root folder: 66 | 67 | ```sh 68 | git clone git@github.com:agentsea/robbie-g2.git && cd robbie-g2 69 | ``` 70 | 71 | 4. Install dependencies: 72 | 73 | ```sh 74 | poetry install 75 | ``` 76 | 77 | ### Creating required entities 78 | 79 | 5. Create a tracker: 80 | 81 | ```sh 82 | surfkit create tracker --name tracker01 83 | ``` 84 | 85 | 6. Create a device: 86 | 87 | - If you are using QEMU: 88 | 89 | ```sh 90 | surfkit create device --provider qemu --name device01 91 | ``` 92 | 93 | - If you are using GCE: 94 | 95 | ```sh 96 | surfkit create device --provider gce --name device01 97 | ``` 98 | 99 | - If you are using AWS: 100 | 101 | ```sh 102 | surfkit create device --provider aws --name device01 103 | ``` 104 | 105 | 7. Create an agent: 106 | 107 | ```sh 108 | surfkit create agent --name agent01 109 | ``` 110 | 111 | ### Solving a task 112 | 113 | ```sh 114 | surfkit solve "Search for common varieties of french ducks" \ 115 | --tracker tracker01 \ 116 | --device device01 \ 117 | --agent agent01 118 | ``` 119 | 120 | ## Documentation 121 | 122 | See our [docs](https://docs.hub.agentsea.ai) for more information on how to use Surfkit. 123 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .data/ 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
163 | #.idea/ 164 | 165 | .data/ 166 | cidata.iso 167 | .agentsea 168 | -------------------------------------------------------------------------------- /robbieg2/canny_composite.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from sklearn.cluster import AgglomerativeClustering 4 | from PIL import Image, ImageDraw, ImageFont 5 | 6 | 7 | def improved_canny(image): 8 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 9 | blurred = cv2.GaussianBlur(gray, (5, 5), 0) 10 | edges = cv2.Canny(blurred, 50, 150) 11 | dilated = cv2.dilate(edges, None, iterations=2) 12 | return dilated 13 | 14 | def group_elements(binary_image): 15 | contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 16 | 17 | grouped_contours = [] 18 | for contour in contours: 19 | x, y, w, h = cv2.boundingRect(contour) 20 | if w * h < 100: # Ignore very small contours 21 | continue 22 | merged = False 23 | for group in grouped_contours: 24 | if any(cv2.boundingRect(c)[0] - 10 <= x <= cv2.boundingRect(c)[0] + cv2.boundingRect(c)[2] + 10 and 25 | cv2.boundingRect(c)[1] - 10 <= y <= cv2.boundingRect(c)[1] + cv2.boundingRect(c)[3] + 10 for c in group): 26 | group.append(contour) 27 | merged = True 28 | break 29 | if not merged: 30 | grouped_contours.append([contour]) 31 | 32 | return grouped_contours 33 | 34 | def extract_bounding_boxes(grouped_contours): 35 | bounding_boxes = [] 36 | for group in grouped_contours: 37 | x, y, w, h = cv2.boundingRect(np.concatenate(group)) 38 | bounding_boxes.append((x, y, w, h)) 39 | return bounding_boxes 40 | 41 | def cluster_bounding_boxes(bounding_boxes, num_clusters): 42 | if len(bounding_boxes) <= num_clusters: 43 | return bounding_boxes 44 | 45 | # Calculate the centroids of the bounding boxes 46 | centroids = np.array([ 47 | [box[0] + box[2] / 2, box[1] + box[3] / 2] for box in bounding_boxes 48 | ]) 49 | 50 | # Apply Hierarchical clustering (Agglomerative Clustering) 51 | agg_clustering = AgglomerativeClustering(n_clusters=num_clusters) 52 | labels = agg_clustering.fit_predict(centroids) 53 | 54 | # Calculate bounding boxes for each cluster 55 | cluster_bounding_boxes = [] 56 | for i in range(num_clusters): 57 | cluster_boxes = [box for box, label in zip(bounding_boxes, labels) if label == i] 58 | if cluster_boxes: 59 | # Find the minimum and maximum coordinates for all boxes in the cluster 60 | min_x = min(box[0] for box in cluster_boxes) 61 | min_y = min(box[1] for box in cluster_boxes) 62 | max_x = max(box[0] + box[2] for box in cluster_boxes) 63 | max_y = max(box[1] + box[3] for box in cluster_boxes) 64 | 65 | # Calculate the dimensions of the bounding box that encompasses all boxes 66 | width = max_x - min_x 67 | height = max_y - min_y 68 | 69 | # Create a new bounding box that encompasses all boxes in the cluster 70 | cluster_box = (min_x, min_y, width, height) 71 | cluster_bounding_boxes.append(cluster_box) 72 | 73 | return cluster_bounding_boxes 74 | 75 | def create_composite_image(bounding_boxes, image): 76 | number_column_width = 100 77 | image_column_width = max(box[2] for box in bounding_boxes) 78 | row_heights = [box[3] + 4 for box in bounding_boxes] 79 | total_width = number_column_width + image_column_width + 1 # +1 for rightmost line 80 | total_height = sum(row_heights) + len(bounding_boxes) + 1 # +1 for each row separator and bottom line 81 | 82 | composite = Image.new('RGB', (total_width, total_height), color='white') 83 | draw = ImageDraw.Draw(composite) 84 | 85 | 
try: 86 | font = ImageFont.truetype("./font/arialbd.ttf", 30) 87 | except IOError: 88 | font = ImageFont.load_default() 89 | print("Arial font not found in ./font directory. Using default font.") 90 | 91 | # Draw grid lines 92 | for i in range(len(bounding_boxes) + 1): 93 | y = sum(row_heights[:i]) + i 94 | draw.line([(0, y), (total_width, y)], fill='black', width=1) 95 | draw.line([(number_column_width, 0), (number_column_width, total_height)], fill='black', width=1) 96 | draw.line([(total_width - 1, 0), (total_width - 1, total_height)], fill='black', width=1) 97 | 98 | y_offset = 1 # Start after the top line 99 | for i, box in enumerate(bounding_boxes): 100 | # Draw number 101 | number_text = str(i+1) 102 | text_bbox = draw.textbbox((0, 0), number_text, font=font) 103 | text_width = text_bbox[2] - text_bbox[0] 104 | text_height = text_bbox[3] - text_bbox[1] 105 | text_x = (number_column_width - text_width) // 2 106 | text_y = y_offset + (row_heights[i] - text_height) // 2 + 2 107 | draw.text((text_x, text_y), number_text, font=font, fill='red') 108 | 109 | # Paste image slice 110 | box_pil = image.crop((box[0], box[1], box[0] + box[2], box[1] + box[3])) 111 | paste_x = number_column_width + 1 112 | composite.paste(box_pil, (paste_x, y_offset + 2)) 113 | # Draw a rectangle around the pasted image 114 | draw.rectangle( 115 | [ 116 | (paste_x, y_offset + 2), 117 | (paste_x + box[2] - 1, y_offset + 2 + box[3] - 1) 118 | ], 119 | outline="green", 120 | width=2 121 | ) 122 | 123 | y_offset += row_heights[i] + 1 # Move to next row, accounting for grid line 124 | 125 | return composite 126 | 127 | def create_composite(image_path, num_clusters): 128 | image = cv2.imread(image_path) 129 | pil_image = Image.open(image_path) 130 | 131 | edges = improved_canny(image) 132 | grouped_contours = group_elements(edges) 133 | bounding_boxes = extract_bounding_boxes(grouped_contours) 134 | clustered_boxes = cluster_bounding_boxes(bounding_boxes, num_clusters) 135 | composite_image = create_composite_image(clustered_boxes, pil_image) 136 | return composite_image, clustered_boxes 137 | -------------------------------------------------------------------------------- /robbieg2/tool.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import requests 5 | 6 | from agentdesk.device import Desktop 7 | from mllm import Router 8 | from rich.console import Console 9 | from taskara import Task 10 | from toolfuse import Tool, action 11 | 12 | from .clicker import find_coordinates 13 | 14 | router = Router.from_env() 15 | console = Console() 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(int(os.getenv("LOG_LEVEL", logging.DEBUG))) 19 | 20 | 21 | class SemanticDesktop(Tool): 22 | """A semantic desktop replaces click actions with semantic description rather than coordinates""" 23 | 24 | def __init__( 25 | self, task: Task, desktop: Desktop, data_path: str = "./.data" 26 | ) -> None: 27 | """ 28 | Initialize and open a URL in the application. 29 | 30 | Args: 31 | task: Agent task. Defaults to None. 32 | desktop: Desktop instance to wrap. 33 | data_path (str, optional): Path to data. Defaults to "./.data". 
34 | """ 35 | super().__init__(wraps=desktop) 36 | self.desktop = desktop 37 | 38 | self.data_path = data_path 39 | self.img_path = os.path.join(self.data_path, "images", task.id) 40 | os.makedirs(self.img_path, exist_ok=True) 41 | 42 | self.task = task 43 | self.last_click_failed = False 44 | 45 | self.results = { 46 | "first_ocr": 0, 47 | "second_ocr": 0, 48 | "full_grid": 0, 49 | "failure": 0, 50 | } 51 | 52 | @action 53 | def clean_text(self) -> str: 54 | """Clean the text input or area currently in focus. 55 | Use when you see wrong data in the text input or area which is currently in focus 56 | and need to clean it before entering new text. 57 | """ 58 | self.desktop.hot_key(["ctrl", "a"]) 59 | self.desktop.hot_key(["del"]) 60 | 61 | @action 62 | def click_object(self, description: str, text: str, type: str, button: str = "left") -> None: 63 | """Click on an object on the screen 64 | 65 | Args: 66 | description (str): The description of the object including its general location, for example 67 | "a round dark blue icon with the text 'Home' in the top-right of the image", please be a generic as possible 68 | text (str): The text written on the object to click on. For example, 69 | for "a round dark blue icon with the text 'Home' in the top-right of the image", 70 | the text is 'Home'. For input with its name written right inside it, write here the name of the input, 71 | for example, 'Where to'. For the calendar date, put here only a day, for example, '15'. 72 | If the object doesn't have any text on or in it, return emply string. 73 | type (str): Type of click, can be 'single' for a single click or 74 | 'double' for a double click. If you need to launch an application from the desktop choose 'double' 75 | button (str, optional): Mouse button to click. Options are 'left' or 'right'. Defaults to 'left'. 76 | """ 77 | if type not in ["single", "double"]: 78 | raise ValueError("type must be 'single' or 'double'") 79 | 80 | self.task.post_message( 81 | role="Clicker", 82 | msg=f"Current statistics: {self.results}", 83 | thread="debug", 84 | ) 85 | 86 | coords = find_coordinates(self, description, text) 87 | 88 | if coords: 89 | click_x, click_y = coords["x"], coords["y"] 90 | message = f"Attempting to click coordinates {click_x}, {click_y}." 91 | self.task.post_message( 92 | role="Clicker", 93 | msg=message, 94 | thread="debug", 95 | ) 96 | self.last_debug_message = message 97 | self._click_coords(x=click_x, y=click_y, type=type, button=button) 98 | else: 99 | # Note: Given that GRID almost always returns something, we should almost never be here 100 | self.results["failure"] += 1 101 | self.task.post_message( 102 | role="Clicker", 103 | msg=f"No coordinates found for {description}.", 104 | thread="debug", 105 | ) 106 | self.last_click_failed = True 107 | 108 | def _click_coords( 109 | self, x: int, y: int, type: str = "single", button: str = "left" 110 | ) -> None: 111 | """Click mouse button 112 | 113 | Args: 114 | x (Optional[int], optional): X coordinate to move to, if not provided 115 | it will click on current location. Defaults to None. 116 | y (Optional[int], optional): Y coordinate to move to, if not provided 117 | it will click on current location. Defaults to None. 118 | type (str, optional): Type of click, can be single or double. Defaults to "single". 119 | button (str, optional): Button to click. Defaults to "left". 
120 | """ 121 | # TODO: fix click cords in agentd 122 | logging.debug("moving mouse") 123 | body = {"x": int(x), "y": int(y)} 124 | resp = requests.post(f"{self.desktop.base_url}/v1/move_mouse", json=body) 125 | resp.raise_for_status() 126 | time.sleep(2) 127 | 128 | if type == "single": 129 | logging.debug("clicking") 130 | resp = requests.post( 131 | f"{self.desktop.base_url}/v1/click", json={"button": button} 132 | ) 133 | resp.raise_for_status() 134 | time.sleep(2) 135 | elif type == "double": 136 | logging.debug("double clicking") 137 | resp = requests.post( 138 | f"{self.desktop.base_url}/v1/double_click", json={"button": button} 139 | ) 140 | resp.raise_for_status() 141 | time.sleep(2) 142 | else: 143 | raise ValueError(f"unkown click type {type}") 144 | return 145 | -------------------------------------------------------------------------------- /robbieg2/img.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | from PIL import Image, ImageDraw, ImageFont 3 | import base64 4 | from io import BytesIO 5 | 6 | 7 | class Box: 8 | def __init__(self, left: int, top: int, right: int, bottom: int): 9 | self.left = left 10 | self.top = top 11 | self.right = right 12 | self.bottom = bottom 13 | 14 | def width(self) -> int: 15 | return self.right - self.left 16 | 17 | def height(self) -> int: 18 | return self.bottom - self.top 19 | 20 | def zoom_in(self, cell_index: int, num_cells: int) -> "Box": 21 | cell_width = self.width() // num_cells 22 | cell_height = self.height() // num_cells 23 | col = (cell_index - 1) % num_cells 24 | row = (cell_index - 1) // num_cells 25 | return Box( 26 | self.left + col * cell_width, 27 | self.top + row * cell_height, 28 | self.left + (col + 1) * cell_width, 29 | self.top + (row + 1) * cell_height, 30 | ) 31 | 32 | def center(self) -> Tuple[int, int]: 33 | return ((self.left + self.right) // 2, (self.top + self.bottom) // 2) 34 | 35 | def crop_image(self, img: Image.Image) -> Image.Image: 36 | return img.crop((self.left, self.top, self.right, self.bottom)) 37 | 38 | def draw( 39 | self, 40 | draw_context, 41 | outline: str = "red", 42 | width: int = 3, 43 | ) -> None: 44 | draw_context.rectangle( 45 | [self.left, self.top, self.right, self.bottom], outline=outline, width=width 46 | ) 47 | 48 | def to_absolute(self, parent_box: "Box") -> "Box": 49 | return Box( 50 | self.left + parent_box.left, 51 | self.top + parent_box.top, 52 | self.right + parent_box.left, 53 | self.bottom + parent_box.top, 54 | ) 55 | 56 | def to_absolute_with_upscale(self, parent_box: "Box", upscale: int = 1) -> "Box": 57 | return Box( 58 | round(self.left / upscale + parent_box.left), 59 | round(self.top / upscale + parent_box.top), 60 | round(self.right / upscale + parent_box.left), 61 | round(self.bottom / upscale + parent_box.top), 62 | ) 63 | 64 | 65 | def create_grid_image_by_num_cells( 66 | image_width: int, 67 | image_height: int, 68 | color_circle: str = "red", 69 | color_text: str = "yellow", 70 | num_cells: int = 6, 71 | ) -> Image.Image: 72 | """Create the pizza grid image. 73 | 74 | Args: 75 | image_width (int): Width of the image. 76 | image_height (int): Height of the image. 77 | color_circle (str): Color of the circles. Defaults to 'red' 78 | color_text (str): Color of the text. Defaults to 'yellow' 79 | num_cells (int): The number of cells in each dimension. Defaults to 6. 
80 | 81 | Returns: 82 | Image.Image: The image grid 83 | """ 84 | cell_width = image_width // num_cells 85 | cell_height = image_height // num_cells 86 | font_size = max(cell_height // 5, 30) 87 | circle_radius = font_size * 7 // 10 88 | 89 | # Create a blank image with transparent background 90 | img = Image.new("RGBA", (image_width, image_height), (0, 0, 0, 0)) 91 | draw = ImageDraw.Draw(img) 92 | 93 | # Load a font 94 | font = ImageFont.truetype("fonts/arialbd.ttf", font_size) 95 | 96 | # Set the number of cells in each dimension 97 | num_cells_x = num_cells - 1 98 | num_cells_y = num_cells - 1 99 | 100 | # Draw the numbers in the center of each cell 101 | for i in range(num_cells_x): 102 | for j in range(num_cells_y): 103 | number = i * num_cells_y + j + 1 104 | text = str(number) 105 | x = (i + 1) * cell_width 106 | y = (j + 1) * cell_height 107 | draw.ellipse( 108 | [ 109 | x - circle_radius, 110 | y - circle_radius, 111 | x + circle_radius, 112 | y + circle_radius, 113 | ], 114 | fill=color_circle, 115 | ) 116 | offset_x = font_size / 4 if number < 10 else font_size / 2 117 | draw.text( 118 | (x - offset_x, y - font_size / 2), text, font=font, fill=color_text 119 | ) 120 | 121 | return img 122 | 123 | 124 | def create_grid_image_by_size( 125 | image_width: int, 126 | image_height: int, 127 | cell_size: int = 10, 128 | color_circle: str = "red", 129 | color_text: str = "yellow", 130 | ) -> Image.Image: 131 | """Create a grid image with numbered cells. 132 | 133 | Args: 134 | image_width (int): Total width of the image. 135 | image_height (int): Total height of the image. 136 | cell_size (int): Width and height of each cell. 137 | color_circle (str): Color of the circles. Defaults to 'red' 138 | color_text (str): Color of the text. Defaults to 'yellow' 139 | 140 | Returns: 141 | Image.Image: The image with a grid. 142 | """ 143 | num_cells_x = image_width // cell_size 144 | num_cells_y = image_height // cell_size 145 | font_size = max(cell_size // 5, 10) 146 | circle_radius = ( 147 | cell_size // 2 - 2 148 | ) # Slightly smaller than half the cell for visual appeal 149 | 150 | # Create a blank image 151 | img = Image.new("RGBA", (image_width, image_height), (0, 0, 0, 0)) 152 | draw = ImageDraw.Draw(img) 153 | 154 | # Load a font 155 | try: 156 | font = ImageFont.truetype("arialbd.ttf", font_size) 157 | except IOError: 158 | font = ImageFont.load_default() 159 | print("Custom font not found. Using default font.") 160 | 161 | # Draw the grid 162 | for i in range(num_cells_x): 163 | for j in range(num_cells_y): 164 | number = i * num_cells_y + j + 1 165 | text = str(number) 166 | x_center = (i + 0.5) * cell_size 167 | y_center = (j + 0.5) * cell_size 168 | 169 | # Draw circle 170 | draw.ellipse( 171 | [ 172 | x_center - circle_radius, 173 | y_center - circle_radius, 174 | x_center + circle_radius, 175 | y_center + circle_radius, 176 | ], 177 | fill=color_circle, 178 | ) 179 | 180 | # Calculate text offset for centering using getbbox() 181 | bbox = font.getbbox(text) 182 | text_width = bbox[2] - bbox[0] 183 | text_height = bbox[3] - bbox[1] 184 | draw.text( 185 | (x_center - text_width / 2, y_center - text_height / 2), 186 | text, 187 | font=font, 188 | fill=color_text, 189 | ) 190 | 191 | return img 192 | 193 | 194 | def zoom_in( 195 | img: Image.Image, box: Box, num_cells: int, selected: int 196 | ) -> Tuple[Image.Image, Box]: 197 | """Zoom in on the selected cell. 
198 | 199 | Args: 200 | img (Image.Image): The image to zoom in 201 | box (Box): The box to zoom into 202 | num_cells (int): Number of cells to use. 203 | selected (int): The selected cell 204 | 205 | Returns: 206 | Tuple[Image.Image, Box]: Cropped image and asociated box 207 | """ 208 | new_box = box.zoom_in(selected, num_cells) 209 | absolute_box = new_box.to_absolute(box) 210 | cropped_img = new_box.crop_image(img) 211 | return cropped_img, absolute_box 212 | 213 | 214 | def superimpose_images( 215 | base: Image.Image, layer: Image.Image, opacity: float = 1 216 | ) -> Image.Image: 217 | """ 218 | 219 | Args: 220 | base (Image.Image): Base image 221 | layer (Image.Image): Layered image 222 | opacity (float): How much opacity the layer should have. Defaults to 1. 223 | 224 | Returns: 225 | Image.Image: The superimposed image 226 | """ 227 | # Ensure both images have the same size 228 | if base.size != layer.size: 229 | raise ValueError("Images must have the same dimensions.") 230 | 231 | # Convert the images to RGBA mode if they are not already 232 | base = base.convert("RGBA") 233 | layer = layer.convert("RGBA") 234 | 235 | # Create a new image with the same size as the input images 236 | merged_image = Image.new("RGBA", base.size) 237 | 238 | # Convert image1 to grayscale 239 | base = base.convert("L") 240 | 241 | # Paste image1 onto the merged image 242 | merged_image.paste(base, (0, 0)) 243 | 244 | # Create a new image for image2 with adjusted opacity 245 | image2_with_opacity = Image.blend( 246 | Image.new("RGBA", layer.size, (0, 0, 0, 0)), layer, opacity 247 | ) 248 | 249 | # Paste image2 with opacity onto the merged image 250 | merged_image = Image.alpha_composite(merged_image, image2_with_opacity) 251 | 252 | return merged_image 253 | 254 | 255 | def image_to_b64(img: Image.Image, image_format="PNG") -> str: 256 | """Converts a PIL Image to a base64-encoded string with MIME type included. 257 | 258 | Args: 259 | img (Image.Image): The PIL Image object to convert. 260 | image_format (str): The format to use when saving the image (e.g., 'PNG', 'JPEG'). 261 | 262 | Returns: 263 | str: A base64-encoded string of the image with MIME type. 264 | """ 265 | buffer = BytesIO() 266 | img.save(buffer, format=image_format) 267 | image_data = buffer.getvalue() 268 | buffer.close() 269 | 270 | mime_type = f"image/{image_format.lower()}" 271 | base64_encoded_data = base64.b64encode(image_data).decode("utf-8") 272 | return f"data:{mime_type};base64,{base64_encoded_data}" 273 | 274 | 275 | def b64_to_image(base64_str: str) -> Image.Image: 276 | """Converts a base64 string to a PIL Image object. 277 | 278 | Args: 279 | base64_str (str): The base64 string, potentially with MIME type as part of a data URI. 280 | 281 | Returns: 282 | Image.Image: The converted PIL Image object. 
283 | """ 284 | # Strip the MIME type prefix if present 285 | if "," in base64_str: 286 | base64_str = base64_str.split(",")[1] 287 | 288 | image_data = base64.b64decode(base64_str) 289 | image = Image.open(BytesIO(image_data)) 290 | return image 291 | 292 | 293 | def load_image_base64(filepath: str) -> str: 294 | # Load the image from the file path 295 | image = Image.open(filepath) 296 | buffered = BytesIO() 297 | 298 | # Save image to the buffer 299 | image_format = image.format if image.format else "PNG" 300 | image.save(buffered, format=image_format) 301 | 302 | # Encode the image to base64 303 | img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") 304 | 305 | # Prepare the mime type 306 | mime_type = f"image/{image_format.lower()}" 307 | 308 | # Return base64 string with mime type 309 | return f"data:{mime_type};base64,{img_str}" 310 | -------------------------------------------------------------------------------- /robbieg2/clicker.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | import os 4 | import re 5 | 6 | from typing import List, Optional, Tuple 7 | 8 | from PIL import Image, ImageDraw 9 | from difflib import SequenceMatcher 10 | from mllm import RoleMessage, RoleThread, Router 11 | from pydantic import BaseModel, Field 12 | from rich.console import Console 13 | from rich.json import JSON 14 | 15 | from .img import Box, image_to_b64 16 | from .grid import create_grid_image, zoom_in, superimpose_images 17 | from .easyocr import find_all_text_with_bounding_boxes 18 | from .canny_composite import create_composite 19 | 20 | router = Router.from_env() 21 | console = Console() 22 | 23 | logger = logging.getLogger(__name__) 24 | logger.setLevel(int(os.getenv("LOG_LEVEL", logging.DEBUG))) 25 | 26 | COLOR_NUMBER = os.getenv("COLOR_NUMBER", "yellow") 27 | COLOR_CIRCLE = os.getenv("COLOR_CIRCLE", "red") 28 | GRID_SIZE = int(os.getenv("GRID_SIZE", 8)) # Amound of grid points equals (N-1)^2 29 | NUM_CLUSTERS = int( 30 | os.getenv("NUM_CLUSTERS", 10) 31 | ) # Number of clusters to use for the composite method 32 | UPSCALE_FACTOR = int( 33 | os.getenv("UPSCALE_FACTOR", 3) 34 | ) # How much we upscale the image on each step of zooming in 35 | FIRST_OCR_THRESHOLD = float( 36 | os.getenv("FIRST_OCR_THRESHOLD", 0.9) 37 | ) # Threshold for the first OCR pass 38 | SECOND_OCR_THRESHOLD = float( 39 | os.getenv("SECOND_OCR_THRESHOLD", 0.7) 40 | ) # Threshold for the second OCR pass 41 | 42 | 43 | class ZoomSelection(BaseModel): 44 | """Zoom selection model""" 45 | 46 | number: int = Field(..., description="The number of the selected circle") 47 | 48 | 49 | class CompositeSelection(BaseModel): 50 | """Composite selection model""" 51 | 52 | number: int = Field( 53 | ..., description="The number of the selected section of the composite" 54 | ) 55 | 56 | 57 | def recall_best_method_on_first_iteration(description: str) -> str: 58 | memory = { 59 | "calendar": run_grid, 60 | "date": run_grid, 61 | "link": run_composite, 62 | "icon": run_composite, 63 | } 64 | for key, value in memory.items(): 65 | if key in description.lower(): 66 | return value 67 | return run_composite 68 | 69 | 70 | def recall_best_method_on_second_iteration(description: str) -> str: 71 | memory = { 72 | "calendar": run_grid, 73 | "date": run_grid, 74 | "link": run_grid, 75 | "icon": run_composite, 76 | } 77 | for key, value in memory.items(): 78 | if key in description.lower(): 79 | return value 80 | return run_grid 81 | 82 | 83 | def 
find_coordinates(semdesk, description: str, text: str) -> dict: 84 | click_hash = hashlib.md5(description.encode()).hexdigest()[:5] 85 | bounding_boxes = [] 86 | search_text = text 87 | if not search_text: 88 | matches = re.findall(r"['\"](.*?)['\"]", description) 89 | if len(matches) >= 2: 90 | search_text = None 91 | elif len(matches) == 1: 92 | search_text = matches[0] 93 | else: 94 | search_text = None 95 | 96 | # - Setting up the stage 97 | 98 | starting_img = semdesk.desktop.take_screenshots()[0] 99 | starting_img_path = os.path.join(semdesk.img_path, f"{click_hash}_starting.png") 100 | starting_img.save(starting_img_path) 101 | bounding_boxes.append(Box(0, 0, starting_img.width, starting_img.height)) 102 | 103 | method = recall_best_method_on_first_iteration(description) 104 | 105 | # - OCR on the starting image 106 | 107 | if search_text and len(search_text) > 3: 108 | semdesk.task.post_message( 109 | role="Clicker", 110 | msg=f"Attempting OCR for: {search_text}", 111 | thread="debug", 112 | ) 113 | ocr_results = find_all_text_with_bounding_boxes(starting_img_path) 114 | 115 | best_matches = [ 116 | box 117 | for box in ocr_results 118 | if similarity_ratio(box["text"], search_text) >= FIRST_OCR_THRESHOLD 119 | ] 120 | if len(best_matches) == 1: 121 | best_match = best_matches[0] 122 | x_mid = best_match["x"] + best_match["w"] // 2 123 | y_mid = best_match["y"] + best_match["h"] // 2 124 | bounding_boxes.append( 125 | Box( 126 | best_match["x"], 127 | best_match["y"], 128 | best_match["x"] + best_match["w"], 129 | best_match["y"] + best_match["h"], 130 | ) 131 | ) 132 | semdesk.task.post_message( 133 | role="Clicker", 134 | msg=f"Found best matching text: '{best_match['text']}'", 135 | thread="debug", 136 | ) 137 | # FIRST POINT OF POTENTIAL RESULT RETURN 138 | semdesk.results["first_ocr"] += 1 139 | debug_img = _debug_image( 140 | starting_img.copy(), bounding_boxes, (x_mid, y_mid) 141 | ) 142 | semdesk.task.post_message( 143 | role="Clicker", 144 | msg="Final debug img", 145 | thread="debug", 146 | images=[image_to_b64(debug_img)], 147 | ) 148 | return {"x": x_mid, "y": y_mid} 149 | else: 150 | semdesk.task.post_message( 151 | role="Clicker", 152 | msg=f"Found {len(best_matches)} best matches for text '{search_text}'. 
Continue...", 153 | thread="debug", 154 | ) 155 | else: 156 | semdesk.task.post_message( 157 | role="Clicker", 158 | msg="No text to look for in the starting image.", 159 | thread="debug", 160 | ) 161 | 162 | # - Finding a region where the element of interest is located 163 | 164 | semdesk.task.post_message( 165 | role="Clicker", 166 | msg="Looking for a region of interest...", 167 | thread="debug", 168 | ) 169 | region_of_interest, bounding_box = method( 170 | semdesk, starting_img, starting_img_path, description, click_hash, "region" 171 | ) 172 | 173 | # Escape exit, if we didn't find the region of interest because the element is not on a screen: 174 | # we fall back to bruteforce method 175 | if region_of_interest is None: 176 | return backup_find_coordinates(semdesk, description) 177 | 178 | region_of_interest_b64 = image_to_b64(region_of_interest) 179 | semdesk.task.post_message( 180 | role="Clicker", 181 | msg="Found region of interest", 182 | thread="debug", 183 | images=[region_of_interest_b64], 184 | ) 185 | bounding_boxes.append(bounding_box) 186 | region_of_interest_path = os.path.join( 187 | semdesk.img_path, f"{click_hash}_region_of_interest.png" 188 | ) 189 | region_of_interest.save(region_of_interest_path) 190 | 191 | # - OCR on the region we found 192 | if search_text: 193 | semdesk.task.post_message( 194 | role="Clicker", 195 | msg=f"Attempting OCR for: {search_text} on a region of interest", 196 | thread="debug", 197 | ) 198 | zoomed_region_of_interest = region_of_interest.copy() 199 | zoomed_region_of_interest = zoomed_region_of_interest.resize( 200 | ( 201 | zoomed_region_of_interest.width * UPSCALE_FACTOR, 202 | zoomed_region_of_interest.height * UPSCALE_FACTOR, 203 | ), 204 | resample=0, 205 | ) 206 | zoomed_region_of_interest_path = os.path.join( 207 | semdesk.img_path, f"{click_hash}_zoomed_region_of_interest.png" 208 | ) 209 | zoomed_region_of_interest.save(zoomed_region_of_interest_path) 210 | ocr_results = find_all_text_with_bounding_boxes(zoomed_region_of_interest_path) 211 | best_matches = [ 212 | box 213 | for box in ocr_results 214 | if similarity_ratio(box["text"], search_text) >= SECOND_OCR_THRESHOLD 215 | ] 216 | 217 | # We trust OCR only of exactly one match over the threshold is found. Otherwise, we fall back to Grid/Composite. 218 | if len(best_matches) != 1: 219 | semdesk.task.post_message( 220 | role="Clicker", 221 | msg=f"No sufficiently similar text found. 
Found {len(best_matches)} ({best_matches})'", 222 | thread="debug", 223 | ) 224 | else: 225 | best_match = best_matches[0] 226 | relative_box = Box( 227 | best_match["x"], 228 | best_match["y"], 229 | best_match["x"] + best_match["w"], 230 | best_match["y"] + best_match["h"], 231 | ) 232 | absolute_box = relative_box.to_absolute_with_upscale( 233 | bounding_boxes[-1], UPSCALE_FACTOR 234 | ) 235 | x_mid, y_mid = absolute_box.center() 236 | bounding_boxes.append(absolute_box) 237 | semdesk.task.post_message( 238 | role="Clicker", 239 | msg=f"Found best matching text: '{best_match['text']}'", 240 | thread="debug", 241 | ) 242 | # SECOND POINT OF POTENTIAL RESULT RETURN 243 | semdesk.results["second_ocr"] += 1 244 | debug_img = _debug_image( 245 | starting_img.copy(), bounding_boxes, (x_mid, y_mid) 246 | ) 247 | semdesk.task.post_message( 248 | role="Clicker", 249 | msg="Final debug img", 250 | thread="debug", 251 | images=[image_to_b64(debug_img)], 252 | ) 253 | return {"x": x_mid, "y": y_mid} 254 | 255 | # - Two passes of Grid/Composite + Zoom 256 | 257 | total_upscale = 1 258 | method = recall_best_method_on_second_iteration(description) 259 | 260 | region = region_of_interest.copy() 261 | region = region.resize( 262 | (region.width * UPSCALE_FACTOR, region.height * UPSCALE_FACTOR), resample=0 263 | ) 264 | region_of_interest_path = os.path.join( 265 | semdesk.img_path, f"{click_hash}_region_of_interest_zoom_1.png" 266 | ) 267 | region.save(region_of_interest_path) 268 | total_upscale *= UPSCALE_FACTOR 269 | new_region_of_interest, relative_bounding_box = method( 270 | semdesk, region, region_of_interest_path, description, click_hash, "zoom_1" 271 | ) 272 | 273 | # Escape exit, if we didn't find the region of interest because the element is not on a screen: 274 | # we fall back to bruteforce method 275 | if new_region_of_interest is None: 276 | return backup_find_coordinates(semdesk, description) 277 | 278 | absolute_box_zoomed = relative_bounding_box.to_absolute_with_upscale( 279 | bounding_boxes[-1], total_upscale 280 | ) 281 | bounding_boxes.append(absolute_box_zoomed) 282 | 283 | region = new_region_of_interest.copy() 284 | region = region.resize( 285 | (region.width * UPSCALE_FACTOR, region.height * UPSCALE_FACTOR), resample=0 286 | ) 287 | region_of_interest_path = os.path.join( 288 | semdesk.img_path, f"{click_hash}_region_of_interest_zoom_2.png" 289 | ) 290 | region.save(region_of_interest_path) 291 | total_upscale *= UPSCALE_FACTOR 292 | last_region_of_interest, relative_bounding_box = method( 293 | semdesk, region, region_of_interest_path, description, click_hash, "zoom_2" 294 | ) 295 | 296 | # Escape exit, if we didn't find the region of interest because the element is not on a screen: 297 | # we fall back to bruteforce method 298 | if last_region_of_interest is None: 299 | return backup_find_coordinates(semdesk, description) 300 | 301 | absolute_box_zoomed = relative_bounding_box.to_absolute_with_upscale( 302 | bounding_boxes[-1], total_upscale 303 | ) 304 | bounding_boxes.append(absolute_box_zoomed) 305 | 306 | x_mid, y_mid = bounding_boxes[-1].center() 307 | logger.info(f"clicking exact coords {x_mid}, {y_mid}") 308 | semdesk.task.post_message( 309 | role="Clicker", 310 | msg=f"Clicking coordinates {x_mid}, {y_mid}", 311 | thread="debug", 312 | ) 313 | 314 | # LAST POINT OF POTENTIAL RETURN - WE ALWAYS RETURN SOMETHING FROM HERE, UNLESS THERE WAS AN EXCEPTION 315 | semdesk.results["full_grid"] += 1 316 | debug_img = _debug_image(starting_img.copy(), bounding_boxes, (x_mid, 
y_mid)) 317 | semdesk.task.post_message( 318 | role="Clicker", 319 | msg="Final debug img", 320 | thread="debug", 321 | images=[image_to_b64(debug_img)], 322 | ) 323 | return {"x": x_mid, "y": y_mid} 324 | 325 | 326 | def backup_find_coordinates(semdesk, description: str) -> dict: 327 | # This is a backup method of finding coordinates to click on. If the core method above fails at some point, 328 | # with a region_of_interest being 0 (i.e. not found), we try once again through the most brute-force mechanics that we have: 329 | # running three levels of Grid + Zoom In; if that fails too, then we go back to Big Brain and ask some questions. 330 | semdesk.task.post_message( 331 | role="Clicker", 332 | msg="Coordinates not found. Falling back to brute-force 3-level Grid Zoom In.", 333 | thread="debug", 334 | ) 335 | 336 | click_hash = hashlib.md5(description.encode()).hexdigest()[:5] 337 | bounding_boxes = [] 338 | total_upscale = 1 339 | method = run_grid 340 | 341 | starting_img = semdesk.desktop.take_screenshots()[0] 342 | starting_img_path = os.path.join(semdesk.img_path, f"{click_hash}_starting.png") 343 | starting_img.save(starting_img_path) 344 | bounding_boxes.append(Box(0, 0, starting_img.width, starting_img.height)) 345 | 346 | region_of_interest = starting_img.copy() 347 | 348 | for i in [0, 1, 2]: 349 | semdesk.task.post_message( 350 | role="Clicker", 351 | msg=f"Zooming in, level {i}...", 352 | thread="debug", 353 | ) 354 | 355 | region = region_of_interest.copy() 356 | region = region.resize( 357 | (region.width * UPSCALE_FACTOR, region.height * UPSCALE_FACTOR), resample=0 358 | ) 359 | region_of_interest_path = os.path.join( 360 | semdesk.img_path, f"{click_hash}_grid_region_{i}.png" 361 | ) 362 | region.save(region_of_interest_path) 363 | total_upscale *= UPSCALE_FACTOR 364 | region_of_interest, relative_bounding_box = method( 365 | semdesk, 366 | region, 367 | region_of_interest_path, 368 | description, 369 | click_hash, 370 | f"zoom_{i}", 371 | ) 372 | 373 | # Escape exit, if we didn't find the region of interest because the element is not on the screen. 374 | if region_of_interest is None: 375 | semdesk.task.post_message( 376 | role="Clicker", 377 | msg=f"Failed to find {description} on the image. 
Getting back to Actor.", 378 | thread="debug", 379 | ) 380 | return None 381 | 382 | absolute_box_zoomed = relative_bounding_box.to_absolute_with_upscale( 383 | bounding_boxes[-1], total_upscale 384 | ) 385 | bounding_boxes.append(absolute_box_zoomed) 386 | 387 | x_mid, y_mid = bounding_boxes[-1].center() 388 | logger.info(f"clicking exact coords {x_mid}, {y_mid}") 389 | semdesk.task.post_message( 390 | role="Clicker", 391 | msg=f"Clicking coordinates {x_mid}, {y_mid}", 392 | thread="debug", 393 | ) 394 | 395 | # LAST POINT OF POTENTIAL RETURN - WE ALWAYS RETURN SOMETHING FROM HERE, UNLESS THERE WAS AN EXCEPTION 396 | semdesk.results["full_grid"] += 1 397 | debug_img = _debug_image(starting_img.copy(), bounding_boxes, (x_mid, y_mid)) 398 | semdesk.task.post_message( 399 | role="Clicker", 400 | msg="Final debug img", 401 | thread="debug", 402 | images=[image_to_b64(debug_img)], 403 | ) 404 | return {"x": x_mid, "y": y_mid} 405 | 406 | 407 | def similarity_ratio(a, b): 408 | return SequenceMatcher(None, a.lower(), b.lower()).ratio() 409 | 410 | 411 | def run_grid( 412 | semdesk, 413 | starting_image: Image.Image, 414 | starting_path: str, 415 | description: str, 416 | click_hash: str, 417 | postfix: str, 418 | ) -> dict: 419 | img_width, img_height = starting_image.size 420 | starting_image_b64 = image_to_b64(starting_image) 421 | 422 | grid_path = os.path.join(semdesk.img_path, f"{click_hash}_grid_{postfix}.png") 423 | create_grid_image( 424 | img_width, img_height, COLOR_CIRCLE, COLOR_NUMBER, GRID_SIZE, grid_path 425 | ) 426 | 427 | merged_image_path = os.path.join( 428 | semdesk.img_path, f"{click_hash}_merge_{postfix}.png" 429 | ) 430 | merged_image = superimpose_images(starting_path, grid_path, 1) 431 | merged_image.save(merged_image_path) 432 | 433 | merged_image_b64 = image_to_b64(merged_image) 434 | semdesk.task.post_message( 435 | role="Clicker", 436 | msg="Merged image", 437 | thread="debug", 438 | images=[merged_image_b64], 439 | ) 440 | 441 | thread = RoleThread() 442 | 443 | prompt = f""" 444 | You are an experienced AI trained to find the elements on the screen. 445 | You see a screenshot of the web application. 446 | I have drawn some big {COLOR_NUMBER} numbers on {COLOR_CIRCLE} circles on this image 447 | to help you to find required elements. 448 | Please tell me the closest big {COLOR_NUMBER} number on a {COLOR_CIRCLE} circle to the center of the {description}. 449 | 450 | It may be the case, there is no {description} anywhere on the screenshot that you see. 451 | If you are very sure that there is no {description} anywhere on the screenshot that you see, please return {{"number": 0}}. 452 | 453 | Please note that some circles may lay on the {description}. If that's the case, return the number in any of these circles. 454 | If the {description} is a long object, please pick the circle that is closest to the left top corner of the {description}. 455 | I have also attached the entire screenshot without these numbers for your reference. 
456 | 457 | Please return your response as raw JSON following the schema {ZoomSelection.model_json_schema()} 458 | Be concise and only return the raw JSON; for example, if the circle you wanted to select had the number 3 in it, 459 | you would return {{"number": 3}} 460 | """ 461 | 462 | msg = RoleMessage( 463 | role="user", 464 | text=prompt, 465 | images=[merged_image_b64, starting_image_b64], 466 | ) 467 | thread.add_msg(msg) 468 | 469 | try: 470 | response = router.chat( 471 | thread, 472 | namespace="grid", 473 | expect=ZoomSelection, 474 | agent_id="RobbieG2", 475 | retries=1, 476 | ) 477 | if not response.parsed: 478 | raise SystemError("No response parsed from zoom") 479 | 480 | semdesk.task.add_prompt(response.prompt) 481 | 482 | zoom_resp = response.parsed 483 | semdesk.task.post_message( 484 | role="Clicker", 485 | msg=f"Selection {zoom_resp.model_dump_json()}", 486 | thread="debug", 487 | ) 488 | console.print(JSON(zoom_resp.model_dump_json())) 489 | chosen_number = zoom_resp.number 490 | except Exception as e: 491 | logger.info(f"Error in analyzing grid: {e}.") 492 | return None, None 493 | if chosen_number == 0: 494 | return None, None 495 | 496 | region_of_interest, top_left, bottom_right = zoom_in( 497 | starting_path, GRID_SIZE, chosen_number, 1 498 | ) 499 | bounding_box = Box(top_left[0], top_left[1], bottom_right[0], bottom_right[1]) 500 | return region_of_interest, bounding_box 501 | 502 | 503 | def run_composite( 504 | semdesk, 505 | starting_image: Image.Image, 506 | starting_path: str, 507 | description: str, 508 | click_hash: str, 509 | postfix: str, 510 | ) -> dict: 511 | composite_path = os.path.join( 512 | semdesk.img_path, f"{click_hash}_composite_{postfix}.png" 513 | ) 514 | composite_pil, bounding_boxes = create_composite(starting_path, NUM_CLUSTERS) 515 | composite_pil.save(composite_path) 516 | composite_b64 = image_to_b64(composite_pil) 517 | 518 | starting_image_b64 = image_to_b64(starting_image) 519 | 520 | semdesk.task.post_message( 521 | role="Clicker", 522 | msg="Composite image", 523 | thread="debug", 524 | images=[composite_b64], 525 | ) 526 | 527 | thread = RoleThread() 528 | 529 | prompt = f""" 530 | You are an experienced AI trained to find the elements on the screen. 531 | You see a composite of several sections of the screenshot of the web application. 532 | You also see the entire screenshot for reference. 533 | 534 | I have drawn some big {COLOR_NUMBER} numbers on the left panel of the composite image. 535 | Please tell me the number of the section of the composite image that contains the {description}. 536 | 537 | It may be the case that there is no {description} anywhere on the screenshot that you see. 538 | If you are very sure that there is no {description} anywhere on the screenshot that you see, please return {{"number": 0}}. 
539 | 540 | Please return you response as raw JSON following the schema {CompositeSelection.model_json_schema()} 541 | Be concise and only return the raw json, for example if the section has a number 3, 542 | you should return {{"number": 3}} 543 | """ 544 | 545 | msg = RoleMessage( 546 | role="user", 547 | text=prompt, 548 | images=[composite_b64, starting_image_b64], 549 | ) 550 | thread.add_msg(msg) 551 | 552 | try: 553 | response = router.chat( 554 | thread, 555 | namespace="composite", 556 | expect=CompositeSelection, 557 | agent_id="RobbieG2", 558 | retries=1, 559 | ) 560 | if not response.parsed: 561 | raise SystemError("No response parsed from zoom") 562 | 563 | semdesk.task.add_prompt(response.prompt) 564 | 565 | composite_resp = response.parsed 566 | semdesk.task.post_message( 567 | role="Clicker", 568 | msg=f"Selection {composite_resp.model_dump_json()}", 569 | thread="debug", 570 | ) 571 | console.print(JSON(composite_resp.model_dump_json())) 572 | chosen_number = composite_resp.number 573 | except Exception as e: 574 | logger.info(f"Error in analyzing composite: {e}.") 575 | return None, None 576 | 577 | if chosen_number == 0: 578 | return None, None 579 | 580 | bounding_box = bounding_boxes[chosen_number - 1] 581 | top_left = (bounding_box[0], bounding_box[1]) 582 | bottom_right = ( 583 | bounding_box[0] + bounding_box[2], 584 | bounding_box[1] + bounding_box[3], 585 | ) 586 | region_of_interest = starting_image.crop( 587 | (top_left[0], top_left[1], bottom_right[0], bottom_right[1]) 588 | ) 589 | box = Box(top_left[0], top_left[1], bottom_right[0], bottom_right[1]) 590 | 591 | return region_of_interest, box 592 | 593 | 594 | def _debug_image( 595 | img: Image.Image, 596 | boxes: List[Box], 597 | final_click: Optional[Tuple[int, int]] = None, 598 | ) -> Image.Image: 599 | draw = ImageDraw.Draw(img) 600 | for box in boxes: 601 | box.draw(draw) 602 | 603 | if final_click: 604 | draw.ellipse( 605 | [ 606 | final_click[0] - 5, 607 | final_click[1] - 5, 608 | final_click[0] + 5, 609 | final_click[1] + 5, 610 | ], 611 | fill="red", 612 | outline="red", 613 | ) 614 | return img 615 | -------------------------------------------------------------------------------- /robbieg2/agent.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import traceback 5 | from typing import Final, List, Optional, Tuple, Type 6 | 7 | from agentdesk.device_v1 import Desktop 8 | from devicebay import Device 9 | from pydantic import BaseModel, Field 10 | from rich.console import Console 11 | from rich.json import JSON 12 | from skillpacks.server.models import V1Action 13 | from surfkit.agent import TaskAgent 14 | from taskara import Task, TaskStatus 15 | from tenacity import before_sleep_log, retry, stop_after_attempt 16 | from threadmem import RoleMessage, RoleThread 17 | from mllm import ChatResponse 18 | from toolfuse.util import AgentUtils 19 | 20 | from .tool import SemanticDesktop, router 21 | from .clicker import similarity_ratio 22 | from .cheap_critic import assess_action_result 23 | 24 | 25 | logging.basicConfig(level=logging.INFO) 26 | logger: Final = logging.getLogger(__name__) 27 | logger.setLevel(int(os.getenv("LOG_LEVEL", str(logging.DEBUG)))) 28 | 29 | console = Console(force_terminal=True) 30 | 31 | 32 | class RobbieG2Config(BaseModel): 33 | pass 34 | 35 | 36 | class ActorThoughts(BaseModel): 37 | """An represention of thoughts of the Actor part of the brain.""" 38 | 39 | observation: str = Field( 40 | ..., 
description="Observations of the current state of the environment" 41 | ) 42 | reason: str = Field( 43 | ..., 44 | description="The reason why this action was chosen, explaining the logic or rationale behind the decision.", 45 | ) 46 | action: V1Action = Field( 47 | ..., 48 | description="The action object detailing the specific action to be taken, including its name and parameters.", 49 | ) 50 | 51 | class NeocortexPrediction(BaseModel): 52 | """An represention of thoughts of the Neocortex part of the brain.""" 53 | 54 | prediction: str = Field( 55 | ..., description="Prediction about the state of the environment after the current action" 56 | ) 57 | reason: str = Field( 58 | ..., 59 | description="The reason why the next action is chosen, explaining the logic or rationale behind the decision.", 60 | ) 61 | action: V1Action = Field( 62 | ..., 63 | description="The action object detailing the next action to be taken after the current action takes place, including its name and parameters.", 64 | ) 65 | 66 | class NeocortexThoughts(BaseModel): 67 | """An represention of thoughts of the Neocortex part of the brain.""" 68 | 69 | prediction_1: NeocortexPrediction = Field( 70 | ..., description="Prediction about the state of the environment after the current action, chosen by Actor, and the most appropriate next action" 71 | ) 72 | prediction_2: NeocortexPrediction = Field( 73 | ..., description="Prediction about the state of the environment after the first predicted action, and the most appropriate action after that" 74 | ) 75 | 76 | class CriticThoughts(BaseModel): 77 | """An represention of thoughts of the Critic part of the brain.""" 78 | 79 | critic: str = Field(..., description="Critic's thoughts about whether the current state of environment corresponds to a given task, and if not, now to recover.") 80 | 81 | class BrainThoughts(BaseModel): 82 | """An represention of thoughts of the whole brain.""" 83 | 84 | critic: CriticThoughts = Field(..., description="Thoughts of the Critic part of the brain.") 85 | 86 | actor: ActorThoughts = Field(..., description="Thoughts of the Actor part of the brain.") 87 | 88 | neocortex: NeocortexThoughts = Field(..., description="Thoughts of the Neocortex part of the brain.") 89 | 90 | class InterruptionCriticThoughts(BaseModel): 91 | """A representation of thoughts of the Critic which was interrupted because we repeat the same actions again and again.""" 92 | 93 | critic: str = Field(..., description="Critic's assessment on whether taking the current action is a good idea and whether the previous similar action were appropriate and successful.") 94 | 95 | action: V1Action = Field(..., description="The most appropripriate next action given the entire situation.") 96 | 97 | 98 | class RobbieG2(TaskAgent): 99 | """A GUI desktop agent that slices up the image""" 100 | 101 | def __init__(self, *args, **kwargs): 102 | super().__init__(*args, **kwargs) 103 | self.past_actions = [] 104 | 105 | 106 | def record_action(self, action: dict) -> None: 107 | self.past_actions.append(action) 108 | 109 | 110 | def find_the_closest_actions(self, action: V1Action, depth: int = 10, threshold: float = 0.8) -> [V1Action]: 111 | recent_actions = self.past_actions[-depth:] 112 | closest_actions = [] 113 | 114 | for past_action in reversed(recent_actions): 115 | if action.name == "type_text" or action.name == "click_object": 116 | action_params = str(action.parameters) 117 | past_action_parames = str(past_action.parameters) 118 | similarity = similarity_ratio(action_params, 
past_action_parames) 119 | if similarity > threshold: 120 | closest_actions.append(past_action) 121 | else: 122 | action_str = str(action) 123 | past_action_str = str(past_action) 124 | similarity = similarity_ratio(action_str, past_action_str) 125 | if similarity > 0.95: 126 | closest_actions.append(past_action) 127 | 128 | return closest_actions 129 | 130 | 131 | def solve_task( 132 | self, 133 | task: Task, 134 | device: Optional[Device] = None, 135 | max_steps: int = 30, 136 | ) -> Task: 137 | """Solve a task 138 | 139 | Args: 140 | task (Task): Task to solve. 141 | device (Device): Device to perform the task on. 142 | max_steps (int, optional): Max steps to try and solve. Defaults to 30. 143 | 144 | Returns: 145 | Task: The task 146 | """ 147 | start_time = time.time() # Start time measurement 148 | 149 | # Post a message to the default thread to let the user know the task is in progress 150 | task.post_message("Actor", f"Starting task '{task.description}'") 151 | 152 | # Create threads in the task to update the user 153 | console.print("creating threads...") 154 | task.ensure_thread("debug") 155 | task.post_message("Actor", "I'll post debug messages here", thread="debug") 156 | 157 | # Check that the device we received is one we support 158 | if not isinstance(device, Desktop): 159 | raise ValueError("Only desktop devices supported") 160 | 161 | # Wrap the standard desktop in our special tool 162 | semdesk = SemanticDesktop(task=task, desktop=device) 163 | 164 | # Add standard agent utils to the device 165 | semdesk.merge(AgentUtils()) 166 | 167 | # Open a site if present in the parameters 168 | site = task._parameters.get("site") if task._parameters else None 169 | if site: 170 | console.print(f"▶️ opening site url: {site}", style="blue") 171 | task.post_message("Body", f"opening site url {site}...") 172 | semdesk.desktop.open_url(site) 173 | console.print("waiting for browser to open...", style="blue") 174 | time.sleep(10) 175 | 176 | # Get info about the desktop 177 | info = semdesk.desktop.info() 178 | screen_size = info["screen_size"] 179 | console.print(f"Screen size: {screen_size}") 180 | 181 | # Get the json schema for the tools, excluding actions that aren't useful 182 | tools = semdesk.json_schema( 183 | exclude_names=[ 184 | "move_mouse", 185 | "click", 186 | "drag_mouse", 187 | "mouse_coordinates", 188 | "take_screenshots", 189 | "open_url", 190 | "double_click", 191 | ] 192 | ) 193 | console.print("tools: ", style="purple") 194 | console.print(JSON.from_data(tools)) 195 | 196 | starting_prompt = f""" 197 | You are RobbieG2, an advanced AI agent designed to navigate and interact with web interfaces. Your capabilities include: 198 | 199 | 1. Mouse control: 200 | - Move the mouse to specific coordinates 201 | - Click (single or double) at current or specified locations 202 | - Retrieve current mouse coordinates 203 | 204 | 2. Keyboard input: 205 | - Send key commands, including special keys like Tab, Enter, and arrow keys 206 | - Type text into form fields 207 | 208 | 3. Navigation: 209 | - Use Tab key to move through form elements 210 | - Scroll web pages 211 | 212 | 4. Visual analysis: 213 | - Take screenshots of the current view 214 | 215 | 5. 
Advanced interaction: 216 | - Click on objects based on semantic descriptions 217 | 218 | *** Firefox Commands 219 | 220 | Specifically, if you are using the Firefox browser, remember that you can use the following key commands: 221 | 222 | * Press Ctrl + L or Alt + D to highlight the URL, then press Delete to clear it if there is incorrect text in the URL bar that you need to clear out. 223 | * To clear the text in a field do the following: First, ensure the field is in focus BEFORE using this command. Then use Ctrl + A and then Backspace or Delete: This command first highlights all text in a field and then deletes that text. 224 | * Ctrl + Shift + Tab switches to the previous tab 225 | * Ctrl + Tab switches to the next tab 226 | * Press Backspace or Alt + Left Arrow to go to the previous page in your browsing history for the tab 227 | * Press Shift + Backspace or Alt + Right Arrow to go to the next page in your browsing history for the tab 228 | * Press F6 or Shift + F6 to switch focus to the next keyboard-accessible pane, which includes: 229 | 230 | Highlights the URL in the address bar 231 | Bookmarks bar (if visible) 232 | The main web content 233 | Downloads bar (if visible) 234 | 235 | 236 | *** Chrome Commands 237 | 238 | Specifically, if you are using the Chrome browser, remember that you can use the following key commands: 239 | 240 | * Press Ctrl + L or Alt + D to highlight the URL, then press Delete to clear it if there is incorrect text in the URL bar that you need to clear out. 241 | * `clean_text` is also a special command to clear fields but you MUST ensure the field is IN FOCUS first before using this command. 242 | * Ctrl + Shift + Tab switches to the previous tab which is very useful if a new tab you don't want is opened and you need to get back to the last tab. 243 | * Ctrl + Tab switches to the next tab. 244 | * Press Backspace or Alt and the left arrow together - to go to the previous page in your browsing history for the tab. 245 | * Press Shift+Backspace, or Alt and the right arrow together - to go to the next page in your browsing history for the tab. 246 | # `Ctrl + A` and then `Backspace` or `Delete` - This command first highlights all text in a field and then deletes that text. It only works IF YOU ARE ALREADY IN THAT FIELD so be sure the field is in focus or clicked already and click it if you are unsure - this is one of the MOST IMPORTANT commands. You can use it to clear existing text from a field that is filled with incorrect information in a web form. 247 | * F6 or Shift+F6 - Switches focus to the next keyboard-accessible pane. Panes include: 248 | 249 | Highlights the URL in the address bar 250 | Bookmarks bar (if visible) 251 | The main web content (including any infobars) 252 | Downloads bar (if visible) 253 | 254 | If you are unsure about whether a field is selected you can try to click it to ensure it is highlighted. If you take an action several times in a row 255 | and the result has not changed, for example, if a field has not changed to meet you expectation, then explore the idea of clicking it again to change it 256 | and ensure it has the correct text that you want there. 257 | 258 | Remember, remember that you DO NOT have the ability to take a screenshot to verify your results. 259 | 260 | Sometimes a page isn't fully loaded yet. If that is the case feel free to wait or pause briefly for the screenshot to indicate a fully loaded page. 
261 | 262 | If you get stuck in a loop of actions, use your curiosity to explore and try new things with trial and error. Learn from your mistakes and get better with 263 | each new action. 264 | 265 | The complete list of available tools is: {tools} 266 | 267 | Your goal is to efficiently navigate web interfaces and complete tasks by leveraging these capabilities. 268 | Always consider the most appropriate method for each action, and be prepared to adapt your approach based 269 | on the results of your actions. 270 | 271 | When faced with a task, think step-by-step about the best approach, considering all your available tools and methods. 272 | 273 | Your brain consists of three major parts. 274 | 275 | 1. The Critic is responsible for evaluating the current state of the environment and deciding whether it corresponds to a given task. 276 | If it doesn't, the Critic explains how to recover the environment to a state where it can complete the task. Always start with the Critic's 277 | assessment before choosing the next actions and predicting the next steps. 278 | 279 | 2. The Actor is responsible for picking the next action based on the current state of the environment and the tools available. 280 | 281 | 3. The Neocortex is responsible for thinking ahead, predicting the state of the environment after the action that the Actor picked, 282 | choosing the next action after that, and so on. The Neocortex makes two predictions for the actions to be taken AFTER the one that 283 | the Actor picked. 284 | 285 | Your current task is {task.description}. 286 | 287 | For each screenshot I will send you, please return the complete thoughts of all three parts of your brain as 288 | raw JSON adhering to the schema {BrainThoughts.model_json_schema()}. 289 | 290 | Let me know when you are ready and I'll send you the first screenshot. 
291 | """ 292 | 293 | # Create our thread and start with a system prompt 294 | thread = RoleThread() 295 | thread.post( 296 | role="user", 297 | msg=starting_prompt, 298 | ) 299 | response = router.chat(thread, namespace="system") 300 | console.print(f"system prompt response: {response}", style="blue") 301 | thread.add_msg(response.msg) 302 | 303 | # Loop to run actions 304 | for i in range(max_steps): 305 | console.print(f"-------step {i + 1}", style="green") 306 | 307 | try: 308 | thread, done = self.take_action(semdesk, task, thread) 309 | except Exception as e: 310 | console.print(f"Error: {e}", style="red") 311 | task.status = TaskStatus.FAILED 312 | task.error = str(e) 313 | task.save() 314 | task.post_message("Actor", f"❗ Error taking action: {e}") 315 | end_time = time.time() # End time measurement 316 | elapsed_time = end_time - start_time 317 | console.print(f"Time taken to solve task: {elapsed_time:.2f} seconds", style="green") 318 | return task 319 | 320 | if done: 321 | console.print("task is done", style="green") 322 | end_time = time.time() # End time measurement 323 | elapsed_time = end_time - start_time 324 | console.print(f"Time taken to solve task: {elapsed_time:.2f} seconds", style="green") 325 | return task 326 | 327 | time.sleep(2) 328 | 329 | task.status = TaskStatus.FAILED 330 | task.save() 331 | task.post_message("Actor", "❗ Max steps reached without solving task") 332 | console.print("Reached max steps without solving task", style="red") 333 | 334 | end_time = time.time() # End time measurement 335 | elapsed_time = end_time - start_time 336 | console.print(f"Time taken to solve task: {elapsed_time:.2f} seconds", style="green") 337 | 338 | return task 339 | 340 | @retry( 341 | stop=stop_after_attempt(5), 342 | before_sleep=before_sleep_log(logger, logging.INFO), 343 | ) 344 | def interrupt_flow_and_ask_critic( 345 | self, 346 | semdesk: SemanticDesktop, 347 | task: Task, 348 | thread: RoleThread, 349 | current_action: dict 350 | ) -> dict: 351 | try: 352 | _thread = thread.copy() 353 | screenshot_img = semdesk.desktop.take_screenshots()[0] 354 | critic_prompt = f""" 355 | Your task is {task.description}. The screenshot is attached. 356 | You are attempting to do the following action: {current_action}. 357 | You have already attempted to do very similar actions very recently. 358 | Please assess whether the previous actions were successful, and whether you are sure that this action is exactly what needs to be done next. 359 | If you are not sure, please consider various alternative options and pick the action that is most likely to lead us toward completing 360 | the above-mentioned task. 361 | Give me the action to be done next, along with your reasons for that. 362 | 363 | Unlike other messages in this thread, please return your thoughts as 364 | raw JSON adhering to the schema {InterruptionCriticThoughts.model_json_schema()}. 365 | 366 | Please return just the raw JSON. 
367 | """ 368 | # Craft the message asking the MLLM for an action 369 | msg = RoleMessage( 370 | role="user", 371 | text=critic_prompt, 372 | images=[screenshot_img], 373 | ) 374 | _thread.add_msg(msg) 375 | 376 | # Make the action selection 377 | response = router.chat( 378 | _thread, 379 | namespace="action", 380 | expect=InterruptionCriticThoughts, 381 | agent_id=self.name(), 382 | ) 383 | task.add_prompt(response.prompt) 384 | 385 | try: 386 | # Post to the user letting them know what the modle selected 387 | selection = response.parsed 388 | if not selection: 389 | raise ValueError("No action selection parsed") 390 | 391 | task.post_message("Critic", f"🤔 {selection.critic}") 392 | task.post_message("Critic", f"▶️ I suggest to take action '{selection.action.name}' "+ 393 | f"with parameters: {selection.action.parameters}") 394 | return selection.action 395 | 396 | except Exception as e: 397 | console.print(f"Response failed to parse: {e}", style="red") 398 | raise 399 | 400 | except Exception as e: 401 | console.print("Exception taking action: ", e) 402 | traceback.print_exc() 403 | task.post_message("Actor", f"⚠️ Error taking action: {e} -- retrying...") 404 | raise e 405 | 406 | 407 | @retry( 408 | stop=stop_after_attempt(5), 409 | before_sleep=before_sleep_log(logger, logging.INFO), 410 | ) 411 | def take_action( 412 | self, 413 | semdesk: SemanticDesktop, 414 | task: Task, 415 | thread: RoleThread, 416 | ) -> Tuple[RoleThread, bool]: 417 | """Take an action 418 | 419 | Args: 420 | desktop (SemanticDesktop): Desktop to use 421 | task (str): Task to accomplish 422 | thread (RoleThread): Role thread for the task 423 | 424 | Returns: 425 | bool: Whether the task is complete 426 | """ 427 | try: 428 | # Check to see if the task has been cancelled 429 | if task.remote: 430 | task.refresh() 431 | console.print("task status: ", task.status.value) 432 | if ( 433 | task.status == TaskStatus.CANCELING 434 | or task.status == TaskStatus.CANCELED 435 | ): 436 | console.print(f"task is {task.status}", style="red") 437 | if task.status == TaskStatus.CANCELING: 438 | task.status = TaskStatus.CANCELED 439 | task.save() 440 | return thread, True 441 | 442 | console.print("taking action...", style="white") 443 | 444 | # Create a copy of the thread, and remove old images 445 | _thread = thread.copy() 446 | _thread.remove_images() 447 | 448 | task.post_message("Actor", "🤔 I'm thinking...") 449 | 450 | # Take a screenshot of the desktop and post a message with it 451 | screenshot_img = semdesk.desktop.take_screenshots()[0] 452 | task.post_message( 453 | "Actor", 454 | "Current image", 455 | images=[screenshot_img], 456 | thread="debug", 457 | ) 458 | 459 | # Get the current mouse coordinates 460 | x, y = semdesk.desktop.mouse_coordinates() 461 | console.print(f"mouse coordinates: ({x}, {y})", style="white") 462 | 463 | step_prompt = f""" 464 | Here is a screenshot of the current desktop, please select next and the one after next action from the provided schema. 465 | 466 | Critic: Carefully analyze the screenshot and check if the state corresponds to the task we are solving. Remember that 467 | the task is {task.description}. 468 | Actor: Select a next action and explain why. 469 | Neocortex: Predict the result of the action picked by Actor and pick the next ones. 470 | 471 | Watch out for elements that are different from others, for example, have the border of the different color. 472 | Such elements are usually already in focus, and you can try to type text in them right away. 
473 | However, if you tried to type on a previous step and want to type the same input again, you better 474 | focus on the input field first by clicking on it. 475 | 476 | Please return just the raw JSON. 477 | """ 478 | 479 | # Craft the message asking the MLLM for an action 480 | msg = RoleMessage( 481 | role="user", 482 | text=step_prompt, 483 | images=[screenshot_img], 484 | ) 485 | _thread.add_msg(msg) 486 | 487 | # Make the action selection 488 | response = router.chat( 489 | _thread, 490 | namespace="action", 491 | expect=BrainThoughts, 492 | agent_id=self.name(), 493 | ) 494 | task.add_prompt(response.prompt) 495 | 496 | try: 497 | # Post to the user letting them know what the modle selected 498 | selection = response.parsed 499 | if not selection: 500 | raise ValueError("No action selection parsed") 501 | 502 | task.post_message("Critic", f"🤔 {selection.critic.critic}") 503 | 504 | task.post_message("Actor", f"👁️ {selection.actor.observation}\n" + 505 | f"💡 {selection.actor.reason}\n" + 506 | f"▶️ I'm going to take action '{selection.actor.action.name}' "+ 507 | f"with parameters: {selection.actor.action.parameters}") 508 | 509 | task.post_message("Neocortex", f"🔮 {selection.neocortex.prediction_1.prediction}\n" + 510 | f"💡 {selection.neocortex.prediction_1.reason}\n" + 511 | f"🔜 The next action to take is '{selection.neocortex.prediction_1.action.name}' "+ 512 | f"with parameters: {selection.neocortex.prediction_1.action.parameters}") 513 | 514 | task.post_message("Neocortex", f"🔮 {selection.neocortex.prediction_2.prediction}\n" + 515 | f"💡 {selection.neocortex.prediction_2.reason}\n" + 516 | f"🔜 The last action to take after that is '{selection.neocortex.prediction_2.action.name}' "+ 517 | f"with parameters: {selection.neocortex.prediction_2.action.parameters}") 518 | 519 | except Exception as e: 520 | console.print(f"Response failed to parse: {e}", style="red") 521 | raise 522 | 523 | # The agent will return 'result' if it believes it's finished 524 | if selection.actor.action.name == "result": 525 | console.print(f"The final result is: {selection.actor.action.parameters['value']}", style="green") 526 | task.post_message( 527 | "Actor", 528 | f"✅ I think the task is done, please review the result: {selection.actor.action.parameters['value']}", 529 | ) 530 | task.status = TaskStatus.FINISHED 531 | task.save() 532 | return _thread, True 533 | 534 | im_start = screenshot_img 535 | continue_chain = True 536 | interruption_requested = False 537 | 538 | for next_action in [selection.actor.action, 539 | selection.neocortex.prediction_1.action, 540 | selection.neocortex.prediction_2.action]: 541 | if not continue_chain or next_action.name == "result" or interruption_requested: 542 | # Time to think again! 
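# Worked example of the hot-key normalization a few lines below: a press_key action whose
# "key" parameter is "ctrl+s" becomes a hot_key action with parameters {"keys": ["ctrl", "s"]},
# since "ctrl+s".split("+") yields ["ctrl", "s"].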
543 | break 544 | 545 | # Hack for the cases when the AI wants to press "ctrl+s" or something similar 546 | if next_action.name == "press_key" and "+" in next_action.parameters["key"]: 547 | next_action.name = "hot_key" 548 | next_action.parameters = {"keys": next_action.parameters["key"].split("+")} 549 | 550 | # Additional check to make sure that we are not trapped in a loop, doing the same action again and again 551 | depth = 5 if (next_action.name == "press_key" or next_action.name == "hot_key") else 10 552 | closest_actions = self.find_the_closest_actions(next_action, depth=depth) 553 | if len(closest_actions) > 0: 554 | task.post_message( 555 | "Body", 556 | f"Closest actions to the current one: {closest_actions}", 557 | thread="debug" 558 | ) 559 | if len(closest_actions) >= 2: 560 | task.post_message( 561 | "Body", 562 | "Too many repeated actions. Getting back to Critic.", 563 | thread="debug" 564 | ) 565 | # Well, looks like it's time to interrupt the flow and reconsider our life choices. 566 | new_action = self.interrupt_flow_and_ask_critic(semdesk, task, thread, next_action) 567 | next_action = new_action 568 | # We'll run this updated action and get out of the cycle. 569 | interruption_requested = True 570 | 571 | task.post_message( 572 | "Body", 573 | f"▶️ Taking action '{next_action.name}' with parameters: {next_action.parameters}", 574 | ) 575 | self._take_selected_action(semdesk, next_action, task, _thread, response) 576 | self.record_action(next_action) 577 | 578 | # If we know for certain that the click was not successful, it's time to stop the chain 579 | # and think again 580 | if semdesk.last_click_failed: 581 | semdesk.last_click_failed = False 582 | break 583 | 584 | # Pressing keys changes the environment for sure, so we may just stop the execution here and think again 585 | if next_action.name == "press_key": 586 | break 587 | 588 | # We analyze if we want to continue to the next action here. A cheap critic looks at the new screenshot and 589 | # decides if we should continue the chain or not. 590 | screenshot_upd = semdesk.desktop.take_screenshots()[0] 591 | ssim, continue_chain = assess_action_result(im_start, screenshot_upd) 592 | 593 | # If we were typing text, and the screen changed too much, then we probably hit some hot keys by accident 594 | # and scrolled down. We should stop and scroll back up, forcing recovery. 595 | if next_action.name == "type_text" and ssim < 0.9: 596 | semdesk.desktop.scroll(30) # we may need to adjust this number 597 | break 598 | 599 | # There is a chance that if the last action was a click, then the result didn't load yet, and the SSIM will be high 600 | # while it should be low. 
To avoid this, we check once again for this specific case in 5 seconds: 601 | if next_action.name == "click_object" and ssim > 0.95: 602 | task.post_message("Critic", "😴 Waiting to be sure that the result is loaded...", thread="debug") 603 | time.sleep(5) 604 | screenshot_upd = semdesk.desktop.take_screenshots()[0] 605 | ssim, continue_chain = assess_action_result(im_start, screenshot_upd) 606 | task.post_message("Critic", f"🔍 SSIM: {ssim}", thread="debug") 607 | 608 | return _thread, False 609 | 610 | except Exception as e: 611 | console.print("Exception taking action: ", e) 612 | traceback.print_exc() 613 | task.post_message("Actor", f"⚠️ Error taking action: {e} -- retrying...") 614 | raise e 615 | 616 | def _take_selected_action(self, semdesk: SemanticDesktop, action: V1Action, 617 | task: Task, thread: RoleThread, response: ChatResponse) -> None: 618 | """Take the selected action 619 | 620 | Args: 621 | semdesk (SemanticDesktop): Desktop to use 622 | action (V1Action): Action to take 623 | """ 624 | console.log(f"taking action: {action}") 625 | 626 | # Find the selected action in the tool 627 | desktop_action = semdesk.find_action(action.name) 628 | console.print(f"found action: {desktop_action}", style="blue") 629 | if not desktop_action: 630 | console.print(f"action returned not found: {action.name}") 631 | raise SystemError("action not found") 632 | 633 | # Take the selected action 634 | try: 635 | action_response = semdesk.use(desktop_action, **action.parameters) 636 | except Exception as e: 637 | raise ValueError(f"Trouble using action: {e}") 638 | 639 | console.print(f"action output: {action_response}", style="blue") 640 | if action_response: 641 | task.post_message( 642 | "Actor", f"👁️ Result from taking action: {action_response}" 643 | ) 644 | 645 | thread.add_msg(response.msg) 646 | 647 | @classmethod 648 | def supported_devices(cls) -> List[Type[Device]]: 649 | """Devices this agent supports 650 | 651 | Returns: 652 | List[Type[Device]]: A list of supported devices 653 | """ 654 | return [Desktop] 655 | 656 | @classmethod 657 | def config_type(cls) -> Type[RobbieG2Config]: 658 | """Type of config 659 | 660 | Returns: 661 | Type[DinoConfig]: Config type 662 | """ 663 | return RobbieG2Config 664 | 665 | @classmethod 666 | def from_config(cls, config: RobbieG2Config) -> "RobbieG2": 667 | """Create an agent from a config 668 | 669 | Args: 670 | config (RobbieG2Config): Agent config 671 | 672 | Returns: 673 | RobbieG2: The agent 674 | """ 675 | return RobbieG2() 676 | 677 | @classmethod 678 | def default(cls) -> "RobbieG2": 679 | """Create a default agent 680 | 681 | Returns: 682 | RobbieG2: The agent 683 | """ 684 | return RobbieG2() 685 | 686 | @classmethod 687 | def init(cls) -> None: 688 | """Initialize the agent class""" 689 | return 690 | 691 | 692 | Agent = RobbieG2 693 | --------------------------------------------------------------------------------
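Note: the Box helper used throughout robbieg2/clicker.py (to_absolute_with_upscale, center, draw) is defined elsewhere in the repository and is not shown above. The sketch below is a hypothetical, minimal illustration of the coordinate bookkeeping the clicker relies on: a box found on an upscaled crop is scaled back down by the accumulated upscale factor and shifted by the parent region's top-left corner. The class name BoxSketch and its field names are illustrative assumptions; the actual implementation may differ.

from dataclasses import dataclass

@dataclass
class BoxSketch:
    """Hypothetical stand-in for the Box class used in robbieg2/clicker.py."""
    left: int
    top: int
    right: int
    bottom: int

    def center(self):
        # Midpoint of the box, used as the final click target.
        return (self.left + self.right) // 2, (self.top + self.bottom) // 2

    def to_absolute_with_upscale(self, parent: "BoxSketch", upscale: int) -> "BoxSketch":
        # The box was measured on a crop of `parent` that had been resized by `upscale`,
        # so scale the coordinates back down and shift them by the parent's origin.
        return BoxSketch(
            parent.left + self.left // upscale,
            parent.top + self.top // upscale,
            parent.left + self.right // upscale,
            parent.top + self.bottom // upscale,
        )

# Example: an OCR hit at (100, 40)-(180, 70) inside a crop upscaled 2x, whose parent region
# starts at (300, 200) on the full screenshot, maps back to (350, 220)-(390, 235); its
# center (370, 227) is what would be clicked.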