├── .DS_Store ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── config.yml ├── .gitignore ├── LICENSE ├── README.md ├── README_CN.md ├── README_JA.md ├── auto_control ├── .DS_Store ├── __init__.py ├── agent │ ├── base_agent.py │ ├── task_plan_agent.py │ ├── task_run_agent.py │ └── vision_agent.py ├── app.py ├── executor │ └── anthropic_executor.py ├── loop.py └── tools │ ├── __init__.py │ ├── base.py │ ├── collection.py │ ├── computer.py │ └── screen_capture.py ├── imgs ├── autoMate.png ├── cursor.png ├── gradioicon.png ├── header_bar.png ├── header_bar_thin.png ├── knowledge.png ├── logo.png ├── omniboxicon.png └── omniparsericon.png ├── install.py ├── main.py ├── main.spec ├── requirements.txt ├── task_demonstration.json ├── ui ├── __init__.py ├── agent_worker.py ├── hotkey_edit.py ├── main.py ├── main_window.py ├── settings_dialog.py ├── theme.py └── tray_icon.py └── util ├── auto_control.py ├── auto_util.py ├── download_weights.py ├── screen_selector.py ├── tool.py └── wechat_auto.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/.DS_Store -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a bug report to help us improve 4 | labels: bug 5 | assignees: '' 6 | 7 | --- 8 | 9 | ## Version Information 10 | - Commit Hash: 11 | 12 | ## Error Message 13 | 14 | ``` 15 | 16 | ``` 17 | 18 | ## Description 19 | 20 | 21 | ### Current Behavior 22 | 23 | 24 | ### Expected Behavior 25 | 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__** 2 | weights** 3 | .conda** 4 | .venv 5 | tmp** 6 | build** 7 | dist** -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Dongle 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | autoMate logo 4 |

autoMate

5 |

🤖 AI-Powered Local Automation Tool | Let Your Computer Work for You

6 | 7 | [中文](./README_CN.md) | [日本語](./README_JA.md) 8 | 9 | >"Automate the tedious, give time back to life" 10 | 11 | https://github.com/user-attachments/assets/bf27f8bd-136b-402e-bc7d-994b99bcc368 12 | 13 | 14 |
15 | 16 | > **Special Note:** The autoMate project is still in its early stages of rapid iteration, and we continue to explore and integrate the latest technologies. During this process, **deeper design thinking, technical stack discussions, challenges and solutions encountered, as well as my ongoing research notes on AI+RPA, will be primarily shared and discussed in my [Knowledge Planet "AI Tongmu and His Noble Friends"](https://t.zsxq.com/x1cCW)**. 17 | > 18 | > If you're interested in the technical details behind autoMate, its development direction, or broader AI automation topics, feel free to scan the QR code to join and discuss with me and other friends, witnessing the growth of autoMate together! 19 | 20 |
21 |
22 | Knowledge Planet QR Code 23 |
24 |
25 | 26 | 27 | ## 💫 Redefining Your Relationship with Computers 28 | 29 | Unlike traditional RPA tools that are cumbersome to use, autoMate leverages the power of large language models to complete complex automation processes simply by describing tasks in natural language. Say goodbye to repetitive work and focus on what truly creates value! 30 | 31 | **Let automation create more possibilities for your life.** 32 | 33 | ## 💡 Project Introduction 34 | autoMate is a revolutionary AI+RPA automation tool built on OmniParser that can: 35 | 36 | - 📊 Understand your requirements and automatically plan tasks 37 | - 🔍 Intelligently comprehend screen content, simulating human vision and operations 38 | - 🧠 Make autonomous decisions, judging and taking actions based on task requirements 39 | - 💻 Support local deployment, protecting your data security and privacy 40 | 41 | ## ✨ Features 42 | 43 | - 🔮 No-Code Automation - Describe tasks in natural language, no programming knowledge required 44 | - 🖥️ Full Interface Control - Support operations on any visual interface, not limited to specific software 45 | - 🚅 Simplified Installation - Support for Chinese environment, one-click deployment 46 | 47 | ## 🚀 Quick Start 48 | 49 | ### 📥 Direct Usage 50 | You can directly download the executable file from github release. 51 | 52 | ### 📦 Installation 53 | We strongly recommend installing miniConda first and using miniconda to install dependencies. There are many tutorials available online, or you can ask AI for help. Then follow these commands to set up the environment: 54 | 55 | ```bash 56 | # Clone the project 57 | git clone https://github.com/yuruotong1/autoMate.git 58 | cd autoMate 59 | # Create python3.12 environment 60 | conda create -n "automate" python==3.12 61 | # Activate environment 62 | conda activate automate 63 | # Install dependencies 64 | python install.py 65 | ``` 66 | 67 | After installation, you can start the application using the command line: 68 | 69 | ```bash 70 | python main.py 71 | ``` 72 | 73 | Then open `http://localhost:7888/` in your browser to configure your API key and basic settings. 74 | 75 | ### 🔔 Note 76 | 77 | Currently tested and supported models are as follows: 78 | 79 | > PS: Below are the large model vendors that have been tested and are working. These vendors have no relationship with us, so we don't promise after-sales service, functional guarantees, or stability maintenance. Please consider the payment situation carefully. 80 | 81 | | Vendor| Model | 82 | | --- | --- | 83 | |[yeka](https://2233.ai/api)|gpt-4o,o1| 84 | |openai|gpt-4o,gpt-4o-2024-08-06,gpt-4o-2024-11-20,o1,4.gpt-4.5-preview-2025-02-27| 85 | 86 | ## 📝 FAQ 87 | ### What models are supported? 88 | Currently only OpenAI series models are supported. If you can't access OpenAI in China, we recommend using [yeka](https://2233.ai/api) as a proxy. 89 | 90 | Why don't we support other models? We use multimodal + structured output capabilities, and few other model vendors support both capabilities simultaneously. Adapting to other models would require significant changes to the underlying architecture, and we can't guarantee the results. However, we are actively looking for solutions and will update immediately when available. 91 | 92 | ### Why is my execution speed slow? 93 | If your computer doesn't have an NVIDIA dedicated graphics card, it will run slower because we frequently call OCR for visual annotation, which consumes a lot of GPU resources. We are actively optimizing and adapting. 
We recommend running on an NVIDIA graphics card with at least 4GB of VRAM, and making sure your CUDA version matches your installed torch build: 94 | 95 | 1. Run `pip list` to check your torch version; 96 | 2. Check the supported CUDA versions on the [official website](https://pytorch.org/get-started/locally/); 97 | 3. Uninstall the installed torch and torchvision; 98 | 4. Copy the official torch installation command and reinstall the torch build that matches your CUDA version. 99 | 100 | For example, if your CUDA version is 12.4, install torch with the following commands: 101 | 102 | ```bash 103 | pip3 uninstall -y torch torchvision 104 | pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 105 | ``` 106 | 107 | ## 🤝 Join Us 108 | 109 | Every excellent open-source project embodies collective wisdom. The growth of autoMate is inseparable from your participation and contribution. Whether it's fixing bugs, adding features, or improving documentation, your every contribution will help thousands of people break free from repetitive work. 110 | 111 | Join us in creating a more intelligent future. 112 | 113 | 114 | 115 | 116 | 117 | --- 118 | 119 |
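If you want to confirm that the switch described in the FAQ above worked, a quick check (assuming `torch` is installed in the active conda environment) is to ask PyTorch itself whether it sees your NVIDIA card:

```python
# Quick GPU sanity check for the torch/CUDA setup described in the FAQ above
import torch

print(torch.__version__)          # a CUDA 12.4 build reports a version like "2.x.x+cu124"
print(torch.cuda.is_available())  # True means torch can use the NVIDIA card
```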
120 | ⭐ Every Star is an encouragement to the creators and an opportunity for more people to discover and benefit from autoMate ⭐ 121 | Your support today is our motivation for tomorrow's progress 122 |
123 | -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | autoMate logo 4 |

autoMate

5 |

🤖 AI驱动的本地自动化工具 | 让电脑自己会干活

6 | 7 | [English](./README.md) | [日本語](./README_JA.md) 8 | 9 | >"让繁琐自动化,把时间还给生活" 10 | 11 | https://github.com/user-attachments/assets/bf27f8bd-136b-402e-bc7d-994b99bcc368 12 | 13 | 14 |
15 | 16 | > **特别声明:** autoMate 项目仍处于快速迭代的早期阶段,我们会不断探索和融入最新技术。在这个过程中,**更深入的设计思考、技术选型讨论、遇到的挑战与解决方案,以及我对 AI+RPA 领域的持续研究笔记,会主要在我的 [知识星球「AI桐木和他的贵人们」](https://t.zsxq.com/x1cCW)** 中分享和探讨。 17 | > 18 | > 如果你对 autoMate 背后的技术细节、发展方向或更广泛的 AI 自动化话题感兴趣,欢迎扫码加入,与我和其他朋友一起交流,共同见证 autoMate 的成长! 19 | 20 |
21 |
22 | 知识星球二维码 23 |
24 |
25 | 26 | 27 | ## 💫 重新定义你与电脑的关系 28 | 29 | 不同于传统RPA工具的繁琐,autoMate借助大模型的能力,只需用自然语言描述任务,AI就能完成复杂的自动化流程。从此告别重复性工作,专注于真正创造价值的事情! 30 | 31 | **让自动化为你的生活创造更多可能。** 32 | 33 | ## 💡 项目简介 34 | autoMate 是一款革命性的AI+RPA自动化工具,基于OmniParser构建,它能够 35 | 36 | - 📊 理解您的需求,自动进行任务规划 37 | - 🔍 智能理解屏幕内容,模拟人类视觉和操作 38 | - 🧠 自主决策,根据任务需求进行判断并采取行动 39 | - 💻 支持本地化部署,保护您的数据安全和隐私 40 | 41 | ## ✨ 功能特点 42 | 43 | - 🔮 无代码自动化 - 使用自然语言描述任务,无需编程知识 44 | - 🖥️ 全界面操控 - 支持任何可视化界面的操作,不限于特定软件 45 | - 🚅 简化安装 - 支持中文环境,一键部署 46 | 47 | 48 | ## 🚀 快速开始 49 | 50 | ### 📥 直接使用 51 | 可以直接从 github release 下载可执行文件使用。 52 | 53 | ### 📦 安装 54 | 强烈建议先安装miniConda,用miniconda安装依赖,网上有很多教程,实在不懂可以问AI。然后按照下面命令安装环境: 55 | 56 | ```bash 57 | # 把项目拉下来 58 | git clone https://github.com/yuruotong1/autoMate.git 59 | cd autoMate 60 | # 创建 python3.12 环境 61 | conda create -n "automate" python==3.12 62 | # 激活环境 63 | conda activate automate 64 | # 安装相关依赖 65 | python install.py 66 | ``` 67 | 安装完成后可以使用命令行启动应用: 68 | 69 | ```bash 70 | python main.py 71 | ``` 72 | 然后在浏览器中打开`http://localhost:7888/`,配置您的API密钥和基本设置。 73 | 74 | ### 🔔 注意 75 | 76 | 目前已经测试并且支持的模型如下: 77 | 78 | > PS:以下是经过测试可以跑的大模型厂商,这些厂商与我们没有任何利益关系,因此我们也不承诺售后、功能保障、稳定性维护等工作,涉及付费情况请行考虑。 79 | 80 | 81 | | Vendor| Model | 82 | | --- | --- | 83 | |[yeka](https://2233.ai/api)|gpt-4o,o1| 84 | |openai|gpt-4o,gpt-4o-2024-08-06,gpt-4o-2024-11-20,o1,4.gpt-4.5-preview-2025-02-27| 85 | 86 | 87 | ## 📝常见问题 88 | ### 支持什么模型? 89 | 目前仅支持 OpenAI 系列模型,如果国内不能访问 OpenAI,建议使用[yeka](https://2233.ai/api)进行中转。 90 | 91 | 为什么目前不支持其他模型?我们用到了多模态+结构化输出能力,其他模型厂商很少能够同时支持这两个能力,如果适配其他模型的话,我们要对底层进行较大修改,效果也不能得到保证。但是我们正在积极寻找解决方案,一有更新会立即同步出来。 92 | 93 | 94 | ### 为什么我的执行速度很慢? 95 | 如果你的电脑没有NVIDIA独显的话,运行的会比较慢,因为我们会高频次调用OCR对视觉进行标注,这会消耗大量的GPU资源,我们也在积极进行优化和适配。建议使用不少于 4G 显存的英伟达显卡运行,并且版本和torch版本一致: 96 | 97 | 1. 运行`pip list`查看torch版本; 98 | 2. 从[官网](https://pytorch.org/get-started/locally/)查看支持的cuda版本; 99 | 3. 卸载已安装的 torch 和 torchvision; 100 | 3. 复制官方的 torch 安装命令,重新安装适合自己 cuda 版本的 torch。 101 | 102 | 比如我的 cuda 版本为 12.4,需要按照如下命令来安装 torch; 103 | 104 | ```bash 105 | pip3 uninstall -y torch torchvision 106 | pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 107 | ``` 108 | 109 | 110 | ## 🤝 参与共建 111 | 112 | 每一个优秀的开源项目都凝聚着集体的智慧。autoMate的成长离不开你的参与和贡献。无论是修复bug、添加功能,还是改进文档,你的每一份付出都将帮助成千上万的人摆脱重复性工作的束缚。 113 | 114 | 加入我们,一起创造更加智能的未来。 115 | 116 | 117 | 118 | 119 | 120 | --- 121 | 122 |
123 | ⭐ 每一个Star都是对创作者的鼓励,也是让更多人发现并受益于autoMate的机会 ⭐ 124 | 今天你的支持,就是我们明天前进的动力 125 |
126 | -------------------------------------------------------------------------------- /README_JA.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | autoMate logo 4 |

autoMate

5 |

🤖 AI駆動のローカル自動化ツール | コンピュータに仕事を任せる

6 | 7 | [English](./README.md) | [中文](./README_CN.md) 8 | 9 | >"面倒な作業を自動化し、時間を生活に取り戻す" 10 | 11 | https://github.com/user-attachments/assets/bf27f8bd-136b-402e-bc7d-994b99bcc368 12 | 13 | 14 |
15 | 16 | > **特別声明:** autoMateプロジェクトは現在も急速な開発段階にあり、最新の技術を継続的に探索・統合しています。この過程で、**より深い設計思考、技術スタックの議論、直面する課題と解決策、およびAI+RPA分野に関する継続的な研究ノートは、主に[知識プラネット「AI桐木とその仲間たち」](https://t.zsxq.com/x1cCW)**で共有・議論されています。 17 | > 18 | > autoMateの技術的な詳細、開発方向性、またはより広範なAI自動化トピックに興味がある方は、QRコードをスキャンして参加し、私や他の仲間たちと一緒にautoMateの成長を目撃しましょう! 19 | 20 |
21 |
22 | 知識プラネットQRコード 23 |
24 |
25 | 26 | 27 | ## 💫 コンピュータとの関係を再定義 28 | 29 | 従来のRPAツールとは異なり、autoMateは大規模言語モデルの力を活用し、自然言語でタスクを説明するだけで複雑な自動化プロセスを完了します。繰り返し作業にさようならし、本当に価値を生み出すことに集中しましょう! 30 | 31 | **自動化で生活により多くの可能性を。** 32 | 33 | ## 💡 プロジェクト概要 34 | autoMateは、OmniParserをベースにした革新的なAI+RPA自動化ツールで、以下のことができます: 35 | 36 | - 📊 要件を理解し、自動的にタスクを計画 37 | - 🔍 画面の内容をインテリジェントに理解し、人間の視覚と操作をシミュレート 38 | - 🧠 自律的な判断を行い、タスク要件に基づいて判断と行動を実行 39 | - 💻 ローカルデプロイメントをサポートし、データセキュリティとプライバシーを保護 40 | 41 | ## ✨ 主な機能 42 | 43 | - 🔮 ノーコード自動化 - 自然言語でタスクを記述、プログラミング知識不要 44 | - 🖥️ 全インターフェース制御 - 特定のソフトウェアに限定されない、あらゆる視覚的インターフェースの操作をサポート 45 | - 🚅 簡単なインストール - 中国語環境をサポート、ワンクリックデプロイ 46 | 47 | ## 🚀 クイックスタート 48 | 49 | ### 📥 直接使用 50 | GitHubリリースから実行ファイルを直接ダウンロードできます。 51 | 52 | ### 📦 インストール 53 | まずminiCondaのインストールを強く推奨します。minicondaで依存関係をインストールしてください。オンラインに多くのチュートリアルがありますが、わからない場合はAIに質問することもできます。その後、以下のコマンドで環境をセットアップします: 54 | 55 | ```bash 56 | # プロジェクトをクローン 57 | git clone https://github.com/yuruotong1/autoMate.git 58 | cd autoMate 59 | # Python 3.12環境を作成 60 | conda create -n "automate" python==3.12 61 | # 環境をアクティベート 62 | conda activate automate 63 | # 依存関係をインストール 64 | python install.py 65 | ``` 66 | 67 | インストール後、コマンドラインでアプリケーションを起動できます: 68 | 69 | ```bash 70 | python main.py 71 | ``` 72 | 73 | その後、ブラウザで`http://localhost:7888/`を開き、APIキーと基本設定を構成してください。 74 | 75 | ### 🔔 注意 76 | 77 | 現在テスト済みでサポートされているモデルは以下の通りです: 78 | 79 | > PS:以下はテスト済みで動作する大規模モデルベンダーです。これらのベンダーとは関係がないため、アフターサービス、機能保証、安定性維持は保証しません。支払い状況を慎重にご検討ください。 80 | 81 | | ベンダー | モデル | 82 | | --- | --- | 83 | |[yeka](https://2233.ai/api)|gpt-4o,o1| 84 | |openai|gpt-4o,gpt-4o-2024-08-06,gpt-4o-2024-11-20,o1,4.gpt-4.5-preview-2025-02-27| 85 | 86 | ## 📝 よくある質問 87 | ### どのモデルがサポートされていますか? 88 | 現在はOpenAIシリーズのモデルのみをサポートしています。中国でOpenAIにアクセスできない場合は、[yeka](https://2233.ai/api)をプロキシとして使用することをお勧めします。 89 | 90 | 他のモデルをサポートしない理由:マルチモーダル+構造化出力機能を使用しているため、他のモデルベンダーで両方の機能を同時にサポートしているところはほとんどありません。他のモデルに適応するには、アーキテクチャの大幅な変更が必要で、結果も保証できません。ただし、解決策を積極的に探しており、利用可能になり次第すぐに更新します。 91 | 92 | ### 実行速度が遅いのはなぜですか? 93 | NVIDIAの専用グラフィックスカードがない場合、実行速度が遅くなります。これは、視覚的な注釈のためにOCRを頻繁に呼び出し、大量のGPUリソースを消費するためです。私たちは積極的に最適化と適応を行っています。少なくとも4GBのVRAMを持つNVIDIAグラフィックスカードの使用を推奨し、バージョンはtorchバージョンと一致している必要があります: 94 | 95 | 1. `pip list`を実行してtorchバージョンを確認 96 | 2. [公式サイト](https://pytorch.org/get-started/locally/)でサポートされているcudaバージョンを確認 97 | 3. インストールされているtorchとtorchvisionをアンインストール 98 | 4. 公式のtorchインストールコマンドをコピーし、お使いのcudaバージョンに適したtorchを再インストール 99 | 100 | 例えば、cudaバージョンが12.4の場合、以下のコマンドでtorchをインストールする必要があります: 101 | 102 | ```bash 103 | pip3 uninstall -y torch torchvision 104 | pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 105 | ``` 106 | 107 | ## 🤝 参加する 108 | 109 | 優れたオープンソースプロジェクトは、集団の知恵の結晶です。autoMateの成長は、あなたの参加と貢献なしには成り立ちません。バグ修正、機能追加、ドキュメント改善など、あなたの貢献は何千人もの人々が繰り返し作業から解放されるのを助けます。 110 | 111 | よりインテリジェントな未来の創造に参加しましょう。 112 | 113 | 114 | 115 | 116 | 117 | --- 118 | 119 |
120 | ⭐ スターは制作者への励ましであり、より多くの人々がautoMateを発見し恩恵を受ける機会です ⭐ 121 | 今日のあなたのサポートが、明日の私たちの進歩の原動力です 122 |
123 | -------------------------------------------------------------------------------- /auto_control/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/auto_control/.DS_Store -------------------------------------------------------------------------------- /auto_control/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/auto_control/__init__.py -------------------------------------------------------------------------------- /auto_control/agent/base_agent.py: -------------------------------------------------------------------------------- 1 | class BaseAgent: 2 | def __init__(self, *args, **kwargs): 3 | self.SYSTEM_PROMPT = "" 4 | 5 | 6 | def chat(self, messages): 7 | pass 8 | 9 | -------------------------------------------------------------------------------- /auto_control/agent/task_plan_agent.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pydantic import BaseModel, Field 3 | from auto_control.agent.base_agent import BaseAgent 4 | from xbrain.core.chat import run 5 | 6 | from auto_control.tools.computer import Action 7 | 8 | class TaskPlanAgent(BaseAgent): 9 | def __call__(self, messages, parsed_screen_result): 10 | messages[-1] = {"role": "user", 11 | "content": [ 12 | {"type": "text", "text": messages[-1]["content"]}, 13 | { 14 | "type": "image_url", 15 | "image_url": {"url": f"data:image/png;base64,{parsed_screen_result['base64_image']}"} 16 | } 17 | ] 18 | } 19 | response = run(messages, user_prompt=system_prompt.format(action_list=str(Action)), response_format=TaskPlanResponse) 20 | print("task_plan_agent response: ", response) 21 | return json.loads(response) 22 | 23 | 24 | class TaskPlanResponse(BaseModel): 25 | reasoning: str = Field(description="描述您规划任务的逻辑") 26 | task_list: list[str] = Field(description="任务列表") 27 | 28 | 29 | system_prompt = """ 30 | ### 目标 ### 31 | 你是自动化操作规划专家,根据屏幕内容和用户需求,规划精确可执行的操作序列。 32 | 33 | 34 | ### 输入 ### 35 | 1. 用户需求:文本描述形式的任务目标 36 | 2. 
当前环境:屏幕上可见的元素和状态 37 | 38 | ### 输出格式 ### 39 | 操作序列应采用以下JSON格式: 40 | [ 41 | {{ 42 | "reasoning": "描述您规划任务的逻辑", 43 | "task_plan": ["任务1", "任务2", "任务3"] 44 | }} 45 | ] 46 | 47 | 任务中的操作应该仅包含: 48 | {action_list} 49 | 50 | ### 限制 ### 51 | 52 | - 不要说点击xx坐标,这样用户无法理解,应该说点击地址栏、搜索框、输入按钮等; 53 | 54 | 55 | ### 例子 ### 56 | 输入:获取AI新闻 57 | 输出: 58 | [ 59 | {{ 60 | "reasoning": "看到有一个地址栏,所以应该在地址栏输入https://www.baidu.com", 61 | "task_plan": ["在地址栏输入https://www.baidu.com"] 62 | }}, 63 | {{ 64 | "reasoning": "这是百度页面,看到有一个搜索框,所以应该在搜索框输入AI最新新闻", 65 | "task_plan": ["在搜索框输入AI最新新闻"] 66 | }}, 67 | {{ 68 | "reasoning": "看到有一个搜索按钮,所以应该点击搜索按钮", 69 | "task_plan": ["点击搜索按钮"] 70 | }} 71 | ] 72 | """ 73 | 74 | -------------------------------------------------------------------------------- /auto_control/agent/task_run_agent.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, BetaMessageParam, BetaUsage 4 | from pydantic import Field, create_model 5 | from auto_control.agent.base_agent import BaseAgent 6 | from xbrain.core.chat import run 7 | 8 | from auto_control.tools.computer import Action 9 | class TaskRunAgent(BaseAgent): 10 | def __init__(self): 11 | self.OUTPUT_DIR = "./tmp/outputs" 12 | 13 | def __call__(self, parsed_screen_result, messages): 14 | messages.append( 15 | {"role": "user", 16 | "content": [ 17 | {"type": "text", "text": "Image is the screenshot of the current screen"}, 18 | { 19 | "type": "image_url", 20 | "image_url": {"url": f"data:image/png;base64,{parsed_screen_result['base64_image']}"} 21 | } 22 | ] 23 | } 24 | ) 25 | task_list = json.loads(messages[1]['content'])['task_list'] 26 | # Convert task_list to a numbered format 27 | formatted_task_list = "\n".join([f"{i}.{task}" for i, task in enumerate(task_list)]) 28 | system_prompt = prompt.format(task_list=formatted_task_list) 29 | vlm_response = run( 30 | messages, 31 | user_prompt=system_prompt, 32 | response_format=create_dynamic_response_model(parsed_screen_result) 33 | ) 34 | vlm_response_json = json.loads(vlm_response) 35 | response_content = [BetaTextBlock(text=vlm_response_json["reasoning"], type='text')] 36 | # Handle cursor movement based on box_id 37 | if "box_id" in vlm_response_json: 38 | action_types_without_cursor = ["None", "key", "type", "scroll_down", "scroll_up", "cursor_position", "wait"] 39 | 40 | if vlm_response_json["box_id"] != -1 and vlm_response_json["next_action"] not in action_types_without_cursor: 41 | # Move cursor to the center of the identified element 42 | element = self.find_element_by_id(parsed_screen_result, vlm_response_json["box_id"]) 43 | bbox = element.coordinates 44 | box_centroid_coordinate = [ 45 | int((bbox[0] + bbox[2]) / 2), 46 | int((bbox[1] + bbox[3]) / 2) 47 | ] 48 | move_cursor_block = BetaToolUseBlock( 49 | id=f'toolu_{uuid.uuid4()}', 50 | input={'action': 'mouse_move', 'coordinate': box_centroid_coordinate}, 51 | name='computer', 52 | type='tool_use' 53 | ) 54 | response_content.append(move_cursor_block) 55 | 56 | elif vlm_response_json["box_id"] == -1 and len(vlm_response_json["coordinates"]) == 2: 57 | # Move cursor to specified coordinates 58 | move_cursor_block = BetaToolUseBlock( 59 | id=f'toolu_{uuid.uuid4()}', 60 | input={'action': 'mouse_move', 'coordinate': vlm_response_json["coordinates"]}, 61 | name='computer', 62 | type='tool_use' 63 | ) 64 | response_content.append(move_cursor_block) 65 | if vlm_response_json["next_action"] == "None": 66 | 
print("Task paused/completed.") 67 | elif vlm_response_json["next_action"] == "type": 68 | sim_content_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}', 69 | input={'action': vlm_response_json["next_action"], 'text': vlm_response_json["value"]}, 70 | name='computer', type='tool_use') 71 | response_content.append(sim_content_block) 72 | else: 73 | sim_content_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}', 74 | input={'action': vlm_response_json["next_action"]}, 75 | name='computer', type='tool_use') 76 | response_content.append(sim_content_block) 77 | response_message = BetaMessage(id=f'toolu_{uuid.uuid4()}', content=response_content, model='', role='assistant', type='message', stop_reason='tool_use', usage=BetaUsage(input_tokens=0, output_tokens=0)) 78 | return response_message, vlm_response_json 79 | 80 | def find_element_by_id(self, parsed_screen_result, box_id): 81 | for element in parsed_screen_result["parsed_content_list"]: 82 | if element.element_id == box_id: 83 | return element 84 | return None 85 | 86 | 87 | def create_dynamic_response_model(parsed_screen_result): 88 | available_box_ids = [item.element_id for item in parsed_screen_result['parsed_content_list']] 89 | available_box_ids.append(-1) 90 | task_run_agent_response = create_model( 91 | 'TaskRunAgentResponse', 92 | reasoning = (str, Field( 93 | description="描述当前屏幕上的内容,考虑历史记录,然后说出你要这么做的理由。" 94 | )), 95 | next_action = (str, Field( 96 | description="选择一个操作类型,如果找不到合适的操作,请选择None", 97 | json_schema_extra={ 98 | "enum": Action 99 | } 100 | )), 101 | box_id = (int, Field( 102 | description="要操作的框ID,如果框ID不存在就返回-1", 103 | json_schema_extra={ 104 | "enum": available_box_ids 105 | } 106 | )), 107 | coordinates = (list[int], Field( 108 | description="当 box_id 为-1时,直接返回要操作对象的坐标,只返回x,y这2个整数" 109 | )), 110 | value = (str, Field( 111 | description="仅当next_action为type时提供,否则为None" 112 | )), 113 | current_task_id = (int, Field( 114 | description="请判断一下,你正在完成第几个任务,第一个任务是0" 115 | )) 116 | ) 117 | return task_run_agent_response 118 | 119 | 120 | prompt = """ 121 | ### 目标 ### 122 | 你是一个任务执行者。请你根据屏幕截图和【所有元素】确定接下来要做什么,如果任务完成把next_action设置为None: 123 | 124 | 请根据以下任务列表判断一下你正在执行第几个任务(current_task_id),第一个任务是0,任务列表如下: 125 | {task_list} 126 | ########## 127 | 128 | ### 注意 ### 129 | - 要结合用户传入的屏幕图片观察其中的 box_id 框框和标号,确定要操作哪一个box_id,如果没有合适的请返回-1,然后通过coordinates给出要操作对象的坐标。 130 | - 每次应该只给出一个操作,告诉我要对哪个box_id进行操作、输入什么内容或者滚动或者其他操作。 131 | - 应该对当前屏幕进行分析,通过查看历史记录反思已完成的工作,然后描述您如何实现任务的逐步思考。 132 | - 避免连续多次选择相同的操作/元素,如果发生这种情况,反思自己,可能出了什么问题,并预测不同的操作。 133 | - 任务不是连续的,上一次是1下一次不一定是2,你要根据next_action进行判断。 134 | - current_task_id 要在任务列表中找到,不要随便写。 135 | - 当你觉得任务已经完成时,请一定把next_action设置为'None',不然会重复执行。 136 | - 涉及到输入type、key操作时,其上一步操作一定是点击输入框操作。 137 | 138 | ########## 139 | ### 输出格式 ### 140 | ```json 141 | {{ 142 | "reasoning": str, # 综合当前屏幕上的内容和历史记录,描述您是如何思考的。 143 | "next_action": str, # 要执行的动作。 144 | "box_id": int, # 要操作的框ID,当next_action为left_click、right_click、double_click、hover时提供,否则为None 145 | "value": "xxx" # 仅当操作为type时提供value字段,否则不包括value键 146 | "current_task_id": int # 当前正在执行第几个任务,第一个任务是0, 147 | "coordinates": list[int] # 仅当box_id为-1时提供,返回要操作对象的坐标,只返回x,y这2个整数 148 | }} 149 | ``` 150 | 151 | ########## 152 | ### 案例 ### 153 | 任务列表: 154 | 0. 打开浏览器 155 | 1. 搜索亚马逊 156 | 2. 
点击第一个搜索结果 157 | 158 | 一个例子: 159 | ```json 160 | {{ 161 | "reasoning": "当前屏幕显示亚马逊的谷歌搜索结果,在之前的操作中,我已经在谷歌上搜索了亚马逊。然后我需要点击第一个搜索结果以转到amazon.com。", 162 | "next_action": "left_click", 163 | "box_id": 35, 164 | "current_task_id": 0 165 | }} 166 | ``` 167 | 168 | 另一个例子: 169 | ```json 170 | {{ 171 | "reasoning": "当前屏幕显示亚马逊的首页。没有之前的操作。因此,我需要在搜索栏中输入"Apple watch"。", 172 | "next_action": "type", 173 | "box_id": 27, 174 | "value": "Apple watch", 175 | "current_task_id": 1 176 | }} 177 | ``` 178 | 179 | 另一个例子: 180 | ```json 181 | {{ 182 | "reasoning": "当前屏幕没有显示'提交'按钮,我需要向下滚动以查看按钮是否可用。", 183 | "next_action": "scroll_down", 184 | "current_task_id": 2 185 | }} 186 | """ 187 | 188 | -------------------------------------------------------------------------------- /auto_control/agent/vision_agent.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import cv2 3 | from ultralytics import YOLO 4 | import supervision as sv 5 | import numpy as np 6 | from pydantic import BaseModel 7 | 8 | class UIElement(BaseModel): 9 | element_id: int 10 | coordinates: list[float] 11 | 12 | class VisionAgent: 13 | def __init__(self, yolo_model_path: str): 14 | """ 15 | Initialize the vision agent 16 | 17 | Parameters: 18 | yolo_model_path: Path to YOLO model 19 | """ 20 | # determine the available device and the best dtype 21 | # load the YOLO model 22 | self.yolo_model = YOLO(yolo_model_path) 23 | 24 | self.elements: List[UIElement] = [] 25 | 26 | def __call__(self, image_path: str) -> List[UIElement]: 27 | """Process an image from file path.""" 28 | # image = self.load_image(image_source) 29 | image = cv2.imread(image_path) 30 | if image is None: 31 | raise FileNotFoundError(f"Vision agent: Failed to read image") 32 | return self.analyze_image(image) 33 | 34 | def _reset_state(self): 35 | """Clear previous analysis results""" 36 | self.elements = [] 37 | 38 | def analyze_image(self, image: np.ndarray) -> List[UIElement]: 39 | """ 40 | Process an image through all computer vision pipelines. 
41 | 42 | Args: 43 | image: Input image in BGR format (OpenCV default) 44 | 45 | Returns: 46 | List of detected UI elements with annotations 47 | """ 48 | self._reset_state() 49 | 50 | boxes = self._detect_objects(image) 51 | 52 | for idx in range(len(boxes)): 53 | new_element = UIElement(element_id=idx, 54 | coordinates=boxes[idx]) 55 | self.elements.append(new_element) 56 | 57 | return self.elements 58 | 59 | def _detect_objects(self, image: np.ndarray) -> tuple[list[np.ndarray], list]: 60 | """Run object detection pipeline""" 61 | results = self.yolo_model(image)[0] 62 | detections = sv.Detections.from_ultralytics(results) 63 | boxes = detections.xyxy 64 | 65 | if len(boxes) == 0: 66 | return [] 67 | 68 | # Filter out boxes contained by others 69 | areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 70 | sorted_indices = np.argsort(-areas) # Sort descending by area 71 | sorted_boxes = boxes[sorted_indices] 72 | 73 | keep_sorted = [] 74 | for i in range(len(sorted_boxes)): 75 | contained = False 76 | for j in keep_sorted: 77 | box_b = sorted_boxes[j] 78 | box_a = sorted_boxes[i] 79 | if (box_b[0] <= box_a[0] and box_b[1] <= box_a[1] and 80 | box_b[2] >= box_a[2] and box_b[3] >= box_a[3]): 81 | contained = True 82 | break 83 | if not contained: 84 | keep_sorted.append(i) 85 | 86 | # Map back to original indices 87 | keep_indices = sorted_indices[keep_sorted] 88 | filtered_boxes = boxes[keep_indices] 89 | return filtered_boxes 90 | 91 | 92 | -------------------------------------------------------------------------------- /auto_control/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | python app.py --windows_host_url localhost:8006 --omniparser_server_url localhost:8000 3 | """ 4 | 5 | import json 6 | import os 7 | from pathlib import Path 8 | import argparse 9 | import gradio as gr 10 | from auto_control.agent.vision_agent import VisionAgent 11 | from auto_control.loop import ( 12 | sampling_loop_sync, 13 | ) 14 | import base64 15 | from xbrain.utils.config import Config 16 | 17 | from util.download_weights import OMNI_PARSER_DIR 18 | CONFIG_DIR = Path("~/.anthropic").expanduser() 19 | API_KEY_FILE = CONFIG_DIR / "api_key" 20 | 21 | INTRO_TEXT = ''' 22 | Base on Omniparser to control desktop! 
23 | ''' 24 | 25 | def parse_arguments(): 26 | 27 | parser = argparse.ArgumentParser(description="Gradio App") 28 | parser.add_argument("--windows_host_url", type=str, default='localhost:8006') 29 | parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000") 30 | return parser.parse_args() 31 | args = parse_arguments() 32 | 33 | 34 | def setup_state(state): 35 | # 如果存在config,则从config中加载数据 36 | config = Config() 37 | if config.OPENAI_API_KEY: 38 | state["api_key"] = config.OPENAI_API_KEY 39 | else: 40 | state["api_key"] = "" 41 | if config.OPENAI_BASE_URL: 42 | state["base_url"] = config.OPENAI_BASE_URL 43 | else: 44 | state["base_url"] = "https://api.openai.com/v1" 45 | if config.OPENAI_MODEL: 46 | state["model"] = config.OPENAI_MODEL 47 | else: 48 | state["model"] = "gpt-4o" 49 | 50 | if "messages" not in state: 51 | state["messages"] = [] 52 | if "chatbox_messages" not in state: 53 | state["chatbox_messages"] = [] 54 | if "auth_validated" not in state: 55 | state["auth_validated"] = False 56 | if "responses" not in state: 57 | state["responses"] = {} 58 | if "tools" not in state: 59 | state["tools"] = {} 60 | if "tasks" not in state: 61 | state["tasks"] = [] 62 | if "only_n_most_recent_images" not in state: 63 | state["only_n_most_recent_images"] = 2 64 | if 'stop' not in state: 65 | state['stop'] = False 66 | # update state 67 | return ( 68 | state["model"], # model textbox 69 | state["base_url"], # base_url textbox 70 | state["api_key"], # api_key textbox 71 | state["chatbox_messages"], # chatbot 72 | [[task["status"], task["task"]] for task in state["tasks"]] # task_list 73 | ) 74 | 75 | def load_from_storage(filename: str) -> str | None: 76 | """Load data from a file in the storage directory.""" 77 | try: 78 | file_path = CONFIG_DIR / filename 79 | if file_path.exists(): 80 | data = file_path.read_text().strip() 81 | if data: 82 | return data 83 | except Exception as e: 84 | print(f"Debug: Error loading {filename}: {e}") 85 | return None 86 | 87 | def format_json_content(json_content): 88 | """Format JSON content with reasoning and details""" 89 | content_json = json.loads(json_content) 90 | reasoning = f'

{content_json["reasoning"]}

' 91 | details = f'
Detail
{json.dumps(content_json, indent=4, ensure_ascii=False)}
' 92 | return reasoning, details 93 | 94 | def format_message_content(content): 95 | """Format message content for gradio chatbox display""" 96 | # Handle list-type content (multimodal) 97 | if isinstance(content, list): 98 | formatted_content = "" 99 | json_reasoning = None 100 | 101 | for item in content: 102 | if item["type"] == "image_url": 103 | formatted_content += f'
' 104 | elif item["type"] == "text": 105 | if is_json_format(item["text"]): 106 | reasoning, details = format_json_content(item["text"]) 107 | json_reasoning = reasoning 108 | formatted_content += details 109 | else: 110 | formatted_content += item["text"] 111 | 112 | return formatted_content, json_reasoning 113 | 114 | # Handle string content 115 | if is_json_format(content): 116 | reasoning, _ = format_json_content(content) 117 | formatted_content = json.dumps(json.loads(content), indent=4, ensure_ascii=False) 118 | return formatted_content, reasoning 119 | 120 | return content, None 121 | 122 | def process_input(user_input, state, vision_agent_state): 123 | # Reset the stop flag 124 | if state["stop"]: 125 | state["stop"] = False 126 | 127 | # Configure API 128 | config = Config() 129 | config.set_openai_config(base_url=state["base_url"], api_key=state["api_key"], model=state["model"]) 130 | 131 | # Add user message 132 | state["messages"].append({"role": "user", "content": user_input}) 133 | state["chatbox_messages"].append({"role": "user", "content": user_input}) 134 | yield state["chatbox_messages"], [] 135 | # Process with agent 136 | agent = vision_agent_state["agent"] 137 | for _ in sampling_loop_sync( 138 | model=state["model"], 139 | messages=state["messages"], 140 | vision_agent=agent, 141 | screen_region=state.get("screen_region", None) 142 | ): 143 | if state["stop"]: 144 | state["chatbox_messages"].append({"role": "user", "content": "Stop !"}) 145 | return 146 | 147 | # task_plan_agent first response 148 | if len(state["messages"]) == 2: 149 | task_list = json.loads(state["messages"][-1]["content"])["task_list"] 150 | for task in task_list: 151 | state["tasks"].append({ 152 | "status": "⬜", 153 | "task": task 154 | }) 155 | else: 156 | # Reset all tasks to pending status 157 | for i in range(len(state["tasks"])): 158 | state["tasks"][i]["status"] = "⬜" 159 | task_completed_number = json.loads(state["messages"][-1]["content"])["current_task_id"] 160 | if task_completed_number > len(state["tasks"]) + 1: 161 | for i in range(len(state["tasks"])): 162 | state["tasks"][i]["status"] = "✅" 163 | else: 164 | for i in range(task_completed_number + 1): 165 | state["tasks"][i]["status"] = "✅" 166 | 167 | # Rebuild chatbox messages from the original messages 168 | state["chatbox_messages"] = [] 169 | 170 | for message in state["messages"]: 171 | formatted_content, json_reasoning = format_message_content(message["content"]) 172 | 173 | # Add json reasoning as a separate message if exists 174 | if json_reasoning: 175 | state["chatbox_messages"].append({ 176 | "role": message["role"], 177 | "content": json_reasoning 178 | }) 179 | 180 | # Add the formatted content 181 | state["chatbox_messages"].append({ 182 | "role": message["role"], 183 | "content": formatted_content 184 | }) 185 | 186 | # 在返回结果前转换数据格式 187 | tasks_2d = [[task["status"], task["task"]] for task in state["tasks"]] 188 | yield state["chatbox_messages"], tasks_2d 189 | 190 | def is_json_format(text): 191 | try: 192 | json.loads(text) 193 | return True 194 | except: 195 | return False 196 | 197 | def stop_app(state): 198 | state["stop"] = True 199 | return 200 | 201 | def get_header_image_base64(): 202 | try: 203 | # Get the absolute path to the image relative to this script 204 | script_dir = Path(__file__).parent 205 | image_path = script_dir.parent / "imgs" / "header_bar_thin.png" 206 | 207 | with open(image_path, "rb") as image_file: 208 | encoded_string = base64.b64encode(image_file.read()).decode() 209 | return 
f'data:image/png;base64,{encoded_string}' 210 | except Exception as e: 211 | print(f"Failed to load header image: {e}") 212 | return None 213 | 214 | 215 | def run(): 216 | with gr.Blocks(theme=gr.themes.Default()) as demo: 217 | gr.HTML(""" 218 | 229 | """) 230 | state = gr.State({}) 231 | 232 | setup_state(state.value) 233 | 234 | header_image = get_header_image_base64() 235 | if header_image: 236 | gr.HTML(f'autoMate Header', elem_classes="no-padding") 237 | gr.HTML('

autoMate

') 238 | else: 239 | gr.Markdown("# autoMate") 240 | 241 | if not os.getenv("HIDE_WARNING", False): 242 | gr.Markdown(INTRO_TEXT, elem_classes="markdown-text") 243 | 244 | with gr.Accordion("Settings", open=True): 245 | with gr.Row(): 246 | with gr.Column(): 247 | with gr.Row(): 248 | with gr.Column(): 249 | model = gr.Textbox( 250 | label="Model", 251 | value=state.value["model"], 252 | placeholder="Input model name", 253 | interactive=True, 254 | ) 255 | with gr.Column(): 256 | base_url = gr.Textbox( 257 | label="Base URL", 258 | value=state.value["base_url"], 259 | placeholder="input base url", 260 | interactive=True 261 | ) 262 | with gr.Row(): 263 | api_key = gr.Textbox( 264 | label="API Key", 265 | type="password", 266 | value=state.value["api_key"], 267 | placeholder="Paste your API key here", 268 | interactive=True, 269 | ) 270 | 271 | with gr.Column(): 272 | select_region_btn = gr.Button(value="Select Screen Region", variant="primary") 273 | def select_screen_region(state): 274 | from util.screen_selector import ScreenSelector 275 | region = ScreenSelector().get_selection() 276 | if region: 277 | state["screen_region"] = region 278 | return f"Selected region: {region}" 279 | return "Selection cancelled" 280 | 281 | select_region_btn.click(fn=select_screen_region, inputs=[state], outputs=[gr.Textbox(label="Region Info")]) 282 | with gr.Row(): 283 | with gr.Column(scale=8): 284 | chat_input = gr.Textbox(show_label=False, placeholder="Type a message to send to Omniparser + X ...", container=False) 285 | with gr.Column(scale=1, min_width=50): 286 | submit_button = gr.Button(value="Send", variant="primary") 287 | with gr.Column(scale=1, min_width=50): 288 | stop_button = gr.Button(value="Stop", variant="secondary") 289 | 290 | with gr.Row(): 291 | with gr.Column(scale=2): 292 | task_list = gr.Dataframe( 293 | headers=["status", "task"], 294 | datatype=["str", "str"], 295 | value=[], 296 | label="Task List", 297 | interactive=False) 298 | 299 | with gr.Column(scale=8): 300 | chatbot = gr.Chatbot( 301 | label="Chatbot History", 302 | autoscroll=True, 303 | height=580, 304 | type="messages") 305 | 306 | def update_model(model, state): 307 | state["model"] = model 308 | 309 | def update_api_key(api_key_value, state): 310 | state["api_key"] = api_key_value 311 | 312 | def update_base_url(base_url, state): 313 | state["base_url"] = base_url 314 | 315 | def clear_chat(state): 316 | # Reset message-related state 317 | state["messages"] = [] 318 | state["chatbox_messages"] = [] 319 | state["responses"] = {} 320 | state["tools"] = {} 321 | state["tasks"] = [] 322 | return state["chatbox_messages"] 323 | 324 | model.change(fn=update_model, inputs=[model, state], outputs=None) 325 | api_key.change(fn=update_api_key, inputs=[api_key, state], outputs=None) 326 | chatbot.clear(fn=clear_chat, inputs=[state], outputs=[chatbot]) 327 | vision_agent = VisionAgent(yolo_model_path=os.path.join(OMNI_PARSER_DIR, "icon_detect", "model.pt")) 328 | vision_agent_state = gr.State({"agent": vision_agent}) 329 | submit_button.click(process_input, [chat_input, state, vision_agent_state], [chatbot, task_list]) 330 | stop_button.click(stop_app, [state], None) 331 | base_url.change(fn=update_base_url, inputs=[base_url, state], outputs=None) 332 | 333 | demo.load( 334 | setup_state, 335 | inputs=[state], 336 | outputs=[model, base_url, api_key, chatbot, task_list] 337 | ) 338 | demo.launch(server_name="0.0.0.0", quiet=True, server_port=7888, prevent_thread_lock=True) 339 | 340 | BLUE = "\033[34m" 341 | BOLD = "\033[1m" 
342 | UNDERLINE = "\033[4m" 343 | RESET = "\033[0m" 344 | 345 | print(f"\n\n🚀 Server is running at: {BLUE}{BOLD}{UNDERLINE}http://127.0.0.1:7888{RESET}") 346 | 347 | import time 348 | try: 349 | while True: 350 | time.sleep(1) 351 | except KeyboardInterrupt: 352 | print("\nclosing server") 353 | -------------------------------------------------------------------------------- /auto_control/executor/anthropic_executor.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any, cast 3 | from anthropic.types.beta import ( 4 | BetaContentBlock 5 | ) 6 | from auto_control.tools import ComputerTool, ToolCollection 7 | 8 | 9 | class AnthropicExecutor: 10 | def __init__(self): 11 | self.tool_collection = ToolCollection( 12 | ComputerTool() 13 | ) 14 | 15 | def __call__(self, response, messages): 16 | tool_result_content: list[str] = [] 17 | for content_block in cast(list[BetaContentBlock], response.content): 18 | # Execute the tool 19 | if content_block.type == "tool_use": 20 | # Run the asynchronous tool execution in a synchronous context 21 | result = asyncio.run(self.tool_collection.run( 22 | name=content_block.name, 23 | tool_input=cast(dict[str, Any], content_block.input), 24 | )) 25 | tool_result_content.append( 26 | str(result) 27 | ) 28 | 29 | return tool_result_content 30 | -------------------------------------------------------------------------------- /auto_control/loop.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools. 3 | """ 4 | import base64 5 | from io import BytesIO 6 | import cv2 7 | from auto_control.agent.vision_agent import VisionAgent 8 | from auto_control.tools.screen_capture import get_screenshot 9 | from anthropic.types.beta import (BetaMessageParam) 10 | from auto_control.agent.task_plan_agent import TaskPlanAgent 11 | from auto_control.agent.task_run_agent import TaskRunAgent 12 | from auto_control.executor.anthropic_executor import AnthropicExecutor 13 | import numpy as np 14 | from PIL import Image 15 | 16 | OUTPUT_DIR = "./tmp/outputs" 17 | 18 | def sampling_loop_sync( 19 | *, 20 | model: str, 21 | messages: list[BetaMessageParam], 22 | vision_agent: VisionAgent, 23 | screen_region: tuple[int, int, int, int] 24 | ): 25 | """ 26 | Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
27 | """ 28 | print('in sampling_loop_sync, model:', model) 29 | task_plan_agent = TaskPlanAgent() 30 | executor = AnthropicExecutor() 31 | task_run_agent = TaskRunAgent() 32 | parsed_screen_result = parsed_screen(vision_agent, screen_region) 33 | task_plan_agent(messages=messages, parsed_screen_result=parsed_screen_result) 34 | yield 35 | while True: 36 | execute_result = execute_task_plan(vision_agent, task_run_agent, executor, messages, screen_region) 37 | if execute_result['next_action'] == 'None': 38 | break 39 | yield 40 | 41 | 42 | def execute_task_plan(vision_agent, task_run_agent, executor, messages, screen_region): 43 | parsed_screen_result = parsed_screen(vision_agent, screen_region) 44 | tools_use_needed, vlm_response_json = task_run_agent(parsed_screen_result=parsed_screen_result, messages=messages) 45 | executor(tools_use_needed, messages) 46 | return vlm_response_json 47 | 48 | def parsed_screen(vision_agent: VisionAgent, screen_region: tuple[int, int, int, int] = None): 49 | screenshot, screenshot_path = get_screenshot(screen_region) 50 | response_json = {} 51 | response_json['parsed_content_list'] = vision_agent(str(screenshot_path)) 52 | response_json['width'] = screenshot.size[0] 53 | response_json['height'] = screenshot.size[1] 54 | response_json['image'] = draw_elements(screenshot, response_json['parsed_content_list']) 55 | buffered = BytesIO() 56 | response_json['image'].save(buffered, format="PNG") 57 | response_json['base64_image'] = base64.b64encode(buffered.getvalue()).decode("utf-8") 58 | return response_json 59 | 60 | def draw_elements(screenshot, parsed_content_list): 61 | """ 62 | Convert PIL image to OpenCV compatible format and draw bounding boxes 63 | 64 | Args: 65 | screenshot: PIL Image object 66 | parsed_content_list: list containing bounding box information 67 | 68 | Returns: 69 | PIL image with drawn bounding boxes 70 | """ 71 | # convert PIL image to opencv format 72 | opencv_image = np.array(screenshot) 73 | opencv_image = cv2.cvtColor(opencv_image, cv2.COLOR_RGB2BGR) 74 | # draw bounding boxes 75 | for element in parsed_content_list: 76 | bbox = element.coordinates 77 | x1, y1, x2, y2 = bbox 78 | # convert coordinates to integers 79 | x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 80 | # Generate unique color for each element (using element_id as seed) 81 | def get_distinct_color(element_id): 82 | import hashlib 83 | # Use id to generate unique but consistent color 84 | hash_value = int(hashlib.md5(str(element_id).encode()).hexdigest(), 16) 85 | r = (hash_value & 0xFF0000) >> 16 86 | g = (hash_value & 0x00FF00) >> 8 87 | b = hash_value & 0x0000FF 88 | return (r, g, b) 89 | 90 | # Use semi-transparent effect and unique color when drawing rectangle 91 | color = get_distinct_color(element.element_id) 92 | # Draw semi-transparent rectangle (assuming there's original rectangle drawing code) 93 | cv2.rectangle(opencv_image, (x1, y1), (x2, y2), color, 1) # Reduce thickness from 2 to 1 94 | 95 | # Calculate the size of the bounding box 96 | box_width = x2 - x1 97 | box_height = y2 - y1 98 | 99 | # Dynamically adjust font size based on box size 100 | # Smaller boxes get smaller text 101 | base_font_size = 0.5 102 | min_dimension = min(box_width, box_height) 103 | if min_dimension < 30: 104 | font_size = max(0.3, base_font_size * min_dimension / 30) 105 | else: 106 | font_size = base_font_size 107 | 108 | text = str(element.element_id) 109 | (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_size, 1) 110 | 111 | # 
Position text at the top-left corner with small padding 112 | text_x = x1 + 2 113 | text_y = y1 + text_height + 2 114 | 115 | # Create transparent overlay for text background (alpha blending) 116 | overlay = opencv_image.copy() 117 | cv2.rectangle(overlay, 118 | (text_x - 2, text_y - text_height - 2), 119 | (text_x + text_width + 2, text_y + 2), 120 | (0, 0, 0), -1) 121 | 122 | # Apply transparency (alpha value: 0.5) 123 | alpha = 0.5 124 | cv2.addWeighted(overlay, alpha, opencv_image, 1 - alpha, 0, opencv_image) 125 | 126 | # Place text at the top-left corner of the box 127 | cv2.putText(opencv_image, text, 128 | (text_x, text_y), 129 | cv2.FONT_HERSHEY_SIMPLEX, font_size, color, 1) 130 | 131 | # convert opencv image format back to PIL format 132 | opencv_image = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2RGB) 133 | pil_image = Image.fromarray(opencv_image) 134 | 135 | return pil_image 136 | 137 | -------------------------------------------------------------------------------- /auto_control/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import ToolResult 2 | from .collection import ToolCollection 3 | from .computer import ComputerTool 4 | from .screen_capture import get_screenshot 5 | 6 | __ALL__ = [ 7 | ComputerTool, 8 | ToolCollection, 9 | ToolResult, 10 | get_screenshot, 11 | ] 12 | -------------------------------------------------------------------------------- /auto_control/tools/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from dataclasses import dataclass, fields, replace 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | 8 | class BaseAnthropicTool(metaclass=ABCMeta): 9 | """Abstract base class for Anthropic-defined tools.""" 10 | 11 | @abstractmethod 12 | def __call__(self, **kwargs) -> Any: 13 | """Executes the tool with the given arguments.""" 14 | ... 
15 | 16 | @abstractmethod 17 | def to_params( 18 | self, 19 | ) -> BetaToolUnionParam: 20 | raise NotImplementedError 21 | 22 | 23 | @dataclass(kw_only=True, frozen=True) 24 | class ToolResult: 25 | """Represents the result of a tool execution.""" 26 | 27 | output: str | None = None 28 | error: str | None = None 29 | base64_image: str | None = None 30 | system: str | None = None 31 | 32 | def __bool__(self): 33 | return any(getattr(self, field.name) for field in fields(self)) 34 | 35 | def __add__(self, other: "ToolResult"): 36 | def combine_fields( 37 | field: str | None, other_field: str | None, concatenate: bool = True 38 | ): 39 | if field and other_field: 40 | if concatenate: 41 | return field + other_field 42 | raise ValueError("Cannot combine tool results") 43 | return field or other_field 44 | 45 | return ToolResult( 46 | output=combine_fields(self.output, other.output), 47 | error=combine_fields(self.error, other.error), 48 | base64_image=combine_fields(self.base64_image, other.base64_image, False), 49 | system=combine_fields(self.system, other.system), 50 | ) 51 | 52 | def replace(self, **kwargs): 53 | """Returns a new ToolResult with the given fields replaced.""" 54 | return replace(self, **kwargs) 55 | 56 | 57 | class ToolFailure(ToolResult): 58 | """A ToolResult that represents a failure.""" 59 | 60 | 61 | class ToolError(Exception): 62 | """Raised when a tool encounters an error.""" 63 | 64 | def __init__(self, message): 65 | self.message = message 66 | -------------------------------------------------------------------------------- /auto_control/tools/collection.py: -------------------------------------------------------------------------------- 1 | """Collection classes for managing multiple tools.""" 2 | 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | from .base import ( 8 | BaseAnthropicTool, 9 | ToolError, 10 | ToolFailure, 11 | ToolResult, 12 | ) 13 | 14 | 15 | class ToolCollection: 16 | """A collection of anthropic-defined tools.""" 17 | 18 | def __init__(self, *tools: BaseAnthropicTool): 19 | self.tools = tools 20 | self.tool_map = {tool.to_params()["name"]: tool for tool in tools} 21 | 22 | def to_params( 23 | self, 24 | ) -> list[BetaToolUnionParam]: 25 | return [tool.to_params() for tool in self.tools] 26 | 27 | async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult: 28 | tool = self.tool_map.get(name) 29 | if not tool: 30 | return ToolFailure(error=f"Tool {name} is invalid") 31 | try: 32 | return await tool(**tool_input) 33 | except ToolError as e: 34 | return ToolFailure(error=e.message) 35 | -------------------------------------------------------------------------------- /auto_control/tools/computer.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import time 3 | from typing import Literal, TypedDict 4 | from PIL import Image 5 | from anthropic.types.beta import BetaToolComputerUse20241022Param 6 | from .base import BaseAnthropicTool, ToolError, ToolResult 7 | from .screen_capture import get_screenshot 8 | import pyautogui 9 | import pyperclip 10 | import platform 11 | 12 | OUTPUT_DIR = "./tmp/outputs" 13 | TYPING_DELAY_MS = 12 14 | TYPING_GROUP_SIZE = 50 15 | 16 | 17 | Action = [ 18 | "key", 19 | "type", 20 | "mouse_move", 21 | "left_click", 22 | "left_click_drag", 23 | "right_click", 24 | "middle_click", 25 | "double_click", 26 | "cursor_position", 27 | "hover", 28 | "wait", 29 | "scroll_up", 30 | "scroll_down", 31 | "None" 32 | ] 33 | 
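# Usage sketch (illustrative): the strings in Action above are the "next_action" values
# the agents may emit; ComputerTool.__call__ below dispatches on them, for example:
#   await ComputerTool()(action="mouse_move", coordinate=(100, 200))
#   await ComputerTool()(action="type", text="hello world")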
34 | class Resolution(TypedDict): 35 | width: int 36 | height: int 37 | 38 | MAX_SCALING_TARGETS: dict[str, Resolution] = { 39 | "XGA": Resolution(width=1024, height=768), # 4:3 40 | "WXGA": Resolution(width=1280, height=800), # 16:10 41 | "FWXGA": Resolution(width=1366, height=768), # ~16:9 42 | } 43 | 44 | class ComputerToolOptions(TypedDict): 45 | display_height_px: int 46 | display_width_px: int 47 | display_number: int | None 48 | 49 | def chunks(s: str, chunk_size: int) -> list[str]: 50 | return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)] 51 | 52 | class ComputerTool(BaseAnthropicTool): 53 | """ 54 | A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer. 55 | 56 | Adapted for Windows using 'pyautogui'. 57 | """ 58 | name: Literal["computer"] = "computer" 59 | api_type: Literal["computer_20241022"] = "computer_20241022" 60 | width: int 61 | height: int 62 | display_num: int | None 63 | _screenshot_delay = 2.0 64 | 65 | @property 66 | def options(self) -> ComputerToolOptions: 67 | return { 68 | "display_width_px": self.width, 69 | "display_height_px": self.height, 70 | "display_number": self.display_num, 71 | } 72 | 73 | def to_params(self) -> BetaToolComputerUse20241022Param: 74 | return {"name": self.name, "type": self.api_type, **self.options} 75 | 76 | 77 | def __init__(self): 78 | super().__init__() 79 | self.display_num = None 80 | self.offset_x = 0 81 | self.offset_y = 0 82 | self.width, self.height = pyautogui.size() 83 | self.key_conversion = {"Page_Down": "pagedown", 84 | "Page_Up": "pageup", 85 | "Super_L": "win", 86 | "Escape": "esc"} 87 | async def __call__( 88 | self, 89 | *, 90 | action, 91 | text: str | None = None, 92 | coordinate: tuple[int, int] | None = None, 93 | **kwargs, 94 | ): 95 | print(f"action: {action}, text: {text}, coordinate: {coordinate},") 96 | if action in ("mouse_move", "left_click_drag"): 97 | if coordinate is None: 98 | raise ToolError(f"coordinate is required for {action}") 99 | if text is not None: 100 | raise ToolError(f"text is not accepted for {action}") 101 | if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2: 102 | raise ToolError(f"{coordinate} must be a tuple of length 2") 103 | # if not all(isinstance(i, int) and i >= 0 for i in coordinate): 104 | if not all(isinstance(i, int) for i in coordinate): 105 | raise ToolError(f"{coordinate} must be a tuple of non-negative ints") 106 | 107 | x, y = coordinate 108 | print(f"mouse move to {x}, {y}") 109 | if action == "mouse_move": 110 | pyautogui.moveTo(x, y) 111 | return ToolResult(output=f"Moved mouse to ({x}, {y})") 112 | elif action == "left_click_drag": 113 | current_x, current_y = pyautogui.position() 114 | pyautogui.dragTo(x, y, duration=0.5) 115 | return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})") 116 | if action in ("key", "type"): 117 | if text is None: 118 | raise ToolError(f"text is required for {action}") 119 | if coordinate is not None: 120 | raise ToolError(f"coordinate is not accepted for {action}") 121 | if not isinstance(text, str): 122 | raise ToolError(output=f"{text} must be a string") 123 | if action == "key": 124 | # Handle key combinations 125 | keys = text.split('+') 126 | for key in keys: 127 | key = self.key_conversion.get(key.strip(), key.strip()) 128 | key = key.lower() 129 | pyautogui.keyDown(key) 130 | for key in reversed(keys): 131 | key = self.key_conversion.get(key.strip(), key.strip()) 132 | key = key.lower() 133 | pyautogui.keyUp(key) 134 | 
return ToolResult(output=f"Pressed keys: {text}") 135 | elif action == "type": 136 | # default click before type TODO: check if this is needed 137 | # Save user's old clipboard 138 | clipboard_data = pyperclip.paste() 139 | pyperclip.copy(text) 140 | if platform.system() == 'Darwin': 141 | pyautogui.hotkey('command', 'v', interval=0.1) 142 | else: # TODO: double check what works on windows 143 | pyautogui.hotkey('ctrl', 'v') 144 | # Copy old data back to clipboard 145 | pyperclip.copy(clipboard_data) 146 | return ToolResult(output=text) 147 | if action in ( 148 | "left_click", 149 | "right_click", 150 | "double_click", 151 | "middle_click", 152 | "cursor_position", 153 | "left_press", 154 | ): 155 | if text is not None: 156 | raise ToolError(f"text is not accepted for {action}") 157 | if coordinate is not None: 158 | raise ToolError(f"coordinate is not accepted for {action}") 159 | elif action == "cursor_position": 160 | x, y = pyautogui.position() 161 | # 直接返回原始坐标,不进行缩放 162 | return ToolResult(output=f"X={x},Y={y}") 163 | else: 164 | if action == "left_click": 165 | pyautogui.click() 166 | elif action == "right_click": 167 | pyautogui.rightClick() 168 | # 等待5秒,等待菜单弹出 169 | time.sleep(5) 170 | elif action == "middle_click": 171 | pyautogui.middleClick() 172 | elif action == "double_click": 173 | pyautogui.doubleClick() 174 | elif action == "left_press": 175 | pyautogui.mouseDown() 176 | time.sleep(1) 177 | pyautogui.mouseUp() 178 | return ToolResult(output=f"Performed {action}") 179 | if action in ("scroll_up", "scroll_down"): 180 | if action == "scroll_up": 181 | pyautogui.scroll(100) 182 | elif action == "scroll_down": 183 | pyautogui.scroll(-100) 184 | return ToolResult(output=f"Performed {action}") 185 | if action == "hover": 186 | return ToolResult(output=f"Performed {action}") 187 | if action == "wait": 188 | time.sleep(1) 189 | return ToolResult(output=f"Performed {action}") 190 | raise ToolError(f"Invalid action: {action}") 191 | 192 | def padding_image(self, screenshot): 193 | """Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10.""" 194 | _, height = screenshot.size 195 | new_width = height * 16 // 10 196 | 197 | padding_image = Image.new("RGB", (new_width, height), (255, 255, 255)) 198 | # padding to top left 199 | padding_image.paste(screenshot, (0, 0)) 200 | return padding_image 201 | -------------------------------------------------------------------------------- /auto_control/tools/screen_capture.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from pathlib import Path 3 | from uuid import uuid4 4 | from PIL import Image 5 | import pyautogui 6 | from .base import ToolError 7 | from util import tool 8 | 9 | OUTPUT_DIR = "./tmp/outputs" 10 | 11 | def get_screenshot(screen_region=None, is_cursor=True): 12 | output_dir = Path(OUTPUT_DIR) 13 | output_dir.mkdir(parents=True, exist_ok=True) 14 | path = output_dir / f"screenshot_{uuid4().hex}.png" 15 | try: 16 | if is_cursor: 17 | img_io = tool.capture_screen_with_cursor() 18 | else: 19 | pyautogui_screenshot = pyautogui.screenshot() 20 | img_io = BytesIO() 21 | pyautogui_screenshot.save(img_io, 'PNG') 22 | screenshot = Image.open(img_io) 23 | 24 | # Create a black mask of the same size 25 | # If screen_region is provided and valid, copy only that region 26 | if screen_region and len(screen_region) == 4: 27 | black_mask = Image.new("RGBA", screenshot.size, (0, 0, 0, 255)) 28 | x1, y1, x2, y2 = screen_region 29 | region = screenshot.crop((x1, 
y1, x2, y2)) 30 | # Paste the region onto the black mask 31 | black_mask.paste(region, (x1, y1, x2, y2)) 32 | # Use the modified image as screenshot 33 | screenshot = black_mask 34 | screenshot.save(path) 35 | return screenshot, path 36 | except Exception as e: 37 | raise ToolError(f"Failed to capture screenshot: {str(e)}") -------------------------------------------------------------------------------- /imgs/autoMate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/imgs/autoMate.png -------------------------------------------------------------------------------- /imgs/cursor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/imgs/cursor.png -------------------------------------------------------------------------------- /imgs/gradioicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/imgs/gradioicon.png -------------------------------------------------------------------------------- /imgs/header_bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/imgs/header_bar.png -------------------------------------------------------------------------------- /imgs/header_bar_thin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/imgs/header_bar_thin.png -------------------------------------------------------------------------------- /imgs/knowledge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/imgs/knowledge.png -------------------------------------------------------------------------------- /imgs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/imgs/logo.png -------------------------------------------------------------------------------- /imgs/omniboxicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/imgs/omniboxicon.png -------------------------------------------------------------------------------- /imgs/omniparsericon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuruotong1/autoMate/bcd53ea749651a4819bd05d6cfc35f76a0e580c8/imgs/omniparsericon.png -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | from util import download_weights 4 | import urllib.request 5 | import urllib.error 6 | 7 | def install_requirements(): 8 | # Check if Google is accessible 9 | try: 10 | # Try to connect to Google with a timeout of 3 seconds 11 | urllib.request.urlopen('https://www.google.com', timeout=3) 12 | # If successful, install 
normally 13 | subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt']) 14 | except (urllib.error.URLError, TimeoutError): 15 | print("Using Tsinghua mirror") 16 | subprocess.run([ 17 | sys.executable, '-m', 'pip', 'install', 18 | '-r', 'requirements.txt', 19 | '-i', 'https://pypi.tuna.tsinghua.edu.cn/simple' 20 | ]) 21 | 22 | 23 | def adjust_python_env(): 24 | # check if python is 3.12 25 | if sys.version_info.major != 3 or sys.version_info.minor != 12: 26 | print("Python version is not 3.12, please install python 3.12") 27 | exit(1) 28 | 29 | def install(): 30 | adjust_python_env() 31 | install_requirements() 32 | # download the weight files 33 | download_weights.download() 34 | print("Installation complete!") 35 | 36 | if __name__ == "__main__": 37 | install() 38 | print("Installation complete!") -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # 导入主界面模块 2 | from ui.main import main 3 | # 导入权重下载工具 4 | from util import download_weights 5 | 6 | def run(): 7 | # 下载必要的模型权重文件 8 | download_weights.download() 9 | # 启动主界面 10 | main() 11 | 12 | # 当脚本直接运行时执行run函数 13 | if __name__ == "__main__": 14 | run() 15 | 16 | -------------------------------------------------------------------------------- /main.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8 -*- 2 | from PyInstaller.utils.hooks import collect_data_files 3 | import os 4 | import shutil 5 | 6 | datas = [] 7 | datas += collect_data_files('gradio_client') 8 | datas += collect_data_files('gradio') 9 | datas += collect_data_files('safehttpx') 10 | datas += collect_data_files('groovy') 11 | base_path = os.path.dirname(os.path.abspath('__file__')) 12 | imgs_path = os.path.join(base_path, 'imgs') 13 | datas += [(imgs_path, 'imgs')] 14 | 15 | a = Analysis( 16 | ['main.py'], 17 | pathex=[], 18 | binaries=[], 19 | datas=datas, 20 | hiddenimports=[], 21 | hookspath=[], 22 | hooksconfig={}, 23 | runtime_hooks=[], 24 | excludes=[], 25 | noarchive=False, 26 | optimize=0, 27 | module_collection_mode={ 28 | 'gradio': 'py', 29 | }, 30 | ) 31 | pyz = PYZ(a.pure) 32 | 33 | exe = EXE( 34 | pyz, 35 | a.scripts, 36 | [], 37 | exclude_binaries=True, 38 | name='main', 39 | debug=False, 40 | bootloader_ignore_signals=False, 41 | strip=False, 42 | upx=True, 43 | console=True, 44 | disable_windowed_traceback=False, 45 | argv_emulation=False, 46 | target_arch=None, 47 | codesign_identity=None, 48 | entitlements_file=None, 49 | ) 50 | coll = COLLECT( 51 | exe, 52 | a.binaries, 53 | a.datas, 54 | strip=False, 55 | upx=True, 56 | upx_exclude=[], 57 | name='main', 58 | ) 59 | 60 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # torch 2 | # torchvision 3 | # easyocr 4 | supervision==0.18.0 5 | # transformers 6 | ultralytics==8.3.70 7 | numpy==1.26.4 8 | gradio 9 | pyautogui==0.9.54 10 | anthropic[bedrock,vertex]>=0.37.1 11 | pyxbrain==1.1.31 12 | timm 13 | einops==0.8.0 14 | modelscope 15 | pynput 16 | lap 17 | pyqt6==6.8.1 18 | keyboard==0.13.5 -------------------------------------------------------------------------------- /ui/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | autoMate UI package 3 | """ 
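
A minimal bootstrap sketch (not a file in this repository) tying together install.py, util/download_weights.py and ui/main.py shown above; the import paths and call order mirror what install.py and main.py already do, and nothing here is assumed beyond those files:

    # bootstrap_sketch.py -- hypothetical helper, for illustration only
    from install import adjust_python_env, install_requirements
    from util import download_weights
    from ui.main import main

    if __name__ == "__main__":
        adjust_python_env()          # exits unless the interpreter is Python 3.12, as install.py requires
        install_requirements()       # pip install -r requirements.txt, falling back to the Tsinghua mirror
        download_weights.download()  # fetches icon_detect/model.pt from ModelScope into ./weights
        main()                       # starts the PyQt6 main window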
-------------------------------------------------------------------------------- /ui/agent_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Worker thread for handling agent operations 3 | """ 4 | import json 5 | from PyQt6.QtCore import QThread, pyqtSignal 6 | 7 | from auto_control.loop import sampling_loop_sync 8 | from xbrain.utils.config import Config 9 | 10 | class AgentWorker(QThread): 11 | """Worker thread for running agent operations asynchronously""" 12 | 13 | update_signal = pyqtSignal(list, list) 14 | status_signal = pyqtSignal(str) # Signal for status updates 15 | task_signal = pyqtSignal(str) # Signal for current task 16 | error_signal = pyqtSignal(str) # Error signal 17 | 18 | def __init__(self, user_input, state, vision_agent): 19 | super().__init__() 20 | self.user_input = user_input 21 | self.state = state 22 | self.vision_agent = vision_agent 23 | 24 | def run(self): 25 | # Reset stop flag 26 | if self.state["stop"]: 27 | self.state["stop"] = False 28 | 29 | # Configure API 30 | config = Config() 31 | config.set_openai_config( 32 | base_url=self.state["base_url"], 33 | api_key=self.state["api_key"], 34 | model=self.state["model"] 35 | ) 36 | 37 | # Add user message 38 | self.state["messages"].append({"role": "user", "content": self.user_input}) 39 | self.state["chatbox_messages"].append({"role": "user", "content": self.user_input}) 40 | 41 | # Send initial update 42 | self.update_signal.emit(self.state["chatbox_messages"], []) 43 | self.status_signal.emit("Starting analysis...") 44 | 45 | try: 46 | # Process with agent 47 | loop_iterator = sampling_loop_sync( 48 | model=self.state["model"], 49 | messages=self.state["messages"], 50 | vision_agent=self.vision_agent, 51 | screen_region=self.state.get("screen_region", None) 52 | ) 53 | 54 | for _ in loop_iterator: 55 | # 首先检查停止标志,如果停止则立即退出循环 56 | if self.state["stop"]: 57 | # 添加停止消息 58 | self.state["chatbox_messages"].append({"role": "assistant", "content": "⚠️ 操作已被用户停止"}) 59 | self.status_signal.emit("操作已被用户停止") 60 | # 更新UI 61 | self.update_signal.emit(self.state["chatbox_messages"], 62 | [[task["status"], task["task"]] for task in self.state["tasks"]]) 63 | # 立即返回,不再继续处理 64 | return 65 | 66 | # task_plan_agent first response 67 | if len(self.state["messages"]) == 2: 68 | task_list = json.loads(self.state["messages"][-1]["content"])["task_list"] 69 | for task in task_list: 70 | self.state["tasks"].append({ 71 | "status": "⬜", 72 | "task": task 73 | }) 74 | else: 75 | # Reset all task statuses 76 | for i in range(len(self.state["tasks"])): 77 | self.state["tasks"][i]["status"] = "⬜" 78 | 79 | # Update task progress 80 | content_json = json.loads(self.state["messages"][-1]["content"]) 81 | if "current_task_id" in content_json: 82 | task_completed_number = content_json["current_task_id"] 83 | else: 84 | task_completed_number = 0 85 | 86 | # Update status with reasoning 87 | if "reasoning" in content_json: 88 | self.status_signal.emit(content_json["reasoning"]) 89 | 90 | # Update current task 91 | if task_completed_number < len(self.state["tasks"]): 92 | current_task = self.state["tasks"][task_completed_number]["task"] 93 | self.task_signal.emit(current_task) 94 | 95 | if task_completed_number > len(self.state["tasks"]) + 1: 96 | for i in range(len(self.state["tasks"])): 97 | self.state["tasks"][i]["status"] = "✅" 98 | else: 99 | for i in range(task_completed_number + 1): 100 | self.state["tasks"][i]["status"] = "✅" 101 | 102 | # Check stop flag again 103 | if 
self.state["stop"]: 104 | self.state["chatbox_messages"].append({"role": "assistant", "content": "⚠️ Operation stopped by user"}) 105 | self.status_signal.emit("Operation stopped by user") 106 | self.update_signal.emit(self.state["chatbox_messages"], 107 | [[task["status"], task["task"]] for task in self.state["tasks"]]) 108 | return 109 | 110 | # Reconstruct chat messages from original messages 111 | self.state["chatbox_messages"] = [] 112 | 113 | for message in self.state["messages"]: 114 | formatted_content, json_reasoning = self.format_message_content(message["content"]) 115 | 116 | # Add json reasoning as a separate message if exists 117 | if json_reasoning: 118 | self.state["chatbox_messages"].append({ 119 | "role": message["role"], 120 | "content": json_reasoning 121 | }) 122 | 123 | # Add formatted content 124 | self.state["chatbox_messages"].append({ 125 | "role": message["role"], 126 | "content": formatted_content 127 | }) 128 | 129 | # Convert data format before returning results 130 | tasks_2d = [[task["status"], task["task"]] for task in self.state["tasks"]] 131 | self.update_signal.emit(self.state["chatbox_messages"], tasks_2d) 132 | 133 | # All done 134 | self.status_signal.emit("Task completed") 135 | 136 | except Exception as e: 137 | # Send error signal 138 | import traceback 139 | error_message = f"Error occurred: {str(e)}\n{traceback.format_exc()}" 140 | print(error_message) 141 | 142 | # Add error message to chat 143 | self.state["chatbox_messages"].append({ 144 | "role": "assistant", 145 | "content": f"⚠️ Network connection error: {str(e)}
Please check your network connection and API settings, or try again later." 146 | }) 147 | self.update_signal.emit(self.state["chatbox_messages"], 148 | [[task["status"], task["task"]] for task in self.state["tasks"]]) 149 | self.error_signal.emit(str(e)) 150 | self.status_signal.emit(f"Error: {str(e)}") 151 | 152 | def format_message_content(self, content): 153 | """Format message content for display""" 154 | # Handle list-type content (multimodal) 155 | if isinstance(content, list): 156 | formatted_content = "" 157 | json_reasoning = None 158 | 159 | for item in content: 160 | if item["type"] == "image_url": 161 | # Changed image style to be smaller 162 | formatted_content += f'
' 163 | elif item["type"] == "text": 164 | if self.is_json_format(item["text"]): 165 | reasoning, details = self.format_json_content(item["text"]) 166 | json_reasoning = reasoning 167 | formatted_content += details 168 | else: 169 | formatted_content += item["text"] 170 | 171 | return formatted_content, json_reasoning 172 | 173 | # Handle string content 174 | if self.is_json_format(content): 175 | reasoning, _ = self.format_json_content(content) 176 | formatted_content = json.dumps(json.loads(content), indent=4, ensure_ascii=False) 177 | return formatted_content, reasoning 178 | 179 | return content, None 180 | 181 | def format_json_content(self, json_content): 182 | """Format JSON content with reasoning and details""" 183 | content_json = json.loads(json_content) 184 | reasoning = f'
{content_json["reasoning"]} ' 185 | details = f' Detail {json.dumps(content_json, indent=4, ensure_ascii=False)}
' 186 | return reasoning, details 187 | 188 | def is_json_format(self, text): 189 | try: 190 | json.loads(text) 191 | return True 192 | except: 193 | return False -------------------------------------------------------------------------------- /ui/hotkey_edit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Hotkey editing widget 3 | """ 4 | import keyboard 5 | from PyQt6.QtWidgets import QWidget, QHBoxLayout, QLineEdit, QPushButton 6 | 7 | # Default stop hotkey 8 | DEFAULT_STOP_HOTKEY = "alt+f3" 9 | 10 | class HotkeyEdit(QWidget): 11 | """Widget for recording hotkey combinations""" 12 | 13 | def __init__(self, hotkey="", parent=None): 14 | super().__init__(parent) 15 | layout = QHBoxLayout(self) 16 | layout.setContentsMargins(0, 0, 0, 0) 17 | 18 | self.hotkey_input = QLineEdit(hotkey) 19 | self.hotkey_input.setReadOnly(True) 20 | self.hotkey_input.setPlaceholderText("Click to record hotkey") 21 | 22 | self.record_btn = QPushButton("Record") 23 | self.record_btn.clicked.connect(self.start_recording) 24 | 25 | layout.addWidget(self.hotkey_input, 1) 26 | layout.addWidget(self.record_btn) 27 | 28 | self.recording = False 29 | self.keys_pressed = set() 30 | 31 | def start_recording(self): 32 | """Start recording a new hotkey""" 33 | if self.recording: 34 | self.stop_recording() 35 | return 36 | 37 | self.hotkey_input.setText("Press keys...") 38 | self.record_btn.setText("Stop") 39 | self.recording = True 40 | self.keys_pressed = set() 41 | 42 | # Hook global events 43 | keyboard.hook(self.on_key_event) 44 | 45 | def stop_recording(self): 46 | """Stop recording and set the hotkey""" 47 | keyboard.unhook(self.on_key_event) 48 | self.recording = False 49 | self.record_btn.setText("Record") 50 | 51 | # Convert keys to hotkey string 52 | if self.keys_pressed: 53 | hotkey = '+'.join(sorted(self.keys_pressed)) 54 | self.hotkey_input.setText(hotkey) 55 | else: 56 | self.hotkey_input.setText("") 57 | 58 | def on_key_event(self, event): 59 | """Handle key events during recording""" 60 | if not self.recording: 61 | return 62 | 63 | # Skip key up events 64 | if not event.event_type == keyboard.KEY_DOWN: 65 | return 66 | 67 | # Get key name 68 | key_name = event.name.lower() 69 | 70 | # Special handling for modifier keys 71 | if key_name in ['ctrl', 'alt', 'shift', 'windows']: 72 | self.keys_pressed.add(key_name) 73 | else: 74 | self.keys_pressed.add(key_name) 75 | 76 | # Show current keys 77 | self.hotkey_input.setText('+'.join(sorted(self.keys_pressed))) 78 | 79 | # Stop recording if user presses Escape alone 80 | if len(self.keys_pressed) == 1 and 'esc' in self.keys_pressed: 81 | self.keys_pressed.clear() 82 | self.stop_recording() 83 | 84 | def get_hotkey(self): 85 | """Get the current hotkey string""" 86 | return self.hotkey_input.text() 87 | 88 | def set_hotkey(self, hotkey): 89 | """Set the hotkey string""" 90 | self.hotkey_input.setText(hotkey) -------------------------------------------------------------------------------- /ui/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main entry point for autoMate application 3 | """ 4 | import sys 5 | import argparse 6 | from PyQt6.QtWidgets import QApplication 7 | from ui.main_window import MainWindow 8 | 9 | def parse_arguments(): 10 | """Parse command line arguments""" 11 | parser = argparse.ArgumentParser(description="PyQt6 App") 12 | parser.add_argument("--windows_host_url", type=str, default='localhost:8006') 13 | 
parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000") 14 | return parser.parse_args() 15 | 16 | def main(): 17 | """Main application entry point""" 18 | args = parse_arguments() 19 | app = QApplication(sys.argv) 20 | window = MainWindow(args) 21 | window.show() 22 | sys.exit(app.exec()) 23 | 24 | if __name__ == "__main__": 25 | main() -------------------------------------------------------------------------------- /ui/main_window.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main application window 3 | """ 4 | import os 5 | import keyboard 6 | from pathlib import Path 7 | from PyQt6.QtWidgets import (QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, 8 | QLabel, QLineEdit, QPushButton, QTableWidget, QTableWidgetItem, 9 | QTextEdit, QSplitter, QMessageBox, QHeaderView, QDialog, QSystemTrayIcon) 10 | from PyQt6.QtCore import Qt, pyqtSlot, QSize 11 | from PyQt6.QtGui import QPixmap, QIcon, QTextCursor, QTextCharFormat, QColor 12 | 13 | from xbrain.utils.config import Config 14 | from auto_control.agent.vision_agent import VisionAgent 15 | from util.download_weights import OMNI_PARSER_DIR 16 | 17 | from ui.theme import apply_theme 18 | from ui.settings_dialog import SettingsDialog 19 | from ui.agent_worker import AgentWorker 20 | from ui.tray_icon import StatusTrayIcon 21 | from ui.hotkey_edit import DEFAULT_STOP_HOTKEY 22 | 23 | # Intro text for application 24 | INTRO_TEXT = ''' 25 | Based on Omniparser to control desktop! 26 | ''' 27 | 28 | class MainWindow(QMainWindow): 29 | """Main application window""" 30 | 31 | def __init__(self, args): 32 | super().__init__() 33 | self.args = args 34 | 35 | # Initialize state 36 | self.state = self.setup_initial_state() 37 | 38 | # Initialize Agent 39 | self.vision_agent = VisionAgent( 40 | yolo_model_path=os.path.join(OMNI_PARSER_DIR, "icon_detect", "model.pt") 41 | ) 42 | 43 | # Setup UI and tray icon 44 | self.setup_tray_icon() 45 | self.setWindowTitle("autoMate") 46 | self.setMinimumSize(1200, 800) 47 | self.init_ui() 48 | self.apply_theme() 49 | 50 | # Register hotkey handler 51 | self.hotkey_handler = None 52 | self.register_stop_hotkey() 53 | 54 | print("\n\n🚀 PyQt6 application launched") 55 | 56 | def setup_tray_icon(self): 57 | """Setup system tray icon""" 58 | try: 59 | script_dir = Path(__file__).parent 60 | image_path = script_dir.parent / "imgs" / "logo.png" 61 | pixmap = QPixmap(str(image_path)) 62 | icon_pixmap = pixmap.scaled(32, 32, Qt.AspectRatioMode.KeepAspectRatio, Qt.TransformationMode.SmoothTransformation) 63 | app_icon = QIcon(icon_pixmap) 64 | self.setWindowIcon(app_icon) 65 | 66 | self.tray_icon = StatusTrayIcon(app_icon, self) 67 | self.tray_icon.show() 68 | except Exception as e: 69 | print(f"Error setting up tray icon: {e}") 70 | self.tray_icon = None 71 | 72 | def setup_initial_state(self): 73 | """Set up initial state""" 74 | config = Config() 75 | return { 76 | "api_key": config.OPENAI_API_KEY or "", 77 | "base_url": config.OPENAI_BASE_URL or "https://api.openai.com/v1", 78 | "model": config.OPENAI_MODEL or "gpt-4o", 79 | "theme": "Light", 80 | "stop_hotkey": DEFAULT_STOP_HOTKEY, 81 | "messages": [], 82 | "chatbox_messages": [], 83 | "auth_validated": False, 84 | "responses": {}, 85 | "tools": {}, 86 | "tasks": [], 87 | "only_n_most_recent_images": 2, 88 | "stop": False 89 | } 90 | 91 | def register_stop_hotkey(self): 92 | """Register the global stop hotkey""" 93 | # Clean up existing hotkeys 94 | if self.hotkey_handler: 95 | try: 96 | 
keyboard.unhook(self.hotkey_handler) 97 | self.hotkey_handler = None 98 | except: 99 | pass 100 | 101 | try: 102 | keyboard.unhook_all_hotkeys() 103 | except: 104 | pass 105 | 106 | # Get the current hotkey from state 107 | hotkey = self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY) 108 | if not hotkey: 109 | return 110 | 111 | try: 112 | self.hotkey_handler = keyboard.add_hotkey(hotkey, self.stop_process, suppress=False) 113 | print(f"Registered stop hotkey: {hotkey}") 114 | except Exception as e: 115 | print(f"Error registering hotkey '{hotkey}': {e}") 116 | try: 117 | keyboard.unhook_all() 118 | self.hotkey_handler = keyboard.add_hotkey(hotkey, self.stop_process, suppress=False) 119 | print(f"Registered stop hotkey (alternate method): {hotkey}") 120 | except Exception as e2: 121 | print(f"All attempts to register hotkey '{hotkey}' failed: {e2}") 122 | 123 | def apply_theme(self): 124 | """Apply the current theme to the application""" 125 | apply_theme(self, self.state.get("theme", "Light")) 126 | 127 | def init_ui(self): 128 | """Initialize UI components""" 129 | central_widget = QWidget() 130 | main_layout = QVBoxLayout(central_widget) 131 | 132 | # Load top image 133 | header_layout = QVBoxLayout() 134 | try: 135 | script_dir = Path(__file__).parent 136 | image_path = script_dir.parent.parent / "imgs" / "header_bar_thin.png" 137 | if image_path.exists(): 138 | pixmap = QPixmap(str(image_path)) 139 | header_label = QLabel() 140 | header_label.setPixmap(pixmap.scaledToWidth(self.width())) 141 | header_layout.addWidget(header_label) 142 | except Exception as e: 143 | print(f"Failed to load header image: {e}") 144 | 145 | title_label = QLabel("autoMate") 146 | title_label.setAlignment(Qt.AlignmentFlag.AlignCenter) 147 | font = title_label.font() 148 | font.setPointSize(20) 149 | title_label.setFont(font) 150 | header_layout.addWidget(title_label) 151 | 152 | # Introduction text 153 | intro_label = QLabel(INTRO_TEXT) 154 | intro_label.setWordWrap(True) 155 | font = intro_label.font() 156 | font.setPointSize(12) 157 | intro_label.setFont(font) 158 | 159 | # Settings button and clear chat button (at top) 160 | top_buttons_layout = QHBoxLayout() 161 | self.settings_button = QPushButton("Settings") 162 | self.settings_button.clicked.connect(self.open_settings_dialog) 163 | self.clear_button = QPushButton("Clear Chat") 164 | self.clear_button.clicked.connect(self.clear_chat) 165 | top_buttons_layout.addWidget(self.settings_button) 166 | top_buttons_layout.addWidget(self.clear_button) 167 | top_buttons_layout.addStretch() # Add elastic space to left-align buttons 168 | 169 | # Input area 170 | input_layout = QHBoxLayout() 171 | self.chat_input = QLineEdit() 172 | self.chat_input.setPlaceholderText("Type a message to send to Omniparser + X ...") 173 | # Send message on Enter key 174 | self.chat_input.returnPressed.connect(self.process_input) 175 | self.submit_button = QPushButton("Send") 176 | self.submit_button.clicked.connect(self.process_input) 177 | self.stop_button = QPushButton("Stop") 178 | self.stop_button.clicked.connect(self.stop_process) 179 | 180 | input_layout.addWidget(self.chat_input, 8) 181 | input_layout.addWidget(self.submit_button, 1) 182 | input_layout.addWidget(self.stop_button, 1) 183 | 184 | # Main content area 185 | content_splitter = QSplitter(Qt.Orientation.Horizontal) 186 | 187 | # Task list 188 | task_widget = QWidget() 189 | task_layout = QVBoxLayout(task_widget) 190 | task_label = QLabel("Task List") 191 | self.task_table = QTableWidget(0, 2) 192 | 
self.task_table.setHorizontalHeaderLabels(["Status", "Task"]) 193 | self.task_table.horizontalHeader().setSectionResizeMode(1, QHeaderView.ResizeMode.Stretch) 194 | task_layout.addWidget(task_label) 195 | task_layout.addWidget(self.task_table) 196 | 197 | # Chat area 198 | chat_widget = QWidget() 199 | chat_layout = QVBoxLayout(chat_widget) 200 | chat_label = QLabel("Chat History") 201 | self.chat_display = QTextEdit() 202 | self.chat_display.setReadOnly(True) 203 | chat_layout.addWidget(chat_label) 204 | chat_layout.addWidget(self.chat_display) 205 | 206 | # Add to splitter 207 | content_splitter.addWidget(task_widget) 208 | content_splitter.addWidget(chat_widget) 209 | content_splitter.setSizes([int(self.width() * 0.2), int(self.width() * 0.8)]) 210 | 211 | # Add all components to main layout 212 | main_layout.addLayout(header_layout) 213 | main_layout.addWidget(intro_label) 214 | main_layout.addLayout(top_buttons_layout) # Add top button area 215 | main_layout.addLayout(input_layout) 216 | main_layout.addWidget(content_splitter, 1) # 1 is the stretch factor 217 | 218 | self.setCentralWidget(central_widget) 219 | 220 | def open_settings_dialog(self): 221 | """Open settings dialog""" 222 | dialog = SettingsDialog(self, self.state) 223 | result = dialog.exec() 224 | 225 | if result == QDialog.DialogCode.Accepted: 226 | # Get and apply new settings 227 | settings = dialog.get_settings() 228 | 229 | # Check if stop hotkey changed 230 | old_hotkey = self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY) 231 | new_hotkey = settings["stop_hotkey"] 232 | 233 | self.state["model"] = settings["model"] 234 | self.state["base_url"] = settings["base_url"] 235 | self.state["api_key"] = settings["api_key"] 236 | self.state["stop_hotkey"] = new_hotkey 237 | 238 | # Update theme if changed 239 | if settings["theme"] != self.state.get("theme", "Light"): 240 | self.state["theme"] = settings["theme"] 241 | self.apply_theme() 242 | 243 | if settings["screen_region"]: 244 | self.state["screen_region"] = settings["screen_region"] 245 | 246 | # Update hotkey if changed 247 | if old_hotkey != new_hotkey: 248 | self.register_stop_hotkey() 249 | 250 | def process_input(self): 251 | """Process user input""" 252 | user_input = self.chat_input.text() 253 | if not user_input.strip(): 254 | return 255 | 256 | # Clear input box 257 | self.chat_input.clear() 258 | 259 | # Minimize main window 260 | self.showMinimized() 261 | 262 | # Create and start worker thread 263 | self.worker = AgentWorker(user_input, self.state, self.vision_agent) 264 | self.worker.update_signal.connect(self.update_ui) 265 | self.worker.error_signal.connect(self.handle_error) 266 | 267 | # Connect signals to tray icon if available 268 | if hasattr(self, 'tray_icon') and self.tray_icon is not None: 269 | self.worker.status_signal.connect(self.tray_icon.update_status) 270 | self.worker.task_signal.connect(self.tray_icon.update_task) 271 | 272 | self.worker.start() 273 | 274 | def handle_error(self, error_message): 275 | """Handle error messages""" 276 | # Restore main window to show the error 277 | self.showNormal() 278 | self.activateWindow() 279 | 280 | # Show error message 281 | QMessageBox.warning(self, "Connection Error", 282 | f"Error connecting to AI service:\n{error_message}\n\nPlease check your network connection and API settings.") 283 | 284 | @pyqtSlot(list, list) 285 | def update_ui(self, chatbox_messages, tasks): 286 | """Update UI display""" 287 | # Update chat display 288 | self.chat_display.clear() 289 | 290 | for msg in chatbox_messages: 
291 | role = msg["role"] 292 | content = msg["content"] 293 | 294 | # Set different formats based on role 295 | format = QTextCharFormat() 296 | if role == "user": 297 | format.setForeground(QColor(0, 0, 255)) # Blue for user 298 | self.chat_display.append("You:") 299 | else: 300 | format.setForeground(QColor(0, 128, 0)) # Green for AI 301 | self.chat_display.append("AI:") 302 | 303 | # Add content 304 | cursor = self.chat_display.textCursor() 305 | cursor.movePosition(QTextCursor.MoveOperation.End) 306 | 307 | # Special handling for HTML content 308 | if "<" in content and ">" in content: 309 | self.chat_display.insertHtml(content) 310 | self.chat_display.append("") # Add empty line 311 | else: 312 | self.chat_display.append(content) 313 | self.chat_display.append("") # Add empty line 314 | 315 | # Scroll to bottom 316 | self.chat_display.verticalScrollBar().setValue( 317 | self.chat_display.verticalScrollBar().maximum() 318 | ) 319 | 320 | # Update task table 321 | self.task_table.setRowCount(len(tasks)) 322 | for i, (status, task) in enumerate(tasks): 323 | self.task_table.setItem(i, 0, QTableWidgetItem(status)) 324 | self.task_table.setItem(i, 1, QTableWidgetItem(task)) 325 | 326 | def stop_process(self): 327 | """Stop processing - handles both button click and hotkey press""" 328 | self.state["stop"] = True 329 | if hasattr(self, 'worker') and self.worker is not None: 330 | self.worker.terminate() 331 | if self.isMinimized(): 332 | self.showNormal() 333 | self.activateWindow() 334 | 335 | self.chat_display.append("⚠️ Operation stopped by user") 336 | self.register_stop_hotkey() 337 | 338 | def clear_chat(self): 339 | """Clear chat history""" 340 | self.state["messages"] = [] 341 | self.state["chatbox_messages"] = [] 342 | self.state["responses"] = {} 343 | self.state["tools"] = {} 344 | self.state["tasks"] = [] 345 | 346 | self.chat_display.clear() 347 | self.task_table.setRowCount(0) 348 | 349 | def closeEvent(self, event): 350 | """Handle window close event""" 351 | if hasattr(self, 'tray_icon') and self.tray_icon is not None and self.tray_icon.isVisible(): 352 | self.hide() 353 | event.ignore() 354 | elif self.state.get("stop", False) and hasattr(self, 'worker') and self.worker is not None: 355 | self.state["stop"] = False 356 | event.ignore() 357 | elif hasattr(self, 'worker') and self.worker is not None and self.worker.isRunning(): 358 | reply = QMessageBox.question(self, 'Exit Confirmation', 359 | '自动化任务仍在运行中,确定要退出程序吗?', 360 | QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No, 361 | QMessageBox.StandardButton.No) 362 | if reply == QMessageBox.StandardButton.Yes: 363 | keyboard.unhook_all() 364 | event.accept() 365 | else: 366 | event.ignore() 367 | else: 368 | keyboard.unhook_all() 369 | event.accept() -------------------------------------------------------------------------------- /ui/settings_dialog.py: -------------------------------------------------------------------------------- 1 | """ 2 | Settings dialog for application configuration 3 | """ 4 | from PyQt6.QtWidgets import (QDialog, QVBoxLayout, QHBoxLayout, 5 | QLabel, QLineEdit, QPushButton, QComboBox) 6 | from PyQt6.QtCore import QTimer 7 | from ui.hotkey_edit import HotkeyEdit, DEFAULT_STOP_HOTKEY 8 | from ui.theme import THEMES 9 | 10 | class SettingsDialog(QDialog): 11 | """Dialog for application settings""" 12 | 13 | def __init__(self, parent=None, state=None): 14 | super().__init__(parent) 15 | self.state = state 16 | self.parent_window = parent 17 | self.setWindowTitle("Settings") 18 | 
self.setMinimumWidth(500) 19 | self.init_ui() 20 | 21 | def init_ui(self): 22 | layout = QVBoxLayout(self) 23 | 24 | # Model settings 25 | model_layout = QHBoxLayout() 26 | model_label = QLabel("Model:") 27 | self.model_input = QLineEdit(self.state["model"]) 28 | model_layout.addWidget(model_label) 29 | model_layout.addWidget(self.model_input) 30 | 31 | # Base URL settings 32 | url_layout = QHBoxLayout() 33 | url_label = QLabel("Base URL:") 34 | self.base_url_input = QLineEdit(self.state["base_url"]) 35 | url_layout.addWidget(url_label) 36 | url_layout.addWidget(self.base_url_input) 37 | 38 | # API key settings 39 | api_layout = QHBoxLayout() 40 | api_label = QLabel("API Key:") 41 | self.api_key_input = QLineEdit(self.state["api_key"]) 42 | self.api_key_input.setEchoMode(QLineEdit.EchoMode.Password) 43 | api_layout.addWidget(api_label) 44 | api_layout.addWidget(self.api_key_input) 45 | 46 | # Theme selection 47 | theme_layout = QHBoxLayout() 48 | theme_label = QLabel("Theme:") 49 | self.theme_combo = QComboBox() 50 | self.theme_combo.addItems(list(THEMES.keys())) 51 | current_theme = self.state.get("theme", "Light") 52 | self.theme_combo.setCurrentText(current_theme) 53 | theme_layout.addWidget(theme_label) 54 | theme_layout.addWidget(self.theme_combo) 55 | 56 | # Stop hotkey setting 57 | hotkey_layout = QHBoxLayout() 58 | hotkey_label = QLabel("Stop Hotkey:") 59 | self.hotkey_edit = HotkeyEdit(self.state.get("stop_hotkey", DEFAULT_STOP_HOTKEY)) 60 | hotkey_layout.addWidget(hotkey_label) 61 | hotkey_layout.addWidget(self.hotkey_edit) 62 | 63 | # Screen region selection 64 | region_layout = QHBoxLayout() 65 | self.select_region_btn = QPushButton("Select Screen Region") 66 | self.region_info = QLabel("No region selected" if "screen_region" not in self.state else f"Selected region: {self.state['screen_region']}") 67 | self.select_region_btn.clicked.connect(self.select_screen_region) 68 | region_layout.addWidget(self.select_region_btn) 69 | region_layout.addWidget(self.region_info) 70 | 71 | # OK and Cancel buttons 72 | button_layout = QHBoxLayout() 73 | self.ok_button = QPushButton("OK") 74 | self.cancel_button = QPushButton("Cancel") 75 | self.ok_button.clicked.connect(self.accept) 76 | self.cancel_button.clicked.connect(self.reject) 77 | button_layout.addWidget(self.ok_button) 78 | button_layout.addWidget(self.cancel_button) 79 | 80 | # Add all elements to main layout 81 | layout.addLayout(model_layout) 82 | layout.addLayout(url_layout) 83 | layout.addLayout(api_layout) 84 | layout.addLayout(theme_layout) 85 | layout.addLayout(hotkey_layout) 86 | layout.addLayout(region_layout) 87 | layout.addLayout(button_layout) 88 | 89 | def select_screen_region(self): 90 | """Select screen region""" 91 | # Minimize the parent window before selecting region 92 | if self.parent_window: 93 | self.parent_window.showMinimized() 94 | # Wait a moment for the window to minimize 95 | QTimer.singleShot(500, self._do_select_region) 96 | else: 97 | self._do_select_region() 98 | 99 | def _do_select_region(self): 100 | """Actual region selection after minimizing""" 101 | from util.screen_selector import ScreenSelector 102 | region = ScreenSelector().get_selection() 103 | 104 | # Restore the dialog and parent window 105 | self.activateWindow() 106 | if self.parent_window: 107 | self.parent_window.showNormal() 108 | self.parent_window.activateWindow() 109 | 110 | if region: 111 | self.state["screen_region"] = region 112 | self.region_info.setText(f"Selected region: {region}") 113 | else: 114 | 
self.region_info.setText("Selection cancelled") 115 | 116 | def get_settings(self): 117 | """Get settings content""" 118 | return { 119 | "model": self.model_input.text(), 120 | "base_url": self.base_url_input.text(), 121 | "api_key": self.api_key_input.text(), 122 | "screen_region": self.state.get("screen_region", None), 123 | "theme": self.theme_combo.currentText(), 124 | "stop_hotkey": self.hotkey_edit.get_hotkey() 125 | } -------------------------------------------------------------------------------- /ui/theme.py: -------------------------------------------------------------------------------- 1 | """ 2 | Theme definitions and theme handling functionality 3 | """ 4 | 5 | # Theme definitions 6 | THEMES = { 7 | "Light": { 8 | "main_bg": "#F5F5F5", 9 | "widget_bg": "#FFFFFF", 10 | "text": "#333333", 11 | "accent": "#4A86E8", 12 | "button_bg": "#E3E3E3", 13 | "button_text": "#333333", 14 | "border": "#CCCCCC", 15 | "selection_bg": "#D0E2F4" 16 | }, 17 | "Dark": { 18 | "main_bg": "#2D2D2D", 19 | "widget_bg": "#3D3D3D", 20 | "text": "#FFFFFF", 21 | "accent": "#4A86E8", 22 | "button_bg": "#555555", 23 | "button_text": "#FFFFFF", 24 | "border": "#555555", 25 | "selection_bg": "#3A5F8A" 26 | } 27 | } 28 | 29 | def apply_theme(widget, theme_name="Light"): 30 | """Apply the specified theme to the widget""" 31 | theme = THEMES[theme_name] 32 | 33 | # Create stylesheet for the application 34 | stylesheet = f""" 35 | QMainWindow, QDialog {{ 36 | background-color: {theme['main_bg']}; 37 | color: {theme['text']}; 38 | }} 39 | 40 | QWidget {{ 41 | background-color: {theme['main_bg']}; 42 | color: {theme['text']}; 43 | }} 44 | 45 | QLabel {{ 46 | color: {theme['text']}; 47 | }} 48 | 49 | QPushButton {{ 50 | background-color: {theme['button_bg']}; 51 | color: {theme['button_text']}; 52 | border: 1px solid {theme['border']}; 53 | border-radius: 4px; 54 | padding: 5px 10px; 55 | }} 56 | 57 | QPushButton:hover {{ 58 | background-color: {theme['accent']}; 59 | color: white; 60 | }} 61 | 62 | QLineEdit, QTextEdit, QTableWidget, QComboBox {{ 63 | background-color: {theme['widget_bg']}; 64 | color: {theme['text']}; 65 | border: 1px solid {theme['border']}; 66 | border-radius: 4px; 67 | padding: 4px; 68 | }} 69 | 70 | QTextEdit {{ 71 | background-color: {theme['widget_bg']}; 72 | }} 73 | 74 | QTableWidget::item:selected {{ 75 | background-color: {theme['selection_bg']}; 76 | }} 77 | 78 | QHeaderView::section {{ 79 | background-color: {theme['button_bg']}; 80 | color: {theme['button_text']}; 81 | padding: 4px; 82 | border: 1px solid {theme['border']}; 83 | }} 84 | 85 | QSplitter::handle {{ 86 | background-color: {theme['border']}; 87 | }} 88 | 89 | QScrollBar {{ 90 | background-color: {theme['widget_bg']}; 91 | }} 92 | 93 | QScrollBar::handle {{ 94 | background-color: {theme['button_bg']}; 95 | border-radius: 4px; 96 | }} 97 | """ 98 | 99 | widget.setStyleSheet(stylesheet) -------------------------------------------------------------------------------- /ui/tray_icon.py: -------------------------------------------------------------------------------- 1 | """ 2 | System tray icon implementation 3 | """ 4 | from PyQt6.QtWidgets import QSystemTrayIcon, QMenu, QApplication 5 | from PyQt6.QtGui import QAction 6 | 7 | class StatusTrayIcon(QSystemTrayIcon): 8 | """System tray icon that displays application status""" 9 | 10 | def __init__(self, icon, parent=None): 11 | super().__init__(icon, parent) 12 | self.parent = parent 13 | self.setToolTip("autoMate") 14 | 15 | # Create context menu 16 | self.menu = QMenu() 17 | 
self.show_action = QAction("Show Main Window") 18 | self.show_action.triggered.connect(self.show_main_window) 19 | self.menu_status = QAction("Status: Idle") 20 | self.menu_status.setEnabled(False) 21 | self.menu_task = QAction("Task: None") 22 | self.menu_task.setEnabled(False) 23 | self.exit_action = QAction("Exit") 24 | self.exit_action.triggered.connect(QApplication.quit) 25 | 26 | self.menu.addAction(self.show_action) 27 | self.menu.addSeparator() 28 | self.menu.addAction(self.menu_status) 29 | self.menu.addAction(self.menu_task) 30 | self.menu.addSeparator() 31 | self.menu.addAction(self.exit_action) 32 | 33 | self.setContextMenu(self.menu) 34 | 35 | # Connect signals 36 | self.activated.connect(self.icon_activated) 37 | 38 | def show_main_window(self): 39 | if self.parent: 40 | self.parent.showNormal() 41 | self.parent.activateWindow() 42 | 43 | def icon_activated(self, reason): 44 | if reason == QSystemTrayIcon.ActivationReason.DoubleClick: 45 | self.show_main_window() 46 | 47 | def update_status(self, status_text): 48 | """Update status text in tray tooltip and menu""" 49 | # Truncate if too long for menu 50 | short_status = status_text[:50] + "..." if len(status_text) > 50 else status_text 51 | self.menu_status.setText(f"Status: {short_status}") 52 | 53 | # Show brief notification but don't disrupt automation 54 | # Only show notification for 500ms (very brief) to not interfere with visual automation 55 | self.showMessage("autoMate Status", status_text, QSystemTrayIcon.MessageIcon.Information, 500) 56 | 57 | def update_task(self, task_text): 58 | """Update task text in tray menu""" 59 | short_task = task_text[:50] + "..." if len(task_text) > 50 else task_text 60 | self.menu_task.setText(f"Task: {short_task}") -------------------------------------------------------------------------------- /util/auto_control.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | # Add the project root directory to Python path 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 7 | from auto_control.agent.vision_agent import VisionAgent 8 | from util.download_weights import MODEL_DIR 9 | from pynput import mouse, keyboard 10 | 11 | # Now you can import from auto_control 12 | from auto_control.tools.screen_capture import get_screenshot 13 | 14 | class AutoControl: 15 | def __init__(self): 16 | self.auto_list = [] 17 | 18 | def start_listen(self): 19 | # Create both mouse and keyboard listeners 20 | mouse_listener = mouse.Listener( 21 | on_move=self.on_move, 22 | on_click=self.on_click, 23 | on_scroll=self.on_scroll) 24 | 25 | keyboard_listener = keyboard.Listener( 26 | on_press=self.on_press, 27 | on_release=self.on_release) 28 | 29 | # Start both listeners 30 | mouse_listener.start() 31 | keyboard_listener.start() 32 | 33 | # Keep the program running until keyboard listener stops 34 | keyboard_listener.join() 35 | 36 | # After keyboard stops (ESC pressed), stop mouse listener too 37 | mouse_listener.stop() 38 | 39 | def on_move(self, x, y, injected): 40 | print('Pointer moved to {}; it was {}'.format( 41 | (x, y), 'faked' if injected else 'not faked')) 42 | 43 | def on_click(self, x, y, button, pressed, injected): 44 | print('Mouse {} {} at {}; it was {}'.format( 45 | button, 46 | 'Pressed' if pressed else 'Released', 47 | (x, y), 48 | 'faked' if injected else 'not faked')) 49 | if not pressed: 50 | # wait right click window 51 | if button == mouse.Button.right: 52 | time.sleep(1) 53 | 
screenshot, path = get_screenshot(is_cursor=False) 54 | self.auto_list.append( 55 | {"button": button, 56 | "pressed": pressed, 57 | "position": (x, y), 58 | "path": path, 59 | "image": screenshot 60 | } 61 | ) 62 | 63 | 64 | def on_scroll(self, x, y, dx, dy, injected): 65 | print('Scrolled {} at {}; it was {}'.format( 66 | 'down' if dy < 0 else 'up', 67 | (x, y), 'faked' if injected else 'not faked')) 68 | 69 | def on_press(self, key, injected): 70 | try: 71 | print('alphanumeric key {} pressed; it was {}'.format( 72 | key.char, 'faked' if injected else 'not faked')) 73 | except AttributeError: 74 | print('special key {} pressed'.format( 75 | key)) 76 | 77 | def on_release(self, key, injected): 78 | print('{} released; it was {}'.format( 79 | key, 'faked' if injected else 'not faked')) 80 | 81 | if key == keyboard.Key.esc: 82 | 83 | print("self.auto_list", self.auto_list) 84 | vision_agent = VisionAgent(yolo_model_path=os.path.join(MODEL_DIR, "icon_detect", "model.pt")) 85 | 86 | for item in self.auto_list: 87 | element_list =vision_agent(str(item["path"])) 88 | for element in element_list: 89 | if self.crop_image_if_position_in_coordinates(item["image"], item["path"], item["position"], element.coordinates): 90 | break 91 | # Stop listener 92 | return False 93 | 94 | 95 | 96 | def crop_image_if_position_in_coordinates(self, image, image_path, position, coordinates): 97 | """ 98 | Check if position is within coordinates and crop image if true 99 | 100 | Args: 101 | image: PIL Image object 102 | position: tuple of (x, y) - current position 103 | coordinates: tuple of (x1, y1, x2, y2) - target area 104 | 105 | Returns: 106 | bool: True if position is in coordinates 107 | """ 108 | x, y = position 109 | x1, y1, x2, y2 = coordinates 110 | 111 | # Check if position is within coordinates 112 | if (x1 <= x <= x2) and (y1 <= y <= y2): 113 | # Crop the image to the coordinates 114 | cropped_image = image.crop(coordinates) 115 | # Save the cropped image with proper path and format 116 | save_path = str(image_path).replace('.png', '_cropped.png') 117 | cropped_image.save(save_path, 'PNG') 118 | return True 119 | 120 | return False 121 | 122 | if __name__ == "__main__": 123 | auto_control = AutoControl() 124 | auto_control.start_listen() 125 | 126 | -------------------------------------------------------------------------------- /util/auto_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import pyautogui 4 | from enum import Enum 5 | 6 | import pyperclip 7 | class AppName(Enum): 8 | WECHAT = "wechat" 9 | 10 | 11 | class AutoUtil: 12 | def __init__(self, app_name: AppName): 13 | self.img_dir = os.path.join(os.path.dirname(__file__),"..", "imgs", app_name.value) 14 | 15 | def click_multi_img(self, img_names, offset_x=0, offset_y=0, minSearchTime=0): 16 | for img_name in img_names: 17 | self.find_click_img(img_name, offset_x, offset_y, minSearchTime) 18 | 19 | def find_click_img(self, img_name, offset_x=0, offset_y=0, minSearchTime=0): 20 | img_path = os.path.join(self.img_dir, img_name + ".png") 21 | img = pyautogui.locateOnScreen(img_path, minSearchTime=minSearchTime) 22 | x,y = pyautogui.center(img) 23 | # Add offset to click position 24 | pyautogui.click(x + offset_x, y + offset_y) 25 | 26 | def send_text(self, text): 27 | clipboard_data = pyperclip.paste() 28 | pyperclip.copy(text) 29 | if platform.system() == 'Darwin': 30 | pyautogui.hotkey('command', 'v', interval=0.1) 31 | else: 32 | pyautogui.hotkey('ctrl', 'v') 33 | 
# Copy old data back to clipboard 34 | pyperclip.copy(clipboard_data) 35 | -------------------------------------------------------------------------------- /util/download_weights.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | __WEIGHTS_DIR = Path("weights") 5 | OMNI_PARSER_DIR = os.path.join(__WEIGHTS_DIR, "AI-ModelScope", "OmniParser-v2___0") 6 | def download(): 7 | from modelscope import snapshot_download 8 | # Create weights directory 9 | __WEIGHTS_DIR.mkdir(exist_ok=True) 10 | snapshot_download( 11 | 'AI-ModelScope/OmniParser-v2.0', 12 | cache_dir='weights', 13 | allow_file_pattern=['icon_detect/model.pt'] 14 | ) 15 | 16 | if __name__ == "__main__": 17 | download() -------------------------------------------------------------------------------- /util/screen_selector.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import Button 3 | import sys 4 | 5 | class ScreenSelector: 6 | def __init__(self): 7 | self.root = tk.Tk() 8 | self.root.withdraw() 9 | 10 | # 创建全屏窗口 11 | self.window = tk.Toplevel(self.root) 12 | self.window.attributes("-fullscreen", True) 13 | self.window.attributes("-alpha", 0.6) 14 | self.window.attributes("-topmost", True) 15 | 16 | # 初始化变量 17 | self.start_x = self.start_y = self.current_x = self.current_y = None 18 | self.selection_rect = self.confirm_button = None 19 | self.result = None 20 | 21 | # 创建画布 22 | self.canvas = tk.Canvas(self.window, bg="gray20", highlightthickness=0) 23 | self.canvas.pack(fill=tk.BOTH, expand=True) 24 | 25 | # 绑定事件 26 | self.canvas.bind("", self.on_press) 27 | self.canvas.bind("", self.on_drag) 28 | self.canvas.bind("", self.on_release) 29 | self.window.bind("", self.cancel) 30 | 31 | def on_press(self, event): 32 | # 清除已有选择 33 | if self.selection_rect: 34 | self.canvas.delete(self.selection_rect) 35 | if self.confirm_button: 36 | self.confirm_button.destroy() 37 | self.confirm_button = None 38 | 39 | self.start_x = self.canvas.canvasx(event.x) 40 | self.start_y = self.canvas.canvasy(event.y) 41 | self.selection_rect = self.canvas.create_rectangle( 42 | self.start_x, self.start_y, self.start_x, self.start_y, 43 | outline="red", width=5 44 | ) 45 | 46 | def on_drag(self, event): 47 | self.current_x = self.canvas.canvasx(event.x) 48 | self.current_y = self.canvas.canvasy(event.y) 49 | 50 | # 更新选择框 51 | self.canvas.coords(self.selection_rect, 52 | self.start_x, self.start_y, 53 | self.current_x, self.current_y) 54 | 55 | # 更新透明区域 56 | self.update_region() 57 | 58 | def update_region(self): 59 | self.canvas.delete("transparent_region") 60 | 61 | # 计算坐标 62 | x1 = min(self.start_x, self.current_x) 63 | y1 = min(self.start_y, self.current_y) 64 | x2 = max(self.start_x, self.current_x) 65 | y2 = max(self.start_y, self.current_y) 66 | 67 | # 绘制背景和透明区域 68 | self.canvas.create_rectangle( 69 | 0, 0, self.window.winfo_width(), self.window.winfo_height(), 70 | fill="gray20", stipple="gray50", tags="transparent_region" 71 | ) 72 | self.canvas.create_rectangle( 73 | x1, y1, x2, y2, fill="", outline="", tags="transparent_region" 74 | ) 75 | 76 | # 确保选择框在最上层 77 | self.canvas.tag_raise(self.selection_rect) 78 | 79 | def on_release(self, event): 80 | self.current_x = self.canvas.canvasx(event.x) 81 | self.current_y = self.canvas.canvasy(event.y) 82 | 83 | # 有效选择判断 84 | if abs(self.current_x - self.start_x) > 5 and abs(self.current_y - self.start_y) > 5: 85 | self.show_button() 86 | 87 | def 
show_button(self): 88 | if self.confirm_button: 89 | self.confirm_button.destroy() 90 | 91 | # 计算坐标 92 | x1 = min(self.start_x, self.current_x) 93 | y1 = min(self.start_y, self.current_y) 94 | x2 = max(self.start_x, self.current_x) 95 | y2 = max(self.start_y, self.current_y) 96 | 97 | # 计算距离四个角的距离 98 | distances = [ 99 | ((self.current_x - x1)**2 + (self.current_y - y1)**2, (x1 - 90, y1 - 40)), # 左上 100 | ((self.current_x - x2)**2 + (self.current_y - y1)**2, (x2 + 10, y1 - 40)), # 右上 101 | ((self.current_x - x1)**2 + (self.current_y - y2)**2, (x1 - 90, y2 + 10)), # 左下 102 | ((self.current_x - x2)**2 + (self.current_y - y2)**2, (x2 + 10, y2 + 10)) # 右下 103 | ] 104 | 105 | # 选择最近的角 106 | btn_x, btn_y = min(distances, key=lambda d: d[0])[1] 107 | 108 | # 边界检查 109 | width, height = self.window.winfo_width(), self.window.winfo_height() 110 | if btn_x + 80 > width: btn_x = x1 - 90 111 | if btn_x < 0: btn_x = x2 + 10 112 | if btn_y < 0: btn_y = y2 + 10 113 | if btn_y + 30 > height: btn_y = y1 - 40 114 | 115 | # 创建按钮 116 | self.confirm_button = Button( 117 | self.window, text="Confirm", command=self.confirm, 118 | bg="white", fg="black", font=("Arial", 12, "bold"), 119 | padx=10, pady=5 120 | ) 121 | self.confirm_button.place(x=btn_x, y=btn_y) 122 | 123 | def confirm(self): 124 | # 获取选择区域坐标 125 | x1 = min(self.start_x, self.current_x) 126 | y1 = min(self.start_y, self.current_y) 127 | x2 = max(self.start_x, self.current_x) 128 | y2 = max(self.start_y, self.current_y) 129 | 130 | self.result = (int(x1), int(y1), int(x2), int(y2)) 131 | self.root.quit() 132 | self.window.destroy() 133 | 134 | def cancel(self, event=None): 135 | self.result = None 136 | self.root.quit() 137 | self.window.destroy() 138 | 139 | def get_selection(self): 140 | self.root.mainloop() 141 | if hasattr(self, 'root') and self.root: 142 | self.root.destroy() 143 | return self.result 144 | 145 | 146 | if __name__ == "__main__": 147 | region = ScreenSelector().get_selection() 148 | print(f"Selected region: {region}") 149 | sys.exit(0) 150 | -------------------------------------------------------------------------------- /util/tool.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pyautogui 3 | from PIL import Image 4 | from io import BytesIO 5 | 6 | 7 | def capture_screen_with_cursor(): 8 | """Local function to capture the screen with cursor.""" 9 | cursor_path = os.path.join(os.path.dirname(__file__),"..","imgs", "cursor.png") 10 | screenshot = pyautogui.screenshot() 11 | cursor_x, cursor_y = pyautogui.position() 12 | cursor = Image.open(cursor_path) 13 | cursor = cursor.resize((int(cursor.width / 1.5), int(cursor.height / 1.5))) 14 | screenshot.paste(cursor, (cursor_x, cursor_y), cursor) 15 | img_io = BytesIO() 16 | screenshot.save(img_io, 'PNG') 17 | img_io.seek(0) 18 | return img_io 19 | 20 | 21 | -------------------------------------------------------------------------------- /util/wechat_auto.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import pyautogui 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 7 | 8 | from util.auto_util import AppName, AutoUtil 9 | class WechatAuto: 10 | def __init__(self): 11 | self.auto_util = AutoUtil(AppName.WECHAT) 12 | 13 | def go_to_chat(self): 14 | self.auto_util.find_click_img("chat_unselect.png") 15 | 16 | def search_friend(self, friend_name): 17 | try: 18 | self.auto_util.find_click_img("chat_unselect") 19 
| except pyautogui.ImageNotFoundException: 20 | self.auto_util.find_click_img("chat_select") 21 | self.auto_util.find_click_img("search", offset_x=100) 22 | self.auto_util.send_text(friend_name) 23 | self.auto_util.find_click_img("contact_person",offset_x=100,offset_y=100,minSearchTime=10) 24 | self.auto_util.find_click_img("search",offset_x=-100,offset_y=-100,minSearchTime=10) 25 | 26 | if __name__ == "__main__": 27 | time.sleep(3) 28 | wechat_auto = WechatAuto() 29 | wechat_auto.search_friend("李杨林") 30 | 31 | --------------------------------------------------------------------------------
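
A hedged sketch (not part of the repository) showing how the helpers above could be reused for another WeChat flow: AutoUtil.find_click_img locates a template image from imgs/wechat/ on screen and clicks it, and AutoUtil.send_text pastes text through the clipboard. The send_message function and the final Enter key press are assumptions for illustration; only find_click_img, send_text and the template names already used in wechat_auto.py are taken from the code above:

    import time

    import pyautogui

    from util.auto_util import AppName, AutoUtil

    def send_message(friend_name, message):
        util = AutoUtil(AppName.WECHAT)
        # Reuse the same templates wechat_auto.py relies on to reach the chat window.
        util.find_click_img("search", offset_x=100)
        util.send_text(friend_name)
        util.find_click_img("contact_person", offset_x=100, offset_y=100, minSearchTime=10)
        # Paste the message into the chat input; sending with Enter is an assumption about the WeChat UI.
        util.send_text(message)
        pyautogui.press("enter")

    if __name__ == "__main__":
        time.sleep(3)  # give the user a moment to bring WeChat to the foreground, as wechat_auto.py does
        send_message("example_friend", "hello from autoMate")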