├── resources ├── setting.png ├── wechat.jpeg ├── screenshot-20251209-181423.png ├── screenshot-20251210-120416.png ├── screenshot-20251210-120630.png ├── WECHAT.md ├── logo.svg ├── privacy_policy.txt └── privacy_policy_en.txt ├── phone_agent ├── model │ ├── __init__.py │ └── client.py ├── actions │ ├── __init__.py │ └── handler.py ├── __init__.py ├── adb │ ├── __init__.py │ ├── input.py │ ├── screenshot.py │ ├── device.py │ └── connection.py ├── hdc │ ├── __init__.py │ ├── screenshot.py │ ├── input.py │ ├── device.py │ └── connection.py ├── config │ ├── __init__.py │ ├── i18n.py │ ├── prompts_en.py │ ├── prompts.py │ ├── prompts_zh.py │ ├── timing.py │ ├── apps.py │ └── apps_harmonyos.py ├── device_factory.py └── agent.py ├── requirements.txt ├── .pre-commit-config.yaml ├── .gitignore ├── .github ├── ISSUE_TEMPLATE │ ├── feature-request.yaml │ └── bug_report.yaml └── PULL_REQUEST_TEMPLATE.md ├── setup.py ├── examples ├── demo_thinking.py └── basic_usage.py ├── scripts ├── check_deployment_cn.py └── check_deployment_en.py ├── README_coding_agent.md └── LICENSE /resources/setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/setting.png -------------------------------------------------------------------------------- /resources/wechat.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/wechat.jpeg -------------------------------------------------------------------------------- /resources/screenshot-20251209-181423.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/screenshot-20251209-181423.png -------------------------------------------------------------------------------- /resources/screenshot-20251210-120416.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/screenshot-20251210-120416.png -------------------------------------------------------------------------------- /resources/screenshot-20251210-120630.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zai-org/Open-AutoGLM/HEAD/resources/screenshot-20251210-120630.png -------------------------------------------------------------------------------- /phone_agent/model/__init__.py: -------------------------------------------------------------------------------- 1 | """Model client module for AI inference.""" 2 | 3 | from phone_agent.model.client import ModelClient, ModelConfig 4 | 5 | __all__ = ["ModelClient", "ModelConfig"] 6 | -------------------------------------------------------------------------------- /phone_agent/actions/__init__.py: -------------------------------------------------------------------------------- 1 | """Action handling module for Phone Agent.""" 2 | 3 | from phone_agent.actions.handler import ActionHandler, ActionResult 4 | 5 | __all__ = ["ActionHandler", "ActionResult"] 6 | -------------------------------------------------------------------------------- /resources/WECHAT.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |

扫码加入「Open-AutoGLM 交流群」

5 |

Scan the QR code to follow the official account and join the "Open-AutoGLM Discussion Group"

6 |
7 | -------------------------------------------------------------------------------- /phone_agent/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Phone Agent - An AI-powered phone automation framework. 3 | 4 | This package provides tools for automating Android phone interactions 5 | using AI models for visual understanding and decision making. 6 | """ 7 | 8 | from phone_agent.agent import PhoneAgent 9 | 10 | __version__ = "0.1.0" 11 | __all__ = ["PhoneAgent"] 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow>=12.0.0 2 | openai>=2.9.0 3 | 4 | # For Model Deployment 5 | 6 | ## After installing sglang or vLLM, please run pip install -U transformers again to upgrade to 5.0.0rc0. 7 | ## Any dependency conflicts related to Transformers can be ignored. 8 | 9 | # sglang>=0.5.6.post1 10 | # vllm>=0.12.0 11 | # transformers>=5.0.0rc0 12 | 13 | # Optional: for development 14 | # pytest>=7.0.0 15 | # pre-commit>=4.5.0 16 | # black>=23.0.0 17 | # mypy>=1.0.0 18 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_install_hook_types: 2 | - pre-commit 3 | - commit-msg 4 | exclude: '^(phone_agent/config/apps\.py|README_en\.md)$' 5 | # One combined pattern: a YAML mapping cannot repeat the `exclude` key (a second one silently replaces the first) 6 | default_stages: 7 | - pre-commit # Run locally 8 | repos: 9 | - repo: https://github.com/astral-sh/ruff-pre-commit 10 | rev: v0.11.7 11 | hooks: 12 | - id: ruff 13 | args: [--output-format, github, --fix, --select, I] 14 | - id: ruff-format 15 | - repo: https://github.com/crate-ci/typos 16 | rev: v1.32.0 17 | hooks: 18 | - id: typos 19 | - repo: https://github.com/jackdewinter/pymarkdown 20 | rev: v0.9.29 21 | hooks: 22 | - id: pymarkdown 23 | args: [fix] 24 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual environments 24 | venv/ 25 | ENV/ 26 | env/ 27 | .venv/ 28 | 29 | # IDE 30 | .idea/ 31 | .vscode/ 32 | *.swp 33 | *.swo 34 | *~ 35 | 36 | # Testing 37 | .pytest_cache/ 38 | .coverage 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | 43 | # Type checking 44 | .mypy_cache/ 45 | 46 | # Jupyter 47 | .ipynb_checkpoints/ 48 | 49 | # OS 50 | .DS_Store 51 | Thumbs.db 52 | 53 | # Project specific 54 | *.log 55 | /tmp/ 56 | screenshots/ 57 | 58 | # Keep old files during transition 59 | call_model.py 60 | app_package_name.py 61 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yaml: -------------------------------------------------------------------------------- 1 | name: "\U0001F680 Feature request" 2 | description: Submit a request for a new Open-AutoGLM / 提交一个新的 Open-AutoGLM 的功能建议 3 | labels: [ "feature" ] 4 | body: 5 | - type: textarea 6 | id: feature-request 7 | validations: 8 | required: true 9 | attributes: 10 | label: Feature request / 功能建议 11 | description: | 12 | A brief description of the functional proposal. Links to corresponding papers and code are desirable. 13 | 对功能建议的简述。最好提供对应的论文和代码链接 14 | 15 | - type: textarea 16 | id: motivation 17 | validations: 18 | required: true 19 | attributes: 20 | label: Motivation / 动机 21 | description: | 22 | Your motivation for making the suggestion. If that motivation is related to another GitHub issue, link to it here. 
23 | 您提出建议的动机。如果该动机与另一个 GitHub 问题有关,请在此处提供对应的链接。 24 | 25 | - type: textarea 26 | id: contribution 27 | validations: 28 | required: true 29 | attributes: 30 | label: Your contribution / 您的贡献 31 | description: | 32 | 33 | Your PR link or any other link you can help with. 34 | 您的PR链接或者其他您能提供帮助的链接。 35 | -------------------------------------------------------------------------------- /phone_agent/adb/__init__.py: -------------------------------------------------------------------------------- 1 | """ADB utilities for Android device interaction.""" 2 | 3 | from phone_agent.adb.connection import ( 4 | ADBConnection, 5 | ConnectionType, 6 | DeviceInfo, 7 | list_devices, 8 | quick_connect, 9 | ) 10 | from phone_agent.adb.device import ( 11 | back, 12 | double_tap, 13 | get_current_app, 14 | home, 15 | launch_app, 16 | long_press, 17 | swipe, 18 | tap, 19 | ) 20 | from phone_agent.adb.input import ( 21 | clear_text, 22 | detect_and_set_adb_keyboard, 23 | restore_keyboard, 24 | type_text, 25 | ) 26 | from phone_agent.adb.screenshot import get_screenshot 27 | 28 | __all__ = [ 29 | # Screenshot 30 | "get_screenshot", 31 | # Input 32 | "type_text", 33 | "clear_text", 34 | "detect_and_set_adb_keyboard", 35 | "restore_keyboard", 36 | # Device control 37 | "get_current_app", 38 | "tap", 39 | "swipe", 40 | "back", 41 | "home", 42 | "double_tap", 43 | "long_press", 44 | "launch_app", 45 | # Connection management 46 | "ADBConnection", 47 | "DeviceInfo", 48 | "ConnectionType", 49 | "quick_connect", 50 | "list_devices", 51 | ] 52 | -------------------------------------------------------------------------------- /phone_agent/hdc/__init__.py: -------------------------------------------------------------------------------- 1 | """HDC utilities for HarmonyOS device interaction.""" 2 | 3 | from phone_agent.hdc.connection import ( 4 | HDCConnection, 5 | ConnectionType, 6 | DeviceInfo, 7 | list_devices, 8 | quick_connect, 9 | set_hdc_verbose, 10 | ) 11 | from phone_agent.hdc.device import ( 12 
| back, 13 | double_tap, 14 | get_current_app, 15 | home, 16 | launch_app, 17 | long_press, 18 | swipe, 19 | tap, 20 | ) 21 | from phone_agent.hdc.input import ( 22 | clear_text, 23 | detect_and_set_adb_keyboard, 24 | restore_keyboard, 25 | type_text, 26 | ) 27 | from phone_agent.hdc.screenshot import get_screenshot 28 | 29 | __all__ = [ 30 | # Screenshot 31 | "get_screenshot", 32 | # Input 33 | "type_text", 34 | "clear_text", 35 | "detect_and_set_adb_keyboard", 36 | "restore_keyboard", 37 | # Device control 38 | "get_current_app", 39 | "tap", 40 | "swipe", 41 | "back", 42 | "home", 43 | "double_tap", 44 | "long_press", 45 | "launch_app", 46 | # Connection management 47 | "HDCConnection", 48 | "DeviceInfo", 49 | "ConnectionType", 50 | "quick_connect", 51 | "list_devices", 52 | "set_hdc_verbose", 53 | ] 54 | -------------------------------------------------------------------------------- /phone_agent/config/__init__.py: -------------------------------------------------------------------------------- 1 | """Configuration module for Phone Agent.""" 2 | 3 | from phone_agent.config.apps import APP_PACKAGES 4 | from phone_agent.config.i18n import get_message, get_messages 5 | from phone_agent.config.prompts_en import SYSTEM_PROMPT as SYSTEM_PROMPT_EN 6 | from phone_agent.config.prompts_zh import SYSTEM_PROMPT as SYSTEM_PROMPT_ZH 7 | from phone_agent.config.timing import ( 8 | TIMING_CONFIG, 9 | ActionTimingConfig, 10 | ConnectionTimingConfig, 11 | DeviceTimingConfig, 12 | TimingConfig, 13 | get_timing_config, 14 | update_timing_config, 15 | ) 16 | 17 | 18 | def get_system_prompt(lang: str = "cn") -> str: 19 | """ 20 | Get system prompt by language. 21 | 22 | Args: 23 | lang: Language code, 'cn' for Chinese, 'en' for English. 24 | 25 | Returns: 26 | System prompt string. 
27 | """ 28 | if lang == "en": 29 | return SYSTEM_PROMPT_EN 30 | return SYSTEM_PROMPT_ZH 31 | 32 | 33 | # Default to Chinese for backward compatibility 34 | SYSTEM_PROMPT = SYSTEM_PROMPT_ZH 35 | 36 | __all__ = [ 37 | "APP_PACKAGES", 38 | "SYSTEM_PROMPT", 39 | "SYSTEM_PROMPT_ZH", 40 | "SYSTEM_PROMPT_EN", 41 | "get_system_prompt", 42 | "get_messages", 43 | "get_message", 44 | "TIMING_CONFIG", 45 | "TimingConfig", 46 | "ActionTimingConfig", 47 | "DeviceTimingConfig", 48 | "ConnectionTimingConfig", 49 | "get_timing_config", 50 | "update_timing_config", 51 | ] 52 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | We welcome your contributions to this repository. To ensure elegant code style and better code quality, we have prepared 4 | the following contribution guidelines. 5 | 6 | ## What We Accept 7 | 8 | + This PR fixes a typo or improves the documentation (if this is the case, you may skip the other checks). 9 | + This PR fixes a specific issue — please reference the issue number in the PR description. Make sure your code strictly 10 | follows the coding standards below. 11 | + This PR introduces a new feature — please clearly explain the necessity and implementation of the feature. Make sure 12 | your code strictly follows the coding standards below. 13 | 14 | ## Code Style Guide 15 | 16 | Good code style is an art. We have prepared a `pre-commit` hook to enforce consistent code 17 | formatting across the project. You can clean up your code following the steps below: 18 | 19 | ```shell 20 | pre-commit run --all-files 21 | ``` 22 | 23 | If your code complies with the standards, you should not see any errors. 24 | 25 | ## Naming Conventions 26 | 27 | + Please use **English** for naming; do not use Pinyin or other languages. All comments should also be in English. 
28 | + Follow **PEP8** naming conventions strictly, and use underscores to separate words. Avoid meaningless names such as 29 | `a`, `b`, `c`. 30 | 31 | ## For glmv-reward Contributors 32 | 33 | Before PR, Please run: 34 | 35 | ```bash 36 | cd glmv-reward/ 37 | uv sync 38 | uv run poe lint 39 | uv run poe typecheck 40 | ``` 41 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Setup script for Phone Agent.""" 3 | 4 | from setuptools import find_packages, setup 5 | 6 | with open("README.md", "r", encoding="utf-8") as f: 7 | long_description = f.read() 8 | 9 | setup( 10 | name="phone-agent", 11 | version="0.1.0", 12 | author="Zhipu AI", 13 | author_email="", 14 | description="AI-powered phone automation framework", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/yourusername/phone-agent", 18 | packages=find_packages(), 19 | classifiers=[ 20 | "Development Status :: 3 - Alpha", 21 | "Intended Audience :: Developers", 22 | "License :: OSI Approved :: Apache Software License", 23 | "Operating System :: OS Independent", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Topic :: Software Development :: Libraries :: Python Modules", 29 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 30 | ], 31 | python_requires=">=3.10", 32 | install_requires=[ 33 | "Pillow>=12.0.0", 34 | "openai>=2.9.0", 35 | ], 36 | extras_require={ 37 | "dev": [ 38 | "pytest>=7.0.0", 39 | "black>=23.0.0", 40 | "mypy>=1.0.0", 41 | "ruff>=0.1.0", 42 | ], 43 | }, 44 | entry_points={ 45 | "console_scripts": [ 46 | "phone-agent=main:main", 47 | ], 48 | }, 49 | ) 50 | 
#!/usr/bin/env python3
"""
Thinking Output Demo.

Demonstrates how the Agent prints both its thinking process and the actions
it executes when it runs in verbose mode.
"""

from phone_agent import PhoneAgent
from phone_agent.agent import AgentConfig
from phone_agent.config import get_messages
from phone_agent.model import ModelConfig


def main(lang: str = "cn"):
    """
    Run a single demo task with verbose agent output.

    Args:
        lang: Language code forwarded to get_messages and AgentConfig
            (the command line below offers 'cn' and 'en').
    """
    labels = get_messages(lang)
    rule = "=" * 60

    # Banner
    print(rule)
    print("Phone Agent - Thinking Demo")
    print(rule)

    # Model settings first, then agent settings (verbose=True enables the
    # detailed output this demo is about), then the agent itself.
    demo_agent = PhoneAgent(
        model_config=ModelConfig(
            base_url="http://localhost:8000/v1",
            model_name="autoglm-phone-9b",
            temperature=0.1,
        ),
        agent_config=AgentConfig(
            max_steps=10,
            verbose=True,
            lang=lang,
        ),
    )

    # Run the task and report what the agent returned.
    print(f"\n📱 {labels['starting_task']}...\n")
    outcome = demo_agent.run("打开小红书搜索美食攻略")

    print(f"\n{rule}")
    print(f"📊 {labels['final_result']}: {outcome}")
    print(rule)


if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="Phone Agent Thinking Demo")
    cli.add_argument(
        "--lang",
        type=str,
        default="cn",
        choices=["cn", "en"],
        help="Language for UI messages (cn=Chinese, en=English)",
    )

    main(lang=cli.parse_args().lang)
"""Internationalization (i18n) module for Phone Agent UI messages."""

# Chinese messages
MESSAGES_ZH = {
    "thinking": "思考过程",
    "action": "执行动作",
    "task_completed": "任务完成",
    "done": "完成",
    "starting_task": "开始执行任务",
    "final_result": "最终结果",
    "task_result": "任务结果",
    "confirmation_required": "需要确认",
    "continue_prompt": "是否继续?(y/n)",
    "manual_operation_required": "需要人工操作",
    "manual_operation_hint": "请手动完成操作...",
    "press_enter_when_done": "完成后按回车继续",
    "connection_failed": "连接失败",
    "connection_successful": "连接成功",
    "step": "步骤",
    "task": "任务",
    "result": "结果",
    "performance_metrics": "性能指标",
    "time_to_first_token": "首 Token 延迟 (TTFT)",
    "time_to_thinking_end": "思考完成延迟",
    "total_inference_time": "总推理时间",
}

# English messages
MESSAGES_EN = {
    "thinking": "Thinking",
    "action": "Action",
    "task_completed": "Task Completed",
    "done": "Done",
    "starting_task": "Starting task",
    "final_result": "Final Result",
    "task_result": "Task Result",
    "confirmation_required": "Confirmation Required",
    "continue_prompt": "Continue? (y/n)",
    "manual_operation_required": "Manual Operation Required",
    "manual_operation_hint": "Please complete the operation manually...",
    "press_enter_when_done": "Press Enter when done",
    "connection_failed": "Connection Failed",
    "connection_successful": "Connection Successful",
    "step": "Step",
    "task": "Task",
    "result": "Result",
    "performance_metrics": "Performance Metrics",
    "time_to_first_token": "Time to First Token (TTFT)",
    "time_to_thinking_end": "Time to Thinking End",
    "total_inference_time": "Total Inference Time",
}


def get_messages(lang: str = "cn") -> dict:
    """
    Select the UI message table for a language.

    Args:
        lang: Language code. 'en' selects the English table; every other
            value (including the default 'cn') selects the Chinese table.

    Returns:
        The module-level message dictionary itself (not a copy).
    """
    return MESSAGES_EN if lang == "en" else MESSAGES_ZH


def get_message(key: str, lang: str = "cn") -> str:
    """
    Look up one UI message.

    Args:
        key: Message key.
        lang: Language code, interpreted as in get_messages.

    Returns:
        The message text, or the key itself when the table has no such key.
    """
    return get_messages(lang).get(key, key)
31 | **Example**: 32 | 33 | do(action="Tap", element=[x,y]) 34 | 35 | - **Type** 36 | Enter text into the currently focused input field. 37 | **Example**: 38 | 39 | do(action="Type", text="Hello World") 40 | 41 | - **Swipe** 42 | Perform a swipe action with start point and end point. 43 | **Examples**: 44 | 45 | do(action="Swipe", start=[x1,y1], end=[x2,y2]) 46 | 47 | - **Long Press** 48 | Perform a long press action on a specified screen area. 49 | You can add the element to the action to specify the long press area. The element is a list of 2 integers, representing the coordinates of the long press point. 50 | **Example**: 51 | 52 | do(action="Long Press", element=[x,y]) 53 | 54 | - **Launch** 55 | Launch an app. Try to use launch action when you need to launch an app. Check the instruction to choose the right app before you use this action. 56 | **Example**: 57 | 58 | do(action="Launch", app="Settings") 59 | 60 | - **Back** 61 | Press the Back button to navigate to the previous screen. 62 | **Example**: 63 | 64 | do(action="Back") 65 | 66 | - **Finish** 67 | Terminate the program and optionally print a message. 68 | **Example**: 69 | 70 | finish(message="Task completed.") 71 | 72 | 73 | 74 | REMEMBER: 75 | - Think before you act: Always analyze the current UI and the best course of action before executing any step, and output in part. 76 | - Only ONE LINE of action in part per response: Each step must contain exactly one line of executable code. 77 | - Generate execution code strictly according to format requirements. 
78 | """ 79 | ) 80 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yaml: -------------------------------------------------------------------------------- 1 | name: "\U0001F41B Bug Report" 2 | description: Submit a bug report to help us improve Open-AutoGLM / 提交一个 Bug 问题报告来帮助我们改进 Open-AutoGLM 3 | body: 4 | - type: textarea 5 | id: system-info 6 | attributes: 7 | label: System Info / 系統信息 8 | description: Your operating environment / 您的运行环境信息 9 | placeholder: Includes Cuda version, Transformers version, Python version, operating system, hardware information (if you suspect a hardware problem)... / 包括Cuda版本,Transformers版本,Python版本,操作系统,硬件信息(如果您怀疑是硬件方面的问题)... 10 | validations: 11 | required: true 12 | 13 | - type: textarea 14 | id: who-can-help 15 | attributes: 16 | label: Who can help? / 谁可以帮助到您? 17 | description: | 18 | Your issue will be replied to more quickly if you can figure out the right person to tag with @ 19 | All issues are read by one of the maintainers, so if you don't know who to tag, just leave this blank and our maintainer will ping the right person. 20 | 21 | Please tag fewer than 3 people. 22 | 23 | 如果您能找到合适的标签 @,您的问题会更快得到回复。 24 | 所有问题都会由我们的维护者阅读,如果您不知道该标记谁,只需留空,我们的维护人员会找到合适的开发组成员来解决问题。 25 | 26 | 标记的人数应该不超过 3 个人。 27 | 28 | If it's not a bug in these three subsections, you may not specify the helper. Our maintainer will find the right person in the development group to solve the problem. 29 | 30 | 如果不是这三个子版块的bug,您可以不指明帮助者,我们的维护人员会找到合适的开发组成员来解决问题。 31 | 32 | placeholder: "@Username ..." 
33 | 34 | - type: checkboxes 35 | id: information-scripts-examples 36 | attributes: 37 | label: Information / 问题信息 38 | description: 'The problem arises when using: / 问题出现在' 39 | options: 40 | - label: "The official example scripts / 官方的示例脚本" 41 | - label: "My own modified scripts / 我自己修改的脚本和任务" 42 | 43 | - type: textarea 44 | id: reproduction 45 | validations: 46 | required: true 47 | attributes: 48 | label: Reproduction / 复现过程 49 | description: | 50 | Please provide a code example that reproduces the problem you encountered, preferably with a minimal reproduction unit. 51 | If you have code snippets, error messages, stack traces, please provide them here as well. 52 | Please format your code correctly using code tags. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 53 | Do not use screenshots, as they are difficult to read and (more importantly) do not allow others to copy and paste your code. 54 | 55 | 请提供能重现您遇到的问题的代码示例,最好是最小复现单元。 56 | 如果您有代码片段、错误信息、堆栈跟踪,也请在此提供。 57 | 请使用代码标签正确格式化您的代码。请参见 https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting 58 | 请勿使用截图,因为截图难以阅读,而且(更重要的是)不允许他人复制粘贴您的代码。 59 | placeholder: | 60 | Steps to reproduce the behavior/复现Bug的步骤: 61 | 62 | 1. 63 | 2. 64 | 3. 65 | 66 | - type: textarea 67 | id: expected-behavior 68 | validations: 69 | required: true 70 | attributes: 71 | label: Expected behavior / 期待表现 72 | description: "A clear and concise description of what you would expect to happen. 
/简单描述您期望发生的事情。" 73 | -------------------------------------------------------------------------------- /phone_agent/adb/input.py: -------------------------------------------------------------------------------- 1 | """Input utilities for Android device text input.""" 2 | 3 | import base64 4 | import subprocess 5 | from typing import Optional 6 | 7 | 8 | def type_text(text: str, device_id: str | None = None) -> None: 9 | """ 10 | Type text into the currently focused input field using ADB Keyboard. 11 | 12 | Args: 13 | text: The text to type. 14 | device_id: Optional ADB device ID for multi-device setups. 15 | 16 | Note: 17 | Requires ADB Keyboard to be installed on the device. 18 | See: https://github.com/nicnocquee/AdbKeyboard 19 | """ 20 | adb_prefix = _get_adb_prefix(device_id) 21 | encoded_text = base64.b64encode(text.encode("utf-8")).decode("utf-8") 22 | 23 | subprocess.run( 24 | adb_prefix 25 | + [ 26 | "shell", 27 | "am", 28 | "broadcast", 29 | "-a", 30 | "ADB_INPUT_B64", 31 | "--es", 32 | "msg", 33 | encoded_text, 34 | ], 35 | capture_output=True, 36 | text=True, 37 | ) 38 | 39 | 40 | def clear_text(device_id: str | None = None) -> None: 41 | """ 42 | Clear text in the currently focused input field. 43 | 44 | Args: 45 | device_id: Optional ADB device ID for multi-device setups. 46 | """ 47 | adb_prefix = _get_adb_prefix(device_id) 48 | 49 | subprocess.run( 50 | adb_prefix + ["shell", "am", "broadcast", "-a", "ADB_CLEAR_TEXT"], 51 | capture_output=True, 52 | text=True, 53 | ) 54 | 55 | 56 | def detect_and_set_adb_keyboard(device_id: str | None = None) -> str: 57 | """ 58 | Detect current keyboard and switch to ADB Keyboard if needed. 59 | 60 | Args: 61 | device_id: Optional ADB device ID for multi-device setups. 62 | 63 | Returns: 64 | The original keyboard IME identifier for later restoration. 
65 | """ 66 | adb_prefix = _get_adb_prefix(device_id) 67 | 68 | # Get current IME 69 | result = subprocess.run( 70 | adb_prefix + ["shell", "settings", "get", "secure", "default_input_method"], 71 | capture_output=True, 72 | text=True, 73 | ) 74 | current_ime = (result.stdout + result.stderr).strip() 75 | 76 | # Switch to ADB Keyboard if not already set 77 | if "com.android.adbkeyboard/.AdbIME" not in current_ime: 78 | subprocess.run( 79 | adb_prefix + ["shell", "ime", "set", "com.android.adbkeyboard/.AdbIME"], 80 | capture_output=True, 81 | text=True, 82 | ) 83 | 84 | # Warm up the keyboard 85 | type_text("", device_id) 86 | 87 | return current_ime 88 | 89 | 90 | def restore_keyboard(ime: str, device_id: str | None = None) -> None: 91 | """ 92 | Restore the original keyboard IME. 93 | 94 | Args: 95 | ime: The IME identifier to restore. 96 | device_id: Optional ADB device ID for multi-device setups. 97 | """ 98 | adb_prefix = _get_adb_prefix(device_id) 99 | 100 | subprocess.run( 101 | adb_prefix + ["shell", "ime", "set", ime], capture_output=True, text=True 102 | ) 103 | 104 | 105 | def _get_adb_prefix(device_id: str | None) -> list: 106 | """Get ADB command prefix with optional device specifier.""" 107 | if device_id: 108 | return ["adb", "-s", device_id] 109 | return ["adb"] 110 | -------------------------------------------------------------------------------- /phone_agent/adb/screenshot.py: -------------------------------------------------------------------------------- 1 | """Screenshot utilities for capturing Android device screen.""" 2 | 3 | import base64 4 | import os 5 | import subprocess 6 | import tempfile 7 | import uuid 8 | from dataclasses import dataclass 9 | from io import BytesIO 10 | from typing import Tuple 11 | 12 | from PIL import Image 13 | 14 | 15 | @dataclass 16 | class Screenshot: 17 | """Represents a captured screenshot.""" 18 | 19 | base64_data: str 20 | width: int 21 | height: int 22 | is_sensitive: bool = False 23 | 24 | 25 | def 
"""Screenshot utilities for capturing Android device screen."""

import base64
import os
import subprocess
import tempfile
import uuid
from dataclasses import dataclass
from io import BytesIO
from typing import Tuple

from PIL import Image


@dataclass
class Screenshot:
    """Represents a captured screenshot."""

    # PNG image bytes, base64-encoded as text.
    base64_data: str
    # Image dimensions in pixels.
    width: int
    height: int
    # True when the capture command reported a failure and the black
    # fallback image was returned instead.
    is_sensitive: bool = False


def get_screenshot(device_id: str | None = None, timeout: int = 10) -> Screenshot:
    """
    Capture a screenshot from the connected Android device.

    The image is captured to /sdcard/tmp.png on the device, pulled into a
    uniquely named local temp file, re-encoded as PNG and returned as
    base64. The local temp file is always removed before returning.

    Args:
        device_id: Optional ADB device ID for multi-device setups.
        timeout: Timeout in seconds for the on-device capture command
            (the pull step uses a fixed 5-second timeout).

    Returns:
        Screenshot object containing base64 data and dimensions.

    Note:
        If the screenshot fails (e.g., on sensitive screens like payment pages),
        a black fallback image is returned with is_sensitive=True. Any other
        error is printed and yields the fallback with is_sensitive=False.
    """
    temp_path = os.path.join(tempfile.gettempdir(), f"screenshot_{uuid.uuid4()}.png")
    adb_prefix = _get_adb_prefix(device_id)

    try:
        # Execute screenshot command
        result = subprocess.run(
            adb_prefix + ["shell", "screencap", "-p", "/sdcard/tmp.png"],
            capture_output=True,
            text=True,
            timeout=timeout,
        )

        # Check for screenshot failure (sensitive screen)
        output = result.stdout + result.stderr
        if "Status: -1" in output or "Failed" in output:
            return _create_fallback_screenshot(is_sensitive=True)

        # Pull screenshot to local temp path
        subprocess.run(
            adb_prefix + ["pull", "/sdcard/tmp.png", temp_path],
            capture_output=True,
            text=True,
            timeout=5,
        )

        if not os.path.exists(temp_path):
            return _create_fallback_screenshot(is_sensitive=False)

        # Read and encode image. The context manager closes the underlying
        # file before the temp file is removed below.
        with Image.open(temp_path) as img:
            width, height = img.size
            buffered = BytesIO()
            img.save(buffered, format="PNG")
        base64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

        return Screenshot(
            base64_data=base64_data, width=width, height=height, is_sensitive=False
        )

    except Exception as e:
        # Deliberate best-effort boundary: callers always get a Screenshot.
        print(f"Screenshot error: {e}")
        return _create_fallback_screenshot(is_sensitive=False)

    finally:
        # Cleanup on every path (success, fallback, error) so failed runs do
        # not leave files behind; a missing or locked file must not replace
        # the result being returned.
        try:
            os.remove(temp_path)
        except OSError:
            pass


def _get_adb_prefix(device_id: str | None) -> list[str]:
    """Get ADB command prefix with optional device specifier."""
    if device_id:
        return ["adb", "-s", device_id]
    return ["adb"]


def _create_fallback_screenshot(is_sensitive: bool) -> Screenshot:
    """
    Create a black fallback image when screenshot fails.

    Args:
        is_sensitive: Value stored on the returned Screenshot's
            is_sensitive field.

    Returns:
        A 1080x2400 all-black PNG wrapped in a Screenshot.
    """
    default_width, default_height = 1080, 2400

    black_img = Image.new("RGB", (default_width, default_height), color="black")
    buffered = BytesIO()
    black_img.save(buffered, format="PNG")
    base64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

    return Screenshot(
        base64_data=base64_data,
        width=default_width,
        height=default_height,
        is_sensitive=is_sensitive,
    )
help="包含测试消息的 JSON 文件路径 (默认: scripts/sample_messages.json)", 41 | ) 42 | 43 | parser.add_argument( 44 | "--max-tokens", type=int, default=3000, help="最大生成 token 数 (默认: 3000)" 45 | ) 46 | 47 | parser.add_argument( 48 | "--temperature", type=float, default=0.0, help="采样温度 (默认: 0.0)" 49 | ) 50 | 51 | parser.add_argument( 52 | "--top_p", type=float, default=0.85, help="nucleus sampling 参数 (默认: 0.85)" 53 | ) 54 | 55 | parser.add_argument( 56 | "--frequency_penalty", type=float, default=0.2, help="频率惩罚参数 (默认: 0.2)" 57 | ) 58 | 59 | args = parser.parse_args() 60 | 61 | # 读取测试消息 62 | if not os.path.exists(args.messages_file): 63 | print(f"错误: 消息文件 {args.messages_file} 不存在") 64 | exit(1) 65 | 66 | with open(args.messages_file) as f: 67 | messages = json.load(f) 68 | 69 | base_url = args.base_url 70 | api_key = args.apikey 71 | model = args.model 72 | 73 | print(f"开始测试模型推理...") 74 | print(f"Base URL: {base_url}") 75 | print(f"Model: {model}") 76 | print(f"Messages file: {args.messages_file}") 77 | print("=" * 80) 78 | 79 | try: 80 | client = OpenAI( 81 | base_url=base_url, 82 | api_key=api_key, 83 | ) 84 | 85 | response = client.chat.completions.create( 86 | messages=messages, 87 | model=model, 88 | max_tokens=args.max_tokens, 89 | temperature=args.temperature, 90 | top_p=args.top_p, 91 | frequency_penalty=args.frequency_penalty, 92 | stream=False, 93 | ) 94 | 95 | print("\n模型推理结果:") 96 | print("=" * 80) 97 | print(response.choices[0].message.content) 98 | print("=" * 80) 99 | 100 | if response.usage: 101 | print(f"\n统计信息:") 102 | print(f" - Prompt tokens: {response.usage.prompt_tokens}") 103 | print(f" - Completion tokens: {response.usage.completion_tokens}") 104 | print(f" - Total tokens: {response.usage.total_tokens}") 105 | 106 | print(f"\n请根据上述推理结果判断模型部署是否符合预期。") 107 | 108 | except Exception as e: 109 | print(f"\n调用 API 时发生错误:") 110 | print(f"错误类型: {type(e).__name__}") 111 | print(f"错误信息: {str(e)}") 112 | print( 113 | "\n提示: 请检查 base_url、api_key 和 model 
参数是否正确,以及服务是否正在运行。" 114 | ) 115 | exit(1) 116 | -------------------------------------------------------------------------------- /phone_agent/config/prompts.py: -------------------------------------------------------------------------------- 1 | """System prompts for the AI agent.""" 2 | 3 | from datetime import datetime 4 | 5 | today = datetime.today() 6 | formatted_date = today.strftime("%Y年%m月%d日") 7 | 8 | SYSTEM_PROMPT = ( 9 | "今天的日期是: " 10 | + formatted_date 11 | + """ 12 | 你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。 13 | 你必须严格按照要求输出以下格式: 14 | {think} 15 | {action} 16 | 17 | 其中: 18 | - {think} 是对你为什么选择这个操作的简短推理说明。 19 | - {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。 20 | 21 | 操作指令及其作用如下: 22 | - do(action="Launch", app="xxx") 23 | Launch是启动目标app的操作,这比通过主屏幕导航更快。此操作完成后,您将自动收到结果状态的截图。 24 | - do(action="Tap", element=[x,y]) 25 | Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。 26 | - do(action="Tap", element=[x,y], message="重要操作") 27 | 基本功能同Tap,点击涉及财产、支付、隐私等敏感按钮时触发。 28 | - do(action="Type", text="xxx") 29 | Type是输入操作,在当前聚焦的输入框中输入文本。使用此操作前,请确保输入框已被聚焦(先点击它)。输入的文本将像使用键盘输入一样输入。重要提示:手机可能正在使用 ADB 键盘,该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活,请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本,或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本:当你使用输入操作时,输入框中现有的任何文本(包括占位符文本和实际输入)都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后,你将自动收到结果状态的截图。 30 | - do(action="Type_Name", text="xxx") 31 | Type_Name是输入人名的操作,基本功能同Type。 32 | - do(action="Interact") 33 | Interact是当有多个满足条件的选项时而触发的交互操作,询问用户如何选择。 34 | - do(action="Swipe", start=[x1,y1], end=[x2,y2]) 35 | Swipe是滑动操作,通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后,您将自动收到结果状态的截图。 36 | - do(action="Note", message="True") 37 | 记录当前页面内容以便后续总结。 38 | - do(action="Call_API", instruction="xxx") 39 | 总结或评论当前页面或已记录的内容。 40 | - do(action="Long Press", element=[x,y]) 41 | Long Pres是长按操作,在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 
开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的屏幕截图。 42 | - do(action="Double Tap", element=[x,y]) 43 | Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互,如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。 44 | - do(action="Take_over", message="xxx") 45 | Take_over是接管操作,表示在登录和验证阶段需要用户协助。 46 | - do(action="Back") 47 | 导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后,您将自动收到结果状态的截图。 48 | - do(action="Home") 49 | Home是回到系统桌面的操作,相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器,或从已知状态启动新任务。此操作完成后,您将自动收到结果状态的截图。 50 | - do(action="Wait", duration="x seconds") 51 | 等待页面加载,x为需要等待多少秒。 52 | - finish(message="xxx") 53 | finish是结束任务的操作,表示准确完整完成任务,message是终止信息。 54 | 55 | 必须遵循的规则: 56 | 1. 在执行任何操作前,先检查当前app是否是目标app,如果不是,先执行 Launch。 57 | 2. 如果进入到了无关页面,先执行 Back。如果执行Back后页面没有变化,请点击页面左上角的返回键进行返回,或者右上角的X号关闭。 58 | 3. 如果页面未加载出内容,最多连续 Wait 三次,否则执行 Back重新进入。 59 | 4. 如果页面显示网络问题,需要重新加载,请点击重新加载。 60 | 5. 如果当前页面找不到目标联系人、商品、店铺等信息,可以尝试 Swipe 滑动查找。 61 | 6. 遇到价格区间、时间区间等筛选条件,如果没有完全符合的,可以放宽要求。 62 | 7. 在做小红书总结类任务时一定要筛选图文笔记。 63 | 8. 购物车全选后再点击全选可以把状态设为全不选,在做购物车任务时,如果购物车里已经有商品被选中时,你需要点击全选后再点击取消全选,再去找需要购买或者删除的商品。 64 | 9. 在做外卖任务时,如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。 65 | 10. 在做点外卖任务时,如果用户需要点多个外卖,请尽量在同一店铺进行购买,如果无法找到可以下单,并说明某个商品未找到。 66 | 11. 请严格遵循用户意图执行任务,用户的特殊要求可以执行多次搜索,滑动查找。比如(i)用户要求点一杯咖啡,要咸的,你可以直接搜索咸咖啡,或者搜索咖啡后滑动查找咸的咖啡,比如海盐咖啡。(ii)用户要找到XX群,发一条消息,你可以先搜索XX群,找不到结果后,将"群"字去掉,搜索XX重试。(iii)用户要找到宠物友好的餐厅,你可以搜索餐厅,找到筛选,找到设施,选择可带宠物,或者直接搜索可带宠物,必要时可以使用AI搜索。 67 | 12. 在选择日期时,如果原滑动方向与预期日期越来越远,请向反方向滑动查找。 68 | 13. 执行任务过程中如果有多个可选择的项目栏,请逐个查找每个项目栏,直到完成任务,一定不要在同一项目栏多次查找,从而陷入死循环。 69 | 14. 在执行下一步操作前请一定要检查上一步的操作是否生效,如果点击没生效,可能因为app反应较慢,请先稍微等待一下,如果还是不生效请调整一下点击位置重试,如果仍然不生效请跳过这一步继续任务,并在finish message说明点击不生效。 70 | 15. 在执行任务中如果遇到滑动不生效的情况,请调整一下起始点位置,增大滑动距离重试,如果还是不生效,有可能是已经滑到底了,请继续向反方向滑动,直到顶部或底部,如果仍然没有符合要求的结果,请跳过这一步继续任务,并在finish message说明但没找到要求的项目。 71 | 16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗,如果多轮历史状态相似要检查自动战斗是否开启。 72 | 17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message="原因")。 73 | 18. 
在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。 74 | """ 75 | ) 76 | -------------------------------------------------------------------------------- /resources/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /phone_agent/config/prompts_zh.py: -------------------------------------------------------------------------------- 1 | """System prompts for the AI agent.""" 2 | 3 | from datetime import datetime 4 | 5 | today = datetime.today() 6 | weekday_names = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"] 7 | weekday = weekday_names[today.weekday()] 8 | formatted_date = today.strftime("%Y年%m月%d日") + " " + weekday 9 | 10 | SYSTEM_PROMPT = ( 11 | "今天的日期是: " 12 | + formatted_date 13 | + """ 14 | 你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。 15 | 你必须严格按照要求输出以下格式: 16 | {think} 17 | {action} 18 | 19 | 其中: 20 | - {think} 是对你为什么选择这个操作的简短推理说明。 21 | - {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。 22 | 23 | 操作指令及其作用如下: 24 | - do(action="Launch", app="xxx") 25 | Launch是启动目标app的操作,这比通过主屏幕导航更快。此操作完成后,您将自动收到结果状态的截图。 26 | - do(action="Tap", element=[x,y]) 27 | Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。 28 | - do(action="Tap", element=[x,y], message="重要操作") 29 | 基本功能同Tap,点击涉及财产、支付、隐私等敏感按钮时触发。 30 | - do(action="Type", text="xxx") 31 | Type是输入操作,在当前聚焦的输入框中输入文本。使用此操作前,请确保输入框已被聚焦(先点击它)。输入的文本将像使用键盘输入一样输入。重要提示:手机可能正在使用 ADB 键盘,该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活,请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本,或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本:当你使用输入操作时,输入框中现有的任何文本(包括占位符文本和实际输入)都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后,你将自动收到结果状态的截图。 32 | - do(action="Type_Name", text="xxx") 33 | Type_Name是输入人名的操作,基本功能同Type。 34 | - do(action="Interact") 35 | Interact是当有多个满足条件的选项时而触发的交互操作,询问用户如何选择。 36 | - do(action="Swipe", 
start=[x1,y1], end=[x2,y2]) 37 | Swipe是滑动操作,通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后,您将自动收到结果状态的截图。 38 | - do(action="Note", message="True") 39 | 记录当前页面内容以便后续总结。 40 | - do(action="Call_API", instruction="xxx") 41 | 总结或评论当前页面或已记录的内容。 42 | - do(action="Long Press", element=[x,y]) 43 | Long Pres是长按操作,在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的屏幕截图。 44 | - do(action="Double Tap", element=[x,y]) 45 | Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互,如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。 46 | - do(action="Take_over", message="xxx") 47 | Take_over是接管操作,表示在登录和验证阶段需要用户协助。 48 | - do(action="Back") 49 | 导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后,您将自动收到结果状态的截图。 50 | - do(action="Home") 51 | Home是回到系统桌面的操作,相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器,或从已知状态启动新任务。此操作完成后,您将自动收到结果状态的截图。 52 | - do(action="Wait", duration="x seconds") 53 | 等待页面加载,x为需要等待多少秒。 54 | - finish(message="xxx") 55 | finish是结束任务的操作,表示准确完整完成任务,message是终止信息。 56 | 57 | 必须遵循的规则: 58 | 1. 在执行任何操作前,先检查当前app是否是目标app,如果不是,先执行 Launch。 59 | 2. 如果进入到了无关页面,先执行 Back。如果执行Back后页面没有变化,请点击页面左上角的返回键进行返回,或者右上角的X号关闭。 60 | 3. 如果页面未加载出内容,最多连续 Wait 三次,否则执行 Back重新进入。 61 | 4. 如果页面显示网络问题,需要重新加载,请点击重新加载。 62 | 5. 如果当前页面找不到目标联系人、商品、店铺等信息,可以尝试 Swipe 滑动查找。 63 | 6. 遇到价格区间、时间区间等筛选条件,如果没有完全符合的,可以放宽要求。 64 | 7. 在做小红书总结类任务时一定要筛选图文笔记。 65 | 8. 购物车全选后再点击全选可以把状态设为全不选,在做购物车任务时,如果购物车里已经有商品被选中时,你需要点击全选后再点击取消全选,再去找需要购买或者删除的商品。 66 | 9. 在做外卖任务时,如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。 67 | 10. 在做点外卖任务时,如果用户需要点多个外卖,请尽量在同一店铺进行购买,如果无法找到可以下单,并说明某个商品未找到。 68 | 11. 请严格遵循用户意图执行任务,用户的特殊要求可以执行多次搜索,滑动查找。比如(i)用户要求点一杯咖啡,要咸的,你可以直接搜索咸咖啡,或者搜索咖啡后滑动查找咸的咖啡,比如海盐咖啡。(ii)用户要找到XX群,发一条消息,你可以先搜索XX群,找不到结果后,将"群"字去掉,搜索XX重试。(iii)用户要找到宠物友好的餐厅,你可以搜索餐厅,找到筛选,找到设施,选择可带宠物,或者直接搜索可带宠物,必要时可以使用AI搜索。 69 | 12. 在选择日期时,如果原滑动方向与预期日期越来越远,请向反方向滑动查找。 70 | 13. 执行任务过程中如果有多个可选择的项目栏,请逐个查找每个项目栏,直到完成任务,一定不要在同一项目栏多次查找,从而陷入死循环。 71 | 14. 
在执行下一步操作前请一定要检查上一步的操作是否生效,如果点击没生效,可能因为app反应较慢,请先稍微等待一下,如果还是不生效请调整一下点击位置重试,如果仍然不生效请跳过这一步继续任务,并在finish message说明点击不生效。 72 | 15. 在执行任务中如果遇到滑动不生效的情况,请调整一下起始点位置,增大滑动距离重试,如果还是不生效,有可能是已经滑到底了,请继续向反方向滑动,直到顶部或底部,如果仍然没有符合要求的结果,请跳过这一步继续任务,并在finish message说明但没找到要求的项目。 73 | 16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗,如果多轮历史状态相似要检查自动战斗是否开启。 74 | 17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message="原因")。 75 | 18. 在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。 76 | """ 77 | ) 78 | -------------------------------------------------------------------------------- /scripts/check_deployment_en.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | from openai import OpenAI 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser( 9 | description="Tool for checking if model deployment is successful", 10 | formatter_class=argparse.RawDescriptionHelpFormatter, 11 | epilog=""" 12 | Usage examples: 13 | python scripts/check_deployment_en.py --base-url http://localhost:8000/v1 --apikey your-key --model autoglm-phone-9b 14 | python scripts/check_deployment_en.py --base-url http://localhost:8000/v1 --apikey your-key --model autoglm-phone-9b --messages-file custom.json 15 | """, 16 | ) 17 | 18 | parser.add_argument( 19 | "--base-url", 20 | type=str, 21 | required=True, 22 | help="Base URL of the API service, e.g.: http://localhost:8000/v1", 23 | ) 24 | 25 | parser.add_argument( 26 | "--apikey", type=str, default="EMPTY", help="API key (default: EMPTY)" 27 | ) 28 | 29 | parser.add_argument( 30 | "--model", 31 | type=str, 32 | required=True, 33 | help="Name of the model to test, e.g.: autoglm-phone-9b", 34 | ) 35 | 36 | parser.add_argument( 37 | "--messages-file", 38 | type=str, 39 | default="scripts/sample_messages_en.json", 40 | help="Path to JSON file containing test messages (default: scripts/sample_messages_en.json)", 41 | ) 42 | 43 | 
parser.add_argument( 44 | "--max-tokens", 45 | type=int, 46 | default=3000, 47 | help="Maximum generation tokens (default: 3000)", 48 | ) 49 | 50 | parser.add_argument( 51 | "--temperature", 52 | type=float, 53 | default=0.0, 54 | help="Sampling temperature (default: 0.0)", 55 | ) 56 | 57 | parser.add_argument( 58 | "--top_p", 59 | type=float, 60 | default=0.85, 61 | help="Nucleus sampling parameter (default: 0.85)", 62 | ) 63 | 64 | parser.add_argument( 65 | "--frequency_penalty", 66 | type=float, 67 | default=0.2, 68 | help="Frequency penalty parameter (default: 0.2)", 69 | ) 70 | 71 | args = parser.parse_args() 72 | 73 | # Read test messages 74 | if not os.path.exists(args.messages_file): 75 | print(f"Error: Message file {args.messages_file} does not exist") 76 | exit(1) 77 | 78 | with open(args.messages_file) as f: 79 | messages = json.load(f) 80 | 81 | base_url = args.base_url 82 | api_key = args.apikey 83 | model = args.model 84 | 85 | print(f"Starting model inference test...") 86 | print(f"Base URL: {base_url}") 87 | print(f"Model: {model}") 88 | print(f"Messages file: {args.messages_file}") 89 | print("=" * 80) 90 | 91 | try: 92 | client = OpenAI( 93 | base_url=base_url, 94 | api_key=api_key, 95 | ) 96 | 97 | response = client.chat.completions.create( 98 | messages=messages, 99 | model=model, 100 | max_tokens=args.max_tokens, 101 | temperature=args.temperature, 102 | top_p=args.top_p, 103 | frequency_penalty=args.frequency_penalty, 104 | stream=False, 105 | ) 106 | 107 | print("\nModel inference result:") 108 | print("=" * 80) 109 | print(response.choices[0].message.content) 110 | print("=" * 80) 111 | 112 | if response.usage: 113 | print(f"\nStatistics:") 114 | print(f" - Prompt tokens: {response.usage.prompt_tokens}") 115 | print(f" - Completion tokens: {response.usage.completion_tokens}") 116 | print(f" - Total tokens: {response.usage.total_tokens}") 117 | 118 | print( 119 | f"\nPlease evaluate the above inference result to determine if the model 
deployment meets expectations." 120 | ) 121 | 122 | except Exception as e: 123 | print(f"\nError occurred while calling API:") 124 | print(f"Error type: {type(e).__name__}") 125 | print(f"Error message: {str(e)}") 126 | print( 127 | "\nTip: Please check if base_url, api_key and model parameters are correct, and if the service is running." 128 | ) 129 | exit(1) 130 | -------------------------------------------------------------------------------- /phone_agent/hdc/screenshot.py: -------------------------------------------------------------------------------- 1 | """Screenshot utilities for capturing HarmonyOS device screen.""" 2 | 3 | import base64 4 | import os 5 | import subprocess 6 | import tempfile 7 | import uuid 8 | from dataclasses import dataclass 9 | from io import BytesIO 10 | from typing import Tuple 11 | 12 | from PIL import Image 13 | from phone_agent.hdc.connection import _run_hdc_command 14 | 15 | 16 | @dataclass 17 | class Screenshot: 18 | """Represents a captured screenshot.""" 19 | 20 | base64_data: str 21 | width: int 22 | height: int 23 | is_sensitive: bool = False 24 | 25 | 26 | def get_screenshot(device_id: str | None = None, timeout: int = 10) -> Screenshot: 27 | """ 28 | Capture a screenshot from the connected HarmonyOS device. 29 | 30 | Args: 31 | device_id: Optional HDC device ID for multi-device setups. 32 | timeout: Timeout in seconds for screenshot operations. 33 | 34 | Returns: 35 | Screenshot object containing base64 data and dimensions. 36 | 37 | Note: 38 | If the screenshot fails (e.g., on sensitive screens like payment pages), 39 | a black fallback image is returned with is_sensitive=True. 
40 | """ 41 | temp_path = os.path.join(tempfile.gettempdir(), f"screenshot_{uuid.uuid4()}.png") 42 | hdc_prefix = _get_hdc_prefix(device_id) 43 | 44 | try: 45 | # Execute screenshot command 46 | # HarmonyOS HDC only supports JPEG format 47 | remote_path = "/data/local/tmp/tmp_screenshot.jpeg" 48 | 49 | # Try method 1: hdc shell screenshot (newer HarmonyOS versions) 50 | result = _run_hdc_command( 51 | hdc_prefix + ["shell", "screenshot", remote_path], 52 | capture_output=True, 53 | text=True, 54 | timeout=timeout, 55 | ) 56 | 57 | # Check for screenshot failure (sensitive screen) 58 | output = result.stdout + result.stderr 59 | if "fail" in output.lower() or "error" in output.lower() or "not found" in output.lower(): 60 | # Try method 2: snapshot_display (older versions or different devices) 61 | result = _run_hdc_command( 62 | hdc_prefix + ["shell", "snapshot_display", "-f", remote_path], 63 | capture_output=True, 64 | text=True, 65 | timeout=timeout, 66 | ) 67 | output = result.stdout + result.stderr 68 | if "fail" in output.lower() or "error" in output.lower(): 69 | return _create_fallback_screenshot(is_sensitive=True) 70 | 71 | # Pull screenshot to local temp path 72 | # Note: remote file is JPEG, but PIL can open it regardless of local extension 73 | _run_hdc_command( 74 | hdc_prefix + ["file", "recv", remote_path, temp_path], 75 | capture_output=True, 76 | text=True, 77 | timeout=5, 78 | ) 79 | 80 | if not os.path.exists(temp_path): 81 | return _create_fallback_screenshot(is_sensitive=False) 82 | 83 | # Read JPEG image and convert to PNG for model inference 84 | # PIL automatically detects the image format from file content 85 | img = Image.open(temp_path) 86 | width, height = img.size 87 | 88 | buffered = BytesIO() 89 | img.save(buffered, format="PNG") 90 | base64_data = base64.b64encode(buffered.getvalue()).decode("utf-8") 91 | 92 | # Cleanup 93 | os.remove(temp_path) 94 | 95 | return Screenshot( 96 | base64_data=base64_data, width=width, height=height, 
is_sensitive=False 97 | ) 98 | 99 | except Exception as e: 100 | print(f"Screenshot error: {e}") 101 | return _create_fallback_screenshot(is_sensitive=False) 102 | 103 | 104 | def _get_hdc_prefix(device_id: str | None) -> list: 105 | """Get HDC command prefix with optional device specifier.""" 106 | if device_id: 107 | return ["hdc", "-t", device_id] 108 | return ["hdc"] 109 | 110 | 111 | def _create_fallback_screenshot(is_sensitive: bool) -> Screenshot: 112 | """Create a black fallback image when screenshot fails.""" 113 | default_width, default_height = 1080, 2400 114 | 115 | black_img = Image.new("RGB", (default_width, default_height), color="black") 116 | buffered = BytesIO() 117 | black_img.save(buffered, format="PNG") 118 | base64_data = base64.b64encode(buffered.getvalue()).decode("utf-8") 119 | 120 | return Screenshot( 121 | base64_data=base64_data, 122 | width=default_width, 123 | height=default_height, 124 | is_sensitive=is_sensitive, 125 | ) 126 | -------------------------------------------------------------------------------- /resources/privacy_policy.txt: -------------------------------------------------------------------------------- 1 | 第一部分:模型/技术的安全性说明 2 | 3 | 1. AutoGLM 技术机制与部署灵活性 4 | AutoGLM 的核心功能是自动化操作执行。其工作原理如下: 5 | - 指令驱动: 基于用户或开发者发出的操作指令。 6 | - 屏幕理解: 获取当前操作环境的屏幕内容,将图像发送给大模型(可部署在本地或云端)进行分析理解。 7 | - 操作模拟: 模拟人类操作方式(如点击、滑动、输入信息等)在目标环境中完成任务。 8 | - 示例: 当指令要求订购高铁票时,AutoGLM 会打开相关应用,识别界面内容,按指令选择车次、完成下单等步骤,如同人工操作,用户或开发者可随时终止任务。 9 | 10 | 关键灵活性: 11 | - 模型部署: 开发者可自由选择将 AutoGLM 模型部署在本地设备或云端服务器上。 12 | - 操作执行环境: 自动化操作可以在本地设备上执行,也可以在云设备上执行,具体由开发者根据应用场景和需求决定。 13 | - 数据流向: 数据流向取决于部署选择: 14 | - 本地部署(模型+执行): 屏幕捕获、模型分析、操作执行均在本地设备完成,数据不离开设备,隐私性最高。 15 | - 云端部署(模型+执行): 屏幕内容需从操作环境(本机或云设备)传输到云端模型,模型分析后指令返回操作环境执行。开发者需确保传输和云端处理的安全性。 16 | - 混合部署(如本地执行+云端模型): 屏幕内容在本地捕获,传输到云端模型分析,分析结果返回本地执行。开发者需关注数据传输安全。 17 | 18 | 2. 
系统权限调用说明(针对操作执行环境) 19 | 为保证自动化操作正常执行,运行 AutoGLM 操作的环境可能需要获取以下权限: 20 | - ADB (Android Debug Bridge) 权限: 用于获取信息并模拟点击、滑动、输入等用户交互操作。 21 | - 存储权限: 用于临时存储必要的数据、模型文件(若本地部署)或日志。 22 | - 网络权限: 用于访问在线服务(如调用云端模型、访问目标应用服务)。 23 | - 其他特定权限: 根据具体任务可能需要(如麦克风用于语音指令)。 24 | 25 | 开发者责任: 26 | - 最小权限原则: 仅请求完成特定任务所必需的权限。 27 | - 透明告知: 在应用或服务中清晰、明确地向最终用户说明每个权限的用途和必要性。 28 | - 用户授权: 必须获得最终用户的明确授权后,才能在操作环境中启用相关权限和功能。 29 | - 环境适配: 确保权限请求和获取机制适配所选择的操作执行环境(本地或云)。 30 | 31 | 3. 数据处理与隐私保护原则 32 | AutoGLM 开源项目本身不收集用户数据。数据处理和隐私保护的责任主体是基于 AutoGLM 开发具体应用或服务的开发者,其责任取决于部署方式: 33 | - 本地部署(模型+执行): 34 | - 开发者需在应用层面实现本地数据的安全存储和处理,所有数据处理(屏幕捕获、模型分析、操作执行)均在最终用户的本地设备上完成。 35 | - 开发者应确保其应用不主动将敏感数据(如屏幕内容、操作记录)上传到开发者服务器或第三方,除非用户明确知情同意且为必要功能。 36 | - 云端部署(模型或执行或两者): 37 | - 涉及数据(屏幕内容、操作指令、模型分析结果)在操作环境与云端之间传输。 38 | - 开发者必须: 39 | - 实施强加密保护所有传输和存储的数据。 40 | - 明确告知最终用户哪些数据会被发送到云端、发送目的、存储位置及保留期限,获得最终用户对数据传输和云端处理的明确同意。 41 | - 遵守适用的数据保护法规,提供清晰的隐私政策,说明数据处理实践。 42 | - 确保云端环境(模型服务器、操作环境服务器)的安全配置和访问控制。 43 | - 通用原则(所有部署方式): 44 | - 数据最小化: 仅收集和处理完成自动化任务所绝对必需的最少信息。 45 | - 目的限制: 数据仅用于实现用户指令的特定自动化操作目的。 46 | - 安全保障: 开发者有责任采取合理的技术和管理措施,保护其处理的所有用户数据(无论在本地还是云端)的安全性和保密性,防止未经授权的访问、使用、泄露或丢失。 47 | - 用户控制: 提供机制让最终用户能够查看、管理(如删除)与其相关的数据(在技术可行且符合部署方式的前提下)。 48 | 49 | 50 | 51 | 第二部分:开发者/用户应该遵循的使用规范 52 | 53 | 开发者/用户在使用AutoGLM开源项目过程中,应始终遵循《中华人民共和国网络安全法》《互联网信息服务算法推荐管理规定》《互联网信息服务深度合成管理规定》《生成式人工智能服务管理暂行办法》《网络安全技术 生成式人工智能服务安全基本要求》等使用地所适用的法律法规及标准,并根据《人工智能生成合成内容标识办法》《网络安全技术人工智能生成合成内容标识方法(GB45438-2025)》的要求和应用场景,对人工智能生成合成内容进行标识,包括但不限于显式标识、隐式标识(元数据标识和数字水印)等。 54 | 55 | 1. 
重要操作确认机制 56 | 57 | 开发者必须在其基于 AutoGLM 开发的应用或服务中,为涉及以下6+1项高风险操作设计并实现明确的、强制性的用户确认步骤: 58 | - 信息交互与内容传播:包括但不限于发送消息、邮件、发表评论、点赞、分享等。 59 | - 文件处置与权限管理:包括但不限于创建、编辑、删除、移动文件或文件夹、开启或关闭任意权限等。 60 | - 交易订单与权益处置:包括但不限于清空购物车、提交订单、修改/添加收货地址、使用优惠券/积分等。 61 | - 资金流转与支付结算:包括但不限于转账、支付、收款、充值、提现、绑定/解绑支付方式等。 62 | - 账户身份与安全配置:包括但不限于修改密码、设置/修改安全选项、删除账号或关联账号、删除好友/联系人、删除对话/记录等。 63 | - 医疗健康与法律合规:包括但不限于诊疗记录/健康数据的访问、授权或处置、药品采购、生理或心理测试、电子协议的签署等。 64 | - 其他高风险操作:其他任何可能对用户数据安全、财产安全、账号安全或声誉造成重大影响的操作。 65 | 66 | 要求: 67 | - 确认步骤必须在操作执行前触发,清晰展示即将执行的操作详情。 68 | - 提供便捷的取消/终止机制,允许用户在确认前或操作过程中随时中止任务。 69 | - 开发者责任: 未能实现有效确认机制导致用户损失的,开发者需承担相应责任。用户责任: 用户在确认后未及时终止错误操作导致的损失,由用户自行承担。 70 | 71 | 2. 开发者与用户的义务 72 | 73 | 开发者义务: 74 | - 透明告知: 清晰、准确地向最终用户说明其应用/服务的功能、工作原理(特别是自动化部分)、数据收集和处理方式(包括是否涉及云端)、潜在风险以及用户如何控制。 75 | - 提供监控与控制: 设计用户界面,允许最终用户: 76 | - 实时查看或了解自动化操作的当前状态和步骤。 77 | - 方便、快速地暂停、终止任何正在进行的自动化任务。 78 | - 管理自动化操作的权限和设置。 79 | - 安全开发: 遵循安全编码实践,确保应用/服务本身的安全性,防止被恶意利用。 80 | - 合规性: 确保其开发的应用/服务符合所有适用的法律法规、行业标准和第三方平台(如被操作的应用)的服务条款。 81 | - 风险提示: 在适当位置(如功能入口、首次使用时、确认步骤中)向用户明确提示使用自动化功能可能存在的风险(如误操作、隐私风险、第三方平台政策风险)。 82 | - 避免关键依赖: 谨慎评估,不建议将 AutoGLM 用于处理极端关键、高风险或一旦出错后果极其严重的操作(如医疗设备控制、关键基础设施操作、大额金融交易无人工复核)。 83 | 84 | 用户义务: 85 | - 理解风险: 在使用基于 AutoGLM 的自动化功能前,仔细阅读开发者提供的说明、隐私政策和风险提示,充分理解其工作原理和潜在风险。 86 | - 谨慎授权: 仅在完全信任应用/服务开发者并理解授权内容后,才授予必要的权限。 87 | - 主动监控: 在自动化任务执行期间,保持适当的关注,特别是在执行重要操作时。利用开发者提供的监控功能了解操作进展。 88 | - 及时干预: 如发现操作错误、异常或不符合预期,应立即使用提供的终止功能停止任务。 89 | - 承担责任: 对其发出的指令、确认的操作以及因未能及时监控和制止错误操作而导致的任何损失,自行承担责任。 90 | 91 | 3. 
开发者与用户行为规范 92 | 93 | 严禁利用 AutoGLM 开源项目或基于其开发的应用/服务从事以下行为: 94 | (1)批量自动化与恶意竞争行为 95 | - 进行任何形式的虚假数据操作:刷单、刷票、刷赞、刷评论、刷流量、刷粉丝、刷播放量、刷下载量等。 96 | - 批量操控账号:批量注册、批量登录、批量操作第三方平台账号(群控、多开、云控)。 97 | - 扰乱市场秩序:恶意抢购、囤积居奇、抢占限量资源、批量领取/滥用优惠券/补贴、恶意占用服务资源(薅羊毛)。 98 | - 操纵平台规则:刷榜、刷排名、操纵搜索结果、人为干预推荐算法、虚假提升/降低内容曝光度。 99 | - 制造虚假活跃度:批量发布、转发、点赞、收藏、关注、取关等社交媒体操作。 100 | - 破坏游戏公平:游戏代练、工作室操作、批量刷装备/金币/经验/道具。 101 | - 破坏公正性:批量投票、刷票、操纵网络评选、调查结果。 102 | (2)虚假信息与欺诈行为 103 | - 制造误导信息:发布/传播虚假商品/服务评价、虚假用户反馈、虚假证言、虚假体验。 104 | - 伪造商业数据:制造虚假交易记录、虚假销量、虚假用户活跃度、虚假好评率。 105 | - 身份欺诈:冒充他人身份、虚构个人信息、盗用他人账号/头像/昵称、伪造身份证明。 106 | - 虚假营销:发布虚假广告、进行虚假宣传、夸大产品功效、隐瞒产品缺陷/风险。 107 | - 参与诈骗活动:网络诈骗、虚假投资、传销、非法集资、虚假中奖、钓鱼等。 108 | - 传播不实信息:制造或恶意传播虚假新闻、谣言、未经证实的信息。 109 | (3)破坏第三方服务与系统安全 110 | - 非授权访问:利用 AutoGLM 进行数据爬取(违反 robots.txt 或平台政策)、信息窃取、API 接口滥用、服务器渗透测试(未授权)。 111 | - 技术破坏:对第三方应用进行逆向工程、破解、修改、注入恶意代码、干扰其正常运行。 112 | - 资源滥用:恶意占用第三方服务器资源、发送垃圾请求、制造异常流量、进行 DDoS 攻击。 113 | - 违反平台规则:故意违反被操作第三方应用的用户协议、服务条款、社区规则。 114 | - 恶意竞争:恶意差评、恶意举报、恶意投诉、商业诋毁。 115 | - 传播有害内容:传播计算机病毒、木马、恶意软件、勒索软件、垃圾邮件、非法内容。 116 | - 侵犯数据权益:未经授权进行大规模商业数据采集、用户信息收集、隐私窥探。 117 | (4)侵犯他人合法权益 118 | - 账号盗用:盗用他人账号、密码、身份凭证进行操作。 119 | - 网络骚扰与霸凌:恶意骚扰、威胁、辱骂、诽谤、人肉搜索他人。 120 | - 侵犯隐私与秘密:未经授权收集、使用、传播他人个人信息、隐私数据、商业秘密。 121 | - 恶意抢注:抢注他人商标、域名、用户名、社交媒体账号等。 122 | - 骚扰行为:恶意刷屏、垃圾信息轰炸、强制关注/订阅。 123 | - 损害商业利益:商业间谍活动、不正当竞争、恶意挖角、窃取商业机密。 124 | (5)滥用资源与破坏项目生态 125 | - 滥用注册资源:恶意注册大量账号、虚假注册。 126 | - 浪费计算/设备资源:恶意占用本地设备或云设备资源、长时间闲置占用、运行与自动化任务无关的高耗能程序(如挖矿)。 127 | - 破坏稳定性:恶意测试系统性能、进行压力测试(未授权)、频繁重启服务、利用技术漏洞/缺陷牟利或损害项目/平台利益。 128 | - 违反开源协议:违反 AutoGLM 项目的开源许可证条款。 129 | 130 | 违反后果: 131 | 132 | 如开发者/用户在使用中未遵循相应的法律法规、政策、行业标准(包括但不限于技术规范、安全标准)及开源项目的约定(包括但不限于开源协议、使用须知),由此产生的全部法律责任、经济损失及一切不良后果,均由开发者 / 用户自行独立承担。 -------------------------------------------------------------------------------- /phone_agent/device_factory.py: -------------------------------------------------------------------------------- 1 | """Device factory for selecting ADB or HDC based on device type.""" 2 | 3 | from enum import Enum 4 | from typing import Any 
5 | 6 | 7 | class DeviceType(Enum): 8 | """Type of device connection tool.""" 9 | 10 | ADB = "adb" 11 | HDC = "hdc" 12 | 13 | 14 | class DeviceFactory: 15 | """ 16 | Factory class for getting device-specific implementations. 17 | 18 | This allows the system to work with both Android (ADB) and HarmonyOS (HDC) devices. 19 | """ 20 | 21 | def __init__(self, device_type: DeviceType = DeviceType.ADB): 22 | """ 23 | Initialize the device factory. 24 | 25 | Args: 26 | device_type: The type of device to use (ADB or HDC). 27 | """ 28 | self.device_type = device_type 29 | self._module = None 30 | 31 | @property 32 | def module(self): 33 | """Get the appropriate device module (adb or hdc).""" 34 | if self._module is None: 35 | if self.device_type == DeviceType.ADB: 36 | from phone_agent import adb 37 | self._module = adb 38 | elif self.device_type == DeviceType.HDC: 39 | from phone_agent import hdc 40 | self._module = hdc 41 | else: 42 | raise ValueError(f"Unknown device type: {self.device_type}") 43 | return self._module 44 | 45 | def get_screenshot(self, device_id: str | None = None, timeout: int = 10): 46 | """Get screenshot from device.""" 47 | return self.module.get_screenshot(device_id, timeout) 48 | 49 | def get_current_app(self, device_id: str | None = None) -> str: 50 | """Get current app name.""" 51 | return self.module.get_current_app(device_id) 52 | 53 | def tap(self, x: int, y: int, device_id: str | None = None, delay: float | None = None): 54 | """Tap at coordinates.""" 55 | return self.module.tap(x, y, device_id, delay) 56 | 57 | def double_tap(self, x: int, y: int, device_id: str | None = None, delay: float | None = None): 58 | """Double tap at coordinates.""" 59 | return self.module.double_tap(x, y, device_id, delay) 60 | 61 | def long_press(self, x: int, y: int, duration_ms: int = 3000, device_id: str | None = None, delay: float | None = None): 62 | """Long press at coordinates.""" 63 | return self.module.long_press(x, y, duration_ms, device_id, delay) 64 | 
65 | def swipe(self, start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int | None = None, device_id: str | None = None, delay: float | None = None): 66 | """Swipe from start to end.""" 67 | return self.module.swipe(start_x, start_y, end_x, end_y, duration_ms, device_id, delay) 68 | 69 | def back(self, device_id: str | None = None, delay: float | None = None): 70 | """Press back button.""" 71 | return self.module.back(device_id, delay) 72 | 73 | def home(self, device_id: str | None = None, delay: float | None = None): 74 | """Press home button.""" 75 | return self.module.home(device_id, delay) 76 | 77 | def launch_app(self, app_name: str, device_id: str | None = None, delay: float | None = None) -> bool: 78 | """Launch an app.""" 79 | return self.module.launch_app(app_name, device_id, delay) 80 | 81 | def type_text(self, text: str, device_id: str | None = None): 82 | """Type text.""" 83 | return self.module.type_text(text, device_id) 84 | 85 | def clear_text(self, device_id: str | None = None): 86 | """Clear text.""" 87 | return self.module.clear_text(device_id) 88 | 89 | def detect_and_set_adb_keyboard(self, device_id: str | None = None) -> str: 90 | """Detect and set keyboard.""" 91 | return self.module.detect_and_set_adb_keyboard(device_id) 92 | 93 | def restore_keyboard(self, ime: str, device_id: str | None = None): 94 | """Restore keyboard.""" 95 | return self.module.restore_keyboard(ime, device_id) 96 | 97 | def list_devices(self): 98 | """List connected devices.""" 99 | return self.module.list_devices() 100 | 101 | def get_connection_class(self): 102 | """Get the connection class (ADBConnection or HDCConnection).""" 103 | if self.device_type == DeviceType.ADB: 104 | from phone_agent.adb import ADBConnection 105 | return ADBConnection 106 | elif self.device_type == DeviceType.HDC: 107 | from phone_agent.hdc import HDCConnection 108 | return HDCConnection 109 | else: 110 | raise ValueError(f"Unknown device type: {self.device_type}") 111 | 112 | 
113 | # Global device factory instance 114 | _device_factory: DeviceFactory | None = None 115 | 116 | 117 | def set_device_type(device_type: DeviceType): 118 | """ 119 | Set the global device type. 120 | 121 | Args: 122 | device_type: The device type to use (ADB or HDC). 123 | """ 124 | global _device_factory 125 | _device_factory = DeviceFactory(device_type) 126 | 127 | 128 | def get_device_factory() -> DeviceFactory: 129 | """ 130 | Get the global device factory instance. 131 | 132 | Returns: 133 | The device factory instance. 134 | """ 135 | global _device_factory 136 | if _device_factory is None: 137 | _device_factory = DeviceFactory(DeviceType.ADB) # Default to ADB 138 | return _device_factory 139 | -------------------------------------------------------------------------------- /phone_agent/hdc/input.py: -------------------------------------------------------------------------------- 1 | """Input utilities for HarmonyOS device text input.""" 2 | 3 | import base64 4 | import subprocess 5 | from typing import Optional 6 | 7 | from phone_agent.hdc.connection import _run_hdc_command 8 | 9 | 10 | def type_text(text: str, device_id: str | None = None) -> None: 11 | """ 12 | Type text into the currently focused input field. 13 | 14 | Args: 15 | text: The text to type. Supports multi-line text with newline characters. 16 | device_id: Optional HDC device ID for multi-device setups. 17 | 18 | Note: 19 | HarmonyOS uses: hdc shell uitest uiInput text "文本内容" 20 | This command works without coordinates when input field is focused. 21 | For multi-line text, the function splits by newlines and sends ENTER keyEvents. 22 | ENTER key code in HarmonyOS: 2054 23 | Recommendation: Click on the input field first to focus it, then use this function. 
24 | """ 25 | hdc_prefix = _get_hdc_prefix(device_id) 26 | 27 | # Handle multi-line text by splitting on newlines 28 | if '\n' in text: 29 | lines = text.split('\n') 30 | for i, line in enumerate(lines): 31 | if line: # Only process non-empty lines 32 | # Escape special characters for shell 33 | escaped_line = line.replace('"', '\\"').replace("$", "\\$") 34 | 35 | _run_hdc_command( 36 | hdc_prefix + ["shell", "uitest", "uiInput", "text", escaped_line], 37 | capture_output=True, 38 | text=True, 39 | ) 40 | 41 | # Send ENTER key event after each line except the last one 42 | if i < len(lines) - 1: 43 | try: 44 | _run_hdc_command( 45 | hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"], 46 | capture_output=True, 47 | text=True, 48 | ) 49 | except Exception as e: 50 | print(f"[HDC] ENTER keyEvent failed: {e}") 51 | else: 52 | # Single line text - original logic 53 | # Escape special characters for shell (keep quotes for proper text handling) 54 | # The text will be wrapped in quotes in the command 55 | escaped_text = text.replace('"', '\\"').replace("$", "\\$") 56 | 57 | # HarmonyOS uitest uiInput text command 58 | # Format: hdc shell uitest uiInput text "文本内容" 59 | _run_hdc_command( 60 | hdc_prefix + ["shell", "uitest", "uiInput", "text", escaped_text], 61 | capture_output=True, 62 | text=True, 63 | ) 64 | 65 | 66 | def clear_text(device_id: str | None = None) -> None: 67 | """ 68 | Clear text in the currently focused input field. 69 | 70 | Args: 71 | device_id: Optional HDC device ID for multi-device setups. 72 | 73 | Note: 74 | This method uses repeated delete key events to clear text. 75 | For HarmonyOS, you might also use select all + delete for better efficiency. 
76 | """ 77 | hdc_prefix = _get_hdc_prefix(device_id) 78 | # Ctrl+A to select all (key code 2072 for Ctrl, 2017 for A) 79 | # Then delete 80 | _run_hdc_command( 81 | hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2072", "2017"], 82 | capture_output=True, 83 | text=True, 84 | ) 85 | _run_hdc_command( 86 | hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2055"], # Delete key 87 | capture_output=True, 88 | text=True, 89 | ) 90 | 91 | 92 | def detect_and_set_adb_keyboard(device_id: str | None = None) -> str: 93 | """ 94 | Detect current keyboard and switch to ADB Keyboard if available. 95 | 96 | Args: 97 | device_id: Optional HDC device ID for multi-device setups. 98 | 99 | Returns: 100 | The original keyboard IME identifier for later restoration. 101 | 102 | Note: 103 | This is a placeholder. HarmonyOS may not support ADB Keyboard. 104 | If there's a similar tool for HarmonyOS, integrate it here. 105 | """ 106 | hdc_prefix = _get_hdc_prefix(device_id) 107 | 108 | # Get current IME (if HarmonyOS supports this) 109 | try: 110 | result = _run_hdc_command( 111 | hdc_prefix + ["shell", "settings", "get", "secure", "default_input_method"], 112 | capture_output=True, 113 | text=True, 114 | ) 115 | current_ime = (result.stdout + result.stderr).strip() 116 | 117 | # If ADB Keyboard equivalent exists for HarmonyOS, switch to it 118 | # For now, we'll just return the current IME 119 | return current_ime 120 | except Exception: 121 | return "" 122 | 123 | 124 | def restore_keyboard(ime: str, device_id: str | None = None) -> None: 125 | """ 126 | Restore the original keyboard IME. 127 | 128 | Args: 129 | ime: The IME identifier to restore. 130 | device_id: Optional HDC device ID for multi-device setups. 
def example_with_callbacks(lang: str = "cn"):
    """Task example with callbacks.

    Builds a PhoneAgent with a confirmation callback and a manual-takeover
    callback, runs one shopping task and prints the result.

    Args:
        lang: Language key passed to ``get_messages`` and ``AgentConfig``.
    """
    msgs = get_messages(lang)

    def my_confirmation(message: str) -> bool:
        """Sensitive operation confirmation callback.

        Returns True only when the user answers "yes", "y" or "是"
        (case-insensitive, surrounding whitespace ignored).
        """
        print(f"\n[{msgs['confirmation_required']}] {message}")
        response = input(f"{msgs['continue_prompt']}: ")
        # Strip first so that answers such as "yes " or " y" are accepted.
        return response.strip().lower() in ("yes", "y", "是")

    def my_takeover(message: str) -> None:
        """Manual takeover callback: block until the user presses Enter."""
        print(f"\n[{msgs['manual_operation_required']}] {message}")
        print(msgs["manual_operation_hint"])
        input(f"{msgs['press_enter_when_done']}: ")

    # Create Agent with custom callbacks
    agent_config = AgentConfig(lang=lang)
    agent = PhoneAgent(
        agent_config=agent_config,
        confirmation_callback=my_confirmation,
        takeover_callback=my_takeover,
    )

    # Execute task that may require confirmation
    result = agent.run("打开淘宝搜索无线耳机并加入购物车")
    print(f"{msgs['task_result']}: {result}")
def example_remote_device(lang: str = "cn"):
    """Remote device example.

    Connects to a device over the network with ``ADBConnection``, runs one
    task against it and disconnects again. The disconnect happens in a
    ``finally`` clause so the connection is released even when the task
    raises.

    Args:
        lang: Language key passed to ``get_messages`` and ``AgentConfig``.
    """
    from phone_agent.adb import ADBConnection

    msgs = get_messages(lang)

    # Single source of truth for the address used by connect, the agent
    # configuration and disconnect.
    device_address = "192.168.1.100:5555"

    # Create connection manager
    conn = ADBConnection()

    # Connect to remote device
    success, message = conn.connect(device_address)
    if not success:
        print(f"{msgs['connection_failed']}: {message}")
        return

    print(f"{msgs['connection_successful']}: {message}")

    try:
        # Create Agent with device specified
        agent_config = AgentConfig(
            device_id=device_address,
            verbose=True,
            lang=lang,
        )
        agent = PhoneAgent(agent_config=agent_config)

        # Execute task
        result = agent.run("打开微信查看消息")
        print(f"{msgs['task_result']}: {result}")
    finally:
        # Disconnect even if agent construction or the task failed
        conn.disconnect(device_address)
@dataclass
class ActionTimingConfig:
    """Configuration for action handler timing delays.

    All fields are delays in seconds. After the regular dataclass
    initialisation, ``__post_init__`` replaces each field with the value of
    its ``PHONE_AGENT_*`` environment variable when that variable is set.
    Note that an environment value therefore takes precedence over a value
    passed to the constructor.

    Raises:
        ValueError: If one of the environment variables is set to a value
            that cannot be parsed as a float. The message names the
            offending variable.
    """

    # Text input related delays (in seconds)
    keyboard_switch_delay: float = 1.0  # Delay after switching to ADB keyboard
    text_clear_delay: float = 1.0  # Delay after clearing text
    text_input_delay: float = 1.0  # Delay after typing text
    keyboard_restore_delay: float = 1.0  # Delay after restoring original keyboard

    @staticmethod
    def _env_float(name: str, default: float) -> float:
        """Return environment variable *name* as a float, or *default* if unset."""
        raw = os.getenv(name)
        if raw is None:
            return float(default)
        try:
            return float(raw)
        except ValueError as err:
            # A bare float() error does not say which variable was wrong.
            raise ValueError(
                f"Invalid value for environment variable {name}: {raw!r} "
                "(expected a number of seconds)"
            ) from err

    def __post_init__(self):
        """Load values from environment variables if present."""
        overrides = (
            ("keyboard_switch_delay", "PHONE_AGENT_KEYBOARD_SWITCH_DELAY"),
            ("text_clear_delay", "PHONE_AGENT_TEXT_CLEAR_DELAY"),
            ("text_input_delay", "PHONE_AGENT_TEXT_INPUT_DELAY"),
            ("keyboard_restore_delay", "PHONE_AGENT_KEYBOARD_RESTORE_DELAY"),
        )
        for attr, env_name in overrides:
            setattr(self, attr, self._env_float(env_name, getattr(self, attr)))
@dataclass
class ConnectionTimingConfig:
    """Configuration for ADB connection timing delays.

    All fields are delays in seconds. After the regular dataclass
    initialisation, ``__post_init__`` replaces each field with the value of
    its ``PHONE_AGENT_*`` environment variable when that variable is set.
    Note that an environment value therefore takes precedence over a value
    passed to the constructor.

    Raises:
        ValueError: If one of the environment variables is set to a value
            that cannot be parsed as a float. The message names the
            offending variable.
    """

    # ADB server and connection delays (in seconds)
    adb_restart_delay: float = 2.0  # Wait time after enabling TCP/IP mode
    # Wait time between killing and starting ADB server
    server_restart_delay: float = 1.0

    @staticmethod
    def _env_float(name: str, default: float) -> float:
        """Return environment variable *name* as a float, or *default* if unset."""
        raw = os.getenv(name)
        if raw is None:
            return float(default)
        try:
            return float(raw)
        except ValueError as err:
            # A bare float() error does not say which variable was wrong.
            raise ValueError(
                f"Invalid value for environment variable {name}: {raw!r} "
                "(expected a number of seconds)"
            ) from err

    def __post_init__(self):
        """Load values from environment variables if present."""
        self.adb_restart_delay = self._env_float(
            "PHONE_AGENT_ADB_RESTART_DELAY", self.adb_restart_delay
        )
        self.server_restart_delay = self._env_float(
            "PHONE_AGENT_SERVER_RESTART_DELAY", self.server_restart_delay
        )
def get_current_app(device_id: str | None = None) -> str:
    """
    Report which known app currently has window focus.

    Runs ``adb shell dumpsys window`` and scans the lines that mention
    ``mCurrentFocus`` or ``mFocusedApp`` for any package listed in
    ``APP_PACKAGES``.

    Args:
        device_id: Optional ADB device ID for multi-device setups.

    Returns:
        The name of the first matching app, otherwise "System Home".

    Raises:
        ValueError: If the dumpsys command produced no standard output.
    """
    command = [*_get_adb_prefix(device_id), "shell", "dumpsys", "window"]
    completed = subprocess.run(
        command, capture_output=True, text=True, encoding="utf-8"
    )

    dump = completed.stdout
    if not dump:
        raise ValueError("No output from dumpsys window")

    # Only the focus-related lines are candidates for a package match.
    focus_rows = (
        row
        for row in dump.split("\n")
        if "mCurrentFocus" in row or "mFocusedApp" in row
    )
    for row in focus_rows:
        for app_name, package in APP_PACKAGES.items():
            if package in row:
                return app_name

    return "System Home"
def double_tap(
    x: int, y: int, device_id: str | None = None, delay: float | None = None
) -> None:
    """
    Issue two ``input tap`` commands at the same point, then wait.

    The two taps are separated by ``TIMING_CONFIG.device.double_tap_interval``
    seconds.

    Args:
        x: X coordinate.
        y: Y coordinate.
        device_id: Optional ADB device ID.
        delay: Seconds to sleep after the second tap. If None, uses
            ``TIMING_CONFIG.device.default_double_tap_delay``.
    """
    pause = TIMING_CONFIG.device.default_double_tap_delay if delay is None else delay

    # The same command line is sent twice.
    tap_command = _get_adb_prefix(device_id) + ["shell", "input", "tap", str(x), str(y)]

    subprocess.run(tap_command, capture_output=True)
    time.sleep(TIMING_CONFIG.device.double_tap_interval)
    subprocess.run(tap_command, capture_output=True)
    time.sleep(pause)
def swipe(
    start_x: int,
    start_y: int,
    end_x: int,
    end_y: int,
    duration_ms: int | None = None,
    device_id: str | None = None,
    delay: float | None = None,
) -> None:
    """
    Send an ``input swipe`` gesture between two points, then wait.

    Args:
        start_x: Starting X coordinate.
        start_y: Starting Y coordinate.
        end_x: Ending X coordinate.
        end_y: Ending Y coordinate.
        duration_ms: Duration of swipe in milliseconds. If None, it is
            derived as the squared start-to-end distance divided by 1000,
            clamped to the range 1000-2000 ms.
        device_id: Optional ADB device ID.
        delay: Seconds to sleep after the swipe. If None, uses
            ``TIMING_CONFIG.device.default_swipe_delay``.
    """
    pause = TIMING_CONFIG.device.default_swipe_delay if delay is None else delay
    command = _get_adb_prefix(device_id) + ["shell", "input", "swipe"]

    if duration_ms is None:
        squared_distance = (start_x - end_x) ** 2 + (start_y - end_y) ** 2
        # Clamp between 1000-2000ms
        duration_ms = min(max(int(squared_distance / 1000), 1000), 2000)

    command += [str(value) for value in (start_x, start_y, end_x, end_y, duration_ms)]
    subprocess.run(command, capture_output=True)
    time.sleep(pause)
def home(device_id: str | None = None, delay: float | None = None) -> None:
    """
    Send the ``KEYCODE_HOME`` key event, then wait.

    Args:
        device_id: Optional ADB device ID.
        delay: Seconds to sleep after the key event. If None, uses
            ``TIMING_CONFIG.device.default_home_delay``.
    """
    pause = TIMING_CONFIG.device.default_home_delay if delay is None else delay

    command = [*_get_adb_prefix(device_id), "shell", "input", "keyevent", "KEYCODE_HOME"]
    subprocess.run(command, capture_output=True)
    time.sleep(pause)
221 | """ 222 | if delay is None: 223 | delay = TIMING_CONFIG.device.default_launch_delay 224 | 225 | if app_name not in APP_PACKAGES: 226 | return False 227 | 228 | adb_prefix = _get_adb_prefix(device_id) 229 | package = APP_PACKAGES[app_name] 230 | 231 | subprocess.run( 232 | adb_prefix 233 | + [ 234 | "shell", 235 | "monkey", 236 | "-p", 237 | package, 238 | "-c", 239 | "android.intent.category.LAUNCHER", 240 | "1", 241 | ], 242 | capture_output=True, 243 | ) 244 | time.sleep(delay) 245 | return True 246 | 247 | 248 | def _get_adb_prefix(device_id: str | None) -> list: 249 | """Get ADB command prefix with optional device specifier.""" 250 | if device_id: 251 | return ["adb", "-s", device_id] 252 | return ["adb"] 253 | -------------------------------------------------------------------------------- /phone_agent/hdc/device.py: -------------------------------------------------------------------------------- 1 | """Device control utilities for HarmonyOS automation.""" 2 | 3 | import os 4 | import subprocess 5 | import time 6 | from typing import List, Optional, Tuple 7 | 8 | from phone_agent.config.apps_harmonyos import APP_ABILITIES, APP_PACKAGES 9 | from phone_agent.config.timing import TIMING_CONFIG 10 | from phone_agent.hdc.connection import _run_hdc_command 11 | 12 | 13 | def get_current_app(device_id: str | None = None) -> str: 14 | """ 15 | Get the currently focused app name. 16 | 17 | Args: 18 | device_id: Optional HDC device ID for multi-device setups. 19 | 20 | Returns: 21 | The app name if recognized, otherwise "System Home". 
def tap(
    x: int, y: int, device_id: str | None = None, delay: float | None = None
) -> None:
    """
    Send a ``uitest uiInput click`` at the given point, then wait.

    Args:
        x: X coordinate.
        y: Y coordinate.
        device_id: Optional HDC device ID.
        delay: Seconds to sleep after the click. If None, uses
            ``TIMING_CONFIG.device.default_tap_delay``.
    """
    pause = TIMING_CONFIG.device.default_tap_delay if delay is None else delay

    # HarmonyOS uses uitest uiInput click
    command = [*_get_hdc_prefix(device_id), "shell", "uitest", "uiInput", "click", str(x), str(y)]
    _run_hdc_command(command, capture_output=True)
    time.sleep(pause)
def long_press(
    x: int,
    y: int,
    duration_ms: int = 3000,
    device_id: str | None = None,
    delay: float | None = None,
) -> None:
    """
    Send a ``uitest uiInput longClick`` at the given point, then wait.

    Args:
        x: X coordinate.
        y: Y coordinate.
        duration_ms: Accepted for signature parity with the ADB variant but
            not forwarded: the command built here carries only the
            coordinates.
        device_id: Optional HDC device ID.
        delay: Seconds to sleep after the long press. If None, uses
            ``TIMING_CONFIG.device.default_long_press_delay``.
    """
    pause = TIMING_CONFIG.device.default_long_press_delay if delay is None else delay

    # HarmonyOS uses uitest uiInput longClick; no duration argument is passed.
    command = [*_get_hdc_prefix(device_id), "shell", "uitest", "uiInput", "longClick", str(x), str(y)]
    _run_hdc_command(command, capture_output=True)
    time.sleep(pause)
def back(device_id: str | None = None, delay: float | None = None) -> None:
    """
    Send the ``Back`` key event through ``uitest uiInput``, then wait.

    Args:
        device_id: Optional HDC device ID.
        delay: Seconds to sleep after the key event. If None, uses
            ``TIMING_CONFIG.device.default_back_delay``.
    """
    pause = TIMING_CONFIG.device.default_back_delay if delay is None else delay

    # HarmonyOS uses uitest uiInput keyEvent Back
    command = [*_get_hdc_prefix(device_id), "shell", "uitest", "uiInput", "keyEvent", "Back"]
    _run_hdc_command(command, capture_output=True)
    time.sleep(pause)
def launch_app(
    app_name: str, device_id: str | None = None, delay: float | None = None
) -> bool:
    """
    Start an app by its display name using ``aa start``.

    The bundle is looked up in ``APP_PACKAGES``; the ability comes from
    ``APP_ABILITIES`` and falls back to "EntryAbility" when the bundle has
    no entry there.

    Args:
        app_name: The app name (must be in APP_PACKAGES).
        device_id: Optional HDC device ID.
        delay: Seconds to sleep after issuing the command. If None, uses
            ``TIMING_CONFIG.device.default_launch_delay``.

    Returns:
        True once the start command has been issued, False (after printing
        a hint with up to ten known app names) if the app is unknown.
    """
    pause = TIMING_CONFIG.device.default_launch_delay if delay is None else delay

    bundle = APP_PACKAGES.get(app_name) if app_name in APP_PACKAGES else None
    if app_name not in APP_PACKAGES:
        known_apps = sorted(APP_PACKAGES.keys())[:10]
        print(f"[HDC] App '{app_name}' not found in HarmonyOS app list")
        print(f"[HDC] Available apps: {', '.join(known_apps)}...")
        return False

    # Format: aa start -b {bundle} -a {ability}
    ability = APP_ABILITIES.get(bundle, "EntryAbility")
    command = [
        *_get_hdc_prefix(device_id),
        "shell",
        "aa",
        "start",
        "-b",
        bundle,
        "-a",
        ability,
    ]
    _run_hdc_command(command, capture_output=True)
    time.sleep(pause)
    return True
-------------------------------------------------------------------------------- /phone_agent/agent.py: -------------------------------------------------------------------------------- 1 | """Main PhoneAgent class for orchestrating phone automation.""" 2 | 3 | import json 4 | import traceback 5 | from dataclasses import dataclass 6 | from typing import Any, Callable 7 | 8 | from phone_agent.actions import ActionHandler 9 | from phone_agent.actions.handler import do, finish, parse_action 10 | from phone_agent.config import get_messages, get_system_prompt 11 | from phone_agent.device_factory import get_device_factory 12 | from phone_agent.model import ModelClient, ModelConfig 13 | from phone_agent.model.client import MessageBuilder 14 | 15 | 16 | @dataclass 17 | class AgentConfig: 18 | """Configuration for the PhoneAgent.""" 19 | 20 | max_steps: int = 100 21 | device_id: str | None = None 22 | lang: str = "cn" 23 | system_prompt: str | None = None 24 | verbose: bool = True 25 | 26 | def __post_init__(self): 27 | if self.system_prompt is None: 28 | self.system_prompt = get_system_prompt(self.lang) 29 | 30 | 31 | @dataclass 32 | class StepResult: 33 | """Result of a single agent step.""" 34 | 35 | success: bool 36 | finished: bool 37 | action: dict[str, Any] | None 38 | thinking: str 39 | message: str | None = None 40 | 41 | 42 | class PhoneAgent: 43 | """ 44 | AI-powered agent for automating Android phone interactions. 45 | 46 | The agent uses a vision-language model to understand screen content 47 | and decide on actions to complete user tasks. 48 | 49 | Args: 50 | model_config: Configuration for the AI model. 51 | agent_config: Configuration for the agent behavior. 52 | confirmation_callback: Optional callback for sensitive action confirmation. 53 | takeover_callback: Optional callback for takeover requests. 
54 | 55 | Example: 56 | >>> from phone_agent import PhoneAgent 57 | >>> from phone_agent.model import ModelConfig 58 | >>> 59 | >>> model_config = ModelConfig(base_url="http://localhost:8000/v1") 60 | >>> agent = PhoneAgent(model_config) 61 | >>> agent.run("Open WeChat and send a message to John") 62 | """ 63 | 64 | def __init__( 65 | self, 66 | model_config: ModelConfig | None = None, 67 | agent_config: AgentConfig | None = None, 68 | confirmation_callback: Callable[[str], bool] | None = None, 69 | takeover_callback: Callable[[str], None] | None = None, 70 | ): 71 | self.model_config = model_config or ModelConfig() 72 | self.agent_config = agent_config or AgentConfig() 73 | 74 | self.model_client = ModelClient(self.model_config) 75 | self.action_handler = ActionHandler( 76 | device_id=self.agent_config.device_id, 77 | confirmation_callback=confirmation_callback, 78 | takeover_callback=takeover_callback, 79 | ) 80 | 81 | self._context: list[dict[str, Any]] = [] 82 | self._step_count = 0 83 | 84 | def run(self, task: str) -> str: 85 | """ 86 | Run the agent to complete a task. 87 | 88 | Args: 89 | task: Natural language description of the task. 90 | 91 | Returns: 92 | Final message from the agent. 93 | """ 94 | self._context = [] 95 | self._step_count = 0 96 | 97 | # First step with user prompt 98 | result = self._execute_step(task, is_first=True) 99 | 100 | if result.finished: 101 | return result.message or "Task completed" 102 | 103 | # Continue until finished or max steps reached 104 | while self._step_count < self.agent_config.max_steps: 105 | result = self._execute_step(is_first=False) 106 | 107 | if result.finished: 108 | return result.message or "Task completed" 109 | 110 | return "Max steps reached" 111 | 112 | def step(self, task: str | None = None) -> StepResult: 113 | """ 114 | Execute a single step of the agent. 115 | 116 | Useful for manual control or debugging. 117 | 118 | Args: 119 | task: Task description (only needed for first step). 
120 | 121 | Returns: 122 | StepResult with step details. 123 | """ 124 | is_first = len(self._context) == 0 125 | 126 | if is_first and not task: 127 | raise ValueError("Task is required for the first step") 128 | 129 | return self._execute_step(task, is_first) 130 | 131 | def reset(self) -> None: 132 | """Reset the agent state for a new task.""" 133 | self._context = [] 134 | self._step_count = 0 135 | 136 | def _execute_step( 137 | self, user_prompt: str | None = None, is_first: bool = False 138 | ) -> StepResult: 139 | """Execute a single step of the agent loop.""" 140 | self._step_count += 1 141 | 142 | # Capture current screen state 143 | device_factory = get_device_factory() 144 | screenshot = device_factory.get_screenshot(self.agent_config.device_id) 145 | current_app = device_factory.get_current_app(self.agent_config.device_id) 146 | 147 | # Build messages 148 | if is_first: 149 | self._context.append( 150 | MessageBuilder.create_system_message(self.agent_config.system_prompt) 151 | ) 152 | 153 | screen_info = MessageBuilder.build_screen_info(current_app) 154 | text_content = f"{user_prompt}\n\n{screen_info}" 155 | 156 | self._context.append( 157 | MessageBuilder.create_user_message( 158 | text=text_content, image_base64=screenshot.base64_data 159 | ) 160 | ) 161 | else: 162 | screen_info = MessageBuilder.build_screen_info(current_app) 163 | text_content = f"** Screen Info **\n\n{screen_info}" 164 | 165 | self._context.append( 166 | MessageBuilder.create_user_message( 167 | text=text_content, image_base64=screenshot.base64_data 168 | ) 169 | ) 170 | 171 | # Get model response 172 | try: 173 | msgs = get_messages(self.agent_config.lang) 174 | print("\n" + "=" * 50) 175 | print(f"💭 {msgs['thinking']}:") 176 | print("-" * 50) 177 | response = self.model_client.request(self._context) 178 | except Exception as e: 179 | if self.agent_config.verbose: 180 | traceback.print_exc() 181 | return StepResult( 182 | success=False, 183 | finished=True, 184 | action=None, 
185 | thinking="", 186 | message=f"Model error: {e}", 187 | ) 188 | 189 | # Parse action from response 190 | try: 191 | action = parse_action(response.action) 192 | except ValueError: 193 | if self.agent_config.verbose: 194 | traceback.print_exc() 195 | action = finish(message=response.action) 196 | 197 | if self.agent_config.verbose: 198 | # Print thinking process 199 | print("-" * 50) 200 | print(f"🎯 {msgs['action']}:") 201 | print(json.dumps(action, ensure_ascii=False, indent=2)) 202 | print("=" * 50 + "\n") 203 | 204 | # Remove image from context to save space 205 | self._context[-1] = MessageBuilder.remove_images_from_message(self._context[-1]) 206 | 207 | # Execute action 208 | try: 209 | result = self.action_handler.execute( 210 | action, screenshot.width, screenshot.height 211 | ) 212 | except Exception as e: 213 | if self.agent_config.verbose: 214 | traceback.print_exc() 215 | result = self.action_handler.execute( 216 | finish(message=str(e)), screenshot.width, screenshot.height 217 | ) 218 | 219 | # Add assistant response to context 220 | self._context.append( 221 | MessageBuilder.create_assistant_message( 222 | f"{response.thinking}{response.action}" 223 | ) 224 | ) 225 | 226 | # Check if finished 227 | finished = action.get("_metadata") == "finish" or result.should_finish 228 | 229 | if finished and self.agent_config.verbose: 230 | msgs = get_messages(self.agent_config.lang) 231 | print("\n" + "🎉 " + "=" * 48) 232 | print( 233 | f"✅ {msgs['task_completed']}: {result.message or action.get('message', msgs['done'])}" 234 | ) 235 | print("=" * 50 + "\n") 236 | 237 | return StepResult( 238 | success=result.success, 239 | finished=finished, 240 | action=action, 241 | thinking=response.thinking, 242 | message=result.message or action.get("message"), 243 | ) 244 | 245 | @property 246 | def context(self) -> list[dict[str, Any]]: 247 | """Get the current conversation context.""" 248 | return self._context.copy() 249 | 250 | @property 251 | def step_count(self) -> 
int: 252 | """Get the current step count.""" 253 | return self._step_count 254 | -------------------------------------------------------------------------------- /phone_agent/config/apps.py: -------------------------------------------------------------------------------- 1 | """App name to package name mapping for supported applications.""" 2 | 3 | APP_PACKAGES: dict[str, str] = { 4 | # Social & Messaging 5 | "微信": "com.tencent.mm", 6 | "QQ": "com.tencent.mobileqq", 7 | "微博": "com.sina.weibo", 8 | # E-commerce 9 | "淘宝": "com.taobao.taobao", 10 | "京东": "com.jingdong.app.mall", 11 | "拼多多": "com.xunmeng.pinduoduo", 12 | "淘宝闪购": "com.taobao.taobao", 13 | "京东秒送": "com.jingdong.app.mall", 14 | # Lifestyle & Social 15 | "小红书": "com.xingin.xhs", 16 | "豆瓣": "com.douban.frodo", 17 | "知乎": "com.zhihu.android", 18 | # Maps & Navigation 19 | "高德地图": "com.autonavi.minimap", 20 | "百度地图": "com.baidu.BaiduMap", 21 | # Food & Services 22 | "美团": "com.sankuai.meituan", 23 | "大众点评": "com.dianping.v1", 24 | "饿了么": "me.ele", 25 | "肯德基": "com.yek.android.kfc.activitys", 26 | # Travel 27 | "携程": "ctrip.android.view", 28 | "铁路12306": "com.MobileTicket", 29 | "12306": "com.MobileTicket", 30 | "去哪儿": "com.Qunar", 31 | "去哪儿旅行": "com.Qunar", 32 | "滴滴出行": "com.sdu.didi.psnger", 33 | # Video & Entertainment 34 | "bilibili": "tv.danmaku.bili", 35 | "抖音": "com.ss.android.ugc.aweme", 36 | "快手": "com.smile.gifmaker", 37 | "腾讯视频": "com.tencent.qqlive", 38 | "爱奇艺": "com.qiyi.video", 39 | "优酷视频": "com.youku.phone", 40 | "芒果TV": "com.hunantv.imgo.activity", 41 | "红果短剧": "com.phoenix.read", 42 | # Music & Audio 43 | "网易云音乐": "com.netease.cloudmusic", 44 | "QQ音乐": "com.tencent.qqmusic", 45 | "汽水音乐": "com.luna.music", 46 | "喜马拉雅": "com.ximalaya.ting.android", 47 | # Reading 48 | "番茄小说": "com.dragon.read", 49 | "番茄免费小说": "com.dragon.read", 50 | "七猫免费小说": "com.kmxs.reader", 51 | # Productivity 52 | "飞书": "com.ss.android.lark", 53 | "QQ邮箱": "com.tencent.androidqqmail", 54 | # AI & Tools 55 | "豆包": 
"com.larus.nova", 56 | # Health & Fitness 57 | "keep": "com.gotokeep.keep", 58 | "美柚": "com.lingan.seeyou", 59 | # News & Information 60 | "腾讯新闻": "com.tencent.news", 61 | "今日头条": "com.ss.android.article.news", 62 | # Real Estate 63 | "贝壳找房": "com.lianjia.beike", 64 | "安居客": "com.anjuke.android.app", 65 | # Finance 66 | "同花顺": "com.hexin.plat.android", 67 | # Games 68 | "星穹铁道": "com.miHoYo.hkrpg", 69 | "崩坏:星穹铁道": "com.miHoYo.hkrpg", 70 | "恋与深空": "com.papegames.lysk.cn", 71 | "AndroidSystemSettings": "com.android.settings", 72 | "Android System Settings": "com.android.settings", 73 | "Android System Settings": "com.android.settings", 74 | "Android-System-Settings": "com.android.settings", 75 | "Settings": "com.android.settings", 76 | "AudioRecorder": "com.android.soundrecorder", 77 | "audiorecorder": "com.android.soundrecorder", 78 | "Bluecoins": "com.rammigsoftware.bluecoins", 79 | "bluecoins": "com.rammigsoftware.bluecoins", 80 | "Broccoli": "com.flauschcode.broccoli", 81 | "broccoli": "com.flauschcode.broccoli", 82 | "Booking.com": "com.booking", 83 | "Booking": "com.booking", 84 | "booking.com": "com.booking", 85 | "booking": "com.booking", 86 | "BOOKING.COM": "com.booking", 87 | "Chrome": "com.android.chrome", 88 | "chrome": "com.android.chrome", 89 | "Google Chrome": "com.android.chrome", 90 | "Clock": "com.android.deskclock", 91 | "clock": "com.android.deskclock", 92 | "Contacts": "com.android.contacts", 93 | "contacts": "com.android.contacts", 94 | "Duolingo": "com.duolingo", 95 | "duolingo": "com.duolingo", 96 | "Expedia": "com.expedia.bookings", 97 | "expedia": "com.expedia.bookings", 98 | "Files": "com.android.fileexplorer", 99 | "files": "com.android.fileexplorer", 100 | "File Manager": "com.android.fileexplorer", 101 | "file manager": "com.android.fileexplorer", 102 | "gmail": "com.google.android.gm", 103 | "Gmail": "com.google.android.gm", 104 | "GoogleMail": "com.google.android.gm", 105 | "Google Mail": "com.google.android.gm", 106 | "GoogleFiles": 
"com.google.android.apps.nbu.files", 107 | "googlefiles": "com.google.android.apps.nbu.files", 108 | "FilesbyGoogle": "com.google.android.apps.nbu.files", 109 | "GoogleCalendar": "com.google.android.calendar", 110 | "Google-Calendar": "com.google.android.calendar", 111 | "Google Calendar": "com.google.android.calendar", 112 | "google-calendar": "com.google.android.calendar", 113 | "google calendar": "com.google.android.calendar", 114 | "GoogleChat": "com.google.android.apps.dynamite", 115 | "Google Chat": "com.google.android.apps.dynamite", 116 | "Google-Chat": "com.google.android.apps.dynamite", 117 | "GoogleClock": "com.google.android.deskclock", 118 | "Google Clock": "com.google.android.deskclock", 119 | "Google-Clock": "com.google.android.deskclock", 120 | "GoogleContacts": "com.google.android.contacts", 121 | "Google-Contacts": "com.google.android.contacts", 122 | "Google Contacts": "com.google.android.contacts", 123 | "google-contacts": "com.google.android.contacts", 124 | "google contacts": "com.google.android.contacts", 125 | "GoogleDocs": "com.google.android.apps.docs.editors.docs", 126 | "Google Docs": "com.google.android.apps.docs.editors.docs", 127 | "googledocs": "com.google.android.apps.docs.editors.docs", 128 | "google docs": "com.google.android.apps.docs.editors.docs", 129 | "Google Drive": "com.google.android.apps.docs", 130 | "Google-Drive": "com.google.android.apps.docs", 131 | "google drive": "com.google.android.apps.docs", 132 | "google-drive": "com.google.android.apps.docs", 133 | "GoogleDrive": "com.google.android.apps.docs", 134 | "Googledrive": "com.google.android.apps.docs", 135 | "googledrive": "com.google.android.apps.docs", 136 | "GoogleFit": "com.google.android.apps.fitness", 137 | "googlefit": "com.google.android.apps.fitness", 138 | "GoogleKeep": "com.google.android.keep", 139 | "googlekeep": "com.google.android.keep", 140 | "GoogleMaps": "com.google.android.apps.maps", 141 | "Google Maps": "com.google.android.apps.maps", 142 | 
"googlemaps": "com.google.android.apps.maps", 143 | "google maps": "com.google.android.apps.maps", 144 | "Google Play Books": "com.google.android.apps.books", 145 | "Google-Play-Books": "com.google.android.apps.books", 146 | "google play books": "com.google.android.apps.books", 147 | "google-play-books": "com.google.android.apps.books", 148 | "GooglePlayBooks": "com.google.android.apps.books", 149 | "googleplaybooks": "com.google.android.apps.books", 150 | "GooglePlayStore": "com.android.vending", 151 | "Google Play Store": "com.android.vending", 152 | "Google-Play-Store": "com.android.vending", 153 | "GoogleSlides": "com.google.android.apps.docs.editors.slides", 154 | "Google Slides": "com.google.android.apps.docs.editors.slides", 155 | "Google-Slides": "com.google.android.apps.docs.editors.slides", 156 | "GoogleTasks": "com.google.android.apps.tasks", 157 | "Google Tasks": "com.google.android.apps.tasks", 158 | "Google-Tasks": "com.google.android.apps.tasks", 159 | "Joplin": "net.cozic.joplin", 160 | "joplin": "net.cozic.joplin", 161 | "McDonald": "com.mcdonalds.app", 162 | "mcdonald": "com.mcdonalds.app", 163 | "Osmand": "net.osmand", 164 | "osmand": "net.osmand", 165 | "PiMusicPlayer": "com.Project100Pi.themusicplayer", 166 | "pimusicplayer": "com.Project100Pi.themusicplayer", 167 | "Quora": "com.quora.android", 168 | "quora": "com.quora.android", 169 | "Reddit": "com.reddit.frontpage", 170 | "reddit": "com.reddit.frontpage", 171 | "RetroMusic": "code.name.monkey.retromusic", 172 | "retromusic": "code.name.monkey.retromusic", 173 | "SimpleCalendarPro": "com.scientificcalculatorplus.simplecalculator.basiccalculator.mathcalc", 174 | "SimpleSMSMessenger": "com.simplemobiletools.smsmessenger", 175 | "Telegram": "org.telegram.messenger", 176 | "temu": "com.einnovation.temu", 177 | "Temu": "com.einnovation.temu", 178 | "Tiktok": "com.zhiliaoapp.musically", 179 | "tiktok": "com.zhiliaoapp.musically", 180 | "Twitter": "com.twitter.android", 181 | "twitter": 
"com.twitter.android", 182 | "X": "com.twitter.android", 183 | "VLC": "org.videolan.vlc", 184 | "WeChat": "com.tencent.mm", 185 | "wechat": "com.tencent.mm", 186 | "Whatsapp": "com.whatsapp", 187 | "WhatsApp": "com.whatsapp", 188 | } 189 | 190 | 191 | def get_package_name(app_name: str) -> str | None: 192 | """ 193 | Get the package name for an app. 194 | 195 | Args: 196 | app_name: The display name of the app. 197 | 198 | Returns: 199 | The Android package name, or None if not found. 200 | """ 201 | return APP_PACKAGES.get(app_name) 202 | 203 | 204 | def get_app_name(package_name: str) -> str | None: 205 | """ 206 | Get the app name from a package name. 207 | 208 | Args: 209 | package_name: The Android package name. 210 | 211 | Returns: 212 | The display name of the app, or None if not found. 213 | """ 214 | for name, package in APP_PACKAGES.items(): 215 | if package == package_name: 216 | return name 217 | return None 218 | 219 | 220 | def list_supported_apps() -> list[str]: 221 | """ 222 | Get a list of all supported app names. 223 | 224 | Returns: 225 | List of app names. 226 | """ 227 | return list(APP_PACKAGES.keys()) 228 | -------------------------------------------------------------------------------- /README_coding_agent.md: -------------------------------------------------------------------------------- 1 | # Open-AutoGLM Quick Start for Coding Agent 2 | 3 |
4 | 5 |
6 | 7 | > **本文专为 AI 助手(如 Claude Code)阅读,用于自动化部署 Open-AutoGLM。** 8 | > 9 | > **This document is designed for AI assistants (such as Claude Code) to automate the deployment of Open-AutoGLM.** 10 | > 11 | > 如果你是人类读者,可以跳过本文,按照 README.md 文档操作即可。 12 | > 13 | > If you are a human reader, you can skip this document and follow the README.md instructions instead. 14 | 15 | --- 16 | 17 | ## Table of Contents / 目录 18 | 19 | - [English](#english) 20 | - [中文](#中文) 21 | 22 | --- 23 | 24 | # English 25 | 26 | ## Prerequisites 27 | 28 | ### 1. Python Environment 29 | 30 | Python 3.10 or higher is required. 31 | 32 | ### 2. ADB (Android Debug Bridge) 33 | 34 | 1. Download the official ADB [installation package](https://developer.android.com/tools/releases/platform-tools) 35 | 2. Extract and configure environment variables: 36 | 37 | **macOS:** 38 | 39 | ```bash 40 | # Assuming extracted to ~/Downloads/platform-tools 41 | export PATH=${PATH}:~/Downloads/platform-tools 42 | ``` 43 | 44 | **Windows:** Add the extracted folder path to your system PATH. Refer to [this tutorial](https://blog.csdn.net/x2584179909/article/details/108319973) if needed. 45 | 46 | ### 3. Android Device Setup 47 | 48 | Requirements: 49 | - Android 7.0+ device or emulator 50 | - Developer Mode enabled 51 | - USB Debugging enabled 52 | 53 | **Enable Developer Mode:** 54 | 1. Go to `Settings > About Phone > Build Number` 55 | 2. Tap rapidly about 10 times until "Developer mode enabled" appears 56 | 57 | **Enable USB Debugging:** 58 | 1. Go to `Settings > Developer Options > USB Debugging` 59 | 2. Enable the toggle 60 | 3. Some devices may require a restart 61 | 62 | **Important permissions to check:** 63 | 64 | ![Permissions](resources/screenshot-20251210-120416.png) 65 | 66 | ### 4. Install ADB Keyboard 67 | 68 | Download and install [ADB Keyboard APK](https://github.com/senzhk/ADBKeyBoard/blob/master/ADBKeyboard.apk) on your device. 
69 | 70 | After installation, enable it in `Settings > Input Method` or `Settings > Keyboard List`. 71 | 72 | --- 73 | 74 | ## Installation 75 | 76 | ```bash 77 | # Install dependencies 78 | pip install -r requirements.txt 79 | 80 | # Install package 81 | pip install -e . 82 | ``` 83 | 84 | --- 85 | 86 | ## ADB Configuration 87 | 88 | **Ensure your USB cable supports data transfer (not charging only).** 89 | 90 | ### Verify Connection 91 | 92 | ```bash 93 | # Check connected devices 94 | adb devices 95 | 96 | # Expected output: 97 | # List of devices attached 98 | # emulator-5554 device 99 | ``` 100 | 101 | ### Remote Debugging (WiFi) 102 | 103 | Ensure your phone and computer are on the same WiFi network. 104 | 105 | ![Enable Wireless Debugging](resources/screenshot-20251210-120630.png) 106 | 107 | ```bash 108 | # Connect via WiFi (replace with your phone's IP and port) 109 | adb connect 192.168.1.100:5555 110 | 111 | # Verify connection 112 | adb devices 113 | ``` 114 | 115 | ### Device Management 116 | 117 | ```bash 118 | # List all devices 119 | adb devices 120 | 121 | # Connect remote device 122 | adb connect <ip>:<port> 123 | 124 | # Disconnect device 125 | adb disconnect <ip>:<port> 126 | ``` 127 | 128 | --- 129 | 130 | ## Usage 131 | 132 | ### Command Line 133 | 134 | ```bash 135 | # Interactive mode 136 | python main.py --base-url <MODEL_API_URL> --model <MODEL_NAME> 137 | 138 | # Execute specific task 139 | python main.py --base-url <MODEL_API_URL> "Open Chrome browser" 140 | 141 | # Use API key authentication 142 | python main.py --apikey sk-xxxxx 143 | 144 | # English system prompt 145 | python main.py --lang en --base-url <MODEL_API_URL> "Open Chrome browser" 146 | 147 | # List supported apps 148 | python main.py --list-apps 149 | 150 | # Specify device 151 | python main.py --device-id 192.168.1.100:5555 --base-url <MODEL_API_URL> "Open TikTok" 152 | ``` 153 | 154 | ### Python API 155 | 156 | ```python 157 | from phone_agent import PhoneAgent 158 | from phone_agent.model import ModelConfig 159 | 160 | # Configure model 161 | model_config = 
ModelConfig( 162 | base_url="<MODEL_API_URL>", 163 | model_name="<MODEL_NAME>", 164 | ) 165 | 166 | # Create Agent 167 | agent = PhoneAgent(model_config=model_config) 168 | 169 | # Execute task 170 | result = agent.run("Open eBay and search for wireless earbuds") 171 | print(result) 172 | ``` 173 | 174 | --- 175 | 176 | ## Environment Variables 177 | 178 | | Variable | Description | Default | 179 | |---------------------------|---------------------------|------------------------------| 180 | | `PHONE_AGENT_BASE_URL` | Model API URL | `http://localhost:8000/v1` | 181 | | `PHONE_AGENT_MODEL` | Model name | `autoglm-phone-9b` | 182 | | `PHONE_AGENT_API_KEY` | API key | `EMPTY` | 183 | | `PHONE_AGENT_MAX_STEPS` | Max steps per task | `100` | 184 | | `PHONE_AGENT_DEVICE_ID` | ADB device ID | (auto-detect) | 185 | | `PHONE_AGENT_LANG` | Language (`cn`/`en`) | `cn` | 186 | 187 | --- 188 | 189 | ## Troubleshooting 190 | 191 | ### Device Not Found 192 | 193 | ```bash 194 | adb kill-server 195 | adb start-server 196 | adb devices 197 | ``` 198 | 199 | Check: 200 | 1. USB debugging enabled 201 | 2. USB cable supports data transfer 202 | 3. Authorization popup approved on phone 203 | 4. Try different USB port/cable 204 | 205 | ### Can Open Apps but Cannot Tap 206 | 207 | Enable both in `Settings > Developer Options`: 208 | - **USB Debugging** 209 | - **USB Debugging (Security Settings)** 210 | 211 | ### Text Input Not Working 212 | 213 | 1. Ensure ADB Keyboard is installed 214 | 2. Enable in `Settings > System > Language & Input > Virtual Keyboard` 215 | 216 | ### Windows Encoding Issues 217 | 218 | Add environment variable before running: 219 | 220 | ```bash 221 | PYTHONIOENCODING=utf-8 python main.py ... 222 | ``` 223 | 224 | --- 225 | 226 | # 中文 227 | 228 | ## 环境要求 229 | 230 | ### 1. Python 环境 231 | 232 | 需要 Python 3.10 及以上版本。 233 | 234 | ### 2. ADB (Android Debug Bridge) 235 | 236 | 1. 下载官方 ADB [安装包](https://developer.android.com/tools/releases/platform-tools?hl=zh-cn) 237 | 2. 
解压并配置环境变量: 238 | 239 | **macOS:** 240 | 241 | ```bash 242 | # 假设解压到 ~/Downloads/platform-tools 243 | export PATH=${PATH}:~/Downloads/platform-tools 244 | ``` 245 | 246 | **Windows:** 将解压后的文件夹路径添加到系统 PATH。可参考[此教程](https://blog.csdn.net/x2584179909/article/details/108319973)。 247 | 248 | ### 3. 安卓设备配置 249 | 250 | 要求: 251 | - Android 7.0+ 设备或模拟器 252 | - 开发者模式已启用 253 | - USB 调试已启用 254 | 255 | **启用开发者模式:** 256 | 1. 进入 `设置 > 关于手机 > 版本号` 257 | 2. 连续快速点击约 10 次,直到提示"开发者模式已启用" 258 | 259 | **启用 USB 调试:** 260 | 1. 进入 `设置 > 开发者选项 > USB 调试` 261 | 2. 开启开关 262 | 3. 部分设备可能需要重启 263 | 264 | **请务必检查以下权限:** 265 | 266 | ![权限](resources/screenshot-20251209-181423.png) 267 | 268 | ### 4. 安装 ADB Keyboard 269 | 270 | 在设备上下载并安装 [ADB Keyboard APK](https://github.com/senzhk/ADBKeyBoard/blob/master/ADBKeyboard.apk)。 271 | 272 | 安装后,在 `设置 > 输入法` 或 `设置 > 键盘列表` 中启用。 273 | 274 | --- 275 | 276 | ## 安装 277 | 278 | ```bash 279 | # 安装依赖 280 | pip install -r requirements.txt 281 | 282 | # 安装包 283 | pip install -e . 284 | ``` 285 | 286 | --- 287 | 288 | ## ADB 配置 289 | 290 | **请确保 USB 数据线支持数据传输(而非仅充电)。** 291 | 292 | ### 验证连接 293 | 294 | ```bash 295 | # 检查已连接设备 296 | adb devices 297 | 298 | # 预期输出: 299 | # List of devices attached 300 | # emulator-5554 device 301 | ``` 302 | 303 | ### 远程调试(WiFi) 304 | 305 | 确保手机和电脑在同一 WiFi 网络中。 306 | 307 | ![开启无线调试](resources/setting.png) 308 | 309 | ```bash 310 | # 通过 WiFi 连接(替换为手机显示的 IP 和端口) 311 | adb connect 192.168.1.100:5555 312 | 313 | # 验证连接 314 | adb devices 315 | ``` 316 | 317 | ### 设备管理 318 | 319 | ```bash 320 | # 列出所有设备 321 | adb devices 322 | 323 | # 连接远程设备 324 | adb connect <ip>:<port> 325 | 326 | # 断开设备 327 | adb disconnect <ip>:<port> 328 | ``` 329 | 330 | --- 331 | 332 | ## 使用方法 333 | 334 | ### 命令行 335 | 336 | ```bash 337 | # 交互模式 338 | python main.py --base-url <模型API地址> --model <模型名称> 339 | 340 | # 执行指定任务 341 | python main.py --base-url <模型API地址> "打开美团搜索附近的火锅店" 342 | 343 | # 使用 API Key 认证 344 | python main.py --apikey sk-xxxxx 345 | 346 | # 使用英文系统提示词 347 | python main.py 
--lang en --base-url <模型API地址> "Open Chrome browser" 348 | 349 | # 列出支持的应用 350 | python main.py --list-apps 351 | 352 | # 指定设备 353 | python main.py --device-id 192.168.1.100:5555 --base-url <模型API地址> "打开抖音刷视频" 354 | ``` 355 | 356 | ### Python API 357 | 358 | ```python 359 | from phone_agent import PhoneAgent 360 | from phone_agent.model import ModelConfig 361 | 362 | # 配置模型 363 | model_config = ModelConfig( 364 | base_url="<模型API地址>", 365 | model_name="<模型名称>", 366 | ) 367 | 368 | # 创建 Agent 369 | agent = PhoneAgent(model_config=model_config) 370 | 371 | # 执行任务 372 | result = agent.run("打开淘宝搜索无线耳机") 373 | print(result) 374 | ``` 375 | 376 | --- 377 | 378 | ## 环境变量 379 | 380 | | 变量 | 描述 | 默认值 | 381 | |---------------------------|------------------|----------------------------| 382 | | `PHONE_AGENT_BASE_URL` | 模型 API 地址 | `http://localhost:8000/v1` | 383 | | `PHONE_AGENT_MODEL` | 模型名称 | `autoglm-phone-9b` | 384 | | `PHONE_AGENT_API_KEY` | API Key | `EMPTY` | 385 | | `PHONE_AGENT_MAX_STEPS` | 每个任务最大步数 | `100` | 386 | | `PHONE_AGENT_DEVICE_ID` | ADB 设备 ID | (自动检测) | 387 | | `PHONE_AGENT_LANG` | 语言 (`cn`/`en`) | `cn` | 388 | 389 | --- 390 | 391 | ## 常见问题 392 | 393 | ### 设备未找到 394 | 395 | ```bash 396 | adb kill-server 397 | adb start-server 398 | adb devices 399 | ``` 400 | 401 | 检查: 402 | 1. USB 调试是否已开启 403 | 2. 数据线是否支持数据传输 404 | 3. 手机上的授权弹窗是否已点击「允许」 405 | 4. 尝试更换 USB 接口或数据线 406 | 407 | ### 能打开应用但无法点击 408 | 409 | 在 `设置 > 开发者选项` 中同时启用: 410 | - **USB 调试** 411 | - **USB 调试(安全设置)** 412 | 413 | ### 文本输入不工作 414 | 415 | 1. 确保已安装 ADB Keyboard 416 | 2. 在 `设置 > 系统 > 语言和输入法 > 虚拟键盘` 中启用 417 | 418 | ### Windows 编码异常 419 | 420 | 运行代码前添加环境变量: 421 | 422 | ```bash 423 | PYTHONIOENCODING=utf-8 python main.py ... 424 | ``` 425 | 426 | --- 427 | 428 | ## License 429 | 430 | This project is for research and learning purposes only. See [Terms of Use](resources/privacy_policy.txt) / [使用条款](resources/privacy_policy.txt). 
431 | -------------------------------------------------------------------------------- /phone_agent/config/apps_harmonyos.py: -------------------------------------------------------------------------------- 1 | """HarmonyOS application package name mappings. 2 | 3 | Maps user-friendly app names to HarmonyOS bundle names. 4 | These bundle names are used with the 'hdc shell aa start -b ' command. 5 | """ 6 | 7 | # Custom ability names for apps that don't use the default "EntryAbility" 8 | # Maps bundle_name -> ability_name 9 | # Generated by: python test/find_abilities.py 10 | APP_ABILITIES: dict[str, str] = { 11 | # Third-party apps 12 | "cn.wps.mobileoffice.hap": "DocumentAbility", 13 | "com.ccb.mobilebank.hm": "CcbMainAbility", 14 | "com.dewu.hos": "HomeAbility", 15 | "com.larus.nova.hm": "MainAbility", 16 | "com.luna.hm.music": "MainAbility", 17 | "com.meitu.meitupic": "MainAbility", 18 | "com.ss.hm.article.news": "MainAbility", 19 | "com.ss.hm.ugc.aweme": "MainAbility", 20 | "com.taobao.taobao4hmos": "Taobao_mainAbility", 21 | "com.tencent.videohm": "AppAbility", 22 | "com.ximalaya.ting.xmharmony": "MainBundleAbility", 23 | "com.zhihu.hmos": "PhoneAbility", 24 | 25 | # Huawei system apps 26 | "com.huawei.hmos.browser": "MainAbility", 27 | "com.huawei.hmos.calculator": "com.huawei.hmos.calculator.CalculatorAbility", 28 | "com.huawei.hmos.calendar": "MainAbility", 29 | "com.huawei.hmos.camera": "com.huawei.hmos.camera.MainAbility", 30 | "com.huawei.hmos.clock": "com.huawei.hmos.clock.phone", 31 | "com.huawei.hmos.clouddrive": "MainAbility", 32 | "com.huawei.hmos.email": "ApplicationAbility", 33 | "com.huawei.hmos.filemanager": "MainAbility", 34 | "com.huawei.hmos.health": "Activity_card_entryAbility", 35 | "com.huawei.hmos.notepad": "MainAbility", 36 | "com.huawei.hmos.photos": "MainAbility", 37 | "com.huawei.hmos.screenrecorder": "com.huawei.hmos.screenrecorder.ServiceExtAbility", 38 | "com.huawei.hmos.screenshot": "com.huawei.hmos.screenshot.ServiceExtAbility", 
39 | "com.huawei.hmos.settings": "com.huawei.hmos.settings.MainAbility", 40 | "com.huawei.hmos.soundrecorder": "MainAbility", 41 | "com.huawei.hmos.vassistant": "AiCaptionServiceExtAbility", 42 | "com.huawei.hmos.wallet": "MainAbility", 43 | 44 | # Huawei services 45 | "com.huawei.hmsapp.appgallery": "MainAbility", 46 | "com.huawei.hmsapp.books": "MainAbility", 47 | "com.huawei.hmsapp.himovie": "MainAbility", 48 | "com.huawei.hmsapp.hisearch": "MainAbility", 49 | "com.huawei.hmsapp.music": "MainAbility", 50 | "com.huawei.hmsapp.thememanager": "MainAbility", 51 | "com.huawei.hmsapp.totemweather": "com.huawei.hmsapp.totemweather.MainAbility", 52 | 53 | # OHOS system apps 54 | "com.ohos.callui": "com.ohos.callui.ServiceAbility", 55 | "com.ohos.contacts": "com.ohos.contacts.MainAbility", 56 | "com.ohos.mms": "com.ohos.mms.MainAbility", 57 | } 58 | 59 | APP_PACKAGES: dict[str, str] = { 60 | # Social & Messaging 61 | "微信": "com.tencent.wechat", 62 | "QQ": "com.tencent.mqq", 63 | "微博": "com.sina.weibo.stage", 64 | # E-commerce 65 | "淘宝": "com.taobao.taobao4hmos", 66 | "京东": "com.jd.hm.mall", 67 | "拼多多": "com.xunmeng.pinduoduo.hos", 68 | "淘宝闪购": "com.taobao.taobao4hmos", 69 | "京东秒送": "com.jd.hm.mall", 70 | # Lifestyle & Social 71 | "小红书": "com.xingin.xhs_hos", 72 | "知乎": "com.zhihu.hmos", 73 | # "豆瓣": "com.douban.frodo", # 未在 hdc 列表中找到 74 | # Maps & Navigation 75 | "高德地图": "com.amap.hmapp", 76 | "百度地图": "com.baidu.hmmap", 77 | # Food & Services 78 | "美团": "com.sankuai.hmeituan", 79 | "美团外卖": "com.meituan.takeaway", 80 | "大众点评": "com.sankuai.dianping", 81 | # "肯德基": "com.yek.android.kfc.activitys", # 未在 hdc 列表中找到 82 | # Travel 83 | # "携程": "ctrip.android.view", # 未在 hdc 列表中找到 84 | "铁路12306": "com.chinarailway.ticketingHM", 85 | "12306": "com.chinarailway.ticketingHM", 86 | # "去哪儿": "com.Qunar", # 未在 hdc 列表中找到 87 | # "去哪儿旅行": "com.Qunar", # 未在 hdc 列表中找到 88 | "滴滴出行": "com.sdu.didi.hmos.psnger", 89 | # Video & Entertainment 90 | "bilibili": "yylx.danmaku.bili", 91 | "抖音": 
"com.ss.hm.ugc.aweme", 92 | "快手": "com.kuaishou.hmapp", 93 | "腾讯视频": "com.tencent.videohm", 94 | "爱奇艺": "com.qiyi.video.hmy", 95 | "芒果TV": "com.mgtv.phone", 96 | # "优酷视频": "com.youku.phone", # 未在 hdc 列表中找到 97 | # "红果短剧": "com.phoenix.read", # 未在 hdc 列表中找到 98 | # Music & Audio 99 | # "网易云音乐": "com.netease.cloudmusic", # 未在 hdc 列表中找到 100 | "QQ音乐": "com.tencent.hm.qqmusic", 101 | "汽水音乐": "com.luna.hm.music", 102 | "喜马拉雅": "com.ximalaya.ting.xmharmony", 103 | # Reading 104 | # "番茄小说": "com.dragon.read", # 未在 hdc 列表中找到 105 | # "番茄免费小说": "com.dragon.read", # 未在 hdc 列表中找到 106 | # "七猫免费小说": "com.kmxs.reader", # 未在 hdc 列表中找到 107 | # Productivity 108 | "飞书": "com.ss.feishu", 109 | # "QQ邮箱": "com.tencent.androidqqmail", # 未在 hdc 列表中找到 110 | # AI & Tools 111 | "豆包": "com.larus.nova.hm", 112 | # Health & Fitness 113 | # "keep": "com.gotokeep.keep", # 未在 hdc 列表中找到 114 | # "美柚": "com.lingan.seeyou", # 未在 hdc 列表中找到 115 | # News & Information 116 | # "腾讯新闻": "com.tencent.news", # 未在 hdc 列表中找到 117 | "今日头条": "com.ss.hm.article.news", 118 | # Real Estate 119 | # "贝壳找房": "com.lianjia.beike", # 未在 hdc 列表中找到 120 | # "安居客": "com.anjuke.android.app", # 未在 hdc 列表中找到 121 | # Finance 122 | # "同花顺": "com.hexin.plat.android", # 未在 hdc 列表中找到 123 | # Games 124 | # "星穹铁道": "com.miHoYo.hkrpg", # 未在 hdc 列表中找到 125 | # "崩坏:星穹铁道": "com.miHoYo.hkrpg", # 未在 hdc 列表中找到 126 | # "恋与深空": "com.papegames.lysk.cn", # 未在 hdc 列表中找到 127 | 128 | # HarmonyOS 第三方应用 129 | "百度": "com.baidu.baiduapp", 130 | "阿里巴巴": "com.alibaba.wireless_hmos", 131 | "WPS": "cn.wps.mobileoffice.hap", 132 | "企业微信": "com.tencent.wework.hmos", 133 | "同程": "com.tongcheng.hmos", 134 | "同程旅行": "com.tongcheng.hmos", 135 | "唯品会": "com.vip.hosapp", 136 | "支付宝": "com.alipay.mobile.client", 137 | "UC浏览器": "com.uc.mobile", 138 | "闲鱼": "com.taobao.idlefish4ohos", 139 | "转转": "com.zhuanzhuan.hmoszz", 140 | "迅雷": "com.xunlei.thunder", 141 | "搜狗输入法": "com.sogou.input", 142 | "扫描全能王": "com.intsig.camscanner.hap", 143 | "美图秀秀": "com.meitu.meitupic", 144 | 
"58同城": "com.wuba.life", 145 | "得物": "com.dewu.hos", 146 | "海底捞": "com.haidilao.haros", 147 | "中国移动": "com.droi.tong", 148 | "中国联通": "com.sinovatech.unicom.ha", 149 | "国家税务总局": "cn.gov.chinatax.gt4.hm", 150 | "建设银行": "com.ccb.mobilebank.hm", 151 | "快手极速版": "com.kuaishou.hmnebula", 152 | 153 | # HarmonyOS 系统应用 - 工具类 154 | "浏览器": "com.huawei.hmos.browser", 155 | "计算器": "com.huawei.hmos.calculator", 156 | "日历": "com.huawei.hmos.calendar", 157 | "相机": "com.huawei.hmos.camera", 158 | "时钟": "com.huawei.hmos.clock", 159 | "云盘": "com.huawei.hmos.clouddrive", 160 | "云空间": "com.huawei.hmos.clouddrive", 161 | "邮件": "com.huawei.hmos.email", 162 | "文件管理器": "com.huawei.hmos.filemanager", 163 | "文件": "com.huawei.hmos.files", 164 | "查找设备": "com.huawei.hmos.finddevice", 165 | "查找手机": "com.huawei.hmos.finddevice", 166 | "录音机": "com.huawei.hmos.soundrecorder", 167 | "录音": "com.huawei.hmos.soundrecorder", 168 | "录屏": "com.huawei.hmos.screenrecorder", 169 | "截屏": "com.huawei.hmos.screenshot", 170 | "笔记": "com.huawei.hmos.notepad", 171 | "备忘录": "com.huawei.hmos.notepad", 172 | 173 | # HarmonyOS 系统应用 - 媒体类 174 | "相册": "com.huawei.hmos.photos", 175 | "图库": "com.huawei.hmos.photos", 176 | # "视频": "com.huawei.hmos.mediaplayer", # 未在 hdc 列表中找到,但有 com.huawei.hmsapp.himovie 177 | 178 | # HarmonyOS 系统应用 - 通讯类 179 | "联系人": "com.ohos.contacts", 180 | "通讯录": "com.ohos.contacts", 181 | "短信": "com.ohos.mms", 182 | "信息": "com.ohos.mms", 183 | "电话": "com.ohos.callui", 184 | "拨号": "com.ohos.callui", 185 | 186 | # HarmonyOS 系统应用 - 设置类 187 | "设置": "com.huawei.hmos.settings", 188 | "系统设置": "com.huawei.hmos.settings", 189 | "AndroidSystemSettings": "com.huawei.hmos.settings", 190 | "Android System Settings": "com.huawei.hmos.settings", 191 | "Android System Settings": "com.huawei.hmos.settings", 192 | "Android-System-Settings": "com.huawei.hmos.settings", 193 | "Settings": "com.huawei.hmos.settings", 194 | 195 | # HarmonyOS 系统应用 - 生活服务 196 | "健康": "com.huawei.hmos.health", 197 | "运动健康": 
"com.huawei.hmos.health", 198 | "地图": "com.huawei.hmos.maps.app", 199 | "华为地图": "com.huawei.hmos.maps.app", 200 | "钱包": "com.huawei.hmos.wallet", 201 | "华为钱包": "com.huawei.hmos.wallet", 202 | "智慧生活": "com.huawei.hmos.ailife", 203 | "智能助手": "com.huawei.hmos.vassistant", 204 | "小艺": "com.huawei.hmos.vassistant", 205 | 206 | # HarmonyOS 服务 207 | "应用市场": "com.huawei.hmsapp.appgallery", 208 | "华为应用市场": "com.huawei.hmsapp.appgallery", 209 | "音乐": "com.huawei.hmsapp.music", 210 | "华为音乐": "com.huawei.hmsapp.music", 211 | "主题": "com.huawei.hmsapp.thememanager", 212 | "主题管理": "com.huawei.hmsapp.thememanager", 213 | "天气": "com.huawei.hmsapp.totemweather", 214 | "华为天气": "com.huawei.hmsapp.totemweather", 215 | "视频": "com.huawei.hmsapp.himovie", 216 | "华为视频": "com.huawei.hmsapp.himovie", 217 | "阅读": "com.huawei.hmsapp.books", 218 | "华为阅读": "com.huawei.hmsapp.books", 219 | "游戏中心": "com.huawei.hmsapp.gamecenter", 220 | "华为游戏中心": "com.huawei.hmsapp.gamecenter", 221 | "搜索": "com.huawei.hmsapp.hisearch", 222 | "华为搜索": "com.huawei.hmsapp.hisearch", 223 | "指南针": "com.huawei.hmsapp.compass", 224 | "会员中心": "com.huawei.hmos.myhuawei", 225 | "我的华为": "com.huawei.hmos.myhuawei", 226 | "华为会员": "com.huawei.hmos.myhuawei", 227 | } 228 | 229 | 230 | def get_package_name(app_name: str) -> str | None: 231 | """ 232 | Get the package name for an app. 233 | 234 | Args: 235 | app_name: The display name of the app. 236 | 237 | Returns: 238 | The HarmonyOS bundle name, or None if not found. 239 | """ 240 | return APP_PACKAGES.get(app_name) 241 | 242 | 243 | def get_app_name(package_name: str) -> str | None: 244 | """ 245 | Get the app name from a package name. 246 | 247 | Args: 248 | package_name: The HarmonyOS bundle name. 249 | 250 | Returns: 251 | The display name of the app, or None if not found. 
class ModelClient:
    """
    Client for interacting with OpenAI-compatible vision-language models.

    Responses are streamed. The text that precedes the action call is echoed
    to stdout while it arrives; the action call itself is collected silently
    and returned to the caller.

    Args:
        config: Model configuration. A default ModelConfig is used when omitted.
    """

    def __init__(self, config: ModelConfig | None = None):
        self.config = config or ModelConfig()
        self.client = OpenAI(base_url=self.config.base_url, api_key=self.config.api_key)

    def request(self, messages: list[dict[str, Any]]) -> ModelResponse:
        """
        Send a request to the model and stream its answer.

        Args:
            messages: List of message dictionaries in OpenAI format.

        Returns:
            ModelResponse containing the thinking text, the action text, the
            raw streamed content and the timing metrics.

        Note:
            Nothing is caught here: any exception raised by the underlying
            client call or while iterating the stream propagates to the caller.
        """
        # Start timing
        start_time = time.time()
        time_to_first_token = None
        time_to_thinking_end = None

        stream = self.client.chat.completions.create(
            messages=messages,
            model=self.config.model_name,
            max_tokens=self.config.max_tokens,
            temperature=self.config.temperature,
            top_p=self.config.top_p,
            frequency_penalty=self.config.frequency_penalty,
            extra_body=self.config.extra_body,
            stream=True,
        )

        raw_content = ""
        buffer = ""  # Text withheld from stdout because it may be the start of a marker
        action_markers = ("finish(message=", "do(action=")
        in_action_phase = False  # True once an action marker has been seen

        for chunk in stream:
            if len(chunk.choices) == 0:
                continue
            content = chunk.choices[0].delta.content
            if content is None:
                continue
            raw_content += content

            # Record time to first token
            if time_to_first_token is None:
                time_to_first_token = time.time() - start_time

            if in_action_phase:
                # The action text is only accumulated, never echoed.
                continue

            buffer += content

            marker = next((m for m in action_markers if m in buffer), None)
            if marker is not None:
                # Echo everything before the marker, then stop echoing.
                print(buffer.split(marker, 1)[0], end="", flush=True)
                print()  # Newline after thinking is complete
                in_action_phase = True
                time_to_thinking_end = time.time() - start_time
                continue

            # Only echo when the tail of the buffer cannot be the beginning of
            # a marker; otherwise wait for more content to decide.
            if not self._ends_with_marker_prefix(buffer, action_markers):
                print(buffer, end="", flush=True)
                buffer = ""

        # The stream may end while text is still withheld as a possible marker
        # prefix; without this flush that tail would never be shown.
        if not in_action_phase and buffer:
            print(buffer, end="", flush=True)

        # Calculate total time
        total_time = time.time() - start_time

        # Parse thinking and action from response
        thinking, action = self._parse_response(raw_content)

        # Print performance metrics
        lang = self.config.lang
        print()
        print("=" * 50)
        print(f"⏱️ {get_message('performance_metrics', lang)}:")
        print("-" * 50)
        if time_to_first_token is not None:
            print(
                f"{get_message('time_to_first_token', lang)}: {time_to_first_token:.3f}s"
            )
        if time_to_thinking_end is not None:
            print(
                f"{get_message('time_to_thinking_end', lang)}: {time_to_thinking_end:.3f}s"
            )
        print(
            f"{get_message('total_inference_time', lang)}: {total_time:.3f}s"
        )
        print("=" * 50)

        return ModelResponse(
            thinking=thinking,
            action=action,
            raw_content=raw_content,
            time_to_first_token=time_to_first_token,
            time_to_thinking_end=time_to_thinking_end,
            total_time=total_time,
        )

    @staticmethod
    def _ends_with_marker_prefix(text: str, markers: tuple[str, ...]) -> bool:
        """Return True if ``text`` ends with a proper, non-empty prefix of any marker."""
        return any(
            text.endswith(marker[:length])
            for marker in markers
            for length in range(1, len(marker))
        )

    def _parse_response(self, content: str) -> tuple[str, str]:
        """
        Parse the model response into thinking and action parts.

        Parsing rules:
        1. If content contains 'finish(message=', everything before is thinking,
           everything from 'finish(message=' onwards is action.
        2. If rule 1 doesn't apply but content contains 'do(action=',
           everything before is thinking, everything from 'do(action=' onwards is action.
        3. Fallback: If content contains '<answer>', use legacy parsing with XML tags
           ('<think>...</think><answer>...</answer>').
        4. Otherwise, return empty thinking and full content as action.

        Args:
            content: Raw response content.

        Returns:
            Tuple of (thinking, action).
        """
        # Rule 1: Check for finish(message=
        if "finish(message=" in content:
            parts = content.split("finish(message=", 1)
            thinking = parts[0].strip()
            action = "finish(message=" + parts[1]
            return thinking, action

        # Rule 2: Check for do(action=
        if "do(action=" in content:
            parts = content.split("do(action=", 1)
            thinking = parts[0].strip()
            action = "do(action=" + parts[1]
            return thinking, action

        # Rule 3: Fallback to legacy XML tag parsing
        if "<answer>" in content:
            parts = content.split("<answer>", 1)
            thinking = parts[0].replace("<think>", "").replace("</think>", "").strip()
            action = parts[1].replace("</answer>", "").strip()
            return thinking, action

        # Rule 4: No markers found, return content as action
        return "", content
233 | 234 | Args: 235 | text: Text content. 236 | image_base64: Optional base64-encoded image. 237 | 238 | Returns: 239 | Message dictionary. 240 | """ 241 | content = [] 242 | 243 | if image_base64: 244 | content.append( 245 | { 246 | "type": "image_url", 247 | "image_url": {"url": f"data:image/png;base64,{image_base64}"}, 248 | } 249 | ) 250 | 251 | content.append({"type": "text", "text": text}) 252 | 253 | return {"role": "user", "content": content} 254 | 255 | @staticmethod 256 | def create_assistant_message(content: str) -> dict[str, Any]: 257 | """Create an assistant message.""" 258 | return {"role": "assistant", "content": content} 259 | 260 | @staticmethod 261 | def remove_images_from_message(message: dict[str, Any]) -> dict[str, Any]: 262 | """ 263 | Remove image content from a message to save context space. 264 | 265 | Args: 266 | message: Message dictionary. 267 | 268 | Returns: 269 | Message with images removed. 270 | """ 271 | if isinstance(message.get("content"), list): 272 | message["content"] = [ 273 | item for item in message["content"] if item.get("type") == "text" 274 | ] 275 | return message 276 | 277 | @staticmethod 278 | def build_screen_info(current_app: str, **extra_info) -> str: 279 | """ 280 | Build screen info string for the model. 281 | 282 | Args: 283 | current_app: Current app name. 284 | **extra_info: Additional info to include. 285 | 286 | Returns: 287 | JSON string with screen info. 
class ADBConnection:
    """
    Manages ADB connections to Android devices.

    Supports USB, WiFi, and remote TCP/IP connections. Every operation shells
    out to the ADB executable given at construction time.

    Example:
        >>> conn = ADBConnection()
        >>> # Connect to remote device
        >>> conn.connect("192.168.1.100:5555")
        >>> # List devices
        >>> devices = conn.list_devices()
        >>> # Disconnect
        >>> conn.disconnect("192.168.1.100:5555")
    """

    def __init__(self, adb_path: str = "adb"):
        """
        Initialize ADB connection manager.

        Args:
            adb_path: Path to ADB executable.
        """
        self.adb_path = adb_path

    def connect(self, address: str, timeout: int = 10) -> tuple[bool, str]:
        """
        Connect to a remote device via TCP/IP.

        Args:
            address: Device address in format "host:port" (e.g., "192.168.1.100:5555").
                When no ":" is present, port 5555 is appended.
            timeout: Connection timeout in seconds.

        Returns:
            Tuple of (success, message). Failures, including timeouts and a
            missing ADB executable, are reported through the tuple rather
            than raised.

        Note:
            The remote device must have TCP/IP debugging enabled.
            On the device, run: adb tcpip 5555
        """
        # Validate address format
        if ":" not in address:
            address = f"{address}:5555"  # Default ADB port

        try:
            result = subprocess.run(
                [self.adb_path, "connect", address],
                capture_output=True,
                text=True,
                timeout=timeout,
            )

            output = result.stdout + result.stderr
            lowered = output.lower()

            # "already connected" contains "connected", so it has to be
            # tested first or this branch can never be taken.
            if "already connected" in lowered:
                return True, f"Already connected to {address}"
            if "connected" in lowered:
                return True, f"Connected to {address}"
            return False, output.strip()

        except subprocess.TimeoutExpired:
            return False, f"Connection timeout after {timeout}s"
        except Exception as e:
            return False, f"Connection error: {e}"

    def disconnect(self, address: str | None = None) -> tuple[bool, str]:
        """
        Disconnect from a remote device.

        Args:
            address: Device address to disconnect. If None, disconnects all.

        Returns:
            Tuple of (success, message).
        """
        try:
            cmd = [self.adb_path, "disconnect"]
            if address:
                cmd.append(address)

            result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5)

            output = result.stdout + result.stderr
            return True, output.strip() or "Disconnected"

        except Exception as e:
            return False, f"Disconnect error: {e}"

    def list_devices(self) -> list[DeviceInfo]:
        """
        List all connected devices.

        Returns:
            List of DeviceInfo objects; an empty list when the command fails
            (the error is printed).
        """
        try:
            result = subprocess.run(
                [self.adb_path, "devices", "-l"],
                capture_output=True,
                text=True,
                timeout=5,
            )

            devices = []
            for line in result.stdout.strip().split("\n")[1:]:  # Skip header
                parts = line.split()
                if len(parts) < 2:
                    # Blank or malformed line
                    continue

                device_id, status = parts[0], parts[1]

                # Serials containing ":" are host:port addresses; everything
                # else (emulators included) is reported as USB.
                conn_type = (
                    ConnectionType.REMOTE if ":" in device_id else ConnectionType.USB
                )

                # Optional "model:<name>" field of the long listing
                model = next(
                    (
                        part.split(":", 1)[1]
                        for part in parts[2:]
                        if part.startswith("model:")
                    ),
                    None,
                )

                devices.append(
                    DeviceInfo(
                        device_id=device_id,
                        status=status,
                        connection_type=conn_type,
                        model=model,
                    )
                )

            return devices

        except Exception as e:
            print(f"Error listing devices: {e}")
            return []

    def get_device_info(self, device_id: str | None = None) -> DeviceInfo | None:
        """
        Get detailed information about a device.

        Args:
            device_id: Device ID. If None, uses first available device.

        Returns:
            DeviceInfo or None if not found.
        """
        devices = self.list_devices()

        if not devices:
            return None

        if device_id is None:
            return devices[0]

        for device in devices:
            if device.device_id == device_id:
                return device

        return None

    def is_connected(self, device_id: str | None = None) -> bool:
        """
        Check if a device is connected.

        Args:
            device_id: Device ID to check. If None, checks if any device is connected.

        Returns:
            True if a matching device is listed with status "device", False otherwise.
        """
        devices = self.list_devices()

        if not devices:
            return False

        if device_id is None:
            return any(d.status == "device" for d in devices)

        return any(d.device_id == device_id and d.status == "device" for d in devices)

    def enable_tcpip(
        self, port: int = 5555, device_id: str | None = None
    ) -> tuple[bool, str]:
        """
        Enable TCP/IP debugging on a USB-connected device.

        This allows subsequent wireless connections to the device.

        Args:
            port: TCP port for ADB (default: 5555).
            device_id: Device ID. If None, no "-s" selector is passed to ADB.

        Returns:
            Tuple of (success, message).

        Note:
            The device must be connected via USB first.
            After this, you can disconnect USB and connect via WiFi.
        """
        try:
            cmd = [self.adb_path]
            if device_id:
                cmd.extend(["-s", device_id])
            cmd.extend(["tcpip", str(port)])

            result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=10)

            output = result.stdout + result.stderr

            if "restarting" in output.lower() or result.returncode == 0:
                # Wait the configured delay before reporting success.
                time.sleep(TIMING_CONFIG.connection.adb_restart_delay)
                return True, f"TCP/IP mode enabled on port {port}"
            else:
                return False, output.strip()

        except Exception as e:
            return False, f"Error enabling TCP/IP: {e}"

    @staticmethod
    def _parse_route_src(output: str) -> str | None:
        """Return the token following the first "src" field in ``ip route`` output."""
        for line in output.split("\n"):
            if "src" not in line:
                continue
            parts = line.split()
            for i, part in enumerate(parts):
                if part == "src" and i + 1 < len(parts):
                    return parts[i + 1]
        return None

    @staticmethod
    def _parse_inet_address(output: str) -> str | None:
        """Return the address (prefix length removed) of the first "inet " line."""
        for line in output.split("\n"):
            if "inet " in line:
                parts = line.strip().split()
                if len(parts) >= 2:
                    return parts[1].split("/")[0]
        return None

    def get_device_ip(self, device_id: str | None = None) -> str | None:
        """
        Get the IP address of a connected device.

        The address is read from ``ip route`` first; when that output has no
        "src" field, ``ip addr show wlan0`` is tried instead.

        Args:
            device_id: Device ID. If None, no "-s" selector is passed to ADB.

        Returns:
            IP address string or None if not found.
        """
        try:
            base_cmd = [self.adb_path]
            if device_id:
                base_cmd.extend(["-s", device_id])

            result = subprocess.run(
                base_cmd + ["shell", "ip", "route"],
                capture_output=True,
                text=True,
                encoding="utf-8",
                timeout=5,
            )
            ip_address = self._parse_route_src(result.stdout)
            if ip_address is not None:
                return ip_address

            # Alternative: try wlan0 interface. The command is rebuilt from
            # the base so it is exactly "shell ip addr show wlan0".
            result = subprocess.run(
                base_cmd + ["shell", "ip", "addr", "show", "wlan0"],
                capture_output=True,
                text=True,
                encoding="utf-8",
                timeout=5,
            )
            return self._parse_inet_address(result.stdout)

        except Exception as e:
            print(f"Error getting device IP: {e}")
            return None

    def restart_server(self) -> tuple[bool, str]:
        """
        Restart the ADB server.

        Returns:
            Tuple of (success, message). Exit codes of the two ADB commands
            are not inspected; only exceptions count as failure.
        """
        try:
            # Kill server
            subprocess.run(
                [self.adb_path, "kill-server"], capture_output=True, timeout=5
            )

            time.sleep(TIMING_CONFIG.connection.server_restart_delay)

            # Start server
            subprocess.run(
                [self.adb_path, "start-server"], capture_output=True, timeout=5
            )

            return True, "ADB server restarted"

        except Exception as e:
            return False, f"Error restarting server: {e}"
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to the Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2025 Zhipu AI 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /phone_agent/hdc/connection.py: -------------------------------------------------------------------------------- 1 | """HDC connection management for HarmonyOS devices.""" 2 | 3 | import os 4 | import subprocess 5 | import time 6 | from dataclasses import dataclass 7 | from enum import Enum 8 | from typing import Optional 9 | 10 | from phone_agent.config.timing import TIMING_CONFIG 11 | 12 | 13 | # Global flag to control HDC command output 14 | _HDC_VERBOSE = os.getenv("HDC_VERBOSE", "false").lower() in ("true", "1", "yes") 15 | 16 | 17 | def _run_hdc_command(cmd: list, **kwargs) -> subprocess.CompletedProcess: 18 | """ 19 | Run HDC command with optional verbose output. 20 | 21 | Args: 22 | cmd: Command list to execute. 23 | **kwargs: Additional arguments for subprocess.run. 24 | 25 | Returns: 26 | CompletedProcess result. 
def connect(self, address: str, timeout: int = 10) -> tuple[bool, str]:
    """
    Connect to a remote device via TCP/IP.

    Args:
        address: Device address in format "host:port" (e.g., "192.168.1.100:5555").
            When no ":" is present, port 5555 is appended.
        timeout: Connection timeout in seconds.

    Returns:
        Tuple of (success, message). Failures, including timeouts, are
        reported through the tuple rather than raised.

    Note:
        The remote device must have TCP/IP debugging enabled.
    """
    # Validate address format
    if ":" not in address:
        address = f"{address}:5555"  # Default HDC port

    try:
        result = _run_hdc_command(
            [self.hdc_path, "tconn", address],
            capture_output=True,
            text=True,
            timeout=timeout,
        )

        output = result.stdout + result.stderr
        lowered = output.lower()

        # "already connected" contains "connected", so it has to be tested
        # first or this branch can never be taken.
        if "already connected" in lowered:
            return True, f"Already connected to {address}"
        if "Connect OK" in output or "connected" in lowered:
            return True, f"Connected to {address}"
        return False, output.strip()

    except subprocess.TimeoutExpired:
        return False, f"Connection timeout after {timeout}s"
    except Exception as e:
        return False, f"Connection error: {e}"
168 | 169 | Returns: 170 | List of DeviceInfo objects. 171 | """ 172 | try: 173 | result = _run_hdc_command( 174 | [self.hdc_path, "list", "targets"], 175 | capture_output=True, 176 | text=True, 177 | timeout=5, 178 | ) 179 | 180 | devices = [] 181 | for line in result.stdout.strip().split("\n"): 182 | if not line.strip(): 183 | continue 184 | 185 | # HDC output format: device_id (status) 186 | # Example: "192.168.1.100:5555" or "FMR0223C13000649" 187 | device_id = line.strip() 188 | 189 | # Determine connection type 190 | if ":" in device_id: 191 | conn_type = ConnectionType.REMOTE 192 | else: 193 | conn_type = ConnectionType.USB 194 | 195 | # HDC doesn't provide detailed status in list command 196 | # We assume "Connected" status for devices that appear 197 | devices.append( 198 | DeviceInfo( 199 | device_id=device_id, 200 | status="device", 201 | connection_type=conn_type, 202 | model=None, 203 | ) 204 | ) 205 | 206 | return devices 207 | 208 | except Exception as e: 209 | print(f"Error listing devices: {e}") 210 | return [] 211 | 212 | def get_device_info(self, device_id: str | None = None) -> DeviceInfo | None: 213 | """ 214 | Get detailed information about a device. 215 | 216 | Args: 217 | device_id: Device ID. If None, uses first available device. 218 | 219 | Returns: 220 | DeviceInfo or None if not found. 221 | """ 222 | devices = self.list_devices() 223 | 224 | if not devices: 225 | return None 226 | 227 | if device_id is None: 228 | return devices[0] 229 | 230 | for device in devices: 231 | if device.device_id == device_id: 232 | return device 233 | 234 | return None 235 | 236 | def is_connected(self, device_id: str | None = None) -> bool: 237 | """ 238 | Check if a device is connected. 239 | 240 | Args: 241 | device_id: Device ID to check. If None, checks if any device is connected. 242 | 243 | Returns: 244 | True if connected, False otherwise. 
245 | """ 246 | devices = self.list_devices() 247 | 248 | if not devices: 249 | return False 250 | 251 | if device_id is None: 252 | return len(devices) > 0 253 | 254 | return any(d.device_id == device_id for d in devices) 255 | 256 | def enable_tcpip( 257 | self, port: int = 5555, device_id: str | None = None 258 | ) -> tuple[bool, str]: 259 | """ 260 | Enable TCP/IP debugging on a USB-connected device. 261 | 262 | This allows subsequent wireless connections to the device. 263 | 264 | Args: 265 | port: TCP port for HDC (default: 5555). 266 | device_id: Device ID. If None, uses first available device. 267 | 268 | Returns: 269 | Tuple of (success, message). 270 | 271 | Note: 272 | The device must be connected via USB first. 273 | After this, you can disconnect USB and connect via WiFi. 274 | """ 275 | try: 276 | cmd = [self.hdc_path] 277 | if device_id: 278 | cmd.extend(["-t", device_id]) 279 | cmd.extend(["tmode", "port", str(port)]) 280 | 281 | result = _run_hdc_command(cmd, capture_output=True, text=True, encoding="utf-8", timeout=10) 282 | 283 | output = result.stdout + result.stderr 284 | 285 | if result.returncode == 0 or "success" in output.lower(): 286 | time.sleep(TIMING_CONFIG.connection.adb_restart_delay) 287 | return True, f"TCP/IP mode enabled on port {port}" 288 | else: 289 | return False, output.strip() 290 | 291 | except Exception as e: 292 | return False, f"Error enabling TCP/IP: {e}" 293 | 294 | def get_device_ip(self, device_id: str | None = None) -> str | None: 295 | """ 296 | Get the IP address of a connected device. 297 | 298 | Args: 299 | device_id: Device ID. If None, uses first available device. 300 | 301 | Returns: 302 | IP address string or None if not found. 
303 | """ 304 | try: 305 | cmd = [self.hdc_path] 306 | if device_id: 307 | cmd.extend(["-t", device_id]) 308 | cmd.extend(["shell", "ifconfig"]) 309 | 310 | result = _run_hdc_command(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5) 311 | 312 | # Parse IP from ifconfig output 313 | for line in result.stdout.split("\n"): 314 | if "inet addr:" in line or "inet " in line: 315 | parts = line.strip().split() 316 | for i, part in enumerate(parts): 317 | if "addr:" in part: 318 | ip = part.split(":")[1] 319 | # Filter out localhost 320 | if not ip.startswith("127."): 321 | return ip 322 | elif part == "inet" and i + 1 < len(parts): 323 | ip = parts[i + 1].split("/")[0] 324 | if not ip.startswith("127."): 325 | return ip 326 | 327 | return None 328 | 329 | except Exception as e: 330 | print(f"Error getting device IP: {e}") 331 | return None 332 | 333 | def restart_server(self) -> tuple[bool, str]: 334 | """ 335 | Restart the HDC server. 336 | 337 | Returns: 338 | Tuple of (success, message). 339 | """ 340 | try: 341 | # Kill server 342 | _run_hdc_command( 343 | [self.hdc_path, "kill"], capture_output=True, timeout=5 344 | ) 345 | 346 | time.sleep(TIMING_CONFIG.connection.server_restart_delay) 347 | 348 | # Start server (HDC auto-starts when running commands) 349 | _run_hdc_command( 350 | [self.hdc_path, "start", "-r"], capture_output=True, timeout=5 351 | ) 352 | 353 | return True, "HDC server restarted" 354 | 355 | except Exception as e: 356 | return False, f"Error restarting server: {e}" 357 | 358 | 359 | def quick_connect(address: str) -> tuple[bool, str]: 360 | """ 361 | Quick helper to connect to a remote device. 362 | 363 | Args: 364 | address: Device address (e.g., "192.168.1.100" or "192.168.1.100:5555"). 365 | 366 | Returns: 367 | Tuple of (success, message). 368 | """ 369 | conn = HDCConnection() 370 | return conn.connect(address) 371 | 372 | 373 | def list_devices() -> list[DeviceInfo]: 374 | """ 375 | Quick helper to list connected devices. 
376 | 377 | Returns: 378 | List of DeviceInfo objects. 379 | """ 380 | conn = HDCConnection() 381 | return conn.list_devices() 382 | -------------------------------------------------------------------------------- /resources/privacy_policy_en.txt: -------------------------------------------------------------------------------- 1 | Part I: Safety Description of Model/Technology 2 | 3 | 1. AutoGLM Technical Mechanism and Deployment Flexibility 4 | The core functionality of AutoGLM is automated operation execution. Its working principle is as follows: 5 | - Instruction-Driven: Based on operation instructions issued by the user or developer. 6 | - Screen Understanding: Captures the screen content of the current operating environment and sends the image to a large model (which can be deployed locally or in the cloud) for analysis and understanding. 7 | - Operation Simulation: Simulates human interaction methods (such as clicking, swiping, inputting information, etc.) to complete tasks in the target environment. 8 | - Example: When instructed to book a high-speed rail ticket, AutoGLM would open the relevant application, identify the interface content, and follow the instructions to select a train, complete the order, etc., similar to manual operation. The user or developer can terminate the task at any time. 9 | 10 | Key Flexibility: 11 | - Model Deployment: Developers can freely choose to deploy the AutoGLM model on local devices or on cloud servers. 12 | - Operation Execution Environment: Automated operations can be executed on local devices or on cloud-based devices, as determined by the developer based on application scenarios and requirements. 13 | - Data Flow: The data flow depends on the deployment choice: 14 | - Local Deployment (Model + Execution): Screen capture, model analysis, and operation execution are all completed on the local device. Data does not leave the device, offering the highest level of privacy. 
15 | - Cloud Deployment (Model + Execution): Screen content needs to be transmitted from the operating environment (local or cloud device) to the cloud-based model. After analysis, the model returns instructions to the operating environment for execution. Developers must ensure the security of transmission and cloud processing. 16 | - Hybrid Deployment (e.g., Local Execution + Cloud Model): Screen content is captured locally, transmitted to the cloud model for analysis, and the analysis results are returned to the local environment for execution. Developers need to pay attention to data transmission security. 17 | 18 | 2. System Permission Usage Description (For the Operation Execution Environment) 19 | To ensure the normal execution of automated operations, the environment running AutoGLM operations may need to obtain the following permissions: 20 | - ADB (Android Debug Bridge) Permissions: Used to obtain information and simulate user interaction operations such as clicking, swiping, and inputting. 21 | - Storage Permissions: Used for temporary storage of necessary data, model files (if deployed locally), or logs. 22 | - Network Permissions: Used to access online services (e.g., calling cloud models, accessing target application services). 23 | - Other Specific Permissions: May be required for specific tasks (e.g., microphone for voice commands). 24 | 25 | Developer Responsibilities: 26 | - Principle of Least Privilege: Only request permissions absolutely necessary to complete a specific task. 27 | - Transparent Disclosure: Clearly and explicitly inform end-users in the application or service about the purpose and necessity of each permission. 28 | - User Authorization: Must obtain explicit authorization from the end-user before enabling relevant permissions and functionalities in the operating environment. 
29 | - Environment Adaptation: Ensure that the permission request and acquisition mechanisms are adapted to the chosen operation execution environment (local or cloud). 30 | 31 | 3. Data Processing and Privacy Protection Principles 32 | The AutoGLM open-source project itself does not collect user data. The responsibility for data processing and privacy protection lies with the developers who build specific applications or services based on AutoGLM. Their responsibilities vary depending on the deployment method: 33 | - Local Deployment (Model + Execution): 34 | - Developers must implement secure local data storage and processing at the application level. All data processing (screen capture, model analysis, operation execution) is completed on the end-user's local device. 35 | - Developers should ensure their application does not actively upload sensitive data (such as screen content, operation logs) to the developer's servers or third parties, unless with the user's explicit, informed consent and for a necessary functionality. 36 | - Cloud Deployment (Model and/or Execution): 37 | - Involves data transmission (screen content, operation instructions, model analysis results) between the operating environment and the cloud. 38 | - Developers must: 39 | - Implement strong encryption to protect all data in transit and at rest. 40 | - Clearly inform end-users about what data will be sent to the cloud, the purpose of transmission, storage location, and retention period, and obtain the end-user's explicit consent for data transmission and cloud processing. 41 | - Comply with applicable data protection regulations, provide a clear privacy policy explaining data processing practices. 42 | - Ensure secure configuration and access control for the cloud environment (model servers, operating environment servers). 
43 | - General Principles (All Deployment Methods): 44 | - Data Minimization: Collect and process only the minimum information absolutely necessary to complete the automated task. 45 | - Purpose Limitation: Use data solely for the specific purpose of the automated operation to fulfill the user's instruction. 46 | - Security Safeguards: Developers are responsible for taking reasonable technical and administrative measures to protect the security and confidentiality of all user data they process (whether locally or in the cloud), preventing unauthorized access, use, disclosure, or loss. 47 | - User Control: Provide mechanisms allowing end-users to view and manage (e.g., delete) data related to them (where technically feasible and consistent with the deployment method). 48 | 49 | 50 | --- 51 | 52 | Part II: Usage Norms Developers/Users Should Follow 53 | Developers/users must always comply with applicable laws and regulations when using the AutoGLM open-source project. 54 | 55 | 1. Critical Operation Confirmation Mechanism 56 | Developers must design and implement explicit, mandatory user confirmation steps within their applications or services built on AutoGLM for the following 6+1 types of high-risk operations: 57 | - Information Interaction and Content Dissemination: Including but not limited to sending messages, emails, posting comments, liking, sharing, etc. 58 | - File Handling and Permission Management: Including but not limited to creating, editing, deleting, moving files or folders, enabling or disabling any permissions, etc. 59 | - Transaction Orders and Disposal of Rights/Interests: Including but not limited to clearing shopping carts, submitting orders, modifying/adding shipping addresses, using coupons/points, etc. 60 | - Fund Transfers and Payment Settlement: Including but not limited to transfers, payments, receiving funds, recharging, withdrawals, binding/unbinding payment methods, etc. 
61 | - Account Identity and Security Configuration: Including but not limited to changing passwords, setting/modifying security options, deleting accounts or linked accounts, deleting friends/contacts, deleting conversations/records, etc. 62 | - Healthcare and Legal Compliance: Including but not limited to accessing, authorizing, or disposing of medical records/health data, purchasing medication, physical or psychological testing, signing electronic agreements, etc. 63 | - Other High-Risk Operations: Any other operation that may significantly impact user data security, property security, account security, or reputation. 64 | 65 | Requirements: 66 | - The confirmation step must be triggered before operation execution, clearly displaying the details of the upcoming operation. 67 | - Provide convenient cancel/termination mechanisms, allowing users to abort the task at any time before confirmation or during the operation process. 68 | - Developer Responsibility: Developers shall bear corresponding responsibility for losses caused to users due to failure to implement an effective confirmation mechanism. 69 | - User Responsibility: Users shall bear losses resulting from their failure to promptly terminate erroneous operations after confirmation. 70 | 71 | 2. Obligations of Developers and Users 72 | Developer Obligations: 73 | - Transparent Disclosure: Clearly and accurately explain to end-users the functionality, working principles (especially the automated parts), data collection and processing methods (including whether the cloud is involved), potential risks, and how users can exercise control. 74 | - Provide Monitoring and Control: Design a user interface that allows end-users to: 75 | - View or understand the current status and steps of automated operations in real-time. 76 | - Conveniently and quickly pause or terminate any ongoing automated task. 77 | - Manage permissions and settings for automated operations. 
78 | - Secure Development: Follow secure coding practices to ensure the security of the application/service itself and prevent malicious exploitation. 79 | - Compliance: Ensure that the developed application/service complies with all applicable laws, regulations, industry standards, and third-party platform (e.g., the application being operated on) terms of service. 80 | - Risk Warning: Clearly warn users in appropriate locations (e.g., feature entry points, first-time use, confirmation steps) about potential risks of using automation functions (such as misoperation, privacy risks, third-party platform policy risks). 81 | - Avoid Critical Dependencies: Carefully evaluate and refrain from recommending AutoGLM for handling extremely critical, high-risk operations or those with severe consequences upon error (e.g., medical device control, critical infrastructure operations, large financial transactions without human review). 82 | 83 | User Obligations: 84 | - Understand Risks: Before using AutoGLM-based automation features, carefully read the developer's instructions, privacy policy, and risk warnings to fully understand their working principles and potential risks. 85 | - Grant Permissions Cautiously: Only grant necessary permissions after fully trusting the application/service developer and understanding the authorization content. 86 | - Active Monitoring: Maintain appropriate attention during the execution of automated tasks, especially for important operations. Utilize monitoring functions provided by the developer to understand operation progress. 87 | - Timely Intervention: Immediately use the provided termination function to stop the task if any operation error, abnormality, or deviation from expectation is observed. 88 | - Assume Responsibility: Bear responsibility for instructions issued, operations confirmed, and any losses resulting from failure to promptly monitor and stop erroneous operations. 89 | 90 | 3. 
Developer and User Code of Conduct 91 | It is strictly prohibited to use the AutoGLM open-source project or applications/services developed based on it to engage in the following behaviors: 92 | 93 | (1) Bulk Automation and Malicious Competition 94 | - Any form of falsified data manipulation: brushing orders, votes, likes, comments, traffic, followers, play counts, downloads, etc. 95 | - Bulk account manipulation: bulk registration, bulk login, bulk operation of third-party platform accounts (group control, multi-instance, cloud control). 96 | - Disrupting market order: malicious bulk purchasing, hoarding and profiteering, snatching limited resources, bulk claiming/abusing coupons/subsidies, maliciously occupying service resources ("薅羊毛"). 97 | - Manipulating platform rules: brushing rankings/search results, artificially influencing recommendation algorithms, artificially inflating/deflating content exposure. 98 | - Creating false engagement: bulk publishing, reposting, liking, collecting, following, unfollowing, etc., on social media. 99 | - Undermining game fairness: power-leveling services, studio operations, bulk farming of equipment/currency/experience/items. 100 | - Undermining fairness: bulk voting, ballot stuffing, manipulating online polls/survey results. 101 | 102 | (2) False Information and Fraudulent Behavior 103 | - Creating misleading information: publishing/spreading false product/service reviews, false user feedback, false testimonials, false experiences. 104 | - Fabricating commercial data: creating false transaction records, sales figures, user engagement, positive review rates. 105 | - Identity fraud: impersonating others, fabricating personal information, stealing others' accounts/avatars/nicknames, forging identity documents. 106 | - False marketing: publishing false advertisements, conducting false promotions, exaggerating product efficacy, concealing product defects/risks. 
107 | - Participating in fraudulent activities: online scams, false investments, pyramid schemes, illegal fundraising, fake prize wins, phishing, etc. 108 | - Spreading unverified information: creating or maliciously spreading fake news, rumors, unverified information. 109 | 110 | (3) Harming Third-Party Services and System Security 111 | - Unauthorized access: using AutoGLM for data scraping (violating robots.txt or platform policies), information theft, API abuse, unauthorized penetration testing. 112 | - Technical sabotage: reverse engineering, cracking, modifying, injecting malicious code into third-party applications, disrupting their normal operation. 113 | - Resource abuse: maliciously occupying third-party server resources, sending spam requests, generating abnormal traffic, conducting DDoS attacks. 114 | - Violating platform rules: intentionally violating the user agreements, terms of service, or community rules of the third-party application being operated on. 115 | - Malicious competition: malicious negative reviews, false reporting, false complaints, commercial defamation. 116 | - Spreading harmful content: spreading computer viruses, trojans, malware, ransomware, spam, illegal content. 117 | - Infringing data rights: unauthorized large-scale commercial data collection, user information gathering, privacy snooping. 118 | 119 | (4) Infringing on Others' Legitimate Rights and Interests 120 | - Account theft: stealing others' accounts, passwords, identity credentials for operations. 121 | - Online harassment and bullying: malicious harassment, threats, insults, defamation, doxxing others. 122 | - Privacy and secret infringement: unauthorized collection, use, or dissemination of others' personal information, private data, trade secrets. 123 | - Cybersquatting: registering others' trademarks, domain names, usernames, social media accounts, etc., in bad faith. 124 | - Harassment: malicious spamming, message bombing, forced following/subscription. 
125 | - Harming commercial interests: industrial espionage, unfair competition, malicious poaching, theft of trade secrets. 126 | 127 | (5) Resource Abuse and Damaging Project Ecosystem 128 | - Abusing registration resources: maliciously registering numerous accounts, fake registration. 129 | - Wasting computing/device resources: maliciously occupying local or cloud device resources, long-term idle occupancy, running high-energy-consumption programs unrelated to automated tasks (e.g., cryptocurrency mining). 130 | - Destabilizing systems: maliciously testing system performance, conducting unauthorized stress tests, frequently restarting services, exploiting technical vulnerabilities/defects for personal gain or to harm the project/platform. 131 | - Violating open-source licenses: violating the terms of the AutoGLM project's open-source license. 132 | 133 | Consequences of Violation: 134 | If developers/users fail to follow the corresponding laws, regulations, policies, industry standards (including but not limited to technical specifications, security standards), and the project's agreements (including but not limited to open-source licenses, usage notes) during use, all resulting legal liabilities, economic losses, and any adverse consequences shall be solely and independently borne by the developers / users. 
-------------------------------------------------------------------------------- /phone_agent/actions/handler.py: -------------------------------------------------------------------------------- 1 | """Action handler for processing AI model outputs.""" 2 | 3 | import ast 4 | import re 5 | import subprocess 6 | import time 7 | from dataclasses import dataclass 8 | from typing import Any, Callable 9 | 10 | from phone_agent.config.timing import TIMING_CONFIG 11 | from phone_agent.device_factory import get_device_factory 12 | 13 | 14 | @dataclass 15 | class ActionResult: 16 | """Result of an action execution.""" 17 | 18 | success: bool 19 | should_finish: bool 20 | message: str | None = None 21 | requires_confirmation: bool = False 22 | 23 | 24 | class ActionHandler: 25 | """ 26 | Handles execution of actions from AI model output. 27 | 28 | Args: 29 | device_id: Optional ADB device ID for multi-device setups. 30 | confirmation_callback: Optional callback for sensitive action confirmation. 31 | Should return True to proceed, False to cancel. 32 | takeover_callback: Optional callback for takeover requests (login, captcha). 33 | """ 34 | 35 | def __init__( 36 | self, 37 | device_id: str | None = None, 38 | confirmation_callback: Callable[[str], bool] | None = None, 39 | takeover_callback: Callable[[str], None] | None = None, 40 | ): 41 | self.device_id = device_id 42 | self.confirmation_callback = confirmation_callback or self._default_confirmation 43 | self.takeover_callback = takeover_callback or self._default_takeover 44 | 45 | def execute( 46 | self, action: dict[str, Any], screen_width: int, screen_height: int 47 | ) -> ActionResult: 48 | """ 49 | Execute an action from the AI model. 50 | 51 | Args: 52 | action: The action dictionary from the model. 53 | screen_width: Current screen width in pixels. 54 | screen_height: Current screen height in pixels. 55 | 56 | Returns: 57 | ActionResult indicating success and whether to finish. 
58 | """ 59 | action_type = action.get("_metadata") 60 | 61 | if action_type == "finish": 62 | return ActionResult( 63 | success=True, should_finish=True, message=action.get("message") 64 | ) 65 | 66 | if action_type != "do": 67 | return ActionResult( 68 | success=False, 69 | should_finish=True, 70 | message=f"Unknown action type: {action_type}", 71 | ) 72 | 73 | action_name = action.get("action") 74 | handler_method = self._get_handler(action_name) 75 | 76 | if handler_method is None: 77 | return ActionResult( 78 | success=False, 79 | should_finish=False, 80 | message=f"Unknown action: {action_name}", 81 | ) 82 | 83 | try: 84 | return handler_method(action, screen_width, screen_height) 85 | except Exception as e: 86 | return ActionResult( 87 | success=False, should_finish=False, message=f"Action failed: {e}" 88 | ) 89 | 90 | def _get_handler(self, action_name: str) -> Callable | None: 91 | """Get the handler method for an action.""" 92 | handlers = { 93 | "Launch": self._handle_launch, 94 | "Tap": self._handle_tap, 95 | "Type": self._handle_type, 96 | "Type_Name": self._handle_type, 97 | "Swipe": self._handle_swipe, 98 | "Back": self._handle_back, 99 | "Home": self._handle_home, 100 | "Double Tap": self._handle_double_tap, 101 | "Long Press": self._handle_long_press, 102 | "Wait": self._handle_wait, 103 | "Take_over": self._handle_takeover, 104 | "Note": self._handle_note, 105 | "Call_API": self._handle_call_api, 106 | "Interact": self._handle_interact, 107 | } 108 | return handlers.get(action_name) 109 | 110 | def _convert_relative_to_absolute( 111 | self, element: list[int], screen_width: int, screen_height: int 112 | ) -> tuple[int, int]: 113 | """Convert relative coordinates (0-1000) to absolute pixels.""" 114 | x = int(element[0] / 1000 * screen_width) 115 | y = int(element[1] / 1000 * screen_height) 116 | return x, y 117 | 118 | def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult: 119 | """Handle app launch action.""" 120 | 
app_name = action.get("app") 121 | if not app_name: 122 | return ActionResult(False, False, "No app name specified") 123 | 124 | device_factory = get_device_factory() 125 | success = device_factory.launch_app(app_name, self.device_id) 126 | if success: 127 | return ActionResult(True, False) 128 | return ActionResult(False, False, f"App not found: {app_name}") 129 | 130 | def _handle_tap(self, action: dict, width: int, height: int) -> ActionResult: 131 | """Handle tap action.""" 132 | element = action.get("element") 133 | if not element: 134 | return ActionResult(False, False, "No element coordinates") 135 | 136 | x, y = self._convert_relative_to_absolute(element, width, height) 137 | 138 | # A "message" key marks the tap as sensitive: ask for confirmation before tapping 139 | if "message" in action: 140 | if not self.confirmation_callback(action["message"]): 141 | return ActionResult( 142 | success=False, 143 | should_finish=True, 144 | message="User cancelled sensitive operation", 145 | ) 146 | 147 | device_factory = get_device_factory() 148 | device_factory.tap(x, y, self.device_id) 149 | return ActionResult(True, False) 150 | 151 | def _handle_type(self, action: dict, width: int, height: int) -> ActionResult: 152 | """Handle text input action.""" 153 | text = action.get("text", "") 154 | 155 | device_factory = get_device_factory() 156 | 157 | # Switch to ADB keyboard, remembering the previous IME so it can be restored 158 | original_ime = device_factory.detect_and_set_adb_keyboard(self.device_id) 159 | time.sleep(TIMING_CONFIG.action.keyboard_switch_delay) 160 | 161 | # Clear existing text 162 | device_factory.clear_text(self.device_id) 163 | time.sleep(TIMING_CONFIG.action.text_clear_delay) 164 | 165 | # Send the whole text in one call; any multiline handling is left to type_text 166 | device_factory.type_text(text, self.device_id) 167 | time.sleep(TIMING_CONFIG.action.text_input_delay) 168 | 169 | # Restore original keyboard 170 | device_factory.restore_keyboard(original_ime, self.device_id) 171 | time.sleep(TIMING_CONFIG.action.keyboard_restore_delay) 172 | 173 | return
ActionResult(True, False) 174 | 175 | def _handle_swipe(self, action: dict, width: int, height: int) -> ActionResult: 176 | """Handle swipe action.""" 177 | start = action.get("start") 178 | end = action.get("end") 179 | 180 | if not start or not end: 181 | return ActionResult(False, False, "Missing swipe coordinates") 182 | 183 | start_x, start_y = self._convert_relative_to_absolute(start, width, height) 184 | end_x, end_y = self._convert_relative_to_absolute(end, width, height) 185 | 186 | device_factory = get_device_factory() 187 | device_factory.swipe(start_x, start_y, end_x, end_y, device_id=self.device_id) 188 | return ActionResult(True, False) 189 | 190 | def _handle_back(self, action: dict, width: int, height: int) -> ActionResult: 191 | """Handle back button action.""" 192 | device_factory = get_device_factory() 193 | device_factory.back(self.device_id) 194 | return ActionResult(True, False) 195 | 196 | def _handle_home(self, action: dict, width: int, height: int) -> ActionResult: 197 | """Handle home button action.""" 198 | device_factory = get_device_factory() 199 | device_factory.home(self.device_id) 200 | return ActionResult(True, False) 201 | 202 | def _handle_double_tap(self, action: dict, width: int, height: int) -> ActionResult: 203 | """Handle double tap action.""" 204 | element = action.get("element") 205 | if not element: 206 | return ActionResult(False, False, "No element coordinates") 207 | 208 | x, y = self._convert_relative_to_absolute(element, width, height) 209 | device_factory = get_device_factory() 210 | device_factory.double_tap(x, y, self.device_id) 211 | return ActionResult(True, False) 212 | 213 | def _handle_long_press(self, action: dict, width: int, height: int) -> ActionResult: 214 | """Handle long press action.""" 215 | element = action.get("element") 216 | if not element: 217 | return ActionResult(False, False, "No element coordinates") 218 | 219 | x, y = self._convert_relative_to_absolute(element, width, height) 220 | 
device_factory = get_device_factory() 221 | device_factory.long_press(x, y, device_id=self.device_id) 222 | return ActionResult(True, False) 223 | 224 | def _handle_wait(self, action: dict, width: int, height: int) -> ActionResult: 225 | """Handle wait action.""" 226 | duration_str = action.get("duration", "1 seconds") 227 | try: 228 | duration = float(duration_str.replace("seconds", "").strip()) 229 | except ValueError: 230 | duration = 1.0 231 | 232 | time.sleep(duration) 233 | return ActionResult(True, False) 234 | 235 | def _handle_takeover(self, action: dict, width: int, height: int) -> ActionResult: 236 | """Handle takeover request (login, captcha, etc.).""" 237 | message = action.get("message", "User intervention required") 238 | self.takeover_callback(message) 239 | return ActionResult(True, False) 240 | 241 | def _handle_note(self, action: dict, width: int, height: int) -> ActionResult: 242 | """Handle note action (placeholder for content recording).""" 243 | # This action is typically used for recording page content 244 | # Implementation depends on specific requirements 245 | return ActionResult(True, False) 246 | 247 | def _handle_call_api(self, action: dict, width: int, height: int) -> ActionResult: 248 | """Handle API call action (placeholder for summarization).""" 249 | # This action is typically used for content summarization 250 | # Implementation depends on specific requirements 251 | return ActionResult(True, False) 252 | 253 | def _handle_interact(self, action: dict, width: int, height: int) -> ActionResult: 254 | """Handle interaction request (user choice needed).""" 255 | # This action signals that user input is needed 256 | return ActionResult(True, False, message="User interaction required") 257 | 258 | def _send_keyevent(self, keycode: str) -> None: 259 | """Send a keyevent to the device.""" 260 | from phone_agent.device_factory import DeviceType, get_device_factory 261 | from phone_agent.hdc.connection import _run_hdc_command 262 | 263 | 
device_factory = get_device_factory() 264 | 265 | # Handle HDC devices with HarmonyOS-specific keyEvent command 266 | if device_factory.device_type == DeviceType.HDC: 267 | hdc_prefix = ["hdc", "-t", self.device_id] if self.device_id else ["hdc"] 268 | 269 | # Map common keycodes to HarmonyOS keyEvent codes 270 | # KEYCODE_ENTER (66) -> 2054 (HarmonyOS Enter key code) 271 | if keycode == "KEYCODE_ENTER" or keycode == "66": 272 | _run_hdc_command( 273 | hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"], 274 | capture_output=True, 275 | text=True, 276 | ) 277 | else: 278 | # For other keys, try to use the numeric code directly 279 | # If keycode is a string like "KEYCODE_ENTER", convert it 280 | try: 281 | # Try to extract numeric code from string or use as-is 282 | if keycode.startswith("KEYCODE_"): 283 | # For now, only handle ENTER, other keys may need mapping 284 | if "ENTER" in keycode: 285 | _run_hdc_command( 286 | hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"], 287 | capture_output=True, 288 | text=True, 289 | ) 290 | else: 291 | # Fallback to ADB-style command for unsupported keys 292 | subprocess.run( 293 | hdc_prefix + ["shell", "input", "keyevent", keycode], 294 | capture_output=True, 295 | text=True, 296 | ) 297 | else: 298 | # Assume it's a numeric code 299 | _run_hdc_command( 300 | hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", str(keycode)], 301 | capture_output=True, 302 | text=True, 303 | ) 304 | except Exception: 305 | # Fallback to ADB-style command 306 | subprocess.run( 307 | hdc_prefix + ["shell", "input", "keyevent", keycode], 308 | capture_output=True, 309 | text=True, 310 | ) 311 | else: 312 | # ADB devices use standard input keyevent command 313 | cmd_prefix = ["adb", "-s", self.device_id] if self.device_id else ["adb"] 314 | subprocess.run( 315 | cmd_prefix + ["shell", "input", "keyevent", keycode], 316 | capture_output=True, 317 | text=True, 318 | ) 319 | 320 | @staticmethod 321 | def 
_default_confirmation(message: str) -> bool: 322 | """Default confirmation callback using console input.""" 323 | response = input(f"Sensitive operation: {message}\nConfirm? (Y/N): ") 324 | return response.upper() == "Y" 325 | 326 | @staticmethod 327 | def _default_takeover(message: str) -> None: 328 | """Default takeover callback using console input.""" 329 | input(f"{message}\nPress Enter after completing manual operation...") 330 | 331 | 332 | def parse_action(response: str) -> dict[str, Any]: 333 | """ 334 | Parse action from model response. 335 | 336 | Args: 337 | response: Raw response string from the model. 338 | 339 | Returns: 340 | Parsed action dictionary. 341 | 342 | Raises: 343 | ValueError: If the response cannot be parsed. 344 | """ 345 | print(f"Parsing action: {response}") 346 | try: 347 | response = response.strip() 348 | if response.startswith('do(action="Type"') or response.startswith( 349 | 'do(action="Type_Name"' 350 | ): 351 | text = response.split("text=", 1)[1][1:-2] 352 | action = {"_metadata": "do", "action": "Type", "text": text} 353 | return action 354 | elif response.startswith("do"): 355 | # Use AST parsing instead of eval for safety 356 | try: 357 | # Escape special characters (newlines, tabs, etc.) 
for valid Python syntax 358 | response = response.replace('\n', '\\n') 359 | response = response.replace('\r', '\\r') 360 | response = response.replace('\t', '\\t') 361 | 362 | tree = ast.parse(response, mode="eval") 363 | if not isinstance(tree.body, ast.Call): 364 | raise ValueError("Expected a function call") 365 | 366 | call = tree.body 367 | # Extract keyword arguments safely 368 | action = {"_metadata": "do"} 369 | for keyword in call.keywords: 370 | key = keyword.arg 371 | value = ast.literal_eval(keyword.value) 372 | action[key] = value 373 | 374 | return action 375 | except (SyntaxError, ValueError) as e: 376 | raise ValueError(f"Failed to parse do() action: {e}") 377 | 378 | elif response.startswith("finish"): 379 | action = { 380 | "_metadata": "finish", 381 | "message": response.replace("finish(message=", "")[1:-2], 382 | } 383 | else: 384 | raise ValueError(f"Failed to parse action: {response}") 385 | return action 386 | except Exception as e: 387 | raise ValueError(f"Failed to parse action: {e}") 388 | 389 | 390 | def do(**kwargs) -> dict[str, Any]: 391 | """Helper function for creating 'do' actions.""" 392 | kwargs["_metadata"] = "do" 393 | return kwargs 394 | 395 | 396 | def finish(**kwargs) -> dict[str, Any]: 397 | """Helper function for creating 'finish' actions.""" 398 | kwargs["_metadata"] = "finish" 399 | return kwargs 400 | --------------------------------------------------------------------------------